mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-03 04:40:09 +00:00
cc8e106a2b
Introduce per-user configurable home page sections and a unified Audible cache/category model. Adds Prisma models (UserHomeSection, AudibleCacheCategory) and migrations to create tables and remove legacy popular/new_release flags; updates schema.prisma accordingly. Add API routes for user home sections, live Audible categories, and category-based audiobook listing, and refactor popular/new-releases/covers routes to read from AudibleCacheCategory. Frontend: new HomeSection component, HomeSectionConfigModal, useHomeSections hook, and homepage changes to render dynamic sections plus image fallback to a placeholder SVG. Also add placeholder_cover.svg and tests for home sections and the audible refresh processor.
1350 lines
49 KiB
TypeScript
1350 lines
49 KiB
TypeScript
/**
|
||
* Component: Audible Integration Service (Web Scraping)
|
||
* Documentation: documentation/integrations/audible.md
|
||
*/
|
||
|
||
import axios, { AxiosInstance } from 'axios';
|
||
import * as cheerio from 'cheerio';
|
||
import { RMABLogger } from '../utils/logger';
|
||
import { getConfigService } from '../services/config.service';
|
||
import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible';
|
||
import {
|
||
getLanguageForRegion,
|
||
stripPrefixes,
|
||
buildContainsSelector,
|
||
extractByPatterns,
|
||
isAcceptedLanguage,
|
||
type LanguageConfig,
|
||
} from '../constants/language-config';
|
||
import {
|
||
pickUserAgent,
|
||
getBrowserHeaders,
|
||
jitteredBackoff,
|
||
AdaptivePacer,
|
||
FetchResultMeta,
|
||
} from '../utils/scrape-resilience';
|
||
import { parseRuntime as parseRuntimeUtil } from '../utils/parse-runtime';
|
||
|
||
// Logger shared by everything in this module
const logger = RMABLogger.create('Audible');

/**
 * Number of items requested per page through Audible's `pageSize` query
 * parameter (Audible's own default is roughly 20).
 *
 * Requesting 50 per page substantially cuts the number of HTTP requests that
 * bulk operations (popular / new-release refreshes, search) have to make.
 */
const AUDIBLE_PAGE_SIZE = 50;
|
||
|
||
/**
 * One audiobook as produced by the scrapers / metadata lookups in this module.
 * Only `asin`, `title` and `author` are always present; everything else is
 * filled in when the source page or API exposes it.
 */
export interface AudibleAudiobook {
  // --- Identity ---
  /** Audible ASIN of the audiobook. */
  asin: string;
  /** Title text. */
  title: string;

  // --- People ---
  /** Author name(s); several authors are joined with ", ". */
  author: string;
  /** ASIN of the (first) author, when an author link or record provides it. */
  authorAsin?: string;
  /** Narrator name(s). */
  narrator?: string;

  // --- Presentation ---
  /** Description / summary text. */
  description?: string;
  /** URL of the cover image. */
  coverArtUrl?: string;

  // --- Facts ---
  /** Runtime in minutes. */
  durationMinutes?: number;
  /** Release date as delivered by the source (string, not parsed here). */
  releaseDate?: string;
  /** Numeric rating. */
  rating?: number;
  /** Genre names. */
  genres?: string[];

  // --- Series ---
  /** Name of the series the book belongs to. */
  series?: string;
  /** Position of the book within the series. */
  seriesPart?: string;
  /** ASIN of the series. */
  seriesAsin?: string;
}
|
||
|
||
/** One page of keyword-search results returned by `AudibleService.search`. */
export interface AudibleSearchResult {
  /** The search keywords this page was fetched for. */
  query: string;
  /** 1-based page number that was requested. */
  page: number;
  /** Audiobooks parsed from the page. */
  results: AudibleAudiobook[];
  /** Total hit count parsed from the page; 0 when it could not be determined. */
  totalResults: number;
  /** Whether a further page is expected to exist. */
  hasMore: boolean;
}
|
||
|
||
/** One page of author-scoped results returned by `AudibleService.searchByAuthorAsin`. */
export interface AuthorBooksResult {
  /** 1-based page number that was requested. */
  page: number;
  /** Books on the page that passed the author-ASIN and language filters. */
  books: AudibleAudiobook[];
  /** Total hit count parsed from the page; 0 when it could not be determined. */
  totalResults: number;
  /** Whether a further page is expected to exist. */
  hasMore: boolean;
}
|
||
|
||
export class AudibleService {
|
||
/** Axios instance for the active region; assigned in initialize(), hence the `!`. */
private client!: AxiosInstance;

/** Base URL of the Audible storefront currently in use. */
private baseUrl = 'https://www.audible.com';

/** Region code the service is currently configured for. */
private region: AudibleRegion = 'us';

/** True once initialize() has built the client for `region`. */
private initialized = false;

/** User agent chosen when the client was last (re)built. */
private sessionUserAgent = '';

/** Supplies the delay between page fetches of paginated scrapes. */
private pacer = new AdaptivePacer();
|
||
|
||
constructor() {
  // Nothing to do here: the axios client is built lazily by initialize()
  // the first time the service is actually used.
}
|
||
|
||
/**
 * Base URL of the Audible storefront the service is currently pointed at.
 *
 * @returns The value last set by initialization (defaults to the .com site).
 */
public getBaseUrl(): string {
  const { baseUrl } = this;
  return baseUrl;
}
|
||
|
||
/**
 * Region code the service is currently configured for.
 *
 * @returns The region last applied by initialization.
 */
public getRegion(): AudibleRegion {
  const { region } = this;
  return region;
}
|
||
|
||
/**
 * Public entry point for scraping modules that live outside this class
 * (e.g. audible-series.ts).
 *
 * Makes sure the service is initialized, then hands the request to
 * fetchWithRetry with its default retry budget.
 *
 * @param url    Path or URL to GET through the region-bound client.
 * @param config Extra axios request config (query params, headers, ...).
 * @returns The axios response under `data` plus retry metadata under `meta`.
 */
public async fetch(url: string, config: any = {}): Promise<{ data: any; meta: FetchResultMeta }> {
  await this.initialize();
  const outcome = await this.fetchWithRetry(url, config);
  return outcome;
}
|
||
|
||
/**
 * Look up the language configuration that belongs to the current region.
 *
 * @returns Whatever getLanguageForRegion yields for `this.region`.
 */
private getLangConfig(): LanguageConfig {
  const activeRegion = this.region;
  return getLanguageForRegion(activeRegion);
}
|
||
|
||
/**
 * Mark the service as uninitialized so that the next call going through
 * initialize() rebuilds the client (used when the region config changes).
 *
 * Only the flag is cleared here; no client is rebuilt by this call itself.
 */
public forceReinitialize(): void {
  logger.info('Force re-initializing AudibleService');
  this.initialized = false;
}
|
||
|
||
/**
 * Initialize the service for the configured Audible region.
 *
 * Initialization is lazy so the region can be loaded asynchronously from the
 * config service. On every call the configured region is read once:
 *
 * - already initialized and region unchanged -> nothing to do;
 * - already initialized but the region cannot be read -> the existing client
 *   is kept (a warning is logged) instead of failing the caller;
 * - region changed, or not yet initialized -> the client is rebuilt;
 * - first initialization fails (config read or region setup) -> falls back to
 *   DEFAULT_AUDIBLE_REGION.
 *
 * After this resolves, `client`, `region` and `baseUrl` are usable.
 */
private async initialize(): Promise<void> {
  let configuredRegion: AudibleRegion | null = null;

  try {
    configuredRegion = await getConfigService().getAudibleRegion();
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    if (this.initialized) {
      // A working client already exists; a transient config failure must not
      // break every subsequent request.
      logger.warn('Failed to read Audible region, keeping current initialization', { error: message });
      return;
    }
    logger.error('Failed to initialize AudibleService', { error: message });
  }

  if (this.initialized) {
    if (configuredRegion === this.region) {
      return; // Region unchanged, use existing initialization
    }
    logger.info(`Region changed from ${this.region} to ${configuredRegion}, re-initializing`);
    this.initialized = false;
  }

  if (configuredRegion !== null) {
    try {
      // Index access throws for an unknown region, which routes to the fallback below.
      logger.info(`Initializing Audible service with region: ${configuredRegion} (${AUDIBLE_REGIONS[configuredRegion].baseUrl})`);
      this.applyRegionSettings(configuredRegion);
      return;
    } catch (error) {
      logger.error('Failed to initialize AudibleService', { error: error instanceof Error ? error.message : String(error) });
    }
  }

  // Fallback to default region
  this.applyRegionSettings(DEFAULT_AUDIBLE_REGION);
}

/**
 * Point the service at `region`: resolve its base URL and language config,
 * pick a fresh session user agent, reset the pacer and build the axios client.
 * Everything that can throw is resolved before any field is mutated, so a
 * failure leaves the previous state untouched.
 */
private applyRegionSettings(region: AudibleRegion): void {
  const baseUrl = AUDIBLE_REGIONS[region].baseUrl;
  const langConfig = getLanguageForRegion(region);
  const userAgent = pickUserAgent();

  const client = axios.create({
    baseURL: baseUrl,
    timeout: 15000,
    headers: getBrowserHeaders(userAgent),
    params: {
      ipRedirectOverride: 'true', // Prevent IP-based region redirects
      language: langConfig.scraping.audibleLocaleParam, // Force locale (prevents IP-based language serving)
    },
  });

  this.region = region;
  this.baseUrl = baseUrl;
  this.sessionUserAgent = userAgent;
  this.pacer.reset();
  this.client = client;
  this.initialized = true;
}
|
||
|
||
/**
 * GET `url` through the region-bound client, retrying transient failures
 * with jittered exponential backoff.
 *
 * Failures without an HTTP status (network errors), 429 and any status >= 500
 * are retried; every other status is rethrown immediately.
 *
 * @param url        Path or URL to request.
 * @param config     Extra axios request config.
 * @param maxRetries Number of retries allowed after the first attempt.
 * @returns The axios response under `data`, plus `meta` describing how many
 *          retries were used and whether a 503 was seen along the way.
 * @throws The last request error once the retry budget is exhausted.
 */
private async fetchWithRetry(
  url: string,
  config: any = {},
  maxRetries: number = 5
): Promise<{ data: any; meta: FetchResultMeta }> {
  let failure: Error | null = null;
  let retryCount = 0;
  let saw503 = false;
  let attempt = 0;

  while (attempt <= maxRetries) {
    try {
      const response = await this.client.get(url, config);
      return { data: response, meta: { retriesUsed: retryCount, encountered503: saw503 } };
    } catch (error: any) {
      failure = error;
      const status = error.response?.status;

      if (status === 503) {
        saw503 = true;
      }

      // Permanent failures (404, 403, ...) are not worth retrying.
      const permanent = Boolean(status) && status !== 429 && !(status >= 500);
      if (permanent) {
        throw error;
      }

      // Out of attempts: fall through to the final throw.
      if (attempt === maxRetries) {
        break;
      }

      retryCount += 1;

      // Jittered rather than predictable doubling.
      const waitMs = jitteredBackoff(attempt);
      logger.info(` Request failed (${status || 'network error'}), retrying in ${waitMs}ms (attempt ${attempt + 1}/${maxRetries})...`);

      await this.delay(waitMs);
    }
    attempt += 1;
  }

  // All retries exhausted
  throw failure || new Error('Request failed after retries');
}
|
||
|
||
/**
 * GET an external (non-Audible) URL with retries and exponential backoff.
 * Used for Audnexus and other external APIs; goes through the global axios
 * instance, not the region-bound client.
 *
 * Failures without an HTTP status, 429 and any status >= 500 are retried,
 * except a 500 whose body message says "Release date is in the future",
 * which is deterministic and rethrown at once.
 *
 * @param url        Absolute URL to request.
 * @param config     Axios request config (params, headers, timeout, ...).
 * @param maxRetries Number of retries allowed after the first attempt.
 * @returns The axios response.
 * @throws The last request error once the retry budget is exhausted.
 */
private async externalFetchWithRetry(
  url: string,
  config: any = {},
  maxRetries: number = 3
): Promise<any> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await axios.get(url, config);
    } catch (error: any) {
      lastError = error;
      const status = error.response?.status;
      const isRetryable = !status || status === 503 || status === 429 || status >= 500;

      // Don't retry on 404, 403, etc.
      if (!isRetryable) {
        throw error;
      }

      // Don't retry on deterministic 500 errors (e.g. "Release date is in the future")
      if (status === 500) {
        // The body is untyped: only treat `message` as text when it really is
        // a string, otherwise `.includes` would throw and mask the real error.
        const rawMessage: unknown = error.response?.data?.message;
        const message = typeof rawMessage === 'string' ? rawMessage : '';
        if (message.includes('Release date is in the future')) {
          logger.info(` External API returned non-retryable error: ${message}`);
          throw error;
        }
      }

      // Don't retry on last attempt
      if (attempt === maxRetries) {
        break;
      }

      // Exponential backoff: 2^attempt * 1000ms (1s, 2s, 4s...)
      const backoffMs = Math.pow(2, attempt) * 1000;
      logger.info(` External API request failed (${status || 'network error'}), retrying in ${backoffMs}ms (attempt ${attempt + 1}/${maxRetries})...`);

      await this.delay(backoffMs);
    }
  }

  // All retries exhausted
  throw lastError || new Error('External API request failed after retries');
}
|
||
|
||
/**
 * Get popular audiobooks from best sellers (with pagination support).
 *
 * @param limit Maximum number of audiobooks to return.
 * @returns Up to `limit` unique audiobooks; on a page failure, whatever was
 *          collected before the failure.
 */
async getPopularAudiobooks(limit: number = 20): Promise<AudibleAudiobook[]> {
  return this.collectProductListPages({
    path: '/adblbestsellers',
    label: 'popular audiobooks',
    limit,
    narratorFallback: true,
  });
}

/**
 * Get new release audiobooks (with pagination support).
 *
 * @param limit Maximum number of audiobooks to return.
 * @returns Up to `limit` unique audiobooks; on a page failure, whatever was
 *          collected before the failure.
 */
async getNewReleases(limit: number = 20): Promise<AudibleAudiobook[]> {
  return this.collectProductListPages({
    path: '/newreleases',
    label: 'new releases',
    limit,
    narratorFallback: false,
  });
}

/**
 * Shared pagination + parsing loop for Audible product-list pages
 * (`.productListItem` markup).
 *
 * @param options.path             Listing path to fetch.
 * @param options.label            Human-readable name used in log messages.
 * @param options.limit            Maximum number of audiobooks to collect.
 * @param options.narratorFallback When true and no `.narratorLabel` text is
 *                                 found, use the second bold small-text
 *                                 element as the narrator.
 */
private async collectProductListPages(options: {
  path: string;
  label: string;
  limit: number;
  narratorFallback: boolean;
}): Promise<AudibleAudiobook[]> {
  const { path, label, limit, narratorFallback } = options;

  await this.initialize();

  logger.info(` Fetching ${label} (limit: ${limit})...`);

  const audiobooks: AudibleAudiobook[] = [];
  const seenAsins = new Set<string>(); // O(1) duplicate detection across pages
  const maxPages = Math.ceil(limit / AUDIBLE_PAGE_SIZE);
  let page = 1;
  let pagesFetched = 0; // pages actually fetched and parsed, for the summary log

  this.pacer.reset();

  while (audiobooks.length < limit && page <= maxPages) {
    try {
      logger.info(` Fetching page ${page}/${maxPages}...`);

      const { data: response, meta } = await this.fetchWithRetry(path, {
        params: {
          ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
          pageSize: AUDIBLE_PAGE_SIZE,
          ...(page > 1 ? { page } : {}),
        },
      });
      const $ = cheerio.load(response.data);
      const langConfig = this.getLangConfig();

      let foundOnPage = 0;

      $('.productListItem').each((_index, element) => {
        if (audiobooks.length >= limit) return false; // stop iterating this page

        const $el = $(element);

        // Extract ASIN from data attribute or link - handle both /pd/ and /ac/ URLs
        const asin = $el.find('li').attr('data-asin') ||
          $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] || '';

        // Skip items without an ASIN and duplicates
        if (!asin || seenAsins.has(asin)) return;
        seenAsins.add(asin);

        const title = $el.find('h3 a').text().trim() ||
          $el.find('.bc-heading a').text().trim();

        const authorText = $el.find('.authorLabel').text().trim() ||
          $el.find('.bc-size-small .bc-text-bold').first().text().trim();

        // Extract author ASIN from author link if available
        const authorHref = $el.find('a[href*="/author/"]').first().attr('href') || '';
        const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

        let narratorText = $el.find('.narratorLabel').text().trim();
        if (!narratorText && narratorFallback) {
          narratorText = $el.find('.bc-size-small .bc-text-bold').eq(1).text().trim();
        }

        const coverArtUrl = $el.find('img').attr('src') || '';

        // The ratings label may hold non-numeric text; never store NaN.
        const ratingText = $el.find('.ratingsLabel').text().trim();
        const parsedRating = ratingText ? parseFloat(ratingText.split(' ')[0]) : NaN;
        const rating = Number.isNaN(parsedRating) ? undefined : parsedRating;

        audiobooks.push({
          asin,
          title,
          author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
          authorAsin: authorAsinMatch?.[1] || undefined,
          narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
          coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
          rating,
        });

        foundOnPage++;
      });

      pagesFetched++;
      logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);

      // If we got significantly fewer than requested, probably no more pages
      if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
        logger.info(` Reached end of available pages`);
        break;
      }

      page++;

      // Adaptive delay between pages based on retry pressure
      if (page <= maxPages && audiobooks.length < limit) {
        await this.delay(this.pacer.reportPageResult(meta));
      }
    } catch (error) {
      logger.error(`Failed to fetch page ${page} of ${label}`, {
        error: error instanceof Error ? error.message : String(error),
        collectedSoFar: audiobooks.length
      });
      // Stop pagination on error, but return what we collected
      break;
    }
  }

  logger.info(` Found ${audiobooks.length} ${label} across ${pagesFetched} pages`);
  return audiobooks;
}
|
||
|
||
/**
 * Search Audible for audiobooks by keyword.
 *
 * @param query Keywords to search for.
 * @param page  1-based results page to fetch.
 * @returns The parsed page. On any failure an empty result (no results,
 *          `hasMore: false`) is returned instead of throwing.
 */
async search(query: string, page: number = 1): Promise<AudibleSearchResult> {
  await this.initialize();

  try {
    logger.info(` Searching for "${query}"...`);

    const { data: response } = await this.fetchWithRetry('/search', {
      params: {
        ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
        keywords: query,
        pageSize: AUDIBLE_PAGE_SIZE,
        page,
      },
    });

    const $ = cheerio.load(response.data);
    const langConfig = this.getLangConfig(); // same for every item on the page

    const audiobooks: AudibleAudiobook[] = [];

    // Parse search results - Audible uses s-result-item for search pages
    $('.s-result-item, .productListItem').each((_index, element) => {
      const $el = $(element);

      // Extract ASIN from product detail link - handle both /pd/ and /ac/ URLs
      const asin = $el.find('li').attr('data-asin') ||
        $el.find('a[href*="/pd/"]').attr('href')?.match(/\/pd\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
        $el.find('a[href*="/ac/"]').attr('href')?.match(/\/ac\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
        $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] || '';

      if (!asin) return;

      // Extract title from h2 tag (search results) or h3 (legacy)
      const title = $el.find('h2').first().text().trim() ||
        $el.find('h3 a').text().trim() ||
        $el.find('.bc-heading a').text().trim();

      // Extract author from author link
      const authorLink = $el.find('a[href*="/author/"]').first();
      const authorText = authorLink.text().trim() ||
        $el.find('.authorLabel').text().trim() ||
        $el.find('.bc-size-small .bc-text-bold').first().text().trim();

      // Extract author ASIN from author link href
      const authorHref = authorLink.attr('href') || '';
      const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

      // Extract narrator from narrator search link
      const narratorText = $el.find('a[href*="searchNarrator="]').first().text().trim() ||
        $el.find('.narratorLabel').text().trim();

      const coverArtUrl = $el.find('img').attr('src') || '';

      // Extract runtime/duration
      const runtimeText = $el.find('.runtimeLabel').text().trim() ||
        $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
      const durationMinutes = this.parseRuntime(runtimeText);

      // Extract rating; the label may hold non-numeric text, never store NaN.
      const ratingText = $el.find('.ratingsLabel').text().trim() ||
        $el.find('.a-icon-star span').first().text().trim();
      const parsedRating = ratingText ? parseFloat(ratingText.split(' ')[0]) : NaN;
      const rating = Number.isNaN(parsedRating) ? undefined : parsedRating;

      audiobooks.push({
        asin,
        title,
        author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
        authorAsin: authorAsinMatch?.[1] || undefined,
        narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
        coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
        durationMinutes,
        rating,
      });
    });

    // Try to extract total results count
    // NOTE(review): the "of N" pattern looks English-specific - confirm it
    // matches non-English storefronts; when it does not, totalResults is 0
    // and hasMore falls back to the "full page" heuristic below.
    const resultsText = $('.resultsInfo').text().trim();
    const totalResults = parseInt(resultsText.match(/of ([\d,]+)/)?.[1]?.replace(/,/g, '') || '0', 10);

    logger.info(` Found ${audiobooks.length} results for "${query}"`);

    return {
      query,
      results: audiobooks,
      totalResults,
      page,
      hasMore: audiobooks.length > 0 && (totalResults > 0
        ? totalResults > page * AUDIBLE_PAGE_SIZE
        : audiobooks.length >= AUDIBLE_PAGE_SIZE),
    };
  } catch (error) {
    logger.error('Search failed', { error: error instanceof Error ? error.message : String(error) });
    return {
      query,
      results: [],
      totalResults: 0,
      page,
      hasMore: false,
    };
  }
}
|
||
|
||
/**
 * Fetch one page of books by a specific author, validated by ASIN.
 *
 * Uses Audible's `searchAuthor` parameter for the given page and keeps only
 * items where:
 *   (1) one of the item's author links contains the target author ASIN, and
 *   (2) the item's language is accepted by the current region's language config.
 *
 * @param authorName Author name passed to Audible's author search.
 * @param authorAsin ASIN the author links are validated against.
 * @param page       1-based results page to fetch.
 * @returns The filtered page. On failure, the books collected so far with
 *          `hasMore: false` and `totalResults: 0`.
 */
async searchByAuthorAsin(authorName: string, authorAsin: string, page: number = 1): Promise<AuthorBooksResult> {
  await this.initialize();

  const books: AudibleAudiobook[] = [];
  const seenAsins = new Set<string>();

  try {
    logger.info(`Searching books by author "${authorName}" (ASIN: ${authorAsin}), page ${page}...`);

    const { data: response } = await this.fetchWithRetry('/search', {
      params: {
        ipRedirectOverride: 'true',
        searchAuthor: authorName,
        pageSize: AUDIBLE_PAGE_SIZE,
        page,
      },
    });

    const $ = cheerio.load(response.data);

    // Count raw items on page before filtering (for hasMore fallback)
    const pageItemCount = $('.s-result-item, .productListItem').length;

    // Loop-invariant: language config and the "<label> <language>" matcher
    // are identical for every item, so build them once per page.
    const langConfig = this.getLangConfig();
    const escapedLabels = langConfig.scraping.languageLabels
      .map(label => label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');
    const langLabelPattern = new RegExp(`(?:${escapedLabels})\\s*(.+)`, 'i');

    $('.s-result-item, .productListItem').each((_index, element) => {
      const $el = $(element);

      // --- Language filter: require matching language for region ---
      const langText = $el.find(buildContainsSelector('span', langConfig.scraping.languageLabels)).text().trim() ||
        $el.find('.languageLabel').text().trim();
      const language = langText.match(langLabelPattern)?.[1]?.trim();
      if (!language || !isAcceptedLanguage(language, langConfig)) return;

      // --- Author ASIN filter: verify target ASIN in author links ---
      const authorLinks = $el.find('a[href*="/author/"]');
      let hasMatchingAuthor = false;
      authorLinks.each((_i, link) => {
        const href = $(link).attr('href') || '';
        const asinMatch = href.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
        if (asinMatch && asinMatch[1] === authorAsin) {
          hasMatchingAuthor = true;
          return false; // break .each()
        }
      });
      if (!hasMatchingAuthor) return;

      // --- Extract book ASIN ---
      const bookAsin = $el.find('li').attr('data-asin') ||
        $el.find('a[href*="/pd/"]').attr('href')?.match(/\/pd\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
        $el.find('a[href*="/ac/"]').attr('href')?.match(/\/ac\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
        $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] || '';
      if (!bookAsin || seenAsins.has(bookAsin)) return;
      seenAsins.add(bookAsin);

      // --- Parse book details ---
      const title = $el.find('h2').first().text().trim() ||
        $el.find('h3 a').text().trim() ||
        $el.find('.bc-heading a').text().trim();

      const authorText = authorLinks.first().text().trim() ||
        $el.find('.authorLabel').text().trim() ||
        $el.find('.bc-size-small .bc-text-bold').first().text().trim();

      const narratorText = $el.find('a[href*="searchNarrator="]').first().text().trim() ||
        $el.find('.narratorLabel').text().trim();

      const coverArtUrl = $el.find('img').attr('src') || '';

      const runtimeText = $el.find('.runtimeLabel').text().trim() ||
        $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
      const durationMinutes = this.parseRuntime(runtimeText);

      // The ratings label may hold non-numeric text; never store NaN.
      const ratingText = $el.find('.ratingsLabel').text().trim() ||
        $el.find('.a-icon-star span').first().text().trim();
      const parsedRating = ratingText ? parseFloat(ratingText.split(' ')[0]) : NaN;
      const rating = Number.isNaN(parsedRating) ? undefined : parsedRating;

      books.push({
        asin: bookAsin,
        title,
        author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
        authorAsin,
        narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
        coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
        durationMinutes,
        rating,
      });
    });

    // Check total results for pagination
    const resultsText = $('.resultsInfo').text().trim();
    const totalResults = parseInt(resultsText.match(/of ([\d,]+)/)?.[1]?.replace(/,/g, '') || '0', 10);

    // Use totalResults if available; otherwise fall back to whether Audible returned a full page
    // NOTE(review): `books.length > 0` ends pagination when a page has raw
    // items but none survive the filters - confirm this early stop is intended.
    const hasMore = books.length > 0 && (totalResults > 0
      ? totalResults > page * AUDIBLE_PAGE_SIZE
      : pageItemCount >= AUDIBLE_PAGE_SIZE);

    logger.info(`Author books page ${page}: ${books.length} valid results (${totalResults} Audible total)`);
    return { books, hasMore, page, totalResults };
  } catch (error) {
    logger.error(`Author books search failed for "${authorName}"`, {
      error: error instanceof Error ? error.message : String(error),
    });
    return { books, hasMore: false, page, totalResults: 0 };
  }
}
|
||
|
||
/**
 * Look up the full details of one audiobook.
 *
 * Source order:
 *   1. Audnexus API (structured data)
 *   2. Scraping the Audible product page, when Audnexus yields nothing
 *
 * @param asin ASIN of the audiobook.
 * @returns The audiobook, or null when the lookup throws or the fallback
 *          yields nothing.
 */
async getAudiobookDetails(asin: string): Promise<AudibleAudiobook | null> {
  await this.initialize();

  try {
    logger.info(` Fetching details for ASIN ${asin}...`);

    // Audnexus is tried first.
    const fromAudnexus = await this.fetchFromAudnexus(asin);

    if (!fromAudnexus) {
      logger.info(` Audnexus failed, falling back to Audible scraping...`);
      return await this.scrapeAudibleDetails(asin);
    }

    logger.info(` Successfully fetched from Audnexus for "${fromAudnexus.title}"`);
    return fromAudnexus;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Failed to fetch details for ${asin}`, { error: reason });
    return null;
  }
}
|
||
|
||
/**
 * Fetch audiobook details from the Audnexus API for the current region.
 *
 * @param asin ASIN of the audiobook; URL-encoded before being placed in the
 *             request path.
 * @returns The mapped audiobook, or null when the request fails (a 404 is
 *          logged at debug level, anything else as a warning).
 */
private async fetchFromAudnexus(asin: string): Promise<AudibleAudiobook | null> {
  try {
    const audnexusRegion = AUDIBLE_REGIONS[this.region].audnexusParam;
    logger.debug(`Fetching ASIN from Audnexus: ${asin} (region: ${audnexusRegion})`);

    // Encode the ASIN so an unexpected value cannot alter the request path.
    const response = await this.externalFetchWithRetry(`https://api.audnex.us/books/${encodeURIComponent(asin)}`, {
      params: {
        region: audnexusRegion, // Pass region parameter to Audnexus
      },
      timeout: 10000,
      headers: {
        'User-Agent': 'ReadMeABook/1.0',
      },
    });

    const data = response.data;

    // Numeric fields arrive untyped; map unparseable values to undefined
    // instead of letting NaN leak into the result.
    const finiteOrUndefined = (value: number): number | undefined =>
      Number.isFinite(value) ? value : undefined;

    // Build result from Audnexus data
    const result: AudibleAudiobook = {
      asin,
      title: data.title || '',
      author: data.authors?.map((a: any) => a.name).join(', ') || '',
      authorAsin: data.authors?.[0]?.asin || undefined,
      narrator: data.narrators?.map((n: any) => n.name).join(', ') || '',
      description: data.description || data.summary || '',
      coverArtUrl: data.image || '',
      durationMinutes: data.runtimeLengthMin
        ? finiteOrUndefined(parseInt(String(data.runtimeLengthMin), 10))
        : undefined,
      releaseDate: data.releaseDate || undefined,
      rating: data.rating ? finiteOrUndefined(parseFloat(String(data.rating))) : undefined,
      genres: data.genres?.map((g: any) => typeof g === 'string' ? g : g.name).slice(0, 5) || undefined,
      series: data.seriesPrimary?.name || undefined,
      seriesPart: data.seriesPrimary?.position || undefined,
      seriesAsin: data.seriesPrimary?.asin || undefined,
    };

    // Ensure cover art URL is high quality
    if (result.coverArtUrl && !result.coverArtUrl.includes('_SL500_')) {
      result.coverArtUrl = result.coverArtUrl.replace(/\._.*_\./, '._SL500_.');
    }

    logger.debug('Audnexus success', {
      title: result.title,
      author: result.author,
      narrator: result.narrator,
      descLength: result.description?.length || 0,
      duration: result.durationMinutes,
      rating: result.rating,
      genreCount: result.genres?.length || 0,
      series: result.series,
      seriesPart: result.seriesPart,
      seriesAsin: result.seriesAsin
    });

    return result;
  } catch (error: any) {
    if (error.response?.status === 404) {
      logger.debug(`Book not found (404) on Audnexus for ASIN ${asin}`);
    } else {
      logger.warn(`Error fetching from Audnexus for ASIN ${asin}`, { error: error.message });
    }
    return null;
  }
}
|
||
|
||
/**
|
||
* Scrape audiobook details from Audible (fallback method)
|
||
*/
|
||
private async scrapeAudibleDetails(asin: string): Promise<AudibleAudiobook | null> {
|
||
try {
|
||
const { data: response } = await this.fetchWithRetry(`/pd/${asin}`, {
|
||
params: {
|
||
ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
|
||
},
|
||
});
|
||
const $ = cheerio.load(response.data);
|
||
|
||
// Initialize result object
|
||
let result: AudibleAudiobook = {
|
||
asin,
|
||
title: '',
|
||
author: '',
|
||
narrator: '',
|
||
description: '',
|
||
coverArtUrl: '',
|
||
};
|
||
|
||
// Debug: Save HTML in development
|
||
const isDev = process.env.NODE_ENV === 'development';
|
||
if (isDev) {
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
const debugPath = path.join('/tmp', `audible-${asin}.html`);
|
||
fs.writeFileSync(debugPath, response.data);
|
||
logger.info(` Saved HTML to ${debugPath} for debugging`);
|
||
}
|
||
|
||
// Try to extract JSON-LD structured data first
|
||
const jsonLdScripts = $('script[type="application/ld+json"]');
|
||
logger.info(` Found ${jsonLdScripts.length} JSON-LD script tags`);
|
||
|
||
jsonLdScripts.each((i, elem) => {
|
||
try {
|
||
const jsonData = JSON.parse($(elem).html() || '{}');
|
||
logger.info(` JSON-LD ${i} type:`, jsonData['@type']);
|
||
|
||
if (jsonData['@type'] === 'Book' || jsonData['@type'] === 'Audiobook' || jsonData['@type'] === 'Product') {
|
||
logger.debug('Found valid JSON-LD structured data');
|
||
|
||
if (jsonData.name) result.title = jsonData.name;
|
||
|
||
if (jsonData.author) {
|
||
result.author = Array.isArray(jsonData.author)
|
||
? jsonData.author.map((a: any) => a.name || a).join(', ')
|
||
: jsonData.author?.name || jsonData.author || '';
|
||
}
|
||
|
||
if (jsonData.readBy) {
|
||
result.narrator = Array.isArray(jsonData.readBy)
|
||
? jsonData.readBy.map((n: any) => n.name || n).join(', ')
|
||
: jsonData.readBy?.name || jsonData.readBy || '';
|
||
}
|
||
|
||
if (jsonData.description) result.description = jsonData.description;
|
||
if (jsonData.image) result.coverArtUrl = jsonData.image;
|
||
if (jsonData.aggregateRating?.ratingValue) result.rating = jsonData.aggregateRating.ratingValue;
|
||
if (jsonData.datePublished) result.releaseDate = jsonData.datePublished;
|
||
|
||
if (jsonData.duration) {
|
||
const durationMatch = jsonData.duration.match(/PT(\d+)H(\d+)M/);
|
||
if (durationMatch) {
|
||
result.durationMinutes = parseInt(durationMatch[1]) * 60 + parseInt(durationMatch[2]);
|
||
}
|
||
}
|
||
}
|
||
} catch (e) {
|
||
logger.debug(`JSON-LD ${i} parsing failed`, { error: e instanceof Error ? e.message : String(e) });
|
||
}
|
||
});
|
||
|
||
// Fallback to HTML parsing for any missing fields
|
||
// Title - try multiple selectors
|
||
if (!result.title) {
|
||
result.title = $('h1.bc-heading').first().text().trim() ||
|
||
$('h1[class*="heading"]').first().text().trim() ||
|
||
$('.bc-container h1').first().text().trim() ||
|
||
$('h1').first().text().trim();
|
||
logger.info(` Title from HTML: "${result.title}"`);
|
||
}
|
||
|
||
// Author - try multiple approaches (only in product details area)
|
||
if (!result.author) {
|
||
// Look specifically in the product details section, not the whole page
|
||
const productSection = $('.bc-section, .product-top-section, [class*="product"]').first();
|
||
const authors: string[] = [];
|
||
|
||
// First try labeled author sections
|
||
productSection.find('li.authorLabel a, span.authorLabel a, .authorLabel a').each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (text && text.length > 0 && text.length < 80) {
|
||
authors.push(text);
|
||
}
|
||
});
|
||
|
||
// If no labeled authors, look for author links near the title (first 3 only to avoid recommendations)
|
||
if (authors.length === 0) {
|
||
$('a[href*="/author/"]').slice(0, 3).each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
// Filter out navigation breadcrumbs and promotional text
|
||
if (text && text.length > 1 && text.length < 80 &&
|
||
!text.includes('›') && !text.includes('...') &&
|
||
!text.toLowerCase().includes('more') && !text.toLowerCase().includes('see all')) {
|
||
authors.push(text);
|
||
}
|
||
});
|
||
}
|
||
|
||
if (authors.length > 0) {
|
||
// Deduplicate and limit to max 3 authors
|
||
result.author = [...new Set(authors)].slice(0, 3).join(', ');
|
||
}
|
||
|
||
const authorLangConfig = this.getLangConfig();
|
||
result.author = stripPrefixes(result.author, authorLangConfig.scraping.authorPrefixes);
|
||
logger.info(` Author from HTML: "${result.author}"`);
|
||
}
|
||
|
||
// Author ASIN - extract from the first author link
|
||
if (!result.authorAsin) {
|
||
const firstAuthorHref = $('a[href*="/author/"]').first().attr('href') || '';
|
||
const authorAsinMatch = firstAuthorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
|
||
if (authorAsinMatch) {
|
||
result.authorAsin = authorAsinMatch[1];
|
||
}
|
||
}
|
||
|
||
// Narrator - try multiple approaches (only in product details area)
|
||
if (!result.narrator) {
|
||
// Look specifically in the product details section
|
||
const productSection = $('.bc-section, .product-top-section, [class*="product"]').first();
|
||
const narrators: string[] = [];
|
||
|
||
// First try labeled narrator sections
|
||
productSection.find('li.narratorLabel a, span.narratorLabel a, .narratorLabel a').each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (text && text.length > 0 && text.length < 80) {
|
||
narrators.push(text);
|
||
}
|
||
});
|
||
|
||
// If no labeled narrators, look for narrator links (first 5 only)
|
||
if (narrators.length === 0) {
|
||
$('a[href*="/narrator/"]').slice(0, 5).each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (text && text.length > 1 && text.length < 80 &&
|
||
!text.includes('›') && !text.includes('...')) {
|
||
narrators.push(text);
|
||
}
|
||
});
|
||
}
|
||
|
||
if (narrators.length > 0) {
|
||
// Deduplicate and limit to reasonable count
|
||
result.narrator = [...new Set(narrators)].slice(0, 5).join(', ');
|
||
}
|
||
|
||
if (result.narrator) {
|
||
const detailLangConfig = this.getLangConfig();
|
||
result.narrator = stripPrefixes(result.narrator, detailLangConfig.scraping.narratorPrefixes);
|
||
}
|
||
logger.info(` Narrator from HTML: "${result.narrator || ''}"`);
|
||
}
|
||
|
||
// Description - try multiple approaches with strict filtering
|
||
if (!result.description) {
|
||
const descLangConfig = this.getLangConfig();
|
||
const excludePatterns = descLangConfig.scraping.descriptionExcludePatterns;
|
||
|
||
const isValidDescription = (text: string): boolean => {
|
||
if (!text || text.length < 50 || text.length > 5000) return false;
|
||
// Reject if it contains promotional patterns
|
||
for (const pattern of excludePatterns) {
|
||
if (pattern.test(text)) return false;
|
||
}
|
||
return true;
|
||
};
|
||
|
||
// Try specific description selectors first
|
||
const candidates = [
|
||
$('.bc-expander-content').first().text().trim(),
|
||
$('[class*="productPublisherSummary"]').first().text().trim(),
|
||
$('[data-widget="publisherSummary"]').first().text().trim(),
|
||
$('.bc-section p').first().text().trim(),
|
||
];
|
||
|
||
// Find first valid candidate
|
||
for (const candidate of candidates) {
|
||
if (isValidDescription(candidate)) {
|
||
result.description = candidate;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// If still no description, search for valid paragraphs
|
||
if (!result.description) {
|
||
$('p, div[class*="description"]').each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (isValidDescription(text) && text.length > (result.description?.length || 0)) {
|
||
result.description = text;
|
||
}
|
||
});
|
||
}
|
||
|
||
logger.info(` Description length: ${result.description?.length || 0} chars`);
|
||
}
|
||
|
||
// Cover art - try multiple selectors
|
||
if (!result.coverArtUrl) {
|
||
result.coverArtUrl = $('img.bc-image-inset-border').attr('src') ||
|
||
$('img[class*="product-image"]').first().attr('src') ||
|
||
$('img[class*="cover"]').first().attr('src') ||
|
||
$('.bc-pub-detail-image img').attr('src') ||
|
||
$('img[src*="images-na.ssl-images-amazon.com"]').first().attr('src') ||
|
||
$('img[src*="m.media-amazon.com"]').first().attr('src') ||
|
||
'';
|
||
if (result.coverArtUrl) {
|
||
result.coverArtUrl = result.coverArtUrl.replace(/\._.*_\./, '._SL500_.');
|
||
}
|
||
}
|
||
|
||
// Runtime/Duration - try multiple approaches
|
||
if (!result.durationMinutes) {
|
||
const rtLangConfig = this.getLangConfig();
|
||
|
||
// Look for runtime text in various places
|
||
const runtimeText =
|
||
$('li.runtimeLabel span').text().trim() ||
|
||
$('.runtimeLabel').text().trim() ||
|
||
$(buildContainsSelector('span', rtLangConfig.scraping.lengthLabels)).parent().text().trim() ||
|
||
$(buildContainsSelector('li', rtLangConfig.scraping.lengthLabels)).text().trim() ||
|
||
(() => {
|
||
// Look for any text matching duration pattern
|
||
let found = '';
|
||
$('li, span, div').each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (text.match(rtLangConfig.scraping.durationDetectionPattern) && text.length < 100) {
|
||
found = text;
|
||
return false; // break
|
||
}
|
||
});
|
||
return found;
|
||
})();
|
||
|
||
result.durationMinutes = this.parseRuntime(runtimeText);
|
||
logger.info(` Duration from "${runtimeText}": ${result.durationMinutes} minutes`);
|
||
}
|
||
|
||
// Rating - try multiple approaches
|
||
if (!result.rating) {
|
||
const ratingLangConfig = this.getLangConfig();
|
||
const ratingText =
|
||
$('.ratingsLabel').text().trim() ||
|
||
$('[class*="rating"]').first().text().trim() ||
|
||
$(`span:contains("${ratingLangConfig.scraping.ratingTextSelector}")`).parent().text().trim() ||
|
||
(() => {
|
||
// Look for rating pattern using language-specific patterns
|
||
let found = '';
|
||
$('span, div').each((_, elem) => {
|
||
const text = $(elem).text().trim();
|
||
if (text.length < 50) {
|
||
for (const pattern of ratingLangConfig.scraping.ratingPatterns) {
|
||
if (pattern.test(text)) {
|
||
found = text;
|
||
return false;
|
||
}
|
||
}
|
||
}
|
||
});
|
||
return found;
|
||
})();
|
||
|
||
if (ratingText) {
|
||
let ratingValue: number | undefined;
|
||
for (const pattern of ratingLangConfig.scraping.ratingPatterns) {
|
||
const ratingMatch = ratingText.match(pattern);
|
||
if (ratingMatch) {
|
||
// Handle comma as decimal separator (e.g. "4,5" in German/Spanish)
|
||
ratingValue = parseFloat(ratingMatch[1].replace(',', '.'));
|
||
break;
|
||
}
|
||
}
|
||
result.rating = ratingValue;
|
||
}
|
||
logger.info(` Rating from "${ratingText}": ${result.rating}`);
|
||
}
|
||
|
||
// Release date - try multiple selectors
|
||
if (!result.releaseDate) {
|
||
const rdLangConfig = this.getLangConfig();
|
||
const releaseDateText =
|
||
$(buildContainsSelector('li', rdLangConfig.scraping.releaseDateLabels)).text().trim() ||
|
||
$(buildContainsSelector('span', rdLangConfig.scraping.releaseDateLabels)).parent().text().trim() ||
|
||
$('[class*="release"]').text().trim();
|
||
|
||
const dateMatch = extractByPatterns(releaseDateText, rdLangConfig.scraping.releaseDatePatterns) ||
|
||
releaseDateText.match(/(\w+ \d{1,2},? \d{4})/)?.[1];
|
||
if (dateMatch) {
|
||
result.releaseDate = dateMatch.trim();
|
||
}
|
||
logger.info(` Release date from "${releaseDateText}": ${result.releaseDate}`);
|
||
}
|
||
|
||
// Genres - try to extract categories
|
||
const genres: string[] = [];
|
||
$('a[href*="/cat/"]').each((_, el) => {
|
||
const genre = $(el).text().trim();
|
||
if (genre && !genres.includes(genre) && genre.length < 50 && genre.length > 2) {
|
||
genres.push(genre);
|
||
}
|
||
});
|
||
if (genres.length > 0) {
|
||
result.genres = genres.slice(0, 5); // Limit to 5 genres
|
||
logger.info(` Genres: ${result.genres.join(', ')}`);
|
||
}
|
||
|
||
logger.info(`Successfully fetched details for "${result.title}"`);
|
||
logger.debug('Final result', {
|
||
title: result.title,
|
||
author: result.author,
|
||
narrator: result.narrator,
|
||
descLength: result.description?.length || 0,
|
||
duration: result.durationMinutes,
|
||
rating: result.rating,
|
||
genreCount: result.genres?.length || 0
|
||
});
|
||
|
||
return result;
|
||
} catch (error) {
|
||
logger.error(`Failed to fetch details for ${asin}`, { error: error instanceof Error ? error.message : String(error) });
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
 * Convert scraped runtime text into a number of minutes.
 *
 * This is a thin adapter: the actual parsing is done by the shared helper
 * imported from utils/parse-runtime, which receives the text together with
 * the language configuration returned by `getLangConfig()`.
 *
 * @param runtimeText Raw runtime text as found on the page.
 * @returns The helper's result: a minute count, or `undefined`.
 */
private parseRuntime(runtimeText: string): number | undefined {
  const langConfig = this.getLangConfig();
  return parseRuntimeUtil(runtimeText, langConfig);
}
|
||
|
||
/**
 * Look up an audiobook's runtime, in minutes, by ASIN.
 *
 * Queries the Audnexus API (`https://api.audnex.us/books/{asin}`) using the
 * `audnexusParam` of the configured region and reads `runtimeLengthMin`
 * from the response body. Meant as a lightweight lookup for size validation
 * during search, so the request uses a short 5 second timeout.
 *
 * @param asin Audible ASIN of the audiobook.
 * @returns Runtime in whole minutes, or `null` when the field is missing,
 *   zero, not a usable positive number, or the request fails. Never returns
 *   `NaN`.
 */
async getRuntime(asin: string): Promise<number | null> {
  try {
    // Use Audnexus API for fast, reliable runtime data
    const audnexusRegion = AUDIBLE_REGIONS[this.region].audnexusParam;

    const response = await this.externalFetchWithRetry(`https://api.audnex.us/books/${asin}`, {
      params: { region: audnexusRegion },
      timeout: 5000, // Quick timeout for search performance
      headers: { 'User-Agent': 'ReadMeABook/1.0' },
    });

    const rawRuntime: unknown = response.data?.runtimeLengthMin;
    if (!rawRuntime) {
      return null;
    }

    // Accept either a number or a numeric string; always parse strings in base 10.
    const minutes =
      typeof rawRuntime === 'number'
        ? Math.trunc(rawRuntime)
        : parseInt(String(rawRuntime), 10);

    // Guard against NaN / Infinity / non-positive values leaking to callers.
    return Number.isFinite(minutes) && minutes > 0 ? minutes : null;
  } catch (error: unknown) {
    // A 404 simply means the book is unknown to Audnexus; stay quiet for that case.
    const status = (error as { response?: { status?: number } } | null | undefined)?.response?.status;
    if (status !== 404) {
      const message = error instanceof Error ? error.message : String(error);
      logger.debug(`Runtime fetch failed for ASIN ${asin}: ${message}`);
    }
    return null;
  }
}
|
||
|
||
/**
 * Fetch the category list from Audible's `/categories` page.
 *
 * Every anchor whose href contains `/cat/` and matches `/cat/{name}/{nodeId}`
 * (numeric node id) is collected. Anchors with empty text are ignored and
 * only the first named anchor per node id is kept.
 *
 * @returns `{ id, name }` entries in page order, or an empty array if the
 *   page could not be fetched or parsed (the failure is logged, not thrown).
 */
async getCategories(): Promise<{ id: string; name: string }[]> {
  await this.initialize();

  logger.info('Fetching Audible categories...');

  try {
    const { data: page } = await this.fetchWithRetry('/categories', {
      params: { ipRedirectOverride: 'true' },
    });
    const $ = cheerio.load(page.data);

    const found: { id: string; name: string }[] = [];
    const seenIds = new Set<string>();

    // Category links follow the pattern /cat/{name}/{nodeId}
    for (const anchor of $('a[href*="/cat/"]').toArray()) {
      const link = $(anchor);
      const nodeId = /\/cat\/[^\/]+\/(\d+)/.exec(link.attr('href') || '')?.[1];
      if (nodeId === undefined) continue;

      const label = link.text().trim();
      // An unnamed link must not claim the id: a later named link may still supply it.
      if (label === '' || seenIds.has(nodeId)) continue;

      seenIds.add(nodeId);
      found.push({ id: nodeId, name: label });
    }

    logger.info(`Found ${found.length} top-level categories`);
    return found;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error('Failed to fetch categories', { error: message });
    return [];
  }
}
|
||
|
||
/**
 * Collect audiobooks for one Audible category via the search page.
 *
 * Requests `/search` with `node={categoryId}`, `pageSize=AUDIBLE_PAGE_SIZE`
 * and `sort=popularity-rank`, page by page, until one of the following:
 *  - `limit` unique books have been collected,
 *  - `ceil(limit / AUDIBLE_PAGE_SIZE)` pages have been requested,
 *  - a page yields fewer than half a page of new books (treated as the end), or
 *  - a page request/parse fails (logged; books gathered so far are returned).
 *
 * Between pages the method waits for the delay suggested by the pacer.
 *
 * @param categoryId Audible category node id.
 * @param limit Maximum number of books to return (default 200).
 * @returns Unique books (by ASIN) in the order encountered. `rating` is
 *   `undefined` when no numeric rating could be read.
 */
async getCategoryBooks(categoryId: string, limit: number = 200): Promise<AudibleAudiobook[]> {
  await this.initialize();

  logger.info(`Fetching category books for node ${categoryId} (limit: ${limit})...`);

  const audiobooks: AudibleAudiobook[] = [];
  const seenAsins = new Set<string>(); // O(1) duplicate detection across pages
  const maxPages = Math.ceil(limit / AUDIBLE_PAGE_SIZE);
  let page = 1;
  let pagesFetched = 0; // pages actually retrieved, for accurate reporting

  // Reads the leading number of a rating label, accepting "4.5" as well as
  // the comma-decimal form "4,5"; anything non-numeric yields undefined.
  const parseRating = (text: string): number | undefined => {
    const firstToken = text.trim().split(/\s+/)[0] ?? '';
    const value = parseFloat(firstToken.replace(',', '.'));
    return Number.isFinite(value) ? value : undefined;
  };

  this.pacer.reset();

  while (audiobooks.length < limit && page <= maxPages) {
    try {
      const { data: response, meta } = await this.fetchWithRetry('/search', {
        params: {
          ipRedirectOverride: 'true',
          node: categoryId,
          pageSize: AUDIBLE_PAGE_SIZE,
          sort: 'popularity-rank',
          ...(page > 1 ? { page } : {}),
        },
      });
      pagesFetched++;

      const $ = cheerio.load(response.data);
      // Resolved once per page instead of once per result element.
      const langConfig = this.getLangConfig();
      let foundOnPage = 0;

      // Parse search results — same selectors as search()
      $('.s-result-item, .productListItem').each((_index, element) => {
        if (audiobooks.length >= limit) return false;
        const $el = $(element);

        const asin =
          $el.find('li').attr('data-asin') ||
          $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          '';
        if (!asin || seenAsins.has(asin)) return;

        const title =
          $el.find('h2').first().text().trim() ||
          $el.find('h3 a').text().trim() ||
          $el.find('.bc-heading a').text().trim();

        const authorLink = $el.find('a[href*="/author/"]').first();
        const authorText =
          authorLink.text().trim() ||
          $el.find('.authorLabel').text().trim();
        const authorHref = authorLink.attr('href') || '';
        const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

        const narratorText =
          $el.find('a[href*="searchNarrator="]').first().text().trim() ||
          $el.find('.narratorLabel').text().trim();

        const coverArtUrl = $el.find('img').attr('src') || '';

        const runtimeText =
          $el.find('.runtimeLabel').text().trim() ||
          $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
        const durationMinutes = this.parseRuntime(runtimeText);

        const ratingText =
          $el.find('.ratingsLabel').text().trim() ||
          $el.find('.a-icon-star span').first().text().trim();
        const rating = ratingText ? parseRating(ratingText) : undefined;

        seenAsins.add(asin);
        audiobooks.push({
          asin,
          title,
          author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
          authorAsin: authorAsinMatch?.[1] || undefined,
          narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
          // Swap the image size modifier for a 500px variant
          coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
          durationMinutes,
          rating,
        });

        foundOnPage++;
      });

      logger.info(`Category ${categoryId}: found ${foundOnPage} books on page ${page}`);

      // Fewer than half a page of new books: assume the listing is exhausted.
      if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) break;

      page++;

      // Only pace when another request is actually going to be made.
      if (page <= maxPages && audiobooks.length < limit) {
        await this.delay(this.pacer.reportPageResult(meta));
      }
    } catch (error) {
      logger.error(`Failed to fetch category ${categoryId} page ${page}`, {
        error: error instanceof Error ? error.message : String(error),
        collectedSoFar: audiobooks.length,
      });
      break;
    }
  }

  logger.info(`Category ${categoryId}: collected ${audiobooks.length} books across ${pagesFetched} pages`);
  return audiobooks;
}
|
||
|
||
/**
 * Pause for the given number of milliseconds.
 * Used to space out consecutive page requests.
 *
 * @param ms How long to wait, in milliseconds.
 */
private async delay(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||
}
|
||
|
||
// Module-wide shared instance; stays null until the accessor below is first called
let audibleService: AudibleService | null = null;

/**
 * Return the shared AudibleService, constructing it on first use.
 * Every later call hands back that same instance.
 */
export function getAudibleService(): AudibleService {
  audibleService = audibleService ?? new AudibleService();
  return audibleService;
}
|