/**
 * Component: Audible Integration Service (Web Scraping)
 * Documentation: documentation/integrations/audible.md
 */

import axios, { AxiosInstance } from 'axios';
import * as cheerio from 'cheerio';
import { RMABLogger } from '../utils/logger';
import { getConfigService } from '../services/config.service';
import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible';
import {
  getLanguageForRegion,
  stripPrefixes,
  buildContainsSelector,
  extractByPatterns,
  isAcceptedLanguage,
  type LanguageConfig,
} from '../constants/language-config';
import {
  pickUserAgent,
  getBrowserHeaders,
  jitteredBackoff,
  AdaptivePacer,
  FetchResultMeta,
} from '../utils/scrape-resilience';
import { parseRuntime as parseRuntimeUtil } from '../utils/parse-runtime';

// Module-level logger
const logger = RMABLogger.create('Audible');

/**
 * Audible supports a pageSize query parameter (default ~20).
 * Using 50 significantly reduces the number of HTTP requests needed
 * for bulk operations like popular/new-release refreshes and search.
 */
const AUDIBLE_PAGE_SIZE = 50;

/**
 * One audiobook as produced by the scrapers / Audnexus lookup in this file.
 * Only `asin`, `title` and `author` are always present; list scrapers fill a
 * subset of the optional fields, detail lookups fill more of them.
 */
export interface AudibleAudiobook {
  /** Product identifier; the scrapers match it as 10 characters of [A-Z0-9]. */
  asin: string;
  title: string;
  /** Author display text (multiple authors are joined with ", " by the detail lookups). */
  author: string;
  authorAsin?: string;
  narrator?: string;
  description?: string;
  coverArtUrl?: string;
  durationMinutes?: number;
  releaseDate?: string;
  rating?: number;
  genres?: string[];
  series?: string;
  seriesPart?: string;
  seriesAsin?: string;
}

/** Result of a single page of keyword search. */
export interface AudibleSearchResult {
  query: string;
  results: AudibleAudiobook[];
  /** Total hits parsed from the results page; 0 when it could not be parsed. */
  totalResults: number;
  page: number;
  hasMore: boolean;
}

/** Result of a single page of author-scoped search. */
export interface AuthorBooksResult {
  books: AudibleAudiobook[];
  hasMore: boolean;
  page: number;
  totalResults: number;
}

export class AudibleService {
  // Assigned in initialize(); `!` because construction is deferred until first use.
  private client!: AxiosInstance;
  private baseUrl: string = 'https://www.audible.com';
  private region: AudibleRegion = 'us';
  private initialized: boolean = false;
  private sessionUserAgent: string = '';
  private pacer: AdaptivePacer = new AdaptivePacer();

  constructor() {
    // Client will be created lazily on first use
  }

  /**
   * Get the current Audible base URL for the configured region
   */
  public getBaseUrl(): string {
    return this.baseUrl;
  }

  /**
   * Get the current Audible region code
   */
  public getRegion(): AudibleRegion {
    return this.region;
  }

  /**
   * Public fetch wrapper for external scraping modules (e.g. audible-series.ts).
   * Ensures the service is initialized and delegates to fetchWithRetry.
*/
  public async fetch(url: string, config: any = {}): Promise<{ data: any; meta: FetchResultMeta }> {
    await this.initialize();
    return this.fetchWithRetry(url, config);
  }

  /**
   * Get the language config for the current region
   */
  private getLangConfig(): LanguageConfig {
    return getLanguageForRegion(this.region);
  }

  /**
   * Force re-initialization (used when region config changes).
   * Only clears the flag; the client is rebuilt by the next call that awaits initialize().
   */
  public forceReinitialize(): void {
    logger.info('Force re-initializing AudibleService');
    this.initialized = false;
  }

  /**
   * Point the service at `region`: resolve its base URL and language config,
   * pick a fresh session user agent, reset the pacer and build the axios client.
   * Marks the service initialized on success; throws if the region lookup fails.
   */
  private applyRegion(region: AudibleRegion): void {
    // Resolve everything that can throw before mutating instance state.
    const baseUrl = AUDIBLE_REGIONS[region].baseUrl;
    const langConfig = getLanguageForRegion(region);

    this.region = region;
    this.baseUrl = baseUrl;
    this.sessionUserAgent = pickUserAgent();
    this.pacer.reset();

    // Create axios client with region-specific base URL and realistic browser headers
    this.client = axios.create({
      baseURL: this.baseUrl,
      timeout: 15000,
      headers: getBrowserHeaders(this.sessionUserAgent),
      params: {
        ipRedirectOverride: 'true', // Prevent IP-based region redirects
        language: langConfig.scraping.audibleLocaleParam, // Force locale (prevents IP-based language serving)
      },
    });
    this.initialized = true;
  }

  /**
   * Initialize service with configured region.
   * Lazy initialization allows async config loading.
   * Automatically re-initializes if the configured region has changed.
   *
   * Never throws: if the region cannot be read on first use the service falls
   * back to DEFAULT_AUDIBLE_REGION; if it cannot be read on a later call the
   * existing configuration is kept.
   */
  private async initialize(): Promise<void> {
    let targetRegion: AudibleRegion;
    try {
      targetRegion = await getConfigService().getAudibleRegion();
    } catch (error) {
      const detail = { error: error instanceof Error ? error.message : String(error) };
      if (this.initialized) {
        // A working client already exists; a failed config read should not break every request.
        logger.warn('Failed to check Audible region, keeping current configuration', detail);
        return;
      }
      logger.error('Failed to initialize AudibleService', detail);
      // Fallback to default region
      this.applyRegion(DEFAULT_AUDIBLE_REGION);
      return;
    }

    if (this.initialized) {
      if (targetRegion === this.region) {
        return; // Region unchanged, use existing initialization
      }
      logger.info(`Region changed from ${this.region} to ${targetRegion}, re-initializing`);
      this.initialized = false;
    }

    try {
      this.applyRegion(targetRegion);
      logger.info(`Initializing Audible service with region: ${this.region} (${this.baseUrl})`);
    } catch (error) {
      logger.error('Failed to initialize AudibleService', {
        error: error instanceof Error ? error.message : String(error),
      });
      // Fallback to default region
      this.applyRegion(DEFAULT_AUDIBLE_REGION);
    }
  }

  /**
   * Fetch with retry logic and jittered exponential backoff.
   *
   * @param url - Path or URL passed to the region-scoped axios client.
   * @param config - Axios request config forwarded unchanged.
   * @param maxRetries - Number of retries after the first attempt.
   * @returns `data` is the full axios response (callers read `response.data`);
   *          `meta` reports how many retries were used and whether a 503 was seen.
   * @throws The original error for non-retryable statuses, or the last error
   *         once all retries are exhausted.
   */
  private async fetchWithRetry(
    url: string,
    config: any = {},
    maxRetries: number = 5
  ): Promise<{ data: any; meta: FetchResultMeta }> {
    let lastError: Error | null = null;
    let retriesUsed = 0;
    let encountered503 = false;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        const response = await this.client.get(url, config);
        return { data: response, meta: { retriesUsed, encountered503 } };
      } catch (error: any) {
        lastError = error;
        const status = error.response?.status;
        // No status means a network-level failure; 429 and any 5xx are treated as transient.
        const isRetryable = !status || status === 503 || status === 429 || status >= 500;

        if (status === 503) encountered503 = true;

        // Don't retry on 404, 403, etc.
        if (!isRetryable) {
          throw error;
        }

        // Don't retry on last attempt
        if (attempt === maxRetries) {
          break;
        }

        retriesUsed++;

        // Jittered exponential backoff instead of predictable doubling
        const backoffMs = jitteredBackoff(attempt);
        logger.info(` Request failed (${status || 'network error'}), retrying in ${backoffMs}ms (attempt ${attempt + 1}/${maxRetries})...`);
        await this.delay(backoffMs);
      }
    }

    // All retries exhausted
    throw lastError || new Error('Request failed after retries');
  }

  /**
   * External API fetch with retry logic and exponential backoff.
   * Used for Audnexus and other external APIs; goes through the global axios
   * instance rather than the region-scoped client.
   *
   * @returns The axios response of the first successful attempt.
   * @throws The original error for non-retryable responses, or the last error
   *         once all retries are exhausted.
   */
  private async externalFetchWithRetry(
    url: string,
    config: any = {},
    maxRetries: number = 3
  ): Promise<any> {
    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await axios.get(url, config);
      } catch (error: any) {
        lastError = error;
        const status = error.response?.status;
        const isRetryable = !status || status === 503 || status === 429 || status >= 500;

        // Don't retry on 404, 403, etc.
        if (!isRetryable) {
          throw error;
        }

        // Don't retry on deterministic 500 errors (e.g. "Release date is in the future")
        if (status === 500) {
          const rawMessage = error.response?.data?.message;
          // The payload is untyped; only inspect it when it is actually a string.
          const message = typeof rawMessage === 'string' ? rawMessage : '';
          if (message.includes('Release date is in the future')) {
            logger.info(` External API returned non-retryable error: ${message}`);
            throw error;
          }
        }

        // Don't retry on last attempt
        if (attempt === maxRetries) {
          break;
        }

        // Exponential backoff: 2^attempt * 1000ms (1s, 2s, 4s...)
        const backoffMs = Math.pow(2, attempt) * 1000;
        logger.info(` External API request failed (${status || 'network error'}), retrying in ${backoffMs}ms (attempt ${attempt + 1}/${maxRetries})...`);
        await this.delay(backoffMs);
      }
    }

    // All retries exhausted
    throw lastError || new Error('External API request failed after retries');
  }

  /**
   * Shared pagination + parsing loop for `.productListItem` listing pages
   * (best sellers, new releases).
   *
   * Stops when `limit` items are collected, when a page yields fewer than half
   * a page of new items, or on the first page that fails to load (returning
   * whatever was collected so far).
   *
   * @param options.path - Listing path requested from the region client.
   * @param options.label - Human-readable name used in log messages.
   * @param options.limit - Maximum number of audiobooks to return.
   * @param options.narratorFallback - When true, fall back to the second bold
   *        small-text element if no `.narratorLabel` is present.
   */
  private async fetchProductListing(options: {
    path: string;
    label: string;
    limit: number;
    narratorFallback: boolean;
  }): Promise<AudibleAudiobook[]> {
    const { path, label, limit, narratorFallback } = options;

    logger.info(` Fetching ${label} (limit: ${limit})...`);

    const audiobooks: AudibleAudiobook[] = [];
    const seenAsins = new Set<string>();
    const maxPages = Math.ceil(limit / AUDIBLE_PAGE_SIZE);
    let page = 1;
    let pagesFetched = 0;

    this.pacer.reset();

    while (audiobooks.length < limit && page <= maxPages) {
      try {
        logger.info(` Fetching page ${page}/${maxPages}...`);

        const { data: response, meta } = await this.fetchWithRetry(path, {
          params: {
            ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
            pageSize: AUDIBLE_PAGE_SIZE,
            ...(page > 1 ? { page } : {}),
          },
        });

        const $ = cheerio.load(response.data);
        const langConfig = this.getLangConfig();
        let foundOnPage = 0;

        $('.productListItem').each((_index, element) => {
          // Returning false stops the cheerio iteration once the limit is reached.
          if (audiobooks.length >= limit) return false;

          const $el = $(element);

          // Extract ASIN from data attribute or link - handle both /pd/ and /ac/ URLs
          const asin =
            $el.find('li').attr('data-asin') ||
            $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
            '';

          if (!asin) return;

          // Skip duplicates
          if (seenAsins.has(asin)) return;
          seenAsins.add(asin);

          const title =
            $el.find('h3 a').text().trim() ||
            $el.find('.bc-heading a').text().trim();

          const authorText =
            $el.find('.authorLabel').text().trim() ||
            $el.find('.bc-size-small .bc-text-bold').first().text().trim();

          // Extract author ASIN from author link if available
          const authorHref = $el.find('a[href*="/author/"]').first().attr('href') || '';
          const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

          const narratorText =
            $el.find('.narratorLabel').text().trim() ||
            (narratorFallback ? $el.find('.bc-size-small .bc-text-bold').eq(1).text().trim() : '');

          const coverArtUrl = $el.find('img').attr('src') || '';

          const ratingText = $el.find('.ratingsLabel').text().trim();
          const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;

          audiobooks.push({
            asin,
            title,
            author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
            authorAsin: authorAsinMatch?.[1] || undefined,
            narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
            // Swap the image size token so the cover is requested at 500px.
            coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
            rating,
          });

          foundOnPage++;
        });

        pagesFetched++;
        logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);

        // If we got significantly fewer than requested, probably no more pages
        if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
          logger.info(` Reached end of available pages`);
          break;
        }

        page++;

        // Adaptive delay between pages based on retry pressure
        if (page <= maxPages && audiobooks.length < limit) {
          await this.delay(this.pacer.reportPageResult(meta));
        }
      } catch (error) {
        logger.error(`Failed to fetch page ${page} of ${label}`, {
          error: error instanceof Error ? error.message : String(error),
          collectedSoFar: audiobooks.length,
        });
        // Stop pagination on error, but return what we collected
        break;
      }
    }

    logger.info(` Found ${audiobooks.length} ${label} across ${pagesFetched} pages`);
    return audiobooks;
  }

  /**
   * Get popular audiobooks from best sellers (with pagination support)
   *
   * @param limit - Maximum number of audiobooks to return.
   * @returns Up to `limit` audiobooks; fewer (possibly none) if a page fails or the list ends.
   */
  async getPopularAudiobooks(limit: number = 20): Promise<AudibleAudiobook[]> {
    await this.initialize();
    return this.fetchProductListing({
      path: '/adblbestsellers',
      label: 'popular audiobooks',
      limit,
      narratorFallback: true,
    });
  }

  /**
   * Get new release audiobooks (with pagination support)
   *
   * @param limit - Maximum number of audiobooks to return.
   * @returns Up to `limit` audiobooks; fewer (possibly none) if a page fails or the list ends.
   */
  async getNewReleases(limit: number = 20): Promise<AudibleAudiobook[]> {
    await this.initialize();
    return this.fetchProductListing({
      path: '/newreleases',
      label: 'new releases',
      limit,
      narratorFallback: false,
    });
  }

  /**
   * Search for audiobooks
   *
   * @param query - Keywords sent as Audible's `keywords` parameter.
   * @param page - 1-based results page.
   * @returns The parsed page; on any failure an empty result with `hasMore: false`.
   */
  async search(query: string, page: number = 1): Promise<AudibleSearchResult> {
    await this.initialize();

    try {
      logger.info(` Searching for "${query}"...`);

      const { data: response } = await this.fetchWithRetry('/search', {
        params: {
          ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
          keywords: query,
          pageSize: AUDIBLE_PAGE_SIZE,
          page,
        },
      });

      const $ = cheerio.load(response.data);
      const langConfig = this.getLangConfig();
      const audiobooks: AudibleAudiobook[] = [];

      // Parse search results - Audible uses s-result-item for search pages
      $('.s-result-item, .productListItem').each((_index, element) => {
        const $el = $(element);

        // Extract ASIN from product detail link - handle both /pd/ and /ac/ URLs
        const asin =
          $el.find('li').attr('data-asin') ||
          $el.find('a[href*="/pd/"]').attr('href')?.match(/\/pd\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          $el.find('a[href*="/ac/"]').attr('href')?.match(/\/ac\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          '';

        if (!asin) return;

        // Extract title from h2 tag (search results) or h3 (legacy)
        const title =
          $el.find('h2').first().text().trim() ||
          $el.find('h3 a').text().trim() ||
          $el.find('.bc-heading a').text().trim();

        // Extract author from author link
        const authorLink = $el.find('a[href*="/author/"]').first();
        const authorText =
          authorLink.text().trim() ||
          $el.find('.authorLabel').text().trim() ||
          $el.find('.bc-size-small .bc-text-bold').first().text().trim();

        // Extract author ASIN from author link href
        const authorHref = authorLink.attr('href') || '';
        const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

        // Extract narrator from narrator search link
        const narratorText =
          $el.find('a[href*="searchNarrator="]').first().text().trim() ||
          $el.find('.narratorLabel').text().trim();

        const coverArtUrl = $el.find('img').attr('src') || '';

        // Extract runtime/duration
        const runtimeText =
          $el.find('.runtimeLabel').text().trim() ||
          $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
        const durationMinutes = this.parseRuntime(runtimeText);

        // Extract rating
        const ratingText =
          $el.find('.ratingsLabel').text().trim() ||
          $el.find('.a-icon-star span').first().text().trim();
        const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;

        audiobooks.push({
          asin,
          title,
          author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
          authorAsin: authorAsinMatch?.[1] || undefined,
          narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
          coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
          durationMinutes,
          rating,
        });
      });

      // Try to extract total results count
      const resultsText = $('.resultsInfo').text().trim();
      const totalResults = parseInt(resultsText.match(/of ([\d,]+)/)?.[1]?.replace(/,/g, '') || '0', 10);

      logger.info(` Found ${audiobooks.length} results for "${query}"`);

      return {
        query,
        results: audiobooks,
        totalResults,
        page,
        // Prefer the parsed total; without it, a full page suggests another page exists.
        hasMore:
          audiobooks.length > 0 &&
          (totalResults > 0
            ? totalResults > page * AUDIBLE_PAGE_SIZE
            : audiobooks.length >= AUDIBLE_PAGE_SIZE),
      };
    } catch (error) {
      logger.error('Search failed', { error: error instanceof Error ? error.message : String(error) });
      return {
        query,
        results: [],
        totalResults: 0,
        page,
        hasMore: false,
      };
    }
  }

  /**
   * Search one page of books by a specific author, validated by ASIN.
   * Uses Audible's searchAuthor parameter; `page` selects the results page and
   * the returned `hasMore` tells the caller whether to request the next one.
   * Filters: (1) author link must contain the target ASIN, (2) the item's language
   * label must be accepted for the configured region (isAcceptedLanguage).
*/
  async searchByAuthorAsin(authorName: string, authorAsin: string, page: number = 1): Promise<AuthorBooksResult> {
    await this.initialize();

    const books: AudibleAudiobook[] = [];
    const seenAsins = new Set<string>();

    try {
      logger.info(`Searching books by author "${authorName}" (ASIN: ${authorAsin}), page ${page}...`);

      const { data: response } = await this.fetchWithRetry('/search', {
        params: {
          ipRedirectOverride: 'true',
          searchAuthor: authorName,
          pageSize: AUDIBLE_PAGE_SIZE,
          page,
        },
      });

      const $ = cheerio.load(response.data);
      const resultItems = $('.s-result-item, .productListItem');

      // Count raw items on page before filtering (for hasMore fallback)
      const pageItemCount = resultItems.length;

      // Loop-invariant: language config and the "<label> <language>" matcher are built once per page.
      const langConfig = this.getLangConfig();
      const escapedLabels = langConfig.scraping.languageLabels
        .map(l => l.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('|');
      const langLabelPattern = new RegExp(`(?:${escapedLabels})\\s*(.+)`, 'i');

      resultItems.each((_index, element) => {
        const $el = $(element);

        // --- Language filter: require matching language for region ---
        const langText =
          $el.find(buildContainsSelector('span', langConfig.scraping.languageLabels)).text().trim() ||
          $el.find('.languageLabel').text().trim();
        const language = langText.match(langLabelPattern)?.[1]?.trim();
        // Items with no parseable language label are dropped, not assumed acceptable.
        if (!language || !isAcceptedLanguage(language, langConfig)) return;

        // --- Author ASIN filter: verify target ASIN in author links ---
        const authorLinks = $el.find('a[href*="/author/"]');
        let hasMatchingAuthor = false;
        authorLinks.each((_i, link) => {
          const href = $(link).attr('href') || '';
          const asinMatch = href.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
          if (asinMatch && asinMatch[1] === authorAsin) {
            hasMatchingAuthor = true;
            return false; // break .each()
          }
        });
        if (!hasMatchingAuthor) return;

        // --- Extract book ASIN ---
        const bookAsin =
          $el.find('li').attr('data-asin') ||
          $el.find('a[href*="/pd/"]').attr('href')?.match(/\/pd\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          $el.find('a[href*="/ac/"]').attr('href')?.match(/\/ac\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
          '';
        if (!bookAsin || seenAsins.has(bookAsin)) return;
        seenAsins.add(bookAsin);

        // --- Parse book details ---
        const title =
          $el.find('h2').first().text().trim() ||
          $el.find('h3 a').text().trim() ||
          $el.find('.bc-heading a').text().trim();

        const authorText =
          $el.find('a[href*="/author/"]').first().text().trim() ||
          $el.find('.authorLabel').text().trim() ||
          $el.find('.bc-size-small .bc-text-bold').first().text().trim();

        const narratorText =
          $el.find('a[href*="searchNarrator="]').first().text().trim() ||
          $el.find('.narratorLabel').text().trim();

        const coverArtUrl = $el.find('img').attr('src') || '';

        const runtimeText =
          $el.find('.runtimeLabel').text().trim() ||
          $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
        const durationMinutes = this.parseRuntime(runtimeText);

        const ratingText =
          $el.find('.ratingsLabel').text().trim() ||
          $el.find('.a-icon-star span').first().text().trim();
        const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;

        books.push({
          asin: bookAsin,
          title,
          author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
          // The item passed the ASIN filter above, so the requested ASIN is recorded as-is.
          authorAsin,
          narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
          coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
          durationMinutes,
          rating,
        });
      });

      // Check total results for pagination
      const resultsText = $('.resultsInfo').text().trim();
      const totalResults = parseInt(resultsText.match(/of ([\d,]+)/)?.[1]?.replace(/,/g, '') || '0', 10);

      // Use totalResults if available; otherwise fall back to whether Audible returned a full page
      // NOTE(review): the `books.length > 0` guard uses the *filtered* count, so a page whose
      // items are all rejected ends pagination even if Audible reports more results.
      // Confirm whether that early stop is intentional.
      const hasMore =
        books.length > 0 &&
        (totalResults > 0
          ? totalResults > page * AUDIBLE_PAGE_SIZE
          : pageItemCount >= AUDIBLE_PAGE_SIZE);

      logger.info(`Author books page ${page}: ${books.length} valid results (${totalResults} Audible total)`);

      return { books, hasMore, page, totalResults };
    } catch (error) {
      logger.error(`Author books search failed for "${authorName}"`, {
        error: error instanceof Error ? error.message : String(error),
      });
      // Return whatever was parsed before the failure, with pagination stopped.
      return { books, hasMore: false, page, totalResults: 0 };
    }
  }

  /**
   * Get detailed audiobook information
   * Primary: Audnexus API (reliable, structured data)
   * Fallback: Audible scraping
   *
   * @param asin - Product ASIN to look up.
   * @returns The audiobook details, or null when the lookup throws.
   */
  async getAudiobookDetails(asin: string): Promise<AudibleAudiobook | null> {
    await this.initialize();

    try {
      logger.info(` Fetching details for ASIN ${asin}...`);

      // Try Audnexus first (more reliable)
      const audnexusData = await this.fetchFromAudnexus(asin);
      if (audnexusData) {
        logger.info(` Successfully fetched from Audnexus for "${audnexusData.title}"`);
        return audnexusData;
      }

      logger.info(` Audnexus failed, falling back to Audible scraping...`);

      // Fallback to Audible scraping
      return await this.scrapeAudibleDetails(asin);
    } catch (error) {
      logger.error(`Failed to fetch details for ${asin}`, { error: error instanceof Error ? error.message : String(error) });
      return null;
    }
  }

  /**
   * Fetch audiobook details from Audnexus API
   *
   * @param asin - Product ASIN to look up.
   * @returns The mapped audiobook, or null on any error (404s are logged at debug level,
   *          everything else as a warning).
   */
  private async fetchFromAudnexus(asin: string): Promise<AudibleAudiobook | null> {
    try {
      const audnexusRegion = AUDIBLE_REGIONS[this.region].audnexusParam;
      logger.debug(`Fetching ASIN from Audnexus: ${asin} (region: ${audnexusRegion})`);

      const response = await this.externalFetchWithRetry(`https://api.audnex.us/books/${asin}`, {
        params: {
          region: audnexusRegion, // Pass region parameter to Audnexus
        },
        timeout: 10000,
        headers: {
          'User-Agent': 'ReadMeABook/1.0',
        },
      });

      const data = response.data;

      // Build result from Audnexus data
      const result: AudibleAudiobook = {
        asin,
        title: data.title || '',
        author: data.authors?.map((a: any) => a.name).join(', ') || '',
        authorAsin: data.authors?.[0]?.asin || undefined,
        narrator: data.narrators?.map((n: any) => n.name).join(', ') || '',
        description: data.description || data.summary || '',
        coverArtUrl: data.image || '',
        durationMinutes: data.runtimeLengthMin ? parseInt(data.runtimeLengthMin, 10) : undefined,
        releaseDate: data.releaseDate || undefined,
        rating: data.rating ? parseFloat(data.rating) : undefined,
        // Genres may arrive as plain strings or as objects with a `name`; keep at most five.
        genres: data.genres?.map((g: any) => typeof g === 'string' ? g : g.name).slice(0, 5) || undefined,
        series: data.seriesPrimary?.name || undefined,
        seriesPart: data.seriesPrimary?.position || undefined,
        seriesAsin: data.seriesPrimary?.asin || undefined,
      };

      // Ensure cover art URL is high quality
      if (result.coverArtUrl && !result.coverArtUrl.includes('_SL500_')) {
        result.coverArtUrl = result.coverArtUrl.replace(/\._.*_\./, '._SL500_.');
      }

      logger.debug('Audnexus success', {
        title: result.title,
        author: result.author,
        narrator: result.narrator,
        descLength: result.description?.length || 0,
        duration: result.durationMinutes,
        rating: result.rating,
        genreCount: result.genres?.length || 0,
        series: result.series,
        seriesPart: result.seriesPart,
        seriesAsin: result.seriesAsin
      });

      return result;
    } catch (error: any) {
      if (error.response?.status === 404) {
        logger.debug(`Book not found (404) on Audnexus for ASIN ${asin}`);
      } else {
        logger.warn(`Error fetching from Audnexus for ASIN ${asin}`, { error: error.message });
      }
      return null;
    }
  }

  /**
   * Scrape audiobook details from Audible (fallback method)
   */
  // NOTE(review): the return type argument was missing in this copy of the file;
  // `AudibleAudiobook | null` is reconstructed from how getAudiobookDetails uses
  // the result - confirm against the rest of this method.
  private async scrapeAudibleDetails(asin: string): Promise<AudibleAudiobook | null> {
    try {
      const { data: response } = await this.fetchWithRetry(`/pd/${asin}`, {
        params: {
          ipRedirectOverride: 'true', // Explicitly include to prevent IP-based region redirects
        },
      });

      const $ = cheerio.load(response.data);

      // Initialize result object
      let result: AudibleAudiobook = {
        asin,
        title: '',
        author: '',
        narrator: '',
        description: '',
        coverArtUrl: '',
      };

      // Debug: Save HTML in development
      const isDev = process.env.NODE_ENV === 'development';
      if (isDev) {
        const fs = require('fs');
        const path = require('path');
        const debugPath = path.join('/tmp', `audible-${asin}.html`);
        fs.writeFileSync(debugPath, response.data);
        logger.info(` Saved HTML to ${debugPath} for debugging`);
      }

      // Try to extract JSON-LD structured data first
      const jsonLdScripts = $('script[type="application/ld+json"]');
      logger.info(` Found ${jsonLdScripts.length} JSON-LD script tags`);

      jsonLdScripts.each((i, elem) => {
        try {
          const jsonData = JSON.parse($(elem).html() || '{}');
          logger.info(` JSON-LD ${i} type:`, jsonData['@type']);

          if (jsonData['@type'] === 'Book' || jsonData['@type'] === 'Audiobook' || jsonData['@type'] === 'Product') {
            logger.debug('Found valid JSON-LD structured data');

            if (jsonData.name) result.title = jsonData.name;
            if (jsonData.author) {
              result.author = Array.isArray(jsonData.author)
                ? jsonData.author.map((a: any) => a.name || a).join(', ')
                : jsonData.author?.name || jsonData.author || '';
            }
            if (jsonData.readBy) {
              result.narrator = Array.isArray(jsonData.readBy)
                ? jsonData.readBy.map((n: any) => n.name || n).join(', ')
                : jsonData.readBy?.name || jsonData.readBy || '';
            }
            if (jsonData.description) result.description = jsonData.description;
            if (jsonData.image) result.coverArtUrl = jsonData.image;
            if (jsonData.aggregateRating?.ratingValue) result.rating = jsonData.aggregateRating.ratingValue;
            if (jsonData.datePublished) result.releaseDate = jsonData.datePublished;
            if (jsonData.duration) {
              const durationMatch = jsonData.duration.match(/PT(\d+)H(\d+)M/);
              if (durationMatch) {
                result.durationMinutes = parseInt(durationMatch[1]) * 60 + parseInt(durationMatch[2]);
              }
            }
          }
        } catch (e) {
          logger.debug(`JSON-LD ${i} parsing failed`, { error: e instanceof Error ? e.message : String(e) });
        }
      });

      // Fallback to HTML parsing for any missing fields
      // Title - try multiple selectors
      if (!result.title) {
        result.title =
          $('h1.bc-heading').first().text().trim() ||
          $('h1[class*="heading"]').first().text().trim() ||
          $('.bc-container h1').first().text().trim() ||
          $('h1').first().text().trim();
        logger.info(` Title from HTML: "${result.title}"`);
      }

      // Author - try multiple approaches (only in product details area)
      if (!result.author) {
        // Look specifically in the product details section, not the whole page
        const productSection = $('.bc-section, .product-top-section, [class*="product"]').first();

        const authors: string[] = [];

        // First try labeled author sections
        productSection.find('li.authorLabel a, span.authorLabel a, .authorLabel a').each((_, elem) => {
          const text = $(elem).text().trim();
          if (text && text.length > 0 && text.length < 80) {
            authors.push(text);
          }
        });

        // If no labeled authors, look for author links near the title (first 3 only to avoid recommendations)
        if (authors.length === 0) {
          $('a[href*="/author/"]').slice(0, 3).each((_, elem) => {
            const text = $(elem).text().trim();
            // Filter out navigation breadcrumbs and promotional text
            if (text && text.length > 1 && text.length < 80 &&
                !text.includes('›') && !text.includes('...') &&
                !text.toLowerCase().includes('more') &&
                !text.toLowerCase().includes('see all')) {
              authors.push(text);
            }
          });
        }

        if (authors.length > 0) {
          // Deduplicate and limit to max 3 authors
          result.author = [...new Set(authors)].slice(0, 3).join(', ');
        }

        const authorLangConfig = this.getLangConfig();
        result.author = stripPrefixes(result.author, authorLangConfig.scraping.authorPrefixes);
        logger.info(` Author from HTML: "${result.author}"`);
      }

      // Author ASIN - extract from the first author link
      if (!result.authorAsin) {
        const firstAuthorHref = $('a[href*="/author/"]').first().attr('href') || '';
        const authorAsinMatch = firstAuthorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
        if (authorAsinMatch) {
result.authorAsin = authorAsinMatch[1]; } } // Narrator - try multiple approaches (only in product details area) if (!result.narrator) { // Look specifically in the product details section const productSection = $('.bc-section, .product-top-section, [class*="product"]').first(); const narrators: string[] = []; // First try labeled narrator sections productSection.find('li.narratorLabel a, span.narratorLabel a, .narratorLabel a').each((_, elem) => { const text = $(elem).text().trim(); if (text && text.length > 0 && text.length < 80) { narrators.push(text); } }); // If no labeled narrators, look for narrator links (first 5 only) if (narrators.length === 0) { $('a[href*="/narrator/"]').slice(0, 5).each((_, elem) => { const text = $(elem).text().trim(); if (text && text.length > 1 && text.length < 80 && !text.includes('›') && !text.includes('...')) { narrators.push(text); } }); } if (narrators.length > 0) { // Deduplicate and limit to reasonable count result.narrator = [...new Set(narrators)].slice(0, 5).join(', '); } if (result.narrator) { const detailLangConfig = this.getLangConfig(); result.narrator = stripPrefixes(result.narrator, detailLangConfig.scraping.narratorPrefixes); } logger.info(` Narrator from HTML: "${result.narrator || ''}"`); } // Description - try multiple approaches with strict filtering if (!result.description) { const descLangConfig = this.getLangConfig(); const excludePatterns = descLangConfig.scraping.descriptionExcludePatterns; const isValidDescription = (text: string): boolean => { if (!text || text.length < 50 || text.length > 5000) return false; // Reject if it contains promotional patterns for (const pattern of excludePatterns) { if (pattern.test(text)) return false; } return true; }; // Try specific description selectors first const candidates = [ $('.bc-expander-content').first().text().trim(), $('[class*="productPublisherSummary"]').first().text().trim(), $('[data-widget="publisherSummary"]').first().text().trim(), $('.bc-section 
p').first().text().trim(), ]; // Find first valid candidate for (const candidate of candidates) { if (isValidDescription(candidate)) { result.description = candidate; break; } } // If still no description, search for valid paragraphs if (!result.description) { $('p, div[class*="description"]').each((_, elem) => { const text = $(elem).text().trim(); if (isValidDescription(text) && text.length > (result.description?.length || 0)) { result.description = text; } }); } logger.info(` Description length: ${result.description?.length || 0} chars`); } // Cover art - try multiple selectors if (!result.coverArtUrl) { result.coverArtUrl = $('img.bc-image-inset-border').attr('src') || $('img[class*="product-image"]').first().attr('src') || $('img[class*="cover"]').first().attr('src') || $('.bc-pub-detail-image img').attr('src') || $('img[src*="images-na.ssl-images-amazon.com"]').first().attr('src') || $('img[src*="m.media-amazon.com"]').first().attr('src') || ''; if (result.coverArtUrl) { result.coverArtUrl = result.coverArtUrl.replace(/\._.*_\./, '._SL500_.'); } } // Runtime/Duration - try multiple approaches if (!result.durationMinutes) { const rtLangConfig = this.getLangConfig(); // Look for runtime text in various places const runtimeText = $('li.runtimeLabel span').text().trim() || $('.runtimeLabel').text().trim() || $(buildContainsSelector('span', rtLangConfig.scraping.lengthLabels)).parent().text().trim() || $(buildContainsSelector('li', rtLangConfig.scraping.lengthLabels)).text().trim() || (() => { // Look for any text matching duration pattern let found = ''; $('li, span, div').each((_, elem) => { const text = $(elem).text().trim(); if (text.match(rtLangConfig.scraping.durationDetectionPattern) && text.length < 100) { found = text; return false; // break } }); return found; })(); result.durationMinutes = this.parseRuntime(runtimeText); logger.info(` Duration from "${runtimeText}": ${result.durationMinutes} minutes`); } // Rating - try multiple approaches if 
(!result.rating) { const ratingLangConfig = this.getLangConfig(); const ratingText = $('.ratingsLabel').text().trim() || $('[class*="rating"]').first().text().trim() || $(`span:contains("${ratingLangConfig.scraping.ratingTextSelector}")`).parent().text().trim() || (() => { // Look for rating pattern using language-specific patterns let found = ''; $('span, div').each((_, elem) => { const text = $(elem).text().trim(); if (text.length < 50) { for (const pattern of ratingLangConfig.scraping.ratingPatterns) { if (pattern.test(text)) { found = text; return false; } } } }); return found; })(); if (ratingText) { let ratingValue: number | undefined; for (const pattern of ratingLangConfig.scraping.ratingPatterns) { const ratingMatch = ratingText.match(pattern); if (ratingMatch) { // Handle comma as decimal separator (e.g. "4,5" in German/Spanish) ratingValue = parseFloat(ratingMatch[1].replace(',', '.')); break; } } result.rating = ratingValue; } logger.info(` Rating from "${ratingText}": ${result.rating}`); } // Release date - try multiple selectors if (!result.releaseDate) { const rdLangConfig = this.getLangConfig(); const releaseDateText = $(buildContainsSelector('li', rdLangConfig.scraping.releaseDateLabels)).text().trim() || $(buildContainsSelector('span', rdLangConfig.scraping.releaseDateLabels)).parent().text().trim() || $('[class*="release"]').text().trim(); const dateMatch = extractByPatterns(releaseDateText, rdLangConfig.scraping.releaseDatePatterns) || releaseDateText.match(/(\w+ \d{1,2},? 
\d{4})/)?.[1]; if (dateMatch) { result.releaseDate = dateMatch.trim(); } logger.info(` Release date from "${releaseDateText}": ${result.releaseDate}`); } // Genres - try to extract categories const genres: string[] = []; $('a[href*="/cat/"]').each((_, el) => { const genre = $(el).text().trim(); if (genre && !genres.includes(genre) && genre.length < 50 && genre.length > 2) { genres.push(genre); } }); if (genres.length > 0) { result.genres = genres.slice(0, 5); // Limit to 5 genres logger.info(` Genres: ${result.genres.join(', ')}`); } logger.info(`Successfully fetched details for "${result.title}"`); logger.debug('Final result', { title: result.title, author: result.author, narrator: result.narrator, descLength: result.description?.length || 0, duration: result.durationMinutes, rating: result.rating, genreCount: result.genres?.length || 0 }); return result; } catch (error) { logger.error(`Failed to fetch details for ${asin}`, { error: error instanceof Error ? error.message : String(error) }); return null; } } /** * Parse runtime text to minutes using language-specific patterns. * Delegates to shared utility in src/lib/utils/parse-runtime.ts. 
*/
  private parseRuntime(runtimeText: string): number | undefined {
    return parseRuntimeUtil(runtimeText, this.getLangConfig());
  }

  /**
   * Get runtime (in minutes) for an audiobook by ASIN.
   * Lightweight method for size validation during search.
   *
   * Reads `runtimeLengthMin` from the Audnexus book endpoint for the
   * configured region, using a short 5 s timeout.
   *
   * @param asin - ASIN of the audiobook to look up.
   * @returns Runtime in whole minutes, or `null` when the value is missing,
   *          zero, not numeric, or the request fails. Never rejects.
   */
  async getRuntime(asin: string): Promise<number | null> {
    try {
      // Use Audnexus API for fast, reliable runtime data
      const audnexusRegion = AUDIBLE_REGIONS[this.region].audnexusParam;
      const response = await this.externalFetchWithRetry(`https://api.audnex.us/books/${asin}`, {
        params: { region: audnexusRegion },
        timeout: 5000, // Quick timeout for search performance
        headers: { 'User-Agent': 'ReadMeABook/1.0' },
      });

      const rawRuntime: unknown = response.data?.runtimeLengthMin;
      if (!rawRuntime) {
        return null;
      }

      // Explicit radix; a non-numeric payload must yield null rather than NaN.
      const minutes = Number.parseInt(String(rawRuntime), 10);
      return Number.isNaN(minutes) ? null : minutes;
    } catch (error: unknown) {
      // The thrown value is not guaranteed to be an Error, so read it defensively.
      const details = (error ?? {}) as { response?: { status?: number }; message?: string };
      // A 404 just means Audnexus does not know the title, so it is not logged.
      if (details.response?.status !== 404) {
        logger.debug(`Runtime fetch failed for ASIN ${asin}: ${details.message ?? String(error)}`);
      }
      return null;
    }
  }

  /**
   * Get top-level categories from Audible's categories page.
   * Scrapes {baseUrl}/categories and returns {id, name}[] for top-level nodes.
*/
  async getCategories(): Promise<{ id: string; name: string }[]> {
    await this.initialize();

    logger.info('Fetching Audible categories...');

    try {
      const { data: response } = await this.fetchWithRetry('/categories', {
        params: { ipRedirectOverride: 'true' },
      });

      const $ = cheerio.load(response.data);
      const categories: { id: string; name: string }[] = [];
      const seenIds = new Set<string>();

      // Top-level category links are in the main categories grid
      // They follow the pattern /cat/{name}/{nodeId}
      $('a[href*="/cat/"]').each((_index, element) => {
        const $el = $(element);
        const href = $el.attr('href') || '';
        const match = href.match(/\/cat\/[^\/]+\/(\d+)/);
        if (!match) return;

        const id = match[1];
        const name = $el.text().trim();

        // Keep the first named link for each node id
        if (name && !seenIds.has(id)) {
          seenIds.add(id);
          categories.push({ id, name });
        }
      });

      logger.info(`Found ${categories.length} top-level categories`);
      return categories;
    } catch (error) {
      logger.error('Failed to fetch categories', {
        error: error instanceof Error ? error.message : String(error),
      });
      return [];
    }
  }

  /**
   * Get audiobooks for a specific category using Audible search with node parameter.
   * Scrapes {baseUrl}/search?node={categoryId}&pageSize=50, up to `limit` results.
   *
   * Paging stops when `limit` is reached, when a page yields fewer than half
   * a page of new titles, or when a request fails; whatever was collected up
   * to that point is returned.
   *
   * @param categoryId - Audible category node id.
   * @param limit - Maximum number of books to collect (default 200).
   * @returns The collected books, de-duplicated by ASIN. Fetch errors are logged, not thrown.
   */
  async getCategoryBooks(categoryId: string, limit: number = 200): Promise<AudibleAudiobook[]> {
    await this.initialize();

    logger.info(`Fetching category books for node ${categoryId} (limit: ${limit})...`);

    const audiobooks: AudibleAudiobook[] = [];
    const seenAsins = new Set<string>();
    let page = 1;
    // Counted separately from `page`, which is not advanced when the loop exits via `break`.
    let pagesFetched = 0;
    const maxPages = Math.ceil(limit / AUDIBLE_PAGE_SIZE);
    this.pacer.reset();

    while (audiobooks.length < limit && page <= maxPages) {
      try {
        const { data: response, meta } = await this.fetchWithRetry('/search', {
          params: {
            ipRedirectOverride: 'true',
            node: categoryId,
            pageSize: AUDIBLE_PAGE_SIZE,
            sort: 'popularity-rank',
            ...(page > 1 ? { page } : {}),
          },
        });

        const $ = cheerio.load(response.data);
        // Looked up once per page instead of once per result element
        const langConfig = this.getLangConfig();
        let foundOnPage = 0;

        // Parse search results — same selectors as search()
        $('.s-result-item, .productListItem').each((_index, element) => {
          if (audiobooks.length >= limit) return false; // stop iterating once the limit is hit

          const $el = $(element);

          const asin =
            $el.find('li').attr('data-asin') ||
            $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
            '';

          if (!asin || seenAsins.has(asin)) return;

          const title =
            $el.find('h2').first().text().trim() ||
            $el.find('h3 a').text().trim() ||
            $el.find('.bc-heading a').text().trim();

          const authorLink = $el.find('a[href*="/author/"]').first();
          const authorText = authorLink.text().trim() || $el.find('.authorLabel').text().trim();
          const authorHref = authorLink.attr('href') || '';
          const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

          const narratorText =
            $el.find('a[href*="searchNarrator="]').first().text().trim() ||
            $el.find('.narratorLabel').text().trim();

          const coverArtUrl = $el.find('img').attr('src') || '';

          const runtimeText =
            $el.find('.runtimeLabel').text().trim() ||
            $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
          const durationMinutes = this.parseRuntime(runtimeText);

          const ratingText =
            $el.find('.ratingsLabel').text().trim() ||
            $el.find('.a-icon-star span').first().text().trim();
          // The first token is the score. Accept a comma decimal ("4,5") and
          // drop anything unparseable rather than storing NaN.
          const parsedRating = ratingText
            ? parseFloat((ratingText.split(' ')[0] ?? '').replace(',', '.'))
            : NaN;
          const rating = Number.isFinite(parsedRating) ? parsedRating : undefined;

          seenAsins.add(asin);
          audiobooks.push({
            asin,
            title,
            author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
            authorAsin: authorAsinMatch?.[1] || undefined,
            narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
            coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
            durationMinutes,
            rating,
          });

          foundOnPage++;
        });

        pagesFetched++;
        logger.info(`Category ${categoryId}: found ${foundOnPage} books on page ${page}`);

        // Fewer than half a page of new titles: stop paging
        if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) break;

        page++;

        if (page <= maxPages && audiobooks.length < limit) {
          await this.delay(this.pacer.reportPageResult(meta));
        }
      } catch (error) {
        logger.error(`Failed to fetch category ${categoryId} page ${page}`, {
          error: error instanceof Error ? error.message : String(error),
          collectedSoFar: audiobooks.length,
        });
        break;
      }
    }

    logger.info(`Category ${categoryId}: collected ${audiobooks.length} books across ${pagesFetched} pages`);
    return audiobooks;
  }

  /**
   * Add delay between requests to respect rate limits.
   *
   * @param ms - Time to wait, in milliseconds.
   */
  private async delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Singleton instance
let audibleService: AudibleService | null = null;

/**
 * Returns the shared AudibleService, creating it on first call.
 */
export function getAudibleService(): AudibleService {
  if (!audibleService) {
    audibleService = new AudibleService();
  }
  return audibleService;
}