Audible: HTML refresh, multi-narrator & works dedup

Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: an in-memory deduplicateAndCollectGroups pass followed by collapseByExistingWorks, which consults the works table; export metadataScore for consistent representative selection; and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and add defensive ASIN dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
2026-06-03 04:40:09 +00:00 · 2026-05-14 15:23:15 -04:00
parent 5f0855b2f8
commit fcae3bcf09
17 changed files with 1241 additions and 214 deletions
@@ -19,6 +19,7 @@ import {
 import { RMABLogger } from '../utils/logger';
 import { parseRuntime } from '../utils/parse-runtime';
 import { randomDelay } from '../utils/scrape-resilience';
+import { extractAllNarrators } from '../utils/extract-narrator';

 const logger = RMABLogger.create('Audible.Series');

@@ -442,10 +443,8 @@ function parseSeriesBooks(
    const authorHref = authorLink.attr('href') || '';
    const authorAsinMatch = authorHref.match(/\/author\/[^/]+\/([A-Z0-9]{10})/);

-    // Narrator
-    const narratorText = $el.find('a[href*="searchNarrator="]').first().text().trim() ||
-      $el.find('.narratorLabel').text().trim() ||
-      '';
+    // Narrator — capture all narrator links (multi-narrator productions are common)
+    const narratorText = extractAllNarrators($, $el);

    // Cover art
    const coverArtUrl = $el.find('img').first().attr('src')?.replace(/\._.*_\./, '._SL500_.') || '';
@@ -4,21 +4,26 @@
 */

 import axios, { AxiosInstance } from 'axios';
+import * as cheerio from 'cheerio';
 import { RMABLogger } from '../utils/logger';
 import { getConfigService } from '../services/config.service';
 import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible';
 import {
  getLanguageForRegion,
  isAcceptedLanguage,
+  stripPrefixes,
+  buildContainsSelector,
+  type LanguageConfig,
 } from '../constants/language-config';
 import {
  pickUserAgent,
  getBrowserHeaders,
  jitteredBackoff,
-  randomDelay,
  AdaptivePacer,
  FetchResultMeta,
 } from '../utils/scrape-resilience';
+import { parseRuntime as parseRuntimeUtil } from '../utils/parse-runtime';
+import { extractAllNarrators } from '../utils/extract-narrator';

 const logger = RMABLogger.create('Audible');

@@ -27,6 +32,13 @@ const AUDIBLE_PAGE_SIZE = 50;
 const CATALOG_RESPONSE_GROUPS =
  'contributors,product_desc,product_attrs,product_extended_attrs,media,rating,series,category_ladders,product_details';

+// Retry/backoff knobs for HTML scraping (nightly refresh job only).
+// Healthy users still finish quickly — per-page success returns on attempt 0
+// with a 2-4s inter-page delay. Struggling users grind through 503 storms
+// patiently: up to ~12 retries per request, with each backoff capped at 3 min.
+const HTML_MAX_RETRIES = 12;
+const HTML_MAX_BACKOFF_MS = 180_000;
+
 export interface AudibleAudiobook {
  asin: string;
  title: string;
@@ -298,6 +310,7 @@ export class AudibleService {
    config: any = {},
    maxRetries: number = 5,
    client: AxiosInstance = this.htmlClient,
+    maxBackoffMs: number = Number.POSITIVE_INFINITY,
  ): Promise<{ data: any; meta: FetchResultMeta }> {
    let lastError: Error | null = null;
    let retriesUsed = 0;
@@ -324,7 +337,7 @@ export class AudibleService {

        retriesUsed++;

-        const backoffMs = jitteredBackoff(attempt);
+        const backoffMs = jitteredBackoff(attempt, 1000, maxBackoffMs);
        logger.info(
          ` Request failed (${status || 'network error'}), retrying in ${backoffMs}ms (attempt ${attempt + 1}/${maxRetries})...`,
        );
@@ -379,6 +392,12 @@ export class AudibleService {
    throw lastError || new Error('External API request failed after retries');
  }

+  /**
+   * Popular audiobooks from Audible's curated /adblbestsellers HTML page.
+   * Uses HTML scraping (not the catalog API) because the API's BestSellers sort
+   * is a right-now velocity rank that surfaces launch-day shovelware and preorders;
+   * the HTML page reflects Audible's editorial curation.
+   */
  async getPopularAudiobooks(limit: number = 20): Promise<AudibleAudiobook[]> {
    await this.initialize();

@@ -395,42 +414,36 @@ export class AudibleService {
        logger.info(` Fetching page ${page}/${maxPages}...`);

        const { data: response, meta } = await this.fetchWithRetry(
-          '/1.0/catalog/products',
+          '/adblbestsellers',
          {
            params: {
-              products_sort_by: 'BestSellers',
-              num_results: AUDIBLE_PAGE_SIZE,
-              page: page - 1,
-              response_groups: CATALOG_RESPONSE_GROUPS,
+              ipRedirectOverride: 'true',
+              pageSize: AUDIBLE_PAGE_SIZE,
+              ...(page > 1 ? { page } : {}),
            },
          },
-          5,
-          this.apiClient,
+          HTML_MAX_RETRIES,
+          this.htmlClient,
+          HTML_MAX_BACKOFF_MS,
        );

-        const envelope: CatalogProductsResponse = response.data;
-        const products = envelope.products ?? [];
-        const totalResults = envelope.total_results ?? 0;
+        const foundOnPage = this.parseProductListItems(
+          response.data,
+          audiobooks,
+          limit,
+        );

-        for (const product of products) {
-          if (audiobooks.length >= limit) break;
-          if (audiobooks.some((b) => b.asin === product.asin)) continue;
-          audiobooks.push(mapCatalogProduct(product));
+        logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);
+
+        if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
+          logger.info(` Reached end of available pages`);
+          break;
        }

-        logger.info(` Found ${products.length} audiobooks on page ${page}`);
-
-        const hasMore =
-          totalResults > 0
-            ? totalResults > page * AUDIBLE_PAGE_SIZE
-            : products.length >= AUDIBLE_PAGE_SIZE;
-
-        if (!hasMore) break;
-
        page++;

        if (page <= maxPages && audiobooks.length < limit) {
-          await this.delay(this.apiPageDelay(meta));
+          await this.delay(this.pacer.reportPageResult(meta));
        }
      } catch (error) {
        logger.error(`Failed to fetch page ${page} of popular audiobooks`, {
@@ -445,6 +458,11 @@ export class AudibleService {
    return audiobooks;
  }

+  /**
+   * New release audiobooks from Audible's curated /newreleases HTML page.
+   * Uses HTML scraping (not the catalog API) because the API's -ReleaseDate sort
+   * returns 100% future preorders with no released-only filter available.
+   */
  async getNewReleases(limit: number = 20): Promise<AudibleAudiobook[]> {
    await this.initialize();

@@ -461,42 +479,36 @@ export class AudibleService {
        logger.info(` Fetching page ${page}/${maxPages}...`);

        const { data: response, meta } = await this.fetchWithRetry(
-          '/1.0/catalog/products',
+          '/newreleases',
          {
            params: {
-              products_sort_by: '-ReleaseDate',
-              num_results: AUDIBLE_PAGE_SIZE,
-              page: page - 1,
-              response_groups: CATALOG_RESPONSE_GROUPS,
+              ipRedirectOverride: 'true',
+              pageSize: AUDIBLE_PAGE_SIZE,
+              ...(page > 1 ? { page } : {}),
            },
          },
-          5,
-          this.apiClient,
+          HTML_MAX_RETRIES,
+          this.htmlClient,
+          HTML_MAX_BACKOFF_MS,
        );

-        const envelope: CatalogProductsResponse = response.data;
-        const products = envelope.products ?? [];
-        const totalResults = envelope.total_results ?? 0;
+        const foundOnPage = this.parseProductListItems(
+          response.data,
+          audiobooks,
+          limit,
+        );

-        for (const product of products) {
-          if (audiobooks.length >= limit) break;
-          if (audiobooks.some((b) => b.asin === product.asin)) continue;
-          audiobooks.push(mapCatalogProduct(product));
+        logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);
+
+        if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
+          logger.info(` Reached end of available pages`);
+          break;
        }

-        logger.info(` Found ${products.length} audiobooks on page ${page}`);
-
-        const hasMore =
-          totalResults > 0
-            ? totalResults > page * AUDIBLE_PAGE_SIZE
-            : products.length >= AUDIBLE_PAGE_SIZE;
-
-        if (!hasMore) break;
-
        page++;

        if (page <= maxPages && audiobooks.length < limit) {
-          await this.delay(this.apiPageDelay(meta));
+          await this.delay(this.pacer.reportPageResult(meta));
        }
      } catch (error) {
        logger.error(`Failed to fetch page ${page} of new releases`, {
@@ -791,6 +803,11 @@ export class AudibleService {
    }
  }

+  /**
+   * Category audiobooks from Audible's HTML /search?node=<categoryId> page,
+   * sorted by popularity-rank. Uses HTML scraping (not the catalog API) so
+   * results match Audible's curated category-storefront ordering.
+   */
  async getCategoryBooks(categoryId: string, limit: number = 200): Promise<AudibleAudiobook[]> {
    await this.initialize();

@@ -805,43 +822,35 @@ export class AudibleService {
    while (audiobooks.length < limit && page <= maxPages) {
      try {
        const { data: response, meta } = await this.fetchWithRetry(
-          '/1.0/catalog/products',
+          '/search',
          {
            params: {
-              category_id: categoryId,
-              products_sort_by: 'BestSellers',
-              num_results: AUDIBLE_PAGE_SIZE,
-              page: page - 1,
-              response_groups: CATALOG_RESPONSE_GROUPS,
+              ipRedirectOverride: 'true',
+              node: categoryId,
+              pageSize: AUDIBLE_PAGE_SIZE,
+              sort: 'popularity-rank',
+              ...(page > 1 ? { page } : {}),
            },
          },
-          5,
-          this.apiClient,
+          HTML_MAX_RETRIES,
+          this.htmlClient,
+          HTML_MAX_BACKOFF_MS,
        );

-        const envelope: CatalogProductsResponse = response.data;
-        const products = envelope.products ?? [];
-        const totalResults = envelope.total_results ?? 0;
+        const foundOnPage = this.parseSearchResultItems(
+          response.data,
+          audiobooks,
+          limit,
+        );

-        for (const product of products) {
-          if (audiobooks.length >= limit) break;
-          if (audiobooks.some((b) => b.asin === product.asin)) continue;
-          audiobooks.push(mapCatalogProduct(product));
-        }
+        logger.info(`Category ${categoryId}: found ${foundOnPage} books on page ${page}`);

-        logger.info(`Category ${categoryId}: found ${products.length} books on page ${page}`);
-
-        const hasMore =
-          totalResults > 0
-            ? totalResults > page * AUDIBLE_PAGE_SIZE
-            : products.length >= AUDIBLE_PAGE_SIZE;
-
-        if (!hasMore) break;
+        if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) break;

        page++;

        if (page <= maxPages && audiobooks.length < limit) {
-          await this.delay(this.apiPageDelay(meta));
+          await this.delay(this.pacer.reportPageResult(meta));
        }
      } catch (error) {
        logger.error(`Failed to fetch category ${categoryId} page ${page}`, {
@@ -858,12 +867,148 @@ export class AudibleService {
    return audiobooks;
  }

-  private apiPageDelay(meta: FetchResultMeta): number {
-    if (meta.retriesUsed > 0) {
-      return this.pacer.reportPageResult(meta);
-    }
-    this.pacer.reportPageResult(meta);
-    return randomDelay(500, 1500);
+  private getLangConfig(): LanguageConfig {
+    return getLanguageForRegion(this.region);
+  }
+
+  private parseRuntime(runtimeText: string): number | undefined {
+    return parseRuntimeUtil(runtimeText, this.getLangConfig());
+  }
+
+  /**
+   * Parse the `.productListItem` blocks used by /adblbestsellers and /newreleases.
+   * Pushes matched books into `audiobooks` (skipping duplicates and respecting `limit`)
+   * and returns the number of books newly added from this page (skipped items are not counted).
+   */
+  private parseProductListItems(
+    html: string,
+    audiobooks: AudibleAudiobook[],
+    limit: number,
+  ): number {
+    const $ = cheerio.load(html);
+    const langConfig = this.getLangConfig();
+    let foundOnPage = 0;
+
+    $('.productListItem').each((_index, element) => {
+      if (audiobooks.length >= limit) return false;
+
+      const $el = $(element);
+
+      const asin =
+        $el.find('li').attr('data-asin') ||
+        $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
+        '';
+      if (!asin) return;
+      if (audiobooks.some((book) => book.asin === asin)) return;
+
+      const title =
+        $el.find('h3 a').text().trim() ||
+        $el.find('.bc-heading a').text().trim();
+
+      const authorText =
+        $el.find('.authorLabel').text().trim() ||
+        $el.find('.bc-size-small .bc-text-bold').first().text().trim();
+
+      const authorHref = $el.find('a[href*="/author/"]').first().attr('href') || '';
+      const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
+
+      // Narrator — capture all narrator links (multi-narrator productions are common);
+      // fall back to .narratorLabel text, then to the bc-text-bold sibling for layouts
+      // that omit both anchor links and the .narratorLabel span.
+      const narratorText =
+        extractAllNarrators($, $el) ||
+        $el.find('.bc-size-small .bc-text-bold').eq(1).text().trim();
+
+      const coverArtUrl = $el.find('img').attr('src') || '';
+
+      const ratingText = $el.find('.ratingsLabel').text().trim();
+      const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;
+
+      audiobooks.push({
+        asin,
+        title,
+        author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
+        authorAsin: authorAsinMatch?.[1] || undefined,
+        narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
+        coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
+        rating,
+      });
+
+      foundOnPage++;
+    });
+
+    return foundOnPage;
+  }
+
+  /**
+   * Parse the `.s-result-item` / `.productListItem` blocks used by
+   * /search?node=<categoryId>. Pushes matched books into `audiobooks`
+   * (skipping duplicates and respecting `limit`) and returns the number of books
+   * newly added from this page (items without an ASIN or already present are not counted).
+   */
+  private parseSearchResultItems(
+    html: string,
+    audiobooks: AudibleAudiobook[],
+    limit: number,
+  ): number {
+    const $ = cheerio.load(html);
+    const langConfig = this.getLangConfig();
+    let foundOnPage = 0;
+
+    $('.s-result-item, .productListItem').each((_index, element) => {
+      if (audiobooks.length >= limit) return false;
+
+      const $el = $(element);
+
+      const asin =
+        $el.find('li').attr('data-asin') ||
+        $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
+        '';
+      if (!asin) return;
+      if (audiobooks.some((b) => b.asin === asin)) return;
+
+      const title =
+        $el.find('h2').first().text().trim() ||
+        $el.find('h3 a').text().trim() ||
+        $el.find('.bc-heading a').text().trim();
+
+      const authorLink = $el.find('a[href*="/author/"]').first();
+      const authorText =
+        authorLink.text().trim() ||
+        $el.find('.authorLabel').text().trim();
+      const authorHref = authorLink.attr('href') || '';
+      const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);
+
+      // Narrator — capture all narrator links (multi-narrator productions are common)
+      const narratorText = extractAllNarrators($, $el);
+
+      const coverArtUrl = $el.find('img').attr('src') || '';
+
+      const runtimeText =
+        $el.find('.runtimeLabel').text().trim() ||
+        $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
+      const durationMinutes = this.parseRuntime(runtimeText);
+
+      const ratingText =
+        $el.find('.ratingsLabel').text().trim() ||
+        $el.find('.a-icon-star span').first().text().trim();
+      const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;
+
+      audiobooks.push({
+        asin,
+        title,
+        author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
+        authorAsin: authorAsinMatch?.[1] || undefined,
+        narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
+        coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
+        durationMinutes,
+        rating,
+      });
+
+      foundOnPage++;
+    });
+
+    return foundOnPage;
  }

  private async delay(ms: number): Promise<void> {
@@ -138,16 +138,37 @@ async function persistSectionBooks(
  logger: ReturnType<typeof RMABLogger.forJob>,
  labelForErrors: string,
 ): Promise<number> {
+  // Defensive dedup: the (asin, categoryId) unique constraint means a duplicate ASIN
+  // in `books` crashes the second .create() with P2002. The HTML parser already dedupes
+  // per page and across pages against the cumulative accumulator, but a warn-on-fire
+  // signal here lets us detect upstream surprises (e.g. Audible serving the same item
+  // in both a carousel and the main grid) without the noisy duplicate-key Postgres
+  // errors. Keep the first occurrence so Audible's editorial ordering is preserved.
+  const seenAsins = new Set<string>();
+  const dedupedBooks = books.filter((b) => {
+    if (!b?.asin || seenAsins.has(b.asin)) return false;
+    seenAsins.add(b.asin);
+    return true;
+  });
+  const droppedCount = books.length - dedupedBooks.length;
+  if (droppedCount > 0) {
+    logger.warn(
+      `Dropped ${droppedCount} duplicate ASIN(s) from ${categoryId} input list before persist`,
+    );
+  }
+
  // Wipe previous entries for this section
  logger.info(`Clearing previous data for ${categoryId}...`);
  await prisma.audibleCacheCategory.deleteMany({
    where: { categoryId },
  });
-  logger.info(`Cleared previous entries for ${categoryId}, saving ${books.length} books...`);
+  logger.info(
+    `Cleared previous entries for ${categoryId}, saving ${dedupedBooks.length} books...`,
+  );

  let saved = 0;
-  for (let i = 0; i < books.length; i++) {
-    const book = books[i];
+  for (let i = 0; i < dedupedBooks.length; i++) {
+    const book = dedupedBooks[i];
    try {
      // Cache thumbnail if coverArtUrl exists
      let cachedCoverPath: string | null = null;
@@ -9,7 +9,8 @@

 import { prisma } from '@/lib/db';
 import { RMABLogger } from '@/lib/utils/logger';
-import type { DedupGroup } from '@/lib/utils/deduplicate-audiobooks';
+import { metadataScore, type DedupGroup } from '@/lib/utils/deduplicate-audiobooks';
+import type { AudibleAudiobook } from '@/lib/integrations/audible.service';

 const logger = RMABLogger.create('WorksService');

@@ -182,6 +183,96 @@ export async function seedAsin(
  }
 }

+// ---------------------------------------------------------------------------
+// View-level collapse (consult the works table after local dedup)
+// ---------------------------------------------------------------------------
+
+/**
+ * Collapse books that already share a Work record according to the works table.
+ *
+ * The local `deduplicateAndCollectGroups()` pass is title/narrator/duration-based
+ * and stateless — it can fail to merge ASINs whose source metadata diverges (e.g.
+ * a series-page scrape captures different "first narrators" for two ASINs of the
+ * same recording, or two paginated pages each contain one ASIN and never compare
+ * them). The works table is the durable source of truth for "same book" identity,
+ * populated by every prior dedup pass and by request-time seeding. This pass
+ * applies that knowledge to the current view.
+ *
+ * Behavior:
+ *  - Books whose ASINs map to a shared workId collapse to a single representative
+ *    chosen by `metadataScore()` (same ranking as local dedup).
+ *  - Books not present in any work, or in single-ASIN works, pass through untouched.
+ *  - Original ordering is preserved (the kept representative sits at the position
+ *    of the first occurrence of its work in the input list).
+ *  - DB failure is non-fatal: the input list is returned unchanged so the view
+ *    still renders (degrades to local-dedup-only behavior).
+ */
+export async function collapseByExistingWorks(
+  books: AudibleAudiobook[],
+): Promise<AudibleAudiobook[]> {
+  if (books.length <= 1) return books;
+
+  try {
+    const asins = books.map(b => b.asin);
+    const entries = await prisma.workAsin.findMany({
+      where: { asin: { in: asins } },
+      select: { asin: true, workId: true },
+    });
+
+    if (entries.length === 0) return books;
+
+    // Map ASIN → workId for fast lookup in the loop below
+    const asinToWorkId = new Map<string, string>();
+    for (const entry of entries) {
+      asinToWorkId.set(entry.asin, entry.workId);
+    }
+
+    // Walk the input once, preserving position. For each work seen, keep a
+    // running "best" book; for books not in any work, emit immediately.
+    const result: AudibleAudiobook[] = [];
+    const workIdToResultIndex = new Map<string, number>();
+
+    for (const book of books) {
+      const workId = asinToWorkId.get(book.asin);
+      if (!workId) {
+        result.push(book);
+        continue;
+      }
+
+      const existingIndex = workIdToResultIndex.get(workId);
+      if (existingIndex === undefined) {
+        workIdToResultIndex.set(workId, result.length);
+        result.push(book);
+        continue;
+      }
+
+      // A sibling from this work is already in the result. Keep whichever
+      // has the richer metadata; on tie, keep the earlier entry (already there).
+      const existing = result[existingIndex];
+      if (metadataScore(book) > metadataScore(existing)) {
+        result[existingIndex] = book;
+      }
+    }
+
+    const collapsed = books.length - result.length;
+    if (collapsed > 0) {
+      logger.debug('Collapsed books via works table', {
+        inputCount: books.length,
+        outputCount: result.length,
+        collapsed,
+      });
+    }
+
+    return result;
+  } catch (error) {
+    logger.error('collapseByExistingWorks failed; returning input unchanged', {
+      error: error instanceof Error ? error.message : String(error),
+      bookCount: books.length,
+    });
+    return books;
+  }
+}
+
 // ---------------------------------------------------------------------------
 // Sibling ASIN lookup (for library matching expansion)
 // ---------------------------------------------------------------------------
@@ -109,7 +109,12 @@ export function areDurationsCompatible(a?: number, b?: number): boolean {
 // Metadata scoring (for picking best representative)
 // ---------------------------------------------------------------------------

-function metadataScore(book: AudibleAudiobook): number {
+/**
+ * Score a book by how much metadata it carries. Used as the tie-breaker when
+ * collapsing duplicates — the entry with the richest metadata wins. Exported
+ * so the works-table collapse pass can apply the same ranking.
+ */
+export function metadataScore(book: AudibleAudiobook): number {
  let score = 0;
  if (book.coverArtUrl) score++;
  if (book.rating != null) score++;
@@ -0,0 +1,37 @@
+/**
+ * Component: Narrator Extraction Utility
+ * Documentation: documentation/integrations/audible.md
+ *
+ * Shared helper for Audible HTML scrapers. Audible product listings render
+ * each narrator as a separate `<a href="?searchNarrator=...">` link; using
+ * `.first()` on that selector silently drops co-narrators and breaks dedup
+ * for multi-narrator productions (e.g. full-cast audiobooks). This helper
+ * captures every narrator link and joins them with ', ', falling back to the
+ * `.narratorLabel` span when no anchor link yields a non-empty name.
+ */
+
+import type * as cheerio from 'cheerio';
+import type { AnyNode } from 'domhandler';
+
+/**
+ * Extract a comma-joined narrator string from an Audible product list item.
+ *
+ * Order is not semantically significant — downstream `normalizeNarrator()`
+ * sorts before comparison — but document-order preserves a stable, legible
+ * value for caching and logging.
+ */
+export function extractAllNarrators(
+  $: cheerio.CheerioAPI,
+  $el: cheerio.Cheerio<AnyNode>,
+): string {
+  const links = $el.find('a[href*="searchNarrator="]');
+  if (links.length > 0) {
+    const names: string[] = [];
+    links.each((_, link) => {
+      const name = $(link).text().trim();
+      if (name) names.push(name);
+    });
+    if (names.length > 0) return names.join(', ');
+  }
+  return $el.find('.narratorLabel').text().trim();
+}
@@ -38,12 +38,18 @@ export function getBrowserHeaders(userAgent: string): Record<string, string> {
 }

 /**
- * Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5)
+ * Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5),
+ * optionally capped at maxBackoffMs; the cap is applied after jitter, so capped waits are not jittered.
 * Avoids predictable retry timing that is trivially fingerprinted.
 */
-export function jitteredBackoff(attempt: number, baseMs: number = 1000): number {
+export function jitteredBackoff(
+  attempt: number,
+  baseMs: number = 1000,
+  maxBackoffMs: number = Number.POSITIVE_INFINITY,
+): number {
  const jitter = 0.5 + Math.random(); // 0.5 – 1.5
-  return Math.round(Math.pow(2, attempt) * baseMs * jitter);
+  const raw = Math.pow(2, attempt) * baseMs * jitter;
+  return Math.round(Math.min(raw, maxBackoffMs));
 }

 /** Random integer in [minMs, maxMs] */