Audible: HTML refresh, multi-narrator & works dedup

Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and make defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
kikootwo
2026-05-14 15:23:15 -04:00
parent 5f0855b2f8
commit fcae3bcf09
17 changed files with 1241 additions and 214 deletions
+6 -1
View File
@@ -109,7 +109,12 @@ export function areDurationsCompatible(a?: number, b?: number): boolean {
// Metadata scoring (for picking best representative)
// ---------------------------------------------------------------------------
function metadataScore(book: AudibleAudiobook): number {
/**
* Score a book by how much metadata it carries. Used as the tie-breaker when
* collapsing duplicates — the entry with the richest metadata wins. Exported
* so the works-table collapse pass can apply the same ranking.
*/
export function metadataScore(book: AudibleAudiobook): number {
let score = 0;
if (book.coverArtUrl) score++;
if (book.rating != null) score++;
+37
View File
@@ -0,0 +1,37 @@
/**
* Component: Narrator Extraction Utility
* Documentation: documentation/integrations/audible.md
*
* Shared helper for Audible HTML scrapers. Audible product listings render
* each narrator as a separate `<a href="?searchNarrator=...">` link; using
* `.first()` on that selector silently drops co-narrators and breaks dedup
* for multi-narrator productions (e.g. full-cast audiobooks). This helper
* captures every narrator link and joins them, falling back to the
* `.narratorLabel` span when no anchor links are present.
*/
import type * as cheerio from 'cheerio';
import type { AnyNode } from 'domhandler';
/**
* Extract a comma-joined narrator string from an Audible product list item.
*
* Order is not semantically significant — downstream `normalizeNarrator()`
* sorts before comparison — but document-order preserves a stable, legible
* value for caching and logging.
*/
export function extractAllNarrators(
$: cheerio.CheerioAPI,
$el: cheerio.Cheerio<AnyNode>,
): string {
const links = $el.find('a[href*="searchNarrator="]');
if (links.length > 0) {
const names: string[] = [];
links.each((_, link) => {
const name = $(link).text().trim();
if (name) names.push(name);
});
if (names.length > 0) return names.join(', ');
}
return $el.find('.narratorLabel').text().trim();
}
+9 -3
View File
@@ -38,12 +38,18 @@ export function getBrowserHeaders(userAgent: string): Record<string, string> {
}
/**
* Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5)
* Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5),
* optionally capped so high attempt counts don't produce absurd waits.
* Avoids predictable retry timing that is trivially fingerprinted.
*/
export function jitteredBackoff(attempt: number, baseMs: number = 1000): number {
export function jitteredBackoff(
attempt: number,
baseMs: number = 1000,
maxBackoffMs: number = Number.POSITIVE_INFINITY,
): number {
const jitter = 0.5 + Math.random(); // 0.5 1.5
return Math.round(Math.pow(2, attempt) * baseMs * jitter);
const raw = Math.pow(2, attempt) * baseMs * jitter;
return Math.round(Math.min(raw, maxBackoffMs));
}
/** Random integer in [minMs, maxMs] */