mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-02 20:30:10 +00:00
fcae3bcf09
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and make defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
38 lines
1.3 KiB
TypeScript
38 lines
1.3 KiB
TypeScript
/**
 * Component: Narrator Extraction Utility
 * Documentation: documentation/integrations/audible.md
 *
 * Shared helper for Audible HTML scrapers. Audible product listings render
 * each narrator as a separate `<a href="?searchNarrator=...">` link; using
 * `.first()` on that selector silently drops co-narrators and breaks dedup
 * for multi-narrator productions (e.g. full-cast audiobooks). This helper
 * captures every narrator link and joins them, falling back to the
 * `.narratorLabel` span when no narrator link with non-empty text is present.
 */
|
|
|
|
import type * as cheerio from 'cheerio';
|
|
import type { AnyNode } from 'domhandler';
|
|
|
|
/**
 * Extract a comma-joined narrator string from an Audible product list item.
 *
 * Every `<a href*="searchNarrator=">` link under `$el` contributes its trimmed
 * text. Links with empty text are skipped and exact repeats of a name are
 * dropped, so each narrator appears once.
 *
 * Order is not semantically significant — downstream `normalizeNarrator()`
 * sorts before comparison — but document order is kept so the value stays
 * stable and legible for caching and logging.
 *
 * @param $ - Cheerio root used to wrap each matched link node.
 * @param $el - Selection for a single product list item to search within.
 * @returns The narrator names joined with `', '`. When no link yields a
 *   non-empty name, the trimmed text of `.narratorLabel` is returned instead
 *   (unmodified apart from trimming — any label prefix is left for the caller);
 *   this is an empty string if that element is missing too.
 */
export function extractAllNarrators(
  $: cheerio.CheerioAPI,
  $el: cheerio.Cheerio<AnyNode>,
): string {
  // A Set keeps first-seen (document) order while discarding repeated names.
  const names = new Set<string>();

  $el.find('a[href*="searchNarrator="]').each((_, link) => {
    const name = $(link).text().trim();
    // Braced body on purpose: the callback must not return `false`, which
    // would stop cheerio's iteration early.
    if (name) {
      names.add(name);
    }
  });

  if (names.size > 0) {
    return Array.from(names).join(', ');
  }

  // No usable narrator links — fall back to the label span's text.
  return $el.find('.narratorLabel').text().trim();
}
|