mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-04 13:20:11 +00:00
Audible: HTML refresh, multi-narrator & works dedup
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: an in-memory deduplicateAndCollectGroups pass followed by collapseByExistingWorks, which consults the works table. Export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and add defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, a jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
@@ -109,7 +109,12 @@ export function areDurationsCompatible(a?: number, b?: number): boolean {
|
||||
// Metadata scoring (for picking best representative)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function metadataScore(book: AudibleAudiobook): number {
|
||||
/**
|
||||
* Score a book by how much metadata it carries. Used as the tie-breaker when
|
||||
* collapsing duplicates — the entry with the richest metadata wins. Exported
|
||||
* so the works-table collapse pass can apply the same ranking.
|
||||
*/
|
||||
export function metadataScore(book: AudibleAudiobook): number {
|
||||
let score = 0;
|
||||
if (book.coverArtUrl) score++;
|
||||
if (book.rating != null) score++;
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
/**
|
||||
* Component: Narrator Extraction Utility
|
||||
* Documentation: documentation/integrations/audible.md
|
||||
*
|
||||
* Shared helper for Audible HTML scrapers. Audible product listings render
|
||||
* each narrator as a separate `<a href="?searchNarrator=...">` link; using
|
||||
* `.first()` on that selector silently drops co-narrators and breaks dedup
|
||||
* for multi-narrator productions (e.g. full-cast audiobooks). This helper
|
||||
* captures every narrator link and joins them, falling back to the
|
||||
* `.narratorLabel` span when no anchor links are present.
|
||||
*/
|
||||
|
||||
import type * as cheerio from 'cheerio';
|
||||
import type { AnyNode } from 'domhandler';
|
||||
|
||||
/**
 * Extract a comma-joined narrator string from an Audible product list item.
 *
 * Every `<a href="...searchNarrator=...">` anchor found under `$el`
 * contributes one name, in document order, joined with `", "`. When no anchor
 * yields a non-empty name, the text of `.narratorLabel` is returned instead
 * (an empty string if that is absent as well).
 *
 * Order is not semantically significant — downstream `normalizeNarrator()`
 * sorts before comparison — but document-order preserves a stable, legible
 * value for caching and logging.
 *
 * @param $ - Cheerio root, used to wrap each matched anchor node.
 * @param $el - The product list item to search within.
 * @returns Narrator names joined by `", "`, or the fallback label text.
 */
export function extractAllNarrators(
  $: cheerio.CheerioAPI,
  $el: cheerio.Cheerio<AnyNode>,
): string {
  // `.text()` returns raw text nodes, so line breaks and indentation from the
  // markup survive a plain `trim()`; collapse every whitespace run to a single
  // space so the joined value stays on one line.
  const tidy = (raw: string): string => raw.replace(/\s+/g, ' ').trim();

  const names = $el
    .find('a[href*="searchNarrator="]')
    .toArray()
    .map((link) => tidy($(link).text()))
    .filter((name) => name.length > 0);

  if (names.length > 0) return names.join(', ');

  // No usable anchors: fall back to the plain-text narrator label.
  return tidy($el.find('.narratorLabel').text());
}
|
||||
@@ -38,12 +38,18 @@ export function getBrowserHeaders(userAgent: string): Record<string, string> {
|
||||
}
|
||||
|
||||
/**
 * Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5),
 * optionally capped so high attempt counts don't produce absurd waits.
 * Avoids predictable retry timing that is trivially fingerprinted.
 *
 * @param attempt - Retry attempt number; used as the exponent of 2.
 * @param baseMs - Base delay in milliseconds (defaults to 1000).
 * @param maxBackoffMs - Upper bound applied after jitter; defaults to
 *   `Number.POSITIVE_INFINITY`, i.e. no cap.
 * @returns The delay in milliseconds, rounded to the nearest integer.
 */
export function jitteredBackoff(
  attempt: number,
  baseMs: number = 1000,
  maxBackoffMs: number = Number.POSITIVE_INFINITY,
): number {
  const jitter = 0.5 + Math.random(); // 0.5 – 1.5
  const raw = Math.pow(2, attempt) * baseMs * jitter;
  // Clamp after applying jitter so the cap is a hard ceiling on the wait.
  return Math.round(Math.min(raw, maxBackoffMs));
}
|
||||
|
||||
/** Random integer in [minMs, maxMs] */
|
||||
|
||||
Reference in New Issue
Block a user