mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-03 21:00:09 +00:00
Audible: HTML refresh, multi-narrator & works dedup
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and make defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
@@ -9,7 +9,7 @@ import { RMABLogger } from '@/lib/utils/logger';
|
||||
import { scrapeSeriesPage } from '@/lib/integrations/audible-series';
|
||||
import { enrichAudiobooksWithMatches } from '@/lib/utils/audiobook-matcher';
|
||||
import { deduplicateAndCollectGroups } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import { persistDedupGroups } from '@/lib/services/works.service';
|
||||
import { persistDedupGroups, collapseByExistingWorks } from '@/lib/services/works.service';
|
||||
import { annotateWithIgnoreStatus } from '@/lib/utils/ignored-audiobooks';
|
||||
|
||||
const logger = RMABLogger.create('API.Series.Detail');
|
||||
@@ -52,17 +52,20 @@ export async function GET(
|
||||
);
|
||||
}
|
||||
|
||||
// Deduplicate before enrichment to avoid wasted DB queries on duplicate entries
|
||||
// Two-pass dedup: local title/narrator/duration matching first, then collapse
|
||||
// any remaining duplicates that the works table already knows are the same book
|
||||
// (handles cases where source metadata diverges across paths or pages).
|
||||
const { books: dedupedBooks, groups } = deduplicateAndCollectGroups(detail.books);
|
||||
|
||||
// Fire-and-forget: persist dedup groups to works table for cross-ASIN matching
|
||||
if (groups.length > 0) {
|
||||
persistDedupGroups(groups).catch(() => {});
|
||||
}
|
||||
|
||||
const collapsedBooks = await collapseByExistingWorks(dedupedBooks);
|
||||
|
||||
// Enrich books with library availability and request status
|
||||
const userId = currentUser.sub || undefined;
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(dedupedBooks, userId);
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(collapsedBooks, userId);
|
||||
|
||||
// Annotate with per-user ignore status
|
||||
const annotatedBooks = await annotateWithIgnoreStatus(enrichedBooks, userId);
|
||||
|
||||
Reference in New Issue
Block a user