mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-02 20:30:10 +00:00
Audible: HTML refresh, multi-narrator & works dedup
Switch the nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes, and per-batch cooldowns) so that nightly jobs do not fail during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update the parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups followed by collapseByExistingWorks, which consults the works table; export metadataScore for consistent representative selection; and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into the search/author/series routes and add defensive dedup to the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, a jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
@@ -7,7 +7,7 @@ import { NextRequest, NextResponse } from 'next/server';
|
||||
import { getAudibleService } from '@/lib/integrations/audible.service';
|
||||
import { enrichAudiobooksWithMatches } from '@/lib/utils/audiobook-matcher';
|
||||
import { deduplicateAndCollectGroups } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import { persistDedupGroups } from '@/lib/services/works.service';
|
||||
import { persistDedupGroups, collapseByExistingWorks } from '@/lib/services/works.service';
|
||||
import { getCurrentUser } from '@/lib/middleware/auth';
|
||||
import { RMABLogger } from '@/lib/utils/logger';
|
||||
import { annotateWithIgnoreStatus } from '@/lib/utils/ignored-audiobooks';
|
||||
@@ -41,16 +41,19 @@ export async function GET(request: NextRequest) {
|
||||
const currentUser = getCurrentUser(request);
|
||||
const userId = currentUser?.sub || undefined;
|
||||
|
||||
// Deduplicate before enrichment to avoid wasted DB queries on duplicate entries
|
||||
// Two-pass dedup: local title/narrator/duration matching first, then collapse
|
||||
// any remaining duplicates that the works table already knows are the same book
|
||||
// (handles cases where source metadata diverges across paths or pages).
|
||||
const { books: dedupedResults, groups } = deduplicateAndCollectGroups(results.results);
|
||||
|
||||
// Fire-and-forget: persist dedup groups to works table for cross-ASIN matching
|
||||
if (groups.length > 0) {
|
||||
persistDedupGroups(groups).catch(() => {});
|
||||
}
|
||||
|
||||
const collapsedResults = await collapseByExistingWorks(dedupedResults);
|
||||
|
||||
// Enrich search results with availability and request status information
|
||||
const enrichedResults = await enrichAudiobooksWithMatches(dedupedResults, userId);
|
||||
const enrichedResults = await enrichAudiobooksWithMatches(collapsedResults, userId);
|
||||
|
||||
// Annotate with per-user ignore status
|
||||
const annotatedResults = await annotateWithIgnoreStatus(enrichedResults, userId);
|
||||
|
||||
@@ -7,7 +7,7 @@ import { NextRequest, NextResponse } from 'next/server';
|
||||
import { getAudibleService } from '@/lib/integrations/audible.service';
|
||||
import { enrichAudiobooksWithMatches } from '@/lib/utils/audiobook-matcher';
|
||||
import { deduplicateAndCollectGroups } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import { persistDedupGroups } from '@/lib/services/works.service';
|
||||
import { persistDedupGroups, collapseByExistingWorks } from '@/lib/services/works.service';
|
||||
import { getCurrentUser } from '@/lib/middleware/auth';
|
||||
import { RMABLogger } from '@/lib/utils/logger';
|
||||
import { annotateWithIgnoreStatus } from '@/lib/utils/ignored-audiobooks';
|
||||
@@ -56,17 +56,20 @@ export async function GET(
|
||||
const audibleService = getAudibleService();
|
||||
const result = await audibleService.searchByAuthorAsin(authorName.trim(), asin, page);
|
||||
|
||||
// Deduplicate before enrichment to avoid wasted DB queries on duplicate entries
|
||||
// Two-pass dedup: local title/narrator/duration matching first, then collapse
|
||||
// any remaining duplicates that the works table already knows are the same book
|
||||
// (handles cases where source metadata diverges across paths or pages).
|
||||
const { books: dedupedBooks, groups } = deduplicateAndCollectGroups(result.books);
|
||||
|
||||
// Fire-and-forget: persist dedup groups to works table for cross-ASIN matching
|
||||
if (groups.length > 0) {
|
||||
persistDedupGroups(groups).catch(() => {});
|
||||
}
|
||||
|
||||
const collapsedBooks = await collapseByExistingWorks(dedupedBooks);
|
||||
|
||||
// Enrich with library availability and request status
|
||||
const userId = currentUser.sub || undefined;
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(dedupedBooks, userId);
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(collapsedBooks, userId);
|
||||
|
||||
// Annotate with per-user ignore status
|
||||
const annotatedBooks = await annotateWithIgnoreStatus(enrichedBooks, userId);
|
||||
|
||||
@@ -9,7 +9,7 @@ import { RMABLogger } from '@/lib/utils/logger';
|
||||
import { scrapeSeriesPage } from '@/lib/integrations/audible-series';
|
||||
import { enrichAudiobooksWithMatches } from '@/lib/utils/audiobook-matcher';
|
||||
import { deduplicateAndCollectGroups } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import { persistDedupGroups } from '@/lib/services/works.service';
|
||||
import { persistDedupGroups, collapseByExistingWorks } from '@/lib/services/works.service';
|
||||
import { annotateWithIgnoreStatus } from '@/lib/utils/ignored-audiobooks';
|
||||
|
||||
const logger = RMABLogger.create('API.Series.Detail');
|
||||
@@ -52,17 +52,20 @@ export async function GET(
|
||||
);
|
||||
}
|
||||
|
||||
// Deduplicate before enrichment to avoid wasted DB queries on duplicate entries
|
||||
// Two-pass dedup: local title/narrator/duration matching first, then collapse
|
||||
// any remaining duplicates that the works table already knows are the same book
|
||||
// (handles cases where source metadata diverges across paths or pages).
|
||||
const { books: dedupedBooks, groups } = deduplicateAndCollectGroups(detail.books);
|
||||
|
||||
// Fire-and-forget: persist dedup groups to works table for cross-ASIN matching
|
||||
if (groups.length > 0) {
|
||||
persistDedupGroups(groups).catch(() => {});
|
||||
}
|
||||
|
||||
const collapsedBooks = await collapseByExistingWorks(dedupedBooks);
|
||||
|
||||
// Enrich books with library availability and request status
|
||||
const userId = currentUser.sub || undefined;
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(dedupedBooks, userId);
|
||||
const enrichedBooks = await enrichAudiobooksWithMatches(collapsedBooks, userId);
|
||||
|
||||
// Annotate with per-user ignore status
|
||||
const annotatedBooks = await annotateWithIgnoreStatus(enrichedBooks, userId);
|
||||
|
||||
Reference in New Issue
Block a user