mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-03 04:40:09 +00:00
Audible: HTML refresh, multi-narrator & works dedup
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and add defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
@@ -19,6 +19,7 @@ import {
|
||||
import { RMABLogger } from '../utils/logger';
|
||||
import { parseRuntime } from '../utils/parse-runtime';
|
||||
import { randomDelay } from '../utils/scrape-resilience';
|
||||
import { extractAllNarrators } from '../utils/extract-narrator';
|
||||
|
||||
const logger = RMABLogger.create('Audible.Series');
|
||||
|
||||
@@ -442,10 +443,8 @@ function parseSeriesBooks(
|
||||
const authorHref = authorLink.attr('href') || '';
|
||||
const authorAsinMatch = authorHref.match(/\/author\/[^/]+\/([A-Z0-9]{10})/);
|
||||
|
||||
// Narrator
|
||||
const narratorText = $el.find('a[href*="searchNarrator="]').first().text().trim() ||
|
||||
$el.find('.narratorLabel').text().trim() ||
|
||||
'';
|
||||
// Narrator — capture all narrator links (multi-narrator productions are common)
|
||||
const narratorText = extractAllNarrators($, $el);
|
||||
|
||||
// Cover art
|
||||
const coverArtUrl = $el.find('img').first().attr('src')?.replace(/\._.*_\./, '._SL500_.') || '';
|
||||
|
||||
@@ -4,21 +4,26 @@
|
||||
*/
|
||||
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { RMABLogger } from '../utils/logger';
|
||||
import { getConfigService } from '../services/config.service';
|
||||
import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible';
|
||||
import {
|
||||
getLanguageForRegion,
|
||||
isAcceptedLanguage,
|
||||
stripPrefixes,
|
||||
buildContainsSelector,
|
||||
type LanguageConfig,
|
||||
} from '../constants/language-config';
|
||||
import {
|
||||
pickUserAgent,
|
||||
getBrowserHeaders,
|
||||
jitteredBackoff,
|
||||
randomDelay,
|
||||
AdaptivePacer,
|
||||
FetchResultMeta,
|
||||
} from '../utils/scrape-resilience';
|
||||
import { parseRuntime as parseRuntimeUtil } from '../utils/parse-runtime';
|
||||
import { extractAllNarrators } from '../utils/extract-narrator';
|
||||
|
||||
const logger = RMABLogger.create('Audible');
|
||||
|
||||
@@ -27,6 +32,13 @@ const AUDIBLE_PAGE_SIZE = 50;
|
||||
const CATALOG_RESPONSE_GROUPS =
|
||||
'contributors,product_desc,product_attrs,product_extended_attrs,media,rating,series,category_ladders,product_details';
|
||||
|
||||
// Retry/backoff knobs for HTML scraping (nightly refresh job only).
|
||||
// Healthy users still finish quickly — per-page success returns on attempt 0
|
||||
// with a 2-4s inter-page delay. Struggling users grind through 503 storms
|
||||
// patiently: up to ~12 retries per request, with each backoff capped at 3 min.
|
||||
const HTML_MAX_RETRIES = 12;
|
||||
const HTML_MAX_BACKOFF_MS = 180_000;
|
||||
|
||||
export interface AudibleAudiobook {
|
||||
asin: string;
|
||||
title: string;
|
||||
@@ -298,6 +310,7 @@ export class AudibleService {
|
||||
config: any = {},
|
||||
maxRetries: number = 5,
|
||||
client: AxiosInstance = this.htmlClient,
|
||||
maxBackoffMs: number = Number.POSITIVE_INFINITY,
|
||||
): Promise<{ data: any; meta: FetchResultMeta }> {
|
||||
let lastError: Error | null = null;
|
||||
let retriesUsed = 0;
|
||||
@@ -324,7 +337,7 @@ export class AudibleService {
|
||||
|
||||
retriesUsed++;
|
||||
|
||||
const backoffMs = jitteredBackoff(attempt);
|
||||
const backoffMs = jitteredBackoff(attempt, 1000, maxBackoffMs);
|
||||
logger.info(
|
||||
` Request failed (${status || 'network error'}), retrying in ${backoffMs}ms (attempt ${attempt + 1}/${maxRetries})...`,
|
||||
);
|
||||
@@ -379,6 +392,12 @@ export class AudibleService {
|
||||
throw lastError || new Error('External API request failed after retries');
|
||||
}
|
||||
|
||||
/**
|
||||
* Popular audiobooks from Audible's curated /adblbestsellers HTML page.
|
||||
* Uses HTML scraping (not the catalog API) because the API's BestSellers sort
|
||||
* is a right-now velocity rank that surfaces launch-day shovelware and preorders;
|
||||
* the HTML page reflects Audible's editorial curation.
|
||||
*/
|
||||
async getPopularAudiobooks(limit: number = 20): Promise<AudibleAudiobook[]> {
|
||||
await this.initialize();
|
||||
|
||||
@@ -395,42 +414,36 @@ export class AudibleService {
|
||||
logger.info(` Fetching page ${page}/${maxPages}...`);
|
||||
|
||||
const { data: response, meta } = await this.fetchWithRetry(
|
||||
'/1.0/catalog/products',
|
||||
'/adblbestsellers',
|
||||
{
|
||||
params: {
|
||||
products_sort_by: 'BestSellers',
|
||||
num_results: AUDIBLE_PAGE_SIZE,
|
||||
page: page - 1,
|
||||
response_groups: CATALOG_RESPONSE_GROUPS,
|
||||
ipRedirectOverride: 'true',
|
||||
pageSize: AUDIBLE_PAGE_SIZE,
|
||||
...(page > 1 ? { page } : {}),
|
||||
},
|
||||
},
|
||||
5,
|
||||
this.apiClient,
|
||||
HTML_MAX_RETRIES,
|
||||
this.htmlClient,
|
||||
HTML_MAX_BACKOFF_MS,
|
||||
);
|
||||
|
||||
const envelope: CatalogProductsResponse = response.data;
|
||||
const products = envelope.products ?? [];
|
||||
const totalResults = envelope.total_results ?? 0;
|
||||
const foundOnPage = this.parseProductListItems(
|
||||
response.data,
|
||||
audiobooks,
|
||||
limit,
|
||||
);
|
||||
|
||||
for (const product of products) {
|
||||
if (audiobooks.length >= limit) break;
|
||||
if (audiobooks.some((b) => b.asin === product.asin)) continue;
|
||||
audiobooks.push(mapCatalogProduct(product));
|
||||
logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);
|
||||
|
||||
if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
|
||||
logger.info(` Reached end of available pages`);
|
||||
break;
|
||||
}
|
||||
|
||||
logger.info(` Found ${products.length} audiobooks on page ${page}`);
|
||||
|
||||
const hasMore =
|
||||
totalResults > 0
|
||||
? totalResults > page * AUDIBLE_PAGE_SIZE
|
||||
: products.length >= AUDIBLE_PAGE_SIZE;
|
||||
|
||||
if (!hasMore) break;
|
||||
|
||||
page++;
|
||||
|
||||
if (page <= maxPages && audiobooks.length < limit) {
|
||||
await this.delay(this.apiPageDelay(meta));
|
||||
await this.delay(this.pacer.reportPageResult(meta));
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to fetch page ${page} of popular audiobooks`, {
|
||||
@@ -445,6 +458,11 @@ export class AudibleService {
|
||||
return audiobooks;
|
||||
}
|
||||
|
||||
/**
|
||||
* New release audiobooks from Audible's curated /newreleases HTML page.
|
||||
* Uses HTML scraping (not the catalog API) because the API's -ReleaseDate sort
|
||||
* returns 100% future preorders with no released-only filter available.
|
||||
*/
|
||||
async getNewReleases(limit: number = 20): Promise<AudibleAudiobook[]> {
|
||||
await this.initialize();
|
||||
|
||||
@@ -461,42 +479,36 @@ export class AudibleService {
|
||||
logger.info(` Fetching page ${page}/${maxPages}...`);
|
||||
|
||||
const { data: response, meta } = await this.fetchWithRetry(
|
||||
'/1.0/catalog/products',
|
||||
'/newreleases',
|
||||
{
|
||||
params: {
|
||||
products_sort_by: '-ReleaseDate',
|
||||
num_results: AUDIBLE_PAGE_SIZE,
|
||||
page: page - 1,
|
||||
response_groups: CATALOG_RESPONSE_GROUPS,
|
||||
ipRedirectOverride: 'true',
|
||||
pageSize: AUDIBLE_PAGE_SIZE,
|
||||
...(page > 1 ? { page } : {}),
|
||||
},
|
||||
},
|
||||
5,
|
||||
this.apiClient,
|
||||
HTML_MAX_RETRIES,
|
||||
this.htmlClient,
|
||||
HTML_MAX_BACKOFF_MS,
|
||||
);
|
||||
|
||||
const envelope: CatalogProductsResponse = response.data;
|
||||
const products = envelope.products ?? [];
|
||||
const totalResults = envelope.total_results ?? 0;
|
||||
const foundOnPage = this.parseProductListItems(
|
||||
response.data,
|
||||
audiobooks,
|
||||
limit,
|
||||
);
|
||||
|
||||
for (const product of products) {
|
||||
if (audiobooks.length >= limit) break;
|
||||
if (audiobooks.some((b) => b.asin === product.asin)) continue;
|
||||
audiobooks.push(mapCatalogProduct(product));
|
||||
logger.info(` Found ${foundOnPage} audiobooks on page ${page}`);
|
||||
|
||||
if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) {
|
||||
logger.info(` Reached end of available pages`);
|
||||
break;
|
||||
}
|
||||
|
||||
logger.info(` Found ${products.length} audiobooks on page ${page}`);
|
||||
|
||||
const hasMore =
|
||||
totalResults > 0
|
||||
? totalResults > page * AUDIBLE_PAGE_SIZE
|
||||
: products.length >= AUDIBLE_PAGE_SIZE;
|
||||
|
||||
if (!hasMore) break;
|
||||
|
||||
page++;
|
||||
|
||||
if (page <= maxPages && audiobooks.length < limit) {
|
||||
await this.delay(this.apiPageDelay(meta));
|
||||
await this.delay(this.pacer.reportPageResult(meta));
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to fetch page ${page} of new releases`, {
|
||||
@@ -791,6 +803,11 @@ export class AudibleService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Category audiobooks from Audible's HTML /search?node=<categoryId> page,
|
||||
* sorted by popularity-rank. Uses HTML scraping (not the catalog API) so
|
||||
* results match Audible's curated category-storefront ordering.
|
||||
*/
|
||||
async getCategoryBooks(categoryId: string, limit: number = 200): Promise<AudibleAudiobook[]> {
|
||||
await this.initialize();
|
||||
|
||||
@@ -805,43 +822,35 @@ export class AudibleService {
|
||||
while (audiobooks.length < limit && page <= maxPages) {
|
||||
try {
|
||||
const { data: response, meta } = await this.fetchWithRetry(
|
||||
'/1.0/catalog/products',
|
||||
'/search',
|
||||
{
|
||||
params: {
|
||||
category_id: categoryId,
|
||||
products_sort_by: 'BestSellers',
|
||||
num_results: AUDIBLE_PAGE_SIZE,
|
||||
page: page - 1,
|
||||
response_groups: CATALOG_RESPONSE_GROUPS,
|
||||
ipRedirectOverride: 'true',
|
||||
node: categoryId,
|
||||
pageSize: AUDIBLE_PAGE_SIZE,
|
||||
sort: 'popularity-rank',
|
||||
...(page > 1 ? { page } : {}),
|
||||
},
|
||||
},
|
||||
5,
|
||||
this.apiClient,
|
||||
HTML_MAX_RETRIES,
|
||||
this.htmlClient,
|
||||
HTML_MAX_BACKOFF_MS,
|
||||
);
|
||||
|
||||
const envelope: CatalogProductsResponse = response.data;
|
||||
const products = envelope.products ?? [];
|
||||
const totalResults = envelope.total_results ?? 0;
|
||||
const foundOnPage = this.parseSearchResultItems(
|
||||
response.data,
|
||||
audiobooks,
|
||||
limit,
|
||||
);
|
||||
|
||||
for (const product of products) {
|
||||
if (audiobooks.length >= limit) break;
|
||||
if (audiobooks.some((b) => b.asin === product.asin)) continue;
|
||||
audiobooks.push(mapCatalogProduct(product));
|
||||
}
|
||||
logger.info(`Category ${categoryId}: found ${foundOnPage} books on page ${page}`);
|
||||
|
||||
logger.info(`Category ${categoryId}: found ${products.length} books on page ${page}`);
|
||||
|
||||
const hasMore =
|
||||
totalResults > 0
|
||||
? totalResults > page * AUDIBLE_PAGE_SIZE
|
||||
: products.length >= AUDIBLE_PAGE_SIZE;
|
||||
|
||||
if (!hasMore) break;
|
||||
if (foundOnPage < AUDIBLE_PAGE_SIZE / 2) break;
|
||||
|
||||
page++;
|
||||
|
||||
if (page <= maxPages && audiobooks.length < limit) {
|
||||
await this.delay(this.apiPageDelay(meta));
|
||||
await this.delay(this.pacer.reportPageResult(meta));
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to fetch category ${categoryId} page ${page}`, {
|
||||
@@ -858,12 +867,148 @@ export class AudibleService {
|
||||
return audiobooks;
|
||||
}
|
||||
|
||||
private apiPageDelay(meta: FetchResultMeta): number {
|
||||
if (meta.retriesUsed > 0) {
|
||||
return this.pacer.reportPageResult(meta);
|
||||
}
|
||||
this.pacer.reportPageResult(meta);
|
||||
return randomDelay(500, 1500);
|
||||
/**
 * Look up the language configuration that corresponds to this service's
 * current region.
 *
 * @returns the `LanguageConfig` produced by `getLanguageForRegion` for `this.region`.
 */
private getLangConfig(): LanguageConfig {
  const regionLanguage = getLanguageForRegion(this.region);
  return regionLanguage;
}
|
||||
|
||||
/**
 * Parse a scraped runtime label using the language configuration of the
 * current region.
 *
 * @param runtimeText raw runtime text taken from the page.
 * @returns whatever `parseRuntimeUtil` yields for that text — a number, or
 *   `undefined`. (The visible caller stores the result as `durationMinutes`.)
 */
private parseRuntime(runtimeText: string): number | undefined {
  const langConfig = this.getLangConfig();
  return parseRuntimeUtil(runtimeText, langConfig);
}
|
||||
|
||||
/**
 * Parse the `.productListItem` blocks used by /adblbestsellers and /newreleases.
 *
 * Every matched book is appended to `audiobooks` (the caller's cumulative
 * accumulator, mutated in place). Items are skipped when no ASIN can be
 * extracted or when the ASIN is already present in the accumulator, and
 * parsing stops as soon as the accumulator holds `limit` entries.
 *
 * @param html       raw HTML of one listing page.
 * @param audiobooks accumulator shared across pages; appended to in place.
 * @param limit      maximum size the accumulator may reach.
 * @returns the number of books appended from this page.
 */
private parseProductListItems(
  html: string,
  audiobooks: AudibleAudiobook[],
  limit: number,
): number {
  const $ = cheerio.load(html);
  const langConfig = this.getLangConfig();
  // Seed the lookup once so the per-item duplicate check is O(1) instead of
  // rescanning the whole accumulator for every element on the page.
  const seenAsins = new Set<string>(audiobooks.map((book) => book.asin));
  let foundOnPage = 0;

  $('.productListItem').each((_index, element) => {
    // Returning false stops the cheerio iteration once the limit is reached.
    if (audiobooks.length >= limit) return false;

    const $el = $(element);

    // Prefer the data-asin attribute; fall back to the ASIN embedded in a product link.
    const asin =
      $el.find('li').attr('data-asin') ||
      $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
      '';
    if (!asin) return;
    if (seenAsins.has(asin)) return;

    const title =
      $el.find('h3 a').text().trim() ||
      $el.find('.bc-heading a').text().trim();

    const authorText =
      $el.find('.authorLabel').text().trim() ||
      $el.find('.bc-size-small .bc-text-bold').first().text().trim();

    const authorHref = $el.find('a[href*="/author/"]').first().attr('href') || '';
    const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

    // Narrator — capture all narrator links (multi-narrator productions are common);
    // fall back to the second bc-text-bold element for layouts where
    // extractAllNarrators finds nothing.
    const narratorText =
      extractAllNarrators($, $el) ||
      $el.find('.bc-size-small .bc-text-bold').eq(1).text().trim();

    const coverArtUrl = $el.find('img').attr('src') || '';

    // parseFloat yields NaN when the label does not start with a number;
    // treat that as "no rating" rather than storing NaN on the book.
    const ratingText = $el.find('.ratingsLabel').text().trim();
    const parsedRating = ratingText ? parseFloat(ratingText.split(' ')[0]) : Number.NaN;
    const rating = Number.isFinite(parsedRating) ? parsedRating : undefined;

    audiobooks.push({
      asin,
      title,
      author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
      authorAsin: authorAsinMatch?.[1] || undefined,
      narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
      // Rewrite the image-size token in the URL to the SL500 variant.
      coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
      rating,
    });
    seenAsins.add(asin);

    foundOnPage++;
  });

  return foundOnPage;
}
|
||||
|
||||
/**
 * Parse the `.s-result-item` / `.productListItem` blocks used by
 * /search?node=<categoryId>.
 *
 * Every matched book is appended to `audiobooks` (the caller's cumulative
 * accumulator, mutated in place). Items are skipped when no ASIN can be
 * extracted or when the ASIN is already present in the accumulator, and
 * parsing stops as soon as the accumulator holds `limit` entries.
 *
 * @param html       raw HTML of one search-result page.
 * @param audiobooks accumulator shared across pages; appended to in place.
 * @param limit      maximum size the accumulator may reach.
 * @returns the number of books appended from this page.
 */
private parseSearchResultItems(
  html: string,
  audiobooks: AudibleAudiobook[],
  limit: number,
): number {
  const $ = cheerio.load(html);
  const langConfig = this.getLangConfig();
  // Seed the lookup once so the per-item duplicate check is O(1) instead of
  // rescanning the whole accumulator for every element on the page.
  const seenAsins = new Set<string>(audiobooks.map((b) => b.asin));
  let foundOnPage = 0;

  $('.s-result-item, .productListItem').each((_index, element) => {
    // Returning false stops the cheerio iteration once the limit is reached.
    if (audiobooks.length >= limit) return false;

    const $el = $(element);

    // Prefer the data-asin attribute; fall back to the ASIN embedded in a product link.
    const asin =
      $el.find('li').attr('data-asin') ||
      $el.find('a').attr('href')?.match(/\/(?:pd|ac)\/[^\/]+\/([A-Z0-9]{10})/)?.[1] ||
      '';
    if (!asin) return;
    if (seenAsins.has(asin)) return;

    const title =
      $el.find('h2').first().text().trim() ||
      $el.find('h3 a').text().trim() ||
      $el.find('.bc-heading a').text().trim();

    const authorLink = $el.find('a[href*="/author/"]').first();
    const authorText =
      authorLink.text().trim() ||
      $el.find('.authorLabel').text().trim();
    const authorHref = authorLink.attr('href') || '';
    const authorAsinMatch = authorHref.match(/\/author\/[^\/]+\/([A-Z0-9]{10})/);

    // Narrator — capture all narrator links (multi-narrator productions are common)
    const narratorText = extractAllNarrators($, $el);

    const coverArtUrl = $el.find('img').attr('src') || '';

    // Runtime: dedicated label first, then any span matching the localized length labels.
    const runtimeText =
      $el.find('.runtimeLabel').text().trim() ||
      $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
    const durationMinutes = this.parseRuntime(runtimeText);

    // parseFloat yields NaN when the label does not start with a number;
    // treat that as "no rating" rather than storing NaN on the book.
    const ratingText =
      $el.find('.ratingsLabel').text().trim() ||
      $el.find('.a-icon-star span').first().text().trim();
    const parsedRating = ratingText ? parseFloat(ratingText.split(' ')[0]) : Number.NaN;
    const rating = Number.isFinite(parsedRating) ? parsedRating : undefined;

    audiobooks.push({
      asin,
      title,
      author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
      authorAsin: authorAsinMatch?.[1] || undefined,
      narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
      // Rewrite the image-size token in the URL to the SL500 variant.
      coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
      durationMinutes,
      rating,
    });
    seenAsins.add(asin);

    foundOnPage++;
  });

  return foundOnPage;
}
|
||||
|
||||
private async delay(ms: number): Promise<void> {
|
||||
|
||||
@@ -138,16 +138,37 @@ async function persistSectionBooks(
|
||||
logger: ReturnType<typeof RMABLogger.forJob>,
|
||||
labelForErrors: string,
|
||||
): Promise<number> {
|
||||
// Defensive dedup: the (asin, categoryId) unique constraint means a duplicate ASIN
|
||||
// in `books` crashes the second .create() with P2002. The HTML parser already dedupes
|
||||
// per page and across pages against the cumulative accumulator, but a warn-on-fire
|
||||
// signal here lets us detect upstream surprises (e.g. Audible serving the same item
|
||||
// in both a carousel and the main grid) without the noisy duplicate-key Postgres
|
||||
// errors. Keep the first occurrence so Audible's editorial ordering is preserved.
|
||||
const seenAsins = new Set<string>();
|
||||
const dedupedBooks = books.filter((b) => {
|
||||
if (!b?.asin || seenAsins.has(b.asin)) return false;
|
||||
seenAsins.add(b.asin);
|
||||
return true;
|
||||
});
|
||||
const droppedCount = books.length - dedupedBooks.length;
|
||||
if (droppedCount > 0) {
|
||||
logger.warn(
|
||||
`Dropped ${droppedCount} duplicate ASIN(s) from ${categoryId} input list before persist`,
|
||||
);
|
||||
}
|
||||
|
||||
// Wipe previous entries for this section
|
||||
logger.info(`Clearing previous data for ${categoryId}...`);
|
||||
await prisma.audibleCacheCategory.deleteMany({
|
||||
where: { categoryId },
|
||||
});
|
||||
logger.info(`Cleared previous entries for ${categoryId}, saving ${books.length} books...`);
|
||||
logger.info(
|
||||
`Cleared previous entries for ${categoryId}, saving ${dedupedBooks.length} books...`,
|
||||
);
|
||||
|
||||
let saved = 0;
|
||||
for (let i = 0; i < books.length; i++) {
|
||||
const book = books[i];
|
||||
for (let i = 0; i < dedupedBooks.length; i++) {
|
||||
const book = dedupedBooks[i];
|
||||
try {
|
||||
// Cache thumbnail if coverArtUrl exists
|
||||
let cachedCoverPath: string | null = null;
|
||||
|
||||
@@ -9,7 +9,8 @@
|
||||
|
||||
import { prisma } from '@/lib/db';
|
||||
import { RMABLogger } from '@/lib/utils/logger';
|
||||
import type { DedupGroup } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import { metadataScore, type DedupGroup } from '@/lib/utils/deduplicate-audiobooks';
|
||||
import type { AudibleAudiobook } from '@/lib/integrations/audible.service';
|
||||
|
||||
const logger = RMABLogger.create('WorksService');
|
||||
|
||||
@@ -182,6 +183,96 @@ export async function seedAsin(
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// View-level collapse (consult the works table after local dedup)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Merge books that the works table already records as belonging to the same Work.
 *
 * This runs after the local, stateless dedup pass and catches ASINs that pass
 * could not pair up (for example because their scraped metadata diverged, or
 * because they arrived on different pages and were never compared).
 *
 * Contract:
 *  - ASINs that map to the same workId are reduced to one representative, the
 *    one with the highest `metadataScore()`; on a tie the earlier entry stays.
 *  - ASINs with no row in the works table are passed through as-is.
 *  - Output order follows input order: a work's representative occupies the
 *    position where that work was first encountered.
 *  - Lists of length 0 or 1, and lookups that return no rows, return the input
 *    array itself.
 *  - Any thrown error (e.g. from the DB query) is logged and the input is
 *    returned unchanged, so the view still renders.
 *
 * @param books candidate books, already locally deduplicated.
 * @returns the collapsed list, or `books` itself when nothing could be looked up.
 */
export async function collapseByExistingWorks(
  books: AudibleAudiobook[],
): Promise<AudibleAudiobook[]> {
  // A list of zero or one cannot contain siblings.
  if (books.length <= 1) return books;

  try {
    const workRows = await prisma.workAsin.findMany({
      where: { asin: { in: books.map(b => b.asin) } },
      select: { asin: true, workId: true },
    });

    if (workRows.length === 0) return books;

    // ASIN → workId lookup table for the single pass below.
    const workIdByAsin = new Map<string, string>();
    workRows.forEach((row) => {
      workIdByAsin.set(row.asin, row.workId);
    });

    // `kept` is the output; `slotByWorkId` remembers where each work's
    // current representative lives inside it.
    const kept: AudibleAudiobook[] = [];
    const slotByWorkId = new Map<string, number>();

    for (const candidate of books) {
      const workId = workIdByAsin.get(candidate.asin);
      const slot = workId ? slotByWorkId.get(workId) : undefined;

      if (!workId || slot === undefined) {
        // Either not part of any work, or the first member of its work seen so far.
        if (workId) slotByWorkId.set(workId, kept.length);
        kept.push(candidate);
      } else if (metadataScore(candidate) > metadataScore(kept[slot])) {
        // Strictly richer sibling replaces the current representative in place;
        // ties leave the earlier entry untouched.
        kept[slot] = candidate;
      }
    }

    const collapsed = books.length - kept.length;
    if (collapsed > 0) {
      logger.debug('Collapsed books via works table', {
        inputCount: books.length,
        outputCount: kept.length,
        collapsed,
      });
    }

    return kept;
  } catch (error) {
    // Degrade to local-dedup-only behavior rather than failing the caller.
    logger.error('collapseByExistingWorks failed; returning input unchanged', {
      error: error instanceof Error ? error.message : String(error),
      bookCount: books.length,
    });
    return books;
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sibling ASIN lookup (for library matching expansion)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -109,7 +109,12 @@ export function areDurationsCompatible(a?: number, b?: number): boolean {
|
||||
// Metadata scoring (for picking best representative)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function metadataScore(book: AudibleAudiobook): number {
|
||||
/**
|
||||
* Score a book by how much metadata it carries. Used as the tie-breaker when
|
||||
* collapsing duplicates — the entry with the richest metadata wins. Exported
|
||||
* so the works-table collapse pass can apply the same ranking.
|
||||
*/
|
||||
export function metadataScore(book: AudibleAudiobook): number {
|
||||
let score = 0;
|
||||
if (book.coverArtUrl) score++;
|
||||
if (book.rating != null) score++;
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
/**
|
||||
* Component: Narrator Extraction Utility
|
||||
* Documentation: documentation/integrations/audible.md
|
||||
*
|
||||
* Shared helper for Audible HTML scrapers. Audible product listings render
|
||||
* each narrator as a separate `<a href="?searchNarrator=...">` link; using
|
||||
* `.first()` on that selector silently drops co-narrators and breaks dedup
|
||||
* for multi-narrator productions (e.g. full-cast audiobooks). This helper
|
||||
* captures every narrator link and joins them, falling back to the
|
||||
* `.narratorLabel` span when no anchor links are present.
|
||||
*/
|
||||
|
||||
import type * as cheerio from 'cheerio';
|
||||
import type { AnyNode } from 'domhandler';
|
||||
|
||||
/**
 * Build a single narrator string for one Audible product list item.
 *
 * All `a[href*="searchNarrator="]` anchors under `$el` are read in document
 * order; their trimmed, non-empty texts are joined with ", ". When there are
 * no such anchors, or none of them carries text, the trimmed text of
 * `.narratorLabel` is returned instead (which may be the empty string).
 *
 * @param $   cheerio root used to wrap each matched anchor.
 * @param $el the product list item to search within.
 * @returns comma-joined narrator names, or the `.narratorLabel` fallback text.
 */
export function extractAllNarrators(
  $: cheerio.CheerioAPI,
  $el: cheerio.Cheerio<AnyNode>,
): string {
  const narratorNames = $el
    .find('a[href*="searchNarrator="]')
    .toArray()
    .map((anchor) => $(anchor).text().trim())
    .filter((name) => name.length > 0);

  return narratorNames.length > 0
    ? narratorNames.join(', ')
    : $el.find('.narratorLabel').text().trim();
}
|
||||
@@ -38,12 +38,18 @@ export function getBrowserHeaders(userAgent: string): Record<string, string> {
|
||||
}
|
||||
|
||||
/**
 * Jittered exponential backoff: 2^attempt * baseMs * random(0.5, 1.5),
 * optionally capped so high attempt counts don't produce absurd waits.
 * Avoids predictable retry timing that is trivially fingerprinted.
 *
 * The cap is applied after the jitter, so once the jittered value exceeds
 * `maxBackoffMs` the result is exactly `maxBackoffMs`.
 *
 * @param attempt      zero-based retry attempt used as the exponent.
 * @param baseMs       base delay in milliseconds (default 1000).
 * @param maxBackoffMs upper bound on the returned delay (default: no cap).
 * @returns the delay in milliseconds, rounded to the nearest integer.
 */
export function jitteredBackoff(
  attempt: number,
  baseMs: number = 1000,
  maxBackoffMs: number = Number.POSITIVE_INFINITY,
): number {
  const jitter = 0.5 + Math.random(); // 0.5 – 1.5
  const raw = Math.pow(2, attempt) * baseMs * jitter;
  return Math.round(Math.min(raw, maxBackoffMs));
}
|
||||
|
||||
/** Random integer in [minMs, maxMs] */
|
||||
|
||||
Reference in New Issue
Block a user