Audible: HTML refresh, multi-narrator & works dedup

Switch the nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes, and per-batch cooldowns) so nightly jobs do not fail during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update the parsers to preserve all narrator anchors. Introduce two-pass dedup: an in-memory pass (deduplicateAndCollectGroups) followed by collapseByExistingWorks, which consults the works table; export metadataScore for consistent representative selection; and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into the search/author/series routes and add defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, a jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
kikootwo
2026-05-14 15:23:15 -04:00
parent 5f0855b2f8
commit fcae3bcf09
17 changed files with 1241 additions and 214 deletions
+95
View File
@@ -0,0 +1,95 @@
/**
* Component: Narrator Extraction Utility Tests
* Documentation: documentation/integrations/audible.md
*/
import { describe, expect, it } from 'vitest';
import * as cheerio from 'cheerio';
import { extractAllNarrators } from '@/lib/utils/extract-narrator';
/**
 * Test fixture helper: wraps the given HTML fragment in a
 * `<div id="item">` container and parses it with cheerio.
 *
 * @param html - Raw HTML fragment to place inside the container.
 * @returns The cheerio root (`$`) and the selection for the `#item` container (`$el`).
 */
function load(html: string) {
  const wrappedMarkup = `<div id="item">${html}</div>`;
  const root = cheerio.load(wrappedMarkup);
  const container = root('#item');
  return { $: root, $el: container };
}
// Behavioral spec for extractAllNarrators. Every case builds an HTML fragment
// through load() (which wraps it in a container element) and asserts on the
// string returned for that container. In these fixtures a "narrator link" is an
// anchor whose href carries a searchNarrator query parameter.
describe('extractAllNarrators', () => {
// Baseline: a single narrator link yields that link's text.
it('returns the single narrator name when only one searchNarrator link is present', () => {
const { $, $el } = load(
`<a href="/search?searchNarrator=Andy%20Serkis">Andy Serkis</a>`,
);
expect(extractAllNarrators($, $el)).toBe('Andy Serkis');
});
// Multi-narrator case: the fixture separates the anchors with literal commas;
// the expected result is the link texts joined with ', '.
it('joins multiple narrator names from separate searchNarrator links', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=Kristin%20Atherton">Kristin Atherton</a>,
<a href="/search?searchNarrator=Roy%20McMillan">Roy McMillan</a>,
<a href="/search?searchNarrator=Clare%20Corbett">Clare Corbett</a>,
<a href="/search?searchNarrator=Tom%20Bateman">Tom Bateman</a>,
<a href="/search?searchNarrator=Patience%20Tomlinson">Patience Tomlinson</a>,
<a href="/search?searchNarrator=Shaheen%20Khan">Shaheen Khan</a>
`);
expect(extractAllNarrators($, $el)).toBe(
'Kristin Atherton, Roy McMillan, Clare Corbett, Tom Bateman, Patience Tomlinson, Shaheen Khan',
);
});
// Names are deliberately out of alphabetical order so that any sorting inside
// the helper would make this assertion fail.
it('preserves document order (downstream sorts before comparing, but order should be stable)', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=Z">Zelda</a>
<a href="/search?searchNarrator=A">Alice</a>
<a href="/search?searchNarrator=M">Mallory</a>
`);
expect(extractAllNarrators($, $el)).toBe('Zelda, Alice, Mallory');
});
// Fallback path: with no narrator links, the .narratorLabel text is returned.
// Note the expected value keeps the "Narrated by:" prefix, i.e. this pins that
// the label text is returned as-is rather than with the prefix stripped.
it('falls back to .narratorLabel text when no searchNarrator links exist', () => {
const { $, $el } = load(
`<span class="narratorLabel">Narrated by: Single Narrator</span>`,
);
expect(extractAllNarrators($, $el)).toBe('Narrated by: Single Narrator');
});
// Precedence: when both sources exist, the link texts win and the label text
// must not appear in the result.
it('prefers searchNarrator links over .narratorLabel when both are present', () => {
const { $, $el } = load(`
<span class="narratorLabel">Narrated by: ONLY ONE</span>
<a href="/search?searchNarrator=First">First</a>
<a href="/search?searchNarrator=Second">Second</a>
`);
expect(extractAllNarrators($, $el)).toBe('First, Second');
});
// No narrator information at all: the result is an empty string.
it('returns empty string when neither links nor .narratorLabel exist', () => {
const { $, $el } = load(`<span>some other content</span>`);
expect(extractAllNarrators($, $el)).toBe('');
});
// Links with empty or whitespace-only text must not contribute empty entries
// (no leading, trailing, or doubled separators in the joined result).
it('skips empty link text and joins only non-empty names', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"></a>
<a href="/search?searchNarrator=B">Bob</a>
<a href="/search?searchNarrator=C"> </a>
<a href="/search?searchNarrator=D">Diana</a>
`);
expect(extractAllNarrators($, $el)).toBe('Bob, Diana');
});
// Surrounding spaces and newlines inside an anchor are removed from each name.
it('trims whitespace from each captured name', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"> Alice </a>
<a href="/search?searchNarrator=B">
Bob
</a>
`);
expect(extractAllNarrators($, $el)).toBe('Alice, Bob');
});
// Narrator links that exist but yield no usable text must not block the
// .narratorLabel fallback.
it('falls back to .narratorLabel when all searchNarrator links are empty', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"></a>
<a href="/search?searchNarrator=B"> </a>
<span class="narratorLabel">Fallback Narrator</span>
`);
expect(extractAllNarrators($, $el)).toBe('Fallback Narrator');
});
});
+18
View File
@@ -67,6 +67,24 @@ describe('jitteredBackoff', () => {
expect(value).toBeGreaterThanOrEqual(250);
expect(value).toBeLessThanOrEqual(750);
});
it('caps the result at maxBackoffMs when the raw backoff would exceed it', () => {
  // With attempt=10 and base=1000 the raw backoff is 2^10 * 1000 * [0.5..1.5],
  // i.e. 512_000..1_536_000 ms, so every sample lies above the 60_000ms cap.
  // The value is jittered, hence the repeated sampling.
  let remaining = 50;
  while (remaining-- > 0) {
    expect(jitteredBackoff(10, 1000, 60_000)).toBeLessThanOrEqual(60_000);
  }
});
it('returns the un-capped jittered value when below the cap', () => {
  // With attempt=0 and base=1000 the jittered range is 500..1500 ms, entirely
  // below the 60_000ms cap, so the usual jitter bounds must still hold.
  let remaining = 50;
  while (remaining-- > 0) {
    const delay = jitteredBackoff(0, 1000, 60_000);
    expect(delay).toBeGreaterThanOrEqual(500);
    expect(delay).toBeLessThanOrEqual(1500);
  }
});
});
describe('randomDelay', () => {