mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-02 20:30:10 +00:00
Audible: HTML refresh, multi-narrator & works dedup
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and add defensive dedup to the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
/**
|
||||
* Component: Narrator Extraction Utility Tests
|
||||
* Documentation: documentation/integrations/audible.md
|
||||
*/
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { extractAllNarrators } from '@/lib/utils/extract-narrator';
|
||||
|
||||
/**
 * Wraps an HTML fragment in a `<div id="item">` container and parses it with cheerio.
 *
 * @param html - Markup to place inside the wrapper element.
 * @returns The cheerio root function (`$`) and the `#item` selection (`$el`).
 */
function load(html: string) {
  const wrapped = `<div id="item">${html}</div>`;
  const $ = cheerio.load(wrapped);
  const $el = $('#item');
  return { $, $el };
}
|
||||
|
||||
/**
 * Spec for `extractAllNarrators`: extraction from `searchNarrator` links,
 * joining and ordering of multiple names, whitespace handling, and the
 * `.narratorLabel` fallback.
 */
describe('extractAllNarrators', () => {
  // Parses the fragment with `load` and runs the extractor on the wrapper element.
  const narratorsOf = (html: string) => {
    const { $, $el } = load(html);
    return extractAllNarrators($, $el);
  };

  it('returns the single narrator name when only one searchNarrator link is present', () => {
    const html = `<a href="/search?searchNarrator=Andy%20Serkis">Andy Serkis</a>`;
    expect(narratorsOf(html)).toBe('Andy Serkis');
  });

  it('joins multiple narrator names from separate searchNarrator links', () => {
    const html = `
      <a href="/search?searchNarrator=Kristin%20Atherton">Kristin Atherton</a>,
      <a href="/search?searchNarrator=Roy%20McMillan">Roy McMillan</a>,
      <a href="/search?searchNarrator=Clare%20Corbett">Clare Corbett</a>,
      <a href="/search?searchNarrator=Tom%20Bateman">Tom Bateman</a>,
      <a href="/search?searchNarrator=Patience%20Tomlinson">Patience Tomlinson</a>,
      <a href="/search?searchNarrator=Shaheen%20Khan">Shaheen Khan</a>
    `;
    const expected =
      'Kristin Atherton, Roy McMillan, Clare Corbett, Tom Bateman, Patience Tomlinson, Shaheen Khan';
    expect(narratorsOf(html)).toBe(expected);
  });

  it('preserves document order (downstream sorts before comparing, but order should be stable)', () => {
    const html = `
      <a href="/search?searchNarrator=Z">Zelda</a>
      <a href="/search?searchNarrator=A">Alice</a>
      <a href="/search?searchNarrator=M">Mallory</a>
    `;
    expect(narratorsOf(html)).toBe('Zelda, Alice, Mallory');
  });

  it('falls back to .narratorLabel text when no searchNarrator links exist', () => {
    const html = `<span class="narratorLabel">Narrated by: Single Narrator</span>`;
    expect(narratorsOf(html)).toBe('Narrated by: Single Narrator');
  });

  it('prefers searchNarrator links over .narratorLabel when both are present', () => {
    const html = `
      <span class="narratorLabel">Narrated by: ONLY ONE</span>
      <a href="/search?searchNarrator=First">First</a>
      <a href="/search?searchNarrator=Second">Second</a>
    `;
    expect(narratorsOf(html)).toBe('First, Second');
  });

  it('returns empty string when neither links nor .narratorLabel exist', () => {
    const html = `<span>some other content</span>`;
    expect(narratorsOf(html)).toBe('');
  });

  it('skips empty link text and joins only non-empty names', () => {
    const html = `
      <a href="/search?searchNarrator=A"></a>
      <a href="/search?searchNarrator=B">Bob</a>
      <a href="/search?searchNarrator=C"> </a>
      <a href="/search?searchNarrator=D">Diana</a>
    `;
    expect(narratorsOf(html)).toBe('Bob, Diana');
  });

  it('trims whitespace from each captured name', () => {
    const html = `
      <a href="/search?searchNarrator=A"> Alice </a>
      <a href="/search?searchNarrator=B">
        Bob
      </a>
    `;
    expect(narratorsOf(html)).toBe('Alice, Bob');
  });

  it('falls back to .narratorLabel when all searchNarrator links are empty', () => {
    const html = `
      <a href="/search?searchNarrator=A"></a>
      <a href="/search?searchNarrator=B"> </a>
      <span class="narratorLabel">Fallback Narrator</span>
    `;
    expect(narratorsOf(html)).toBe('Fallback Narrator');
  });
});
|
||||
@@ -67,6 +67,24 @@ describe('jitteredBackoff', () => {
|
||||
expect(value).toBeGreaterThanOrEqual(250);
|
||||
expect(value).toBeLessThanOrEqual(750);
|
||||
});
|
||||
|
||||
it('caps the result at maxBackoffMs when the raw backoff would exceed it', () => {
  // With attempt=10 and base=1000 the raw value is 2^10 * 1000 * [0.5..1.5],
  // i.e. 512_000..1_536_000, so every sample lies above the 60_000ms cap.
  const capMs = 60_000;
  // Sample repeatedly because the jitter makes each call's result vary.
  for (let sample = 0; sample < 50; sample += 1) {
    const delayMs = jitteredBackoff(10, 1000, capMs);
    expect(delayMs).toBeLessThanOrEqual(capMs);
  }
});
|
||||
|
||||
it('returns the un-capped jittered value when below the cap', () => {
  // With attempt=0 and base=1000 the jittered range is 500..1500, well under
  // the 60_000ms cap, so the cap must not alter the result.
  const lowerMs = 500;
  const upperMs = 1500;
  // Sample repeatedly because the jitter makes each call's result vary.
  for (let sample = 0; sample < 50; sample += 1) {
    const delayMs = jitteredBackoff(0, 1000, 60_000);
    expect(delayMs).toBeGreaterThanOrEqual(lowerMs);
    expect(delayMs).toBeLessThanOrEqual(upperMs);
  }
});
|
||||
});
|
||||
|
||||
describe('randomDelay', () => {
|
||||
|
||||
Reference in New Issue
Block a user