Audible: HTML refresh, multi-narrator & works dedup

Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: an in-memory deduplicateAndCollectGroups pass plus a collapseByExistingWorks pass that consults the works table; export metadataScore for consistent representative selection; and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into the search/author/series routes and add defensive ASIN dedup to the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, a jitteredBackoff cap, and tests for the new behaviors.
This commit is contained in:
kikootwo
2026-05-14 15:23:15 -04:00
parent 5f0855b2f8
commit fcae3bcf09
17 changed files with 1241 additions and 214 deletions
+371 -88
View File
@@ -81,6 +81,122 @@ function apiResponse(envelope: object) {
return { data: envelope };
}
// ---------------------------------------------------------------------------
// HTML fixture helpers (for getPopularAudiobooks / getNewReleases / getCategoryBooks,
// which scrape Audible's curated HTML pages)
// ---------------------------------------------------------------------------
/**
 * Optional per-field overrides accepted by the HTML fixture builders in this
 * file (makeProductListItemHtml and makeSearchResultItemHtml). Every field is
 * optional; an omitted field falls back to the default declared in the builder.
 */
interface HtmlBookOverrides {
/** Written to the `<li data-asin>` attribute and the `/pd/test/<asin>` title link. */
asin?: string;
/** Text of the title anchor. */
title?: string;
/** Text of the author anchor. */
author?: string;
/** Last path segment of the author anchor's `/author/test/<authorAsin>` href. */
authorAsin?: string;
/**
 * Single-narrator shorthand. The builders only use it when `narrators` is
 * missing or empty; a non-empty `narrators` takes precedence.
 */
narrator?: string;
/** Multi-narrator productions render each name as its own searchNarrator anchor. */
narrators?: string[];
/** `src` of the rendered `<img>`. */
coverArtUrl?: string;
/** Rendered into the .ratingsLabel span as "<rating> out of 5". */
rating?: number;
}
/**
 * Render one or more narrator anchor links suitable for embedding in .narratorLabel.
 *
 * Each name becomes `<a href="/search?searchNarrator=<url-encoded name>">name</a>`
 * and the anchors are joined with ", " in input order. The visible text is
 * HTML-escaped so a name containing `&`, `<` or `>` still yields well-formed
 * markup whose text content round-trips to the original name.
 *
 * @param names Narrator display names, in the order they should appear.
 * @returns The concatenated anchor markup, or an empty string for an empty list.
 */
function renderNarratorLinks(names: string[]): string {
  // Text-node escaping only: `&` must be replaced first so the entities
  // produced for `<` and `>` are not escaped a second time.
  const escapeText = (text: string): string =>
    text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

  return names
    .map(
      (name) =>
        `<a href="/search?searchNarrator=${encodeURIComponent(name)}">${escapeText(name)}</a>`,
    )
    .join(', ');
}
/**
 * Builds the markup for one `.productListItem` entry in the shape that
 * parseProductListItems() is written against: an `<li data-asin>` nested in
 * the item, with the `/pd/...` title link also present as the fallback hook.
 *
 * When `narrators` is non-empty, the .narratorLabel span contains
 * "Narrated by: " followed by one searchNarrator anchor per name; otherwise the
 * span holds the plain `narrator` text, which keeps older single-narrator
 * tests working unchanged.
 */
function makeProductListItemHtml(overrides: HtmlBookOverrides = {}): string {
  const {
    asin = 'B000000001',
    title = 'Test Book',
    author = 'Test Author',
    authorAsin = 'A000000001',
    narrator = 'Test Narrator',
    narrators,
    coverArtUrl = 'https://images.example.com/cover._SL500_.jpg',
    rating = 4.5,
  } = overrides;

  // Multi-narrator productions get one anchor per name; the single-narrator
  // case stays as bare text inside the label.
  let narratorLabelBody: string;
  if (narrators && narrators.length > 0) {
    narratorLabelBody = `Narrated by: ${renderNarratorLinks(narrators)}`;
  } else {
    narratorLabelBody = narrator;
  }

  // The leading and trailing empty entries reproduce the newline that opens
  // and closes the fixture block.
  const lines = [
    '',
    '<div class="productListItem">',
    '<ul>',
    `<li data-asin="${asin}">`,
    `<img src="${coverArtUrl}" />`,
    `<h3><a href="/pd/test/${asin}">${title}</a></h3>`,
    `<a class="authorLabel" href="/author/test/${authorAsin}">${author}</a>`,
    `<span class="narratorLabel">${narratorLabelBody}</span>`,
    `<span class="ratingsLabel">${rating} out of 5</span>`,
    '</li>',
    '</ul>',
    '</div>',
    '',
  ];
  return lines.join('\n');
}
/**
 * Builds the markup for one `.s-result-item` entry in the shape that
 * parseSearchResultItems() is written against. These fixtures stand in for
 * the /search?node=<categoryId> category pages.
 *
 * Narrators are always rendered as searchNarrator anchors: one per entry of
 * `narrators` when that list is non-empty, otherwise a single anchor for
 * `narrator`.
 */
function makeSearchResultItemHtml(overrides: HtmlBookOverrides = {}): string {
  const {
    asin = 'B000000001',
    title = 'Test Book',
    author = 'Test Author',
    authorAsin = 'A000000001',
    narrator = 'Test Narrator',
    narrators,
    coverArtUrl = 'https://images.example.com/cover._SL500_.jpg',
    rating = 4.5,
  } = overrides;

  // A lone narrator is just a one-element cast, so both cases share the
  // same anchor renderer.
  const cast = narrators && narrators.length > 0 ? narrators : [narrator];
  const narratorLinks = renderNarratorLinks(cast);

  // The leading and trailing empty entries reproduce the newline that opens
  // and closes the fixture block.
  const lines = [
    '',
    '<div class="s-result-item">',
    '<ul>',
    `<li data-asin="${asin}">`,
    `<img src="${coverArtUrl}" />`,
    `<h2><a href="/pd/test/${asin}">${title}</a></h2>`,
    `<a href="/author/test/${authorAsin}">${author}</a>`,
    narratorLinks,
    `<span class="ratingsLabel">${rating} out of 5</span>`,
    '</li>',
    '</ul>',
    '</div>',
    '',
  ];
  return lines.join('\n');
}
/**
 * Concatenates the given item fragments (no separator) and wraps them in a
 * bare `<html><body>…</body></html>` document.
 */
function makeHtmlPage(items: string[]): string {
  const body = items.join('');
  return '<html><body>' + body + '</body></html>';
}
/**
 * Shapes a raw HTML string into the object that a mocked client.get() should
 * resolve to. The service passes response.data to cheerio.load(), so `data`
 * carries the HTML text itself rather than a parsed structure.
 */
function htmlResponse(html: string) {
  const response = { data: html };
  return response;
}
// ---------------------------------------------------------------------------
// Test setup
// ---------------------------------------------------------------------------
@@ -683,61 +799,66 @@ describe('AudibleService', () => {
});
// -------------------------------------------------------------------------
// getPopularAudiobooks()
// getPopularAudiobooks() — HTML scraping of /adblbestsellers
// -------------------------------------------------------------------------
describe('getPopularAudiobooks()', () => {
it('uses products_sort_by: BestSellers', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('hits /adblbestsellers on the htmlClient with pageSize=50', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
await service.getPopularAudiobooks(1);
expect(apiClientMock.get.mock.calls[0][1].params.products_sort_by).toBe('BestSellers');
expect(htmlClientMock.get).toHaveBeenCalledWith(
'/adblbestsellers',
expect.objectContaining({
params: expect.objectContaining({ pageSize: 50 }),
}),
);
});
it('subtracts 1 from public page=1 before calling the API', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('does not include a page param on the first request (only from page 2 onward)', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getPopularAudiobooks(1);
expect(apiClientMock.get.mock.calls[0][1].params.page).toBe(0);
expect(htmlClientMock.get.mock.calls[0][1].params.page).toBeUndefined();
delaySpy.mockRestore();
});
it('makes a second call with page=1 when paginating to page 2', async () => {
const page1Products = Array.from({ length: 50 }, (_, i) =>
makeProduct({ asin: `B${String(i).padStart(9, '0')}`, title: `Book ${i}` }),
it('includes page=2 on the second request when paginating', async () => {
const page1Items = Array.from({ length: 50 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i).padStart(9, '0')}`, title: `Book ${i}` }),
);
const page2Products = Array.from({ length: 25 }, (_, i) =>
makeProduct({ asin: `B${String(i + 50).padStart(9, '0')}`, title: `Book ${i + 50}` }),
const page2Items = Array.from({ length: 25 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i + 50).padStart(9, '0')}`, title: `Book ${i + 50}` }),
);
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page1Products, 75)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page2Products, 75)));
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getPopularAudiobooks(75);
expect(apiClientMock.get.mock.calls[1][1].params.page).toBe(1);
expect(htmlClientMock.get.mock.calls[1][1].params.page).toBe(2);
delaySpy.mockRestore();
});
it('paginates and returns up to the requested limit', async () => {
const page1Products = Array.from({ length: 50 }, (_, i) =>
makeProduct({ asin: `B${String(i).padStart(9, '0')}`, title: `Book ${i}` }),
it('paginates across pages and returns up to the requested limit', async () => {
const page1Items = Array.from({ length: 50 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i).padStart(9, '0')}`, title: `Book ${i}` }),
);
const page2Products = Array.from({ length: 25 }, (_, i) =>
makeProduct({ asin: `B${String(i + 50).padStart(9, '0')}`, title: `Book ${i + 50}` }),
const page2Items = Array.from({ length: 25 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i + 50).padStart(9, '0')}`, title: `Book ${i + 50}` }),
);
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page1Products, 75)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page2Products, 75)));
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
@@ -747,176 +868,338 @@ describe('AudibleService', () => {
delaySpy.mockRestore();
});
it('stops early when a page returns fewer than the page size', async () => {
const products = [makeProduct()];
apiClientMock.get.mockResolvedValueOnce(apiResponse(makeProductsResponse(products, 1)));
it('stops early when a page returns fewer than half the page size', async () => {
htmlClientMock.get.mockResolvedValueOnce(
htmlResponse(makeHtmlPage([makeProductListItemHtml()])),
);
const service = new AudibleService();
const results = await service.getPopularAudiobooks(50);
expect(results).toHaveLength(1);
expect(apiClientMock.get).toHaveBeenCalledTimes(1);
expect(htmlClientMock.get).toHaveBeenCalledTimes(1);
});
it('deduplicates by ASIN across pages', async () => {
const sharedProduct = makeProduct({ asin: 'BDUP000001', title: 'Duplicated Book' });
const uniqueProduct = makeProduct({ asin: 'BUNIQ000001', title: 'Unique Book' });
const sharedAsin = 'BDUP000001';
const uniqueAsin = 'BUNIQ000001';
apiClientMock.get
.mockResolvedValueOnce(
apiResponse(makeProductsResponse([sharedProduct], 51)),
)
.mockResolvedValueOnce(
// page 2 returns the same ASIN plus a new one
apiResponse(makeProductsResponse([sharedProduct, uniqueProduct], 51)),
);
// Build a "full" first page (50 items: the shared ASIN plus 49 unique fillers)
// so the parser proceeds to page 2.
const page1Items = [
makeProductListItemHtml({ asin: sharedAsin, title: 'Duplicated Book' }),
...Array.from({ length: 49 }, (_, i) =>
makeProductListItemHtml({ asin: `BFILL${String(i).padStart(5, '0')}`, title: `Filler ${i}` }),
),
];
const page2Items = [
makeProductListItemHtml({ asin: sharedAsin, title: 'Duplicated Book' }),
makeProductListItemHtml({ asin: uniqueAsin, title: 'Unique Book' }),
...Array.from({ length: 48 }, (_, i) =>
makeProductListItemHtml({ asin: `BFILL2${String(i).padStart(4, '0')}`, title: `Filler2 ${i}` }),
),
];
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
const results = await service.getPopularAudiobooks(100);
const results = await service.getPopularAudiobooks(150);
const asins = results.map((r) => r.asin);
expect(asins.filter((a) => a === 'BDUP000001')).toHaveLength(1);
expect(asins.filter((a) => a === sharedAsin)).toHaveLength(1);
expect(asins).toContain(uniqueAsin);
delaySpy.mockRestore();
});
it('returns empty array on error without throwing', async () => {
const error: Error & { response?: { status: number } } = new Error('Not Found');
error.response = { status: 404 };
apiClientMock.get.mockRejectedValue(error);
htmlClientMock.get.mockRejectedValue(error);
const service = new AudibleService();
const results = await service.getPopularAudiobooks(5);
expect(results).toEqual([]);
});
it('uses htmlClient (not apiClient) for the request', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
await service.getPopularAudiobooks(1);
expect(htmlClientMock.get).toHaveBeenCalled();
expect(apiClientMock.get).not.toHaveBeenCalled();
});
it('maps title, author, narrator, and rating from the parsed item', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(
makeHtmlPage([
makeProductListItemHtml({
asin: 'B0HTMLMAP1',
title: 'Mapped Title',
author: 'Mapped Author',
authorAsin: 'A00MAPAUTH',
narrator: 'Mapped Narrator',
rating: 4.7,
}),
]),
),
);
const service = new AudibleService();
const [book] = await service.getPopularAudiobooks(1);
expect(book.asin).toBe('B0HTMLMAP1');
expect(book.title).toBe('Mapped Title');
expect(book.author).toBe('Mapped Author');
expect(book.authorAsin).toBe('A00MAPAUTH');
expect(book.narrator).toBe('Mapped Narrator');
expect(book.rating).toBeCloseTo(4.7);
});
it('captures every co-narrator on multi-narrator productions (regression: prior code took only the first link)', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(
makeHtmlPage([
makeProductListItemHtml({
asin: 'B0FULLCAST',
narrators: [
'Kristin Atherton',
'Roy McMillan',
'Clare Corbett',
'Tom Bateman',
'Patience Tomlinson',
'Shaheen Khan',
],
}),
]),
),
);
const service = new AudibleService();
const [book] = await service.getPopularAudiobooks(1);
// Every narrator must round-trip — order is not significant downstream,
// but document order should be preserved for stable cache values.
expect(book.narrator).toBe(
'Kristin Atherton, Roy McMillan, Clare Corbett, Tom Bateman, Patience Tomlinson, Shaheen Khan',
);
});
});
// -------------------------------------------------------------------------
// getNewReleases()
// getNewReleases() — HTML scraping of /newreleases
// -------------------------------------------------------------------------
describe('getNewReleases()', () => {
it('uses products_sort_by: -ReleaseDate', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('hits /newreleases on the htmlClient with pageSize=50', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
await service.getNewReleases(1);
expect(apiClientMock.get.mock.calls[0][1].params.products_sort_by).toBe('-ReleaseDate');
expect(htmlClientMock.get).toHaveBeenCalledWith(
'/newreleases',
expect.objectContaining({
params: expect.objectContaining({ pageSize: 50 }),
}),
);
});
it('subtracts 1 from public page=1 before calling the API', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('does not include a page param on the first request', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getNewReleases(1);
expect(apiClientMock.get.mock.calls[0][1].params.page).toBe(0);
expect(htmlClientMock.get.mock.calls[0][1].params.page).toBeUndefined();
delaySpy.mockRestore();
});
it('subtracts 1 from public page=2 when paginating to the second page', async () => {
const page1Products = Array.from({ length: 50 }, (_, i) =>
makeProduct({ asin: `B${String(i).padStart(9, '0')}` }),
it('includes page=2 on the second request when paginating', async () => {
const page1Items = Array.from({ length: 50 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i).padStart(9, '0')}` }),
);
const page2Items = Array.from({ length: 50 }, (_, i) =>
makeProductListItemHtml({ asin: `B${String(i + 50).padStart(9, '0')}` }),
);
const page2Products = [makeProduct({ asin: 'BNEW000099' })];
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page1Products, 51)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page2Products, 51)));
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getNewReleases(51);
expect(apiClientMock.get.mock.calls[1][1].params.page).toBe(1);
await service.getNewReleases(100);
expect(htmlClientMock.get.mock.calls[1][1].params.page).toBe(2);
delaySpy.mockRestore();
});
it('deduplicates by ASIN across pages', async () => {
const sharedProduct = makeProduct({ asin: 'BDUP000002' });
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse([sharedProduct], 51)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse([sharedProduct], 51)));
const sharedAsin = 'BDUP000002';
const page1Items = [
makeProductListItemHtml({ asin: sharedAsin }),
...Array.from({ length: 49 }, (_, i) =>
makeProductListItemHtml({ asin: `BNEW${String(i).padStart(6, '0')}` }),
),
];
const page2Items = [
makeProductListItemHtml({ asin: sharedAsin }),
...Array.from({ length: 49 }, (_, i) =>
makeProductListItemHtml({ asin: `BNEW2${String(i).padStart(5, '0')}` }),
),
];
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
const results = await service.getNewReleases(100);
const results = await service.getNewReleases(150);
expect(results.filter((r) => r.asin === 'BDUP000002')).toHaveLength(1);
expect(results.filter((r) => r.asin === sharedAsin)).toHaveLength(1);
delaySpy.mockRestore();
});
it('returns empty array on error without throwing', async () => {
const error: Error & { response?: { status: number } } = new Error('Not Found');
error.response = { status: 404 };
apiClientMock.get.mockRejectedValue(error);
htmlClientMock.get.mockRejectedValue(error);
const service = new AudibleService();
const results = await service.getNewReleases(5);
expect(results).toEqual([]);
});
it('uses htmlClient (not apiClient) for the request', async () => {
htmlClientMock.get.mockResolvedValue(htmlResponse(makeHtmlPage([makeProductListItemHtml()])));
const service = new AudibleService();
await service.getNewReleases(1);
expect(htmlClientMock.get).toHaveBeenCalled();
expect(apiClientMock.get).not.toHaveBeenCalled();
});
});
// -------------------------------------------------------------------------
// getCategoryBooks()
// getCategoryBooks() — HTML scraping of /search?node=<categoryId>
// -------------------------------------------------------------------------
describe('getCategoryBooks()', () => {
it('sends category_id and BestSellers sort param', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('hits /search on the htmlClient with node, pageSize, and popularity-rank sort', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(makeHtmlPage([makeSearchResultItemHtml()])),
);
const service = new AudibleService();
await service.getCategoryBooks('18685580011', 1);
const params = apiClientMock.get.mock.calls[0][1].params;
expect(params.category_id).toBe('18685580011');
expect(params.products_sort_by).toBe('BestSellers');
const params = htmlClientMock.get.mock.calls[0][1].params;
expect(htmlClientMock.get.mock.calls[0][0]).toBe('/search');
expect(params.node).toBe('18685580011');
expect(params.pageSize).toBe(50);
expect(params.sort).toBe('popularity-rank');
});
it('subtracts 1 from public page=1 before calling the API', async () => {
apiClientMock.get.mockResolvedValue(apiResponse(makeProductsResponse([])));
it('does not include a page param on the first request', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(makeHtmlPage([makeSearchResultItemHtml()])),
);
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getCategoryBooks('CAT001', 1);
expect(apiClientMock.get.mock.calls[0][1].params.page).toBe(0);
expect(htmlClientMock.get.mock.calls[0][1].params.page).toBeUndefined();
delaySpy.mockRestore();
});
it('subtracts 1 from public page=2 when paginating to the second page', async () => {
const page1Products = Array.from({ length: 50 }, (_, i) =>
makeProduct({ asin: `B${String(i).padStart(9, '0')}` }),
it('includes page=2 on the second request when paginating', async () => {
const page1Items = Array.from({ length: 50 }, (_, i) =>
makeSearchResultItemHtml({ asin: `B${String(i).padStart(9, '0')}` }),
);
const page2Items = Array.from({ length: 50 }, (_, i) =>
makeSearchResultItemHtml({ asin: `B${String(i + 50).padStart(9, '0')}` }),
);
const page2Products = [makeProduct({ asin: 'BCAT000099' })];
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page1Products, 51)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse(page2Products, 51)));
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
await service.getCategoryBooks('CAT001', 51);
expect(apiClientMock.get.mock.calls[1][1].params.page).toBe(1);
await service.getCategoryBooks('CAT001', 100);
expect(htmlClientMock.get.mock.calls[1][1].params.page).toBe(2);
delaySpy.mockRestore();
});
it('deduplicates by ASIN across pages', async () => {
const sharedProduct = makeProduct({ asin: 'BDUP000003' });
apiClientMock.get
.mockResolvedValueOnce(apiResponse(makeProductsResponse([sharedProduct], 51)))
.mockResolvedValueOnce(apiResponse(makeProductsResponse([sharedProduct], 51)));
const sharedAsin = 'BDUP000003';
const page1Items = [
makeSearchResultItemHtml({ asin: sharedAsin }),
...Array.from({ length: 49 }, (_, i) =>
makeSearchResultItemHtml({ asin: `BCAT${String(i).padStart(6, '0')}` }),
),
];
const page2Items = [
makeSearchResultItemHtml({ asin: sharedAsin }),
...Array.from({ length: 49 }, (_, i) =>
makeSearchResultItemHtml({ asin: `BCAT2${String(i).padStart(5, '0')}` }),
),
];
htmlClientMock.get
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page1Items)))
.mockResolvedValueOnce(htmlResponse(makeHtmlPage(page2Items)));
const service = new AudibleService();
const delaySpy = vi.spyOn(service as any, 'delay').mockResolvedValue(undefined);
const results = await service.getCategoryBooks('CAT001', 100);
const results = await service.getCategoryBooks('CAT001', 150);
expect(results.filter((r) => r.asin === 'BDUP000003')).toHaveLength(1);
expect(results.filter((r) => r.asin === sharedAsin)).toHaveLength(1);
delaySpy.mockRestore();
});
it('uses htmlClient (not apiClient) for the request', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(makeHtmlPage([makeSearchResultItemHtml()])),
);
const service = new AudibleService();
await service.getCategoryBooks('CAT001', 1);
expect(htmlClientMock.get).toHaveBeenCalled();
expect(apiClientMock.get).not.toHaveBeenCalled();
});
it('captures every co-narrator on multi-narrator productions (regression: prior code took only the first link)', async () => {
htmlClientMock.get.mockResolvedValue(
htmlResponse(
makeHtmlPage([
makeSearchResultItemHtml({
asin: 'B0FULLCAST',
narrators: ['Alice', 'Bob', 'Carol', 'Dan'],
}),
]),
),
);
const service = new AudibleService();
const [book] = await service.getCategoryBooks('CAT001', 1);
expect(book.narrator).toBe('Alice, Bob, Carol, Dan');
});
});
// -------------------------------------------------------------------------
@@ -198,4 +198,69 @@ describe('processAudibleRefresh', () => {
const { processAudibleRefresh } = await import('@/lib/processors/audible-refresh.processor');
await expect(processAudibleRefresh({ jobId: 'job-2' })).rejects.toThrow('DB down');
});
it('deduplicates ASINs in the input list before persisting, preserving order', async () => {
  type CategoryCreateCall = [{ data: { asin: string; categoryId: string; rank: number } }];

  // `A` appears twice in the feed and must be stored once. The surviving
  // entries keep Audible's editorial order (A, B, C) with gap-free ranks (1, 2, 3).
  const shared = { author: 'X', coverArtUrl: null };
  const popular = [
    { asin: 'A', title: 'Book A', ...shared },
    { asin: 'B', title: 'Book B', ...shared },
    { asin: 'A', title: 'Book A (duplicate)', ...shared },
    { asin: 'C', title: 'Book C', ...shared },
  ];

  // Audible feed and thumbnail cache
  audibleServiceMock.getPopularAudiobooks.mockResolvedValue(popular);
  audibleServiceMock.getNewReleases.mockResolvedValue([]);
  thumbnailCacheMock.cleanupUnusedThumbnails.mockResolvedValue(0);

  // Persistence layer
  prismaMock.audibleCache.upsert.mockResolvedValue({});
  prismaMock.audibleCache.findMany.mockResolvedValue([]);
  prismaMock.audibleCacheCategory.deleteMany.mockResolvedValue({ count: 0 });
  prismaMock.audibleCacheCategory.create.mockResolvedValue({});
  prismaMock.userHomeSection.findMany.mockResolvedValue([]);

  const { processAudibleRefresh } = await import('@/lib/processors/audible-refresh.processor');
  const result = await processAudibleRefresh({ jobId: 'job-dedup' });

  expect(result.popularSaved).toBe(3);

  // Exactly three category rows for the popular list: the repeated `A` is gone.
  const createCalls = prismaMock.audibleCacheCategory.create.mock.calls as Array<CategoryCreateCall>;
  const popularRows = createCalls
    .map(([arg]) => arg.data)
    .filter((row) => row.categoryId === '__popular__');
  const savedAsins = popularRows.map((row) => row.asin);
  const savedRanks = popularRows.map((row) => row.rank);

  expect(popularRows).toHaveLength(3);
  expect(savedAsins).toEqual(['A', 'B', 'C']);
  expect(savedRanks).toEqual([1, 2, 3]);

  // One upsert per distinct ASIN rather than per input row.
  expect(prismaMock.audibleCache.upsert).toHaveBeenCalledTimes(3);
});
it('drops entries with missing ASINs as part of dedup', async () => {
  type CategoryCreateCall = [{ data: { asin: string; categoryId: string; rank: number } }];

  // Two well-formed rows surround one empty-string ASIN and one null ASIN.
  const shared = { author: 'X', coverArtUrl: null };
  const popular = [
    { asin: 'A', title: 'Book A', ...shared },
    { asin: '', title: 'Book with empty asin', ...shared },
    { asin: null, title: 'Book with null asin', ...shared },
    { asin: 'B', title: 'Book B', ...shared },
  ];

  // Audible feed and thumbnail cache
  audibleServiceMock.getPopularAudiobooks.mockResolvedValue(popular as any);
  audibleServiceMock.getNewReleases.mockResolvedValue([]);
  thumbnailCacheMock.cleanupUnusedThumbnails.mockResolvedValue(0);

  // Persistence layer
  prismaMock.audibleCache.upsert.mockResolvedValue({});
  prismaMock.audibleCache.findMany.mockResolvedValue([]);
  prismaMock.audibleCacheCategory.deleteMany.mockResolvedValue({ count: 0 });
  prismaMock.audibleCacheCategory.create.mockResolvedValue({});
  prismaMock.userHomeSection.findMany.mockResolvedValue([]);

  const { processAudibleRefresh } = await import('@/lib/processors/audible-refresh.processor');
  const result = await processAudibleRefresh({ jobId: 'job-empty-asin' });

  expect(result.popularSaved).toBe(2);

  const createCalls = prismaMock.audibleCacheCategory.create.mock.calls as Array<CategoryCreateCall>;
  const popularRows = createCalls
    .map(([arg]) => arg.data)
    .filter((row) => row.categoryId === '__popular__');
  const savedAsins = popularRows.map((row) => row.asin);
  const savedRanks = popularRows.map((row) => row.rank);

  expect(savedAsins).toEqual(['A', 'B']);
  expect(savedRanks).toEqual([1, 2]);
});
});
+189
View File
@@ -6,6 +6,15 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { createPrismaMock } from '../helpers/prisma';
import type { DedupGroup } from '@/lib/utils/deduplicate-audiobooks';
import type { AudibleAudiobook } from '@/lib/integrations/audible.service';
/**
 * Builds an AudibleAudiobook fixture. `asin` is mandatory; `title` and
 * `author` default to placeholder values and any supplied override wins.
 */
function makeBook(overrides: Partial<AudibleAudiobook> & { asin: string }): AudibleAudiobook {
  const defaults = { title: 'Test Book', author: 'Test Author' };
  return { ...defaults, ...overrides };
}
const prismaMock = createPrismaMock();
@@ -304,3 +313,183 @@ describe('getSiblingAsins', () => {
expect(result.has('ASIN_LONELY')).toBe(false);
});
});
describe('collapseByExistingWorks', () => {
beforeEach(() => {
vi.clearAllMocks();
vi.resetModules();
});
it('returns input unchanged when the list is empty or has one entry', async () => {
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
expect(await collapseByExistingWorks([])).toEqual([]);
expect(prismaMock.workAsin.findMany).not.toHaveBeenCalled();
const single = [makeBook({ asin: 'A1' })];
expect(await collapseByExistingWorks(single)).toEqual(single);
expect(prismaMock.workAsin.findMany).not.toHaveBeenCalled();
});
it('returns input unchanged when none of the ASINs are in any work', async () => {
prismaMock.workAsin.findMany.mockResolvedValue([]);
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
const books = [
makeBook({ asin: 'A1', title: 'Alpha' }),
makeBook({ asin: 'A2', title: 'Beta' }),
];
const result = await collapseByExistingWorks(books);
expect(result).toEqual(books);
});
it('collapses two ASINs that share a work to a single representative', async () => {
prismaMock.workAsin.findMany.mockResolvedValue([
{ asin: 'A1', workId: 'work-1' },
{ asin: 'A2', workId: 'work-1' },
]);
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
const books = [
makeBook({ asin: 'A1', title: 'The Passengers', coverArtUrl: 'cover.jpg' }),
makeBook({ asin: 'A2', title: 'The Passengers' }),
];
const result = await collapseByExistingWorks(books);
expect(result).toHaveLength(1);
// A1 wins — it has the cover URL (higher metadata score)
expect(result[0].asin).toBe('A1');
});
it('keeps the richest-metadata entry when collapsing, regardless of input order', async () => {
prismaMock.workAsin.findMany.mockResolvedValue([
{ asin: 'A1', workId: 'work-1' },
{ asin: 'A2', workId: 'work-1' },
]);
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
// A1 first (sparse), A2 second (rich) — A2 should win on score
const books = [
makeBook({ asin: 'A1', title: 'Book' }),
makeBook({
asin: 'A2',
title: 'Book',
coverArtUrl: 'cover.jpg',
rating: 4.5,
durationMinutes: 600,
narrator: 'Full Cast',
description: 'Rich book',
releaseDate: '2024-01-01',
genres: ['Fiction'],
}),
];
const result = await collapseByExistingWorks(books);
expect(result).toHaveLength(1);
expect(result[0].asin).toBe('A2');
});
it('preserves position of the work in the input order', async () => {
prismaMock.workAsin.findMany.mockResolvedValue([
{ asin: 'A2', workId: 'work-1' },
{ asin: 'A4', workId: 'work-1' },
]);
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
const books = [
makeBook({ asin: 'A1', title: 'Alpha' }),
makeBook({ asin: 'A2', title: 'Beta' }),
makeBook({ asin: 'A3', title: 'Gamma' }),
makeBook({ asin: 'A4', title: 'Beta' }),
makeBook({ asin: 'A5', title: 'Delta' }),
];
const result = await collapseByExistingWorks(books);
// A2 and A4 collapse to one entry at position 1 (the first occurrence)
expect(result.map(b => b.asin)).toEqual(['A1', 'A2', 'A3', 'A5']);
});
it('handles multiple independent works in the same batch', async () => {
prismaMock.workAsin.findMany.mockResolvedValue([
{ asin: 'A1', workId: 'work-1' },
{ asin: 'A2', workId: 'work-1' },
{ asin: 'B1', workId: 'work-2' },
{ asin: 'B2', workId: 'work-2' },
{ asin: 'B3', workId: 'work-2' },
]);
const { collapseByExistingWorks } = await import('@/lib/services/works.service');
const books = [
makeBook({ asin: 'A1' }),
makeBook({ asin: 'B1' }),
makeBook({ asin: 'A2' }),
makeBook({ asin: 'B2' }),
makeBook({ asin: 'B3' }),
makeBook({ asin: 'C1' }),
];
const result = await collapseByExistingWorks(books);
expect(result.map(b => b.asin)).toEqual(['A1', 'B1', 'C1']);
});
it('passes through books that are not in any work alongside collapsed ones', async () => {
  // Only A1 and A2 are known to the mocked works lookup; the STANDALONE_* ASINs are not.
  prismaMock.workAsin.findMany.mockResolvedValue([
    { asin: 'A1', workId: 'work-1' },
    { asin: 'A2', workId: 'work-1' },
  ]);
  const { collapseByExistingWorks } = await import('@/lib/services/works.service');

  const input = [
    { asin: 'STANDALONE_1', title: 'Standalone 1' },
    { asin: 'A1', title: 'Same Book' },
    { asin: 'STANDALONE_2', title: 'Standalone 2' },
    { asin: 'A2', title: 'Same Book' },
  ].map((overrides) => makeBook(overrides));

  const collapsed = await collapseByExistingWorks(input);

  // Four inputs shrink to three: both standalones survive and A2 folds into A1.
  expect(collapsed).toHaveLength(3);
  const survivingAsins = collapsed.map(({ asin }) => asin);
  expect(survivingAsins).toEqual(['STANDALONE_1', 'A1', 'STANDALONE_2']);
});
it('returns input unchanged on DB failure (does not throw)', async () => {
  // Make the works lookup reject so the failure path is exercised.
  const dbFailure = new Error('DB exploded');
  prismaMock.workAsin.findMany.mockRejectedValue(dbFailure);
  const { collapseByExistingWorks } = await import('@/lib/services/works.service');

  const input = ['A1', 'A2'].map((asin) => makeBook({ asin }));

  // The awaited call must resolve (not reject) and hand back an equal list.
  const output = await collapseByExistingWorks(input);
  expect(output).toEqual(input);
});
it('only queries the workAsin table once per call', async () => {
  prismaMock.workAsin.findMany.mockResolvedValue([
    { asin: 'A1', workId: 'work-1' },
    { asin: 'A2', workId: 'work-1' },
  ]);
  const { collapseByExistingWorks } = await import('@/lib/services/works.service');

  const batch = ['A1', 'A2', 'A3'].map((asin) => makeBook({ asin }));
  await collapseByExistingWorks(batch);

  // A single batched lookup is expected: every input ASIN in one `in` filter,
  // selecting only the asin/workId columns.
  const lookup = prismaMock.workAsin.findMany;
  expect(lookup).toHaveBeenCalledTimes(1);
  expect(lookup).toHaveBeenCalledWith({
    where: { asin: { in: ['A1', 'A2', 'A3'] } },
    select: { asin: true, workId: true },
  });
});
});
+95
View File
@@ -0,0 +1,95 @@
/**
* Component: Narrator Extraction Utility Tests
* Documentation: documentation/integrations/audible.md
*/
import { describe, expect, it } from 'vitest';
import * as cheerio from 'cheerio';
import { extractAllNarrators } from '@/lib/utils/extract-narrator';
/**
 * Test helper: wraps an HTML fragment in a `<div id="item">` container and
 * parses it with cheerio.
 *
 * @param html - Markup to place inside the container.
 * @returns The cheerio root (`$`) and the selection for the `#item` container (`$el`).
 */
function load(html: string) {
  const wrapped = `<div id="item">${html}</div>`;
  const $ = cheerio.load(wrapped);
  const $el = $('#item');
  return { $, $el };
}
// Behavioral spec for extractAllNarrators($, $el). Each case builds a small
// HTML fragment via load() and checks the string the helper returns for it.
describe('extractAllNarrators', () => {
// One searchNarrator anchor: the result is that anchor's text.
it('returns the single narrator name when only one searchNarrator link is present', () => {
const { $, $el } = load(
`<a href="/search?searchNarrator=Andy%20Serkis">Andy Serkis</a>`,
);
expect(extractAllNarrators($, $el)).toBe('Andy Serkis');
});
// Six anchors: all six link texts are expected, joined with ', '.
it('joins multiple narrator names from separate searchNarrator links', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=Kristin%20Atherton">Kristin Atherton</a>,
<a href="/search?searchNarrator=Roy%20McMillan">Roy McMillan</a>,
<a href="/search?searchNarrator=Clare%20Corbett">Clare Corbett</a>,
<a href="/search?searchNarrator=Tom%20Bateman">Tom Bateman</a>,
<a href="/search?searchNarrator=Patience%20Tomlinson">Patience Tomlinson</a>,
<a href="/search?searchNarrator=Shaheen%20Khan">Shaheen Khan</a>
`);
expect(extractAllNarrators($, $el)).toBe(
'Kristin Atherton, Roy McMillan, Clare Corbett, Tom Bateman, Patience Tomlinson, Shaheen Khan',
);
});
// Names are deliberately not alphabetical, so an implementation that sorted
// them would fail. The link text (not the href query value) is what is expected.
it('preserves document order (downstream sorts before comparing, but order should be stable)', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=Z">Zelda</a>
<a href="/search?searchNarrator=A">Alice</a>
<a href="/search?searchNarrator=M">Mallory</a>
`);
expect(extractAllNarrators($, $el)).toBe('Zelda, Alice, Mallory');
});
// No anchors at all: the .narratorLabel text is returned as-is, including the
// "Narrated by:" prefix (this test expects no prefix stripping).
it('falls back to .narratorLabel text when no searchNarrator links exist', () => {
const { $, $el } = load(
`<span class="narratorLabel">Narrated by: Single Narrator</span>`,
);
expect(extractAllNarrators($, $el)).toBe('Narrated by: Single Narrator');
});
// Both sources present: the anchors win and the label text is ignored.
it('prefers searchNarrator links over .narratorLabel when both are present', () => {
const { $, $el } = load(`
<span class="narratorLabel">Narrated by: ONLY ONE</span>
<a href="/search?searchNarrator=First">First</a>
<a href="/search?searchNarrator=Second">Second</a>
`);
expect(extractAllNarrators($, $el)).toBe('First, Second');
});
// Neither source present: an empty string is expected.
it('returns empty string when neither links nor .narratorLabel exist', () => {
const { $, $el } = load(`<span>some other content</span>`);
expect(extractAllNarrators($, $el)).toBe('');
});
// Anchors with empty or whitespace-only text must not contribute names or
// stray separators to the joined result.
it('skips empty link text and joins only non-empty names', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"></a>
<a href="/search?searchNarrator=B">Bob</a>
<a href="/search?searchNarrator=C"> </a>
<a href="/search?searchNarrator=D">Diana</a>
`);
expect(extractAllNarrators($, $el)).toBe('Bob, Diana');
});
// Surrounding spaces and newlines inside an anchor are expected to be trimmed.
it('trims whitespace from each captured name', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"> Alice </a>
<a href="/search?searchNarrator=B">
Bob
</a>
`);
expect(extractAllNarrators($, $el)).toBe('Alice, Bob');
});
// Anchors exist but none yields a usable name: the .narratorLabel fallback
// is still expected to apply.
it('falls back to .narratorLabel when all searchNarrator links are empty', () => {
const { $, $el } = load(`
<a href="/search?searchNarrator=A"></a>
<a href="/search?searchNarrator=B"> </a>
<span class="narratorLabel">Fallback Narrator</span>
`);
expect(extractAllNarrators($, $el)).toBe('Fallback Narrator');
});
});
+18
View File
@@ -67,6 +67,24 @@ describe('jitteredBackoff', () => {
expect(value).toBeGreaterThanOrEqual(250);
expect(value).toBeLessThanOrEqual(750);
});
it('caps the result at maxBackoffMs when the raw backoff would exceed it', () => {
  // Premise of this case: attempt=10 with base=1000 gives a raw backoff of
  // 2^10 * 1000 * [0.5..1.5] = 512_000..1_536_000, which is above the
  // 60_000ms cap for every possible jitter value.
  const capMs = 60_000;
  let remainingSamples = 50;
  // Sample repeatedly because the returned value is jittered.
  while (remainingSamples > 0) {
    expect(jitteredBackoff(10, 1000, capMs)).toBeLessThanOrEqual(capMs);
    remainingSamples -= 1;
  }
});
it('returns the un-capped jittered value when below the cap', () => {
  // Premise of this case: attempt=0 with base=1000 gives 500..1500, well
  // under the 60_000ms cap, so the cap must not alter the value.
  const capMs = 60_000;
  for (let sample = 0; sample < 50; sample += 1) {
    const backoff = jitteredBackoff(0, 1000, capMs);
    expect(backoff).toBeGreaterThanOrEqual(500);
    expect(backoff).toBeLessThanOrEqual(1500);
  }
});
});
describe('randomDelay', () => {