Files
ReadMeABook/tests/utils/scrape-resilience.test.ts
kikootwo fcae3bcf09 Audible: HTML refresh, multi-narrator & works dedup
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and make defensive dedup in the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
2026-05-14 15:23:15 -04:00

166 lines
5.7 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Component: Scrape Resilience Utility Tests
* Documentation: documentation/integrations/audible.md
*/
import { describe, expect, it } from 'vitest';
import {
pickUserAgent,
getBrowserHeaders,
jitteredBackoff,
randomDelay,
AdaptivePacer,
} from '@/lib/utils/scrape-resilience';
describe('pickUserAgent', () => {
  // A single draw must be a string that looks like a browser user agent.
  it('returns a string containing Mozilla', () => {
    const agent = pickUserAgent();

    expect(typeof agent).toBe('string');
    expect(agent).toContain('Mozilla');
  });

  it('returns values from the known pool', () => {
    // Draw 100 times and keep only the distinct results.
    const distinctAgents = new Set<string>(
      Array.from({ length: 100 }, () => pickUserAgent()),
    );

    // 100 draws are expected to surface at least two different user agents.
    expect(distinctAgents.size).toBeGreaterThanOrEqual(2);

    // Every distinct value must carry the standard browser prefix.
    distinctAgents.forEach((agent) => {
      expect(agent).toContain('Mozilla/5.0');
    });
  });
});
describe('getBrowserHeaders', () => {
  it('includes all expected header keys', () => {
    const userAgent = 'TestUA/1.0';
    const headers = getBrowserHeaders(userAgent);

    // The user agent passed in must come back verbatim in the header set.
    expect(headers['User-Agent']).toBe(userAgent);

    // The remaining headers only need to be present; their values are not pinned here.
    const requiredKeys = [
      'Accept',
      'Accept-Language',
      'Accept-Encoding',
      'Connection',
      'Sec-Fetch-Site',
      'Sec-Fetch-Mode',
      'Sec-Fetch-Dest',
      'Sec-Fetch-User',
      'Upgrade-Insecure-Requests',
    ] as const;

    for (const key of requiredKeys) {
      expect(headers[key]).toBeDefined();
    }
  });
});
describe('jitteredBackoff', () => {
  // The expected result of jitteredBackoff is a range rather than a single
  // value, so every case draws several samples; a single draw could land in
  // range by chance and hide an out-of-range result.
  const SAMPLES = 50;

  it('returns values within the expected jitter range', () => {
    for (let attempt = 0; attempt < 5; attempt++) {
      // Un-jittered exponential backoff for this attempt: 2^attempt * baseMs.
      const base = Math.pow(2, attempt) * 1000;
      for (let i = 0; i < SAMPLES; i++) {
        const value = jitteredBackoff(attempt, 1000);
        // Jitter range is 0.5x-1.5x of the un-jittered value.
        expect(value).toBeGreaterThanOrEqual(Math.round(base * 0.5));
        expect(value).toBeLessThanOrEqual(Math.round(base * 1.5));
      }
    }
  });

  it('uses custom base ms', () => {
    // attempt=0: 1 * 500 * [0.5..1.5] -> [250..750]
    // Sampled repeatedly, like the other cases, so an out-of-range value
    // cannot slip through on one lucky draw.
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(0, 500);
      expect(value).toBeGreaterThanOrEqual(250);
      expect(value).toBeLessThanOrEqual(750);
    }
  });

  it('caps the result at maxBackoffMs when the raw backoff would exceed it', () => {
    // attempt=10 with base=1000 produces 2^10 * 1000 * [0.5..1.5] = 512_000..1_536_000,
    // all of which exceed a 60_000ms cap.
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(10, 1000, 60_000);
      expect(value).toBeLessThanOrEqual(60_000);
    }
  });

  it('returns the un-capped jittered value when below the cap', () => {
    // attempt=0 with base=1000 produces 500..1500, all below a 60_000ms cap.
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(0, 1000, 60_000);
      expect(value).toBeGreaterThanOrEqual(500);
      expect(value).toBeLessThanOrEqual(1500);
    }
  });
});
describe('randomDelay', () => {
  it('returns values within bounds', () => {
    const lower = 100;
    const upper = 200;

    // Take 100 samples; each must fall inside the inclusive [lower, upper] range.
    let remaining = 100;
    while (remaining-- > 0) {
      const sample = randomDelay(lower, upper);
      expect(sample).toBeGreaterThanOrEqual(lower);
      expect(sample).toBeLessThanOrEqual(upper);
    }
  });
});
describe('AdaptivePacer', () => {
  /** Asserts that `actual` lies in the inclusive range [lo, hi]. */
  const expectBetween = (actual: number, lo: number, hi: number): void => {
    expect(actual).toBeGreaterThanOrEqual(lo);
    expect(actual).toBeLessThanOrEqual(hi);
  };

  it('returns base delay range when no retries needed', () => {
    const pacer = new AdaptivePacer();

    // Clean pages should always be paced within the base 2000..4000 window.
    let remaining = 50;
    while (remaining-- > 0) {
      const pause = pacer.reportPageResult({ retriesUsed: 0, encountered503: false });
      expectBetween(pause, 2000, 4000);
    }
  });

  it('increases delay when retries occurred', () => {
    const pacer = new AdaptivePacer();

    // One page that needed retries: expected multiplier is 1.5,
    // i.e. [2000 * 1.5, 4000 * 1.5] = [3000, 6000].
    const pause = pacer.reportPageResult({ retriesUsed: 2, encountered503: true });

    expectBetween(pause, 3000, 6000);
  });

  it('triggers circuit breaker after 3 consecutive retry pages', () => {
    const pacer = new AdaptivePacer();
    const retriedPage = { retriesUsed: 1, encountered503: true };

    // Two retry pages lead up to the threshold...
    pacer.reportPageResult(retriedPage);
    pacer.reportPageResult(retriedPage);
    // ...and the third consecutive one is expected to return the long cooldown.
    const cooldown = pacer.reportPageResult(retriedPage);

    expectBetween(cooldown, 45000, 60000);
  });

  it('recovers gradually after successful pages', () => {
    const pacer = new AdaptivePacer();
    const retriedPage = { retriesUsed: 1, encountered503: true };
    const cleanPage = { retriesUsed: 0, encountered503: false };

    // Report two retry pages in a row, staying below the circuit-breaker threshold.
    pacer.reportPageResult(retriedPage);
    pacer.reportPageResult(retriedPage);

    // First clean page afterwards: delay is already back in the base window.
    const firstPause = pacer.reportPageResult(cleanPage);
    expectBetween(firstPause, 2000, 4000);

    // Second clean page: still in the base window.
    const secondPause = pacer.reportPageResult(cleanPage);
    expectBetween(secondPause, 2000, 4000);
  });

  it('resets state', () => {
    const pacer = new AdaptivePacer();
    const retriedPage = { retriesUsed: 1, encountered503: true };

    // Two retry pages, then an explicit reset.
    pacer.reportPageResult(retriedPage);
    pacer.reportPageResult(retriedPage);
    pacer.reset();

    // The next retry page must be treated as the first one again
    // (1.5x window, [3000, 6000]) rather than as the third in a row.
    const pause = pacer.reportPageResult(retriedPage);
    expectBetween(pause, 3000, 6000);
  });
});