mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-02 20:30:10 +00:00
fcae3bcf09
Switch nightly discovery refresh to scrape Audible's curated HTML storefronts (popular, new releases, category pages) while keeping real-time user paths on the JSON catalog API. Add robust HTML resilience knobs (increased retries, capped jittered backoff, AdaptivePacer changes and per-batch cooldowns) to avoid failing nightly jobs during 503 storms. Implement multi-narrator capture via a new extractAllNarrators helper and update parsers to preserve all narrator anchors. Introduce two-pass dedup: in-memory deduplicateAndCollectGroups + collapseByExistingWorks that consults the works table, export metadataScore for consistent representative selection, and persist dedup groups (fire-and-forget). Wire collapseByExistingWorks into search/author/series routes and add defensive dedup to the refresh processor. Add HTML parsing helpers, runtime/lang-aware parsing, jitteredBackoff cap, and tests for the new behaviors.
166 lines
5.7 KiB
TypeScript
166 lines
5.7 KiB
TypeScript
/**
|
||
* Component: Scrape Resilience Utility Tests
|
||
* Documentation: documentation/integrations/audible.md
|
||
*/
|
||
|
||
import { describe, expect, it } from 'vitest';
|
||
import {
|
||
pickUserAgent,
|
||
getBrowserHeaders,
|
||
jitteredBackoff,
|
||
randomDelay,
|
||
AdaptivePacer,
|
||
} from '@/lib/utils/scrape-resilience';
|
||
|
||
/** Checks the shape and variety of the values handed out by pickUserAgent. */
describe('pickUserAgent', () => {
  it('returns a string containing Mozilla', () => {
    const userAgent = pickUserAgent();

    expect(typeof userAgent).toBe('string');
    expect(userAgent).toContain('Mozilla');
  });

  it('returns values from the known pool', () => {
    // Draw 100 times; the Set keeps only the distinct results.
    const distinct = new Set<string>(Array.from({ length: 100 }, () => pickUserAgent()));

    // At least two different UAs are expected across 100 draws.
    expect(distinct.size).toBeGreaterThanOrEqual(2);

    // Every distinct value must carry the 'Mozilla/5.0' token.
    distinct.forEach((candidate) => {
      expect(candidate).toContain('Mozilla/5.0');
    });
  });
});
|
||
|
||
/** Checks that getBrowserHeaders yields the full set of expected header keys. */
describe('getBrowserHeaders', () => {
  it('includes all expected header keys', () => {
    const userAgent = 'TestUA/1.0';
    const headers = getBrowserHeaders(userAgent);

    // The supplied UA must come back unchanged under 'User-Agent'.
    expect(headers['User-Agent']).toBe(userAgent);

    // The remaining headers only need to be present; their values are not pinned.
    const requiredKeys = [
      'Accept',
      'Accept-Language',
      'Accept-Encoding',
      'Connection',
      'Sec-Fetch-Site',
      'Sec-Fetch-Mode',
      'Sec-Fetch-Dest',
      'Sec-Fetch-User',
      'Upgrade-Insecure-Requests',
    ] as const;

    for (const key of requiredKeys) {
      expect(headers[key]).toBeDefined();
    }
  });
});
|
||
|
||
/**
 * Checks jitteredBackoff(attempt, baseMs, maxBackoffMs?) against the bounds
 * these tests expect: 2^attempt * baseMs scaled by a 0.5x – 1.5x jitter factor,
 * limited by the optional cap passed as the third argument.
 */
describe('jitteredBackoff', () => {
  // Draws per scenario. The result is jittered, so a single draw could pass by
  // luck; every case samples repeatedly.
  const SAMPLES = 50;

  it('returns values within the expected jitter range', () => {
    for (let attempt = 0; attempt < 5; attempt++) {
      // Un-jittered backoff for this attempt; invariant across the draws below.
      const base = Math.pow(2, attempt) * 1000;

      for (let i = 0; i < SAMPLES; i++) {
        const value = jitteredBackoff(attempt, 1000);
        // Jitter range is 0.5x – 1.5x
        expect(value).toBeGreaterThanOrEqual(Math.round(base * 0.5));
        expect(value).toBeLessThanOrEqual(Math.round(base * 1.5));
      }
    }
  });

  it('uses custom base ms', () => {
    // attempt=0: 1 * 500 * [0.5..1.5] → [250..750]
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(0, 500);
      expect(value).toBeGreaterThanOrEqual(250);
      expect(value).toBeLessThanOrEqual(750);
    }
  });

  it('caps the result at maxBackoffMs when the raw backoff would exceed it', () => {
    // attempt=10 with base=1000 produces 2^10 * 1000 * [0.5..1.5] = 512_000..1_536_000,
    // all of which exceed a 60_000ms cap.
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(10, 1000, 60_000);
      expect(value).toBeLessThanOrEqual(60_000);
    }
  });

  it('returns the un-capped jittered value when below the cap', () => {
    // attempt=0 with base=1000 produces 500..1500, all below a 60_000ms cap.
    for (let i = 0; i < SAMPLES; i++) {
      const value = jitteredBackoff(0, 1000, 60_000);
      expect(value).toBeGreaterThanOrEqual(500);
      expect(value).toBeLessThanOrEqual(1500);
    }
  });
});
|
||
|
||
/** Checks that randomDelay stays inside the inclusive bounds it is given. */
describe('randomDelay', () => {
  it('returns values within bounds', () => {
    const lower = 100;
    const upper = 200;

    // 100 independent draws, each checked against both bounds.
    let remaining = 100;
    while (remaining-- > 0) {
      const sample = randomDelay(lower, upper);
      expect(sample).toBeGreaterThanOrEqual(lower);
      expect(sample).toBeLessThanOrEqual(upper);
    }
  });
});
|
||
|
||
/**
 * Checks the delays returned by AdaptivePacer.reportPageResult for clean pages,
 * pages that needed retries, the three-in-a-row cooldown, recovery, and reset().
 */
describe('AdaptivePacer', () => {
  /** Asserts lower <= actual <= upper (both bounds inclusive). */
  const expectBetween = (actual: number, lower: number, upper: number): void => {
    expect(actual).toBeGreaterThanOrEqual(lower);
    expect(actual).toBeLessThanOrEqual(upper);
  };

  it('returns base delay range when no retries needed', () => {
    const pacer = new AdaptivePacer();

    let remaining = 50;
    while (remaining-- > 0) {
      const pause = pacer.reportPageResult({ retriesUsed: 0, encountered503: false });
      expectBetween(pause, 2000, 4000);
    }
  });

  it('increases delay when retries occurred', () => {
    const pacer = new AdaptivePacer();

    // First retry page: consecutiveRetryPages becomes 1, multiplier = 1.5
    const pause = pacer.reportPageResult({ retriesUsed: 2, encountered503: true });

    // Range: [2000*1.5, 4000*1.5] = [3000, 6000]
    expectBetween(pause, 3000, 6000);
  });

  it('triggers circuit breaker after 3 consecutive retry pages', () => {
    const pacer = new AdaptivePacer();
    const failingPage = { retriesUsed: 1, encountered503: true };

    pacer.reportPageResult(failingPage); // consecutive = 1
    pacer.reportPageResult(failingPage); // consecutive = 2
    const cooldown = pacer.reportPageResult(failingPage); // consecutive = 3 → circuit breaker

    expectBetween(cooldown, 45000, 60000);
  });

  it('recovers gradually after successful pages', () => {
    const pacer = new AdaptivePacer();
    const failingPage = { retriesUsed: 1, encountered503: true };
    const cleanPage = { retriesUsed: 0, encountered503: false };

    // Build up to 2 consecutive retries
    pacer.reportPageResult(failingPage); // consecutive = 1
    pacer.reportPageResult(failingPage); // consecutive = 2

    // Success decrements: consecutive goes from 2 → 1
    const firstRecovery = pacer.reportPageResult(cleanPage);
    expectBetween(firstRecovery, 2000, 4000);

    // Another success: consecutive goes from 1 → 0
    const secondRecovery = pacer.reportPageResult(cleanPage);
    expectBetween(secondRecovery, 2000, 4000);
  });

  it('resets state', () => {
    const pacer = new AdaptivePacer();
    const failingPage = { retriesUsed: 1, encountered503: true };

    pacer.reportPageResult(failingPage); // consecutive = 1
    pacer.reportPageResult(failingPage); // consecutive = 2
    pacer.reset();

    // After reset, should be back to base range behavior for retries
    const pause = pacer.reportPageResult(failingPage);

    // consecutive = 1 again, multiplier = 1.5 → [3000, 6000]
    expectBetween(pause, 3000, 6000);
  });
});
|