Add works table and ASIN deduping

Add persistent cross-ASIN "works" mapping and client-side deduplication to improve library matching. Introduces a Prisma migration and models (Work, WorkAsin) plus src/lib/services/works.service for persisting dedup groups, seeding ASINs at request time, and sibling lookup. Adds a deduplication utility (deduplicate-audiobooks) that normalizes titles/narrators, compares durations, and returns grouping metadata; API routes (search, author, series) now deduplicate results before enrichment and fire-and-forget persist groups. Adds sibling-ASIN expansion into audiobook matcher and expands getAvailableAsins accordingly. Extracts runtime parsing into a shared parse-runtime util and updates audible scrapers/services to use it. Includes unit tests for dedup logic and works service and updates test Prisma mocks.
This commit is contained in:
kikootwo
2026-03-03 13:31:46 -05:00
parent ff80d995c5
commit 610873af6b
15 changed files with 1446 additions and 32 deletions
+201
View File
@@ -0,0 +1,201 @@
/**
* Component: Audiobook Deduplication Utility
* Documentation: documentation/integrations/audible.md
*
* Deduplicates audiobook listings that represent the same recording
* under different ASINs (publisher re-listings, rights transfers, etc.).
*
* Dedup key: normalized title + normalized narrator
* Duration tolerance: max(longerDuration * 0.01, 5) minutes
* Missing duration treated as compatible (graceful degradation).
*/
import type { AudibleAudiobook } from '../integrations/audible.service';
// ---------------------------------------------------------------------------
// Title / narrator normalization
// ---------------------------------------------------------------------------
/** Patterns in parentheses or brackets to strip (edition markers, format labels) */
const EDITION_PAREN_RE = /[([][^)\]]*?(?:unabridged|abridged|edition|remaster(?:ed)?|anniversary|complete|original|version|narrat(?:ed|or)?|audio(?:book)?|full cast|dramatiz(?:ed|ation))[^)\]]*[)\]]/gi;
/** Trailing subtitle after colon or long dash */
const SUBTITLE_RE = /\s*[:]\s+.+$/;
const LONG_DASH_SUBTITLE_RE = /\s+[-\u2013\u2014]\s+.+$/;
/** Trailing descriptors like "A Novel", "A Memoir" */
const TRAILING_DESCRIPTOR_RE = /\s*[-:,]?\s+a\s+(novel|memoir|thriller|mystery|romance|story|tale|novella)\s*$/i;
/**
* Normalize a title for dedup comparison.
* Strips subtitles, edition markers, and trailing descriptors.
*/
export function normalizeTitle(title: string): string {
let t = title.toLowerCase();
// Remove parenthesized/bracketed edition markers
t = t.replace(EDITION_PAREN_RE, '');
// Remove trailing descriptors before subtitle stripping
t = t.replace(TRAILING_DESCRIPTOR_RE, '');
// Remove subtitle after colon
t = t.replace(SUBTITLE_RE, '');
// Remove subtitle after long dash (but not short hyphenated words)
t = t.replace(LONG_DASH_SUBTITLE_RE, '');
// Collapse whitespace and trim
return t.replace(/\s+/g, ' ').trim();
}
/** Normalize narrator for comparison. */
function normalizeNarrator(narrator?: string): string {
return (narrator || '').toLowerCase().trim();
}
// ---------------------------------------------------------------------------
// Duration compatibility
// ---------------------------------------------------------------------------
/**
* Check if two durations are compatible (represent the same recording).
* Tolerance: max(longerDuration * 0.01, 5) minutes.
* Missing duration on either side is treated as compatible.
*/
export function areDurationsCompatible(a?: number, b?: number): boolean {
if (a == null || b == null) return true;
const longer = Math.max(a, b);
const tolerance = Math.max(longer * 0.01, 5);
return Math.abs(a - b) <= tolerance;
}
// ---------------------------------------------------------------------------
// Metadata scoring (for picking best representative)
// ---------------------------------------------------------------------------
function metadataScore(book: AudibleAudiobook): number {
let score = 0;
if (book.coverArtUrl) score++;
if (book.rating != null) score++;
if (book.durationMinutes != null) score++;
if (book.description) score++;
if (book.narrator) score++;
if (book.releaseDate) score++;
if (book.genres && book.genres.length > 0) score++;
return score;
}
// ---------------------------------------------------------------------------
// Dedup group types (for works-table persistence)
// ---------------------------------------------------------------------------
/** Metadata about a group of ASINs that were collapsed during dedup. */
export interface DedupGroup {
canonicalAsin: string; // ASIN of the "winner" (best metadata score)
allAsins: string[]; // All ASINs in this group (including canonical)
title: string; // Author from the canonical entry
author: string; // Author from the canonical entry
narrator?: string; // Narrator from the canonical entry
durationMinutes?: number; // Duration from the canonical entry
}
/** Result of deduplication with group collection. */
export interface DeduplicateResult {
books: AudibleAudiobook[]; // The deduped list (same as deduplicateAudiobooks returns)
groups: DedupGroup[]; // Groups where 2+ ASINs were collapsed
}
// ---------------------------------------------------------------------------
// Main dedup functions
// ---------------------------------------------------------------------------
/**
* Deduplicate audiobook listings by normalized title + narrator + duration.
*
* Same narrator + compatible duration + similar title = same recording -> collapse.
* Different narrator = different production -> keep both.
* Duration outside tolerance = different content (abridged vs unabridged) -> keep both.
*
* Preserves original ordering (position of first appearance).
*/
export function deduplicateAudiobooks(books: AudibleAudiobook[]): AudibleAudiobook[] {
return deduplicateAndCollectGroups(books).books;
}
/**
* Deduplicate audiobooks AND return grouping metadata for works-table persistence.
* Returns both the deduped list and the groups where 2+ ASINs were collapsed.
*/
export function deduplicateAndCollectGroups(books: AudibleAudiobook[]): DeduplicateResult {
if (books.length <= 1) return { books: [...books], groups: [] };
// Group by normalized title + narrator
const titleNarratorGroups = new Map<string, AudibleAudiobook[]>();
const insertionOrder: string[] = [];
for (const book of books) {
const key = `${normalizeTitle(book.title)}|||${normalizeNarrator(book.narrator)}`;
const group = titleNarratorGroups.get(key);
if (group) {
group.push(book);
} else {
titleNarratorGroups.set(key, [book]);
insertionOrder.push(key);
}
}
const result: AudibleAudiobook[] = [];
const dedupGroups: DedupGroup[] = [];
for (const key of insertionOrder) {
const group = titleNarratorGroups.get(key)!;
if (group.length === 1) {
result.push(group[0]);
continue;
}
// Within a title+narrator group, further split by duration compatibility.
// Build sub-groups where all members are duration-compatible with the
// representative (first member). A book joins the first compatible sub-group.
const subGroups: AudibleAudiobook[][] = [];
for (const book of group) {
let placed = false;
for (const sg of subGroups) {
// Check compatibility against the representative (first member)
if (areDurationsCompatible(sg[0].durationMinutes, book.durationMinutes)) {
sg.push(book);
placed = true;
break;
}
}
if (!placed) {
subGroups.push([book]);
}
}
// From each sub-group, pick the best representative and collect group metadata
for (const sg of subGroups) {
let best = sg[0];
let bestScore = metadataScore(best);
for (let i = 1; i < sg.length; i++) {
const score = metadataScore(sg[i]);
if (score > bestScore) {
best = sg[i];
bestScore = score;
}
}
result.push(best);
// Collect group metadata for works-table persistence (only multi-ASIN groups)
if (sg.length >= 2) {
dedupGroups.push({
canonicalAsin: best.asin,
allAsins: sg.map(b => b.asin),
title: best.title,
author: best.author,
narrator: best.narrator,
durationMinutes: best.durationMinutes,
});
}
}
}
return { books: result, groups: dedupGroups };
}