mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-03 04:40:09 +00:00
5b4aa3fa15
Track and run run-once SQL data migrations: entrypoint now checks _data_migrations before executing each prisma data-migration file, records successful runs, and skips already-applied scripts. Adds a Prisma DataMigration model mapped to _data_migrations and a new reset-works-table.sql migration to clear work tables for a dedup rebuild. Also improves dedup logic: extractSubtitle and subtitle-compatibility checks are added so series entries like "Series: Book A" vs "Series: Book B" are not collapsed, with accompanying unit tests for extraction and behavior.
246 lines
9.2 KiB
TypeScript
246 lines
9.2 KiB
TypeScript
/**
 * Component: Audiobook Deduplication Utility
 * Documentation: documentation/integrations/audible.md
 *
 * Deduplicates audiobook listings that represent the same recording
 * under different ASINs (publisher re-listings, rights transfers, etc.).
 *
 * Dedup key: normalized title + normalized narrator
 * Subtitle check: listings whose subtitles are both present and different are kept separate.
 * Duration tolerance: max(longerDuration * 0.01, 5) minutes
 * Missing duration treated as compatible (graceful degradation).
 */
|
|
|
|
import type { AudibleAudiobook } from '../integrations/audible.service';
|
|
|
|
// ---------------------------------------------------------------------------
// Title / narrator normalization
// ---------------------------------------------------------------------------

/**
 * Parenthesized or bracketed groups that contain an edition/format keyword,
 * e.g. "(Unabridged)" or "[Audiobook Edition]". Global and case-insensitive so
 * every such group in a title is removed.
 */
const EDITION_PAREN_RE = /[([][^)\]]*?(?:unabridged|abridged|edition|remaster(?:ed)?|anniversary|complete|original|version|narrat(?:ed|or)?|audio(?:book)?|full cast|dramatiz(?:ed|ation))[^)\]]*[)\]]/gi;

/**
 * Trailing subtitle introduced by a colon followed by whitespace.
 * Capture group 1 is the subtitle text. Used both for stripping and extraction.
 */
const SUBTITLE_RE = /\s*[:]\s+(.+)$/;

/**
 * Trailing subtitle introduced by a hyphen, en dash or em dash that has
 * whitespace on both sides (so hyphenated words are left alone).
 * Capture group 1 is the subtitle text.
 */
const LONG_DASH_SUBTITLE_RE = /\s+[-\u2013\u2014]\s+(.+)$/;

/** Trailing descriptors like "A Novel", "A Memoir", optionally preceded by "-", ":" or ",". */
const TRAILING_DESCRIPTOR_RE = /\s*[-:,]?\s+a\s+(novel|memoir|thriller|mystery|romance|story|tale|novella)\s*$/i;

/**
 * Shared first stage of title processing: lowercase the title, then drop
 * edition markers and a trailing descriptor. Whitespace is NOT collapsed here;
 * each caller does that at the point it needs it.
 */
function stripEditionNoise(title: string): string {
  return title
    .toLowerCase()
    .replace(EDITION_PAREN_RE, '')
    .replace(TRAILING_DESCRIPTOR_RE, '');
}

/**
 * Normalize a title for dedup comparison.
 * Strips edition markers, trailing descriptors and subtitles (colon first,
 * then dash), then collapses whitespace.
 *
 * @param title Raw listing title.
 * @returns Lowercased base title with no subtitle.
 */
export function normalizeTitle(title: string): string {
  return stripEditionNoise(title)
    .replace(SUBTITLE_RE, '')
    .replace(LONG_DASH_SUBTITLE_RE, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Extract the subtitle portion from a title (part after colon or long dash).
 * A colon subtitle takes precedence over a dash subtitle.
 * Used to prevent false dedup of series books like "Series: Book A" vs "Series: Book B".
 *
 * @param title Raw listing title.
 * @returns Lowercased, trimmed subtitle, or an empty string if none is found.
 */
export function extractSubtitle(title: string): string {
  // Same cleaning as normalizeTitle, so both functions see the same text.
  const cleaned = stripEditionNoise(title).replace(/\s+/g, ' ').trim();
  const match = SUBTITLE_RE.exec(cleaned) ?? LONG_DASH_SUBTITLE_RE.exec(cleaned);
  return match ? match[1].trim() : '';
}

/**
 * Check if two titles' subtitles are compatible for dedup purposes.
 * - Both have no subtitle → compatible
 * - One has a subtitle, other doesn't → compatible (re-listing with/without subtitle)
 * - Both have the SAME subtitle → compatible
 * - Both have DIFFERENT subtitles → NOT compatible (different books, e.g. series entries)
 */
function areSubtitlesCompatible(titleA: string, titleB: string): boolean {
  const subA = extractSubtitle(titleA);
  const subB = extractSubtitle(titleB);
  if (!subA || !subB) return true; // one or both missing → compatible
  return subA === subB;
}
|
|
|
|
/**
 * Normalize a narrator credit for comparison.
 *
 * The credit is lowercased and split on commas; each name is trimmed and has
 * internal runs of whitespace collapsed to one space (matching how titles are
 * normalized), empty entries are dropped, and the names are sorted so that the
 * order in which narrators are listed does not matter.
 *
 * @param narrator Raw narrator string, possibly a comma-separated list.
 * @returns Normalized names joined with ", ", or '' when there is no narrator.
 */
function normalizeNarrator(narrator?: string): string {
  return (narrator ?? '')
    .toLowerCase()
    .split(',')
    .map(name => name.trim().replace(/\s+/g, ' '))
    .filter(Boolean)
    .sort()
    .join(', ');
}
|
|
|
|
// ---------------------------------------------------------------------------
// Duration compatibility
// ---------------------------------------------------------------------------

/**
 * Check if two durations are compatible (represent the same recording).
 * Tolerance: max(longerDuration * 0.01, 5) minutes.
 * A missing duration on either side is treated as compatible; a non-finite
 * value (NaN, ±Infinity) is handled the same way as a missing one, because it
 * carries no usable length information.
 *
 * @param a Duration of the first listing in minutes, if known.
 * @param b Duration of the second listing in minutes, if known.
 * @returns true when the durations are within tolerance or cannot be compared.
 */
export function areDurationsCompatible(a?: number, b?: number): boolean {
  if (a == null || b == null) return true;
  if (!Number.isFinite(a) || !Number.isFinite(b)) return true;
  const tolerance = Math.max(Math.max(a, b) * 0.01, 5);
  return Math.abs(a - b) <= tolerance;
}
|
|
|
|
// ---------------------------------------------------------------------------
// Metadata scoring (for picking best representative)
// ---------------------------------------------------------------------------

/**
 * Count how many optional metadata fields a listing has populated.
 * One point each for: cover art URL, rating, duration, description, narrator,
 * release date, and a non-empty genre list. Rating and duration count when
 * they are not null/undefined (so a value of 0 still scores); the string
 * fields count when non-empty.
 *
 * @param book Listing to score.
 * @returns Score from 0 to 7; higher means more complete metadata.
 */
function metadataScore(book: AudibleAudiobook): number {
  const populated = [
    Boolean(book.coverArtUrl),
    book.rating != null,
    book.durationMinutes != null,
    Boolean(book.description),
    Boolean(book.narrator),
    Boolean(book.releaseDate),
    Boolean(book.genres && book.genres.length > 0),
  ];
  return populated.filter(Boolean).length;
}
|
|
|
|
// ---------------------------------------------------------------------------
// Dedup group types (for works-table persistence)
// ---------------------------------------------------------------------------

/** Metadata about a group of ASINs that were collapsed during dedup. */
export interface DedupGroup {
  /** ASIN of the "winner" (best metadata score). */
  canonicalAsin: string;
  /** All ASINs in this group (including canonical). */
  allAsins: string[];
  /** Title from the canonical entry. */
  title: string;
  /** Author from the canonical entry. */
  author: string;
  /** Narrator from the canonical entry. */
  narrator?: string;
  /** Duration from the canonical entry. */
  durationMinutes?: number;
}
|
|
|
|
/** Result of deduplication with group collection. */
export interface DeduplicateResult {
  /** The deduped list (same as deduplicateAudiobooks returns). */
  books: AudibleAudiobook[];
  /** Groups where 2+ ASINs were collapsed. */
  groups: DedupGroup[];
}
|
|
|
|
// ---------------------------------------------------------------------------
// Main dedup functions
// ---------------------------------------------------------------------------

/**
 * Deduplicate audiobook listings by normalized title + narrator + duration.
 *
 * Same narrator + compatible duration + similar title = same recording -> collapse.
 * Different narrator = different production -> keep both.
 * Duration outside tolerance = different content (abridged vs unabridged) -> keep both.
 *
 * Output order follows the first appearance of each title+narrator key; when a
 * key splits into several recordings they are emitted together at that position.
 *
 * Thin wrapper over deduplicateAndCollectGroups that discards the group metadata.
 *
 * @param books Listings to deduplicate.
 * @returns A new array with one representative per detected recording.
 */
export function deduplicateAudiobooks(books: AudibleAudiobook[]): AudibleAudiobook[] {
  const { books: deduped } = deduplicateAndCollectGroups(books);
  return deduped;
}
|
|
|
|
/**
 * Deduplicate audiobooks AND return grouping metadata for works-table persistence.
 * Returns both the deduped list and the groups where 2+ ASINs were collapsed.
 *
 * Steps:
 *  1. Bucket listings by normalized title + normalized narrator.
 *  2. Split each bucket into sub-groups. A listing joins the first sub-group
 *     whose representative (first member) has a compatible duration AND whose
 *     members ALL have a compatible subtitle; otherwise it starts a new one.
 *  3. From each sub-group keep the listing with the highest metadata score
 *     (earliest listing wins ties).
 *
 * @param books Listings to deduplicate. The array is not mutated.
 * @returns The deduped listings plus one DedupGroup per collapsed sub-group.
 */
export function deduplicateAndCollectGroups(books: AudibleAudiobook[]): DeduplicateResult {
  if (books.length <= 1) return { books: [...books], groups: [] };

  // Bucket by normalized title + narrator. A Map iterates in insertion order,
  // so buckets come back in order of each key's first appearance.
  const buckets = new Map<string, AudibleAudiobook[]>();
  for (const book of books) {
    const key = `${normalizeTitle(book.title)}|||${normalizeNarrator(book.narrator)}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.push(book);
    } else {
      buckets.set(key, [book]);
    }
  }

  const result: AudibleAudiobook[] = [];
  const dedupGroups: DedupGroup[] = [];

  for (const bucket of buckets.values()) {
    if (bucket.length === 1) {
      result.push(bucket[0]);
      continue;
    }

    // Split the bucket by duration AND subtitle compatibility.
    // Duration is compared against the sub-group representative only.
    // Subtitles are compared against EVERY member: "has no subtitle" is
    // compatible with anything, so checking only a subtitle-less representative
    // ("Series") would let "Series: Book A" and "Series: Book B" collapse.
    const subGroups: AudibleAudiobook[][] = [];
    for (const book of bucket) {
      const home = subGroups.find(
        sg =>
          areDurationsCompatible(sg[0].durationMinutes, book.durationMinutes) &&
          sg.every(member => areSubtitlesCompatible(member.title, book.title)),
      );
      if (home) {
        home.push(book);
      } else {
        subGroups.push([book]);
      }
    }

    // From each sub-group, pick the best representative and collect group metadata.
    for (const sg of subGroups) {
      let best = sg[0];
      let bestScore = metadataScore(best);
      for (const candidate of sg.slice(1)) {
        const score = metadataScore(candidate);
        // Strict '>' keeps the earliest listing when scores tie.
        if (score > bestScore) {
          best = candidate;
          bestScore = score;
        }
      }
      result.push(best);

      // Collect group metadata for works-table persistence (only multi-ASIN groups).
      if (sg.length >= 2) {
        dedupGroups.push({
          canonicalAsin: best.asin,
          allAsins: sg.map(b => b.asin),
          title: best.title,
          author: best.author,
          narrator: best.narrator,
          durationMinutes: best.durationMinutes,
        });
      }
    }
  }

  return { books: result, groups: dedupGroups };
}
|