mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-03 12:50:09 +00:00
Add works table and ASIN deduping
Add persistent cross-ASIN "works" mapping and client-side deduplication to improve library matching. Introduces a Prisma migration and models (Work, WorkAsin) plus src/lib/services/works.service for persisting dedup groups, seeding ASINs at request time, and sibling lookup. Adds a deduplication utility (deduplicate-audiobooks) that normalizes titles/narrators, compares durations, and returns grouping metadata; API routes (search, author, series) now deduplicate results before enrichment and fire-and-forget persist groups. Adds sibling-ASIN expansion into audiobook matcher and expands getAvailableAsins accordingly. Extracts runtime parsing into a shared parse-runtime util and updates audible scrapers/services to use it. Includes unit tests for dedup logic and works service and updates test Prisma mocks.
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
import { prisma } from '@/lib/db';
|
||||
import { LibraryItem } from '@/lib/services/library';
|
||||
import { getSiblingAsins } from '@/lib/services/works.service';
|
||||
import { RMABLogger } from './logger';
|
||||
|
||||
// Module-level logger
|
||||
@@ -178,6 +179,61 @@ export async function enrichAudiobooksWithMatches(
|
||||
}
|
||||
}
|
||||
|
||||
// Works-table sibling expansion: check if unmatched ASINs have siblings in the library
|
||||
try {
|
||||
const unmatchedAsins = results.filter(r => !r.isAvailable).map(r => r.asin);
|
||||
if (unmatchedAsins.length > 0) {
|
||||
const siblingMap = await getSiblingAsins(unmatchedAsins);
|
||||
if (siblingMap.size > 0) {
|
||||
// Collect all sibling ASINs for a single batch library query
|
||||
const allSiblingAsins = new Set<string>();
|
||||
for (const siblings of siblingMap.values()) {
|
||||
for (const s of siblings) allSiblingAsins.add(s);
|
||||
}
|
||||
|
||||
if (allSiblingAsins.size > 0) {
|
||||
const siblingLibraryMatches = await prisma.plexLibrary.findMany({
|
||||
where: { asin: { in: [...allSiblingAsins] } },
|
||||
select: { asin: true, plexGuid: true },
|
||||
});
|
||||
const libraryAsinSet = new Set(
|
||||
siblingLibraryMatches.filter(m => m.asin).map(m => m.asin!.toLowerCase())
|
||||
);
|
||||
|
||||
// Update results where a sibling ASIN is found in the library
|
||||
for (const result of results) {
|
||||
if (result.isAvailable) continue;
|
||||
const siblings = siblingMap.get(result.asin);
|
||||
if (!siblings) continue;
|
||||
const matchedSiblingAsin = siblings.find(s => libraryAsinSet.has(s.toLowerCase()));
|
||||
if (matchedSiblingAsin) {
|
||||
const libMatch = siblingLibraryMatches.find(
|
||||
m => m.asin?.toLowerCase() === matchedSiblingAsin.toLowerCase()
|
||||
);
|
||||
(result as any).isAvailable = true;
|
||||
(result as any).plexGuid = libMatch?.plexGuid || null;
|
||||
}
|
||||
}
|
||||
|
||||
const siblingMatchCount = results.filter(r => {
|
||||
if (!r.isAvailable) return false;
|
||||
return siblingMap.has(r.asin);
|
||||
}).length;
|
||||
logger.debug('Sibling expansion', {
|
||||
unmatchedCount: unmatchedAsins.length,
|
||||
siblingGroupsFound: siblingMap.size,
|
||||
siblingMatches: siblingMatchCount,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
// Works table expansion is best-effort — direct matches still work
|
||||
logger.error('Sibling ASIN expansion failed', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
}
|
||||
|
||||
// Always enrich with request status (check ANY user's requests)
|
||||
const asins = audiobooks.map(book => book.asin);
|
||||
|
||||
@@ -307,6 +363,19 @@ export async function getAvailableAsins(): Promise<Set<string>> {
|
||||
for (const item of completedRequests) {
|
||||
if (item.audibleAsin) asins.add(item.audibleAsin);
|
||||
}
|
||||
|
||||
// Expand with works-table sibling ASINs
|
||||
try {
|
||||
if (asins.size > 0) {
|
||||
const siblingMap = await getSiblingAsins([...asins]);
|
||||
for (const siblings of siblingMap.values()) {
|
||||
for (const s of siblings) asins.add(s);
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Works table expansion is best-effort
|
||||
}
|
||||
|
||||
return asins;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
/**
|
||||
* Component: Audiobook Deduplication Utility
|
||||
* Documentation: documentation/integrations/audible.md
|
||||
*
|
||||
* Deduplicates audiobook listings that represent the same recording
|
||||
* under different ASINs (publisher re-listings, rights transfers, etc.).
|
||||
*
|
||||
* Dedup key: normalized title + normalized narrator
|
||||
* Duration tolerance: max(longerDuration * 0.01, 5) minutes
|
||||
* Missing duration treated as compatible (graceful degradation).
|
||||
*/
|
||||
|
||||
import type { AudibleAudiobook } from '../integrations/audible.service';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Title / narrator normalization
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Patterns in parentheses or brackets to strip (edition markers, format labels) */
|
||||
const EDITION_PAREN_RE = /[([][^)\]]*?(?:unabridged|abridged|edition|remaster(?:ed)?|anniversary|complete|original|version|narrat(?:ed|or)?|audio(?:book)?|full cast|dramatiz(?:ed|ation))[^)\]]*[)\]]/gi;
|
||||
|
||||
/** Trailing subtitle after colon or long dash */
|
||||
const SUBTITLE_RE = /\s*[:]\s+.+$/;
|
||||
const LONG_DASH_SUBTITLE_RE = /\s+[-\u2013\u2014]\s+.+$/;
|
||||
|
||||
/** Trailing descriptors like "A Novel", "A Memoir" */
|
||||
const TRAILING_DESCRIPTOR_RE = /\s*[-:,]?\s+a\s+(novel|memoir|thriller|mystery|romance|story|tale|novella)\s*$/i;
|
||||
|
||||
/**
|
||||
* Normalize a title for dedup comparison.
|
||||
* Strips subtitles, edition markers, and trailing descriptors.
|
||||
*/
|
||||
export function normalizeTitle(title: string): string {
|
||||
let t = title.toLowerCase();
|
||||
// Remove parenthesized/bracketed edition markers
|
||||
t = t.replace(EDITION_PAREN_RE, '');
|
||||
// Remove trailing descriptors before subtitle stripping
|
||||
t = t.replace(TRAILING_DESCRIPTOR_RE, '');
|
||||
// Remove subtitle after colon
|
||||
t = t.replace(SUBTITLE_RE, '');
|
||||
// Remove subtitle after long dash (but not short hyphenated words)
|
||||
t = t.replace(LONG_DASH_SUBTITLE_RE, '');
|
||||
// Collapse whitespace and trim
|
||||
return t.replace(/\s+/g, ' ').trim();
|
||||
}
|
||||
|
||||
/** Normalize narrator for comparison. */
|
||||
function normalizeNarrator(narrator?: string): string {
|
||||
return (narrator || '').toLowerCase().trim();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Duration compatibility
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Check if two durations are compatible (represent the same recording).
|
||||
* Tolerance: max(longerDuration * 0.01, 5) minutes.
|
||||
* Missing duration on either side is treated as compatible.
|
||||
*/
|
||||
export function areDurationsCompatible(a?: number, b?: number): boolean {
|
||||
if (a == null || b == null) return true;
|
||||
const longer = Math.max(a, b);
|
||||
const tolerance = Math.max(longer * 0.01, 5);
|
||||
return Math.abs(a - b) <= tolerance;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metadata scoring (for picking best representative)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function metadataScore(book: AudibleAudiobook): number {
|
||||
let score = 0;
|
||||
if (book.coverArtUrl) score++;
|
||||
if (book.rating != null) score++;
|
||||
if (book.durationMinutes != null) score++;
|
||||
if (book.description) score++;
|
||||
if (book.narrator) score++;
|
||||
if (book.releaseDate) score++;
|
||||
if (book.genres && book.genres.length > 0) score++;
|
||||
return score;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dedup group types (for works-table persistence)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Metadata about a group of ASINs that were collapsed during dedup. */
|
||||
export interface DedupGroup {
|
||||
canonicalAsin: string; // ASIN of the "winner" (best metadata score)
|
||||
allAsins: string[]; // All ASINs in this group (including canonical)
|
||||
title: string; // Author from the canonical entry
|
||||
author: string; // Author from the canonical entry
|
||||
narrator?: string; // Narrator from the canonical entry
|
||||
durationMinutes?: number; // Duration from the canonical entry
|
||||
}
|
||||
|
||||
/** Result of deduplication with group collection. */
|
||||
export interface DeduplicateResult {
|
||||
books: AudibleAudiobook[]; // The deduped list (same as deduplicateAudiobooks returns)
|
||||
groups: DedupGroup[]; // Groups where 2+ ASINs were collapsed
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main dedup functions
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Deduplicate audiobook listings by normalized title + narrator + duration.
|
||||
*
|
||||
* Same narrator + compatible duration + similar title = same recording -> collapse.
|
||||
* Different narrator = different production -> keep both.
|
||||
* Duration outside tolerance = different content (abridged vs unabridged) -> keep both.
|
||||
*
|
||||
* Preserves original ordering (position of first appearance).
|
||||
*/
|
||||
export function deduplicateAudiobooks(books: AudibleAudiobook[]): AudibleAudiobook[] {
|
||||
return deduplicateAndCollectGroups(books).books;
|
||||
}
|
||||
|
||||
/**
|
||||
* Deduplicate audiobooks AND return grouping metadata for works-table persistence.
|
||||
* Returns both the deduped list and the groups where 2+ ASINs were collapsed.
|
||||
*/
|
||||
export function deduplicateAndCollectGroups(books: AudibleAudiobook[]): DeduplicateResult {
|
||||
if (books.length <= 1) return { books: [...books], groups: [] };
|
||||
|
||||
// Group by normalized title + narrator
|
||||
const titleNarratorGroups = new Map<string, AudibleAudiobook[]>();
|
||||
const insertionOrder: string[] = [];
|
||||
|
||||
for (const book of books) {
|
||||
const key = `${normalizeTitle(book.title)}|||${normalizeNarrator(book.narrator)}`;
|
||||
const group = titleNarratorGroups.get(key);
|
||||
if (group) {
|
||||
group.push(book);
|
||||
} else {
|
||||
titleNarratorGroups.set(key, [book]);
|
||||
insertionOrder.push(key);
|
||||
}
|
||||
}
|
||||
|
||||
const result: AudibleAudiobook[] = [];
|
||||
const dedupGroups: DedupGroup[] = [];
|
||||
|
||||
for (const key of insertionOrder) {
|
||||
const group = titleNarratorGroups.get(key)!;
|
||||
if (group.length === 1) {
|
||||
result.push(group[0]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Within a title+narrator group, further split by duration compatibility.
|
||||
// Build sub-groups where all members are duration-compatible with the
|
||||
// representative (first member). A book joins the first compatible sub-group.
|
||||
const subGroups: AudibleAudiobook[][] = [];
|
||||
|
||||
for (const book of group) {
|
||||
let placed = false;
|
||||
for (const sg of subGroups) {
|
||||
// Check compatibility against the representative (first member)
|
||||
if (areDurationsCompatible(sg[0].durationMinutes, book.durationMinutes)) {
|
||||
sg.push(book);
|
||||
placed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!placed) {
|
||||
subGroups.push([book]);
|
||||
}
|
||||
}
|
||||
|
||||
// From each sub-group, pick the best representative and collect group metadata
|
||||
for (const sg of subGroups) {
|
||||
let best = sg[0];
|
||||
let bestScore = metadataScore(best);
|
||||
for (let i = 1; i < sg.length; i++) {
|
||||
const score = metadataScore(sg[i]);
|
||||
if (score > bestScore) {
|
||||
best = sg[i];
|
||||
bestScore = score;
|
||||
}
|
||||
}
|
||||
result.push(best);
|
||||
|
||||
// Collect group metadata for works-table persistence (only multi-ASIN groups)
|
||||
if (sg.length >= 2) {
|
||||
dedupGroups.push({
|
||||
canonicalAsin: best.asin,
|
||||
allAsins: sg.map(b => b.asin),
|
||||
title: best.title,
|
||||
author: best.author,
|
||||
narrator: best.narrator,
|
||||
durationMinutes: best.durationMinutes,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { books: result, groups: dedupGroups };
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/**
|
||||
* Component: Runtime Parsing Utility
|
||||
* Documentation: documentation/integrations/audible.md
|
||||
*
|
||||
* Shared runtime/duration text parser extracted from AudibleService.
|
||||
* Handles all i18n patterns (English, German, Spanish, French) via
|
||||
* language-specific regex patterns in LanguageConfig.
|
||||
*/
|
||||
|
||||
import type { LanguageConfig } from '../constants/language-config';
|
||||
|
||||
/**
|
||||
* Parse runtime text (e.g. "12 hrs and 30 mins", "5 Std. 20 Min.")
|
||||
* into total minutes using language-specific patterns.
|
||||
*
|
||||
* @param runtimeText - Raw runtime string from Audible HTML
|
||||
* @param langConfig - Language configuration with hour/minute regex patterns
|
||||
* @returns Total minutes, or undefined if no duration could be parsed
|
||||
*/
|
||||
export function parseRuntime(runtimeText: string, langConfig: LanguageConfig): number | undefined {
|
||||
if (!runtimeText) return undefined;
|
||||
|
||||
let totalMinutes = 0;
|
||||
|
||||
// Try each hour pattern until one matches
|
||||
for (const pattern of langConfig.scraping.runtimeHourPatterns) {
|
||||
const match = runtimeText.match(pattern);
|
||||
if (match) {
|
||||
totalMinutes += parseInt(match[1]) * 60;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Try each minute pattern until one matches
|
||||
for (const pattern of langConfig.scraping.runtimeMinutePatterns) {
|
||||
const match = runtimeText.match(pattern);
|
||||
if (match) {
|
||||
totalMinutes += parseInt(match[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return totalMinutes > 0 ? totalMinutes : undefined;
|
||||
}
|
||||
Reference in New Issue
Block a user