Implement file hash-based library matching and remove fuzzy ASIN matching

Adds file hash-based matching for Audiobookshelf library items to ensure 100% accurate ASIN assignment for RMAB-organized content. Removes fuzzy matching from library availability checks, making all matching ASIN-only to eliminate false positives and race conditions. Updates database schema, processors, and matcher utilities; adds new tests and documentation for the new matching strategy. Removes obsolete scripts, Dockerfile, and related tests; updates docker-compose for test environments.
This commit is contained in:
kikootwo
2026-01-28 10:32:14 -05:00
parent 497849f427
commit a97979358f
111 changed files with 6571 additions and 1426 deletions
+26 -202
View File
@@ -3,11 +3,10 @@
* Documentation: documentation/integrations/audible.md
*
* Real-time matching between Audible books and library backends (Plex or Audiobookshelf).
* Supports ASIN, ISBN, and fuzzy title/author matching.
* ASIN-only matching for library availability checks (exact matches only).
*/
import { prisma } from '@/lib/db';
import { compareTwoStrings } from 'string-similarity';
import { LibraryItem } from '@/lib/services/library';
import { RMABLogger } from './logger';
@@ -28,43 +27,13 @@ export interface AudiobookMatchResult {
author: string;
}
/**
 * Normalize an audiobook title for matching by stripping decorations that do
 * not affect the core title identity.
 *
 * Steps, applied once each and in this order:
 * 1. Lower-case and trim the input.
 * 2. Replace edition/production parentheticals such as `(Unabridged)` or
 *    `(Narrated by ...)` with a single space.
 * 3. Drop a trailing genre subtitle (`: A Novel`, `: A Thriller`, `: A Memoir`).
 * 4. Drop a trailing series marker (`, Book 3`, `: Book 3`, ` Book 3`).
 * 5. Collapse runs of whitespace and trim.
 *
 * @param title - Raw title as reported by Audible or the library backend
 * @returns The lower-cased, simplified title (may be empty)
 */
function normalizeTitle(title: string): string {
  // Parenthetical additions are replaced with a space (not removed outright) so
  // the words on either side of them do not get glued together.
  const parentheticalPatterns: readonly RegExp[] = [
    /\s*\(unabridged\)\s*/gi,
    /\s*\(abridged\)\s*/gi,
    /\s*\(full cast\)\s*/gi,
    /\s*\(full-cast edition\)\s*/gi,
    /\s*\(dramatized\)\s*/gi,
    /\s*\(narrated by[^)]*\)\s*/gi,
  ];

  // Suffixes that are only stripped when they terminate the title.
  const trailingPatterns: readonly RegExp[] = [
    /:\s*a novel\s*$/gi,
    /:\s*a thriller\s*$/gi,
    /:\s*a memoir\s*$/gi,
    // One rule for both ", Book N" and ": Book N". Handling them as two
    // sequential rules let the comma variant consume "book N" first and leave a
    // dangling ":" behind. The \b keeps words that merely end in "book"
    // (e.g. "Notebook 2") intact.
    /[,:]?\s*\bbook\s+\d+\s*$/gi,
  ];

  let normalized = title.toLowerCase().trim();

  for (const pattern of parentheticalPatterns) {
    normalized = normalized.replace(pattern, ' ');
  }
  for (const pattern of trailingPatterns) {
    normalized = normalized.replace(pattern, '');
  }

  // Clean up whitespace introduced or exposed by the replacements above.
  return normalized.replace(/\s+/g, ' ').trim();
}
/**
* Find a matching audiobook in the Plex library for a given Audible audiobook.
*
* Matching logic (in order of priority):
* 1. **ASIN in plexGuid** - Check if any Plex book's GUID contains the Audible ASIN (100% match)
* 2. **Fuzzy matching** - Normalized title/author string similarity with 70% threshold
* Matching logic (ASIN-only, exact matches):
* 1. **ASIN in dedicated field** - Check if plexLibrary.asin matches (100% confidence)
* 2. **ASIN in plexGuid** - Check if Plex GUID contains the Audible ASIN (backward compatibility)
* 3. **No match** - Return null (no fuzzy fallback)
*
* @param audiobook - Audible audiobook to match
* @returns Matched Plex library item or null
@@ -72,25 +41,22 @@ function normalizeTitle(title: string): string {
export async function findPlexMatch(
audiobook: AudiobookMatchInput
): Promise<AudiobookMatchResult | null> {
// Query plex_library for potential matches
// IMPORTANT: Search by TITLE ONLY (not author) because Plex often has narrator as author
const titleSearchLength = Math.min(20, audiobook.title.length);
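// Query plex_library directly by ASIN: exact match on the dedicated asin field,
// plus a substring (`contains`) check on plexGuid for backward compatibility.
// NOTE(review): the substring check is typically not served by a standard index - confirm query cost.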
const plexBooks = await prisma.plexLibrary.findMany({
where: {
title: {
contains: audiobook.title.substring(0, titleSearchLength),
mode: 'insensitive',
},
OR: [
{ asin: audiobook.asin },
{ plexGuid: { contains: audiobook.asin } },
],
},
select: {
plexGuid: true,
plexRatingKey: true,
title: true,
author: true,
asin: true, // Include ASIN field for direct matching
isbn: true, // Include ISBN field for additional matching
asin: true,
},
take: 20,
});
// Build match result for logging
@@ -107,9 +73,9 @@ export async function findPlexMatch(
result: null,
};
// If no candidates found, log and return null
// If no ASIN matches found, log and return null
if (plexBooks.length === 0) {
matchResult.matchType = 'no_candidates';
matchResult.matchType = 'no_asin_match';
logger.debug('Matcher result', { MATCHER: matchResult });
return null;
}
@@ -147,116 +113,8 @@ export async function findPlexMatch(
}
}
// FILTER OUT candidates with wrong ASINs (check both dedicated field and plexGuid)
const ASIN_PATTERN = /[A-Z0-9]{10}/g;
const rejectedAsins: string[] = [];
const validCandidates = plexBooks.filter((plexBook) => {
// Check dedicated ASIN field first (more reliable)
if (plexBook.asin) {
if (plexBook.asin.toLowerCase() !== audiobook.asin.toLowerCase()) {
rejectedAsins.push(plexBook.asin);
return false; // Wrong ASIN in dedicated field - reject
}
return true; // Correct ASIN in dedicated field - keep
}
// Fall back to checking plexGuid for legacy Plex data
if (!plexBook.plexGuid) return true;
const asinsInGuid = plexBook.plexGuid.match(ASIN_PATTERN);
if (!asinsInGuid || asinsInGuid.length === 0) return true;
const hasOurAsin = asinsInGuid.some(asin => asin === audiobook.asin);
const hasOtherAsins = asinsInGuid.some(asin => asin !== audiobook.asin);
if (hasOtherAsins && !hasOurAsin) {
rejectedAsins.push(...asinsInGuid);
return false;
}
return true;
});
matchResult.asinFiltering = {
beforeCount: plexBooks.length,
afterCount: validCandidates.length,
rejectedAsins: rejectedAsins.length > 0 ? rejectedAsins : undefined,
};
if (validCandidates.length === 0) {
matchResult.matchType = 'asin_filtered_all';
logger.debug('Matcher result', { MATCHER: matchResult });
return null;
}
// Normalize the Audible title
const normalizedAudibleTitle = normalizeTitle(audiobook.title);
// PRIORITY 2: Perform fuzzy matching
const candidates = validCandidates.map((plexBook) => {
const normalizedPlexTitle = normalizeTitle(plexBook.title);
const titleScore = compareTwoStrings(normalizedAudibleTitle, normalizedPlexTitle);
const authorScore = compareTwoStrings(
audiobook.author.toLowerCase(),
plexBook.author.toLowerCase()
);
let narratorScore = 0;
let usedNarratorMatch = false;
if (audiobook.narrator) {
narratorScore = compareTwoStrings(
audiobook.narrator.toLowerCase(),
plexBook.author.toLowerCase()
);
usedNarratorMatch = narratorScore > authorScore;
}
const personScore = usedNarratorMatch ? narratorScore : authorScore;
const overallScore = titleScore * 0.7 + personScore * 0.3;
return {
plexBook,
titleScore,
authorScore,
narratorScore,
usedNarratorMatch,
score: overallScore
};
});
// Sort by score descending
candidates.sort((a, b) => b.score - a.score);
const bestMatch = candidates[0];
// Add best match details to result
matchResult.bestCandidate = {
plexTitle: bestMatch.plexBook.title,
plexAuthor: bestMatch.plexBook.author,
plexGuid: bestMatch.plexBook.plexGuid,
scores: {
title: Math.round(bestMatch.titleScore * 100),
author: Math.round(bestMatch.authorScore * 100),
narrator: audiobook.narrator ? Math.round(bestMatch.narratorScore * 100) : null,
usedMatch: bestMatch.usedNarratorMatch ? 'narrator' : 'author',
overall: Math.round(bestMatch.score * 100),
},
threshold: 70,
};
// Accept match if score >= 70%
if (bestMatch && bestMatch.score >= 0.7) {
matchResult.matchType = 'fuzzy';
matchResult.matched = true;
matchResult.result = {
plexGuid: bestMatch.plexBook.plexGuid,
plexTitle: bestMatch.plexBook.title,
plexAuthor: bestMatch.plexBook.author,
confidence: Math.round(bestMatch.score * 100),
};
logger.debug('Matcher result', { MATCHER: matchResult });
return bestMatch.plexBook;
}
// No match found
matchResult.matchType = 'fuzzy_below_threshold';
// No exact match found (shouldn't happen given the query, but defensive)
matchResult.matchType = 'no_exact_match';
logger.debug('Matcher result', { MATCHER: matchResult });
return null;
}
@@ -384,10 +242,10 @@ function normalizeISBN(isbn: string): string {
* Generic audiobook matching function that works with LibraryItem interface.
* Works with any library backend (Plex, Audiobookshelf, etc.)
*
* Matching priority:
* Matching priority (exact ASIN/ISBN matches only):
* 1. Exact ASIN match (100% confidence)
* 2. Exact ISBN match (95% confidence)
* 3. Fuzzy title/author match (70%+ threshold)
* 3. No match - Return null (no fuzzy fallback)
*
* @param request - Audiobook request details
* @param libraryItems - Items from library backend
@@ -430,49 +288,15 @@ export function matchAudiobook(
}
}
// 3. Fuzzy title/author match
const normalizedRequestTitle = normalizeTitle(request.title);
const normalizedRequestAuthor = request.author.toLowerCase();
const candidates = libraryItems.map(item => {
const normalizedItemTitle = normalizeTitle(item.title);
const normalizedItemAuthor = item.author.toLowerCase();
const titleScore = compareTwoStrings(normalizedRequestTitle, normalizedItemTitle);
const authorScore = compareTwoStrings(normalizedRequestAuthor, normalizedItemAuthor);
// Weighted average: title is more important
const overallScore = titleScore * 0.7 + authorScore * 0.3;
return { item, titleScore, authorScore, score: overallScore };
});
// Sort by score and get best match
candidates.sort((a, b) => b.score - a.score);
const bestMatch = candidates[0];
// Accept if score >= 70%
if (bestMatch && bestMatch.score >= 0.7) {
logger.debug('Generic matcher result', {
matchType: 'fuzzy',
input: { title: request.title, author: request.author },
matched: { title: bestMatch.item.title, author: bestMatch.item.author },
scores: {
title: Math.round(bestMatch.titleScore * 100),
author: Math.round(bestMatch.authorScore * 100),
overall: Math.round(bestMatch.score * 100)
},
confidence: Math.round(bestMatch.score * 100)
});
return bestMatch.item;
}
// No match found
// No match found (no ASIN/ISBN match, no fuzzy fallback)
logger.debug('Generic matcher result', {
matchType: 'no_match',
input: { title: request.title, author: request.author },
bestScore: bestMatch ? Math.round(bestMatch.score * 100) : 0,
threshold: 70
matchType: 'no_asin_isbn_match',
input: {
title: request.title,
author: request.author,
asin: request.asin || 'none',
isbn: request.isbn || 'none'
},
});
return null;