mirror of
https://github.com/kikootwo/ReadMeABook.git
synced 2026-06-18 04:00:10 +00:00
Add works table and ASIN deduping
Add persistent cross-ASIN "works" mapping and client-side deduplication to improve library matching. Introduces a Prisma migration and models (Work, WorkAsin) plus src/lib/services/works.service for persisting dedup groups, seeding ASINs at request time, and sibling lookup. Adds a deduplication utility (deduplicate-audiobooks) that normalizes titles/narrators, compares durations, and returns grouping metadata; API routes (search, author, series) now deduplicate results before enrichment and fire-and-forget persist groups. Adds sibling-ASIN expansion into audiobook matcher and expands getAvailableAsins accordingly. Extracts runtime parsing into a shared parse-runtime util and updates audible scrapers/services to use it. Includes unit tests for dedup logic and works service and updates test Prisma mocks.
This commit is contained in:
@@ -14,8 +14,10 @@ import {
|
||||
getLanguageForRegion,
|
||||
buildContainsSelector,
|
||||
stripPrefixes,
|
||||
type LanguageConfig,
|
||||
} from '../constants/language-config';
|
||||
import { RMABLogger } from '../utils/logger';
|
||||
import { parseRuntime } from '../utils/parse-runtime';
|
||||
import { randomDelay } from '../utils/scrape-resilience';
|
||||
|
||||
const logger = RMABLogger.create('Audible.Series');
|
||||
@@ -311,7 +313,7 @@ export async function scrapeSeriesPage(asin: string, page: number = 1): Promise<
|
||||
undefined;
|
||||
|
||||
// Parse all books from the series page
|
||||
const books = parseSeriesBooks($, langConfig.scraping.authorPrefixes, langConfig.scraping.narratorPrefixes);
|
||||
const books = parseSeriesBooks($, langConfig.scraping.authorPrefixes, langConfig.scraping.narratorPrefixes, langConfig);
|
||||
|
||||
// Use actual book count if we got more from scraping
|
||||
const bookCount = Math.max(summary.bookCount, books.length);
|
||||
@@ -403,7 +405,8 @@ function parseSeriesRating($: cheerio.CheerioAPI): { rating?: number; ratingCoun
|
||||
function parseSeriesBooks(
|
||||
$: cheerio.CheerioAPI,
|
||||
authorPrefixes: string[],
|
||||
narratorPrefixes: string[]
|
||||
narratorPrefixes: string[],
|
||||
langConfig: LanguageConfig
|
||||
): AudibleAudiobook[] {
|
||||
const books: AudibleAudiobook[] = [];
|
||||
const seenAsins = new Set<string>();
|
||||
@@ -453,6 +456,11 @@ function parseSeriesBooks(
|
||||
const ratingMatch = ratingText ? ratingText.match(/(\d+[.,]?\d*)/) : null;
|
||||
const rating = ratingMatch ? parseFloat(ratingMatch[1].replace(',', '.')) : undefined;
|
||||
|
||||
// Duration
|
||||
const runtimeText = $el.find('.runtimeLabel').text().trim() ||
|
||||
$el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
|
||||
const durationMinutes = parseRuntime(runtimeText, langConfig);
|
||||
|
||||
books.push({
|
||||
asin: bookAsin,
|
||||
title,
|
||||
@@ -461,6 +469,7 @@ function parseSeriesBooks(
|
||||
narrator: stripPrefixes(narratorText, narratorPrefixes),
|
||||
coverArtUrl,
|
||||
rating,
|
||||
durationMinutes,
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ import {
|
||||
AdaptivePacer,
|
||||
FetchResultMeta,
|
||||
} from '../utils/scrape-resilience';
|
||||
import { parseRuntime as parseRuntimeUtil } from '../utils/parse-runtime';
|
||||
|
||||
// Module-level logger
|
||||
const logger = RMABLogger.create('Audible');
|
||||
@@ -1134,33 +1135,11 @@ export class AudibleService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse runtime text to minutes using language-specific patterns
|
||||
* Parse runtime text to minutes using language-specific patterns.
|
||||
* Delegates to shared utility in src/lib/utils/parse-runtime.ts.
|
||||
*/
|
||||
private parseRuntime(runtimeText: string): number | undefined {
|
||||
if (!runtimeText) return undefined;
|
||||
|
||||
const langConfig = this.getLangConfig();
|
||||
let totalMinutes = 0;
|
||||
|
||||
// Try each hour pattern until one matches
|
||||
for (const pattern of langConfig.scraping.runtimeHourPatterns) {
|
||||
const match = runtimeText.match(pattern);
|
||||
if (match) {
|
||||
totalMinutes += parseInt(match[1]) * 60;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Try each minute pattern until one matches
|
||||
for (const pattern of langConfig.scraping.runtimeMinutePatterns) {
|
||||
const match = runtimeText.match(pattern);
|
||||
if (match) {
|
||||
totalMinutes += parseInt(match[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return totalMinutes > 0 ? totalMinutes : undefined;
|
||||
return parseRuntimeUtil(runtimeText, this.getLangConfig());
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user