From 5d8ac2f73d35a06a99e9cafbb769468cae3aeb7d Mon Sep 17 00:00:00 2001 From: kikootwo Date: Fri, 20 Feb 2026 06:32:44 -0500 Subject: [PATCH] Add language config and locale-aware parsing Introduce centralized language configuration and wire locale-aware behavior across scraping and ranking. Adds src/lib/constants/language-config.ts with per-language scraping rules, stop words, and character replacements; replaces AudibleRegion.isEnglish with a language field in types and AUDIBLE_REGIONS. Update AudibleService, ebook scraper, processors, and API routes to use getLanguageForRegion so Anna's Archive searches, scraping selectors, runtime/rating parsing, and ranking use language-specific params and filters. Extend ranking algorithm to accept stopWords and characterReplacements and apply them during normalization and matching. Update UI selects to mark non-English regions and adjust tests accordingly. --- .../tabs/LibraryTab/AudiobookshelfSection.tsx | 4 +- .../settings/tabs/LibraryTab/PlexSection.tsx | 4 +- .../[asin]/interactive-search-ebook/route.ts | 23 +- .../api/audiobooks/search-torrents/route.ts | 10 +- .../[id]/interactive-search-ebook/route.ts | 23 +- .../requests/[id]/interactive-search/route.ts | 10 +- src/app/setup/steps/BackendSelectionStep.tsx | 4 +- src/lib/constants/language-config.ts | 252 ++++++++++++++++++ src/lib/integrations/audible.service.ts | 152 +++++++---- src/lib/processors/search-ebook.processor.ts | 17 +- .../processors/search-indexers.processor.ts | 10 +- src/lib/services/ebook-scraper.ts | 23 +- src/lib/types/audible.ts | 18 +- src/lib/utils/ranking-algorithm.ts | 68 +++-- tests/api/audiobooks-search.routes.test.ts | 2 + tests/api/requests-actions.routes.test.ts | 3 +- .../processors/search-ebook.processor.test.ts | 11 +- .../search-indexers.processor.test.ts | 3 +- 18 files changed, 525 insertions(+), 112 deletions(-) create mode 100644 src/lib/constants/language-config.ts diff --git 
a/src/app/admin/settings/tabs/LibraryTab/AudiobookshelfSection.tsx b/src/app/admin/settings/tabs/LibraryTab/AudiobookshelfSection.tsx index 03bc2d4..46e6c8e 100644 --- a/src/app/admin/settings/tabs/LibraryTab/AudiobookshelfSection.tsx +++ b/src/app/admin/settings/tabs/LibraryTab/AudiobookshelfSection.tsx @@ -164,11 +164,11 @@ export function AudiobookshelfSection({ > {Object.values(AUDIBLE_REGIONS).map((region) => ( ))} - {AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.isEnglish === false && ( + {AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.language !== 'en' && (
{Object.values(AUDIBLE_REGIONS).map((region) => ( ))} - {AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.isEnglish === false && ( + {AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.language !== 'en' && (
{ logger.error(`Anna's Archive search failed: ${err.message}`); return null; @@ -322,7 +330,8 @@ async function searchAnnasArchiveForInteractive( author: string, preferredFormat: string, baseUrl: string, - flaresolverrUrl?: string + flaresolverrUrl?: string, + languageCode: string = 'en' ): Promise { let md5: string | null = null; let searchMethod: 'asin' | 'title' = 'title'; @@ -330,7 +339,7 @@ async function searchAnnasArchiveForInteractive( // Try ASIN search first if (asin) { logger.info(`Searching Anna's Archive by ASIN: ${asin}`); - md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl); + md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode); if (md5) { searchMethod = 'asin'; logger.info(`Found via ASIN: ${md5}`); @@ -340,7 +349,7 @@ async function searchAnnasArchiveForInteractive( // Fallback to title search if (!md5) { logger.info(`Searching Anna's Archive by title: "${title}"`); - md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl); + md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode); if (md5) { logger.info(`Found via title: ${md5}`); } @@ -461,6 +470,10 @@ async function searchIndexersForInteractive( return []; } + // Get language-specific stop words for ranking + const rankRegion = await configService.getAudibleRegion() as AudibleRegion; + const rankLangConfig = getLanguageForRegion(rankRegion); + // Rank results with ebook scoring const rankedResults = rankEbookTorrents(allResults, { title, @@ -470,6 +483,8 @@ async function searchIndexersForInteractive( indexerPriorities, flagConfigs, requireAuthor: false, + stopWords: rankLangConfig.stopWords, + characterReplacements: rankLangConfig.characterReplacements, }); // Convert to unified result type diff --git a/src/app/api/audiobooks/search-torrents/route.ts b/src/app/api/audiobooks/search-torrents/route.ts index 0565f7e..59b8bfe 
100644 --- a/src/app/api/audiobooks/search-torrents/route.ts +++ b/src/app/api/audiobooks/search-torrents/route.ts @@ -10,6 +10,8 @@ import { requireAuth, AuthenticatedRequest } from '@/lib/middleware/auth'; import { getProwlarrService } from '@/lib/integrations/prowlarr.service'; import { rankTorrents } from '@/lib/utils/ranking-algorithm'; import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping'; +import { getLanguageForRegion } from '@/lib/constants/language-config'; +import type { AudibleRegion } from '@/lib/types/audible'; import { z } from 'zod'; import { RMABLogger } from '@/lib/utils/logger'; @@ -140,13 +142,19 @@ export async function POST(request: NextRequest) { logger.info(`Will filter ${belowThreshold.length} results < ${sizeMBThreshold} MB (likely ebooks)`); } + // Get language-specific stop words for ranking + const region = await configService.getAudibleRegion() as AudibleRegion; + const langConfig = getLanguageForRegion(region); + // Rank torrents using the ranking algorithm with indexer priorities and flag configs // Note: rankTorrents now filters out results < 20 MB internally // requireAuthor: false - interactive search, show all results for user decision const rankedResults = rankTorrents(results, { title, author, durationMinutes }, { indexerPriorities, flagConfigs, - requireAuthor: false // Interactive mode - let user decide + requireAuthor: false, // Interactive mode - let user decide + stopWords: langConfig.stopWords, + characterReplacements: langConfig.characterReplacements, }); // Log filter results diff --git a/src/app/api/requests/[id]/interactive-search-ebook/route.ts b/src/app/api/requests/[id]/interactive-search-ebook/route.ts index 0e3ed6e..379c279 100644 --- a/src/app/api/requests/[id]/interactive-search-ebook/route.ts +++ b/src/app/api/requests/[id]/interactive-search-ebook/route.ts @@ -14,6 +14,8 @@ import { getProwlarrService } from '@/lib/integrations/prowlarr.service'; import { 
rankEbookTorrents, RankedEbookTorrent } from '@/lib/utils/ranking-algorithm'; import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping'; import { RMABLogger } from '@/lib/utils/logger'; +import { getLanguageForRegion } from '@/lib/constants/language-config'; +import type { AudibleRegion } from '@/lib/types/audible'; import { searchByAsin, searchByTitle, @@ -121,6 +123,11 @@ export async function POST( const format = preferredFormat || 'epub'; const annasBaseUrl = baseUrl || 'https://annas-archive.li'; + // Get language code from Audible region config + const region = await configService.getAudibleRegion() as AudibleRegion; + const langConfig = getLanguageForRegion(region); + const languageCode = langConfig.annasArchiveLang; + if (!isAnnasArchiveEnabled && !isIndexerSearchEnabled) { return NextResponse.json( { error: 'No ebook sources enabled. Enable Anna\'s Archive or Indexer Search in settings.' }, @@ -145,7 +152,8 @@ export async function POST( audiobook.author, format, annasBaseUrl, - flaresolverrUrl || undefined + flaresolverrUrl || undefined, + languageCode ).catch((err) => { logger.error(`Anna's Archive search failed: ${err.message}`); return null; @@ -217,7 +225,8 @@ async function searchAnnasArchiveForInteractive( author: string, preferredFormat: string, baseUrl: string, - flaresolverrUrl?: string + flaresolverrUrl?: string, + languageCode: string = 'en' ): Promise { let md5: string | null = null; let searchMethod: 'asin' | 'title' = 'title'; @@ -225,7 +234,7 @@ async function searchAnnasArchiveForInteractive( // Try ASIN search first if (asin) { logger.info(`Searching Anna's Archive by ASIN: ${asin}`); - md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl); + md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode); if (md5) { searchMethod = 'asin'; logger.info(`Found via ASIN: ${md5}`); @@ -235,7 +244,7 @@ async function 
searchAnnasArchiveForInteractive( // Fallback to title search if (!md5) { logger.info(`Searching Anna's Archive by title: "${title}"`); - md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl); + md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode); if (md5) { logger.info(`Found via title: ${md5}`); } @@ -356,6 +365,10 @@ async function searchIndexersForInteractive( return []; } + // Get language-specific stop words for ranking + const rankRegion = await configService.getAudibleRegion() as AudibleRegion; + const rankLangConfig = getLanguageForRegion(rankRegion); + // Rank results with ebook scoring // Use requireAuthor=false for interactive mode (let user decide) const rankedResults = rankEbookTorrents(allResults, { @@ -366,6 +379,8 @@ async function searchIndexersForInteractive( indexerPriorities, flagConfigs, requireAuthor: false, + stopWords: rankLangConfig.stopWords, + characterReplacements: rankLangConfig.characterReplacements, }); // Log ranking debug info (same format as search-ebook.processor.ts) diff --git a/src/app/api/requests/[id]/interactive-search/route.ts b/src/app/api/requests/[id]/interactive-search/route.ts index 016bbd8..5269f28 100644 --- a/src/app/api/requests/[id]/interactive-search/route.ts +++ b/src/app/api/requests/[id]/interactive-search/route.ts @@ -9,6 +9,8 @@ import { prisma } from '@/lib/db'; import { getProwlarrService } from '@/lib/integrations/prowlarr.service'; import { rankTorrents } from '@/lib/utils/ranking-algorithm'; import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping'; +import { getLanguageForRegion } from '@/lib/constants/language-config'; +import type { AudibleRegion } from '@/lib/types/audible'; import { RMABLogger } from '@/lib/utils/logger'; import { resolveInteractiveSearchAccess } from '@/lib/utils/permissions'; @@ -189,6 +191,10 @@ export async function POST( } } + // Get 
language-specific stop words for ranking + const region = await configService.getAudibleRegion() as AudibleRegion; + const langConfig = getLanguageForRegion(region); + // Rank torrents using the ranking algorithm with indexer priorities and flag configs // Always use the audiobook's title/author for ranking (not custom search query) // requireAuthor: false - interactive mode, show all results for user decision @@ -199,7 +205,9 @@ export async function POST( }, { indexerPriorities, flagConfigs, - requireAuthor: false // Interactive mode - let user decide + requireAuthor: false, // Interactive mode - let user decide + stopWords: langConfig.stopWords, + characterReplacements: langConfig.characterReplacements, }); // No threshold filtering for interactive search - show all results diff --git a/src/app/setup/steps/BackendSelectionStep.tsx b/src/app/setup/steps/BackendSelectionStep.tsx index f8f44bb..35b7826 100644 --- a/src/app/setup/steps/BackendSelectionStep.tsx +++ b/src/app/setup/steps/BackendSelectionStep.tsx @@ -115,11 +115,11 @@ export function BackendSelectionStep({ > {Object.values(AUDIBLE_REGIONS).map((region) => ( ))} - {AUDIBLE_REGIONS[audibleRegion]?.isEnglish === false && ( + {AUDIBLE_REGIONS[audibleRegion]?.language !== 'en' && (
; + /** All scraping-related config */ + scraping: ScrapingConfig; +} + +// --------------------------------------------------------------------------- +// Language Configurations +// --------------------------------------------------------------------------- + +const ENGLISH_CONFIG: LanguageConfig = { + code: 'en', + annasArchiveLang: 'en', + epubCode: 'en', + stopWords: ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'], + characterReplacements: {}, + scraping: { + audibleLocaleParam: 'english', + authorPrefixes: ['By:', 'Written by:'], + narratorPrefixes: ['Narrated by:'], + lengthLabels: ['Length:'], + languageLabels: ['Language:'], + releaseDateLabels: ['Release date:'], + acceptedLanguageValues: ['english'], + runtimeHourPatterns: [/(\d+)\s*hrs?/i, /(\d+)\s*hours?/i], + runtimeMinutePatterns: [/(\d+)\s*mins?/i, /(\d+)\s*minutes?/i], + ratingPatterns: [/(\d+\.?\d*)\s*out of/i], + releaseDatePatterns: [/Release date:\s*(.+)/i], + descriptionExcludePatterns: [ + /\$\d+\.\d+/, + /cancel anytime/i, + /free trial/i, + /membership/i, + /subscribe/i, + /offer.*ends/i, + /^\s*by\s+[\w\s,]+$/i, + ], + durationDetectionPattern: /\d+\s*(hr|hour|h)\s*\d*\s*(min|minute|m)?/i, + ratingTextSelector: 'out of 5 stars', + }, +}; + +const GERMAN_CONFIG: LanguageConfig = { + code: 'de', + annasArchiveLang: 'de', + epubCode: 'de', + stopWords: ['der', 'die', 'das', 'ein', 'eine', 'und', 'von', 'zu', 'den', 'dem', 'des'], + characterReplacements: { '\u00df': 'ss' }, + scraping: { + audibleLocaleParam: 'deutsch', + authorPrefixes: ['Von:', 'Geschrieben von:', 'Autor:'], + narratorPrefixes: ['Gesprochen von:', 'Sprecher:'], + lengthLabels: ['Spieldauer:', 'Dauer:', 'L\u00e4nge:'], + languageLabels: ['Sprache:'], + releaseDateLabels: ['Erscheinungsdatum:'], + acceptedLanguageValues: ['deutsch', 'german'], + runtimeHourPatterns: [/(\d+)\s*Std\.?/i, /(\d+)\s*Stunden?/i], + runtimeMinutePatterns: [/(\d+)\s*Min\.?/i, /(\d+)\s*Minuten?/i], + ratingPatterns: 
[/(\d+[.,]?\d*)\s*von\s*5/i], + releaseDatePatterns: [/Erscheinungsdatum:\s*(.+)/i], + descriptionExcludePatterns: [ + /\$\d+\.\d+/, + /\d+,\d+\s*\u20ac/, + /jederzeit k\u00fcndbar/i, + /kostenlos testen/i, + /Mitgliedschaft/i, + /abonnieren/i, + /Angebot.*endet/i, + /^\s*von\s+[\w\s,]+$/i, + ], + durationDetectionPattern: /\d+\s*(Std|Stunden?|h)\s*\.?\s*\d*\s*(Min|Minuten?|m)?/i, + ratingTextSelector: 'von 5 Sternen', + }, +}; + +const SPANISH_CONFIG: LanguageConfig = { + code: 'es', + annasArchiveLang: 'es', + epubCode: 'es', + stopWords: ['el', 'la', 'los', 'las', 'un', 'una', 'de', 'del', 'en', 'y', 'por'], + characterReplacements: {}, + scraping: { + audibleLocaleParam: 'espa\u00f1ol', + authorPrefixes: ['De:', 'Escrito por:', 'Autor:'], + narratorPrefixes: ['Narrado por:'], + lengthLabels: ['Duraci\u00f3n:'], + languageLabels: ['Idioma:'], + releaseDateLabels: ['Fecha de lanzamiento:'], + acceptedLanguageValues: ['espa\u00f1ol', 'spanish'], + runtimeHourPatterns: [/(\d+)\s*h\b/i, /(\d+)\s*horas?/i], + runtimeMinutePatterns: [/(\d+)\s*min/i, /(\d+)\s*minutos?/i], + ratingPatterns: [/(\d+[.,]?\d*)\s*de\s*5/i], + releaseDatePatterns: [/Fecha de lanzamiento:\s*(.+)/i], + descriptionExcludePatterns: [ + /\$\d+\.\d+/, + /\d+,\d+\s*\u20ac/, + /cancela cuando quieras/i, + /prueba gratis/i, + /suscripci\u00f3n/i, + /suscr\u00edbete/i, + /oferta.*termina/i, + /^\s*de\s+[\w\s,]+$/i, + ], + durationDetectionPattern: /\d+\s*(h|horas?)\s*\d*\s*(min|minutos?)?/i, + ratingTextSelector: 'de 5 estrellas', + }, +}; + +// --------------------------------------------------------------------------- +// Lookup Maps +// --------------------------------------------------------------------------- + +export const LANGUAGE_CONFIGS: Record = { + en: ENGLISH_CONFIG, + de: GERMAN_CONFIG, + es: SPANISH_CONFIG, +}; + +/** + * Maps Audible region codes to language codes. + * All English-speaking regions map to 'en'. 
+ */ +export const REGION_LANGUAGE_MAP: Record = { + us: 'en', + ca: 'en', + uk: 'en', + au: 'en', + in: 'en', + de: 'de', + es: 'es', +}; + +// --------------------------------------------------------------------------- +// Helper Functions +// --------------------------------------------------------------------------- + +/** + * Get the full language configuration for an Audible region. + */ +export function getLanguageForRegion(region: AudibleRegion): LanguageConfig { + const langCode = REGION_LANGUAGE_MAP[region]; + return LANGUAGE_CONFIGS[langCode]; +} + +/** + * Strip any matching prefixes from text (case-insensitive). + * Returns the text with the first matching prefix removed, trimmed. + * + * Example: stripPrefixes('By: Author Name', ['By:', 'Written by:']) => 'Author Name' + */ +export function stripPrefixes(text: string, prefixes: string[]): string { + const trimmed = text.trim(); + for (const prefix of prefixes) { + if (trimmed.toLowerCase().startsWith(prefix.toLowerCase())) { + return trimmed.slice(prefix.length).trim(); + } + } + return trimmed; +} + +/** + * Build a Cheerio selector that matches any of the given labels using :contains(). + * Returns a comma-separated selector string. + * + * Example: buildContainsSelector('span', ['Length:', 'Dauer:']) + * => 'span:contains("Length:"), span:contains("Dauer:")' + */ +export function buildContainsSelector(element: string, labels: string[]): string { + return labels.map(label => `${element}:contains("${label}")`).join(', '); +} + +/** + * Extract a value from text by trying multiple label patterns. + * Returns the captured group from the first matching pattern, or null. + */ +export function extractByPatterns(text: string, patterns: RegExp[]): string | null { + for (const pattern of patterns) { + const match = text.match(pattern); + if (match?.[1]) { + return match[1].trim(); + } + } + return null; +} + +/** + * Check if a language value matches the accepted values for a language config. 
+ * Comparison is case-insensitive. + */ +export function isAcceptedLanguage(languageValue: string, config: LanguageConfig): boolean { + const normalized = languageValue.toLowerCase().trim(); + return config.scraping.acceptedLanguageValues.includes(normalized); +} diff --git a/src/lib/integrations/audible.service.ts b/src/lib/integrations/audible.service.ts index 21cfab2..cef42ed 100644 --- a/src/lib/integrations/audible.service.ts +++ b/src/lib/integrations/audible.service.ts @@ -8,6 +8,14 @@ import * as cheerio from 'cheerio'; import { RMABLogger } from '../utils/logger'; import { getConfigService } from '../services/config.service'; import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible'; +import { + getLanguageForRegion, + stripPrefixes, + buildContainsSelector, + extractByPatterns, + isAcceptedLanguage, + type LanguageConfig, +} from '../constants/language-config'; import { pickUserAgent, getBrowserHeaders, @@ -69,6 +77,13 @@ export class AudibleService { return this.baseUrl; } + /** + * Get the language config for the current region + */ + private getLangConfig(): LanguageConfig { + return getLanguageForRegion(this.region); + } + /** * Force re-initialization (used when region config changes) */ @@ -106,6 +121,9 @@ export class AudibleService { logger.info(`Initializing Audible service with region: ${this.region} (${this.baseUrl})`); + // Get language config for the region + const langConfig = getLanguageForRegion(this.region); + // Create axios client with region-specific base URL and realistic browser headers this.client = axios.create({ baseURL: this.baseUrl, @@ -113,7 +131,7 @@ export class AudibleService { headers: getBrowserHeaders(this.sessionUserAgent), params: { ipRedirectOverride: 'true', // Prevent IP-based region redirects - language: 'english', // Force English locale (prevents IP-based language serving for non-English IPs) + language: langConfig.scraping.audibleLocaleParam, // Force locale (prevents IP-based 
language serving) }, }); @@ -125,13 +143,16 @@ export class AudibleService { this.baseUrl = AUDIBLE_REGIONS[this.region].baseUrl; this.sessionUserAgent = pickUserAgent(); this.pacer.reset(); + + const fallbackLangConfig = getLanguageForRegion(this.region); + this.client = axios.create({ baseURL: this.baseUrl, timeout: 15000, headers: getBrowserHeaders(this.sessionUserAgent), params: { ipRedirectOverride: 'true', - language: 'english', + language: fallbackLangConfig.scraping.audibleLocaleParam, }, }); this.initialized = true; @@ -289,12 +310,14 @@ export class AudibleService { const ratingText = $el.find('.ratingsLabel').text().trim(); const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined; + const langConfig = this.getLangConfig(); + audiobooks.push({ asin, title, - author: authorText.replace('By:', '').replace('Written by:', '').trim(), + author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes), authorAsin: authorAsinMatch?.[1] || undefined, - narrator: narratorText.replace('Narrated by:', '').trim(), + narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes), coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'), rating, }); @@ -391,12 +414,14 @@ export class AudibleService { const ratingText = $el.find('.ratingsLabel').text().trim(); const rating = ratingText ? 
parseFloat(ratingText.split(' ')[0]) : undefined; + const langConfig = this.getLangConfig(); + audiobooks.push({ asin, title, - author: authorText.replace('By:', '').replace('Written by:', '').trim(), + author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes), authorAsin: authorAsinMatch?.[1] || undefined, - narrator: narratorText.replace('Narrated by:', '').trim(), + narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes), coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'), rating, }); @@ -487,9 +512,11 @@ export class AudibleService { const coverArtUrl = $el.find('img').attr('src') || ''; + const langConfig = this.getLangConfig(); + // Extract runtime/duration const runtimeText = $el.find('.runtimeLabel').text().trim() || - $el.find('span:contains("Length:")').text().trim(); + $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim(); const durationMinutes = this.parseRuntime(runtimeText); // Extract rating @@ -500,9 +527,9 @@ export class AudibleService { audiobooks.push({ asin, title, - author: authorText.replace('By:', '').replace('Written by:', '').trim(), + author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes), authorAsin: authorAsinMatch?.[1] || undefined, - narrator: narratorText.replace('Narrated by:', '').trim(), + narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes), coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'), durationMinutes, rating, @@ -565,13 +592,15 @@ export class AudibleService { $('.s-result-item, .productListItem').each((_index, element) => { const $el = $(element); - // --- Language filter: require explicit "English" --- - const langText = $el.find('span:contains("Language:")').text().trim() || + // --- Language filter: require matching language for region --- + const langConfig = this.getLangConfig(); + const langText = $el.find(buildContainsSelector('span', langConfig.scraping.languageLabels)).text().trim() || 
$el.find('.languageLabel').text().trim(); - // Extract language value (e.g. "Language: English" → "English") - const langMatch = langText.match(/Language:\s*(.+)/i); + // Extract language value (e.g. "Language: English" -> "English", "Sprache: Deutsch" -> "Deutsch") + const langLabelPattern = new RegExp(`(?:${langConfig.scraping.languageLabels.map(l => l.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|')})\\s*(.+)`, 'i'); + const langMatch = langText.match(langLabelPattern); const language = langMatch?.[1]?.trim(); - if (!language || language.toLowerCase() !== 'english') return; + if (!language || !isAcceptedLanguage(language, langConfig)) return; // --- Author ASIN filter: verify target ASIN in author links --- const authorLinks = $el.find('a[href*="/author/"]'); @@ -609,7 +638,7 @@ export class AudibleService { const coverArtUrl = $el.find('img').attr('src') || ''; const runtimeText = $el.find('.runtimeLabel').text().trim() || - $el.find('span:contains("Length:")').text().trim(); + $el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim(); const durationMinutes = this.parseRuntime(runtimeText); const ratingText = $el.find('.ratingsLabel').text().trim() || @@ -619,9 +648,9 @@ export class AudibleService { allBooks.push({ asin: bookAsin, title, - author: authorText.replace('By:', '').replace('Written by:', '').trim(), + author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes), authorAsin, - narrator: narratorText.replace('Narrated by:', '').trim(), + narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes), coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'), durationMinutes, rating, @@ -867,7 +896,8 @@ export class AudibleService { result.author = [...new Set(authors)].slice(0, 3).join(', '); } - result.author = result.author.replace(/^By:\s*/i, '').replace(/^Written by:\s*/i, '').trim(); + const authorLangConfig = this.getLangConfig(); + result.author = stripPrefixes(result.author, 
authorLangConfig.scraping.authorPrefixes); logger.info(` Author from HTML: "${result.author}"`); } @@ -911,22 +941,16 @@ export class AudibleService { } if (result.narrator) { - result.narrator = result.narrator.replace(/^Narrated by:\s*/i, '').trim(); + const detailLangConfig = this.getLangConfig(); + result.narrator = stripPrefixes(result.narrator, detailLangConfig.scraping.narratorPrefixes); } logger.info(` Narrator from HTML: "${result.narrator || ''}"`); } // Description - try multiple approaches with strict filtering if (!result.description) { - const excludePatterns = [ - /\$\d+\.\d+/, // Price patterns - /cancel anytime/i, - /free trial/i, - /membership/i, - /subscribe/i, - /offer.*ends/i, - /^\s*by\s+[\w\s,]+$/i, // Just author names - ]; + const descLangConfig = this.getLangConfig(); + const excludePatterns = descLangConfig.scraping.descriptionExcludePatterns; const isValidDescription = (text: string): boolean => { if (!text || text.length < 50 || text.length > 5000) return false; @@ -982,18 +1006,20 @@ export class AudibleService { // Runtime/Duration - try multiple approaches if (!result.durationMinutes) { + const rtLangConfig = this.getLangConfig(); + // Look for runtime text in various places const runtimeText = $('li.runtimeLabel span').text().trim() || $('.runtimeLabel').text().trim() || - $('span:contains("Length:")').parent().text().trim() || - $('li:contains("Length:")').text().trim() || + $(buildContainsSelector('span', rtLangConfig.scraping.lengthLabels)).parent().text().trim() || + $(buildContainsSelector('li', rtLangConfig.scraping.lengthLabels)).text().trim() || (() => { // Look for any text matching duration pattern let found = ''; $('li, span, div').each((_, elem) => { const text = $(elem).text().trim(); - if (text.match(/\d+\s*(hr|hour|h)\s*\d*\s*(min|minute|m)?/i) && text.length < 100) { + if (text.match(rtLangConfig.scraping.durationDetectionPattern) && text.length < 100) { found = text; return false; // break } @@ -1007,41 +1033,55 @@ 
export class AudibleService { // Rating - try multiple approaches if (!result.rating) { + const ratingLangConfig = this.getLangConfig(); const ratingText = $('.ratingsLabel').text().trim() || $('[class*="rating"]').first().text().trim() || - $('span:contains("out of 5 stars")').parent().text().trim() || + $(`span:contains("${ratingLangConfig.scraping.ratingTextSelector}")`).parent().text().trim() || (() => { - // Look for rating pattern + // Look for rating pattern using language-specific patterns let found = ''; $('span, div').each((_, elem) => { const text = $(elem).text().trim(); - if (text.match(/\d+\.?\d*\s*out of\s*5/i) && text.length < 50) { - found = text; - return false; + if (text.length < 50) { + for (const pattern of ratingLangConfig.scraping.ratingPatterns) { + if (pattern.test(text)) { + found = text; + return false; + } + } } }); return found; })(); if (ratingText) { - const ratingMatch = ratingText.match(/(\d+\.?\d*)\s*out of/i); - result.rating = ratingMatch ? parseFloat(ratingMatch[1]) : undefined; + let ratingValue: number | undefined; + for (const pattern of ratingLangConfig.scraping.ratingPatterns) { + const ratingMatch = ratingText.match(pattern); + if (ratingMatch) { + // Handle comma as decimal separator (e.g. 
"4,5" in German/Spanish) + ratingValue = parseFloat(ratingMatch[1].replace(',', '.')); + break; + } + } + result.rating = ratingValue; } logger.info(` Rating from "${ratingText}": ${result.rating}`); } // Release date - try multiple selectors if (!result.releaseDate) { + const rdLangConfig = this.getLangConfig(); const releaseDateText = - $('li:contains("Release date:")').text().trim() || - $('span:contains("Release date:")').parent().text().trim() || + $(buildContainsSelector('li', rdLangConfig.scraping.releaseDateLabels)).text().trim() || + $(buildContainsSelector('span', rdLangConfig.scraping.releaseDateLabels)).parent().text().trim() || $('[class*="release"]').text().trim(); - const dateMatch = releaseDateText.match(/Release date:\s*(.+)/i) || - releaseDateText.match(/(\w+ \d{1,2},? \d{4})/); + const dateMatch = extractByPatterns(releaseDateText, rdLangConfig.scraping.releaseDatePatterns) || + releaseDateText.match(/(\w+ \d{1,2},? \d{4})/)?.[1]; if (dateMatch) { - result.releaseDate = dateMatch[1].trim(); + result.releaseDate = dateMatch.trim(); } logger.info(` Release date from "${releaseDateText}": ${result.releaseDate}`); } @@ -1078,20 +1118,30 @@ export class AudibleService { } /** - * Parse runtime text to minutes + * Parse runtime text to minutes using language-specific patterns */ private parseRuntime(runtimeText: string): number | undefined { if (!runtimeText) return undefined; - const hoursMatch = runtimeText.match(/(\d+)\s*hrs?/i); - const minutesMatch = runtimeText.match(/(\d+)\s*mins?/i); - + const langConfig = this.getLangConfig(); let totalMinutes = 0; - if (hoursMatch) { - totalMinutes += parseInt(hoursMatch[1]) * 60; + + // Try each hour pattern until one matches + for (const pattern of langConfig.scraping.runtimeHourPatterns) { + const match = runtimeText.match(pattern); + if (match) { + totalMinutes += parseInt(match[1]) * 60; + break; + } } - if (minutesMatch) { - totalMinutes += parseInt(minutesMatch[1]); + + // Try each minute pattern until 
one matches + for (const pattern of langConfig.scraping.runtimeMinutePatterns) { + const match = runtimeText.match(pattern); + if (match) { + totalMinutes += parseInt(match[1]); + break; + } } return totalMinutes > 0 ? totalMinutes : undefined; diff --git a/src/lib/processors/search-ebook.processor.ts b/src/lib/processors/search-ebook.processor.ts index 2e92190..6fbd667 100644 --- a/src/lib/processors/search-ebook.processor.ts +++ b/src/lib/processors/search-ebook.processor.ts @@ -14,6 +14,8 @@ import { RMABLogger } from '../utils/logger'; import { getProwlarrService } from '../integrations/prowlarr.service'; import { rankEbookTorrents, RankedEbookTorrent } from '../utils/ranking-algorithm'; import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping'; +import { getLanguageForRegion } from '../constants/language-config'; +import type { AudibleRegion } from '../types/audible'; // Import ebook scraper functions for Anna's Archive import { @@ -151,6 +153,11 @@ async function searchAnnasArchive( const baseUrl = await configService.get('ebook_sidecar_base_url') || 'https://annas-archive.li'; const flaresolverrUrl = await configService.get('ebook_sidecar_flaresolverr_url') || undefined; + // Get language code from Audible region config + const region = await configService.getAudibleRegion() as AudibleRegion; + const langConfig = getLanguageForRegion(region); + const languageCode = langConfig.annasArchiveLang; + if (flaresolverrUrl) { logger.info(`Using FlareSolverr at ${flaresolverrUrl}`); } @@ -161,7 +168,7 @@ async function searchAnnasArchive( // Try ASIN search first (exact match - best) if (audiobook.asin) { logger.info(`Searching Anna's Archive by ASIN: ${audiobook.asin} (format: ${preferredFormat})...`); - md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl); + md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode); if (md5) { logger.info(`Found 
via ASIN: ${md5}`); @@ -174,7 +181,7 @@ async function searchAnnasArchive( // Fallback to title + author search if (!md5) { logger.info(`Searching Anna's Archive by title + author: "${audiobook.title}" by ${audiobook.author}...`); - md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl); + md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode); if (md5) { logger.info(`Found via title search: ${md5}`); @@ -301,6 +308,10 @@ async function searchIndexers( logger.info(`Will filter ${aboveThreshold.length} results > 20 MB (too large for ebooks)`); } + // Get language-specific stop words for ranking + const ebookRegion = await configService.getAudibleRegion() as AudibleRegion; + const ebookLangConfig = getLanguageForRegion(ebookRegion); + // Rank results with ebook-specific scoring // This filters out > 20MB and uses inverted size scoring const rankedResults = rankEbookTorrents(allResults, { @@ -311,6 +322,8 @@ async function searchIndexers( indexerPriorities, flagConfigs, requireAuthor: true, // Automatic mode - prevent wrong authors + stopWords: ebookLangConfig.stopWords, + characterReplacements: ebookLangConfig.characterReplacements, }); // Log filter results diff --git a/src/lib/processors/search-indexers.processor.ts b/src/lib/processors/search-indexers.processor.ts index 46fd2c0..9157cee 100644 --- a/src/lib/processors/search-indexers.processor.ts +++ b/src/lib/processors/search-indexers.processor.ts @@ -9,6 +9,8 @@ import { getProwlarrService } from '../integrations/prowlarr.service'; import { getRankingAlgorithm } from '../utils/ranking-algorithm'; import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping'; import { RMABLogger } from '../utils/logger'; +import { getLanguageForRegion } from '../constants/language-config'; +import type { AudibleRegion } from '../types/audible'; /** * Process search indexers 
job @@ -146,8 +148,10 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro logger.info(`Will filter ${belowThreshold.length} results < ${sizeMBThreshold} MB (likely ebooks)`); } - // Get ranking algorithm + // Get ranking algorithm and language-specific stop words const ranker = getRankingAlgorithm(); + const region = await configService.getAudibleRegion() as AudibleRegion; + const langConfig = getLanguageForRegion(region); // Rank results with indexer priorities and flag configs // Note: rankTorrents now filters out results < 20 MB internally @@ -159,7 +163,9 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro }, { indexerPriorities, flagConfigs, - requireAuthor: true // Automatic mode - prevent wrong authors + requireAuthor: true, // Automatic mode - prevent wrong authors + stopWords: langConfig.stopWords, + characterReplacements: langConfig.characterReplacements, }); // Log filter results diff --git a/src/lib/services/ebook-scraper.ts b/src/lib/services/ebook-scraper.ts index 610254a..a091fd3 100644 --- a/src/lib/services/ebook-scraper.ts +++ b/src/lib/services/ebook-scraper.ts @@ -170,7 +170,8 @@ export async function downloadEbook( preferredFormat: string = 'epub', baseUrl: string = 'https://annas-archive.li', logger?: RMABLogger, - flaresolverrUrl?: string + flaresolverrUrl?: string, + languageCode: string = 'en' ): Promise { try { let md5: string | null = null; @@ -183,7 +184,7 @@ export async function downloadEbook( // Step 1: Try ASIN search (exact match - best) if (asin) { await logger?.info(`Searching by ASIN: ${asin} (format: ${preferredFormat})...`); - md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl); + md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode); if (md5) { await logger?.info(`Found via ASIN: ${md5}`); @@ -195,7 +196,7 @@ export async function downloadEbook( // Step 2: Fallback to title + author search if 
(!md5) { await logger?.info(`Searching by title + author: "${title}" by ${author}...`); - md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl); + md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode); if (md5) { await logger?.info(`Found via title search: ${md5}`); @@ -312,10 +313,11 @@ export async function searchByAsin( format: string, baseUrl: string, logger?: RMABLogger, - flaresolverrUrl?: string + flaresolverrUrl?: string, + languageCode: string = 'en' ): Promise { // Check cache first - const cacheKey = `${asin}-${format}`; + const cacheKey = `${asin}-${format}-${languageCode}`; if (md5Cache.has(cacheKey)) { const cached = md5Cache.get(cacheKey); if (cached) { @@ -327,7 +329,7 @@ export async function searchByAsin( try { // Build search URL with ASIN and optional format filter const formatParam = format && format !== 'any' ? `ext=${format}&` : ''; - const searchUrl = `${baseUrl}/search?${formatParam}lang=en&q=%22asin:${asin}%22`; + const searchUrl = `${baseUrl}/search?${formatParam}lang=${languageCode}&q=%22asin:${asin}%22`; moduleLogger.debug(`ASIN search URL: ${searchUrl}`); @@ -404,10 +406,11 @@ export async function searchByTitle( format: string, baseUrl: string, logger?: RMABLogger, - flaresolverrUrl?: string + flaresolverrUrl?: string, + languageCode: string = 'en' ): Promise { // Check cache first - const cacheKey = `title-${title}-${author}-${format}`.toLowerCase(); + const cacheKey = `title-${title}-${author}-${format}-${languageCode}`.toLowerCase(); if (md5Cache.has(cacheKey)) { const cached = md5Cache.get(cacheKey); if (cached) { @@ -432,8 +435,8 @@ export async function searchByTitle( // Add content type filters (books only, all fiction/nonfiction/unknown) searchUrl += '&content=book_nonfiction&content=book_fiction&content=book_unknown'; - // Add language filter (English) - searchUrl += '&lang=en'; + // Add language filter + searchUrl += 
`&lang=${languageCode}`; // Empty raw query (we're using specific terms instead) searchUrl += '&q='; diff --git a/src/lib/types/audible.ts b/src/lib/types/audible.ts index 6d2c418..871ec5a 100644 --- a/src/lib/types/audible.ts +++ b/src/lib/types/audible.ts @@ -3,6 +3,8 @@ * Documentation: documentation/integrations/audible.md */ +import type { SupportedLanguage } from '../constants/language-config'; + export type AudibleRegion = 'us' | 'ca' | 'uk' | 'au' | 'in' | 'de' | 'es'; export interface AudibleRegionConfig { @@ -10,7 +12,7 @@ export interface AudibleRegionConfig { name: string; baseUrl: string; audnexusParam: string; - isEnglish: boolean; + language: SupportedLanguage; } export const AUDIBLE_REGIONS: Record = { @@ -19,49 +21,49 @@ export const AUDIBLE_REGIONS: Record = { name: 'United States', baseUrl: 'https://www.audible.com', audnexusParam: 'us', - isEnglish: true, + language: 'en', }, ca: { code: 'ca', name: 'Canada', baseUrl: 'https://www.audible.ca', audnexusParam: 'ca', - isEnglish: true, + language: 'en', }, uk: { code: 'uk', name: 'United Kingdom', baseUrl: 'https://www.audible.co.uk', audnexusParam: 'uk', - isEnglish: true, + language: 'en', }, au: { code: 'au', name: 'Australia', baseUrl: 'https://www.audible.com.au', audnexusParam: 'au', - isEnglish: true, + language: 'en', }, in: { code: 'in', name: 'India', baseUrl: 'https://www.audible.in', audnexusParam: 'in', - isEnglish: true, + language: 'en', }, de: { code: 'de', name: 'Germany', baseUrl: 'https://www.audible.de', audnexusParam: 'de', - isEnglish: false, + language: 'de', }, es: { code: 'es', name: 'Spain', baseUrl: 'https://www.audible.es', audnexusParam: 'es', - isEnglish: false, + language: 'es', } }; diff --git a/src/lib/utils/ranking-algorithm.ts b/src/lib/utils/ranking-algorithm.ts index a0bec36..17f3d71 100644 --- a/src/lib/utils/ranking-algorithm.ts +++ b/src/lib/utils/ranking-algorithm.ts @@ -40,6 +40,8 @@ export interface RankTorrentsOptions { indexerPriorities?: Map; // 
indexerId -> priority (1-25) flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations requireAuthor?: boolean; // Enforce author presence check (default: true) + stopWords?: string[]; // Language-specific stop words for matching + characterReplacements?: Record; // Language-specific char replacements (e.g. ß→ss) } export interface EbookTorrentRequest { @@ -52,6 +54,8 @@ export interface RankEbookTorrentsOptions { indexerPriorities?: Map; // indexerId -> priority (1-25) flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations requireAuthor?: boolean; // Enforce author presence check (default: true) + stopWords?: string[]; // Language-specific stop words for matching + characterReplacements?: Record; // Language-specific char replacements (e.g. ß→ss) } export interface BonusModifier { @@ -113,7 +117,9 @@ export class RankingAlgorithm { const { indexerPriorities, flagConfigs, - requireAuthor = true // Safe default: require author in automatic mode + requireAuthor = true, // Safe default: require author in automatic mode + stopWords, + characterReplacements, } = options; // Filter out files < 20 MB (likely ebooks/samples) const filteredTorrents = torrents.filter((torrent) => { @@ -126,7 +132,7 @@ export class RankingAlgorithm { const formatScore = this.scoreFormat(torrent); const sizeScore = this.scoreSize(torrent, audiobook.durationMinutes); const seederScore = this.scoreSeeders(torrent.seeders); - const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor); + const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor, stopWords, characterReplacements); const baseScore = formatScore + sizeScore + seederScore + matchScore; @@ -340,11 +346,22 @@ export class RankingAlgorithm { * "Twelve.Months-Jim.Butcher" → "twelve months jim butcher" * "Author_Name_Book" → "author name book" */ - private normalizeForMatching(text: string): string { - return text + private normalizeForMatching(text: string, characterReplacements?: Record): string { + 
let result = text // Split CamelCase FIRST (before lowercasing): "TheCorrespondent" → "The Correspondent" .replace(/([a-z])([A-Z])/g, '$1 $2') - .toLowerCase() + .toLowerCase(); + // Apply language-specific character replacements before NFD (e.g. ß→ss) + if (characterReplacements) { + for (const [from, to] of Object.entries(characterReplacements)) { + result = result.replace(new RegExp(from, 'g'), to); + } + } + return result + // NFD normalization: convert accented chars to ASCII base forms + // e.g. "uber" from "über", "senor" from "señor", "cafe" from "café" + .normalize('NFD') + .replace(/[\u0300-\u036f]/g, '') // Replace underscores with spaces (must be explicit since \w includes _) .replace(/_/g, ' ') // Replace other punctuation/separators with spaces (preserves apostrophes in contractions) @@ -362,11 +379,13 @@ export class RankingAlgorithm { private scoreMatch( torrent: TorrentResult, audiobook: AudiobookRequest, - requireAuthor: boolean = true + requireAuthor: boolean = true, + customStopWords?: string[], + characterReplacements?: Record ): number { - // Normalize for matching (handles CamelCase, punctuation separators) - const torrentTitle = this.normalizeForMatching(torrent.title); - const requestTitle = this.normalizeForMatching(audiobook.title); + // Normalize for matching (handles CamelCase, punctuation separators, diacritics) + const torrentTitle = this.normalizeForMatching(torrent.title, characterReplacements); + const requestTitle = this.normalizeForMatching(audiobook.title, characterReplacements); // Parse authors from RAW string first (preserving commas for splitting) // Then normalize individual authors for matching @@ -377,19 +396,30 @@ export class RankingAlgorithm { .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a)); // Normalize parsed authors for matching (handles CamelCase in author names) - const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a)); + const normalizedAuthors = parsedAuthors.map(a => 
this.normalizeForMatching(a, characterReplacements)); // Combined normalized author string for fuzzy matching const requestAuthorNormalized = normalizedAuthors.join(' '); // ========== STAGE 1: WORD COVERAGE FILTER (MANDATORY) ========== // Extract significant words (filter out common stop words) - const stopWords = ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for']; + // Use provided language-specific stop words, or fall back to English defaults + const stopWords = customStopWords || ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for']; const extractWords = (text: string, stopList: string[]): string[] => { - return text + let processed = text // Split CamelCase FIRST: "TheCorrespondent" → "The Correspondent" .replace(/([a-z])([A-Z])/g, '$1 $2') - .toLowerCase() + .toLowerCase(); + // Apply language-specific character replacements before NFD + if (characterReplacements) { + for (const [from, to] of Object.entries(characterReplacements)) { + processed = processed.replace(new RegExp(from, 'g'), to); + } + } + return processed + // NFD normalization for accented characters + .normalize('NFD') + .replace(/[\u0300-\u036f]/g, '') // Replace underscores with spaces (must be explicit since \w includes _) .replace(/_/g, ' ') // Remove other punctuation (but keep apostrophes for contractions) @@ -431,7 +461,7 @@ export class RankingAlgorithm { } // Normalize the required portion (handles CamelCase, punctuation) - const required = this.normalizeForMatching(requiredRaw); + const required = this.normalizeForMatching(requiredRaw, characterReplacements); const optional = optionalMatches.join(' '); return { required, optional }; @@ -653,7 +683,7 @@ export class RankingAlgorithm { * @param requestAuthor - Raw author string (will be parsed and normalized internally) * @returns true if at least ONE author is present with high confidence */ - private checkAuthorPresence(torrentTitle: string, requestAuthor: string): boolean { + private checkAuthorPresence(torrentTitle: string, 
requestAuthor: string, characterReplacements?: Record): boolean { // Parse multiple authors (same logic as Stage 3 author matching) const authors = requestAuthor .split(/,|&| and | - /) @@ -661,7 +691,7 @@ export class RankingAlgorithm { .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a)); // Normalize each author for matching - const normalizedAuthors = authors.map(a => this.normalizeForMatching(a)); + const normalizedAuthors = authors.map(a => this.normalizeForMatching(a, characterReplacements)); return this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors); } @@ -788,7 +818,9 @@ export class RankingAlgorithm { const { indexerPriorities, flagConfigs, - requireAuthor = true // Safe default: require author in automatic mode + requireAuthor = true, // Safe default: require author in automatic mode + stopWords, + characterReplacements, } = options; // Filter out files > 20 MB (too large for ebooks) @@ -809,7 +841,7 @@ export class RankingAlgorithm { const matchScore = this.scoreMatch(torrent, { title: ebook.title, author: ebook.author, - }, requireAuthor); + }, requireAuthor, stopWords, characterReplacements); const baseScore = formatScore + sizeScore + seederScore + matchScore; diff --git a/tests/api/audiobooks-search.routes.test.ts b/tests/api/audiobooks-search.routes.test.ts index 8522d42..d517404 100644 --- a/tests/api/audiobooks-search.routes.test.ts +++ b/tests/api/audiobooks-search.routes.test.ts @@ -10,6 +10,7 @@ let authRequest: any; const requireAuthMock = vi.hoisted(() => vi.fn()); const configServiceMock = vi.hoisted(() => ({ get: vi.fn(), + getAudibleRegion: vi.fn().mockResolvedValue('us'), })); const prowlarrMock = vi.hoisted(() => ({ search: vi.fn(), @@ -43,6 +44,7 @@ vi.mock('@/lib/utils/indexer-grouping', () => ({ describe('Audiobooks search torrents route', () => { beforeEach(() => { vi.clearAllMocks(); + configServiceMock.getAudibleRegion.mockResolvedValue('us'); authRequest = { user: { id: 'user-1', role: 'user' }, 
json: vi.fn(), diff --git a/tests/api/requests-actions.routes.test.ts b/tests/api/requests-actions.routes.test.ts index 3c00433..2b187bb 100644 --- a/tests/api/requests-actions.routes.test.ts +++ b/tests/api/requests-actions.routes.test.ts @@ -12,7 +12,7 @@ const prismaMock = createPrismaMock(); const requireAuthMock = vi.hoisted(() => vi.fn()); const prowlarrMock = vi.hoisted(() => ({ search: vi.fn(), searchWithVariations: vi.fn() })); const rankTorrentsMock = vi.hoisted(() => vi.fn()); -const configServiceMock = vi.hoisted(() => ({ get: vi.fn() })); +const configServiceMock = vi.hoisted(() => ({ get: vi.fn(), getAudibleRegion: vi.fn().mockResolvedValue('us') })); const groupIndexersMock = vi.hoisted(() => vi.fn()); const groupDescriptionMock = vi.hoisted(() => vi.fn(() => 'Group')); const configState = vi.hoisted(() => ({ @@ -75,6 +75,7 @@ vi.mock('fs/promises', () => ({ default: fsMock, ...fsMock, constants: { R_OK: 4 describe('Request action routes', () => { beforeEach(() => { vi.clearAllMocks(); + configServiceMock.getAudibleRegion.mockResolvedValue('us'); configState.values.clear(); authRequest = { user: { id: 'user-1', role: 'user' }, json: vi.fn() }; requireAuthMock.mockImplementation((_req: any, handler: any) => handler(authRequest)); diff --git a/tests/processors/search-ebook.processor.test.ts b/tests/processors/search-ebook.processor.test.ts index d11e149..f8c8dc2 100644 --- a/tests/processors/search-ebook.processor.test.ts +++ b/tests/processors/search-ebook.processor.test.ts @@ -10,6 +10,7 @@ const prismaMock = createPrismaMock(); const configServiceMock = vi.hoisted(() => ({ get: vi.fn(), + getAudibleRegion: vi.fn().mockResolvedValue('us'), })); const jobQueueMock = vi.hoisted(() => ({ @@ -39,6 +40,7 @@ vi.mock('@/lib/services/ebook-scraper', () => ebookScraperMock); describe('processSearchEbook', () => { beforeEach(() => { vi.clearAllMocks(); + configServiceMock.getAudibleRegion.mockResolvedValue('us'); configServiceMock.get.mockImplementation(async 
(key: string) => { if (key === 'ebook_sidecar_preferred_format') return 'epub'; if (key === 'ebook_sidecar_base_url') return 'https://annas-archive.li'; @@ -79,7 +81,8 @@ describe('processSearchEbook', () => { 'epub', 'https://annas-archive.li', expect.anything(), - undefined + undefined, + 'en' ); expect(jobQueueMock.addStartDirectDownloadJob).toHaveBeenCalledWith( 'req-1', @@ -123,7 +126,8 @@ describe('processSearchEbook', () => { 'epub', 'https://annas-archive.li', expect.anything(), - undefined + undefined, + 'en' ); }); @@ -253,7 +257,8 @@ describe('processSearchEbook', () => { 'epub', 'https://annas-archive.li', expect.anything(), - 'http://flaresolverr:8191' + 'http://flaresolverr:8191', + 'en' ); }); diff --git a/tests/processors/search-indexers.processor.test.ts b/tests/processors/search-indexers.processor.test.ts index 15cf334..48e25b8 100644 --- a/tests/processors/search-indexers.processor.test.ts +++ b/tests/processors/search-indexers.processor.test.ts @@ -8,7 +8,7 @@ import { createPrismaMock } from '../helpers/prisma'; import { createJobQueueMock } from '../helpers/job-queue'; const prismaMock = createPrismaMock(); -const configMock = vi.hoisted(() => ({ get: vi.fn() })); +const configMock = vi.hoisted(() => ({ get: vi.fn(), getAudibleRegion: vi.fn().mockResolvedValue('us') })); const jobQueueMock = createJobQueueMock(); const prowlarrMock = vi.hoisted(() => ({ search: vi.fn(), searchWithVariations: vi.fn() })); @@ -35,6 +35,7 @@ vi.mock('@/lib/integrations/audible.service', () => ({ describe('processSearchIndexers', () => { beforeEach(() => { vi.clearAllMocks(); + configMock.getAudibleRegion.mockResolvedValue('us'); }); it('marks request awaiting_search when no results found', async () => {