Add language config and locale-aware parsing

Introduce centralized language configuration and wire locale-aware behavior across scraping and ranking. Adds src/lib/constants/language-config.ts with per-language scraping rules, stop words, and character replacements; replaces AudibleRegion.isEnglish with a language field in types and AUDIBLE_REGIONS. Update AudibleService, ebook scraper, processors, and API routes to use getLanguageForRegion so Anna's Archive searches, scraping selectors, runtime/rating parsing, and ranking use language-specific params and filters. Extend ranking algorithm to accept stopWords and characterReplacements and apply them during normalization and matching. Update UI selects to mark non-English regions and adjust tests accordingly.
This commit is contained in:
kikootwo
2026-02-20 06:32:44 -05:00
parent c146383735
commit 5d8ac2f73d
18 changed files with 525 additions and 112 deletions
@@ -164,11 +164,11 @@ export function AudiobookshelfSection({
>
{Object.values(AUDIBLE_REGIONS).map((region) => (
<option key={region.code} value={region.code}>
{region.name}{!region.isEnglish ? ' *' : ''}
{region.name}{region.language !== 'en' ? ' *' : ''}
</option>
))}
</select>
{AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.isEnglish === false && (
{AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.language !== 'en' && (
<div className="bg-amber-50 dark:bg-amber-900/20 rounded-lg p-4 border border-amber-200 dark:border-amber-800 mt-2">
<div className="flex gap-3">
<svg
@@ -164,11 +164,11 @@ export function PlexSection({
>
{Object.values(AUDIBLE_REGIONS).map((region) => (
<option key={region.code} value={region.code}>
{region.name}{!region.isEnglish ? ' *' : ''}
{region.name}{region.language !== 'en' ? ' *' : ''}
</option>
))}
</select>
{AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.isEnglish === false && (
{AUDIBLE_REGIONS[settings.audibleRegion as keyof typeof AUDIBLE_REGIONS]?.language !== 'en' && (
<div className="bg-amber-50 dark:bg-amber-900/20 rounded-lg p-4 border border-amber-200 dark:border-amber-800 mt-2">
<div className="flex gap-3">
<svg
@@ -18,6 +18,8 @@ import { findPlexMatch } from '@/lib/utils/audiobook-matcher';
import { getAudibleService } from '@/lib/integrations/audible.service';
import { RMABLogger } from '@/lib/utils/logger';
import { resolveInteractiveSearchAccess } from '@/lib/utils/permissions';
import { getLanguageForRegion } from '@/lib/constants/language-config';
import type { AudibleRegion } from '@/lib/types/audible';
import {
searchByAsin,
searchByTitle,
@@ -227,6 +229,11 @@ export async function POST(
const format = preferredFormat || 'epub';
const annasBaseUrl = baseUrl || 'https://annas-archive.li';
// Get language code from Audible region config
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
const languageCode = langConfig.annasArchiveLang;
if (!isAnnasArchiveEnabled && !isIndexerSearchEnabled) {
return NextResponse.json(
{ error: 'No ebook sources enabled. Enable Anna\'s Archive or Indexer Search in settings.' },
@@ -250,7 +257,8 @@ export async function POST(
audiobook.author,
format,
annasBaseUrl,
flaresolverrUrl || undefined
flaresolverrUrl || undefined,
languageCode
).catch((err) => {
logger.error(`Anna's Archive search failed: ${err.message}`);
return null;
@@ -322,7 +330,8 @@ async function searchAnnasArchiveForInteractive(
author: string,
preferredFormat: string,
baseUrl: string,
flaresolverrUrl?: string
flaresolverrUrl?: string,
languageCode: string = 'en'
): Promise<EbookSearchResult[]> {
let md5: string | null = null;
let searchMethod: 'asin' | 'title' = 'title';
@@ -330,7 +339,7 @@ async function searchAnnasArchiveForInteractive(
// Try ASIN search first
if (asin) {
logger.info(`Searching Anna's Archive by ASIN: ${asin}`);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode);
if (md5) {
searchMethod = 'asin';
logger.info(`Found via ASIN: ${md5}`);
@@ -340,7 +349,7 @@ async function searchAnnasArchiveForInteractive(
// Fallback to title search
if (!md5) {
logger.info(`Searching Anna's Archive by title: "${title}"`);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via title: ${md5}`);
}
@@ -461,6 +470,10 @@ async function searchIndexersForInteractive(
return [];
}
// Get language-specific stop words for ranking
const rankRegion = await configService.getAudibleRegion() as AudibleRegion;
const rankLangConfig = getLanguageForRegion(rankRegion);
// Rank results with ebook scoring
const rankedResults = rankEbookTorrents(allResults, {
title,
@@ -470,6 +483,8 @@ async function searchIndexersForInteractive(
indexerPriorities,
flagConfigs,
requireAuthor: false,
stopWords: rankLangConfig.stopWords,
characterReplacements: rankLangConfig.characterReplacements,
});
// Convert to unified result type
@@ -10,6 +10,8 @@ import { requireAuth, AuthenticatedRequest } from '@/lib/middleware/auth';
import { getProwlarrService } from '@/lib/integrations/prowlarr.service';
import { rankTorrents } from '@/lib/utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping';
import { getLanguageForRegion } from '@/lib/constants/language-config';
import type { AudibleRegion } from '@/lib/types/audible';
import { z } from 'zod';
import { RMABLogger } from '@/lib/utils/logger';
@@ -140,13 +142,19 @@ export async function POST(request: NextRequest) {
logger.info(`Will filter ${belowThreshold.length} results < ${sizeMBThreshold} MB (likely ebooks)`);
}
// Get language-specific stop words for ranking
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
// Rank torrents using the ranking algorithm with indexer priorities and flag configs
// Note: rankTorrents now filters out results < 20 MB internally
// requireAuthor: false - interactive search, show all results for user decision
const rankedResults = rankTorrents(results, { title, author, durationMinutes }, {
indexerPriorities,
flagConfigs,
requireAuthor: false // Interactive mode - let user decide
requireAuthor: false, // Interactive mode - let user decide
stopWords: langConfig.stopWords,
characterReplacements: langConfig.characterReplacements,
});
// Log filter results
@@ -14,6 +14,8 @@ import { getProwlarrService } from '@/lib/integrations/prowlarr.service';
import { rankEbookTorrents, RankedEbookTorrent } from '@/lib/utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping';
import { RMABLogger } from '@/lib/utils/logger';
import { getLanguageForRegion } from '@/lib/constants/language-config';
import type { AudibleRegion } from '@/lib/types/audible';
import {
searchByAsin,
searchByTitle,
@@ -121,6 +123,11 @@ export async function POST(
const format = preferredFormat || 'epub';
const annasBaseUrl = baseUrl || 'https://annas-archive.li';
// Get language code from Audible region config
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
const languageCode = langConfig.annasArchiveLang;
if (!isAnnasArchiveEnabled && !isIndexerSearchEnabled) {
return NextResponse.json(
{ error: 'No ebook sources enabled. Enable Anna\'s Archive or Indexer Search in settings.' },
@@ -145,7 +152,8 @@ export async function POST(
audiobook.author,
format,
annasBaseUrl,
flaresolverrUrl || undefined
flaresolverrUrl || undefined,
languageCode
).catch((err) => {
logger.error(`Anna's Archive search failed: ${err.message}`);
return null;
@@ -217,7 +225,8 @@ async function searchAnnasArchiveForInteractive(
author: string,
preferredFormat: string,
baseUrl: string,
flaresolverrUrl?: string
flaresolverrUrl?: string,
languageCode: string = 'en'
): Promise<EbookSearchResult[]> {
let md5: string | null = null;
let searchMethod: 'asin' | 'title' = 'title';
@@ -225,7 +234,7 @@ async function searchAnnasArchiveForInteractive(
// Try ASIN search first
if (asin) {
logger.info(`Searching Anna's Archive by ASIN: ${asin}`);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode);
if (md5) {
searchMethod = 'asin';
logger.info(`Found via ASIN: ${md5}`);
@@ -235,7 +244,7 @@ async function searchAnnasArchiveForInteractive(
// Fallback to title search
if (!md5) {
logger.info(`Searching Anna's Archive by title: "${title}"`);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, undefined, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via title: ${md5}`);
}
@@ -356,6 +365,10 @@ async function searchIndexersForInteractive(
return [];
}
// Get language-specific stop words for ranking
const rankRegion = await configService.getAudibleRegion() as AudibleRegion;
const rankLangConfig = getLanguageForRegion(rankRegion);
// Rank results with ebook scoring
// Use requireAuthor=false for interactive mode (let user decide)
const rankedResults = rankEbookTorrents(allResults, {
@@ -366,6 +379,8 @@ async function searchIndexersForInteractive(
indexerPriorities,
flagConfigs,
requireAuthor: false,
stopWords: rankLangConfig.stopWords,
characterReplacements: rankLangConfig.characterReplacements,
});
// Log ranking debug info (same format as search-ebook.processor.ts)
@@ -9,6 +9,8 @@ import { prisma } from '@/lib/db';
import { getProwlarrService } from '@/lib/integrations/prowlarr.service';
import { rankTorrents } from '@/lib/utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '@/lib/utils/indexer-grouping';
import { getLanguageForRegion } from '@/lib/constants/language-config';
import type { AudibleRegion } from '@/lib/types/audible';
import { RMABLogger } from '@/lib/utils/logger';
import { resolveInteractiveSearchAccess } from '@/lib/utils/permissions';
@@ -189,6 +191,10 @@ export async function POST(
}
}
// Get language-specific stop words for ranking
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
// Rank torrents using the ranking algorithm with indexer priorities and flag configs
// Always use the audiobook's title/author for ranking (not custom search query)
// requireAuthor: false - interactive mode, show all results for user decision
@@ -199,7 +205,9 @@ export async function POST(
}, {
indexerPriorities,
flagConfigs,
requireAuthor: false // Interactive mode - let user decide
requireAuthor: false, // Interactive mode - let user decide
stopWords: langConfig.stopWords,
characterReplacements: langConfig.characterReplacements,
});
// No threshold filtering for interactive search - show all results
+2 -2
View File
@@ -115,11 +115,11 @@ export function BackendSelectionStep({
>
{Object.values(AUDIBLE_REGIONS).map((region) => (
<option key={region.code} value={region.code}>
{region.name}{!region.isEnglish ? ' *' : ''}
{region.name}{region.language !== 'en' ? ' *' : ''}
</option>
))}
</select>
{AUDIBLE_REGIONS[audibleRegion]?.isEnglish === false && (
{AUDIBLE_REGIONS[audibleRegion]?.language !== 'en' && (
<div className="bg-amber-50 dark:bg-amber-900/20 rounded-lg p-4 border border-amber-200 dark:border-amber-800 mt-2">
<div className="flex gap-3">
<svg
+252
View File
@@ -0,0 +1,252 @@
/**
* Component: Centralized Language Configuration
* Documentation: documentation/integrations/audible.md
*
* Single source of truth for all language-specific configuration.
* To add a new language:
* 1. Add code to SupportedLanguage union
* 2. Add full LanguageConfig entry in LANGUAGE_CONFIGS
* 3. Map regions in REGION_LANGUAGE_MAP
* 4. Add region to AUDIBLE_REGIONS in audible.ts with language: 'xx'
*/
import type { AudibleRegion } from '../types/audible';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/**
 * Language codes that have a full LanguageConfig entry.
 * Used as the key type of LANGUAGE_CONFIGS and the value type of REGION_LANGUAGE_MAP.
 */
export type SupportedLanguage = 'en' | 'de' | 'es';
/**
 * Language-specific rules for scraping Audible pages: the label texts that
 * appear in the markup, the prefixes to strip from extracted values, and the
 * regex patterns used to parse runtime, rating, and release date.
 */
export interface ScrapingConfig {
/** Audible locale query-param value (e.g. 'english', 'deutsch'); sent as the `language` request param */
audibleLocaleParam: string;
/** Author label prefixes to strip (e.g. ['By:', 'Written by:']) */
authorPrefixes: string[];
/** Narrator label prefixes to strip (e.g. ['Narrated by:']) */
narratorPrefixes: string[];
/** Length / duration labels used in Cheerio :contains() selectors */
lengthLabels: string[];
/** Language field labels (e.g. 'Language:'), used both in :contains() selectors and to split label from value */
languageLabels: string[];
/** Release date field labels used in Cheerio :contains() selectors */
releaseDateLabels: string[];
/** Accepted language values for filtering (lowercase; compared against the lowercased, trimmed page value) */
acceptedLanguageValues: string[];
/** Regex patterns that match hour portions in runtime strings; capture group 1 is the hour count */
runtimeHourPatterns: RegExp[];
/** Regex patterns that match minute portions in runtime strings; capture group 1 is the minute count */
runtimeMinutePatterns: RegExp[];
/** Regex patterns for extracting numeric rating; capture group 1 is the number (may use ',' as decimal separator) */
ratingPatterns: RegExp[];
/** Regex patterns for extracting release date text; capture group 1 is the date text */
releaseDatePatterns: RegExp[];
/** Promotional / non-description text patterns to exclude */
descriptionExcludePatterns: RegExp[];
/** Duration detection pattern for generic element scanning */
durationDetectionPattern: RegExp;
/** Rating text selector pattern (e.g. 'out of 5 stars'), interpolated into a span:contains("...") selector */
ratingTextSelector: string;
}
/**
 * Everything the application needs to know about one supported language:
 * search filter codes, ranking normalization inputs, and scraping rules.
 */
export interface LanguageConfig {
/** Language code; matches this entry's key in LANGUAGE_CONFIGS */
code: SupportedLanguage;
/** Anna's Archive language filter code */
annasArchiveLang: string;
/** EPUB language code */
epubCode: string;
/** Stop words for ranking algorithm (filtered from match scoring) */
stopWords: string[];
/** Character replacements applied before NFD normalization in ranking (e.g. ß→ss) */
characterReplacements: Record<string, string>;
/** All scraping-related config */
scraping: ScrapingConfig;
}
// ---------------------------------------------------------------------------
// Language Configurations
// ---------------------------------------------------------------------------
/**
 * English language configuration.
 * REGION_LANGUAGE_MAP points the us, ca, uk, au and in regions at this entry.
 */
const ENGLISH_CONFIG: LanguageConfig = {
code: 'en',
annasArchiveLang: 'en',
epubCode: 'en',
stopWords: ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'],
// No character replacements are configured for English.
characterReplacements: {},
scraping: {
audibleLocaleParam: 'english',
authorPrefixes: ['By:', 'Written by:'],
narratorPrefixes: ['Narrated by:'],
lengthLabels: ['Length:'],
languageLabels: ['Language:'],
releaseDateLabels: ['Release date:'],
acceptedLanguageValues: ['english'],
// Capture group 1 is the number preceding "hr"/"hrs" or "hour"/"hours".
runtimeHourPatterns: [/(\d+)\s*hrs?/i, /(\d+)\s*hours?/i],
// Capture group 1 is the number preceding "min"/"mins" or "minute"/"minutes".
runtimeMinutePatterns: [/(\d+)\s*mins?/i, /(\d+)\s*minutes?/i],
// Capture group 1 is the rating number in front of "out of".
ratingPatterns: [/(\d+\.?\d*)\s*out of/i],
// Capture group 1 is everything after the "Release date:" label.
releaseDatePatterns: [/Release date:\s*(.+)/i],
descriptionExcludePatterns: [
// Dollar price such as "$12.99"
/\$\d+\.\d+/,
/cancel anytime/i,
/free trial/i,
/membership/i,
/subscribe/i,
/offer.*ends/i,
// Text that is only "by" followed by ASCII word characters, whitespace and commas (an author line)
/^\s*by\s+[\w\s,]+$/i,
],
// Digits followed by hr/hour/h, optionally followed by more digits and min/minute/m.
durationDetectionPattern: /\d+\s*(hr|hour|h)\s*\d*\s*(min|minute|m)?/i,
ratingTextSelector: 'out of 5 stars',
},
};
/**
 * German language configuration.
 * REGION_LANGUAGE_MAP points the 'de' region at this entry.
 */
const GERMAN_CONFIG: LanguageConfig = {
  code: 'de',
  annasArchiveLang: 'de',
  epubCode: 'de',
  stopWords: ['der', 'die', 'das', 'ein', 'eine', 'und', 'von', 'zu', 'den', 'dem', 'des'],
  // U+00DF (sharp s) has no canonical decomposition, so NFD alone would leave it
  // untouched; map it to "ss" explicitly.
  characterReplacements: { '\u00df': 'ss' },
  scraping: {
    audibleLocaleParam: 'deutsch',
    authorPrefixes: ['Von:', 'Geschrieben von:', 'Autor:'],
    narratorPrefixes: ['Gesprochen von:', 'Sprecher:'],
    lengthLabels: ['Spieldauer:', 'Dauer:', 'L\u00e4nge:'],
    languageLabels: ['Sprache:'],
    releaseDateLabels: ['Erscheinungsdatum:'],
    acceptedLanguageValues: ['deutsch', 'german'],
    // Capture group 1 is the number preceding "Std"/"Std." or "Stunde"/"Stunden".
    runtimeHourPatterns: [/(\d+)\s*Std\.?/i, /(\d+)\s*Stunden?/i],
    // Capture group 1 is the number preceding "Min"/"Min." or "Minute"/"Minuten".
    runtimeMinutePatterns: [/(\d+)\s*Min\.?/i, /(\d+)\s*Minuten?/i],
    // Capture group 1 is the rating; it may use ',' as the decimal separator.
    ratingPatterns: [/(\d+[.,]?\d*)\s*von\s*5/i],
    // Capture group 1 is everything after the "Erscheinungsdatum:" label.
    releaseDatePatterns: [/Erscheinungsdatum:\s*(.+)/i],
    descriptionExcludePatterns: [
      // Dollar price such as "$12.99"
      /\$\d+\.\d+/,
      // Euro price such as "9,95 €"
      /\d+,\d+\s*\u20ac/,
      /jederzeit k\u00fcndbar/i,
      /kostenlos testen/i,
      /Mitgliedschaft/i,
      /abonnieren/i,
      /Angebot.*endet/i,
      // Author-only line ("von <names>"). \p{L}/\p{N} with the `u` flag are used
      // instead of \w because \w is ASCII-only and would reject names containing
      // umlauts or sharp s (e.g. "von Jürgen Müller").
      /^\s*von\s+[\p{L}\p{N}_\s,]+$/iu,
    ],
    // Digits followed by Std/Stunde(n)/h, optionally followed by more digits and Min/Minute(n)/m.
    durationDetectionPattern: /\d+\s*(Std|Stunden?|h)\s*\.?\s*\d*\s*(Min|Minuten?|m)?/i,
    ratingTextSelector: 'von 5 Sternen',
  },
};
/**
 * Spanish language configuration.
 * REGION_LANGUAGE_MAP points the 'es' region at this entry.
 */
const SPANISH_CONFIG: LanguageConfig = {
  code: 'es',
  annasArchiveLang: 'es',
  epubCode: 'es',
  stopWords: ['el', 'la', 'los', 'las', 'un', 'una', 'de', 'del', 'en', 'y', 'por'],
  // No character replacements are configured for Spanish.
  characterReplacements: {},
  scraping: {
    audibleLocaleParam: 'espa\u00f1ol',
    authorPrefixes: ['De:', 'Escrito por:', 'Autor:'],
    narratorPrefixes: ['Narrado por:'],
    lengthLabels: ['Duraci\u00f3n:'],
    languageLabels: ['Idioma:'],
    releaseDateLabels: ['Fecha de lanzamiento:'],
    acceptedLanguageValues: ['espa\u00f1ol', 'spanish'],
    // Capture group 1 is the number preceding a standalone "h" or "hora"/"horas".
    runtimeHourPatterns: [/(\d+)\s*h\b/i, /(\d+)\s*horas?/i],
    // Capture group 1 is the number preceding "min" or "minuto"/"minutos".
    runtimeMinutePatterns: [/(\d+)\s*min/i, /(\d+)\s*minutos?/i],
    // Capture group 1 is the rating; it may use ',' as the decimal separator.
    ratingPatterns: [/(\d+[.,]?\d*)\s*de\s*5/i],
    // Capture group 1 is everything after the "Fecha de lanzamiento:" label.
    releaseDatePatterns: [/Fecha de lanzamiento:\s*(.+)/i],
    descriptionExcludePatterns: [
      // Dollar price such as "$12.99"
      /\$\d+\.\d+/,
      // Euro price such as "9,99 €"
      /\d+,\d+\s*\u20ac/,
      /cancela cuando quieras/i,
      /prueba gratis/i,
      /suscripci\u00f3n/i,
      /suscr\u00edbete/i,
      /oferta.*termina/i,
      // Author-only line ("de <names>"). \p{L}/\p{N} with the `u` flag are used
      // instead of \w because \w is ASCII-only and would reject accented names
      // (e.g. "de Gabriel García Márquez").
      /^\s*de\s+[\p{L}\p{N}_\s,]+$/iu,
    ],
    // Digits followed by h/hora(s), optionally followed by more digits and min/minuto(s).
    durationDetectionPattern: /\d+\s*(h|horas?)\s*\d*\s*(min|minutos?)?/i,
    ratingTextSelector: 'de 5 estrellas',
  },
};
// ---------------------------------------------------------------------------
// Lookup Maps
// ---------------------------------------------------------------------------
/**
 * Full language configuration for each supported language code.
 * The Record<SupportedLanguage, ...> annotation makes the compiler require
 * an entry for every member of the SupportedLanguage union.
 */
export const LANGUAGE_CONFIGS: Record<SupportedLanguage, LanguageConfig> = {
en: ENGLISH_CONFIG,
de: GERMAN_CONFIG,
es: SPANISH_CONFIG,
};
/**
 * Maps Audible region codes to language codes.
 * All English-speaking regions (us, ca, uk, au, in) map to 'en';
 * 'de' and 'es' map to their own language codes.
 * Keyed by AudibleRegion (declared in ../types/audible).
 */
export const REGION_LANGUAGE_MAP: Record<AudibleRegion, SupportedLanguage> = {
us: 'en',
ca: 'en',
uk: 'en',
au: 'en',
in: 'en',
de: 'de',
es: 'es',
};
// ---------------------------------------------------------------------------
// Helper Functions
// ---------------------------------------------------------------------------
/**
 * Get the full language configuration for an Audible region.
 *
 * Callers typically obtain the region from persisted configuration and cast it
 * to AudibleRegion without validation, so a value that is not a key of
 * REGION_LANGUAGE_MAP can reach this function at runtime. In that case the
 * English configuration is returned instead of `undefined`, so callers that
 * immediately read fields such as `annasArchiveLang` or `stopWords` do not throw.
 *
 * @param region - Audible region code.
 * @returns The language configuration mapped to the region, or the English
 *          configuration when the region has no mapping.
 */
export function getLanguageForRegion(region: AudibleRegion): LanguageConfig {
  const langCode = REGION_LANGUAGE_MAP[region];
  // An unmapped region yields an undefined langCode, and that lookup yields
  // undefined too; fall back to English rather than propagate it.
  return LANGUAGE_CONFIGS[langCode] ?? LANGUAGE_CONFIGS.en;
}
/**
 * Remove a leading label from a piece of text.
 *
 * The text is trimmed, then the prefixes are tried in order; the first one the
 * text starts with (ignoring case) is cut off and the remainder is trimmed
 * again. At most one prefix is removed. When none matches, the trimmed text is
 * returned unchanged.
 *
 * Example: stripPrefixes('By: Author Name', ['By:', 'Written by:']) => 'Author Name'
 *
 * @param text - Raw text, possibly starting with one of the prefixes.
 * @param prefixes - Candidate prefixes, tried in array order.
 * @returns The trimmed text without its first matching prefix.
 */
export function stripPrefixes(text: string, prefixes: string[]): string {
  const cleaned = text.trim();
  const lowered = cleaned.toLowerCase();
  const hit = prefixes.find((candidate) => lowered.startsWith(candidate.toLowerCase()));
  return hit === undefined ? cleaned : cleaned.slice(hit.length).trim();
}
/**
 * Produce a Cheerio selector that matches `element` nodes containing any one
 * of the given labels, by joining one `:contains("...")` selector per label
 * with ", ". Labels are inserted as-is. An empty label list yields ''.
 *
 * Example: buildContainsSelector('span', ['Length:', 'Dauer:'])
 * => 'span:contains("Length:"), span:contains("Dauer:")'
 *
 * @param element - Tag name (or other selector prefix) to qualify with :contains().
 * @param labels - Label texts to look for.
 * @returns Comma-separated selector string.
 */
export function buildContainsSelector(element: string, labels: string[]): string {
  const parts: string[] = [];
  for (const label of labels) {
    parts.push(element + ':contains("' + label + '")');
  }
  return parts.join(', ');
}
/**
 * Run the patterns against the text in order and return the first non-empty
 * capture group 1, trimmed. Patterns that do not match, or whose first group
 * is missing or empty, are skipped.
 *
 * @param text - Text to search.
 * @param patterns - Regexes whose first capture group holds the wanted value.
 * @returns The trimmed capture from the first successful pattern, or null.
 */
export function extractByPatterns(text: string, patterns: RegExp[]): string | null {
  for (const regex of patterns) {
    const firstGroup = text.match(regex)?.[1];
    if (!firstGroup) {
      continue;
    }
    return firstGroup.trim();
  }
  return null;
}
/**
 * Tell whether a scraped language value is one the given language config accepts.
 * The value is lowercased and trimmed, then compared for exact equality against
 * `config.scraping.acceptedLanguageValues`.
 *
 * @param languageValue - Language text taken from the page (e.g. "Deutsch").
 * @param config - Language configuration holding the accepted values.
 * @returns true when the normalized value equals one of the accepted values.
 */
export function isAcceptedLanguage(languageValue: string, config: LanguageConfig): boolean {
  const candidate = languageValue.toLowerCase().trim();
  const { acceptedLanguageValues } = config.scraping;
  return acceptedLanguageValues.some((accepted) => accepted === candidate);
}
+101 -51
View File
@@ -8,6 +8,14 @@ import * as cheerio from 'cheerio';
import { RMABLogger } from '../utils/logger';
import { getConfigService } from '../services/config.service';
import { AudibleRegion, AUDIBLE_REGIONS, DEFAULT_AUDIBLE_REGION } from '../types/audible';
import {
getLanguageForRegion,
stripPrefixes,
buildContainsSelector,
extractByPatterns,
isAcceptedLanguage,
type LanguageConfig,
} from '../constants/language-config';
import {
pickUserAgent,
getBrowserHeaders,
@@ -69,6 +77,13 @@ export class AudibleService {
return this.baseUrl;
}
/**
* Get the language config for the current region
*/
private getLangConfig(): LanguageConfig {
return getLanguageForRegion(this.region);
}
/**
* Force re-initialization (used when region config changes)
*/
@@ -106,6 +121,9 @@ export class AudibleService {
logger.info(`Initializing Audible service with region: ${this.region} (${this.baseUrl})`);
// Get language config for the region
const langConfig = getLanguageForRegion(this.region);
// Create axios client with region-specific base URL and realistic browser headers
this.client = axios.create({
baseURL: this.baseUrl,
@@ -113,7 +131,7 @@ export class AudibleService {
headers: getBrowserHeaders(this.sessionUserAgent),
params: {
ipRedirectOverride: 'true', // Prevent IP-based region redirects
language: 'english', // Force English locale (prevents IP-based language serving for non-English IPs)
language: langConfig.scraping.audibleLocaleParam, // Force locale (prevents IP-based language serving)
},
});
@@ -125,13 +143,16 @@ export class AudibleService {
this.baseUrl = AUDIBLE_REGIONS[this.region].baseUrl;
this.sessionUserAgent = pickUserAgent();
this.pacer.reset();
const fallbackLangConfig = getLanguageForRegion(this.region);
this.client = axios.create({
baseURL: this.baseUrl,
timeout: 15000,
headers: getBrowserHeaders(this.sessionUserAgent),
params: {
ipRedirectOverride: 'true',
language: 'english',
language: fallbackLangConfig.scraping.audibleLocaleParam,
},
});
this.initialized = true;
@@ -289,12 +310,14 @@ export class AudibleService {
const ratingText = $el.find('.ratingsLabel').text().trim();
const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;
const langConfig = this.getLangConfig();
audiobooks.push({
asin,
title,
author: authorText.replace('By:', '').replace('Written by:', '').trim(),
author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
authorAsin: authorAsinMatch?.[1] || undefined,
narrator: narratorText.replace('Narrated by:', '').trim(),
narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
rating,
});
@@ -391,12 +414,14 @@ export class AudibleService {
const ratingText = $el.find('.ratingsLabel').text().trim();
const rating = ratingText ? parseFloat(ratingText.split(' ')[0]) : undefined;
const langConfig = this.getLangConfig();
audiobooks.push({
asin,
title,
author: authorText.replace('By:', '').replace('Written by:', '').trim(),
author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
authorAsin: authorAsinMatch?.[1] || undefined,
narrator: narratorText.replace('Narrated by:', '').trim(),
narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
rating,
});
@@ -487,9 +512,11 @@ export class AudibleService {
const coverArtUrl = $el.find('img').attr('src') || '';
const langConfig = this.getLangConfig();
// Extract runtime/duration
const runtimeText = $el.find('.runtimeLabel').text().trim() ||
$el.find('span:contains("Length:")').text().trim();
$el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
const durationMinutes = this.parseRuntime(runtimeText);
// Extract rating
@@ -500,9 +527,9 @@ export class AudibleService {
audiobooks.push({
asin,
title,
author: authorText.replace('By:', '').replace('Written by:', '').trim(),
author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
authorAsin: authorAsinMatch?.[1] || undefined,
narrator: narratorText.replace('Narrated by:', '').trim(),
narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
durationMinutes,
rating,
@@ -565,13 +592,15 @@ export class AudibleService {
$('.s-result-item, .productListItem').each((_index, element) => {
const $el = $(element);
// --- Language filter: require explicit "English" ---
const langText = $el.find('span:contains("Language:")').text().trim() ||
// --- Language filter: require matching language for region ---
const langConfig = this.getLangConfig();
const langText = $el.find(buildContainsSelector('span', langConfig.scraping.languageLabels)).text().trim() ||
$el.find('.languageLabel').text().trim();
// Extract language value (e.g. "Language: English" "English")
const langMatch = langText.match(/Language:\s*(.+)/i);
// Extract language value (e.g. "Language: English" -> "English", "Sprache: Deutsch" -> "Deutsch")
const langLabelPattern = new RegExp(`(?:${langConfig.scraping.languageLabels.map(l => l.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|')})\\s*(.+)`, 'i');
const langMatch = langText.match(langLabelPattern);
const language = langMatch?.[1]?.trim();
if (!language || language.toLowerCase() !== 'english') return;
if (!language || !isAcceptedLanguage(language, langConfig)) return;
// --- Author ASIN filter: verify target ASIN in author links ---
const authorLinks = $el.find('a[href*="/author/"]');
@@ -609,7 +638,7 @@ export class AudibleService {
const coverArtUrl = $el.find('img').attr('src') || '';
const runtimeText = $el.find('.runtimeLabel').text().trim() ||
$el.find('span:contains("Length:")').text().trim();
$el.find(buildContainsSelector('span', langConfig.scraping.lengthLabels)).text().trim();
const durationMinutes = this.parseRuntime(runtimeText);
const ratingText = $el.find('.ratingsLabel').text().trim() ||
@@ -619,9 +648,9 @@ export class AudibleService {
allBooks.push({
asin: bookAsin,
title,
author: authorText.replace('By:', '').replace('Written by:', '').trim(),
author: stripPrefixes(authorText, langConfig.scraping.authorPrefixes),
authorAsin,
narrator: narratorText.replace('Narrated by:', '').trim(),
narrator: stripPrefixes(narratorText, langConfig.scraping.narratorPrefixes),
coverArtUrl: coverArtUrl.replace(/\._.*_\./, '._SL500_.'),
durationMinutes,
rating,
@@ -867,7 +896,8 @@ export class AudibleService {
result.author = [...new Set(authors)].slice(0, 3).join(', ');
}
result.author = result.author.replace(/^By:\s*/i, '').replace(/^Written by:\s*/i, '').trim();
const authorLangConfig = this.getLangConfig();
result.author = stripPrefixes(result.author, authorLangConfig.scraping.authorPrefixes);
logger.info(` Author from HTML: "${result.author}"`);
}
@@ -911,22 +941,16 @@ export class AudibleService {
}
if (result.narrator) {
result.narrator = result.narrator.replace(/^Narrated by:\s*/i, '').trim();
const detailLangConfig = this.getLangConfig();
result.narrator = stripPrefixes(result.narrator, detailLangConfig.scraping.narratorPrefixes);
}
logger.info(` Narrator from HTML: "${result.narrator || ''}"`);
}
// Description - try multiple approaches with strict filtering
if (!result.description) {
const excludePatterns = [
/\$\d+\.\d+/, // Price patterns
/cancel anytime/i,
/free trial/i,
/membership/i,
/subscribe/i,
/offer.*ends/i,
/^\s*by\s+[\w\s,]+$/i, // Just author names
];
const descLangConfig = this.getLangConfig();
const excludePatterns = descLangConfig.scraping.descriptionExcludePatterns;
const isValidDescription = (text: string): boolean => {
if (!text || text.length < 50 || text.length > 5000) return false;
@@ -982,18 +1006,20 @@ export class AudibleService {
// Runtime/Duration - try multiple approaches
if (!result.durationMinutes) {
const rtLangConfig = this.getLangConfig();
// Look for runtime text in various places
const runtimeText =
$('li.runtimeLabel span').text().trim() ||
$('.runtimeLabel').text().trim() ||
$('span:contains("Length:")').parent().text().trim() ||
$('li:contains("Length:")').text().trim() ||
$(buildContainsSelector('span', rtLangConfig.scraping.lengthLabels)).parent().text().trim() ||
$(buildContainsSelector('li', rtLangConfig.scraping.lengthLabels)).text().trim() ||
(() => {
// Look for any text matching duration pattern
let found = '';
$('li, span, div').each((_, elem) => {
const text = $(elem).text().trim();
if (text.match(/\d+\s*(hr|hour|h)\s*\d*\s*(min|minute|m)?/i) && text.length < 100) {
if (text.match(rtLangConfig.scraping.durationDetectionPattern) && text.length < 100) {
found = text;
return false; // break
}
@@ -1007,41 +1033,55 @@ export class AudibleService {
// Rating - try multiple approaches
if (!result.rating) {
const ratingLangConfig = this.getLangConfig();
const ratingText =
$('.ratingsLabel').text().trim() ||
$('[class*="rating"]').first().text().trim() ||
$('span:contains("out of 5 stars")').parent().text().trim() ||
$(`span:contains("${ratingLangConfig.scraping.ratingTextSelector}")`).parent().text().trim() ||
(() => {
// Look for rating pattern
// Look for rating pattern using language-specific patterns
let found = '';
$('span, div').each((_, elem) => {
const text = $(elem).text().trim();
if (text.match(/\d+\.?\d*\s*out of\s*5/i) && text.length < 50) {
found = text;
return false;
if (text.length < 50) {
for (const pattern of ratingLangConfig.scraping.ratingPatterns) {
if (pattern.test(text)) {
found = text;
return false;
}
}
}
});
return found;
})();
if (ratingText) {
const ratingMatch = ratingText.match(/(\d+\.?\d*)\s*out of/i);
result.rating = ratingMatch ? parseFloat(ratingMatch[1]) : undefined;
let ratingValue: number | undefined;
for (const pattern of ratingLangConfig.scraping.ratingPatterns) {
const ratingMatch = ratingText.match(pattern);
if (ratingMatch) {
// Handle comma as decimal separator (e.g. "4,5" in German/Spanish)
ratingValue = parseFloat(ratingMatch[1].replace(',', '.'));
break;
}
}
result.rating = ratingValue;
}
logger.info(` Rating from "${ratingText}": ${result.rating}`);
}
// Release date - try multiple selectors
if (!result.releaseDate) {
const rdLangConfig = this.getLangConfig();
const releaseDateText =
$('li:contains("Release date:")').text().trim() ||
$('span:contains("Release date:")').parent().text().trim() ||
$(buildContainsSelector('li', rdLangConfig.scraping.releaseDateLabels)).text().trim() ||
$(buildContainsSelector('span', rdLangConfig.scraping.releaseDateLabels)).parent().text().trim() ||
$('[class*="release"]').text().trim();
const dateMatch = releaseDateText.match(/Release date:\s*(.+)/i) ||
releaseDateText.match(/(\w+ \d{1,2},? \d{4})/);
const dateMatch = extractByPatterns(releaseDateText, rdLangConfig.scraping.releaseDatePatterns) ||
releaseDateText.match(/(\w+ \d{1,2},? \d{4})/)?.[1];
if (dateMatch) {
result.releaseDate = dateMatch[1].trim();
result.releaseDate = dateMatch.trim();
}
logger.info(` Release date from "${releaseDateText}": ${result.releaseDate}`);
}
@@ -1078,20 +1118,30 @@ export class AudibleService {
}
/**
* Parse runtime text to minutes
* Parse runtime text to minutes using language-specific patterns
*/
private parseRuntime(runtimeText: string): number | undefined {
if (!runtimeText) return undefined;
const hoursMatch = runtimeText.match(/(\d+)\s*hrs?/i);
const minutesMatch = runtimeText.match(/(\d+)\s*mins?/i);
const langConfig = this.getLangConfig();
let totalMinutes = 0;
if (hoursMatch) {
totalMinutes += parseInt(hoursMatch[1]) * 60;
// Try each hour pattern until one matches
for (const pattern of langConfig.scraping.runtimeHourPatterns) {
const match = runtimeText.match(pattern);
if (match) {
totalMinutes += parseInt(match[1]) * 60;
break;
}
}
if (minutesMatch) {
totalMinutes += parseInt(minutesMatch[1]);
// Try each minute pattern until one matches
for (const pattern of langConfig.scraping.runtimeMinutePatterns) {
const match = runtimeText.match(pattern);
if (match) {
totalMinutes += parseInt(match[1]);
break;
}
}
return totalMinutes > 0 ? totalMinutes : undefined;
+15 -2
View File
@@ -14,6 +14,8 @@ import { RMABLogger } from '../utils/logger';
import { getProwlarrService } from '../integrations/prowlarr.service';
import { rankEbookTorrents, RankedEbookTorrent } from '../utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping';
import { getLanguageForRegion } from '../constants/language-config';
import type { AudibleRegion } from '../types/audible';
// Import ebook scraper functions for Anna's Archive
import {
@@ -151,6 +153,11 @@ async function searchAnnasArchive(
const baseUrl = await configService.get('ebook_sidecar_base_url') || 'https://annas-archive.li';
const flaresolverrUrl = await configService.get('ebook_sidecar_flaresolverr_url') || undefined;
// Get language code from Audible region config
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
const languageCode = langConfig.annasArchiveLang;
if (flaresolverrUrl) {
logger.info(`Using FlareSolverr at ${flaresolverrUrl}`);
}
@@ -161,7 +168,7 @@ async function searchAnnasArchive(
// Try ASIN search first (exact match - best)
if (audiobook.asin) {
logger.info(`Searching Anna's Archive by ASIN: ${audiobook.asin} (format: ${preferredFormat})...`);
md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via ASIN: ${md5}`);
@@ -174,7 +181,7 @@ async function searchAnnasArchive(
// Fallback to title + author search
if (!md5) {
logger.info(`Searching Anna's Archive by title + author: "${audiobook.title}" by ${audiobook.author}...`);
md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via title search: ${md5}`);
@@ -301,6 +308,10 @@ async function searchIndexers(
logger.info(`Will filter ${aboveThreshold.length} results > 20 MB (too large for ebooks)`);
}
// Get language-specific stop words and character replacements for ranking
const ebookRegion = await configService.getAudibleRegion() as AudibleRegion;
const ebookLangConfig = getLanguageForRegion(ebookRegion);
// Rank results with ebook-specific scoring
// This filters out > 20MB and uses inverted size scoring
const rankedResults = rankEbookTorrents(allResults, {
@@ -311,6 +322,8 @@ async function searchIndexers(
indexerPriorities,
flagConfigs,
requireAuthor: true, // Automatic mode - prevent wrong authors
stopWords: ebookLangConfig.stopWords,
characterReplacements: ebookLangConfig.characterReplacements,
});
// Log filter results
@@ -9,6 +9,8 @@ import { getProwlarrService } from '../integrations/prowlarr.service';
import { getRankingAlgorithm } from '../utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping';
import { RMABLogger } from '../utils/logger';
import { getLanguageForRegion } from '../constants/language-config';
import type { AudibleRegion } from '../types/audible';
/**
* Process search indexers job
@@ -146,8 +148,10 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro
logger.info(`Will filter ${belowThreshold.length} results < ${sizeMBThreshold} MB (likely ebooks)`);
}
// Get ranking algorithm
// Get ranking algorithm and language-specific stop words / character replacements
const ranker = getRankingAlgorithm();
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
// Rank results with indexer priorities and flag configs
// Note: rankTorrents now filters out results < 20 MB internally
@@ -159,7 +163,9 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro
}, {
indexerPriorities,
flagConfigs,
requireAuthor: true // Automatic mode - prevent wrong authors
requireAuthor: true, // Automatic mode - prevent wrong authors
stopWords: langConfig.stopWords,
characterReplacements: langConfig.characterReplacements,
});
// Log filter results
+13 -10
View File
@@ -170,7 +170,8 @@ export async function downloadEbook(
preferredFormat: string = 'epub',
baseUrl: string = 'https://annas-archive.li',
logger?: RMABLogger,
flaresolverrUrl?: string
flaresolverrUrl?: string,
languageCode: string = 'en'
): Promise<EbookDownloadResult> {
try {
let md5: string | null = null;
@@ -183,7 +184,7 @@ export async function downloadEbook(
// Step 1: Try ASIN search (exact match - best)
if (asin) {
await logger?.info(`Searching by ASIN: ${asin} (format: ${preferredFormat})...`);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
await logger?.info(`Found via ASIN: ${md5}`);
@@ -195,7 +196,7 @@ export async function downloadEbook(
// Step 2: Fallback to title + author search
if (!md5) {
await logger?.info(`Searching by title + author: "${title}" by ${author}...`);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
await logger?.info(`Found via title search: ${md5}`);
@@ -312,10 +313,11 @@ export async function searchByAsin(
format: string,
baseUrl: string,
logger?: RMABLogger,
flaresolverrUrl?: string
flaresolverrUrl?: string,
languageCode: string = 'en'
): Promise<string | null> {
// Check cache first
const cacheKey = `${asin}-${format}`;
const cacheKey = `${asin}-${format}-${languageCode}`;
if (md5Cache.has(cacheKey)) {
const cached = md5Cache.get(cacheKey);
if (cached) {
@@ -327,7 +329,7 @@ export async function searchByAsin(
try {
// Build search URL with ASIN and optional format filter
const formatParam = format && format !== 'any' ? `ext=${format}&` : '';
const searchUrl = `${baseUrl}/search?${formatParam}lang=en&q=%22asin:${asin}%22`;
const searchUrl = `${baseUrl}/search?${formatParam}lang=${languageCode}&q=%22asin:${asin}%22`;
moduleLogger.debug(`ASIN search URL: ${searchUrl}`);
@@ -404,10 +406,11 @@ export async function searchByTitle(
format: string,
baseUrl: string,
logger?: RMABLogger,
flaresolverrUrl?: string
flaresolverrUrl?: string,
languageCode: string = 'en'
): Promise<string | null> {
// Check cache first
const cacheKey = `title-${title}-${author}-${format}`.toLowerCase();
const cacheKey = `title-${title}-${author}-${format}-${languageCode}`.toLowerCase();
if (md5Cache.has(cacheKey)) {
const cached = md5Cache.get(cacheKey);
if (cached) {
@@ -432,8 +435,8 @@ export async function searchByTitle(
// Add content type filters (books only, all fiction/nonfiction/unknown)
searchUrl += '&content=book_nonfiction&content=book_fiction&content=book_unknown';
// Add language filter (English)
searchUrl += '&lang=en';
// Add language filter
searchUrl += `&lang=${languageCode}`;
// Empty raw query (we're using specific terms instead)
searchUrl += '&q=';
+10 -8
View File
@@ -3,6 +3,8 @@
* Documentation: documentation/integrations/audible.md
*/
import type { SupportedLanguage } from '../constants/language-config';
export type AudibleRegion = 'us' | 'ca' | 'uk' | 'au' | 'in' | 'de' | 'es';
export interface AudibleRegionConfig {
@@ -10,7 +12,7 @@ export interface AudibleRegionConfig {
name: string;
baseUrl: string;
audnexusParam: string;
isEnglish: boolean;
language: SupportedLanguage;
}
export const AUDIBLE_REGIONS: Record<AudibleRegion, AudibleRegionConfig> = {
@@ -19,49 +21,49 @@ export const AUDIBLE_REGIONS: Record<AudibleRegion, AudibleRegionConfig> = {
name: 'United States',
baseUrl: 'https://www.audible.com',
audnexusParam: 'us',
isEnglish: true,
language: 'en',
},
ca: {
code: 'ca',
name: 'Canada',
baseUrl: 'https://www.audible.ca',
audnexusParam: 'ca',
isEnglish: true,
language: 'en',
},
uk: {
code: 'uk',
name: 'United Kingdom',
baseUrl: 'https://www.audible.co.uk',
audnexusParam: 'uk',
isEnglish: true,
language: 'en',
},
au: {
code: 'au',
name: 'Australia',
baseUrl: 'https://www.audible.com.au',
audnexusParam: 'au',
isEnglish: true,
language: 'en',
},
in: {
code: 'in',
name: 'India',
baseUrl: 'https://www.audible.in',
audnexusParam: 'in',
isEnglish: true,
language: 'en',
},
de: {
code: 'de',
name: 'Germany',
baseUrl: 'https://www.audible.de',
audnexusParam: 'de',
isEnglish: false,
language: 'de',
},
es: {
code: 'es',
name: 'Spain',
baseUrl: 'https://www.audible.es',
audnexusParam: 'es',
isEnglish: false,
language: 'es',
}
};
+50 -18
View File
@@ -40,6 +40,8 @@ export interface RankTorrentsOptions {
indexerPriorities?: Map<number, number>; // indexerId -> priority (1-25)
flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations
requireAuthor?: boolean; // Enforce author presence check (default: true)
stopWords?: string[]; // Language-specific stop words for matching
characterReplacements?: Record<string, string>; // Language-specific char replacements (e.g. ß→ss)
}
export interface EbookTorrentRequest {
@@ -52,6 +54,8 @@ export interface RankEbookTorrentsOptions {
indexerPriorities?: Map<number, number>; // indexerId -> priority (1-25)
flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations
requireAuthor?: boolean; // Enforce author presence check (default: true)
stopWords?: string[]; // Language-specific stop words for matching
characterReplacements?: Record<string, string>; // Language-specific char replacements (e.g. ß→ss)
}
export interface BonusModifier {
@@ -113,7 +117,9 @@ export class RankingAlgorithm {
const {
indexerPriorities,
flagConfigs,
requireAuthor = true // Safe default: require author in automatic mode
requireAuthor = true, // Safe default: require author in automatic mode
stopWords,
characterReplacements,
} = options;
// Filter out files < 20 MB (likely ebooks/samples)
const filteredTorrents = torrents.filter((torrent) => {
@@ -126,7 +132,7 @@ export class RankingAlgorithm {
const formatScore = this.scoreFormat(torrent);
const sizeScore = this.scoreSize(torrent, audiobook.durationMinutes);
const seederScore = this.scoreSeeders(torrent.seeders);
const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor);
const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor, stopWords, characterReplacements);
const baseScore = formatScore + sizeScore + seederScore + matchScore;
@@ -340,11 +346,22 @@ export class RankingAlgorithm {
* "Twelve.Months-Jim.Butcher" → "twelve months jim butcher"
* "Author_Name_Book" → "author name book"
*/
private normalizeForMatching(text: string): string {
return text
private normalizeForMatching(text: string, characterReplacements?: Record<string, string>): string {
let result = text
// Split CamelCase FIRST (before lowercasing): "TheCorrespondent" → "The Correspondent"
.replace(/([a-z])([A-Z])/g, '$1 $2')
.toLowerCase()
.toLowerCase();
// Apply language-specific character replacements before NFD (e.g. ß→ss)
if (characterReplacements) {
for (const [from, to] of Object.entries(characterReplacements)) {
result = result.replace(new RegExp(from, 'g'), to);
}
}
return result
// NFD normalization: convert accented chars to ASCII base forms
// e.g. "uber" from "über", "senor" from "señor", "cafe" from "café"
.normalize('NFD')
.replace(/[\u0300-\u036f]/g, '')
// Replace underscores with spaces (must be explicit since \w includes _)
.replace(/_/g, ' ')
// Replace other punctuation/separators with spaces (preserves apostrophes in contractions)
@@ -362,11 +379,13 @@ export class RankingAlgorithm {
private scoreMatch(
torrent: TorrentResult,
audiobook: AudiobookRequest,
requireAuthor: boolean = true
requireAuthor: boolean = true,
customStopWords?: string[],
characterReplacements?: Record<string, string>
): number {
// Normalize for matching (handles CamelCase, punctuation separators)
const torrentTitle = this.normalizeForMatching(torrent.title);
const requestTitle = this.normalizeForMatching(audiobook.title);
// Normalize for matching (handles CamelCase, punctuation separators, diacritics)
const torrentTitle = this.normalizeForMatching(torrent.title, characterReplacements);
const requestTitle = this.normalizeForMatching(audiobook.title, characterReplacements);
// Parse authors from RAW string first (preserving commas for splitting)
// Then normalize individual authors for matching
@@ -377,19 +396,30 @@ export class RankingAlgorithm {
.filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));
// Normalize parsed authors for matching (handles CamelCase in author names)
const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a));
const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a, characterReplacements));
// Combined normalized author string for fuzzy matching
const requestAuthorNormalized = normalizedAuthors.join(' ');
// ========== STAGE 1: WORD COVERAGE FILTER (MANDATORY) ==========
// Extract significant words (filter out common stop words)
const stopWords = ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'];
// Use provided language-specific stop words, or fall back to English defaults
const stopWords = customStopWords || ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'];
const extractWords = (text: string, stopList: string[]): string[] => {
return text
let processed = text
// Split CamelCase FIRST: "TheCorrespondent" → "The Correspondent"
.replace(/([a-z])([A-Z])/g, '$1 $2')
.toLowerCase()
.toLowerCase();
// Apply language-specific character replacements before NFD
if (characterReplacements) {
for (const [from, to] of Object.entries(characterReplacements)) {
processed = processed.replace(new RegExp(from, 'g'), to);
}
}
return processed
// NFD normalization for accented characters
.normalize('NFD')
.replace(/[\u0300-\u036f]/g, '')
// Replace underscores with spaces (must be explicit since \w includes _)
.replace(/_/g, ' ')
// Remove other punctuation (but keep apostrophes for contractions)
@@ -431,7 +461,7 @@ export class RankingAlgorithm {
}
// Normalize the required portion (handles CamelCase, punctuation)
const required = this.normalizeForMatching(requiredRaw);
const required = this.normalizeForMatching(requiredRaw, characterReplacements);
const optional = optionalMatches.join(' ');
return { required, optional };
@@ -653,7 +683,7 @@ export class RankingAlgorithm {
* @param requestAuthor - Raw author string (will be parsed and normalized internally)
* @returns true if at least ONE author is present with high confidence
*/
private checkAuthorPresence(torrentTitle: string, requestAuthor: string): boolean {
private checkAuthorPresence(torrentTitle: string, requestAuthor: string, characterReplacements?: Record<string, string>): boolean {
// Parse multiple authors (same logic as Stage 3 author matching)
const authors = requestAuthor
.split(/,|&| and | - /)
@@ -661,7 +691,7 @@ export class RankingAlgorithm {
.filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));
// Normalize each author for matching
const normalizedAuthors = authors.map(a => this.normalizeForMatching(a));
const normalizedAuthors = authors.map(a => this.normalizeForMatching(a, characterReplacements));
return this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors);
}
@@ -788,7 +818,9 @@ export class RankingAlgorithm {
const {
indexerPriorities,
flagConfigs,
requireAuthor = true // Safe default: require author in automatic mode
requireAuthor = true, // Safe default: require author in automatic mode
stopWords,
characterReplacements,
} = options;
// Filter out files > 20 MB (too large for ebooks)
@@ -809,7 +841,7 @@ export class RankingAlgorithm {
const matchScore = this.scoreMatch(torrent, {
title: ebook.title,
author: ebook.author,
}, requireAuthor);
}, requireAuthor, stopWords, characterReplacements);
const baseScore = formatScore + sizeScore + seederScore + matchScore;