Add language config and locale-aware parsing

Introduce centralized language configuration and wire locale-aware behavior across scraping and ranking. Adds src/lib/constants/language-config.ts with per-language scraping rules, stop words, and character replacements; replaces AudibleRegion.isEnglish with a language field in types and AUDIBLE_REGIONS. Update AudibleService, ebook scraper, processors, and API routes to use getLanguageForRegion so Anna's Archive searches, scraping selectors, runtime/rating parsing, and ranking use language-specific params and filters. Extend ranking algorithm to accept stopWords and characterReplacements and apply them during normalization and matching. Update UI selects to mark non-English regions and adjust tests accordingly.
This commit is contained in:
kikootwo
2026-02-20 06:32:44 -05:00
parent c146383735
commit 5d8ac2f73d
18 changed files with 525 additions and 112 deletions
+15 -2
View File
@@ -14,6 +14,8 @@ import { RMABLogger } from '../utils/logger';
import { getProwlarrService } from '../integrations/prowlarr.service';
import { rankEbookTorrents, RankedEbookTorrent } from '../utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping';
import { getLanguageForRegion } from '../constants/language-config';
import type { AudibleRegion } from '../types/audible';
// Import ebook scraper functions for Anna's Archive
import {
@@ -151,6 +153,11 @@ async function searchAnnasArchive(
const baseUrl = await configService.get('ebook_sidecar_base_url') || 'https://annas-archive.li';
const flaresolverrUrl = await configService.get('ebook_sidecar_flaresolverr_url') || undefined;
// Get language code from Audible region config
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
const languageCode = langConfig.annasArchiveLang;
if (flaresolverrUrl) {
logger.info(`Using FlareSolverr at ${flaresolverrUrl}`);
}
@@ -161,7 +168,7 @@ async function searchAnnasArchive(
// Try ASIN search first (exact match - best)
if (audiobook.asin) {
logger.info(`Searching Anna's Archive by ASIN: ${audiobook.asin} (format: ${preferredFormat})...`);
md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByAsin(audiobook.asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via ASIN: ${md5}`);
@@ -174,7 +181,7 @@ async function searchAnnasArchive(
// Fallback to title + author search
if (!md5) {
logger.info(`Searching Anna's Archive by title + author: "${audiobook.title}" by ${audiobook.author}...`);
md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl);
md5 = await searchByTitle(audiobook.title, audiobook.author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
if (md5) {
logger.info(`Found via title search: ${md5}`);
@@ -301,6 +308,10 @@ async function searchIndexers(
logger.info(`Will filter ${aboveThreshold.length} results > 20 MB (too large for ebooks)`);
}
// Get language-specific stop words for ranking
const ebookRegion = await configService.getAudibleRegion() as AudibleRegion;
const ebookLangConfig = getLanguageForRegion(ebookRegion);
// Rank results with ebook-specific scoring
// This filters out > 20MB and uses inverted size scoring
const rankedResults = rankEbookTorrents(allResults, {
@@ -311,6 +322,8 @@ async function searchIndexers(
indexerPriorities,
flagConfigs,
requireAuthor: true, // Automatic mode - prevent wrong authors
stopWords: ebookLangConfig.stopWords,
characterReplacements: ebookLangConfig.characterReplacements,
});
// Log filter results
@@ -9,6 +9,8 @@ import { getProwlarrService } from '../integrations/prowlarr.service';
import { getRankingAlgorithm } from '../utils/ranking-algorithm';
import { groupIndexersByCategories, getGroupDescription } from '../utils/indexer-grouping';
import { RMABLogger } from '../utils/logger';
import { getLanguageForRegion } from '../constants/language-config';
import type { AudibleRegion } from '../types/audible';
/**
* Process search indexers job
@@ -146,8 +148,10 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro
logger.info(`Will filter ${belowThreshold.length} results < ${sizeMBThreshold} MB (likely ebooks)`);
}
// Get ranking algorithm
// Get ranking algorithm and language-specific stop words
const ranker = getRankingAlgorithm();
const region = await configService.getAudibleRegion() as AudibleRegion;
const langConfig = getLanguageForRegion(region);
// Rank results with indexer priorities and flag configs
// Note: rankTorrents now filters out results < 20 MB internally
@@ -159,7 +163,9 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro
}, {
indexerPriorities,
flagConfigs,
requireAuthor: true // Automatic mode - prevent wrong authors
requireAuthor: true, // Automatic mode - prevent wrong authors
stopWords: langConfig.stopWords,
characterReplacements: langConfig.characterReplacements,
});
// Log filter results