Add language config and locale-aware parsing

Introduce centralized language configuration and wire locale-aware behavior across scraping and ranking. Adds src/lib/constants/language-config.ts with per-language scraping rules, stop words, and character replacements; replaces AudibleRegion.isEnglish with a language field in types and AUDIBLE_REGIONS. Update AudibleService, ebook scraper, processors, and API routes to use getLanguageForRegion so Anna's Archive searches, scraping selectors, runtime/rating parsing, and ranking use language-specific params and filters. Extend ranking algorithm to accept stopWords and characterReplacements and apply them during normalization and matching. Update UI selects to mark non-English regions and adjust tests accordingly.
2026-06-04 21:30:11 +00:00 · 2026-02-20 06:32:44 -05:00
parent c146383735
commit 5d8ac2f73d
18 changed files with 525 additions and 112 deletions
@@ -40,6 +40,8 @@ export interface RankTorrentsOptions {
  indexerPriorities?: Map<number, number>;  // indexerId -> priority (1-25)
  flagConfigs?: IndexerFlagConfig[];         // Flag bonus configurations
  requireAuthor?: boolean;                   // Enforce author presence check (default: true)
+  stopWords?: string[];                      // Language-specific stop words for matching
+  characterReplacements?: Record<string, string>;  // Language-specific char replacements (e.g. ß→ss)
 }

 export interface EbookTorrentRequest {
@@ -52,6 +54,8 @@ export interface RankEbookTorrentsOptions {
  indexerPriorities?: Map<number, number>;  // indexerId -> priority (1-25)
  flagConfigs?: IndexerFlagConfig[];         // Flag bonus configurations
  requireAuthor?: boolean;                   // Enforce author presence check (default: true)
+  stopWords?: string[];                      // Language-specific stop words for matching
+  characterReplacements?: Record<string, string>;  // Language-specific char replacements (e.g. ß→ss)
 }

 export interface BonusModifier {
@@ -113,7 +117,9 @@ export class RankingAlgorithm {
    const {
      indexerPriorities,
      flagConfigs,
-      requireAuthor = true  // Safe default: require author in automatic mode
+      requireAuthor = true,  // Safe default: require author in automatic mode
+      stopWords,
+      characterReplacements,
    } = options;
    // Filter out files < 20 MB (likely ebooks/samples)
    const filteredTorrents = torrents.filter((torrent) => {
@@ -126,7 +132,7 @@ export class RankingAlgorithm {
      const formatScore = this.scoreFormat(torrent);
      const sizeScore = this.scoreSize(torrent, audiobook.durationMinutes);
      const seederScore = this.scoreSeeders(torrent.seeders);
-      const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor);
+      const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor, stopWords, characterReplacements);

      const baseScore = formatScore + sizeScore + seederScore + matchScore;

@@ -340,11 +346,22 @@ export class RankingAlgorithm {
   * "Twelve.Months-Jim.Butcher" → "twelve months jim butcher"
   * "Author_Name_Book" → "author name book"
   */
-  private normalizeForMatching(text: string): string {
-    return text
+  private normalizeForMatching(text: string, characterReplacements?: Record<string, string>): string {
+    let result = text
      // Split CamelCase FIRST (before lowercasing): "TheCorrespondent" → "The Correspondent"
      .replace(/([a-z])([A-Z])/g, '$1 $2')
-      .toLowerCase()
+      .toLowerCase();
+    // Apply language-specific character replacements before NFD (e.g. ß→ss)
+    if (characterReplacements) {
+      for (const [from, to] of Object.entries(characterReplacements)) {
+        result = result.replace(new RegExp(from, 'g'), to);
+      }
+    }
+    return result
+      // NFD normalization: convert accented chars to ASCII base forms
+      // e.g. "uber" from "uber", "senor" from "senor", "cafe" from "cafe"
+      .normalize('NFD')
+      .replace(/[\u0300-\u036f]/g, '')
      // Replace underscores with spaces (must be explicit since \w includes _)
      .replace(/_/g, ' ')
      // Replace other punctuation/separators with spaces (preserves apostrophes in contractions)
@@ -362,11 +379,13 @@ export class RankingAlgorithm {
  private scoreMatch(
    torrent: TorrentResult,
    audiobook: AudiobookRequest,
-    requireAuthor: boolean = true
+    requireAuthor: boolean = true,
+    customStopWords?: string[],
+    characterReplacements?: Record<string, string>
  ): number {
-    // Normalize for matching (handles CamelCase, punctuation separators)
-    const torrentTitle = this.normalizeForMatching(torrent.title);
-    const requestTitle = this.normalizeForMatching(audiobook.title);
+    // Normalize for matching (handles CamelCase, punctuation separators, diacritics)
+    const torrentTitle = this.normalizeForMatching(torrent.title, characterReplacements);
+    const requestTitle = this.normalizeForMatching(audiobook.title, characterReplacements);

    // Parse authors from RAW string first (preserving commas for splitting)
    // Then normalize individual authors for matching
@@ -377,19 +396,30 @@ export class RankingAlgorithm {
      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));

    // Normalize parsed authors for matching (handles CamelCase in author names)
-    const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a));
+    const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a, characterReplacements));
    // Combined normalized author string for fuzzy matching
    const requestAuthorNormalized = normalizedAuthors.join(' ');

    // ========== STAGE 1: WORD COVERAGE FILTER (MANDATORY) ==========
    // Extract significant words (filter out common stop words)
-    const stopWords = ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'];
+    // Use provided language-specific stop words, or fall back to English defaults
+    const stopWords = customStopWords || ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'];

    const extractWords = (text: string, stopList: string[]): string[] => {
-      return text
+      let processed = text
        // Split CamelCase FIRST: "TheCorrespondent" → "The Correspondent"
        .replace(/([a-z])([A-Z])/g, '$1 $2')
-        .toLowerCase()
+        .toLowerCase();
+      // Apply language-specific character replacements before NFD
+      if (characterReplacements) {
+        for (const [from, to] of Object.entries(characterReplacements)) {
+          processed = processed.replace(new RegExp(from, 'g'), to);
+        }
+      }
+      return processed
+        // NFD normalization for accented characters
+        .normalize('NFD')
+        .replace(/[\u0300-\u036f]/g, '')
        // Replace underscores with spaces (must be explicit since \w includes _)
        .replace(/_/g, ' ')
        // Remove other punctuation (but keep apostrophes for contractions)
@@ -431,7 +461,7 @@ export class RankingAlgorithm {
      }

      // Normalize the required portion (handles CamelCase, punctuation)
-      const required = this.normalizeForMatching(requiredRaw);
+      const required = this.normalizeForMatching(requiredRaw, characterReplacements);
      const optional = optionalMatches.join(' ');

      return { required, optional };
@@ -653,7 +683,7 @@ export class RankingAlgorithm {
   * @param requestAuthor - Raw author string (will be parsed and normalized internally)
   * @returns true if at least ONE author is present with high confidence
   */
-  private checkAuthorPresence(torrentTitle: string, requestAuthor: string): boolean {
+  private checkAuthorPresence(torrentTitle: string, requestAuthor: string, characterReplacements?: Record<string, string>): boolean {
    // Parse multiple authors (same logic as Stage 3 author matching)
    const authors = requestAuthor
      .split(/,|&| and | - /)
@@ -661,7 +691,7 @@ export class RankingAlgorithm {
      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));

    // Normalize each author for matching
-    const normalizedAuthors = authors.map(a => this.normalizeForMatching(a));
+    const normalizedAuthors = authors.map(a => this.normalizeForMatching(a, characterReplacements));

    return this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors);
  }
@@ -788,7 +818,9 @@ export class RankingAlgorithm {
    const {
      indexerPriorities,
      flagConfigs,
-      requireAuthor = true  // Safe default: require author in automatic mode
+      requireAuthor = true,  // Safe default: require author in automatic mode
+      stopWords,
+      characterReplacements,
    } = options;

    // Filter out files > 20 MB (too large for ebooks)
@@ -809,7 +841,7 @@ export class RankingAlgorithm {
      const matchScore = this.scoreMatch(torrent, {
        title: ebook.title,
        author: ebook.author,
-      }, requireAuthor);
+      }, requireAuthor, stopWords, characterReplacements);

      const baseScore = formatScore + sizeScore + seederScore + matchScore;