Add interactive ebook search & selection

Introduce interactive ebook support. Backend: add two API endpoints — one to search for ebooks (interactive-search-ebook) and one to create/select ebook requests (select-ebook) — plus server-side handlers that route Anna's Archive (direct) and indexer (torrent/NZB) downloads. Frontend: extend RequestActionsDropdown and InteractiveTorrentSearchModal to support an "ebook" search mode and selection flow, and add hooks (useInteractiveSearchEbook / useSelectEbook). Settings: add ebook_auto_grab_enabled with a UI toggle, enforce disabling it when no ebook sources are enabled, and update settings GET/PUT to persist the flag (default = true to preserve existing behavior). Docs/tests: update documentation (scheduler, ebook-sidecar, settings pages) and extend the ranking algorithm docs/tests to cover ebook-related normalization and matching cases. Also includes logging and ranking integration for indexer results, and normalization for Anna's Archive handling.
2026-06-03 21:00:09 +00:00 · 2026-02-02 19:59:58 -05:00
parent c913be5ca2
commit 1afab5d47f
19 changed files with 1339 additions and 115 deletions
@@ -95,6 +95,7 @@ export interface RankedEbookTorrent extends TorrentResult {
  finalScore: number;         // score + bonusPoints
  rank: number;
  breakdown: EbookScoreBreakdown;
+  ebookFormat?: string;       // Detected ebook format (epub, pdf, mobi, etc.)
 }

 export class RankingAlgorithm {
@@ -330,6 +331,26 @@ export class RankingAlgorithm {
  }


+  /**
+   * Normalize text for matching: split CamelCase, lowercase, fold typographic
+   * apostrophes (‘ ’) to ', and turn underscores/other punctuation into single spaces.
+   * "VirginaEvans TheCorrespondent" → "virgina evans the correspondent"
+   * "Twelve.Months-Jim.Butcher" → "twelve months jim butcher"
+   * "Ender’s_Game" → "ender's game"
+   */
+  private normalizeForMatching(text: string): string {
+    return text
+      // Split CamelCase FIRST (before lowercasing): "TheCorrespondent" → "The Correspondent"
+      .replace(/([a-z])([A-Z])/g, '$1 $2')
+      .toLowerCase()
+      // Fold curly apostrophes to ASCII so "ender’s" and "ender's" normalize identically
+      .replace(/[\u2018\u2019]/g, "'")
+      // Underscores (inside \w, so listed explicitly) and other punctuation → space; ' is kept
+      .replace(/[^\w\s']|_/g, ' ')
+      .replace(/\s+/g, ' ') // Collapse multiple spaces
+      .trim();
+  }
+
  /**
   * Score title/author match quality (60 points max)
   * Title similarity: 0-45 points (heavily weighted!)
@@ -340,10 +361,22 @@ export class RankingAlgorithm {
    audiobook: AudiobookRequest,
    requireAuthor: boolean = true
  ): number {
-    // Normalize whitespace (multiple spaces → single space) for consistent matching
-    const torrentTitle = torrent.title.toLowerCase().replace(/\s+/g, ' ').trim();
-    const requestTitle = audiobook.title.toLowerCase().replace(/\s+/g, ' ').trim();
-    const requestAuthor = audiobook.author.toLowerCase().replace(/\s+/g, ' ').trim();
+    // Normalize for matching (handles CamelCase, punctuation separators)
+    const torrentTitle = this.normalizeForMatching(torrent.title);
+    const requestTitle = this.normalizeForMatching(audiobook.title);
+
+    // Parse authors from RAW string first (preserving commas for splitting)
+    // Then normalize individual authors for matching
+    const requestAuthorRaw = audiobook.author.toLowerCase().replace(/\s+/g, ' ').trim();
+    const parsedAuthors = requestAuthorRaw
+      .split(/,|&| and | - /)
+      .map(a => a.trim())
+      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));
+
+    // Normalize parsed authors for matching (handles CamelCase in author names)
+    const normalizedAuthors = parsedAuthors.map(a => this.normalizeForMatching(a));
+    // Combined normalized author string for fuzzy matching
+    const requestAuthorNormalized = normalizedAuthors.join(' ');

    // ========== STAGE 1: WORD COVERAGE FILTER (MANDATORY) ==========
    // Extract significant words (filter out common stop words)
@@ -351,26 +384,37 @@ export class RankingAlgorithm {

    const extractWords = (text: string, stopList: string[]): string[] => {
      return text
+        // Split CamelCase FIRST: "TheCorrespondent" → "The Correspondent"
+        .replace(/([a-z])([A-Z])/g, '$1 $2')
        .toLowerCase()
-        .replace(/[^\w\s]/g, ' ') // Remove punctuation
+        // Replace underscores with spaces (must be explicit since \w includes _)
+        .replace(/_/g, ' ')
+        // Remove other punctuation (but keep apostrophes for contractions)
+        .replace(/[^\w\s']/g, ' ')
        .split(/\s+/)
        .filter(word => word.length > 0 && !stopList.includes(word));
    };

    // Separate required words (outside parentheses/brackets) from optional words (inside)
    // This handles common patterns like "Title (Subtitle)" where subtitle may be omitted
+    // Note: Run on ORIGINAL title to preserve brackets, then normalize the result
    const separateRequiredOptional = (title: string): { required: string; optional: string } => {
+      // Work with original title format for bracket detection
+      const originalTitle = audiobook.title.toLowerCase();
+
      // Extract content in parentheses/brackets as optional
      const optionalPattern = /[(\[{]([^)\]}]+)[)\]}]/g;
      const optionalMatches: string[] = [];
      let match;

-      while ((match = optionalPattern.exec(title)) !== null) {
+      while ((match = optionalPattern.exec(originalTitle)) !== null) {
        optionalMatches.push(match[1]);
      }

      // Remove parenthetical/bracketed content to get required portion
-      const required = title.replace(/[(\[{][^)\]}]+[)\]}]/g, ' ').trim();
+      const requiredRaw = originalTitle.replace(/[(\[{][^)\]}]+[)\]}]/g, ' ').trim();
+      // Normalize the required portion (handles CamelCase, punctuation)
+      const required = this.normalizeForMatching(requiredRaw);
      const optional = optionalMatches.join(' ');

      return { required, optional };
@@ -400,7 +444,7 @@ export class RankingAlgorithm {
    // ========== STAGE 1.5: AUTHOR PRESENCE CHECK (OPTIONAL) ==========
    // Only enforced in automatic mode (requireAuthor: true)
    // Interactive search (requireAuthor: false) shows all results
-    if (requireAuthor && !this.checkAuthorPresence(torrentTitle, requestAuthor)) {
+    if (requireAuthor && !this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors)) {
      // No high-confidence author match → reject to prevent wrong-author matches
      return 0;
    }
@@ -408,6 +452,10 @@ export class RankingAlgorithm {
    // ========== STAGE 2: TITLE MATCHING (0-35 points) ==========
    let titleScore = 0;

+    // Keep original torrent title (lowercased only) for metadata marker detection
+    // Markers like [ ] ( ) : are removed by normalization but needed for suffix validation
+    const torrentTitleOriginal = torrent.title.toLowerCase().replace(/\s+/g, ' ').trim();
+
    // Try matching with full title first, then fall back to required title (without parentheses)
    const titlesToTry = [requestTitle];
    if (requiredTitle !== requestTitle) {
@@ -422,20 +470,37 @@ export class RankingAlgorithm {
        const beforeTitle = torrentTitle.substring(0, titleIndex);
        const afterTitle = torrentTitle.substring(titleIndex + titleToMatch.length);

+        // For metadata marker detection, try to find where the title starts in the ORIGINAL string
+        // Search for key words from the title to locate position in original
+        const titleWords = titleToMatch.split(/\s+/).filter(w => w.length > 2);
+        let afterTitleOriginal = '';
+        if (titleWords.length > 0) {
+          // Find the last significant title word in the original string
+          const lastTitleWord = titleWords[titleWords.length - 1];
+          const lastWordIdxOriginal = torrentTitleOriginal.lastIndexOf(lastTitleWord);
+          if (lastWordIdxOriginal !== -1) {
+            afterTitleOriginal = torrentTitleOriginal.substring(lastWordIdxOriginal + lastTitleWord.length);
+          }
+        }
+
        // Extract significant words BEFORE the matched title
        const beforeWords = extractWords(beforeTitle, stopWords);

        // Title is complete if:
        // 1. Acceptable prefix (no words, OR structured metadata like "Author - Series - ")
        // 2. Followed by clear metadata markers (not "'s Secret" or " Is Watching")
+        // Check ORIGINAL title for metadata markers ([ ] ( ) etc. not normalized away)
        const metadataMarkers = [' by ', ' - ', ' [', ' (', ' {', ' :', ','];

-        // Check if afterTitle starts with author name (handles space-separated format like "Title Author Year")
-        const afterStartsWithAuthor = requestAuthor.length > 2 &&
-          afterTitle.trim().startsWith(requestAuthor);
+        // Check if afterTitle starts with any author name (handles space-separated format like "Title Author Year")
+        const afterStartsWithAuthor = normalizedAuthors.some(author =>
+          author.length > 2 && afterTitle.trim().startsWith(author)
+        );

+        // Check metadata markers in both normalized and original suffixes
        const hasMetadataSuffix = afterTitle === '' ||
                                  metadataMarkers.some(marker => afterTitle.startsWith(marker)) ||
+                                  metadataMarkers.some(marker => afterTitleOriginal.startsWith(marker)) ||
                                  afterStartsWithAuthor;

        // Check prefix validity:
@@ -446,16 +511,32 @@ export class RankingAlgorithm {

        // Check if title is immediately preceded by a metadata separator
        // This handles "Author - Series - 01 - Title" patterns
+        // Check both normalized and original strings for separators
        const precedingText = beforeTitle.trimEnd();
+
+        // Also check original string for separators that got normalized away (like colons)
+        let beforeTitleOriginal = '';
+        if (titleWords.length > 0) {
+          const firstTitleWord = titleWords[0];
+          const firstWordIdxOriginal = torrentTitleOriginal.indexOf(firstTitleWord);
+          if (firstWordIdxOriginal !== -1) {
+            beforeTitleOriginal = torrentTitleOriginal.substring(0, firstWordIdxOriginal).trimEnd();
+          }
+        }
+
        const titlePrecededBySeparator =
          precedingText.endsWith('-') ||
          precedingText.endsWith(':') ||
-          precedingText.endsWith('—');
+          precedingText.endsWith('—') ||
+          beforeTitleOriginal.endsWith('-') ||
+          beforeTitleOriginal.endsWith(':') ||
+          beforeTitleOriginal.endsWith('—');

-        // Check if author name appears in the prefix
+        // Check if any author name appears in the prefix
        // This handles "Author Name - Title" patterns
-        const authorInPrefix = requestAuthor.length > 2 &&
-          beforeTitle.includes(requestAuthor);
+        const authorInPrefix = normalizedAuthors.some(author =>
+          author.length > 2 && beforeTitle.includes(author)
+        );

        const hasAcceptablePrefix =
          hasNoWordsPrefix ||
@@ -481,24 +562,18 @@ export class RankingAlgorithm {
    }

    // ========== STAGE 3: AUTHOR MATCHING (0-15 points) ==========
-    // Parse requested authors (split on separators, filter out roles)
-    const requestAuthors = requestAuthor
-      .split(/,|&| and | - /)
-      .map(a => a.trim())
-      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));
-
    // Check how many authors appear in torrent title (exact substring match)
-    const authorMatches = requestAuthors.filter(author =>
+    const authorMatches = normalizedAuthors.filter(author =>
      torrentTitle.includes(author)
    );

    let authorScore = 0;
    if (authorMatches.length > 0) {
      // Exact substring match → proportional credit
-      authorScore = (authorMatches.length / requestAuthors.length) * 15;
+      authorScore = (authorMatches.length / normalizedAuthors.length) * 15;
    } else {
      // No exact match → use fuzzy similarity for partial credit
-      authorScore = compareTwoStrings(requestAuthor, torrentTitle) * 15;
+      authorScore = compareTwoStrings(requestAuthorNormalized, torrentTitle) * 15;
    }

    return Math.min(60, titleScore + authorScore);
@@ -506,22 +581,16 @@ export class RankingAlgorithm {

  /**
   * Check if author is present in torrent title with high confidence
-   * Handles variations: middle initials, spacing, punctuation, name order
+   * Uses pre-parsed and normalized authors array
   *
-   * @param torrentTitle - Normalized torrent title (lowercase)
-   * @param requestAuthor - Normalized author name (lowercase)
+   * @param torrentTitle - Normalized torrent title (already processed by normalizeForMatching)
+   * @param normalizedAuthors - Array of normalized author names (roles already filtered)
   * @returns true if at least ONE author is present with high confidence
   */
-  private checkAuthorPresence(torrentTitle: string, requestAuthor: string): boolean {
-    // Parse multiple authors (same logic as Stage 3 author matching)
-    const authors = requestAuthor
-      .split(/,|&| and | - /)
-      .map(a => a.trim())
-      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a));
-
+  private checkAuthorPresenceWithParsed(torrentTitle: string, normalizedAuthors: string[]): boolean {
    // At least ONE author must match with high confidence
-    return authors.some(author => {
-      // Check 1: Exact substring match
+    return normalizedAuthors.some(author => {
+      // Check 1: Exact substring match (works well now that both are normalized)
      if (torrentTitle.includes(author)) {
        return true;
      }
@@ -537,6 +606,7 @@ export class RankingAlgorithm {
      // Check 3: Core name components (first + last name present within 30 chars)
      // Handles: "Sanderson, Brandon" vs "Brandon Sanderson"
      // Handles: "Brandon R. Sanderson" vs "Brandon Sanderson"
+      // Now also handles: "VirginaEvans" → "virgina evans" (after normalization)
      const words = author.split(/\s+/).filter(w => w.length > 1);
      if (words.length >= 2) {
        const firstName = words[0];
@@ -558,6 +628,27 @@ export class RankingAlgorithm {
    });
  }

+  /**
+   * Check if author is present in torrent title with high confidence
+   * Handles variations: middle initials, spacing, punctuation, name order, CamelCase
+   *
+   * @param torrentTitle - Normalized torrent title (already processed by normalizeForMatching)
+   * @param requestAuthor - Raw author string, original casing (parsed and normalized internally)
+   * @returns true if at least ONE author is present with high confidence
+   */
+  private checkAuthorPresence(torrentTitle: string, requestAuthor: string): boolean {
+    // Input is raw (mixed case): split on " And " and drop "Translator"/"Narrator" case-insensitively
+    const authors = requestAuthor
+      .split(/,|&| and | - /i)
+      .map(a => a.trim())
+      .filter(a => a.length > 2 && !['translator', 'narrator'].includes(a.toLowerCase()));
+
+    // Normalize each author; drop names that normalize to '' (e.g. "???") because includes('') matches any title
+    const normalizedAuthors = authors.map(a => this.normalizeForMatching(a)).filter(a => a.length > 0);
+
+    return this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors);
+  }
+
  /**
   * Detect format from torrent title
   */
@@ -687,6 +778,9 @@ export class RankingAlgorithm {
    });

    const ranked = filteredTorrents.map((torrent) => {
+      // Detect ebook format from title
+      const detectedFormat = this.detectEbookFormat(torrent);
+
      // Calculate base scores (0-100)
      // Reuse scoreMatch and scoreSeeders from audiobook ranking
      const formatScore = this.scoreEbookFormat(torrent, ebook.preferredFormat);
@@ -765,6 +859,7 @@ export class RankingAlgorithm {
            notes: [],
          }, ebook.preferredFormat),
        },
+        ebookFormat: detectedFormat !== 'unknown' ? detectedFormat : undefined,
      };
    });

@@ -824,19 +919,27 @@ export class RankingAlgorithm {

  /**
   * Detect ebook format from torrent title
+   * Handles formats in various positions: .epub, (epub), [epub], " epub"
   */
  private detectEbookFormat(torrent: TorrentResult): string {
    const title = torrent.title.toLowerCase();

    // Check for common ebook format extensions/keywords
-    if (title.includes('.epub') || title.includes(' epub')) return 'epub';
-    if (title.includes('.pdf') || title.includes(' pdf')) return 'pdf';
-    if (title.includes('.mobi') || title.includes(' mobi')) return 'mobi';
-    if (title.includes('.azw3') || title.includes(' azw3')) return 'azw3';
-    if (title.includes('.azw') || title.includes(' azw')) return 'azw';
-    if (title.includes('.fb2') || title.includes(' fb2')) return 'fb2';
-    if (title.includes('.cbz') || title.includes(' cbz')) return 'cbz';
-    if (title.includes('.cbr') || title.includes(' cbr')) return 'cbr';
+    // Patterns: .format, (format), [format], " format", "_format"
+    const formats = ['epub', 'pdf', 'mobi', 'azw3', 'azw', 'fb2', 'cbz', 'cbr'];
+
+    for (const format of formats) {
+      if (
+        title.includes(`.${format}`) ||    // file.epub
+        title.includes(`(${format})`) ||   // (epub)
+        title.includes(`[${format}]`) ||   // [epub]
+        title.includes(` ${format}`) ||    // " epub" (space before)
+        title.includes(`_${format}`) ||    // _epub (underscore)
+        title.endsWith(format)             // ends with format
+      ) {
+        return format;
+      }
+    }

    // Default to unknown
    return 'unknown';