Implement file hash-based library matching and remove fuzzy ASIN matching

Adds file hash-based matching for Audiobookshelf library items to ensure 100% accurate ASIN assignment for RMAB-organized content. Removes fuzzy matching from library availability checks, making all matching ASIN-only to eliminate false positives and race conditions. Updates database schema, processors, and matcher utilities; adds new tests and documentation for the new matching strategy. Removes obsolete scripts, Dockerfile, and related tests; updates docker-compose for test environments.
2026-06-03 21:00:09 +00:00 · 2026-01-28 10:32:14 -05:00
parent 497849f427
commit a97979358f
111 changed files with 6571 additions and 1426 deletions
@@ -67,6 +67,7 @@ export async function processDownloadTorrent(payload: DownloadTorrentPayload): P
        data: {
          requestId,
          indexerName: torrent.indexer,
+          indexerId: torrent.indexerId, // Store indexer ID for configuration lookup
          downloadClient: 'sabnzbd',
          downloadClientId,
          torrentName: torrent.title,
@@ -131,6 +132,7 @@ export async function processDownloadTorrent(payload: DownloadTorrentPayload): P
        data: {
          requestId,
          indexerName: torrent.indexer,
+          indexerId: torrent.indexerId, // Store indexer ID for configuration lookup
          downloadClient: 'qbittorrent',
          downloadClientId,
          torrentName: torrent.title,
@@ -1,191 +0,0 @@
-/**
- * Component: Match Library Job Processor
- * Documentation: documentation/phase3/README.md
- *
- * DEPRECATED: This processor is deprecated. Matching is now handled by scan_library job.
- * Kept for backwards compatibility but should not be used in new code.
- */
-
-import { MatchPlexPayload } from '../services/job-queue.service';
-import { prisma } from '../db';
-import { getLibraryService } from '../services/library';
-import { compareTwoStrings } from 'string-similarity';
-import { getConfigService } from '../services/config.service';
-import { RMABLogger } from '../utils/logger';
-
-/**
- * Process match library job (DEPRECATED - use scan_library instead)
- * Fuzzy matches requested audiobook to library item and updates status
- */
-export async function processMatchPlex(payload: MatchPlexPayload): Promise<any> {
-  const { requestId, audiobookId, title, author, jobId } = payload;
-
-  const logger = RMABLogger.forJob(jobId, 'MatchLibrary');
-
-  logger.warn('DEPRECATED: match_plex job is deprecated. Use scan_plex instead.');
-  logger.info(`Matching "${title}" by ${author} in library`);
-
-  try {
-    // Get library service and configuration
-    const configService = getConfigService();
-    const libraryService = await getLibraryService();
-    const backendMode = await configService.getBackendMode();
-
-    logger.info(`Backend mode: ${backendMode}`);
-
-    // Get configured library ID
-    const libraryId = backendMode === 'audiobookshelf'
-      ? await configService.get('audiobookshelf.library_id')
-      : (await configService.getPlexConfig()).libraryId;
-
-    if (!libraryId) {
-      throw new Error(`${backendMode} library not configured`);
-    }
-
-    // Search library using abstraction layer
-    const searchResults = await libraryService.searchItems(libraryId, title);
-
-    logger.info(`Found ${searchResults.length} results in library`);
-
-    if (searchResults.length === 0) {
-      logger.warn(`No matches found in library for "${title}"`);
-
-      // Mark as completed anyway - the file is there, library just needs time to scan
-      await prisma.request.update({
-        where: { id: requestId },
-        data: {
-          status: 'completed',
-          updatedAt: new Date(),
-          completedAt: new Date(),
-        },
-      });
-
-      return {
-        success: true,
-        message: 'No library match found yet, but request completed',
-        requestId,
-        matched: false,
-        note: 'Library may need time to scan the new files',
-      };
-    }
-
-    // Fuzzy match against results
-    const matches = searchResults.map((item) => {
-      const titleScore = compareTwoStrings(title.toLowerCase(), (item.title || '').toLowerCase());
-      const authorScore = author
-        ? compareTwoStrings(author.toLowerCase(), (item.author || '').toLowerCase())
-        : 0.5;
-
-      // Weighted average: title is more important
-      const overallScore = titleScore * 0.7 + authorScore * 0.3;
-
-      return {
-        item,
-        score: overallScore,
-        titleScore,
-        authorScore,
-      };
-    });
-
-    // Sort by score
-    matches.sort((a, b) => b.score - a.score);
-
-    const bestMatch = matches[0];
-
-    logger.info(`Best match: "${bestMatch.item.title}" by ${bestMatch.item.author || 'Unknown'}`, {
-      score: Math.round(bestMatch.score * 100),
-      titleScore: Math.round(bestMatch.titleScore * 100),
-      authorScore: Math.round(bestMatch.authorScore * 100),
-    });
-
-    // Accept match if score >= 70%
-    if (bestMatch.score >= 0.7) {
-      logger.info(`Match accepted!`);
-
-      // Update audiobook with library item ID
-      const updateData: any = {
-        completedAt: new Date(),
-        updatedAt: new Date(),
-      };
-
-      if (backendMode === 'audiobookshelf') {
-        updateData.absItemId = bestMatch.item.externalId;
-      } else {
-        updateData.plexGuid = bestMatch.item.externalId;
-      }
-
-      await prisma.audiobook.update({
-        where: { id: audiobookId },
-        data: updateData,
-      });
-
-      // Ensure request is marked as completed
-      await prisma.request.update({
-        where: { id: requestId },
-        data: {
-          status: 'completed',
-          updatedAt: new Date(),
-          completedAt: new Date(),
-        },
-      });
-
-      return {
-        success: true,
-        message: `Successfully matched audiobook in library (${backendMode})`,
-        backendMode,
-        requestId,
-        matched: true,
-        matchScore: bestMatch.score,
-        libraryItem: {
-          title: bestMatch.item.title,
-          author: bestMatch.item.author,
-          id: bestMatch.item.id,
-          externalId: bestMatch.item.externalId,
-        },
-      };
-    } else {
-      logger.warn(`Match score too low (${Math.round(bestMatch.score * 100)}%), but marking as completed anyway`);
-
-      // Mark as completed even if match is poor
-      await prisma.request.update({
-        where: { id: requestId },
-        data: {
-          status: 'completed',
-          updatedAt: new Date(),
-          completedAt: new Date(),
-        },
-      });
-
-      return {
-        success: true,
-        message: 'Request completed, but library match uncertain',
-        requestId,
-        matched: false,
-        matchScore: bestMatch.score,
-        note: `Low match score: ${Math.round(bestMatch.score * 100)}%`,
-      };
-    }
-  } catch (error) {
-    logger.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
-
-    // Don't fail the request - the files are organized correctly
-    // Just log the error and mark as completed
-    await prisma.request.update({
-      where: { id: requestId },
-      data: {
-        status: 'completed',
-        errorMessage: `Library matching failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
-        updatedAt: new Date(),
-        completedAt: new Date(),
-      },
-    });
-
-    return {
-      success: false,
-      message: 'Request completed despite library matching error',
-      requestId,
-      matched: false,
-      error: error instanceof Error ? error.message : 'Unknown error',
-    };
-  }
-}
@@ -85,7 +85,7 @@ export async function processMonitorDownload(payload: MonitorDownloadPayload): P

      // Convert NZBInfo to progress format
      progress = {
-        percent: nzbInfo.progress,
+        percent: nzbInfo.progress * 100, // Convert 0.0-1.0 to 0-100 (matches qBittorrent format)
        bytesDownloaded: nzbInfo.size * nzbInfo.progress,
        bytesTotal: nzbInfo.size,
        speed: nzbInfo.downloadSpeed,
@@ -9,6 +9,7 @@ import { getFileOrganizer } from '../utils/file-organizer';
 import { RMABLogger } from '../utils/logger';
 import { getLibraryService } from '../services/library';
 import { getConfigService } from '../services/config.service';
+import { generateFilesHash } from '../utils/files-hash';

 /**
 * Process organize files job
@@ -107,11 +108,18 @@ export async function processOrganizeFiles(payload: OrganizeFilesPayload): Promi

    logger.info(`Successfully moved ${result.filesMovedCount} files to ${result.targetPath}`);

-    // Update audiobook record with file path and status
+    // Generate hash from organized audio files for library matching
+    const filesHash = generateFilesHash(result.audioFiles);
+    if (filesHash) {
+      logger.info(`Generated files hash: ${filesHash.substring(0, 16)}... (${result.audioFiles.length} audio files)`);
+    }
+
+    // Update audiobook record with file path, hash, and status
    await prisma.audiobook.update({
      where: { id: audiobookId },
      data: {
        filePath: result.targetPath,
+        filesHash: filesHash || null,
        status: 'completed',
        completedAt: new Date(),
        updatedAt: new Date(),
@@ -189,6 +197,95 @@ export async function processOrganizeFiles(payload: OrganizeFilesPayload): Promi
      );
    }

+    // Cleanup Usenet downloads if configured
+    try {
+      logger.info('Checking if cleanup is needed for this download');
+
+      // Get download history to find NZB ID and indexer
+      const downloadHistory = await prisma.downloadHistory.findFirst({
+        where: { requestId },
+        orderBy: { createdAt: 'desc' },
+      });
+
+      logger.info(`Download history found: ${downloadHistory ? 'yes' : 'no'}`, {
+        hasNzbId: !!downloadHistory?.nzbId,
+        hasIndexerId: !!downloadHistory?.indexerId,
+        nzbId: downloadHistory?.nzbId || 'none',
+        indexerId: downloadHistory?.indexerId || 'none',
+      });
+
+      if (downloadHistory?.nzbId && downloadHistory?.indexerId) {
+        // Get indexer configuration
+        const indexersConfig = await configService.get('prowlarr_indexers');
+        logger.info(`Indexers config found: ${indexersConfig ? 'yes' : 'no'}`);
+
+        if (indexersConfig) {
+          const indexers: Array<{ id: number; protocol: string; removeAfterProcessing?: boolean }> = JSON.parse(indexersConfig);
+          const indexer = indexers.find(idx => idx.id === downloadHistory.indexerId);
+
+          logger.info(`Indexer found in config: ${indexer ? 'yes' : 'no'}`, {
+            indexerId: downloadHistory.indexerId,
+            protocol: indexer?.protocol || 'none',
+            removeAfterProcessing: indexer?.removeAfterProcessing ?? 'undefined',
+          });
+
+          // Check if this is a Usenet indexer with cleanup enabled
+          if (indexer && indexer.protocol?.toLowerCase() !== 'torrent' && indexer.removeAfterProcessing) {
+            logger.info(`Cleaning up NZB ${downloadHistory.nzbId} (cleanup enabled for indexer ${indexer.id})`);
+
+            // First, manually delete files from filesystem
+            if (downloadPath) {
+              logger.info(`Removing download files from filesystem: ${downloadPath}`);
+
+              const fs = await import('fs/promises');
+
+              try {
+                // Check if it's a file or directory
+                const stats = await fs.stat(downloadPath);
+
+                if (stats.isDirectory()) {
+                  // Remove directory and all contents
+                  await fs.rm(downloadPath, { recursive: true, force: true });
+                  logger.info(`Removed directory: ${downloadPath}`);
+                } else {
+                  // Remove single file
+                  await fs.unlink(downloadPath);
+                  logger.info(`Removed file: ${downloadPath}`);
+                }
+              } catch (fsError) {
+                // File/directory might already be deleted or not exist
+                if ((fsError as NodeJS.ErrnoException).code === 'ENOENT') {
+                  logger.info(`Download path already deleted: ${downloadPath}`);
+                } else {
+                  throw fsError;
+                }
+              }
+            } else {
+              logger.warn(`No download path available, skipping filesystem deletion`);
+            }
+
+            // Then archive from SABnzbd history (hides from UI but preserves for troubleshooting)
+            // Note: We only archive from history, not queue. If the NZB is still in the queue
+            // when we're organizing files, something went wrong with the download monitoring.
+            const { getSABnzbdService } = await import('../integrations/sabnzbd.service');
+            const sabnzbd = await getSABnzbdService();
+
+            await sabnzbd.archiveCompletedNZB(downloadHistory.nzbId);
+
+            logger.info(`Successfully archived NZB ${downloadHistory.nzbId} and removed files`);
+          }
+        }
+      }
+    } catch (error) {
+      // Log error but don't fail the job - cleanup is optional
+      logger.warn(
+        `Failed to cleanup NZB download: ${error instanceof Error ? error.message : 'Unknown error'}`,
+        {
+          error: error instanceof Error ? error.stack : undefined,
+        }
+      );
+    }
+
    return {
      success: true,
      message: 'Files organized successfully',
@@ -178,6 +178,77 @@ export async function processPlexRecentlyAddedCheck(payload: PlexRecentlyAddedPa
      }
    }

+    // For Audiobookshelf: Trigger metadata match for items without ASIN
+    // This ensures ASIN gets populated so items can be matched against requests
+    if (backendMode === 'audiobookshelf') {
+      const { triggerABSItemMatch, getABSItem } = await import('../services/audiobookshelf/api');
+      const { generateFilesHash } = await import('../utils/files-hash');
+
+      const itemsWithoutAsin = recentItems.filter(item => !item.asin && item.externalId);
+
+      if (itemsWithoutAsin.length > 0) {
+        logger.info(`Found ${itemsWithoutAsin.length} recent items without ASIN, attempting file hash matching...`);
+
+        let fileMatchCount = 0;
+        let fuzzyMatchCount = 0;
+
+        for (const item of itemsWithoutAsin) {
+          try {
+            // 1. Fetch full item details to get file list
+            const absItem = await getABSItem(item.externalId);
+
+            // 2. Extract audio filenames and generate hash
+            const audioFilenames = absItem.media?.audioFiles?.map((f: any) => f.metadata?.filename).filter(Boolean) || [];
+            const itemHash = generateFilesHash(audioFilenames);
+
+            // 3. Query database for matching downloaded request
+            let matchedAsin: string | undefined = undefined;
+
+            if (itemHash) {
+              const matchedAudiobook = await prisma.audiobook.findFirst({
+                where: {
+                  filesHash: itemHash,
+                  status: 'completed',
+                },
+                select: {
+                  audibleAsin: true,
+                  title: true,
+                },
+              });
+
+              if (matchedAudiobook?.audibleAsin) {
+                matchedAsin = matchedAudiobook.audibleAsin;
+                logger.info(
+                  `File hash match found for "${item.title}" → ASIN: ${matchedAsin} (from "${matchedAudiobook.title}")`
+                );
+                fileMatchCount++;
+              }
+            }
+
+            // 4. Trigger metadata match (with ASIN if matched, undefined if not)
+            await triggerABSItemMatch(item.externalId, matchedAsin);
+
+            if (matchedAsin) {
+              logger.info(`Triggered metadata match with ASIN ${matchedAsin} for: "${item.title}"`);
+            } else {
+              logger.info(`No file match found, triggering fuzzy metadata match for: "${item.title}"`);
+              fuzzyMatchCount++;
+            }
+
+          } catch (error) {
+            logger.error(
+              `Failed to process metadata match for "${item.title}": ${error instanceof Error ? error.message : 'Unknown error'}`
+            );
+            fuzzyMatchCount++;
+          }
+        }
+
+        logger.info(
+          `Metadata match complete: ${fileMatchCount} file hash matches, ${fuzzyMatchCount} fuzzy matches (ASIN population is async)`
+        );
+      }
+    }
+
    // Check for all non-terminal requests to match
    const matchableRequests = await prisma.request.findMany({
      where: {
@@ -259,15 +330,8 @@ export async function processPlexRecentlyAddedCheck(payload: PlexRecentlyAddedPa

            matchedDownloads++;

-            // Trigger metadata match for Audiobookshelf items (only for our downloaded requests)
-            if (backendMode === 'audiobookshelf') {
-              const itemId = match.plexGuid; // plexGuid contains the Audiobookshelf item ID
-              const asin = audiobook.audibleAsin || undefined;
-              const matchInfo = asin ? ` with ASIN ${asin}` : '';
-              logger.info(`Triggering metadata match for matched item: ${itemId}${matchInfo}`);
-              const { triggerABSItemMatch } = await import('../services/audiobookshelf/api');
-              await triggerABSItemMatch(itemId, asin);
-            }
+            // Note: Audiobookshelf metadata matching is handled in the file hash phase above.
+            // Items without ASIN get the ASIN of a file-hash-matched download when one exists (otherwise a match is triggered without an ASIN); items that already have an ASIN are skipped by that phase
          }
        } catch (error) {
          logger.error(`Failed to match request ${request.id}: ${error instanceof Error ? error.message : 'Unknown error'}`);
@@ -180,6 +180,80 @@ export async function processScanPlex(payload: ScanPlexPayload): Promise<any> {

    logger.info(`Scan complete: ${libraryItems.length} items scanned, ${newCount} new, ${updatedCount} updated, ${skippedCount} skipped`);

+    // 4b. For Audiobookshelf: Trigger metadata match for items without ASIN
+    // This ensures ASIN gets populated so items can be matched against requests
+    if (backendMode === 'audiobookshelf') {
+      logger.info(`Checking for Audiobookshelf items without ASIN...`);
+      const { triggerABSItemMatch, getABSItem } = await import('../services/audiobookshelf/api');
+      const { generateFilesHash } = await import('../utils/files-hash');
+
+      const itemsWithoutAsin = libraryItems.filter(item => !item.asin && item.externalId);
+
+      if (itemsWithoutAsin.length > 0) {
+        logger.info(`Found ${itemsWithoutAsin.length} items without ASIN, attempting file hash matching...`);
+
+        let fileMatchCount = 0;
+        let fuzzyMatchCount = 0;
+
+        for (const item of itemsWithoutAsin) {
+          try {
+            // 1. Fetch full item details to get file list
+            const absItem = await getABSItem(item.externalId);
+
+            // 2. Extract audio filenames and generate hash
+            const audioFilenames = absItem.media?.audioFiles?.map((f: any) => f.metadata?.filename).filter(Boolean) || [];
+            const itemHash = generateFilesHash(audioFilenames);
+
+            // 3. Query database for matching downloaded request
+            let matchedAsin: string | undefined = undefined;
+
+            if (itemHash) {
+              const matchedAudiobook = await prisma.audiobook.findFirst({
+                where: {
+                  filesHash: itemHash,
+                  status: 'completed',
+                },
+                select: {
+                  audibleAsin: true,
+                  title: true,
+                },
+              });
+
+              if (matchedAudiobook?.audibleAsin) {
+                matchedAsin = matchedAudiobook.audibleAsin;
+                logger.info(
+                  `File hash match found for "${item.title}" → ASIN: ${matchedAsin} (from "${matchedAudiobook.title}")`
+                );
+                fileMatchCount++;
+              }
+            }
+
+            // 4. Trigger metadata match (with ASIN if matched, undefined if not)
+            await triggerABSItemMatch(item.externalId, matchedAsin);
+
+            if (matchedAsin) {
+              logger.info(`Triggered metadata match with ASIN ${matchedAsin} for: "${item.title}"`);
+            } else {
+              logger.info(`No file match found, triggering fuzzy metadata match for: "${item.title}"`);
+              fuzzyMatchCount++;
+            }
+
+          } catch (error) {
+            logger.error(
+              `Failed to process metadata match for "${item.title}": ${error instanceof Error ? error.message : 'Unknown error'}`
+            );
+            fuzzyMatchCount++;
+          }
+        }
+
+        logger.info(
+          `Metadata match complete: ${fileMatchCount} file hash matches, ${fuzzyMatchCount} fuzzy matches (ASIN population is async)`
+        );
+      } else {
+        logger.info(`All items have ASIN, no metadata match needed`);
+      }
+    }
+
    // 5. Remove stale records from plex_library (items no longer in the actual library)
    // This ensures the database is a fresh snapshot of the library state
    logger.info(`Checking for stale library records...`);
@@ -445,15 +519,8 @@ export async function processScanPlex(payload: ScanPlexPayload): Promise<any> {

          matchedCount++;

-          // Trigger metadata match for Audiobookshelf items (only for our downloaded requests)
-          if (backendMode === 'audiobookshelf') {
-            const itemId = match.plexGuid; // plexGuid contains the Audiobookshelf item ID
-            const asin = audiobook.audibleAsin || undefined;
-            const matchInfo = asin ? ` with ASIN ${asin}` : '';
-            logger.info(`Triggering metadata match for matched item: ${itemId}${matchInfo}`);
-            const { triggerABSItemMatch } = await import('../services/audiobookshelf/api');
-            await triggerABSItemMatch(itemId, asin);
-          }
+          // Note: Audiobookshelf metadata matching is handled in the file hash phase above.
+          // Items without ASIN get the ASIN of a file-hash-matched download when one exists (otherwise a match is triggered without an ASIN); items that already have an ASIN are skipped by that phase
        }
      } catch (error) {
        logger.error(`Failed to match request ${request.id}: ${error instanceof Error ? error.message : 'Unknown error'}`);
@@ -103,13 +103,13 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro

    if (searchResults.length === 0) {
      // No results found - queue for re-search instead of failing
-      logger.warn(`No torrents found for request ${requestId}, marking as awaiting_search`);
+      logger.warn(`No torrents/nzbs found for request ${requestId}, marking as awaiting_search`);

      await prisma.request.update({
        where: { id: requestId },
        data: {
          status: 'awaiting_search',
-          errorMessage: 'No torrents found. Will retry automatically.',
+          errorMessage: 'No torrents/nzbs found. Will retry automatically.',
          lastSearchAt: new Date(),
          updatedAt: new Date(),
        },
@@ -117,7 +117,7 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro

      return {
        success: false,
-        message: 'No torrents found, queued for re-search',
+        message: 'No torrents/nzbs found, queued for re-search',
        requestId,
      };
    }
@@ -149,11 +149,16 @@ export async function processSearchIndexers(payload: SearchIndexersPayload): Pro

    // Rank results with indexer priorities and flag configs
    // Note: rankTorrents now filters out results < 20 MB internally
+    // requireAuthor: true (default) - strict filtering for automatic selection
    const rankedResults = ranker.rankTorrents(searchResults, {
      title: audiobook.title,
      author: audiobook.author,
      durationMinutes,
-    }, indexerPriorities, flagConfigs);
+    }, {
+      indexerPriorities,
+      flagConfigs,
+      requireAuthor: true  // Automatic mode - prevent wrong authors
+    });

    // Log filter results
    const postFilterCount = rankedResults.length;