Fix bulk import: group tagless files by folder, use folder name as search fallback

2026-07-18 02:31:10 +00:00 · 2026-04-10 10:22:01 +02:00
parent 54b54d343a
commit 35cb318389
5 changed files with 140 additions and 45 deletions
@@ -13,9 +13,13 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
 ## Key Details
 - **Access:** Admin-only, modal opened from admin dashboard Quick Actions
 - **Audio detection:** Uses `AUDIO_EXTENSIONS` from `src/lib/constants/audio-formats.ts`
- **Audiobook boundary:** A folder containing audio files = one audiobook; subfolders not scanned further
- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from first audio file
- **Fallback:** If metadata tags are empty, folder name used as search term; "Low Confidence" badge shown
+- **Audiobook boundary:** A folder containing audio files = one audiobook. Files with matching metadata tags are grouped by title+author+narrator. Files with no metadata title tag are all grouped together per folder (one entry, not one per file).
+- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from all audio files in folder
+- **Search term fallback chain** (when no `album` tag):
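+  1. **ASIN in folder name** — scans folder name for the uppercase pattern `B[A-Z0-9]{9}` bounded by bracket/paren/whitespace or the start/end of the name; if found, an ASIN lookup is tried first and accepted only when the top result's ASIN matches exactly, otherwise it falls back to a text search with the regular search term; no badge shown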
+  2. **Folder name** — cleaned (strips bracketed ASIN/year, a leading number prefix, and a trailing `.xxx` suffix; underscores→spaces; whitespace collapsed); skipped if generic (CD1, Disc 2, Part 3, Vol 1, etc.); shows "Low Confidence" badge
+  3. **First file name** — last resort; shows "Low Confidence" badge
+- **Generic folder detection:** `/^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i` — these names are skipped as search terms
 - **Author/narrator dedup:** Splits on `,;& ` delimiters, removes names appearing in both fields
 - **Scan depth:** Max 10 levels recursion
 - **Rate limiting:** 1.5s delay between Audible searches (same as existing scraping rate limit)
@@ -56,7 +60,8 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
 | Already in library | 40% opacity, green "In Library" badge, toggle disabled |
 | Active request exists | 40% opacity, purple "Requested" badge, toggle disabled |
 | No Audible match | Red "No Match" badge, folder name shown, pre-skipped |
-| Low confidence (folder name fallback) | Amber "Low Confidence" badge |
+| ASIN extracted from folder name | No badge (high confidence — direct ASIN lookup) |
+| Low confidence (folder name or file name fallback, no ASIN) | Amber "Low Confidence" badge |

 ## Files

@@ -159,7 +159,29 @@ export async function POST(request: NextRequest) {
              let hasActiveRequest = false;

              try {
-                const searchResult = await audibleService.search(book.searchTerm);
+                // If the scanner extracted an ASIN from the folder name, search for
+                // that ASIN first and accept the result only when the top hit's ASIN
+                // matches exactly. If the lookup throws or does not match, fall back
+                // to the regular text search on book.searchTerm.
+                let searchResult: Awaited<ReturnType<typeof audibleService.search>> | null = null;
+
+                if (book.extractedAsin) {
+                  try {
+                    const asinResult = await audibleService.search(book.extractedAsin);
+                    if (
+                      asinResult.results.length > 0 &&
+                      asinResult.results[0].asin === book.extractedAsin
+                    ) {
+                      searchResult = asinResult;
+                    }
+                  } catch {
+                    /* ASIN lookup failed — fall through to text search */
+                  }
+                }
+
+                if (!searchResult) {
+                  searchResult = await audibleService.search(book.searchTerm);
+                }

                if (searchResult.results.length > 0) {
                  match = searchResult.results[0];
@@ -208,6 +230,7 @@ export async function POST(request: NextRequest) {
                audioFileCount: book.audioFileCount,
                totalSizeBytes: book.totalSizeBytes,
                metadataSource: book.metadataSource,
+                extractedAsin: book.extractedAsin,
                searchTerm: book.searchTerm,
                audioFiles: book.audioFiles,
                match: match
@@ -39,7 +39,12 @@ function BookRow({
  const isDisabled = book.inLibrary || book.hasActiveRequest;
  const isSkipped = book.skipped;
  const hasMatch = book.match !== null;
-  const isLowConfidence = book.metadataSource === 'file_name';
+  // Low confidence when the search term came from a file name or folder name fallback,
+  // but not when an ASIN was extracted from the folder name (an ASIN match is treated as
+  // reliable as embedded tags). NOTE(review): the flag does not tell whether the ASIN lookup succeeded.
+  const isLowConfidence =
+    (book.metadataSource === 'file_name' || book.metadataSource === 'folder_name') &&
+    !book.extractedAsin;

  return (
    <div
@@ -34,7 +34,9 @@ export interface ScannedBook {
  relativePath: string;
  audioFileCount: number;
  totalSizeBytes: number;
-  metadataSource: 'tags' | 'file_name';
+  metadataSource: 'tags' | 'folder_name' | 'file_name';
+  /** ASIN extracted directly from the folder name, if present. */
+  extractedAsin?: string;
  searchTerm: string;
  audioFiles: string[];
  match: AudibleMatch | null;
@@ -21,6 +21,12 @@ export const MAX_SCAN_DEPTH = 10;
 /** Maximum concurrent ffprobe calls for metadata reads. */
 const METADATA_CONCURRENCY = 10;

+/**
+ * Folder names matching this pattern are considered generic and should not be
+ * used as Audible search terms (e.g. "CD1", "Disc 2", "Part 3", "Volume 1").
+ */
+const GENERIC_FOLDER_NAME_RE = /^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i;
+
 /** Metadata extracted from an audio file via ffprobe. */
 export interface AudioFileMetadata {
  title?: string;              // From 'album' tag (book title)
@@ -39,7 +45,8 @@ export interface DiscoveredAudiobook {
  totalSizeBytes: number;
  metadata: AudioFileMetadata;
  searchTerm: string;         // Constructed search query for Audible
-  metadataSource: 'tags' | 'file_name';  // Where the search term came from
+  metadataSource: 'tags' | 'folder_name' | 'file_name';  // Where the search term came from
+  extractedAsin?: string;     // ASIN extracted directly from folder name, if present
  audioFiles: string[];       // File names (relative to folderPath) belonging to this book
  groupingKey: string;        // Normalized key for cross-folder deduplication
 }
@@ -60,6 +67,18 @@ function isAudioFile(filename: string): boolean {
  return (AUDIO_EXTENSIONS as readonly string[]).includes(ext);
 }

+/**
+ * Extract an Audible ASIN from a string (typically a folder name).
+ * Matches 'B' followed by exactly 9 uppercase letters or digits (10 characters total).
+ * The ASIN must be bounded by a bracket, parenthesis, whitespace, or string
+ * boundary to avoid false positives from random alphanumeric sequences.
+ * Returns the first matching ASIN, or null if none is found. Matching is case-sensitive.
+ */
+export function extractAsinFromString(str: string): string | null {
+  const match = str.match(/(?:^|[\s\[\(])([B][A-Z0-9]{9})(?:$|[\s\]\)])/);
+  return match ? match[1] : null;
+}
+
 /**
 * Read audio metadata from a file using ffprobe.
 * Extracts album, album_artist, composer, and title tags.
@@ -140,15 +159,36 @@ export function deduplicateNames(
 }

 /**
- * Build a search term from metadata or file name.
+ * Clean a raw string (folder name or file name) for use as an Audible search term.
+ * Removes a trailing '.xxx' suffix (file extension; also applies to folder names containing a dot),
+ * bracketed 10-character IDs/years, and a leading number prefix; underscores become spaces; whitespace is collapsed.
+ */
+function cleanSearchString(raw: string): string {
+  return raw
+    .replace(/\.[^.]+$/, '')                       // Remove file extension
+    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '')     // Remove ASIN in brackets
+    .replace(/[\[\(]\d{4}[\]\)]/g, '')             // Remove year in brackets
+    .replace(/^\d+[\s._-]+/, '')                   // Remove leading track numbers
+    .replace(/[_]/g, ' ')                           // Underscores to spaces
+    .replace(/\s+/g, ' ')                           // Collapse whitespace
+    .trim();
+}
+
+/**
+ * Build a search term from metadata or folder/file name.
 * Returns the search term and the source it was derived from.
+ *
+ * Fallback chain (when no album metadata tag is present):
+ *   1. Folder name — if provided and not a generic name (CD1, Disc 2, Part 3, etc.)
+ *   2. First audio file name — last resort, always available
+ *
 * When metadata tags are present, constructs "Title Author Narrator ContributingArtists".
- * When tags are empty, falls back to the first audio file's name (cleaned).
 */
 export function buildSearchTerm(
  metadata: AudioFileMetadata,
-  firstFileName: string
-): { searchTerm: string; source: 'tags' | 'file_name' } {
+  firstFileName: string,
+  folderName?: string
+): { searchTerm: string; source: 'tags' | 'folder_name' | 'file_name' } {
  const { author, narrator, contributingArtists } = deduplicateNames(
    metadata.author,
    metadata.narrator,
@@ -165,23 +205,23 @@ export function buildSearchTerm(
    return { searchTerm: parts.join(' '), source: 'tags' };
  }

-  // Fallback: clean up the first audio file name and use it as search term
-  const cleaned = firstFileName
-    .replace(/\.[^.]+$/, '')                       // Remove file extension
-    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '')     // Remove ASIN in brackets
-    .replace(/[\[\(]\d{4}[\]\)]/g, '')             // Remove year in brackets
-    .replace(/^\d+[\s._-]+/, '')                   // Remove leading track numbers
-    .replace(/[_]/g, ' ')                           // Underscores to spaces
-    .replace(/\s+/g, ' ')                           // Collapse whitespace
-    .trim();
+  // Fallback 1: folder name (if provided and not generic)
+  if (folderName && !GENERIC_FOLDER_NAME_RE.test(folderName.trim())) {
+    const cleaned = cleanSearchString(folderName);
+    if (cleaned) {
+      return { searchTerm: cleaned, source: 'folder_name' };
+    }
+  }

+  // Fallback 2: first audio file name
+  const cleaned = cleanSearchString(firstFileName);
  return { searchTerm: cleaned || firstFileName, source: 'file_name' };
 }

 /**
 * Build a normalized grouping key from metadata.
 * Used to determine which files belong to the same book.
- * Returns null if metadata has no title (ungroupable).
+ * Returns null if metadata has no title (ungroupable by metadata).
 */
 function buildGroupingKey(metadata: AudioFileMetadata): string | null {
  if (!metadata.title) return null;
@@ -259,17 +299,23 @@ async function asyncPool<T, R>(
 * Group audio files in a directory by their metadata.
 * Reads metadata from all files using a concurrency pool, then groups them
 * by a normalized key of title + author + narrator.
- * Files with no metadata title each become their own group.
+ *
+ * Files with a metadata title are grouped by their shared key. Files with no
+ * metadata title are all grouped together under a single '__ungrouped_folder'
+ * key (rather than one entry per file), treating the folder as one book.
+ * If a folder contains both tagged and untagged files, the untagged files form
+ * one extra group alongside the tagged groups.
 */
 async function groupAudioFilesByMetadata(
  dirPath: string,
  audioFiles: string[],
-  audioSizes: Map<string, number>
+  audioSizes: Map<string, number>,
+  folderName: string
 ): Promise<Array<{
  files: string[];
  totalSize: number;
  metadata: AudioFileMetadata;
-  metadataSource: 'tags' | 'file_name';
+  metadataSource: 'tags' | 'folder_name' | 'file_name';
  searchTerm: string;
  groupingKey: string;
 }>> {
@@ -291,14 +337,12 @@ async function groupAudioFilesByMetadata(
    metadata: AudioFileMetadata;
  }>();

-  let ungroupedCounter = 0;
-
  for (const { fileName, metadata } of metadataResults) {
    const key = buildGroupingKey(metadata);
    const fileSize = audioSizes.get(fileName) || 0;

    if (key) {
-      // Has metadata — group with others sharing the same key
+      // Has metadata title — group with others sharing the same key
      const existing = groups.get(key);
      if (existing) {
        existing.files.push(fileName);
@@ -311,20 +355,28 @@ async function groupAudioFilesByMetadata(
        });
      }
    } else {
-      // No title metadata — treat as individual book
-      const uniqueKey = `__ungrouped_${ungroupedCounter++}`;
-      groups.set(uniqueKey, {
-        files: [fileName],
-        totalSize: fileSize,
-        metadata,
-      });
+      // No title metadata — collect all such files under one folder-level group.
+      // Key must start with '__ungrouped_' so deduplicateDiscoveries treats it
+      // as unique per folder (prefixes it with folderPath before deduplication).
+      const ungroupedKey = '__ungrouped_folder';
+      const existing = groups.get(ungroupedKey);
+      if (existing) {
+        existing.files.push(fileName);
+        existing.totalSize += fileSize;
+      } else {
+        groups.set(ungroupedKey, {
+          files: [fileName],
+          totalSize: fileSize,
+          metadata,
+        });
+      }
    }
  }

  // Build result with search terms
  return Array.from(groups.entries()).map(([groupingKey, group]) => {
    group.files.sort((a, b) => a.localeCompare(b));
-    const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0]);
+    const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0], folderName);
    return {
      files: group.files,
      totalSize: group.totalSize,
@@ -398,6 +450,7 @@ function deduplicateDiscoveries(
      metadata: first.metadata,
      searchTerm: first.searchTerm,
      metadataSource: first.metadataSource,
+      extractedAsin: first.extractedAsin,
      audioFiles: combinedFiles,
      groupingKey: first.groupingKey,
    });
@@ -434,9 +487,10 @@ function findCommonParent(paths: string[]): string {
 *
 * Scans every folder for audio files. When audio files are found, they are
 * grouped by metadata (title + author + narrator) — each group becomes a
- * separate discovered audiobook. Files with no metadata are treated as
- * individual books. Scanning ALWAYS recurses into subfolders regardless of
- * whether the current folder has audio files.
+ * separate discovered audiobook. Files with no metadata title are all grouped
+ * together per folder (treated as one book) rather than one entry per file.
+ * Scanning ALWAYS recurses into subfolders regardless of whether the current
+ * folder has audio files.
 *
 * After the full walk, discoveries sharing the same grouping key across
 * different folders (e.g., CD1/ and CD2/) are merged.
@@ -460,11 +514,13 @@ export async function discoverAudiobooks(

    foldersScanned++;

+    const folderName = path.basename(currentPath);
+
    onProgress?.({
      phase: 'discovering',
      foldersScanned,
      audiobooksFound: results.length,
-      currentFolder: path.basename(currentPath),
+      currentFolder: folderName,
    });

    // Check if this folder contains audio files
@@ -486,19 +542,22 @@ export async function discoverAudiobooks(
        phase: 'grouping',
        foldersScanned,
        audiobooksFound: results.length,
-        currentFolder: path.basename(currentPath),
+        currentFolder: folderName,
      });

-      // Group audio files by metadata
+      // Group audio files by metadata, passing folder name for fallback search terms
      const groups = await groupAudioFilesByMetadata(
        currentPath,
        audioResult.audioFiles,
-        audioSizes
+        audioSizes,
+        folderName
      );

-      const folderName = path.basename(currentPath);
      const relativePath = path.relative(rootPath, currentPath).replace(/\\/g, '/');

+      // Extract ASIN from folder name once for all groups in this folder
+      const extractedAsin = extractAsinFromString(folderName) ?? undefined;
+
      for (const group of groups) {
        results.push({
          folderPath: currentPath.replace(/\\/g, '/'),
@@ -509,6 +568,7 @@ export async function discoverAudiobooks(
          metadata: group.metadata,
          searchTerm: group.searchTerm,
          metadataSource: group.metadataSource,
+          extractedAsin,
          audioFiles: group.files,
          groupingKey: group.groupingKey,
        });
@@ -518,7 +578,7 @@ export async function discoverAudiobooks(
        phase: 'reading_metadata',
        foldersScanned,
        audiobooksFound: results.length,
-        currentFolder: path.basename(currentPath),
+        currentFolder: folderName,
      });
    }