Merge pull request #173 from MattiasC/feature/bulk-import-folder-fallback

Bulk import enhancement: group tagless files by folder and use folder name as search fallback
2026-06-03 21:00:09 +00:00 · 2026-05-14 16:15:41 -04:00
parent d1a980e210 8376355233
commit 1711d256c2
5 changed files with 169 additions and 47 deletions
@@ -13,9 +13,13 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
 ## Key Details
 - **Access:** Admin-only, modal opened from admin dashboard Quick Actions
 - **Audio detection:** Uses `AUDIO_EXTENSIONS` from `src/lib/constants/audio-formats.ts`
- **Audiobook boundary:** A folder containing audio files = one audiobook; subfolders not scanned further
+- **Audiobook boundary:** A folder containing audio files = one audiobook. Files with matching metadata tags are grouped by title+author+narrator. Files with no metadata title tag are all grouped together per folder (one entry, not one per file).
- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from first audio file
+- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from all audio files in folder
- **Fallback:** If metadata tags are empty, folder name used as search term; "Low Confidence" badge shown
+- **Search term fallback chain** (when no `album` tag):
  1. **ASIN in folder name** — scans folder name for pattern `B[A-Z0-9]{9}` bounded by bracket/paren/space; if found, uses direct ASIN lookup instead of text search; no badge shown
  2. **Folder name** — cleaned (strips bracketed ASIN/year, underscores→spaces); skipped if generic (CD1, Disc 2, Part 3, Vol 1, etc.); shows "Low Confidence" badge
  3. **First file name** — last resort; shows "Low Confidence" badge
 - **Generic folder detection:** `/^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i` — these names are skipped as search terms
 - **Author/narrator dedup:** Splits on `,;& ` delimiters, removes names appearing in both fields
 - **Scan depth:** Max 10 levels recursion
 - **Rate limiting:** 1.5s delay between Audible searches (same as existing scraping rate limit)
@@ -56,7 +60,8 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
 | Already in library | 40% opacity, green "In Library" badge, toggle disabled |
 | Active request exists | 40% opacity, purple "Requested" badge, toggle disabled |
 | No Audible match | Red "No Match" badge, folder name shown, pre-skipped |
-| Low confidence (folder name fallback) | Amber "Low Confidence" badge |
+| ASIN extracted from folder name | No badge (high confidence — direct ASIN lookup) |
 | Low confidence (folder name or file name fallback, no ASIN) | Amber "Low Confidence" badge |
 ## Files
@@ -159,10 +159,42 @@ export async function POST(request: NextRequest) {
              let hasActiveRequest = false;
              try {
-                const searchResult = await audibleService.search(book.searchTerm);
+                // If the scanner extracted an ASIN directly from the folder name,
                // use a direct ASIN lookup (Audnexus API) — more reliable than a
                // keyword text search. Fall back to text search if the lookup fails.
                if (book.extractedAsin) {
                  try {
                    const asinResult = await audibleService.getAudiobookDetails(book.extractedAsin);
                    if (asinResult) {
                      match = asinResult;
                    }
                  } catch {
                    /* ASIN lookup failed — fall through to text search */
                  }
                }
                if (!match) {
                  // When an ASIN was extracted from the folder name but the direct
                  // lookup failed, prefer the folder name as the text search term
                  // over book.searchTerm. book.searchTerm may come from a single
                  // tagged file whose album tag is unreliable (e.g. a series name
                  // or intro track), whereas the folder name is the human-assigned
                  // title and is more likely to be accurate.
                  const textSearchTerm = book.extractedAsin
                    ? book.folderName
                        .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // strip ASIN
                        .replace(/[\[\(]\d{4}[\]\)]/g, '')         // strip year
                        .replace(/[_]/g, ' ')
                        .replace(/\s+/g, ' ')
                        .trim()
                    : book.searchTerm;
                  const searchResult = await audibleService.search(textSearchTerm);
                  if (searchResult.results.length > 0) {
                    match = searchResult.results[0];
                  }
                }
                if (match) {
                  // Check library availability
                  const plexMatch = await findPlexMatch({
@@ -208,6 +240,7 @@ export async function POST(request: NextRequest) {
                audioFileCount: book.audioFileCount,
                totalSizeBytes: book.totalSizeBytes,
                metadataSource: book.metadataSource,
                extractedAsin: book.extractedAsin,
                searchTerm: book.searchTerm,
                audioFiles: book.audioFiles,
                match: match
@@ -39,7 +39,12 @@ function BookRow({
  const isDisabled = book.inLibrary || book.hasActiveRequest;
  const isSkipped = book.skipped;
  const hasMatch = book.match !== null;
-  const isLowConfidence = book.metadataSource === 'file_name';
+  // Low confidence when search term came from a filename or folder name fallback,
  // BUT not when an ASIN was extracted directly from the folder name (that's a
  // direct lookup and is as reliable as embedded metadata tags).
  const isLowConfidence =
    (book.metadataSource === 'file_name' || book.metadataSource === 'folder_name') &&
    !book.extractedAsin;
  return (
    <div
@@ -34,7 +34,9 @@ export interface ScannedBook {
  relativePath: string;
  audioFileCount: number;
  totalSizeBytes: number;
-  metadataSource: 'tags' | 'file_name';
+  metadataSource: 'tags' | 'folder_name' | 'file_name';
  /** ASIN extracted directly from the folder name, if present. */
  extractedAsin?: string;
  searchTerm: string;
  audioFiles: string[];
  match: AudibleMatch | null;
@@ -21,6 +21,12 @@ export const MAX_SCAN_DEPTH = 10;
 /** Maximum concurrent ffprobe calls for metadata reads. */
 const METADATA_CONCURRENCY = 10;
 /**
 * Folder names matching this pattern are considered generic and should not be
 * used as Audible search terms (e.g. "CD1", "Disc 2", "Part 3", "Volume 1").
 *
 * Whitespace, '.', '_' and '-' are all accepted between the word and the
 * number, so variants such as "CD_1", "Disc-2" and "Vol. 3" are recognised as
 * generic too. The whole name must match: "Part 3 - The Return" is not generic.
 */
 const GENERIC_FOLDER_NAME_RE = /^(?:cd|disc|disk|part|vol(?:ume)?)[\s._-]*\d+$/i;
 /** Metadata extracted from an audio file via ffprobe. */
 export interface AudioFileMetadata {
  title?: string;              // From 'album' tag (book title)
@@ -39,7 +45,8 @@ export interface DiscoveredAudiobook {
  totalSizeBytes: number;
  metadata: AudioFileMetadata;
  searchTerm: string;         // Constructed search query for Audible
-  metadataSource: 'tags' | 'file_name';  // Where the search term came from
+  metadataSource: 'tags' | 'folder_name' | 'file_name';  // Where the search term came from
  extractedAsin?: string;     // ASIN extracted directly from folder name, if present
  audioFiles: string[];       // File names (relative to folderPath) belonging to this book
  groupingKey: string;        // Normalized key for cross-folder deduplication
 }
@@ -60,6 +67,18 @@ function isAudioFile(filename: string): boolean {
  return (AUDIO_EXTENSIONS as readonly string[]).includes(ext);
 }
 /**
 * Extract an Audible ASIN from a string (typically a folder name).
 *
 * A candidate is a 10-character token that starts with 'B', consists only of
 * uppercase letters and digits, and contains at least one digit. The digit
 * requirement stops ordinary all-caps words that happen to be ten letters long
 * and start with 'B' (e.g. "BLACKSMITH") from being reported as ASINs.
 *
 * The token must be preceded by the start of the string, whitespace, '[' or
 * '(' and followed by the end of the string, whitespace, ']' or ')', to avoid
 * false positives from longer alphanumeric sequences.
 *
 * @param str - Text to scan.
 * @returns The first matching ASIN, or null if none is found.
 */
 export function extractAsinFromString(str: string): string | null {
  // The lookahead (?=[A-Z0-9]*\d) demands a digit somewhere after the leading 'B'.
  // Because the token must be followed by a non-alphanumeric boundary, that
  // digit necessarily lies inside the 10-character candidate.
  const match = str.match(/(?:^|[\s\[\(])(B(?=[A-Z0-9]*\d)[A-Z0-9]{9})(?:$|[\s\]\)])/);
  return match?.[1] ?? null;
 }
 /**
 * Read audio metadata from a file using ffprobe.
 * Extracts album, album_artist, composer, and title tags.
@@ -140,15 +159,36 @@ export function deduplicateNames(
 }
 /**
- * Build a search term from metadata or file name.
+ * Clean a raw string (folder name or file name) for use as an Audible search term.
 * Strips file extension, bracketed ASINs, bracketed years, leading track numbers,
 * underscores, and collapses whitespace.
 */
 function cleanSearchString(raw: string): string {
  return raw
    // Drop a trailing extension only when it looks like one: a dot followed by
    // 2-5 letters/digits at the very end. Folder names pass through here too,
    // and a bare "last dot to end of string" rule would truncate names such as
    // "Dr. Seuss Collection" to "Dr".
    .replace(/\.[A-Za-z0-9]{2,5}$/, '')
    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '')     // Remove ASIN in brackets
    .replace(/[\[\(]\d{4}[\]\)]/g, '')             // Remove year in brackets
    .replace(/^\d+[\s._-]+/, '')                   // Remove leading track numbers
    .replace(/[_]/g, ' ')                           // Underscores to spaces
    .replace(/\s+/g, ' ')                           // Collapse whitespace
    .trim();
 }
 /**
 * Build a search term from metadata or folder/file name.
 * Returns the search term and the source it was derived from.
 *
 * Fallback chain (when no album metadata tag is present):
 *   1. Folder name — if provided and not a generic name (CD1, Disc 2, Part 3, etc.)
 *   2. First audio file name — last resort, always available
 *
 * When metadata tags are present, constructs "Title Author Narrator ContributingArtists".
 * When tags are empty, falls back to the cleaned folder name (unless it is missing or generic), and otherwise to the first audio file's name (cleaned).
 */
 export function buildSearchTerm(
  metadata: AudioFileMetadata,
-  firstFileName: string
+  firstFileName: string,
-): { searchTerm: string; source: 'tags' | 'file_name' } {
+  folderName?: string
 ): { searchTerm: string; source: 'tags' | 'folder_name' | 'file_name' } {
  const { author, narrator, contributingArtists } = deduplicateNames(
    metadata.author,
    metadata.narrator,
@@ -165,23 +205,23 @@ export function buildSearchTerm(
    return { searchTerm: parts.join(' '), source: 'tags' };
  }
-  // Fallback: clean up the first audio file name and use it as search term
+  // Fallback 1: folder name (if provided and not generic)
-  const cleaned = firstFileName
+  if (folderName && !GENERIC_FOLDER_NAME_RE.test(folderName.trim())) {
-    .replace(/\.[^.]+$/, '')                       // Remove file extension
+    const cleaned = cleanSearchString(folderName);
-    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '')     // Remove ASIN in brackets
+    if (cleaned) {
-    .replace(/[\[\(]\d{4}[\]\)]/g, '')             // Remove year in brackets
+      return { searchTerm: cleaned, source: 'folder_name' };
-    .replace(/^\d+[\s._-]+/, '')                   // Remove leading track numbers
+    }
-    .replace(/[_]/g, ' ')                           // Underscores to spaces
+  }
    .replace(/\s+/g, ' ')                           // Collapse whitespace
    .trim();
  // Fallback 2: first audio file name
  const cleaned = cleanSearchString(firstFileName);
  return { searchTerm: cleaned || firstFileName, source: 'file_name' };
 }
 /**
 * Build a normalized grouping key from metadata.
 * Used to determine which files belong to the same book.
- * Returns null if metadata has no title (ungroupable).
+ * Returns null if metadata has no title (ungroupable by metadata).
 */
 function buildGroupingKey(metadata: AudioFileMetadata): string | null {
  if (!metadata.title) return null;
@@ -259,17 +299,23 @@ async function asyncPool<T, R>(
 * Group audio files in a directory by their metadata.
 * Reads metadata from all files using a concurrency pool, then groups them
 * by a normalized key of title + author + narrator.
- * Files with no metadata title each become their own group.
+ *
 * Files with a metadata title are grouped by their shared key. Files with no
 * metadata title are all grouped together under a single '__ungrouped_folder'
 * key (rather than one entry per file), treating the folder as one book.
 * If a folder contains both tagged and untagged files, the untagged files form
 * one extra group alongside the tagged groups.
 */
 async function groupAudioFilesByMetadata(
  dirPath: string,
  audioFiles: string[],
-  audioSizes: Map<string, number>
+  audioSizes: Map<string, number>,
  folderName: string
 ): Promise<Array<{
  files: string[];
  totalSize: number;
  metadata: AudioFileMetadata;
-  metadataSource: 'tags' | 'file_name';
+  metadataSource: 'tags' | 'folder_name' | 'file_name';
  searchTerm: string;
  groupingKey: string;
 }>> {
@@ -291,14 +337,12 @@ async function groupAudioFilesByMetadata(
    metadata: AudioFileMetadata;
  }>();
  let ungroupedCounter = 0;
  for (const { fileName, metadata } of metadataResults) {
    const key = buildGroupingKey(metadata);
    const fileSize = audioSizes.get(fileName) || 0;
    if (key) {
-      // Has metadata — group with others sharing the same key
+      // Has metadata title — group with others sharing the same key
      const existing = groups.get(key);
      if (existing) {
        existing.files.push(fileName);
@@ -311,20 +355,45 @@ async function groupAudioFilesByMetadata(
        });
      }
    } else {
-      // No title metadata — treat as individual book
+      // No title metadata — collect all such files under one folder-level group.
-      const uniqueKey = `__ungrouped_${ungroupedCounter++}`;
+      // Key must start with '__ungrouped_' so deduplicateDiscoveries treats it
-      groups.set(uniqueKey, {
+      // as unique per folder (prefixes it with folderPath before deduplication).
      const ungroupedKey = '__ungrouped_folder';
      const existing = groups.get(ungroupedKey);
      if (existing) {
        existing.files.push(fileName);
        existing.totalSize += fileSize;
      } else {
        groups.set(ungroupedKey, {
          files: [fileName],
          totalSize: fileSize,
          metadata,
        });
      }
    }
  }
  // If there is exactly one tagged group alongside an ungrouped group, absorb
  // the untagged files into the tagged group. Untagged files in the same folder
  // almost certainly belong to the same book (e.g. one chapter was ripped
  // without tags, or a cover/intro file carries different metadata).
  // Only do this when there is a single tagged group — multiple tagged groups
  // mean genuinely different books are mixed in the folder, so keep them separate.
  const ungrouped = groups.get('__ungrouped_folder');
  if (ungrouped) {
    const taggedKeys = Array.from(groups.keys()).filter((k) => k !== '__ungrouped_folder');
    if (taggedKeys.length === 1) {
      const taggedGroup = groups.get(taggedKeys[0])!;
      taggedGroup.files.push(...ungrouped.files);
      taggedGroup.totalSize += ungrouped.totalSize;
      groups.delete('__ungrouped_folder');
    }
  }
  // Build result with search terms
  return Array.from(groups.entries()).map(([groupingKey, group]) => {
    group.files.sort((a, b) => a.localeCompare(b));
-    const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0]);
+    const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0], folderName);
    return {
      files: group.files,
      totalSize: group.totalSize,
@@ -398,6 +467,7 @@ function deduplicateDiscoveries(
      metadata: first.metadata,
      searchTerm: first.searchTerm,
      metadataSource: first.metadataSource,
      extractedAsin: first.extractedAsin,
      audioFiles: combinedFiles,
      groupingKey: first.groupingKey,
    });
@@ -434,9 +504,10 @@ function findCommonParent(paths: string[]): string {
 *
 * Scans every folder for audio files. When audio files are found, they are
 * grouped by metadata (title + author + narrator) — each group becomes a
- * separate discovered audiobook. Files with no metadata are treated as
+ * separate discovered audiobook. Files with no metadata are all grouped
- * individual books. Scanning ALWAYS recurses into subfolders regardless of
+ * together per folder (treated as one book) rather than one entry per file.
- * whether the current folder has audio files.
+ * Scanning ALWAYS recurses into subfolders regardless of whether the current
 * folder has audio files.
 *
 * After the full walk, discoveries sharing the same grouping key across
 * different folders (e.g., CD1/ and CD2/) are merged.
@@ -460,11 +531,13 @@ export async function discoverAudiobooks(
    foldersScanned++;
    const folderName = path.basename(currentPath);
    onProgress?.({
      phase: 'discovering',
      foldersScanned,
      audiobooksFound: results.length,
-      currentFolder: path.basename(currentPath),
+      currentFolder: folderName,
    });
    // Check if this folder contains audio files
@@ -486,19 +559,22 @@ export async function discoverAudiobooks(
        phase: 'grouping',
        foldersScanned,
        audiobooksFound: results.length,
-        currentFolder: path.basename(currentPath),
+        currentFolder: folderName,
      });
-      // Group audio files by metadata
+      // Group audio files by metadata, passing folder name for fallback search terms
      const groups = await groupAudioFilesByMetadata(
        currentPath,
        audioResult.audioFiles,
-        audioSizes
+        audioSizes,
        folderName
      );
      const folderName = path.basename(currentPath);
      const relativePath = path.relative(rootPath, currentPath).replace(/\\/g, '/');
      // Extract ASIN from folder name once for all groups in this folder
      const extractedAsin = extractAsinFromString(folderName) ?? undefined;
      for (const group of groups) {
        results.push({
          folderPath: currentPath.replace(/\\/g, '/'),
@@ -509,6 +585,7 @@ export async function discoverAudiobooks(
          metadata: group.metadata,
          searchTerm: group.searchTerm,
          metadataSource: group.metadataSource,
          extractedAsin,
          audioFiles: group.files,
          groupingKey: group.groupingKey,
        });
@@ -518,7 +595,7 @@ export async function discoverAudiobooks(
        phase: 'reading_metadata',
        foldersScanned,
        audiobooksFound: results.length,
-        currentFolder: path.basename(currentPath),
+        currentFolder: folderName,
      });
    }