Add admin Bulk Import feature

Introduce a Bulk Import feature for admins to scan server folders, match discovered audiobook folders against Audible, review matches, and queue batch imports.

What changed:
- Added documentation: documentation/features/bulk-import.md and TABLEOFCONTENTS update.
- Backend: SSE scan endpoint (POST /api/admin/bulk-import/scan) streams discovery and matching events; execute endpoint (POST /api/admin/bulk-import/execute) validates paths, creates/resolves audiobook & request records, and queues organize_files jobs. Both endpoints enforce admin-only access and validate allowed root directories (download_dir, media_dir, /bookdrop).
- Frontend: Modal wizard and steps for folder selection, scan progress, and match review (BulkImportWizard + ScanFolderStep, ScanProgressStep, MatchReviewStep + shared types).
- Utilities: bulk-import-scanner for folder discovery and ffprobe metadata extraction; shared types for scanned books/events.
- UI: Added Bulk Import quick action to admin dashboard (src/app/admin/page.tsx).

Key details:
- Audible searches are rate-limited (≈1.5s) and matching results include library/request status checks.
- Reuses existing organize_files job queue and manual-import pipeline; no new database tables introduced (state is ephemeral during the wizard).
- Includes error handling, path normalization, and security checks for allowed directories.

This commit wires frontend, backend, and docs together to provide an admin-only multi-step bulk import workflow.
2026-06-03 21:00:09 +00:00 · 2026-03-13 12:03:21 -04:00
parent 6fc622c4e7
commit c8bfcdb611
11 changed files with 2275 additions and 1 deletions
@@ -0,0 +1,294 @@
+/**
+ * Component: Bulk Import Scanner Utility
+ * Documentation: documentation/features/bulk-import.md
+ *
+ * Recursively discovers audiobook folders, reads embedded metadata via ffprobe,
+ * and prepares search terms for Audible matching. Used by the bulk import API.
+ */
+
+import { exec, execFile } from 'child_process';
+import { promisify } from 'util';
+import path from 'path';
+import fs from 'fs/promises';
+import { AUDIO_EXTENSIONS } from '../constants/audio-formats';
+
+/** Promise-returning wrapper around `child_process.exec`, created with `util.promisify`. */
+const execPromise = promisify(exec);
+
+/**
+ * Maximum recursion depth for folder scanning.
+ *
+ * The scan root counts as depth 0; `discoverAudiobooks` ignores any folder
+ * whose depth is greater than this value.
+ */
+export const MAX_SCAN_DEPTH = 10;
+
+/**
+ * Metadata extracted from an audio file via ffprobe.
+ *
+ * Every field is optional: `readAudioMetadata` only fills in the tags it
+ * finds and returns an empty object when ffprobe fails or its output cannot
+ * be parsed.
+ */
+export interface AudioFileMetadata {
+  /** Book title, taken from the 'album' tag. */
+  title?: string;
+  /** Author, taken from the 'album_artist' tag. */
+  author?: string;
+  /** Narrator, taken from the 'composer' tag. */
+  narrator?: string;
+  /** Contributing artists, taken from the 'artist' tag. */
+  contributingArtists?: string;
+  /** Chapter/track name, taken from the 'title' tag. */
+  trackTitle?: string;
+}
+
+/**
+ * A discovered audiobook folder with its metadata and file info.
+ *
+ * `discoverAudiobooks` produces one entry for each folder that directly
+ * contains at least one audio file.
+ */
+export interface DiscoveredAudiobook {
+  /** Path of the folder, with backslashes converted to forward slashes. */
+  folderPath: string;
+  /** Last path segment of the folder. */
+  folderName: string;
+  /**
+   * Path relative to the scan root (forward slashes). Falls back to
+   * `folderName` when the relative path is empty, i.e. the folder is the
+   * scan root itself.
+   */
+  relativePath: string;
+  /** Number of audio files found directly inside the folder (subfolders are not counted). */
+  audioFileCount: number;
+  /** Combined size in bytes of those audio files; files that could not be stat'ed contribute nothing. */
+  totalSizeBytes: number;
+  /** Tags read from the first audio file of the folder (files are sorted by name). */
+  metadata: AudioFileMetadata;
+  /** Constructed search query for Audible. */
+  searchTerm: string;
+  /** Where the search term came from: embedded tags or the first audio file's name. */
+  metadataSource: 'tags' | 'file_name';
+}
+
+/**
+ * Progress payload passed to the `onProgress` callback of `discoverAudiobooks`
+ * so the caller can stream updates.
+ */
+export interface ScanProgress {
+  /**
+   * 'discovering' is reported when a folder is entered; 'reading_metadata' is
+   * reported for a folder that contains audio files, once its metadata has
+   * been read.
+   */
+  phase: 'discovering' | 'reading_metadata';
+  /** Number of folders visited so far, including the current one. */
+  foldersScanned: number;
+  /** Number of audiobook folders found so far (includes the current one in the 'reading_metadata' phase). */
+  audiobooksFound: number;
+  /** Base name of the folder currently being processed. */
+  currentFolder?: string;
+}
+
+/**
+ * Tell whether a file name ends in one of the supported audio extensions.
+ * The extension is lower-cased before it is compared against AUDIO_EXTENSIONS.
+ */
+function isAudioFile(filename: string): boolean {
+  const extension = path.extname(filename).toLowerCase();
+  const supported = AUDIO_EXTENSIONS as readonly string[];
+  return supported.some((candidate) => candidate === extension);
+}
+
+/** Promise-returning wrapper around `execFile`; used to run ffprobe without a shell. */
+const execFilePromise = promisify(execFile);
+
+/**
+ * Read audio metadata from a file using ffprobe.
+ * Extracts album, album_artist, composer, artist, and title tags.
+ * Returns empty metadata on any failure (non-blocking).
+ *
+ * ffprobe is spawned directly with an argument array instead of through a
+ * shell command string, so file names containing quotes, `$`, backticks or
+ * other shell metacharacters are passed through verbatim and cannot alter
+ * the command that is executed.
+ *
+ * @param filePath - Path of the audio file to probe
+ * @returns The tags that were found; `{}` when ffprobe is missing, fails,
+ *          times out (15 s) or prints something that is not valid JSON
+ */
+export async function readAudioMetadata(filePath: string): Promise<AudioFileMetadata> {
+  try {
+    const { stdout } = await execFilePromise(
+      'ffprobe',
+      ['-v', 'quiet', '-print_format', 'json', '-show_format', filePath],
+      { timeout: 15000 }
+    );
+    const parsed: unknown = JSON.parse(stdout);
+    const rawTags: unknown = (parsed as { format?: { tags?: unknown } } | null)?.format?.tags;
+
+    // Tag names are not consistently cased across containers/taggers, so index
+    // them by lower-cased name. An exactly lower-case key wins over other
+    // spellings of the same name; empty and non-string values are ignored.
+    const tagsByLowerName = new Map<string, string>();
+    if (rawTags && typeof rawTags === 'object') {
+      for (const [key, value] of Object.entries(rawTags as Record<string, unknown>)) {
+        if (typeof value !== 'string' || value === '') continue;
+        const lowerKey = key.toLowerCase();
+        if (key === lowerKey || !tagsByLowerName.has(lowerKey)) {
+          tagsByLowerName.set(lowerKey, value);
+        }
+      }
+    }
+
+    // Return the first candidate tag that is present, in priority order.
+    const pickTag = (...candidates: string[]): string | undefined => {
+      for (const candidate of candidates) {
+        const value = tagsByLowerName.get(candidate);
+        if (value !== undefined) return value;
+      }
+      return undefined;
+    };
+
+    return {
+      title: pickTag('album'),
+      author: pickTag('album_artist', 'album artist', 'albumartist'),
+      narrator: pickTag('composer'),
+      contributingArtists: pickTag('artist', 'contributing artists'),
+      trackTitle: pickTag('title'),
+    };
+  } catch {
+    // Best effort: a missing ffprobe binary, unreadable file, timeout or
+    // malformed output must not abort the surrounding scan.
+    return {};
+  }
+}
+
+/**
+ * Remove names that are repeated across the author, narrator and
+ * contributing-artists tags.
+ *
+ * Each raw value is split on `,`, `;` and `&`; names are compared
+ * case-insensitively with runs of whitespace collapsed.
+ * - author: names that also appear in the narrator list are dropped. If that
+ *   leaves nothing, the raw author string is returned unchanged.
+ * - narrator: always returned as given (empty string becomes undefined).
+ * - contributingArtists: names already present in the author or narrator
+ *   lists are dropped; undefined when nothing is left.
+ *
+ * Surviving names are re-joined with ", ".
+ */
+export function deduplicateNames(
+  rawAuthor?: string,
+  rawNarrator?: string,
+  rawContributingArtists?: string
+): { author?: string; narrator?: string; contributingArtists?: string } {
+  const toKey = (name: string): string => name.toLowerCase().replace(/\s+/g, ' ').trim();
+
+  const toList = (raw?: string): string[] => {
+    if (!raw) return [];
+    return raw
+      .split(/[,;&]/)
+      .map((piece) => piece.trim())
+      .filter((piece) => piece.length > 0);
+  };
+
+  const authors = toList(rawAuthor);
+  const narratorKeys = new Set(toList(rawNarrator).map(toKey));
+
+  // Everything already credited as author or narrator.
+  const creditedKeys = new Set(narratorKeys);
+  for (const authorName of authors) {
+    creditedKeys.add(toKey(authorName));
+  }
+
+  const authorsOnly = authors.filter((name) => !narratorKeys.has(toKey(name)));
+  const extraArtists = toList(rawContributingArtists).filter(
+    (name) => !creditedKeys.has(toKey(name))
+  );
+
+  let author: string | undefined;
+  if (authorsOnly.length > 0) {
+    author = authorsOnly.join(', ');
+  } else {
+    author = rawAuthor || undefined;
+  }
+
+  let contributingArtists: string | undefined;
+  if (extraArtists.length > 0) {
+    contributingArtists = extraArtists.join(', ');
+  }
+
+  return {
+    author,
+    narrator: rawNarrator || undefined,
+    contributingArtists,
+  };
+}
+
+/**
+ * Build the Audible search term for a folder and report where it came from.
+ *
+ * With a title tag present the term is "Title Author Narrator
+ * ContributingArtists" (names de-duplicated, missing parts skipped) and the
+ * source is 'tags'. Without a title the first audio file's name is cleaned up
+ * and used instead, with source 'file_name'; if cleaning leaves nothing, the
+ * untouched file name is used.
+ *
+ * @param metadata - Tags read from the folder's first audio file
+ * @param firstFileName - Name of that file, used as the fallback
+ */
+export function buildSearchTerm(
+  metadata: AudioFileMetadata,
+  firstFileName: string
+): { searchTerm: string; source: 'tags' | 'file_name' } {
+  const title = metadata.title;
+
+  if (!title) {
+    // No usable title tag: derive the term from the file name. The steps run
+    // in order, each on the output of the previous one.
+    const cleanupSteps: Array<[RegExp, string]> = [
+      [/\.[^.]+$/, ''],                       // Remove file extension
+      [/[\[\(][A-Z0-9]{10}[\]\)]/g, ''],      // Remove ASIN in brackets
+      [/[\[\(]\d{4}[\]\)]/g, ''],             // Remove year in brackets
+      [/^\d+[\s._-]+/, ''],                   // Remove leading track numbers
+      [/[_]/g, ' '],                          // Underscores to spaces
+      [/\s+/g, ' '],                          // Collapse whitespace
+    ];
+    const fromFileName = cleanupSteps
+      .reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), firstFileName)
+      .trim();
+
+    return { searchTerm: fromFileName || firstFileName, source: 'file_name' };
+  }
+
+  const names = deduplicateNames(
+    metadata.author,
+    metadata.narrator,
+    metadata.contributingArtists
+  );
+
+  const searchTerm = [title, names.author, names.narrator, names.contributingArtists]
+    .filter((part): part is string => Boolean(part))
+    .join(' ');
+
+  return { searchTerm, source: 'tags' };
+}
+
+/**
+ * Look for audio files directly inside one directory (no recursion).
+ *
+ * @param dirPath - Directory to inspect
+ * @returns The audio file names sorted with `localeCompare` plus their
+ *          combined size in bytes, or null when the directory holds no audio
+ *          files or cannot be read. Files that cannot be stat'ed are still
+ *          listed but add nothing to the size.
+ */
+async function scanDirectoryForAudio(
+  dirPath: string
+): Promise<{ audioFiles: string[]; totalSize: number } | null> {
+  try {
+    const entries = await fs.readdir(dirPath, { withFileTypes: true });
+    const audioFiles = entries
+      .filter((entry) => entry.isFile() && isAudioFile(entry.name))
+      .map((entry) => entry.name);
+
+    if (audioFiles.length === 0) {
+      return null;
+    }
+
+    let totalSize = 0;
+    for (const fileName of audioFiles) {
+      try {
+        const info = await fs.stat(path.join(dirPath, fileName));
+        totalSize += info.size;
+      } catch {
+        /* skip unreadable files */
+      }
+    }
+
+    audioFiles.sort((left, right) => left.localeCompare(right));
+    return { audioFiles, totalSize };
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Walk a directory tree and collect every audiobook folder in it.
+ *
+ * A folder counts as an audiobook as soon as it directly contains audio
+ * files. Such a folder is the audiobook boundary: its metadata is read from
+ * the first audio file and its subfolders are not visited. Folders without
+ * audio files are descended into, in name order, skipping subfolders whose
+ * name starts with '.'. Folders deeper than MAX_SCAN_DEPTH (root = 0) are
+ * ignored. Any error thrown while listing or walking a folder's subfolders is
+ * caught and ignored.
+ *
+ * @param rootPath - The root directory to scan
+ * @param onProgress - Optional callback for progress updates
+ * @param abortSignal - Optional AbortSignal to cancel the scan; it is checked
+ *                      on entering a folder and before each subfolder
+ * @returns Array of discovered audiobook folders with metadata
+ */
+export async function discoverAudiobooks(
+  rootPath: string,
+  onProgress?: (progress: ScanProgress) => void,
+  abortSignal?: AbortSignal
+): Promise<DiscoveredAudiobook[]> {
+  const found: DiscoveredAudiobook[] = [];
+  let visitedCount = 0;
+
+  const toForwardSlashes = (value: string): string => value.replace(/\\/g, '/');
+
+  const visit = async (dir: string, level: number): Promise<void> => {
+    if (level > MAX_SCAN_DEPTH || abortSignal?.aborted) {
+      return;
+    }
+
+    visitedCount += 1;
+    const dirName = path.basename(dir);
+
+    onProgress?.({
+      phase: 'discovering',
+      foldersScanned: visitedCount,
+      audiobooksFound: found.length,
+      currentFolder: dirName,
+    });
+
+    const audio = await scanDirectoryForAudio(dir);
+
+    if (audio === null) {
+      // Not an audiobook folder: look inside its visible subfolders instead.
+      try {
+        const entries = await fs.readdir(dir, { withFileTypes: true });
+        const childDirs = entries.filter(
+          (entry) => entry.isDirectory() && !entry.name.startsWith('.')
+        );
+        childDirs.sort((left, right) => left.name.localeCompare(right.name));
+
+        for (const childDir of childDirs) {
+          if (abortSignal?.aborted) return;
+          await visit(path.join(dir, childDir.name), level + 1);
+        }
+      } catch {
+        /* directory not readable — skip */
+      }
+      return;
+    }
+
+    // Audiobook folder: probe the first file, report, record, and stop here
+    // (subfolders of an audiobook folder are deliberately not scanned).
+    const leadFileName = audio.audioFiles[0];
+    const metadata = await readAudioMetadata(path.join(dir, leadFileName));
+
+    onProgress?.({
+      phase: 'reading_metadata',
+      foldersScanned: visitedCount,
+      audiobooksFound: found.length + 1,
+      currentFolder: dirName,
+    });
+
+    const relativeToRoot = toForwardSlashes(path.relative(rootPath, dir));
+    const term = buildSearchTerm(metadata, leadFileName);
+
+    found.push({
+      folderPath: toForwardSlashes(dir),
+      folderName: dirName,
+      relativePath: relativeToRoot || dirName,
+      audioFileCount: audio.audioFiles.length,
+      totalSizeBytes: audio.totalSize,
+      metadata,
+      searchTerm: term.searchTerm,
+      metadataSource: term.source,
+    });
+  };
+
+  await visit(rootPath, 0);
+  return found;
+}