Fix bulk import: group tagless files by folder, use folder name as search fallback

This commit is contained in:
Mattias Carlsson
2026-04-10 10:22:01 +02:00
parent 54b54d343a
commit 35cb318389
5 changed files with 140 additions and 45 deletions
+9 -4
View File
@@ -13,9 +13,13 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
## Key Details
- **Access:** Admin-only, modal opened from admin dashboard Quick Actions
- **Audio detection:** Uses `AUDIO_EXTENSIONS` from `src/lib/constants/audio-formats.ts`
- **Audiobook boundary:** A folder containing audio files = one audiobook; subfolders not scanned further
- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from first audio file
- **Fallback:** If metadata tags are empty, folder name used as search term; "Low Confidence" badge shown
- **Audiobook boundary:** A folder containing audio files = one audiobook. Files with matching metadata tags are grouped by title+author+narrator. Files with no metadata title tag are all grouped together per folder (one entry, not one per file).
- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from all audio files in folder
- **Search term fallback chain** (when no `album` tag):
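1. **ASIN in folder name** — scans folder name for pattern `B[A-Z0-9]{9}` bounded by bracket/paren/space or the start/end of the name; if found, a direct ASIN lookup is tried first, and the text search is used only if that lookup fails or does not return that exact ASIN; no badge shown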
2. **Folder name** — cleaned (strips bracketed ASIN/year, underscores→spaces); skipped if generic (CD1, Disc 2, Part 3, Vol 1, etc.); shows "Low Confidence" badge
3. **First file name** — last resort; shows "Low Confidence" badge
- **Generic folder detection:** `/^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i` — these names are skipped as search terms
- **Author/narrator dedup:** Splits on `,;& ` delimiters, removes names appearing in both fields
- **Scan depth:** Max 10 levels recursion
- **Rate limiting:** 1.5s delay between Audible searches (same as existing scraping rate limit)
@@ -56,7 +60,8 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
| Already in library | 40% opacity, green "In Library" badge, toggle disabled |
| Active request exists | 40% opacity, purple "Requested" badge, toggle disabled |
| No Audible match | Red "No Match" badge, folder name shown, pre-skipped |
| Low confidence (folder name fallback) | Amber "Low Confidence" badge |
| ASIN extracted from folder name | No badge (high confidence — direct ASIN lookup) |
| Low confidence (folder name or file name fallback, no ASIN) | Amber "Low Confidence" badge |
## Files
+24 -1
View File
@@ -159,7 +159,29 @@ export async function POST(request: NextRequest) {
let hasActiveRequest = false;
try {
const searchResult = await audibleService.search(book.searchTerm);
// If the scanner extracted an ASIN directly from the folder name,
// try an exact ASIN lookup first — faster and more accurate than
// a text search. Fall back to text search if it fails or returns
// no result.
let searchResult: Awaited<ReturnType<typeof audibleService.search>> | null = null;
if (book.extractedAsin) {
try {
const asinResult = await audibleService.search(book.extractedAsin);
if (
asinResult.results.length > 0 &&
asinResult.results[0].asin === book.extractedAsin
) {
searchResult = asinResult;
}
} catch {
/* ASIN lookup failed — fall through to text search */
}
}
if (!searchResult) {
searchResult = await audibleService.search(book.searchTerm);
}
if (searchResult.results.length > 0) {
match = searchResult.results[0];
@@ -208,6 +230,7 @@ export async function POST(request: NextRequest) {
audioFileCount: book.audioFileCount,
totalSizeBytes: book.totalSizeBytes,
metadataSource: book.metadataSource,
extractedAsin: book.extractedAsin,
searchTerm: book.searchTerm,
audioFiles: book.audioFiles,
match: match
@@ -39,7 +39,12 @@ function BookRow({
const isDisabled = book.inLibrary || book.hasActiveRequest;
const isSkipped = book.skipped;
const hasMatch = book.match !== null;
const isLowConfidence = book.metadataSource === 'file_name';
// Low confidence when search term came from a filename or folder name fallback,
// BUT not when an ASIN was extracted directly from the folder name (that's a
// direct lookup and is as reliable as embedded metadata tags).
const isLowConfidence =
(book.metadataSource === 'file_name' || book.metadataSource === 'folder_name') &&
!book.extractedAsin;
return (
<div
+3 -1
View File
@@ -34,7 +34,9 @@ export interface ScannedBook {
relativePath: string;
audioFileCount: number;
totalSizeBytes: number;
metadataSource: 'tags' | 'file_name';
metadataSource: 'tags' | 'folder_name' | 'file_name';
/** ASIN extracted directly from the folder name, if present. */
extractedAsin?: string;
searchTerm: string;
audioFiles: string[];
match: AudibleMatch | null;
+98 -38
View File
@@ -21,6 +21,12 @@ export const MAX_SCAN_DEPTH = 10;
/** Maximum concurrent ffprobe calls for metadata reads. */
const METADATA_CONCURRENCY = 10;
/**
 * Folder names matching this pattern are considered generic and should not be
 * used as Audible search terms (e.g. "CD1", "Disc 2", "Part 3", "Volume 1").
 *
 * The keyword and the number may be separated by any run of whitespace, dots,
 * underscores or hyphens, so "CD_1", "Disc-2" and "Vol. 3" are recognised too.
 * Without this, a name such as "CD_1" would pass the check and later be turned
 * into the useless search term "CD 1" once underscores become spaces.
 *
 * The pattern is anchored at both ends: names that merely start with a keyword
 * ("Part 3 The Return", "Discworld 1") are NOT generic.
 */
const GENERIC_FOLDER_NAME_RE = /^(cd|disc|disk|part|vol(ume)?)[\s._-]*\d+$/i;
/** Metadata extracted from an audio file via ffprobe. */
export interface AudioFileMetadata {
title?: string; // From 'album' tag (book title)
@@ -39,7 +45,8 @@ export interface DiscoveredAudiobook {
totalSizeBytes: number;
metadata: AudioFileMetadata;
searchTerm: string; // Constructed search query for Audible
metadataSource: 'tags' | 'file_name'; // Where the search term came from
metadataSource: 'tags' | 'folder_name' | 'file_name'; // Where the search term came from
extractedAsin?: string; // ASIN extracted directly from folder name, if present
audioFiles: string[]; // File names (relative to folderPath) belonging to this book
groupingKey: string; // Normalized key for cross-folder deduplication
}
@@ -60,6 +67,18 @@ function isAudioFile(filename: string): boolean {
return (AUDIO_EXTENSIONS as readonly string[]).includes(ext);
}
/**
 * Extract an Audible ASIN from a string (typically a folder name).
 *
 * A candidate is exactly 10 characters: a leading 'B' followed by nine
 * uppercase letters or digits, of which at least one must be a digit. The
 * digit requirement keeps ordinary all-caps words that happen to be ten
 * letters long and start with 'B' (e.g. "BESTSELLER") from being reported as
 * an ASIN.
 * NOTE(review): this assumes B-prefixed ASINs always contain a digit (they are
 * typically of the form "B0…") — confirm against real library data.
 *
 * The candidate must be bounded on both sides by a bracket, parenthesis,
 * whitespace, or the start/end of the string, to avoid false positives from
 * longer alphanumeric sequences.
 *
 * @param str - Text to scan, e.g. "The Hobbit [B002V5D7B0]".
 * @returns The first matching ASIN, or null if none is found.
 */
export function extractAsinFromString(str: string): string | null {
  // (?=[A-Z]*\d) — after the leading 'B', a digit must appear before any
  // non-letter character, i.e. somewhere inside the 9-character tail.
  const match = str.match(/(?:^|[\s\[\(])(B(?=[A-Z]*\d)[A-Z0-9]{9})(?:$|[\s\]\)])/);
  return match?.[1] ?? null;
}
/**
* Read audio metadata from a file using ffprobe.
* Extracts album, album_artist, composer, and title tags.
@@ -140,15 +159,36 @@ export function deduplicateNames(
}
/**
 * Clean a raw string (folder name or file name) for use as an Audible search term.
 *
 * Applied in order:
 *  1. strip a trailing file extension,
 *  2. remove bracketed/parenthesised 10-character ASIN-like tokens,
 *  3. remove bracketed/parenthesised 4-digit years,
 *  4. remove a leading track number and its separator,
 *  5. turn underscores into spaces,
 *  6. collapse runs of whitespace and trim.
 *
 * Because the input may be a folder name, step 1 only removes a suffix that
 * actually looks like an extension: a dot followed by 1–5 alphanumeric
 * characters containing at least one letter, with no whitespace. A looser
 * "everything after the last dot" rule would truncate names such as
 * "J.R.R. Tolkien - The Hobbit" to "J.R.R" or "Dr. Seuss Collection" to "Dr".
 *
 * @param raw - Folder name or file name as found on disk.
 * @returns The cleaned search string; may be empty if nothing meaningful remains.
 */
function cleanSearchString(raw: string): string {
  return raw
    .replace(/\.(?=[A-Za-z0-9]*[A-Za-z])[A-Za-z0-9]{1,5}$/, '') // Remove file extension (extension-shaped suffix only)
    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // Remove ASIN in brackets
    .replace(/[\[\(]\d{4}[\]\)]/g, '') // Remove year in brackets
    .replace(/^\d+[\s._-]+/, '') // Remove leading track numbers
    .replace(/[_]/g, ' ') // Underscores to spaces
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}
/**
* Build a search term from metadata or folder/file name.
* Returns the search term and the source it was derived from.
*
* Fallback chain (when no album metadata tag is present):
* 1. Folder name — if provided and not a generic name (CD1, Disc 2, Part 3, etc.)
* 2. First audio file name — last resort, always available
*
* When metadata tags are present, constructs "Title Author Narrator ContributingArtists".
* When tags are empty, falls back to the first audio file's name (cleaned).
*/
export function buildSearchTerm(
metadata: AudioFileMetadata,
firstFileName: string
): { searchTerm: string; source: 'tags' | 'file_name' } {
firstFileName: string,
folderName?: string
): { searchTerm: string; source: 'tags' | 'folder_name' | 'file_name' } {
const { author, narrator, contributingArtists } = deduplicateNames(
metadata.author,
metadata.narrator,
@@ -165,23 +205,23 @@ export function buildSearchTerm(
return { searchTerm: parts.join(' '), source: 'tags' };
}
// Fallback: clean up the first audio file name and use it as search term
const cleaned = firstFileName
.replace(/\.[^.]+$/, '') // Remove file extension
.replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // Remove ASIN in brackets
.replace(/[\[\(]\d{4}[\]\)]/g, '') // Remove year in brackets
.replace(/^\d+[\s._-]+/, '') // Remove leading track numbers
.replace(/[_]/g, ' ') // Underscores to spaces
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
// Fallback 1: folder name (if provided and not generic)
if (folderName && !GENERIC_FOLDER_NAME_RE.test(folderName.trim())) {
const cleaned = cleanSearchString(folderName);
if (cleaned) {
return { searchTerm: cleaned, source: 'folder_name' };
}
}
// Fallback 2: first audio file name
const cleaned = cleanSearchString(firstFileName);
return { searchTerm: cleaned || firstFileName, source: 'file_name' };
}
/**
* Build a normalized grouping key from metadata.
* Used to determine which files belong to the same book.
* Returns null if metadata has no title (ungroupable).
* Returns null if metadata has no title (ungroupable by metadata).
*/
function buildGroupingKey(metadata: AudioFileMetadata): string | null {
if (!metadata.title) return null;
@@ -259,17 +299,23 @@ async function asyncPool<T, R>(
* Group audio files in a directory by their metadata.
* Reads metadata from all files using a concurrency pool, then groups them
* by a normalized key of title + author + narrator.
* Files with no metadata title each become their own group.
*
* Files with a metadata title are grouped by their shared key. Files with no
* metadata title are all grouped together under a single '__ungrouped_folder'
* key (rather than one entry per file), treating the folder as one book.
* If a folder contains both tagged and untagged files, the untagged files form
* one extra group alongside the tagged groups.
*/
async function groupAudioFilesByMetadata(
dirPath: string,
audioFiles: string[],
audioSizes: Map<string, number>
audioSizes: Map<string, number>,
folderName: string
): Promise<Array<{
files: string[];
totalSize: number;
metadata: AudioFileMetadata;
metadataSource: 'tags' | 'file_name';
metadataSource: 'tags' | 'folder_name' | 'file_name';
searchTerm: string;
groupingKey: string;
}>> {
@@ -291,14 +337,12 @@ async function groupAudioFilesByMetadata(
metadata: AudioFileMetadata;
}>();
let ungroupedCounter = 0;
for (const { fileName, metadata } of metadataResults) {
const key = buildGroupingKey(metadata);
const fileSize = audioSizes.get(fileName) || 0;
if (key) {
// Has metadata — group with others sharing the same key
// Has metadata title — group with others sharing the same key
const existing = groups.get(key);
if (existing) {
existing.files.push(fileName);
@@ -311,20 +355,28 @@ async function groupAudioFilesByMetadata(
});
}
} else {
// No title metadata — treat as individual book
const uniqueKey = `__ungrouped_${ungroupedCounter++}`;
groups.set(uniqueKey, {
files: [fileName],
totalSize: fileSize,
metadata,
});
// No title metadata — collect all such files under one folder-level group.
// Key must start with '__ungrouped_' so deduplicateDiscoveries treats it
// as unique per folder (prefixes it with folderPath before deduplication).
const ungroupedKey = '__ungrouped_folder';
const existing = groups.get(ungroupedKey);
if (existing) {
existing.files.push(fileName);
existing.totalSize += fileSize;
} else {
groups.set(ungroupedKey, {
files: [fileName],
totalSize: fileSize,
metadata,
});
}
}
}
// Build result with search terms
return Array.from(groups.entries()).map(([groupingKey, group]) => {
group.files.sort((a, b) => a.localeCompare(b));
const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0]);
const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0], folderName);
return {
files: group.files,
totalSize: group.totalSize,
@@ -398,6 +450,7 @@ function deduplicateDiscoveries(
metadata: first.metadata,
searchTerm: first.searchTerm,
metadataSource: first.metadataSource,
extractedAsin: first.extractedAsin,
audioFiles: combinedFiles,
groupingKey: first.groupingKey,
});
@@ -434,9 +487,10 @@ function findCommonParent(paths: string[]): string {
*
* Scans every folder for audio files. When audio files are found, they are
* grouped by metadata (title + author + narrator) — each group becomes a
* separate discovered audiobook. Files with no metadata are treated as
* individual books. Scanning ALWAYS recurses into subfolders regardless of
* whether the current folder has audio files.
* separate discovered audiobook. Files with no metadata are all grouped
* together per folder (treated as one book) rather than one entry per file.
* Scanning ALWAYS recurses into subfolders regardless of whether the current
* folder has audio files.
*
* After the full walk, discoveries sharing the same grouping key across
* different folders (e.g., CD1/ and CD2/) are merged.
@@ -460,11 +514,13 @@ export async function discoverAudiobooks(
foldersScanned++;
const folderName = path.basename(currentPath);
onProgress?.({
phase: 'discovering',
foldersScanned,
audiobooksFound: results.length,
currentFolder: path.basename(currentPath),
currentFolder: folderName,
});
// Check if this folder contains audio files
@@ -486,19 +542,22 @@ export async function discoverAudiobooks(
phase: 'grouping',
foldersScanned,
audiobooksFound: results.length,
currentFolder: path.basename(currentPath),
currentFolder: folderName,
});
// Group audio files by metadata
// Group audio files by metadata, passing folder name for fallback search terms
const groups = await groupAudioFilesByMetadata(
currentPath,
audioResult.audioFiles,
audioSizes
audioSizes,
folderName
);
const folderName = path.basename(currentPath);
const relativePath = path.relative(rootPath, currentPath).replace(/\\/g, '/');
// Extract ASIN from folder name once for all groups in this folder
const extractedAsin = extractAsinFromString(folderName) ?? undefined;
for (const group of groups) {
results.push({
folderPath: currentPath.replace(/\\/g, '/'),
@@ -509,6 +568,7 @@ export async function discoverAudiobooks(
metadata: group.metadata,
searchTerm: group.searchTerm,
metadataSource: group.metadataSource,
extractedAsin,
audioFiles: group.files,
groupingKey: group.groupingKey,
});
@@ -518,7 +578,7 @@ export async function discoverAudiobooks(
phase: 'reading_metadata',
foldersScanned,
audiobooksFound: results.length,
currentFolder: path.basename(currentPath),
currentFolder: folderName,
});
}