Merge pull request #173 from MattiasC/feature/bulk-import-folder-fallback

Bulk import enhancement: group tagless files by folder and use folder name as search fallback
This commit is contained in:
kikootwo
2026-05-14 16:15:41 -04:00
committed by GitHub
5 changed files with 169 additions and 47 deletions
+9 -4
View File
@@ -13,9 +13,13 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
## Key Details ## Key Details
- **Access:** Admin-only, modal opened from admin dashboard Quick Actions - **Access:** Admin-only, modal opened from admin dashboard Quick Actions
- **Audio detection:** Uses `AUDIO_EXTENSIONS` from `src/lib/constants/audio-formats.ts` - **Audio detection:** Uses `AUDIO_EXTENSIONS` from `src/lib/constants/audio-formats.ts`
- **Audiobook boundary:** A folder containing audio files = one audiobook; subfolders not scanned further - **Audiobook boundary:** A folder containing audio files = one audiobook. Files with matching metadata tags are grouped by title+author+narrator. Files with no metadata title tag are all grouped together per folder (one entry, not one per file).
- **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from first audio file - **Metadata extraction:** ffprobe reads `album` (title), `album_artist` (author), `composer` (narrator) from all audio files in folder
- **Fallback:** If metadata tags are empty, folder name used as search term; "Low Confidence" badge shown - **Search term fallback chain** (when no `album` tag):
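1. **ASIN in folder name** — scans folder name for pattern `B[A-Z0-9]{9}` bounded by a bracket, parenthesis, whitespace, or the start/end of the name; if found, a direct ASIN lookup is tried first, falling back to a text search on the cleaned folder name if the lookup fails; no badge shown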
2. **Folder name** — cleaned (strips bracketed ASIN/year, underscores→spaces); skipped if generic (CD1, Disc 2, Part 3, Vol 1, etc.); shows "Low Confidence" badge
3. **First file name** — last resort; shows "Low Confidence" badge
- **Generic folder detection:** `/^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i` — these names are skipped as search terms
- **Author/narrator dedup:** Splits on `,;& ` delimiters, removes names appearing in both fields - **Author/narrator dedup:** Splits on `,;& ` delimiters, removes names appearing in both fields
- **Scan depth:** Max 10 levels recursion - **Scan depth:** Max 10 levels recursion
- **Rate limiting:** 1.5s delay between Audible searches (same as existing scraping rate limit) - **Rate limiting:** 1.5s delay between Audible searches (same as existing scraping rate limit)
@@ -56,7 +60,8 @@ Lets admins scan a server folder recursively, discover audiobook subfolders, mat
| Already in library | 40% opacity, green "In Library" badge, toggle disabled | | Already in library | 40% opacity, green "In Library" badge, toggle disabled |
| Active request exists | 40% opacity, purple "Requested" badge, toggle disabled | | Active request exists | 40% opacity, purple "Requested" badge, toggle disabled |
| No Audible match | Red "No Match" badge, folder name shown, pre-skipped | | No Audible match | Red "No Match" badge, folder name shown, pre-skipped |
| Low confidence (folder name fallback) | Amber "Low Confidence" badge | | ASIN extracted from folder name | No badge (high confidence — direct ASIN lookup) |
| Low confidence (folder name or file name fallback, no ASIN) | Amber "Low Confidence" badge |
## Files ## Files
+34 -1
View File
@@ -159,10 +159,42 @@ export async function POST(request: NextRequest) {
let hasActiveRequest = false; let hasActiveRequest = false;
try { try {
const searchResult = await audibleService.search(book.searchTerm); // If the scanner extracted an ASIN directly from the folder name,
// use a direct ASIN lookup (Audnexus API) — more reliable than a
// keyword text search. Fall back to text search if the lookup fails.
if (book.extractedAsin) {
try {
const asinResult = await audibleService.getAudiobookDetails(book.extractedAsin);
if (asinResult) {
match = asinResult;
}
} catch {
/* ASIN lookup failed — fall through to text search */
}
}
if (!match) {
// When an ASIN was extracted from the folder name but the direct
// lookup failed, prefer the folder name as the text search term
// over book.searchTerm. book.searchTerm may come from a single
// tagged file whose album tag is unreliable (e.g. a series name
// or intro track), whereas the folder name is the human-assigned
// title and is more likely to be accurate.
const textSearchTerm = book.extractedAsin
? book.folderName
.replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // strip ASIN
.replace(/[\[\(]\d{4}[\]\)]/g, '') // strip year
.replace(/[_]/g, ' ')
.replace(/\s+/g, ' ')
.trim()
: book.searchTerm;
const searchResult = await audibleService.search(textSearchTerm);
if (searchResult.results.length > 0) { if (searchResult.results.length > 0) {
match = searchResult.results[0]; match = searchResult.results[0];
}
}
if (match) {
// Check library availability // Check library availability
const plexMatch = await findPlexMatch({ const plexMatch = await findPlexMatch({
@@ -208,6 +240,7 @@ export async function POST(request: NextRequest) {
audioFileCount: book.audioFileCount, audioFileCount: book.audioFileCount,
totalSizeBytes: book.totalSizeBytes, totalSizeBytes: book.totalSizeBytes,
metadataSource: book.metadataSource, metadataSource: book.metadataSource,
extractedAsin: book.extractedAsin,
searchTerm: book.searchTerm, searchTerm: book.searchTerm,
audioFiles: book.audioFiles, audioFiles: book.audioFiles,
match: match match: match
@@ -39,7 +39,12 @@ function BookRow({
const isDisabled = book.inLibrary || book.hasActiveRequest; const isDisabled = book.inLibrary || book.hasActiveRequest;
const isSkipped = book.skipped; const isSkipped = book.skipped;
const hasMatch = book.match !== null; const hasMatch = book.match !== null;
const isLowConfidence = book.metadataSource === 'file_name'; // Low confidence when search term came from a filename or folder name fallback,
// BUT not when an ASIN was extracted directly from the folder name (that's a
// direct lookup and is as reliable as embedded metadata tags).
const isLowConfidence =
(book.metadataSource === 'file_name' || book.metadataSource === 'folder_name') &&
!book.extractedAsin;
return ( return (
<div <div
+3 -1
View File
@@ -34,7 +34,9 @@ export interface ScannedBook {
relativePath: string; relativePath: string;
audioFileCount: number; audioFileCount: number;
totalSizeBytes: number; totalSizeBytes: number;
metadataSource: 'tags' | 'file_name'; metadataSource: 'tags' | 'folder_name' | 'file_name';
/** ASIN extracted directly from the folder name, if present. */
extractedAsin?: string;
searchTerm: string; searchTerm: string;
audioFiles: string[]; audioFiles: string[];
match: AudibleMatch | null; match: AudibleMatch | null;
+111 -34
View File
@@ -21,6 +21,12 @@ export const MAX_SCAN_DEPTH = 10;
/** Maximum concurrent ffprobe calls for metadata reads. */ /** Maximum concurrent ffprobe calls for metadata reads. */
const METADATA_CONCURRENCY = 10; const METADATA_CONCURRENCY = 10;
/**
 * Matches folder names that carry no title information and therefore should
 * not be used as Audible search terms (e.g. "CD1", "Disc 2", "Part 3", "Volume 1").
 *
 * The entire name must be one of cd / disc / disk / part / vol / volume,
 * followed by optional whitespace and one or more digits; matching is
 * case-insensitive. Names using any other separator or carrying extra text
 * (e.g. "CD-1", "Vol. 1", "Part 3 - Finale") do not match.
 */
const GENERIC_FOLDER_NAME_RE = /^(cd|disc|disk|part|vol(ume)?)\s*\d+$/i;
/** Metadata extracted from an audio file via ffprobe. */ /** Metadata extracted from an audio file via ffprobe. */
export interface AudioFileMetadata { export interface AudioFileMetadata {
title?: string; // From 'album' tag (book title) title?: string; // From 'album' tag (book title)
@@ -39,7 +45,8 @@ export interface DiscoveredAudiobook {
totalSizeBytes: number; totalSizeBytes: number;
metadata: AudioFileMetadata; metadata: AudioFileMetadata;
searchTerm: string; // Constructed search query for Audible searchTerm: string; // Constructed search query for Audible
metadataSource: 'tags' | 'file_name'; // Where the search term came from metadataSource: 'tags' | 'folder_name' | 'file_name'; // Where the search term came from
extractedAsin?: string; // ASIN extracted directly from folder name, if present
audioFiles: string[]; // File names (relative to folderPath) belonging to this book audioFiles: string[]; // File names (relative to folderPath) belonging to this book
groupingKey: string; // Normalized key for cross-folder deduplication groupingKey: string; // Normalized key for cross-folder deduplication
} }
@@ -60,6 +67,18 @@ function isAudioFile(filename: string): boolean {
return (AUDIO_EXTENSIONS as readonly string[]).includes(ext); return (AUDIO_EXTENSIONS as readonly string[]).includes(ext);
} }
/**
 * Pull an Audible ASIN out of free text (typically a folder name).
 *
 * A candidate is the upper-case letter 'B' followed by nine characters drawn
 * from A-Z and 0-9, ten characters in total. To keep arbitrary alphanumeric
 * runs from matching, the candidate must be preceded by the start of the
 * string, whitespace, '[' or '(' and followed by the end of the string,
 * whitespace, ']' or ')'.
 *
 * @param str - Text to scan.
 * @returns The first ASIN found, or null when the text contains none.
 */
export function extractAsinFromString(str: string): string | null {
  const boundedAsinPattern = /(?:^|[\s\[\(])([B][A-Z0-9]{9})(?:$|[\s\]\)])/;
  const found = boundedAsinPattern.exec(str);
  if (found === null) {
    return null;
  }
  // Capture group 1 is the ASIN itself, without the surrounding delimiter.
  return found[1];
}
/** /**
* Read audio metadata from a file using ffprobe. * Read audio metadata from a file using ffprobe.
* Extracts album, album_artist, composer, and title tags. * Extracts album, album_artist, composer, and title tags.
@@ -140,15 +159,36 @@ export function deduplicateNames(
} }
/**
 * Clean a raw string (folder name or file name) for use as an Audible search term.
 *
 * Steps, in order:
 *  1. Strip a trailing file extension.
 *  2. Remove a bracketed or parenthesised 10-character ASIN-like token.
 *  3. Remove a bracketed or parenthesised 4-digit year.
 *  4. Remove a leading track number and its separator.
 *  5. Turn underscores into spaces.
 *  6. Collapse runs of whitespace and trim.
 *
 * @param raw - Folder or file name to clean.
 * @returns The cleaned string; may be empty if nothing meaningful remains.
 */
function cleanSearchString(raw: string): string {
  return raw
    // Only a short, purely alphanumeric suffix after the final dot counts as
    // an extension. This function is also applied to folder names, where a
    // looser "everything after the last dot" rule would truncate titles such
    // as "Dr. Seuss Collection" or "J.R.R. Tolkien - The Hobbit".
    .replace(/\.[A-Za-z0-9]{1,5}$/, '')
    .replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // Remove ASIN in brackets
    .replace(/[\[\(]\d{4}[\]\)]/g, '') // Remove year in brackets
    .replace(/^\d+[\s._-]+/, '') // Remove leading track numbers
    .replace(/[_]/g, ' ') // Underscores to spaces
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}
/**
* Build a search term from metadata or folder/file name.
* Returns the search term and the source it was derived from. * Returns the search term and the source it was derived from.
*
* Fallback chain (when no album metadata tag is present):
* 1. Folder name — if provided and not a generic name (CD1, Disc 2, Part 3, etc.)
* 2. First audio file name — last resort, always available
*
* When metadata tags are present, constructs "Title Author Narrator ContributingArtists". * When metadata tags are present, constructs "Title Author Narrator ContributingArtists".
* When tags are empty, falls back to the first audio file's name (cleaned).
*/ */
export function buildSearchTerm( export function buildSearchTerm(
metadata: AudioFileMetadata, metadata: AudioFileMetadata,
firstFileName: string firstFileName: string,
): { searchTerm: string; source: 'tags' | 'file_name' } { folderName?: string
): { searchTerm: string; source: 'tags' | 'folder_name' | 'file_name' } {
const { author, narrator, contributingArtists } = deduplicateNames( const { author, narrator, contributingArtists } = deduplicateNames(
metadata.author, metadata.author,
metadata.narrator, metadata.narrator,
@@ -165,23 +205,23 @@ export function buildSearchTerm(
return { searchTerm: parts.join(' '), source: 'tags' }; return { searchTerm: parts.join(' '), source: 'tags' };
} }
// Fallback: clean up the first audio file name and use it as search term // Fallback 1: folder name (if provided and not generic)
const cleaned = firstFileName if (folderName && !GENERIC_FOLDER_NAME_RE.test(folderName.trim())) {
.replace(/\.[^.]+$/, '') // Remove file extension const cleaned = cleanSearchString(folderName);
.replace(/[\[\(][A-Z0-9]{10}[\]\)]/g, '') // Remove ASIN in brackets if (cleaned) {
.replace(/[\[\(]\d{4}[\]\)]/g, '') // Remove year in brackets return { searchTerm: cleaned, source: 'folder_name' };
.replace(/^\d+[\s._-]+/, '') // Remove leading track numbers }
.replace(/[_]/g, ' ') // Underscores to spaces }
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
// Fallback 2: first audio file name
const cleaned = cleanSearchString(firstFileName);
return { searchTerm: cleaned || firstFileName, source: 'file_name' }; return { searchTerm: cleaned || firstFileName, source: 'file_name' };
} }
/** /**
* Build a normalized grouping key from metadata. * Build a normalized grouping key from metadata.
* Used to determine which files belong to the same book. * Used to determine which files belong to the same book.
* Returns null if metadata has no title (ungroupable). * Returns null if metadata has no title (ungroupable by metadata).
*/ */
function buildGroupingKey(metadata: AudioFileMetadata): string | null { function buildGroupingKey(metadata: AudioFileMetadata): string | null {
if (!metadata.title) return null; if (!metadata.title) return null;
@@ -259,17 +299,23 @@ async function asyncPool<T, R>(
* Group audio files in a directory by their metadata. * Group audio files in a directory by their metadata.
* Reads metadata from all files using a concurrency pool, then groups them * Reads metadata from all files using a concurrency pool, then groups them
* by a normalized key of title + author + narrator. * by a normalized key of title + author + narrator.
* Files with no metadata title each become their own group. *
* Files with a metadata title are grouped by their shared key. Files with no
* metadata title are all grouped together under a single '__ungrouped_folder'
* key (rather than one entry per file), treating the folder as one book.
* If a folder contains both tagged and untagged files, the untagged files are
* absorbed into the tagged group when there is exactly one tagged group;
* otherwise they form one extra group alongside the tagged groups.
*/ */
async function groupAudioFilesByMetadata( async function groupAudioFilesByMetadata(
dirPath: string, dirPath: string,
audioFiles: string[], audioFiles: string[],
audioSizes: Map<string, number> audioSizes: Map<string, number>,
folderName: string
): Promise<Array<{ ): Promise<Array<{
files: string[]; files: string[];
totalSize: number; totalSize: number;
metadata: AudioFileMetadata; metadata: AudioFileMetadata;
metadataSource: 'tags' | 'file_name'; metadataSource: 'tags' | 'folder_name' | 'file_name';
searchTerm: string; searchTerm: string;
groupingKey: string; groupingKey: string;
}>> { }>> {
@@ -291,14 +337,12 @@ async function groupAudioFilesByMetadata(
metadata: AudioFileMetadata; metadata: AudioFileMetadata;
}>(); }>();
let ungroupedCounter = 0;
for (const { fileName, metadata } of metadataResults) { for (const { fileName, metadata } of metadataResults) {
const key = buildGroupingKey(metadata); const key = buildGroupingKey(metadata);
const fileSize = audioSizes.get(fileName) || 0; const fileSize = audioSizes.get(fileName) || 0;
if (key) { if (key) {
// Has metadata — group with others sharing the same key // Has metadata title — group with others sharing the same key
const existing = groups.get(key); const existing = groups.get(key);
if (existing) { if (existing) {
existing.files.push(fileName); existing.files.push(fileName);
@@ -311,20 +355,45 @@ async function groupAudioFilesByMetadata(
}); });
} }
} else { } else {
// No title metadata — treat as individual book // No title metadata — collect all such files under one folder-level group.
const uniqueKey = `__ungrouped_${ungroupedCounter++}`; // Key must start with '__ungrouped_' so deduplicateDiscoveries treats it
groups.set(uniqueKey, { // as unique per folder (prefixes it with folderPath before deduplication).
const ungroupedKey = '__ungrouped_folder';
const existing = groups.get(ungroupedKey);
if (existing) {
existing.files.push(fileName);
existing.totalSize += fileSize;
} else {
groups.set(ungroupedKey, {
files: [fileName], files: [fileName],
totalSize: fileSize, totalSize: fileSize,
metadata, metadata,
}); });
} }
} }
}
// If there is exactly one tagged group alongside an ungrouped group, absorb
// the untagged files into the tagged group. Untagged files in the same folder
// almost certainly belong to the same book (e.g. one chapter was ripped
// without tags, or a cover/intro file carries different metadata).
// Only do this when there is a single tagged group — multiple tagged groups
// mean genuinely different books are mixed in the folder, so keep them separate.
const ungrouped = groups.get('__ungrouped_folder');
if (ungrouped) {
const taggedKeys = Array.from(groups.keys()).filter((k) => k !== '__ungrouped_folder');
if (taggedKeys.length === 1) {
const taggedGroup = groups.get(taggedKeys[0])!;
taggedGroup.files.push(...ungrouped.files);
taggedGroup.totalSize += ungrouped.totalSize;
groups.delete('__ungrouped_folder');
}
}
// Build result with search terms // Build result with search terms
return Array.from(groups.entries()).map(([groupingKey, group]) => { return Array.from(groups.entries()).map(([groupingKey, group]) => {
group.files.sort((a, b) => a.localeCompare(b)); group.files.sort((a, b) => a.localeCompare(b));
const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0]); const { searchTerm, source } = buildSearchTerm(group.metadata, group.files[0], folderName);
return { return {
files: group.files, files: group.files,
totalSize: group.totalSize, totalSize: group.totalSize,
@@ -398,6 +467,7 @@ function deduplicateDiscoveries(
metadata: first.metadata, metadata: first.metadata,
searchTerm: first.searchTerm, searchTerm: first.searchTerm,
metadataSource: first.metadataSource, metadataSource: first.metadataSource,
extractedAsin: first.extractedAsin,
audioFiles: combinedFiles, audioFiles: combinedFiles,
groupingKey: first.groupingKey, groupingKey: first.groupingKey,
}); });
@@ -434,9 +504,10 @@ function findCommonParent(paths: string[]): string {
* *
* Scans every folder for audio files. When audio files are found, they are * Scans every folder for audio files. When audio files are found, they are
* grouped by metadata (title + author + narrator) — each group becomes a * grouped by metadata (title + author + narrator) — each group becomes a
* separate discovered audiobook. Files with no metadata are treated as * separate discovered audiobook. Files with no metadata are all grouped
* individual books. Scanning ALWAYS recurses into subfolders regardless of * together per folder (treated as one book) rather than one entry per file.
* whether the current folder has audio files. * Scanning ALWAYS recurses into subfolders regardless of whether the current
* folder has audio files.
* *
* After the full walk, discoveries sharing the same grouping key across * After the full walk, discoveries sharing the same grouping key across
* different folders (e.g., CD1/ and CD2/) are merged. * different folders (e.g., CD1/ and CD2/) are merged.
@@ -460,11 +531,13 @@ export async function discoverAudiobooks(
foldersScanned++; foldersScanned++;
const folderName = path.basename(currentPath);
onProgress?.({ onProgress?.({
phase: 'discovering', phase: 'discovering',
foldersScanned, foldersScanned,
audiobooksFound: results.length, audiobooksFound: results.length,
currentFolder: path.basename(currentPath), currentFolder: folderName,
}); });
// Check if this folder contains audio files // Check if this folder contains audio files
@@ -486,19 +559,22 @@ export async function discoverAudiobooks(
phase: 'grouping', phase: 'grouping',
foldersScanned, foldersScanned,
audiobooksFound: results.length, audiobooksFound: results.length,
currentFolder: path.basename(currentPath), currentFolder: folderName,
}); });
// Group audio files by metadata // Group audio files by metadata, passing folder name for fallback search terms
const groups = await groupAudioFilesByMetadata( const groups = await groupAudioFilesByMetadata(
currentPath, currentPath,
audioResult.audioFiles, audioResult.audioFiles,
audioSizes audioSizes,
folderName
); );
const folderName = path.basename(currentPath);
const relativePath = path.relative(rootPath, currentPath).replace(/\\/g, '/'); const relativePath = path.relative(rootPath, currentPath).replace(/\\/g, '/');
// Extract ASIN from folder name once for all groups in this folder
const extractedAsin = extractAsinFromString(folderName) ?? undefined;
for (const group of groups) { for (const group of groups) {
results.push({ results.push({
folderPath: currentPath.replace(/\\/g, '/'), folderPath: currentPath.replace(/\\/g, '/'),
@@ -509,6 +585,7 @@ export async function discoverAudiobooks(
metadata: group.metadata, metadata: group.metadata,
searchTerm: group.searchTerm, searchTerm: group.searchTerm,
metadataSource: group.metadataSource, metadataSource: group.metadataSource,
extractedAsin,
audioFiles: group.files, audioFiles: group.files,
groupingKey: group.groupingKey, groupingKey: group.groupingKey,
}); });
@@ -518,7 +595,7 @@ export async function discoverAudiobooks(
phase: 'reading_metadata', phase: 'reading_metadata',
foldersScanned, foldersScanned,
audiobooksFound: results.length, audiobooksFound: results.length,
currentFolder: path.basename(currentPath), currentFolder: folderName,
}); });
} }