Files
ReadMeABook/src/lib/utils/ranking-algorithm.ts
T
kikootwo 5d8ac2f73d Add language config and locale-aware parsing
Introduce centralized language configuration and wire locale-aware behavior across scraping and ranking. Adds src/lib/constants/language-config.ts with per-language scraping rules, stop words, and character replacements; replaces AudibleRegion.isEnglish with a language field in types and AUDIBLE_REGIONS. Update AudibleService, ebook scraper, processors, and API routes to use getLanguageForRegion so Anna's Archive searches, scraping selectors, runtime/rating parsing, and ranking use language-specific params and filters. Extend ranking algorithm to accept stopWords and characterReplacements and apply them during normalization and matching. Update UI selects to mark non-English regions and adjust tests accordingly.
2026-02-20 06:32:44 -05:00

1306 lines
46 KiB
TypeScript

/**
* Component: Intelligent Ranking Algorithm
* Documentation: documentation/phase3/ranking-algorithm.md
*/
import { compareTwoStrings } from 'string-similarity';
/**
 * A single release returned by an indexer search.
 * The same shape is used for torrent and NZB/Usenet results, which is why the
 * seeder/leecher counts are optional.
 */
export interface TorrentResult {
indexer: string;
indexerId?: number; // Key used to look up the indexer's priority during ranking
title: string; // Release title; matching and format detection parse this string
size: number; // Size in bytes (ranking converts it to MB via size / (1024 * 1024))
seeders?: number; // Optional for NZB/Usenet results (no seeders concept)
leechers?: number; // Optional for NZB/Usenet results (no leechers concept)
publishDate: Date; // Used as the tiebreaker when two results have the same final score
downloadUrl: string;
infoUrl?: string; // Link to indexer's info page (for user reference)
infoHash?: string;
guid: string;
format?: 'M4B' | 'M4A' | 'MP3' | 'FLAC' | 'OTHER'; // When set, takes precedence over format detection from the title
bitrate?: string;
hasChapters?: boolean; // Only an explicit `false` lowers the M4B format score
flags?: string[]; // Indexer flags like "Freeleech", "Internal", etc.
protocol?: string; // 'torrent' or 'usenet' - from Prowlarr API
}
/**
 * The audiobook a user asked for; ranking scores each release against it.
 */
export interface AudiobookRequest {
title: string; // Parenthesised/bracketed parts and text after a colon are treated as optional when matching
author: string; // May list several authors; matching splits on ",", "&", " and " and " - "
narrator?: string;
durationMinutes?: number; // Book runtime; without it the size score is 0
}
/**
 * Bonus or penalty applied when a release carries a given indexer flag.
 * Flag names are compared case-insensitively after trimming whitespace.
 */
export interface IndexerFlagConfig {
name: string; // Flag name (e.g., "Freeleech")
modifier: number; // -100 to 100 (percentage of the base score added as bonus points)
}
/**
 * Optional settings for RankingAlgorithm.rankTorrents().
 */
export interface RankTorrentsOptions {
indexerPriorities?: Map<number, number>; // indexerId -> priority (1-25); missing entries default to 10
flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations
requireAuthor?: boolean; // Enforce author presence check (default: true)
stopWords?: string[]; // Language-specific stop words for matching (English defaults are used when omitted)
characterReplacements?: Record<string, string>; // Language-specific char replacements (e.g. ß→ss)
}
/**
 * The ebook a user asked for; used when ranking ebook releases from indexers.
 */
export interface EbookTorrentRequest {
title: string;
author: string;
preferredFormat: string; // User's preferred format (epub, pdf, etc.); compared case-insensitively with the detected format
}
/**
 * Optional settings for RankingAlgorithm.rankEbookTorrents().
 * Mirrors RankTorrentsOptions field for field.
 */
export interface RankEbookTorrentsOptions {
indexerPriorities?: Map<number, number>; // indexerId -> priority (1-25); missing entries default to 10
flagConfigs?: IndexerFlagConfig[]; // Flag bonus configurations
requireAuthor?: boolean; // Enforce author presence check (default: true)
stopWords?: string[]; // Language-specific stop words for matching (English defaults are used when omitted)
characterReplacements?: Record<string, string>; // Language-specific char replacements (e.g. ß→ss)
}
/**
 * One bonus (or penalty) applied on top of a release's base score.
 * The ranking methods in this file emit 'indexer_priority' and 'indexer_flag' entries.
 */
export interface BonusModifier {
type: 'indexer_priority' | 'indexer_flag' | 'custom';
value: number; // Multiplier (e.g., 0.4 for 40%)
points: number; // Calculated bonus points from this modifier (base score × value)
reason: string; // Human-readable explanation
}
/**
 * Per-component base score for an audiobook release, plus explanatory notes.
 */
export interface ScoreBreakdown {
formatScore: number; // Up to 10 points
sizeScore: number; // Up to 15 points
seederScore: number; // Up to 15 points
matchScore: number; // Up to 60 points
totalScore: number; // Sum of the four component scores (bonuses not included)
notes: string[]; // Human-readable remarks about the scoring
}
/**
 * An audiobook release annotated with its scores and its position in the ranking.
 */
export interface RankedTorrent extends TorrentResult {
score: number; // Base score (0-100)
bonusModifiers: BonusModifier[];
bonusPoints: number; // Sum of all bonus points
finalScore: number; // score + bonusPoints
rank: number; // 1-based position after sorting (1 = best)
breakdown: ScoreBreakdown;
}
/**
 * Per-component base score for an ebook release, plus explanatory notes.
 */
export interface EbookScoreBreakdown {
formatScore: number; // 0-10 points (match preferred = 10, else 0)
sizeScore: number; // 0-15 points (inverted - smaller is better)
seederScore: number; // 0-15 points (same as audiobooks)
matchScore: number; // 0-60 points (same as audiobooks)
totalScore: number; // Sum of the four component scores (bonuses not included)
notes: string[]; // Human-readable remarks about the scoring
}
/**
 * An ebook release annotated with its scores and its position in the ranking.
 */
export interface RankedEbookTorrent extends TorrentResult {
score: number; // Base score (0-100)
bonusModifiers: BonusModifier[];
bonusPoints: number; // Sum of all bonus points
finalScore: number; // score + bonusPoints
rank: number; // 1-based position after sorting (1 = best)
breakdown: EbookScoreBreakdown;
ebookFormat?: string; // Detected ebook format (epub, pdf, mobi, etc.); left undefined when no format was detected
}
export class RankingAlgorithm {
/**
 * Score every candidate release against the requested audiobook and return
 * the candidates ordered best-first.
 *
 * Releases smaller than 20 MB are discarded before scoring (likely ebooks or
 * samples). Each remaining release gets a base score (format + size + seeders
 * + title/author match) and then bonus points expressed as a percentage of
 * that base score: one for the indexer's priority and one per configured flag
 * the release carries.
 *
 * @param torrents - Array of torrent results to rank
 * @param audiobook - Audiobook request details (durationMinutes feeds the size score)
 * @param options - Optional configuration for ranking behavior
 * @returns Ranked results sorted by finalScore (ties: newest publishDate first) with 1-based ranks
 */
rankTorrents(
  torrents: TorrentResult[],
  audiobook: AudiobookRequest,
  options: RankTorrentsOptions = {}
): RankedTorrent[] {
  const BYTES_PER_MB = 1024 * 1024;
  // Safe default: the author must be present unless the caller opts out
  const { requireAuthor = true } = options;

  const ranked: RankedTorrent[] = [];

  for (const torrent of torrents) {
    // Keep only releases of at least 20 MB (a NaN size is dropped as well)
    if (!(torrent.size / BYTES_PER_MB >= 20)) {
      continue;
    }

    // Base score components (0-100 in total)
    const formatScore = this.scoreFormat(torrent);
    const sizeScore = this.scoreSize(torrent, audiobook.durationMinutes);
    const seederScore = this.scoreSeeders(torrent.seeders);
    const matchScore = this.scoreMatch(
      torrent,
      audiobook,
      requireAuthor,
      options.stopWords,
      options.characterReplacements
    );
    const baseScore = formatScore + sizeScore + seederScore + matchScore;

    const bonusModifiers: BonusModifier[] = [];

    // Indexer priority bonus: priority 1-25 maps to 4%-100% (default 10/25 = 40%)
    if (torrent.indexerId !== undefined) {
      const priority = options.indexerPriorities?.get(torrent.indexerId) ?? 10;
      const share = priority / 25;
      bonusModifiers.push({
        type: 'indexer_priority',
        value: share,
        points: baseScore * share,
        reason: `Indexer priority ${priority}/25 (${Math.round(share * 100)}%)`,
      });
    }

    // Flag bonuses/penalties: names compared case-insensitively, ignoring surrounding whitespace
    const configs = options.flagConfigs;
    if (torrent.flags && torrent.flags.length > 0 && configs && configs.length > 0) {
      for (const flag of torrent.flags) {
        const wanted = flag.trim().toLowerCase();
        const config = configs.find((cfg) => cfg.name.trim().toLowerCase() === wanted);
        if (!config) {
          continue;
        }
        const share = config.modifier / 100; // -100..100 percent → -1.0..1.0
        bonusModifiers.push({
          type: 'indexer_flag',
          value: share,
          points: baseScore * share,
          reason: `Flag "${flag}" (${config.modifier > 0 ? '+' : ''}${config.modifier}%)`,
        });
      }
    }

    let bonusPoints = 0;
    for (const modifier of bonusModifiers) {
      bonusPoints += modifier.points;
    }

    const scores: ScoreBreakdown = {
      formatScore,
      sizeScore,
      seederScore,
      matchScore,
      totalScore: baseScore,
      notes: [],
    };
    const notes = this.generateNotes(torrent, scores, audiobook.durationMinutes);

    ranked.push({
      ...torrent,
      score: baseScore,
      bonusModifiers,
      bonusPoints,
      finalScore: baseScore + bonusPoints,
      rank: 0, // Placeholder until the list is sorted
      breakdown: { ...scores, notes },
    });
  }

  // Best final score first; equal scores fall back to the newest publish date
  ranked.sort((left, right) => {
    if (right.finalScore !== left.finalScore) {
      return right.finalScore - left.finalScore;
    }
    return right.publishDate.getTime() - left.publishDate.getTime();
  });

  for (let position = 0; position < ranked.length; position++) {
    ranked[position].rank = position + 1;
  }

  return ranked;
}
/**
 * Get the detailed base-score breakdown for a single torrent.
 *
 * Computes the same four components as rankTorrents() (format, size, seeders,
 * title/author match) without any bonus modifiers. Pass the same stop words
 * and character replacements that were given to rankTorrents() so that the
 * match score here agrees with the ranking for non-English requests.
 *
 * @param torrent - Release to score
 * @param audiobook - Audiobook request details (durationMinutes feeds the size score)
 * @param requireAuthor - Whether a missing author zeroes the match score (default: true)
 * @param stopWords - Optional language-specific stop words for matching
 * @param characterReplacements - Optional language-specific character replacements (e.g. ß→ss)
 * @returns Component scores, their sum, and human-readable notes
 */
getScoreBreakdown(
  torrent: TorrentResult,
  audiobook: AudiobookRequest,
  requireAuthor: boolean = true,
  stopWords?: string[],
  characterReplacements?: Record<string, string>
): ScoreBreakdown {
  const formatScore = this.scoreFormat(torrent);
  const sizeScore = this.scoreSize(torrent, audiobook.durationMinutes);
  const seederScore = this.scoreSeeders(torrent.seeders);
  const matchScore = this.scoreMatch(torrent, audiobook, requireAuthor, stopWords, characterReplacements);
  const totalScore = formatScore + sizeScore + seederScore + matchScore;

  const scores: ScoreBreakdown = {
    formatScore,
    sizeScore,
    seederScore,
    matchScore,
    totalScore,
    notes: [],
  };

  return {
    ...scores,
    notes: this.generateNotes(torrent, scores, audiobook.durationMinutes),
  };
}
/**
 * Score format quality (10 points max).
 *
 *   M4B (chapters not explicitly absent): 10
 *   M4B with hasChapters === false:        9
 *   FLAC:                                  7
 *   M4A:                                   6
 *   MP3:                                   4
 *   anything else:                         1
 */
private scoreFormat(torrent: TorrentResult): number {
  const format = this.detectFormat(torrent);

  if (format === 'M4B') {
    // Unknown chapter status is given the benefit of the doubt
    return torrent.hasChapters === false ? 9 : 10;
  }
  if (format === 'FLAC') {
    return 7;
  }
  if (format === 'M4A') {
    return 6;
  }
  return format === 'MP3' ? 4 : 1;
}
/**
 * Score file size quality (15 points max) from the size-to-runtime ratio.
 *
 * @param torrent - Torrent result with size in bytes
 * @param runtimeMinutes - Book runtime in minutes
 * @returns 0-15 points based on MB/min ratio
 *
 * Algorithm:
 * - >= 1.0 MB/min → 15/15 points (high quality baseline)
 * - Linear scaling below 1.0 MB/min (0.5 MB/min = 7.5 points, 0.3 MB/min = 4.5 points)
 * - 0 points if runtime is missing, zero, negative or not a finite number
 * - 0 points if the resulting ratio is not a positive finite number (e.g. NaN size)
 *
 * Note: Files < 20 MB are pre-filtered in rankTorrents()
 */
private scoreSize(torrent: TorrentResult, runtimeMinutes: number | undefined): number {
  // Graceful degradation: no usable runtime data = no size scoring
  if (runtimeMinutes === undefined || !Number.isFinite(runtimeMinutes) || runtimeMinutes <= 0) {
    return 0;
  }

  const sizeMB = torrent.size / (1024 * 1024);
  const mbPerMin = sizeMB / runtimeMinutes;

  // Keep the result inside the documented 0-15 range even for corrupt sizes,
  // so a bad value cannot poison the total score or the sort order
  if (!Number.isFinite(mbPerMin) || mbPerMin <= 0) {
    return 0;
  }

  // High quality baseline: 1.0 MB/min or higher gets full points
  // (1 MiB per minute is roughly a 140 kbps stream)
  if (mbPerMin >= 1.0) {
    return 15;
  }

  // Linear scaling below the baseline
  return mbPerMin * 15;
}
/**
 * Score seeder count (15 points max).
 *
 * Logarithmic scaling, min(15, log10(seeders + 1) * 6):
 *   0 seeders (or a negative count): 0 points
 *   1 seeder:     ~1.8 points
 *   10 seeders:   ~6.2 points
 *   100 seeders:  ~12.0 points
 *   316+ seeders: 15 points (cap)
 *
 * Note: NZB/Usenet results have no seeder concept; a missing count gets the
 * full score because availability does not depend on peers.
 *
 * @param seeders - Seeder count, or undefined for NZB/Usenet results
 * @returns 0-15 points
 */
private scoreSeeders(seeders: number | undefined): number {
  // Missing count (NZB results) → full score
  if (seeders === undefined || seeders === null || isNaN(seeders)) {
    return 15;
  }

  // Zero or a negative sentinel (e.g. -1) means nobody is seeding. Without
  // this guard log10 would yield -Infinity or NaN and corrupt the total score.
  if (seeders <= 0) {
    return 0;
  }

  return Math.min(15, Math.log10(seeders + 1) * 6);
}
/**
 * Normalize text for matching by handling CamelCase, diacritics and punctuation separators.
 *   "VirginaEvans TheCorrespondent" → "virgina evans the correspondent"
 *   "Twelve.Months-Jim.Butcher"     → "twelve months jim butcher"
 *   "Author_Name_Book"              → "author name book"
 *
 * @param text - Raw title or author string
 * @param characterReplacements - Optional language-specific replacements (e.g. ß→ss).
 *   Keys and values are treated as literal text, not as regular expressions, and
 *   are applied after lowercasing, so keys should be lowercase.
 * @returns Lowercase text containing only word characters, apostrophes and single spaces
 *
 * NOTE(review): the final filter keeps only ASCII word characters, so letters
 * that NFD does not decompose (e.g. "ø") and non-Latin scripts are replaced by
 * spaces — confirm this is acceptable for the supported languages.
 */
private normalizeForMatching(text: string, characterReplacements?: Record<string, string>): string {
  let result = text
    // Split CamelCase FIRST (before lowercasing): "TheCorrespondent" → "The Correspondent"
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .toLowerCase();

  // Apply language-specific character replacements before NFD (e.g. ß→ss).
  // split/join performs a literal replace-all: a key containing regex
  // metacharacters (".", "?", "(" ...) can neither throw nor over-match, and
  // "$" sequences in the replacement value are not interpreted.
  if (characterReplacements) {
    for (const [from, to] of Object.entries(characterReplacements)) {
      if (from.length === 0) {
        continue; // an empty key would insert `to` between every character
      }
      result = result.split(from).join(to);
    }
  }

  return result
    // NFD normalization: decompose accented chars, then drop the combining marks
    // e.g. "über" → "uber", "señor" → "senor", "café" → "cafe"
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    // Replace underscores with spaces (must be explicit since \w includes _)
    .replace(/_/g, ' ')
    // Replace other punctuation/separators with spaces (preserves apostrophes in contractions)
    .replace(/[^\w\s']/g, ' ')
    // Collapse multiple spaces
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Score title/author match quality (60 points max).
 *   Title: 0-45 points (heavily weighted)
 *   Author: 0-15 points
 *
 * Stages:
 *   1.   Word coverage filter — at least 80% of the significant words of the
 *        required title portion must appear in the torrent title, else 0.
 *   1.5  Author presence check — only when requireAuthor is true; a release
 *        without a confident author match scores 0.
 *   2.   Title matching — 45 points for a complete title match (acceptable
 *        prefix and metadata-like suffix), otherwise fuzzy similarity × 45.
 *   3.   Author matching — proportional credit for exact substring matches,
 *        otherwise fuzzy similarity × 15.
 *
 * @param torrent - Release whose title is matched
 * @param audiobook - Requested title and author(s)
 * @param requireAuthor - Enforce the author presence check (default: true)
 * @param customStopWords - Language-specific stop words; English defaults when omitted
 * @param characterReplacements - Language-specific character replacements (e.g. ß→ss)
 * @returns 0-60 points
 */
private scoreMatch(
  torrent: TorrentResult,
  audiobook: AudiobookRequest,
  requireAuthor: boolean = true,
  customStopWords?: string[],
  characterReplacements?: Record<string, string>
): number {
  // Single normalization pipeline for every string compared below
  const normalize = (text: string): string => this.normalizeForMatching(text, characterReplacements);

  // Normalize for matching (handles CamelCase, punctuation separators, diacritics)
  const torrentTitle = normalize(torrent.title);
  const requestTitle = normalize(audiobook.title);

  // Parse authors from the RAW string first (preserving commas for splitting),
  // then normalize the individual names
  const requestAuthorRaw = audiobook.author.toLowerCase().replace(/\s+/g, ' ').trim();
  const normalizedAuthors = requestAuthorRaw
    .split(/,|&| and | - /)
    .map((a) => a.trim())
    .filter((a) => a.length > 2 && !['translator', 'narrator'].includes(a))
    .map((a) => normalize(a));
  // Combined normalized author string for fuzzy matching
  const requestAuthorNormalized = normalizedAuthors.join(' ');

  // ========== STAGE 1: WORD COVERAGE FILTER (MANDATORY) ==========
  // Use provided language-specific stop words, or fall back to English defaults
  const stopWords = customStopWords ?? ['the', 'a', 'an', 'of', 'on', 'in', 'at', 'by', 'for'];
  // Tokens are compared after normalization, so each stop word is registered in
  // its normalized form too; otherwise an accented stop word such as "für"
  // could never match its token "fur".
  const stopWordSet = new Set<string>();
  for (const word of stopWords) {
    stopWordSet.add(word);
    stopWordSet.add(normalize(word));
  }

  // Significant words of a text: normalized tokens minus stop words
  const extractWords = (text: string): string[] =>
    normalize(text)
      .split(/\s+/)
      .filter((word) => word.length > 0 && !stopWordSet.has(word));

  // Required portion of the requested title. Optional parts are dropped:
  //   "Title (Subtitle)"   → parenthesised/bracketed content may be omitted by uploaders
  //   "Title: Series Name" → Audible commonly appends series names after a colon
  // This runs on the ORIGINAL title because brackets and colons are removed by
  // normalization. The case is left untouched so the CamelCase split behaves
  // exactly as it does for requestTitle and the torrent title; lowercasing
  // first would turn "MacGuffin" into the single token "macguffin", which can
  // never match the torrent's "mac guffin".
  const extractRequiredTitle = (rawTitle: string): string => {
    let requiredRaw = rawTitle.replace(/[(\[{][^)\]}]+[)\]}]/g, ' ').trim();
    const colonIndex = requiredRaw.indexOf(':');
    if (colonIndex > 0 && colonIndex < requiredRaw.length - 1) {
      requiredRaw = requiredRaw.substring(0, colonIndex).trim();
    }
    return normalize(requiredRaw);
  };
  const requiredTitle = extractRequiredTitle(audiobook.title);

  // Coverage: how many REQUIRED words appear in the TORRENT title.
  // A title made only of stop words or optional content skips the filter.
  const requiredWords = extractWords(requiredTitle);
  const torrentWords = extractWords(torrentTitle);
  if (requiredWords.length > 0) {
    const matchedWords = requiredWords.filter((word) => torrentWords.includes(word));
    const coverage = matchedWords.length / requiredWords.length;
    // HARD REQUIREMENT: 80%+ coverage of the required words
    if (coverage < 0.80) {
      return 0;
    }
  }

  // ========== STAGE 1.5: AUTHOR PRESENCE CHECK (OPTIONAL) ==========
  // Only enforced in automatic mode (requireAuthor: true);
  // interactive search (requireAuthor: false) shows all results
  if (requireAuthor && !this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors)) {
    // No high-confidence author match → reject to prevent wrong-author matches
    return 0;
  }

  // ========== STAGE 2: TITLE MATCHING (0-45 points) ==========
  let titleScore = 0;
  // Original torrent title (lowercased only) for metadata marker detection:
  // markers like [ ] ( ) : are removed by normalization but needed below
  const torrentTitleOriginal = torrent.title.toLowerCase().replace(/\s+/g, ' ').trim();

  // Try the full title first, then the required-only version when it differs.
  // An empty required title is skipped: every string "contains" the empty
  // string, so it would otherwise count as a complete title match.
  const titlesToTry = [requestTitle];
  if (requiredTitle !== requestTitle && requiredTitle.length > 0) {
    titlesToTry.push(requiredTitle);
  }

  const metadataMarkers = [' by ', ' - ', ' [', ' (', ' {', ' :', ','];
  let bestMatch = false;

  for (const titleToMatch of titlesToTry) {
    const titleIndex = torrentTitle.indexOf(titleToMatch);
    if (titleIndex === -1) {
      continue;
    }

    // Found the title, but is it the complete title or part of a longer one?
    const beforeTitle = torrentTitle.substring(0, titleIndex);
    const afterTitle = torrentTitle.substring(titleIndex + titleToMatch.length);

    // Locate the title inside the ORIGINAL string via its significant words
    const titleWords = titleToMatch.split(/\s+/).filter((w) => w.length > 2);
    let afterTitleOriginal = '';
    let beforeTitleOriginal = '';
    if (titleWords.length > 0) {
      const lastTitleWord = titleWords[titleWords.length - 1];
      const lastWordIdxOriginal = torrentTitleOriginal.lastIndexOf(lastTitleWord);
      if (lastWordIdxOriginal !== -1) {
        afterTitleOriginal = torrentTitleOriginal.substring(lastWordIdxOriginal + lastTitleWord.length);
      }
      const firstTitleWord = titleWords[0];
      const firstWordIdxOriginal = torrentTitleOriginal.indexOf(firstTitleWord);
      if (firstWordIdxOriginal !== -1) {
        beforeTitleOriginal = torrentTitleOriginal.substring(0, firstWordIdxOriginal).trimEnd();
      }
    }

    // --- Suffix: the title must be followed by metadata, not by more title words ---
    // (rejects continuations such as "'s Secret" or " Is Watching")
    // Also accepts an author name directly after the title ("Title Author Year")
    const afterStartsWithAuthor = normalizedAuthors.some(
      (author) => author.length > 2 && afterTitle.trim().startsWith(author)
    );
    const hasMetadataSuffix =
      afterTitle === '' ||
      metadataMarkers.some((marker) => afterTitle.startsWith(marker)) ||
      metadataMarkers.some((marker) => afterTitleOriginal.startsWith(marker)) ||
      afterStartsWithAuthor;

    // --- Prefix: acceptable when there are no significant words before the title,
    //     the title follows a separator ("Author - Series - 01 - Title"),
    //     or an author name appears before it ("Author Name - Title") ---
    const hasNoWordsPrefix = extractWords(beforeTitle).length === 0;
    // Separators only survive in the original string; the normalized prefix
    // cannot contain "-", ":" or "—", so only the original is inspected.
    const titlePrecededBySeparator =
      beforeTitleOriginal.endsWith('-') ||
      beforeTitleOriginal.endsWith(':') ||
      beforeTitleOriginal.endsWith('—');
    const authorInPrefix = normalizedAuthors.some(
      (author) => author.length > 2 && beforeTitle.includes(author)
    );
    const hasAcceptablePrefix = hasNoWordsPrefix || titlePrecededBySeparator || authorInPrefix;

    if (hasAcceptablePrefix && hasMetadataSuffix) {
      // Complete title match → full points
      titleScore = 45;
      bestMatch = true;
      break;
    }
  }

  if (!bestMatch) {
    // No complete match found: best fuzzy similarity over the candidate titles
    const fuzzyScores = titlesToTry.map((title) => compareTwoStrings(title, torrentTitle));
    titleScore = Math.max(...fuzzyScores) * 45;
  }

  // ========== STAGE 3: AUTHOR MATCHING (0-15 points) ==========
  // Authors that appear in the torrent title as an exact substring
  const authorMatches = normalizedAuthors.filter((author) => torrentTitle.includes(author));
  let authorScore = 0;
  if (authorMatches.length > 0) {
    // Exact substring match → proportional credit
    authorScore = (authorMatches.length / normalizedAuthors.length) * 15;
  } else {
    // No exact match → fuzzy similarity for partial credit
    authorScore = compareTwoStrings(requestAuthorNormalized, torrentTitle) * 15;
  }

  return Math.min(60, titleScore + authorScore);
}
/**
 * Decide whether at least one requested author appears in the torrent title
 * with high confidence. Works on already parsed and normalized author names.
 *
 * An author counts as present when any of these holds:
 *   1. the normalized name is a substring of the title;
 *   2. the fuzzy similarity between name and whole title is at least 0.85;
 *   3. the name has two or more parts longer than one character, and the first
 *      occurrences of its first and last part in the title start at most 30
 *      characters apart (in either order, so "Sanderson, Brandon" still
 *      matches "Brandon Sanderson").
 *
 * @param torrentTitle - Normalized torrent title (already processed by normalizeForMatching)
 * @param normalizedAuthors - Array of normalized author names (roles already filtered)
 * @returns true if at least ONE author is present with high confidence
 */
private checkAuthorPresenceWithParsed(torrentTitle: string, normalizedAuthors: string[]): boolean {
  for (const author of normalizedAuthors) {
    // 1: exact substring
    if (torrentTitle.includes(author)) {
      return true;
    }

    // 2: near-identical strings
    if (compareTwoStrings(author, torrentTitle) >= 0.85) {
      return true;
    }

    // 3: first and last name both present and close together
    const nameParts = author.split(/\s+/).filter((part) => part.length > 1);
    if (nameParts.length < 2) {
      continue;
    }
    const firstPos = torrentTitle.indexOf(nameParts[0]);
    const lastPos = torrentTitle.indexOf(nameParts[nameParts.length - 1]);
    if (firstPos !== -1 && lastPos !== -1 && Math.abs(lastPos - firstPos) <= 30) {
      return true;
    }
  }

  return false;
}
/**
 * Check if an author is present in the torrent title with high confidence.
 *
 * Parses the raw author string the same way scoreMatch() does (lowercase,
 * collapse whitespace, split on ",", "&", " and ", " - ", drop the role words
 * "translator"/"narrator"), normalizes each name and delegates to
 * checkAuthorPresenceWithParsed().
 *
 * @param torrentTitle - Normalized torrent title (already processed by normalizeForMatching)
 * @param requestAuthor - Raw author string (will be parsed and normalized internally)
 * @param characterReplacements - Optional language-specific character replacements (e.g. ß→ss)
 * @returns true if at least ONE author is present with high confidence
 */
private checkAuthorPresence(torrentTitle: string, requestAuthor: string, characterReplacements?: Record<string, string>): boolean {
  // Lowercase and collapse whitespace BEFORE splitting so that " AND " and
  // "Translator" are recognised exactly as they are in scoreMatch()
  const authors = requestAuthor
    .toLowerCase()
    .replace(/\s+/g, ' ')
    .trim()
    .split(/,|&| and | - /)
    .map((a) => a.trim())
    .filter((a) => a.length > 2 && !['translator', 'narrator'].includes(a));

  // Normalize each author for matching
  const normalizedAuthors = authors.map((a) => this.normalizeForMatching(a, characterReplacements));

  return this.checkAuthorPresenceWithParsed(torrentTitle, normalizedAuthors);
}
/**
 * Determine the audio format of a release.
 *
 * An explicit `format` on the result wins. Otherwise the upper-cased title is
 * searched for the keywords M4B, M4A, MP3 and FLAC, in that order, and the
 * first keyword found is returned. Falls back to 'OTHER'.
 */
private detectFormat(torrent: TorrentResult): 'M4B' | 'M4A' | 'MP3' | 'FLAC' | 'OTHER' {
  if (torrent.format) {
    return torrent.format;
  }

  const upperTitle = torrent.title.toUpperCase();
  // Order matters: earlier keywords take precedence when several are present
  const keywords = ['M4B', 'M4A', 'MP3', 'FLAC'] as const;
  const found = keywords.find((keyword) => upperTitle.includes(keyword));

  return found ?? 'OTHER';
}
/**
 * Build the human-readable remarks that accompany an audiobook score.
 *
 * Notes are appended in a fixed order: format, size quality (only when a
 * positive runtime is known), seeders (only when a count exists), title/author
 * match, and finally an overall assessment based on the total base score.
 *
 * @param torrent - Release being described
 * @param breakdown - Component scores; only matchScore and totalScore are read
 * @param runtimeMinutes - Book runtime, used for the MB/min quality note
 * @returns Notes in the order described above
 */
private generateNotes(
  torrent: TorrentResult,
  breakdown: ScoreBreakdown,
  runtimeMinutes?: number
): string[] {
  const notes: string[] = [];

  // --- Format ---
  switch (this.detectFormat(torrent)) {
    case 'M4B':
      notes.push('Excellent format (M4B)');
      if (torrent.hasChapters !== false) {
        notes.push('Has chapter markers');
      }
      break;
    case 'FLAC':
      notes.push('Lossless format (FLAC)');
      break;
    case 'M4A':
      notes.push('Good format (M4A)');
      break;
    case 'MP3':
      notes.push('Acceptable format (MP3)');
      break;
    default:
      notes.push('Unknown or uncommon format');
  }

  // --- Size quality (MB per minute of runtime) ---
  if (runtimeMinutes && runtimeMinutes > 0) {
    const ratio = torrent.size / (1024 * 1024) / runtimeMinutes;
    // Thresholds are checked from highest to lowest; the first one reached wins
    const qualityTiers: Array<[number, string]> = [
      [1.5, '✓ Premium quality (high bitrate)'],
      [1.0, '✓ High quality'],
      [0.5, 'Standard quality'],
      [0.3, '⚠️ Low quality (low bitrate)'],
    ];
    const tier = qualityTiers.find(([minimum]) => ratio >= minimum);
    notes.push(tier ? tier[1] : '⚠️ Very low quality - may be ebook');
  }

  // --- Seeders (NZB/Usenet results carry no count and get no note) ---
  const seeders = torrent.seeders;
  if (seeders !== undefined && seeders !== null && !isNaN(seeders)) {
    if (seeders === 0) {
      notes.push('⚠️ No seeders available');
    } else if (seeders < 5) {
      notes.push(`Low seeders (${seeders})`);
    } else if (seeders >= 50) {
      notes.push(`Excellent availability (${seeders} seeders)`);
    }
  }

  // --- Title/author match (out of 60); scores from 42 up to 54 get no note ---
  const match = breakdown.matchScore;
  if (match < 24) {
    notes.push('⚠️ Poor title/author match');
  } else if (match < 42) {
    notes.push('⚠️ Weak title/author match');
  } else if (match >= 54) {
    notes.push('✓ Excellent title/author match');
  }

  // --- Overall assessment; totals from 35 up to 55 get no note ---
  const total = breakdown.totalScore;
  if (total >= 75) {
    notes.push('✓ Excellent choice');
  } else if (total >= 55) {
    notes.push('✓ Good choice');
  } else if (total < 35) {
    notes.push('⚠️ Consider reviewing this choice');
  }

  return notes;
}
// =========================================================================
// EBOOK TORRENT RANKING (for indexer results)
// Reuses scoreMatch() and scoreSeeders() from audiobook ranking
// Uses ebook-specific format and size scoring
// =========================================================================
/**
 * Score every candidate ebook release from the indexers and return the
 * candidates ordered best-first.
 *
 * Releases larger than 20 MB are discarded before scoring. Title/author
 * matching and seeder scoring are shared with audiobook ranking; format
 * scoring (10 points for the preferred format, 0 otherwise) and size scoring
 * (smaller is better) are ebook-specific. Bonus points work as for
 * audiobooks: a percentage of the base score for the indexer priority and for
 * each configured flag the release carries.
 *
 * @param torrents - Array of torrent results from Prowlarr
 * @param ebook - Ebook request details (title, author, preferredFormat)
 * @param options - Optional configuration for ranking behavior
 * @returns Ranked results sorted by finalScore (ties: newest publishDate first) with 1-based ranks
 */
rankEbookTorrents(
  torrents: TorrentResult[],
  ebook: EbookTorrentRequest,
  options: RankEbookTorrentsOptions = {}
): RankedEbookTorrent[] {
  const BYTES_PER_MB = 1024 * 1024;
  // Safe default: the author must be present unless the caller opts out
  const { requireAuthor = true } = options;

  const ranked: RankedEbookTorrent[] = [];

  for (const torrent of torrents) {
    // Keep only releases of at most 20 MB (a NaN size is dropped as well)
    if (!(torrent.size / BYTES_PER_MB <= 20)) {
      continue;
    }

    const detectedFormat = this.detectEbookFormat(torrent);

    // Base score components (0-100 in total)
    const formatScore = this.scoreEbookFormat(torrent, ebook.preferredFormat);
    const sizeScore = this.scoreEbookSize(torrent);
    const seederScore = this.scoreSeeders(torrent.seeders);
    const matchScore = this.scoreMatch(
      torrent,
      { title: ebook.title, author: ebook.author },
      requireAuthor,
      options.stopWords,
      options.characterReplacements
    );
    const baseScore = formatScore + sizeScore + seederScore + matchScore;

    const bonusModifiers: BonusModifier[] = [];

    // Indexer priority bonus: priority 1-25 maps to 4%-100% (default 10/25 = 40%)
    if (torrent.indexerId !== undefined) {
      const priority = options.indexerPriorities?.get(torrent.indexerId) ?? 10;
      const share = priority / 25;
      bonusModifiers.push({
        type: 'indexer_priority',
        value: share,
        points: baseScore * share,
        reason: `Indexer priority ${priority}/25 (${Math.round(share * 100)}%)`,
      });
    }

    // Flag bonuses/penalties: names compared case-insensitively, ignoring surrounding whitespace
    const configs = options.flagConfigs;
    if (torrent.flags && torrent.flags.length > 0 && configs && configs.length > 0) {
      for (const flag of torrent.flags) {
        const wanted = flag.trim().toLowerCase();
        const config = configs.find((cfg) => cfg.name.trim().toLowerCase() === wanted);
        if (!config) {
          continue;
        }
        const share = config.modifier / 100;
        bonusModifiers.push({
          type: 'indexer_flag',
          value: share,
          points: baseScore * share,
          reason: `Flag "${flag}" (${config.modifier > 0 ? '+' : ''}${config.modifier}%)`,
        });
      }
    }

    let bonusPoints = 0;
    for (const modifier of bonusModifiers) {
      bonusPoints += modifier.points;
    }

    const scores: EbookScoreBreakdown = {
      formatScore,
      sizeScore,
      seederScore,
      matchScore,
      totalScore: baseScore,
      notes: [],
    };
    const notes = this.generateEbookNotes(torrent, scores, ebook.preferredFormat);

    ranked.push({
      ...torrent,
      score: baseScore,
      bonusModifiers,
      bonusPoints,
      finalScore: baseScore + bonusPoints,
      rank: 0, // Placeholder until the list is sorted
      breakdown: { ...scores, notes },
      ebookFormat: detectedFormat !== 'unknown' ? detectedFormat : undefined,
    });
  }

  // Best final score first; equal scores fall back to the newest publish date
  ranked.sort((left, right) => {
    if (right.finalScore !== left.finalScore) {
      return right.finalScore - left.finalScore;
    }
    return right.publishDate.getTime() - left.publishDate.getTime();
  });

  for (let position = 0; position < ranked.length; position++) {
    ranked[position].rank = position + 1;
  }

  return ranked;
}
/**
 * Score ebook format (10 points max).
 * All-or-nothing: 10 when the format detected from the title equals the
 * preferred format (compared in lowercase), otherwise 0.
 */
private scoreEbookFormat(torrent: TorrentResult, preferredFormat: string): number {
  const wanted = preferredFormat.toLowerCase();
  return this.detectEbookFormat(torrent) === wanted ? 10 : 0;
}
/**
 * Score ebook file size (15 points max, inverted - smaller is better).
 *   under 5 MB          → 15 (optimal size for ebooks)
 *   5 MB up to 15 MB    → 10 (acceptable, may have images)
 *   above 15 up to 20   →  5 (large but within limit)
 *   above 20 MB         →  0 (should already have been filtered out)
 */
private scoreEbookSize(torrent: TorrentResult): number {
  const megabytes = torrent.size / (1024 * 1024);

  if (megabytes < 5) {
    return 15;
  }
  if (megabytes <= 15) {
    return 10;
  }
  // Safety net: anything beyond the 20 MB limit (or an invalid size) scores 0
  return megabytes <= 20 ? 5 : 0;
}
/**
 * Detect the ebook format advertised in a torrent title.
 *
 * The title is lower-cased and each known format is tried in list order
 * (`azw3` before `azw` so the longer keyword wins). A format is recognised as:
 *   - `(format)` or `[format]` anywhere in the title,
 *   - the title ending with the format keyword,
 *   - `.format`, ` format` or `_format` where the keyword is NOT immediately
 *     followed by another letter.
 *
 * The "not followed by a letter" rule prevents ordinary words from being
 * mistaken for a format, e.g. " mobile" must not be reported as `mobi`.
 * Digits and punctuation after the keyword are still accepted, so titles
 * such as "Book EPUB3" or "Book.epub-GROUP" keep matching.
 *
 * @param torrent - Search result whose `title` is inspected
 * @returns The matched format keyword, or `'unknown'` when none is found
 */
private detectEbookFormat(torrent: TorrentResult): string {
  const title = torrent.title.toLowerCase();
  const formats = ['epub', 'pdf', 'mobi', 'azw3', 'azw', 'fb2', 'cbz', 'cbr'];
  // Prefixes whose match has no closing delimiter and therefore needs a
  // trailing boundary check.
  const openPrefixes = ['.', ' ', '_'];

  // True when `needle` occurs in the title at least once without a letter
  // directly after it (end of title counts as a boundary).
  const occursAsToken = (needle: string): boolean => {
    let at = title.indexOf(needle);
    while (at !== -1) {
      const following = title.charAt(at + needle.length);
      if (following === '' || following < 'a' || following > 'z') {
        return true;
      }
      at = title.indexOf(needle, at + 1);
    }
    return false;
  };

  for (const format of formats) {
    const explicitlyDelimited =
      title.includes(`(${format})`) || // (epub)
      title.includes(`[${format}]`) || // [epub]
      title.endsWith(format); // ends with format

    if (explicitlyDelimited || openPrefixes.some((prefix) => occursAsToken(prefix + format))) {
      return format;
    }
  }

  // Default to unknown
  return 'unknown';
}
/**
 * Build the human-readable notes attached to a ranked ebook torrent.
 *
 * Notes are appended in a fixed order: format, file size, seeders,
 * title/author match, and finally an overall verdict based on the total score.
 * A category contributes no note when its value falls outside every
 * threshold band.
 */
private generateEbookNotes(
  torrent: TorrentResult,
  breakdown: EbookScoreBreakdown,
  preferredFormat: string
): string[] {
  const messages: string[] = [];

  // Format: a formatScore of 10 means the preferred format was matched
  const format = this.detectEbookFormat(torrent);
  if (breakdown.formatScore === 10) {
    messages.push(`✓ Preferred format (${format.toUpperCase()})`);
  } else if (format === 'unknown') {
    messages.push('⚠️ Unknown format');
  } else {
    messages.push(`Different format (${format.toUpperCase()}, wanted ${preferredFormat.toUpperCase()})`);
  }

  // File size: nothing is reported above 20 MB
  const megabytes = torrent.size / (1024 * 1024);
  const sizeMessage =
    megabytes < 5
      ? '✓ Optimal file size'
      : megabytes <= 15
        ? 'Good file size (may have images)'
        : megabytes <= 20
          ? '⚠️ Large file size'
          : undefined;
  if (sizeMessage !== undefined) {
    messages.push(sizeMessage);
  }

  // Seeders (same logic as audiobooks); skipped when the count is missing or NaN
  const seeders = torrent.seeders;
  if (seeders != null && !isNaN(seeders)) {
    if (seeders === 0) {
      messages.push('⚠️ No seeders available');
    } else if (seeders < 5) {
      messages.push(`Low seeders (${seeders})`);
    } else if (seeders >= 50) {
      messages.push(`Excellent availability (${seeders} seeders)`);
    }
  }

  // Title/author match (same thresholds as audiobooks)
  const match = breakdown.matchScore;
  if (match >= 54) {
    messages.push('✓ Excellent title/author match');
  } else if (match < 24) {
    messages.push('⚠️ Poor title/author match');
  } else if (match < 42) {
    messages.push('⚠️ Weak title/author match');
  }

  // Overall quality assessment
  const total = breakdown.totalScore;
  if (total >= 75) {
    messages.push('✓ Excellent choice');
  } else if (total >= 55) {
    messages.push('✓ Good choice');
  } else if (total < 35) {
    messages.push('⚠️ Consider reviewing this choice');
  }

  return messages;
}
}
// =========================================================================
// EBOOK RANKING (simplified algorithm for ebook search results)
// =========================================================================
/**
 * One ebook search result, as consumed by `rankEbooks`.
 */
export interface EbookResult {
md5: string;
title: string;
author: string;
format: string; // epub, pdf, mobi, etc.
fileSize?: number; // in bytes
downloadUrls: string[];
source: 'annas_archive' | 'prowlarr'; // Source of the result
indexerId?: number; // Prowlarr indexer ID (if applicable)
}
/**
 * Describes the ebook being searched for, as passed to `rankEbooks`.
 */
export interface EbookRequest {
title: string;
author: string;
preferredFormat: string; // User's preferred format (epub, pdf, etc.)
}
/**
 * An `EbookResult` annotated with the outcome of `rankEbooks`:
 * the total score, the position after sorting, and the per-category breakdown.
 */
export interface RankedEbook extends EbookResult {
score: number; // Total score (0-100)
rank: number;
breakdown: {
formatScore: number; // 0-40 points
sizeScore: number; // 0-30 points (inverted - smaller is better)
sourceScore: number; // 0-30 points (Anna's Archive priority)
notes: string[];
};
}
/**
 * Rank ebook search results.
 *
 * Each result receives up to 100 points from three independent categories:
 *   - Format (0-40): exact match with the preferred format, then same format
 *     family, then EPUB, then PDF, then anything else.
 *   - Size (0-30): smaller files score higher; an unknown size gets a middle score.
 *   - Source (0-30): Anna's Archive scores above Prowlarr.
 *
 * Results are returned sorted by total score (highest first) with 1-based ranks.
 */
export function rankEbooks(
  results: EbookResult[],
  request: EbookRequest
): RankedEbook[] {
  const wantedFormat = request.preferredFormat.toLowerCase();

  // Families of formats considered close enough for partial credit
  const formatFamilies: string[][] = [
    ['epub', 'kepub'], // EPUB family
    ['mobi', 'azw', 'azw3'], // Kindle family
    ['pdf'], // PDF standalone
    ['fb2', 'fb2.zip'], // FB2 family
    ['cbz', 'cbr'], // Comic formats
  ];
  const familyOf = (fmt: string): string[] | undefined =>
    formatFamilies.find((family) => family.includes(fmt));
  const wantedFamily = familyOf(wantedFormat);

  // Upper size bounds (in MB) checked in ascending order
  const sizeTiers: Array<{ maxMB: number; points: number; note: string }> = [
    { maxMB: 2, points: 30, note: '✓ Optimal file size' },
    { maxMB: 5, points: 25, note: 'Good file size' },
    { maxMB: 15, points: 20, note: 'Larger file (may have images)' },
    { maxMB: 50, points: 10, note: '⚠️ Large file size' },
  ];

  // Format category (0-40 points)
  const scoreFormat = (rawFormat: string): [number, string] => {
    const fmt = rawFormat.toLowerCase();
    const label = rawFormat.toUpperCase();
    if (fmt === wantedFormat) {
      return [40, `✓ Preferred format (${label})`];
    }
    const family = familyOf(fmt);
    if (wantedFamily && family && wantedFamily === family) {
      return [30, `Similar format (${label})`];
    }
    if (fmt === 'epub') {
      // EPUB is universally convertible
      return [25, `Convertible format (${label})`];
    }
    if (fmt === 'pdf') {
      // PDF is common but less flexible
      return [15, `PDF format (less flexible)`];
    }
    return [10, `Different format (${label})`];
  };

  // Size category (0-30 points, smaller is better)
  const scoreSize = (bytes: number | undefined): [number, string] => {
    if (bytes === undefined || !(bytes > 0)) {
      // No size info - give middle score
      return [15, 'File size unknown'];
    }
    const megabytes = bytes / (1024 * 1024);
    for (const tier of sizeTiers) {
      if (megabytes <= tier.maxMB) {
        return [tier.points, tier.note];
      }
    }
    return [5, '⚠️ Very large file (may include extras)'];
  };

  // Source category (0-30 points); unrecognised sources get 0 and no note
  const scoreSource = (source: string): [number, string | undefined] => {
    switch (source) {
      case 'annas_archive':
        return [30, '✓ Anna\'s Archive (reliable)'];
      case 'prowlarr':
        return [15, 'Prowlarr indexer'];
      default:
        return [0, undefined];
    }
  };

  const ranked: RankedEbook[] = results.map((result) => {
    const [formatScore, formatNote] = scoreFormat(result.format);
    const [sizeScore, sizeNote] = scoreSize(result.fileSize);
    const [sourceScore, sourceNote] = scoreSource(result.source);

    const notes: string[] = [formatNote, sizeNote];
    if (sourceNote !== undefined) {
      notes.push(sourceNote);
    }

    return {
      ...result,
      score: formatScore + sizeScore + sourceScore,
      rank: 0, // Assigned once the list is sorted
      breakdown: { formatScore, sizeScore, sourceScore, notes },
    };
  });

  // Highest score first
  ranked.sort((left, right) => right.score - left.score);

  // Ranks are 1-based positions in the sorted list
  for (let position = 0; position < ranked.length; position++) {
    ranked[position].rank = position + 1;
  }

  return ranked;
}
// Module-wide ranking algorithm instance, constructed on first use
let ranker: RankingAlgorithm | null = null;

/**
 * Return the shared `RankingAlgorithm`, creating it on the first call.
 */
export function getRankingAlgorithm(): RankingAlgorithm {
  if (ranker === null) {
    ranker = new RankingAlgorithm();
  }
  return ranker;
}
/**
 * Helper function to rank torrents using the singleton instance
 *
 * @param torrents - Array of torrent results to rank
 * @param audiobook - Audiobook request details
 * @param options - Optional ranking configuration
 * @returns Ranked torrents with quality scores
 */
export function rankTorrents(
  torrents: TorrentResult[],
  audiobook: AudiobookRequest,
  options?: RankTorrentsOptions
): (RankedTorrent & { qualityScore: number })[];
/**
 * Helper function to rank torrents using the singleton instance (legacy signature)
 * @deprecated Use options object instead
 */
export function rankTorrents(
  torrents: TorrentResult[],
  audiobook: AudiobookRequest,
  indexerPriorities?: Map<number, number>,
  flagConfigs?: IndexerFlagConfig[]
): (RankedTorrent & { qualityScore: number })[];
/**
 * Shared implementation behind both overloads.
 *
 * A call is treated as legacy when the third argument is a `Map`, or when it
 * is omitted (`undefined`/`null`) but `flagConfigs` is supplied. The second
 * case matters because the legacy signature marks `indexerPriorities` as
 * optional: `rankTorrents(t, a, undefined, flags)` must still honour `flags`
 * instead of silently discarding them. Legacy calls always set
 * `requireAuthor: true`.
 *
 * The returned entries carry `qualityScore`, the rounded `score`, for UI
 * compatibility.
 */
export function rankTorrents(
  torrents: TorrentResult[],
  audiobook: AudiobookRequest,
  optionsOrPriorities?: RankTorrentsOptions | Map<number, number>,
  flagConfigs?: IndexerFlagConfig[]
): (RankedTorrent & { qualityScore: number })[] {
  const algorithm = getRankingAlgorithm();

  // Handle both new options object and legacy parameters
  let options: RankTorrentsOptions;
  if (optionsOrPriorities instanceof Map) {
    // Legacy call: rankTorrents(torrents, audiobook, priorities, flags)
    options = {
      indexerPriorities: optionsOrPriorities,
      flagConfigs,
      requireAuthor: true, // Safe default
    };
  } else if (optionsOrPriorities == null && flagConfigs !== undefined) {
    // Legacy call without priorities: rankTorrents(torrents, audiobook, undefined, flags)
    options = {
      flagConfigs,
      requireAuthor: true, // Safe default
    };
  } else {
    // New call: rankTorrents(torrents, audiobook, options)
    options = optionsOrPriorities || {};
  }

  const ranked = algorithm.rankTorrents(torrents, audiobook, options);

  // Add qualityScore field for UI compatibility (rounded score)
  return ranked.map((r) => ({
    ...r,
    qualityScore: Math.round(r.score),
  }));
}
/**
 * Rank ebook torrents through the shared `RankingAlgorithm` instance.
 *
 * @param torrents - Array of torrent results from Prowlarr
 * @param ebook - Ebook request details (title, author, preferredFormat)
 * @param options - Optional ranking configuration; an empty object is used when omitted
 * @returns The ranked torrents, each extended with `qualityScore` (the rounded `score`)
 */
export function rankEbookTorrents(
  torrents: TorrentResult[],
  ebook: EbookTorrentRequest,
  options?: RankEbookTorrentsOptions
): (RankedEbookTorrent & { qualityScore: number })[] {
  const scored = getRankingAlgorithm().rankEbookTorrents(torrents, ebook, options || {});

  // qualityScore is kept for UI compatibility
  const withQuality: (RankedEbookTorrent & { qualityScore: number })[] = [];
  for (const entry of scored) {
    withQuality.push({ ...entry, qualityScore: Math.round(entry.score) });
  }
  return withQuality;
}