Files
ReadMeABook/src/lib/services/ebook-scraper.ts
T
kikootwo 09e1a0db3a Use .gl for Anna's Archive; add manual-import test
Replace default Anna's Archive base URL from https://annas-archive.li to https://annas-archive.gl across docs, UI components, API routes, processors, services, and tests. Add comprehensive tests for the admin manual-import API route and enhance the manual-import route to fetch missing ASIN details from Audnexus and create audiobook records with proper error handling and logging. Update related test expectations and FlareSolverr test usages to reflect the new default URL.
2026-03-05 12:20:00 -05:00

772 lines
23 KiB
TypeScript

/**
* Component: E-book Sidecar Service
* Documentation: documentation/integrations/ebook-sidecar.md
*/
import axios, { AxiosError } from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { RMABLogger } from '../utils/logger';
// Module-level logger (renamed to avoid shadowing function parameter 'logger')
// Used for this module's internal diagnostics (debug/warn). It is separate from
// the optional, caller-supplied `logger` parameter that the functions below accept.
const moduleLogger = RMABLogger.create('EbookScraper');
/**
* Outcome of a downloadEbook() call.
*/
export interface EbookDownloadResult {
// True when the e-book is available at filePath (freshly downloaded or already on disk)
success: boolean;
// Full path of the e-book file; set by downloadEbook() when success is true
filePath?: string;
// File extension detected from the download URL (e.g. 'epub'); set when success is true
format?: string;
// Human-readable failure reason; set by downloadEbook() when success is false
error?: string;
}
// User-Agent header sent with direct page requests and file downloads
const USER_AGENT = 'ReadMeABook/1.0 (Audiobook Automation)';
const REQUEST_DELAY_MS = 1500; // 1.5 second delay between requests
const DOWNLOAD_TIMEOUT_MS = 60000; // 60 seconds per download attempt
// Upper bound on how many slow-download links downloadEbook() tries for one book
const MAX_SLOW_LINK_ATTEMPTS = 5;
// Default number of attempts made by retryRequest()
const MAX_RETRIES = 3;
const FLARESOLVERR_TIMEOUT_MS = 60000; // 60 seconds for FlareSolverr requests
// In-memory cache for MD5 lookups (prevents re-scraping same ASIN)
// Keys are built by searchByAsin() / searchByTitle() from the search inputs;
// a null value records a lookup that yielded no MD5.
const md5Cache = new Map<string, string | null>();
// FlareSolverr types
/**
* Body POSTed to the FlareSolverr `/v1` endpoint by fetchViaFlareSolverr().
*/
interface FlareSolverrRequest {
// Only GET requests are issued by this module
cmd: 'request.get';
// Page that FlareSolverr should fetch
url: string;
// Timeout passed to FlareSolverr, in milliseconds
maxTimeout: number;
}
/**
* Response envelope read by fetchViaFlareSolverr().
*/
interface FlareSolverrResponse {
// Anything other than 'ok' is treated as a failure
status: 'ok' | 'error';
// Included in the thrown error text when the request fails
message: string;
// Absent solution is treated as a failure even when status is 'ok'
solution?: {
url: string;
// HTTP status of the fetched page; values >= 400 are treated as failures
status: number;
headers: Record<string, string>;
// Body of the fetched page (the HTML returned to callers)
response: string;
// NOTE(review): cookies and userAgent are declared but not read in this module
cookies: Array<{ name: string; value: string }>;
userAgent: string;
};
}
/**
 * Fetch HTML via FlareSolverr proxy (bypasses Cloudflare)
 *
 * @param targetUrl - Page to fetch through the proxy.
 * @param flaresolverrUrl - Base URL of the FlareSolverr instance. Trailing
 *   slashes are tolerated ("http://host:8191" and "http://host:8191/" are equivalent).
 * @param timeout - Sent to FlareSolverr as `maxTimeout` (milliseconds). The HTTP
 *   call to FlareSolverr itself is allowed 5 seconds more than this.
 * @returns The body of the fetched page.
 * @throws Error when FlareSolverr reports a non-ok status, returns no solution,
 *   or the fetched page has an HTTP status >= 400. Errors from axios propagate unchanged.
 */
async function fetchViaFlareSolverr(
  targetUrl: string,
  flaresolverrUrl: string,
  timeout: number = FLARESOLVERR_TIMEOUT_MS
): Promise<string> {
  const requestBody: FlareSolverrRequest = {
    cmd: 'request.get',
    url: targetUrl,
    maxTimeout: timeout,
  };

  // A configured URL ending in "/" would otherwise yield ".../​/v1", which is a
  // different path. Strip trailing slashes before appending the endpoint.
  const endpoint = `${flaresolverrUrl.replace(/\/+$/, '')}/v1`;

  const response = await axios.post<FlareSolverrResponse>(endpoint, requestBody, {
    headers: { 'Content-Type': 'application/json' },
    timeout: timeout + 5000, // Extra buffer for FlareSolverr processing
  });

  const { status, message, solution } = response.data;
  if (status !== 'ok' || !solution) {
    throw new Error(`FlareSolverr error: ${message}`);
  }
  if (solution.status >= 400) {
    throw new Error(`FlareSolverr returned HTTP ${solution.status}`);
  }
  return solution.response;
}
/**
 * Fetch a page's HTML, preferring FlareSolverr when one is configured.
 *
 * If `flaresolverrUrl` is given, the page is requested through it first; any
 * failure there is reported via `logger` and the function falls back to a
 * direct axios GET (wrapped in retryRequest).
 *
 * @param url - Page to fetch.
 * @param flaresolverrUrl - Optional FlareSolverr base URL.
 * @param logger - Optional caller logger that receives the fallback warning.
 * @returns The response body.
 * @throws Whatever the direct request throws once retries are exhausted.
 */
async function fetchHtml(
  url: string,
  flaresolverrUrl?: string,
  logger?: RMABLogger
): Promise<string> {
  if (flaresolverrUrl) {
    try {
      moduleLogger.debug(`Using FlareSolverr for: ${url}`);
      const proxiedHtml = await fetchViaFlareSolverr(url, flaresolverrUrl);
      moduleLogger.debug(`FlareSolverr returned HTML length: ${proxiedHtml.length}`);
      return proxiedHtml;
    } catch (error) {
      const reason = error instanceof Error ? error.message : 'Unknown error';
      await logger?.warn(`FlareSolverr failed, falling back to direct request: ${reason}`);
      moduleLogger.debug('FlareSolverr error', {
        error: error instanceof Error ? error.message : String(error),
      });
      // Not rethrown: the direct request below is the fallback
    }
  }

  // Direct request (may be blocked by Cloudflare protection)
  moduleLogger.debug(`Using direct request for: ${url}`);
  const sendDirect = () =>
    axios.get(url, {
      headers: { 'User-Agent': USER_AGENT },
      timeout: 30000,
    });
  const directResponse = await retryRequest(sendDirect);
  moduleLogger.debug(`Direct request returned data length: ${directResponse.data?.length || 0}`);
  return directResponse.data;
}
/**
 * Check that a FlareSolverr instance can fetch the Anna's Archive home page.
 *
 * The response counts as valid when it is longer than 1000 characters and
 * contains the text "Anna". Never throws: failures are reported in the result.
 *
 * @param flaresolverrUrl - Base URL of the FlareSolverr instance to test.
 * @param baseUrl - Anna's Archive base URL used as the probe target.
 * @returns `success`, a message, and (on success only) the elapsed time in ms.
 */
export async function testFlareSolverrConnection(
  flaresolverrUrl: string,
  baseUrl: string = 'https://annas-archive.gl'
): Promise<{ success: boolean; message: string; responseTime?: number }> {
  const startedAt = Date.now();
  try {
    // Probe the site root; make sure the URL ends with exactly the slash we add
    const probeUrl = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
    const html = await fetchViaFlareSolverr(probeUrl, flaresolverrUrl, 30000);
    const responseTime = Date.now() - startedAt;

    const looksValid = html && html.includes('Anna') && html.length > 1000;
    if (!looksValid) {
      return {
        success: false,
        message: 'FlareSolverr returned invalid response',
      };
    }
    return {
      success: true,
      message: `Connection successful (${responseTime}ms)`,
      responseTime,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    return { success: false, message };
  }
}
/**
 * Main entry point: find an e-book on Anna's Archive and download it.
 *
 * Flow: ASIN search (when an ASIN is given) → title + author search as
 * fallback → collect "no waitlist" slow-download links → try up to
 * MAX_SLOW_LINK_ATTEMPTS of them until one yields a file. The filename is built
 * from the format detected in the download URL, not from `preferredFormat`.
 * If a file with that name already exists in `targetDir`, it is reported as a
 * success without downloading.
 *
 * Never throws: every failure is returned as `{ success: false, error }`.
 *
 * @param asin - ASIN to search for; a falsy value skips the ASIN search.
 * @param title - Book title (fallback search and filename).
 * @param author - Book author (fallback search and filename).
 * @param targetDir - Directory the file is written to.
 * @param preferredFormat - Format filter passed to the searches and URL extraction.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional logger for progress messages.
 * @param flaresolverrUrl - Optional FlareSolverr base URL for page fetches.
 * @param languageCode - Language filter passed to the searches.
 */
export async function downloadEbook(
  asin: string,
  title: string,
  author: string,
  targetDir: string,
  preferredFormat: string = 'epub',
  baseUrl: string = 'https://annas-archive.gl',
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<EbookDownloadResult> {
  const failure = (error: string): EbookDownloadResult => ({ success: false, error });

  try {
    if (flaresolverrUrl) {
      await logger?.info(`Using FlareSolverr at ${flaresolverrUrl}`);
    }

    // Step 1: exact match by ASIN
    let md5: string | null = null;
    if (asin) {
      await logger?.info(`Searching by ASIN: ${asin} (format: ${preferredFormat})...`);
      md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
      await logger?.info(
        md5 ? `Found via ASIN: ${md5}` : `No results for ASIN, falling back to title + author search...`
      );
    }

    // Step 2: fall back to title + author
    if (!md5) {
      await logger?.info(`Searching by title + author: "${title}" by ${author}...`);
      md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);
      if (md5) {
        await logger?.info(`Found via title search: ${md5}`);
      }
    }

    if (!md5) {
      return failure('No search results found (tried ASIN and title+author)');
    }
    await logger?.info(`Found MD5: ${md5}`);

    // Step 3: slow download links (no waitlist only)
    const slowLinks = await getSlowDownloadLinks(md5, baseUrl, logger, flaresolverrUrl);
    if (slowLinks.length === 0) {
      return failure('No download links available');
    }
    await logger?.info(`Found ${slowLinks.length} download link(s)`);

    // Steps 4 & 5: work through the links until one produces a file
    const attemptsLimit = Math.min(slowLinks.length, MAX_SLOW_LINK_ATTEMPTS);
    for (const [index, slowLink] of slowLinks.slice(0, attemptsLimit).entries()) {
      const attemptNo = index + 1;
      await logger?.info(`Attempting download link ${attemptNo}/${attemptsLimit}...`);

      try {
        const extracted = await extractDownloadUrl(
          slowLink,
          baseUrl,
          preferredFormat,
          logger,
          flaresolverrUrl
        );
        if (!extracted) {
          await logger?.warn(`No download URL found on page ${attemptNo}`);
          await delay(REQUEST_DELAY_MS);
          continue;
        }

        // The real format is only known now, so the filename is derived here
        const actualFormat = extracted.format;
        const sanitizedFilename = sanitizeEbookFilename(title, author, actualFormat);
        const targetPath = path.join(targetDir, sanitizedFilename);
        const completed: EbookDownloadResult = {
          success: true,
          filePath: targetPath,
          format: actualFormat,
        };

        // Skip the download when the file is already present
        try {
          await fs.access(targetPath);
          await logger?.info(`E-book already exists: ${sanitizedFilename}`);
          return completed;
        } catch {
          // Not present yet: carry on and download it
        }

        await logger?.info(`Downloading from: ${new URL(extracted.url).host} (format: ${actualFormat})`);

        // File servers are fetched directly; FlareSolverr is not involved
        const downloaded = await downloadFile(extracted.url, targetPath, logger);
        if (downloaded) {
          await logger?.info(`E-book downloaded successfully: ${sanitizedFilename}`);
          return completed;
        }

        await logger?.warn(`Download attempt ${attemptNo} failed`);
        await delay(REQUEST_DELAY_MS);
      } catch (error) {
        await logger?.warn(
          `Download link ${attemptNo} error: ${error instanceof Error ? error.message : 'Unknown'}`
        );
        await delay(REQUEST_DELAY_MS);
      }
    }

    return failure(`All ${attemptsLimit} download attempts failed`);
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : 'Unknown error';
    await logger?.error(`E-book download error: ${errorMsg}`);
    return failure(errorMsg);
  }
}
/**
 * Step 1: Search Anna's Archive by ASIN and extract MD5 hash
 * Exported for use by search-ebook processor
 *
 * Results are cached per (asin, format, languageCode). A page that was fetched
 * successfully but contained no usable result is cached as a miss. A failed
 * fetch is NOT cached, so a transient network or Cloudflare error does not make
 * the ASIN look "not found" for the rest of the process lifetime.
 *
 * @param asin - ASIN to search for.
 * @param format - File extension filter; empty or 'any' disables the filter.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional logger for progress and errors.
 * @param flaresolverrUrl - Optional FlareSolverr base URL for the page fetch.
 * @param languageCode - Language filter for the search.
 * @returns MD5 of the first search result, or null when none was found or the
 *   search failed. Never throws.
 */
export async function searchByAsin(
  asin: string,
  format: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<string | null> {
  // Check cache first
  const cacheKey = `${asin}-${format}-${languageCode}`;
  if (md5Cache.has(cacheKey)) {
    const cached = md5Cache.get(cacheKey) ?? null;
    if (cached) {
      await logger?.info(`Using cached MD5 for ASIN ${asin}`);
    }
    return cached;
  }

  try {
    // Build search URL with ASIN and optional format filter. Values are
    // URL-encoded so unexpected characters cannot alter the query string.
    const formatParam = format && format !== 'any' ? `ext=${encodeURIComponent(format)}&` : '';
    const searchUrl =
      `${baseUrl}/search?${formatParam}lang=${encodeURIComponent(languageCode)}` +
      `&q=%22asin:${encodeURIComponent(asin)}%22`;
    moduleLogger.debug(`ASIN search URL: ${searchUrl}`);

    const html = await fetchHtml(searchUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);

    // Only real search results count: drop MD5 links that sit inside the
    // "Recent downloads" banner or the "Partial matches" section.
    const searchResultLinks = $('a[href*="/md5/"]').filter((i, elem) => {
      if ($(elem).closest('.js-recent-downloads-container').length > 0) {
        return false;
      }
      if ($(elem).closest('.js-partial-matches-show').length > 0) {
        return false;
      }
      return true;
    });

    // Debug logging for ASIN search
    moduleLogger.debug('ASIN search results', {
      htmlLength: html.length,
      pageTitle: $('title').text(),
      totalMd5Links: $('a[href*="/md5/"]').length,
      searchResultLinks: searchResultLinks.length,
    });

    // Extract MD5 from first search result link
    const firstResult = searchResultLinks.first();
    const href = firstResult.attr('href');
    if (firstResult.length > 0) {
      const resultText = firstResult.text().trim().substring(0, 100);
      const parentText = firstResult.parent().text().trim().substring(0, 100);
      moduleLogger.debug('First result details', { resultText, parentText });
    }

    if (!href) {
      await logger?.warn(`No search results found for ASIN: ${asin}`);
      md5Cache.set(cacheKey, null);
      return null;
    }

    // Extract MD5 from href (e.g., "/md5/3b6f9c0f..." -> "3b6f9c0f...")
    const md5Match = href.match(/\/md5\/([a-f0-9]+)/);
    const md5 = md5Match ? md5Match[1] : null;
    moduleLogger.debug(`Extracted MD5 from ASIN search: ${md5}`);

    // Cache result
    md5Cache.set(cacheKey, md5);
    await delay(REQUEST_DELAY_MS);
    return md5;
  } catch (error) {
    await logger?.error(
      `Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    // Deliberately not cached: the failure says nothing about whether the book
    // exists, and a later call should be able to try again.
    return null;
  }
}
/**
 * Search Anna's Archive by title and author (fallback method)
 * Exported for use by search-ebook processor
 *
 * Results are cached per (title, author, format, languageCode),
 * case-insensitively. A page that was fetched successfully but contained no
 * usable result is cached as a miss. A failed fetch is NOT cached, so a
 * transient error does not hide the book until the process restarts.
 *
 * @param title - Book title.
 * @param author - Book author.
 * @param format - File extension filter; empty or 'any' disables the filter.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional logger for progress and errors.
 * @param flaresolverrUrl - Optional FlareSolverr base URL for the page fetch.
 * @param languageCode - Language filter for the search.
 * @returns MD5 of the first search result, or null when none was found or the
 *   search failed. Never throws.
 */
export async function searchByTitle(
  title: string,
  author: string,
  format: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<string | null> {
  // Check cache first
  const cacheKey = `title-${title}-${author}-${format}-${languageCode}`.toLowerCase();
  if (md5Cache.has(cacheKey)) {
    const cached = md5Cache.get(cacheKey) ?? null;
    if (cached) {
      await logger?.info(`Using cached MD5 for title search`);
    }
    return cached;
  }

  try {
    // Anna's Archive advanced search with specific term types for author and
    // title (more accurate than a raw query). Every value is URL-encoded.
    const queryParts = [
      'termtype_1=author',
      `termval_1=${encodeURIComponent(author)}`,
      'termtype_2=title',
      `termval_2=${encodeURIComponent(title)}`,
    ];
    // Format filter, unless any format is acceptable
    if (format && format !== 'any') {
      queryParts.push(`ext=${encodeURIComponent(format)}`);
    }
    // Content type filters (books only, all fiction/nonfiction/unknown)
    queryParts.push('content=book_nonfiction', 'content=book_fiction', 'content=book_unknown');
    // Language filter
    queryParts.push(`lang=${encodeURIComponent(languageCode)}`);
    // Empty raw query (specific terms are used instead)
    queryParts.push('q=');

    const searchUrl = `${baseUrl}/search?${queryParts.join('&')}`;
    moduleLogger.debug(`Title search URL: ${searchUrl}`);

    const html = await fetchHtml(searchUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);

    // Only real search results count: drop MD5 links that sit inside the
    // "Recent downloads" banner or the "Partial matches" section.
    const searchResultLinks = $('a[href*="/md5/"]').filter((i, elem) => {
      if ($(elem).closest('.js-recent-downloads-container').length > 0) {
        return false;
      }
      if ($(elem).closest('.js-partial-matches-show').length > 0) {
        return false;
      }
      return true;
    });
    const allMd5Links = $('a[href*="/md5/"]').length;
    moduleLogger.debug('Title search results', { totalMd5Links: allMd5Links, searchResultLinks: searchResultLinks.length });

    // Extract MD5 from first search result link
    const href = searchResultLinks.first().attr('href');
    if (!href) {
      await logger?.warn(`No search results found for title: "${title}" by ${author}`);
      md5Cache.set(cacheKey, null);
      return null;
    }

    // Extract MD5 from href
    const md5Match = href.match(/\/md5\/([a-f0-9]+)/);
    const md5 = md5Match ? md5Match[1] : null;

    // Cache result
    md5Cache.set(cacheKey, md5);
    await delay(REQUEST_DELAY_MS);
    return md5;
  } catch (error) {
    await logger?.error(
      `Title search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    // Deliberately not cached: the failure says nothing about whether the book
    // exists, and a later call should be able to try again.
    return null;
  }
}
/**
 * Step 3: Collect the "no waitlist" slow-download links from a book's MD5 page.
 * Exported for use by search-ebook processor
 *
 * A link qualifies when its href contains "/slow_download/" and the text
 * "no waitlist" appears (case-insensitively) in the link itself or in its
 * parent element. Relative hrefs are prefixed with `baseUrl`.
 *
 * @param md5 - MD5 identifying the book page.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional logger that receives the error message on failure.
 * @param flaresolverrUrl - Optional FlareSolverr base URL for the page fetch.
 * @returns Absolute link URLs in page order; an empty array on any error.
 */
export async function getSlowDownloadLinks(
  md5: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string
): Promise<string[]> {
  try {
    const md5Url = `${baseUrl}/md5/${md5}`;
    moduleLogger.debug(`Fetching MD5 page: ${md5Url}`);
    const html = await fetchHtml(md5Url, flaresolverrUrl, logger);
    moduleLogger.debug('MD5 page HTML', { length: html.length, preview: html.substring(0, 500) });

    // A Cloudflare challenge page is only flagged here; parsing still proceeds
    const challengeMarkers = ['challenge-running', 'cf-browser-verification'];
    if (challengeMarkers.some((marker) => html.includes(marker))) {
      moduleLogger.warn('Appears to be Cloudflare challenge page');
    }

    const $ = cheerio.load(html);
    const slowSelector = 'a[href*="/slow_download/"]';

    // Diagnostics: overall link counts
    moduleLogger.debug('Link counts on page', {
      allLinks: $('a').length,
      slowDownloadLinks: $(slowSelector).length,
      slowDownloadLinksAlt: $('a[href*="slow_download"]').length,
    });

    // Diagnostics: a sample of hrefs that look download-related
    const downloadHrefs: string[] = [];
    $('a[href]').each((_index, anchor) => {
      const target = $(anchor).attr('href') || '';
      const looksRelevant = target.includes('download') || target.includes('slow');
      if (looksRelevant) {
        downloadHrefs.push(target.substring(0, 100));
      }
    });
    if (downloadHrefs.length > 0) {
      moduleLogger.debug('Download-related hrefs found', { hrefs: downloadHrefs.slice(0, 10) });
    }

    // Keep only the slow-download links advertised as "no waitlist"
    const waitlistFree: string[] = [];
    $(slowSelector).each((_index, anchor) => {
      const $anchor = $(anchor);
      const linkText = $anchor.text().toLowerCase();
      // The marker may sit next to the <a> rather than inside it, e.g.
      // <li><a>Slow Partner Server #5</a> (no waitlist, but can be very slow)</li>
      const parentText = $anchor.parent().text().toLowerCase();
      const href = $anchor.attr('href');
      moduleLogger.debug('Found slow_download link', { href, linkText: linkText.substring(0, 30), parentText: parentText.substring(0, 60) });

      const marker = 'no waitlist';
      const isWaitlistFree = linkText.includes(marker) || parentText.includes(marker);
      if (!isWaitlistFree || !href) {
        return; // skip this link, keep iterating
      }

      // Relative hrefs are made absolute against the base URL
      const fullUrl = href.startsWith('http') ? href : `${baseUrl}${href}`;
      waitlistFree.push(fullUrl);
      moduleLogger.debug(`Added slow link (no waitlist): ${fullUrl}`);
    });

    moduleLogger.debug(`Total slow links found: ${waitlistFree.length}`);
    await delay(REQUEST_DELAY_MS);
    return waitlistFree;
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Unknown error';
    await logger?.error(`Failed to get slow links: ${reason}`);
    moduleLogger.debug('Error getting slow links', { error: error instanceof Error ? error.message : String(error) });
    return [];
  }
}
/**
* Result of extractDownloadUrl(): a direct file URL found on a slow-download page.
*/
export interface ExtractedDownload {
// Direct URL of the e-book file
url: string;
// Lower-cased file extension taken from the end of the URL (e.g. 'epub')
format: string;
}
/**
 * Build the regex that finds a direct e-book URL in page text.
 * Group 1 is the full URL, group 2 is the file extension.
 * An empty or 'any' format matches the common e-book extensions; any other
 * value is escaped and matched literally.
 */
function buildDownloadUrlPattern(format: string): RegExp {
  const extensions =
    !format || format === 'any'
      ? 'epub|pdf|mobi|azw3|djvu|fb2'
      : format.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(`(https?:\\/\\/[^\\s]+\\.(${extensions}))`, 'i');
}

/**
 * Return the first download URL in `text` with its lower-cased extension, or null.
 */
function matchDownloadUrl(text: string, pattern: RegExp): ExtractedDownload | null {
  const match = text.match(pattern);
  return match ? { url: match[1], format: match[2].toLowerCase() } : null;
}

/**
 * Step 4: Extract actual download URL from slow download page
 * IMPORTANT: Supports dynamic file formats (not hardcoded to .epub)
 * Returns both URL and detected format
 * Exported for use by direct-download processor
 *
 * The page's <pre>/<code> blocks are searched first, in document order; the
 * whole body text is the fallback. The reported format is the extension
 * actually present in the URL, so a specifically requested format that is not
 * one of the common e-book extensions is still detected.
 *
 * @param slowDownloadUrl - Slow-download page to scrape.
 * @param baseUrl - Anna's Archive base URL (accepted for signature
 *   compatibility; not used by this function).
 * @param format - Wanted extension; empty or 'any' accepts epub, pdf, mobi,
 *   azw3, djvu or fb2.
 * @param logger - Optional logger that receives the error message on failure.
 * @param flaresolverrUrl - Optional FlareSolverr base URL for the page fetch.
 * @returns The URL and its format, or null when nothing matched or the fetch
 *   failed. Never throws.
 */
export async function extractDownloadUrl(
  slowDownloadUrl: string,
  baseUrl: string,
  format: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string
): Promise<ExtractedDownload | null> {
  try {
    const html = await fetchHtml(slowDownloadUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);
    const pattern = buildDownloadUrlPattern(format);

    // Method 1: pre/code blocks first (most reliable)
    const blockTexts: string[] = [];
    $('pre, code').each((_index, elem) => {
      blockTexts.push($(elem).text());
    });

    let download: ExtractedDownload | null = null;
    for (const text of blockTexts) {
      download = matchDownloadUrl(text, pattern);
      if (download) {
        break;
      }
    }

    // Method 2: entire body text as fallback
    if (!download) {
      download = matchDownloadUrl($('body').text(), pattern);
    }

    await delay(REQUEST_DELAY_MS);
    return download;
  } catch (error) {
    await logger?.error(
      `Failed to extract download URL: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    return null;
  }
}
/**
 * Step 5: Download file from URL with streaming (handles large files)
 *
 * The response is piped to `targetPath`. The transfer is aborted when the
 * response stream errors, the file stream errors, or DOWNLOAD_TIMEOUT_MS
 * elapses. On abort both streams are destroyed so the socket and file handle
 * are released. Any failure removes the partial file, is logged, and is
 * reported as `false`; the function does not throw for download failures.
 *
 * @param url - Direct file URL.
 * @param targetPath - Destination file path.
 * @param logger - Optional logger that receives the failure message.
 * @returns true when the file was written completely, false otherwise.
 */
async function downloadFile(
  url: string,
  targetPath: string,
  logger?: RMABLogger
): Promise<boolean> {
  try {
    const response = await axios.get(url, {
      responseType: 'stream',
      timeout: DOWNLOAD_TIMEOUT_MS,
      headers: { 'User-Agent': USER_AGENT },
      maxRedirects: 5,
    });

    const source = response.data;
    const writer = require('fs').createWriteStream(targetPath);

    // Awaited (not just returned) so that a rejection lands in the catch below
    await new Promise<void>((resolve, reject) => {
      let settled = false;
      let timer: ReturnType<typeof setTimeout> | undefined;

      const succeed = (): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        resolve();
      };

      const abort = (error: Error): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        // Stop the transfer and release the socket and the file handle
        source.destroy();
        writer.destroy();
        reject(error);
      };

      // Overall cap on the transfer; the axios timeout alone does not bound the body
      timer = setTimeout(() => abort(new Error('Download timeout')), DOWNLOAD_TIMEOUT_MS);

      writer.on('finish', succeed);
      writer.on('error', abort);
      // pipe() does not forward source errors to the destination
      source.on('error', abort);
      source.pipe(writer);
    });

    return true;
  } catch (error) {
    // Clean up partial file (it may not exist if the request itself failed)
    await fs.unlink(targetPath).catch(() => {});
    await logger?.error(
      `Download failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    return false;
  }
}
/**
 * Sanitize filename for e-book
 * Format: "[Title] - [Author].[format]"
 * Note: format should be the actual detected format (e.g., 'pdf', 'epub'), not 'any'
 *
 * Title and author each have filename-invalid characters and non-whitespace
 * control characters removed, whitespace runs collapsed to one space, and are
 * trimmed and limited to 100 characters. The format is lower-cased and reduced
 * to letters and digits, so it cannot introduce path separators.
 *
 * @param title - Book title.
 * @param author - Book author.
 * @param format - File extension without the dot.
 * @returns The filename, without any directory part.
 */
function sanitizeEbookFilename(title: string, author: string, format: string): string {
  const sanitize = (str: string): string => {
    return str
      .replace(/[<>:"/\\|?*]/g, '') // Remove invalid chars
      // Remove control chars that are not whitespace (e.g. NUL, which makes fs
      // calls reject the path). Tabs and newlines are handled by the next step.
      .replace(/[\u0000-\u0008\u000e-\u001f\u007f]/g, '')
      .replace(/\s+/g, ' ') // Collapse spaces
      .trim()
      .slice(0, 100); // Limit length
  };

  const cleanTitle = sanitize(title);
  const cleanAuthor = sanitize(author);
  // Use the actual format passed in (should already be the detected format from URL)
  const cleanFormat = format.toLowerCase().replace(/[^a-z0-9]/g, '');
  return `${cleanTitle} - ${cleanAuthor}.${cleanFormat}`;
}
/**
 * Retry HTTP request with exponential backoff
 *
 * Only axios errors that look transient are retried: connection reset,
 * timeouts (axios reports these as ECONNABORTED by default, or ETIMEDOUT when
 * configured to), temporary DNS failure, and HTTP 5xx responses. Anything else
 * is rethrown at once.
 *
 * @param requestFn - Performs one attempt.
 * @param retries - Total number of attempts (not additional retries).
 * @returns The value of the first successful attempt.
 * @throws The last error, wrapped in Error if a non-Error was thrown.
 */
async function retryRequest<T>(
  requestFn: () => Promise<T>,
  retries: number = MAX_RETRIES
): Promise<T> {
  const retryableCodes = ['ECONNRESET', 'ETIMEDOUT', 'ECONNABORTED', 'EAI_AGAIN'];
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      return await requestFn();
    } catch (error) {
      lastError = error instanceof Error ? error : new Error('Unknown error');

      // Only retry on 5xx errors or network errors
      const isRetryable =
        error instanceof AxiosError &&
        ((error.code !== undefined && retryableCodes.includes(error.code)) ||
          (error.response !== undefined && error.response.status >= 500));

      if (!isRetryable || attempt === retries - 1) {
        throw lastError;
      }

      // Exponential backoff: 1s, 2s, 4s, ... (1s then 2s with the default 3 attempts)
      await delay(1000 * 2 ** attempt);
    }
  }

  // Reached only when retries <= 0, i.e. no attempt was made
  throw lastError ?? new Error('Request failed after retries');
}
/**
 * Resolve after the given number of milliseconds.
 *
 * @param ms - Time to wait before the returned promise resolves.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
* Clear MD5 cache (useful for testing)
*
* Empties the module-level md5Cache, so every entry stored by searchByAsin()
* and searchByTitle(), cached misses included, is looked up again on the next call.
*/
export function clearMd5Cache(): void {
md5Cache.clear();
}