/**
 * Component: E-book Sidecar Service
 * Documentation: documentation/integrations/ebook-sidecar.md
 */

import axios, { AxiosError } from 'axios';
import * as cheerio from 'cheerio';
import { createWriteStream } from 'fs';
import fs from 'fs/promises';
import path from 'path';
import type { Readable, Writable } from 'stream';
import { RMABLogger } from '../utils/logger';

// Module-level logger (renamed to avoid shadowing function parameter 'logger')
const moduleLogger = RMABLogger.create('EbookScraper');

/** Outcome of {@link downloadEbook}. `filePath`/`format` are set on success, `error` on failure. */
export interface EbookDownloadResult {
  success: boolean;
  filePath?: string;
  format?: string;
  error?: string;
}

const USER_AGENT = 'ReadMeABook/1.0 (Audiobook Automation)';
const REQUEST_DELAY_MS = 1500; // 1.5 second delay between requests
const DOWNLOAD_TIMEOUT_MS = 60000; // 60 seconds per download attempt
const MAX_SLOW_LINK_ATTEMPTS = 5;
const MAX_RETRIES = 3;
const FLARESOLVERR_TIMEOUT_MS = 60000; // 60 seconds for FlareSolverr requests

// In-memory cache for MD5 lookups (prevents re-scraping same ASIN).
// A `null` value records a search that completed and found nothing.
const md5Cache = new Map<string, string | null>();

// FlareSolverr types
interface FlareSolverrRequest {
  cmd: 'request.get';
  url: string;
  maxTimeout: number;
}

interface FlareSolverrResponse {
  status: 'ok' | 'error';
  message: string;
  solution?: {
    url: string;
    status: number;
    headers: Record<string, string>;
    response: string;
    cookies: Array<{ name: string; value: string }>;
    userAgent: string;
  };
}

/** Type of the `$` function returned by `cheerio.load` (kept version-agnostic). */
type CheerioRoot = ReturnType<typeof cheerio.load>;

/**
 * Fetch HTML via FlareSolverr proxy (bypasses Cloudflare)
 *
 * @param targetUrl - Page to fetch through the proxy.
 * @param flaresolverrUrl - Base URL of the FlareSolverr instance; `/v1` is appended.
 * @param timeout - Value sent as `maxTimeout`; the HTTP call itself gets 5s more.
 * @returns The `solution.response` string from FlareSolverr.
 * @throws Error when FlareSolverr reports a non-`ok` status, returns no solution,
 *   or the proxied page answered with HTTP status >= 400.
 */
async function fetchViaFlareSolverr(
  targetUrl: string,
  flaresolverrUrl: string,
  timeout: number = FLARESOLVERR_TIMEOUT_MS
): Promise<string> {
  const requestBody: FlareSolverrRequest = {
    cmd: 'request.get',
    url: targetUrl,
    maxTimeout: timeout,
  };

  const response = await axios.post<FlareSolverrResponse>(
    `${flaresolverrUrl}/v1`,
    requestBody,
    {
      headers: { 'Content-Type': 'application/json' },
      timeout: timeout + 5000, // Extra buffer for FlareSolverr processing
    }
  );

  if (response.data.status !== 'ok' || !response.data.solution) {
    throw new Error(`FlareSolverr error: ${response.data.message}`);
  }

  if (response.data.solution.status >= 400) {
    throw new Error(`FlareSolverr returned HTTP ${response.data.solution.status}`);
  }

  return response.data.solution.response;
}

/**
 * Unified HTML fetch function - tries FlareSolverr if configured, falls back to direct
 *
 * @param url - Page to fetch.
 * @param flaresolverrUrl - Optional FlareSolverr base URL; when absent only the direct request is made.
 * @param logger - Optional job logger that receives the fallback warning.
 * @returns The response body of whichever request succeeded.
 * @throws Whatever the direct request throws once {@link retryRequest} gives up.
 */
async function fetchHtml(
  url: string,
  flaresolverrUrl?: string,
  logger?: RMABLogger
): Promise<string> {
  // Try FlareSolverr first if configured
  if (flaresolverrUrl) {
    try {
      moduleLogger.debug(`Using FlareSolverr for: ${url}`);
      const html = await fetchViaFlareSolverr(url, flaresolverrUrl);
      moduleLogger.debug(`FlareSolverr returned HTML length: ${html.length}`);
      return html;
    } catch (error) {
      await logger?.warn(
        `FlareSolverr failed, falling back to direct request: ${
          error instanceof Error ? error.message : 'Unknown error'
        }`
      );
      moduleLogger.debug('FlareSolverr error', {
        error: error instanceof Error ? error.message : String(error),
      });
      // Fall through to direct request
    }
  }

  // Direct request (may fail with Cloudflare protection)
  moduleLogger.debug(`Using direct request for: ${url}`);
  const response = await retryRequest(() =>
    axios.get(url, {
      headers: { 'User-Agent': USER_AGENT },
      timeout: 30000,
    })
  );
  moduleLogger.debug(`Direct request returned data length: ${response.data?.length || 0}`);
  return response.data;
}

/**
 * Test FlareSolverr connection
 *
 * Fetches the configured base URL through FlareSolverr with a 30s timeout and
 * treats the result as valid when the HTML contains "Anna" and is longer than
 * 1000 characters.
 *
 * @param flaresolverrUrl - Base URL of the FlareSolverr instance to test.
 * @param baseUrl - Anna's Archive base URL used as the test target.
 * @returns Success flag and message; `responseTime` (ms) only on success. Never throws.
 */
export async function testFlareSolverrConnection(
  flaresolverrUrl: string,
  baseUrl: string = 'https://annas-archive.gl'
): Promise<{ success: boolean; message: string; responseTime?: number }> {
  const startTime = Date.now();

  try {
    // Test with a simple request to the configured Anna's Archive base URL
    const testUrl = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
    const html = await fetchViaFlareSolverr(testUrl, flaresolverrUrl, 30000);
    const responseTime = Date.now() - startTime;

    // Verify we got valid HTML
    if (html && html.includes('Anna') && html.length > 1000) {
      return {
        success: true,
        message: `Connection successful (${responseTime}ms)`,
        responseTime,
      };
    }

    return {
      success: false,
      message: 'FlareSolverr returned invalid response',
    };
  } catch (error) {
    return {
      success: false,
      message: error instanceof Error ? error.message : 'Unknown error',
    };
  }
}

/**
 * Main entry point: Download e-book from Anna's Archive by ASIN
 *
 * Flow: ASIN search -> title + author search (fallback) -> slow download links
 * -> per-link URL extraction -> file download. At most
 * `MAX_SLOW_LINK_ATTEMPTS` links are tried, with `REQUEST_DELAY_MS` between
 * failed attempts.
 *
 * @param asin - ASIN to search for; an empty value skips the ASIN search.
 * @param title - Title used for the fallback search and the target filename.
 * @param author - Author used for the fallback search and the target filename.
 * @param targetDir - Directory the file is written into.
 * @param preferredFormat - Requested extension, or `'any'`.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional job logger.
 * @param flaresolverrUrl - Optional FlareSolverr base URL used for HTML pages.
 * @param languageCode - Language filter passed to the searches.
 * @returns A result object; failures are reported via `success: false`, not thrown.
 */
export async function downloadEbook(
  asin: string,
  title: string,
  author: string,
  targetDir: string,
  preferredFormat: string = 'epub',
  baseUrl: string = 'https://annas-archive.gl',
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<EbookDownloadResult> {
  try {
    let md5: string | null = null;

    // Log FlareSolverr status
    if (flaresolverrUrl) {
      await logger?.info(`Using FlareSolverr at ${flaresolverrUrl}`);
    }

    // Step 1: Try ASIN search (exact match - best)
    if (asin) {
      await logger?.info(`Searching by ASIN: ${asin} (format: ${preferredFormat})...`);
      md5 = await searchByAsin(asin, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);

      if (md5) {
        await logger?.info(`Found via ASIN: ${md5}`);
      } else {
        await logger?.info(`No results for ASIN, falling back to title + author search...`);
      }
    }

    // Step 2: Fallback to title + author search
    if (!md5) {
      await logger?.info(`Searching by title + author: "${title}" by ${author}...`);
      md5 = await searchByTitle(title, author, preferredFormat, baseUrl, logger, flaresolverrUrl, languageCode);

      if (md5) {
        await logger?.info(`Found via title search: ${md5}`);
      }
    }

    if (!md5) {
      return {
        success: false,
        error: 'No search results found (tried ASIN and title+author)',
      };
    }

    await logger?.info(`Found MD5: ${md5}`);

    // Step 3: Get slow download links (no waitlist only)
    const slowLinks = await getSlowDownloadLinks(md5, baseUrl, logger, flaresolverrUrl);

    if (slowLinks.length === 0) {
      return {
        success: false,
        error: 'No download links available',
      };
    }

    await logger?.info(`Found ${slowLinks.length} download link(s)`);

    // Step 4 & 5: Try each slow download link until one succeeds
    // Note: We determine the actual filename AFTER we know the real format from the download URL
    const attemptsLimit = Math.min(slowLinks.length, MAX_SLOW_LINK_ATTEMPTS);

    for (let i = 0; i < attemptsLimit; i++) {
      const slowLink = slowLinks[i];
      await logger?.info(`Attempting download link ${i + 1}/${attemptsLimit}...`);

      try {
        // Extract actual download URL from slow download page
        const extracted = await extractDownloadUrl(
          slowLink,
          baseUrl,
          preferredFormat,
          logger,
          flaresolverrUrl
        );

        if (!extracted) {
          await logger?.warn(`No download URL found on page ${i + 1}`);
          await delay(REQUEST_DELAY_MS);
          continue;
        }

        // Use the actual format from the download URL, not the preferred format
        const actualFormat = extracted.format;
        const sanitizedFilename = sanitizeEbookFilename(title, author, actualFormat);
        const targetPath = path.join(targetDir, sanitizedFilename);

        // Check if file already exists
        try {
          await fs.access(targetPath);
          await logger?.info(`E-book already exists: ${sanitizedFilename}`);
          return {
            success: true,
            filePath: targetPath,
            format: actualFormat,
          };
        } catch {
          // File doesn't exist, continue with download
        }

        await logger?.info(`Downloading from: ${new URL(extracted.url).host} (format: ${actualFormat})`);

        // Download file (direct - no FlareSolverr needed for file servers)
        const success = await downloadFile(extracted.url, targetPath, logger);

        if (success) {
          await logger?.info(`E-book downloaded successfully: ${sanitizedFilename}`);
          return {
            success: true,
            filePath: targetPath,
            format: actualFormat,
          };
        }

        await logger?.warn(`Download attempt ${i + 1} failed`);
        await delay(REQUEST_DELAY_MS);
      } catch (error) {
        await logger?.warn(
          `Download link ${i + 1} error: ${error instanceof Error ? error.message : 'Unknown'}`
        );
        await delay(REQUEST_DELAY_MS);
      }
    }

    return {
      success: false,
      error: `All ${attemptsLimit} download attempts failed`,
    };
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : 'Unknown error';
    await logger?.error(`E-book download error: ${errorMsg}`);

    return {
      success: false,
      error: errorMsg,
    };
  }
}

/**
 * Select the `/md5/` links that belong to real search results.
 * Links inside the "Recent downloads" banner and the "Partial matches"
 * section are excluded.
 */
function selectSearchResultLinks($: CheerioRoot) {
  return $('a[href*="/md5/"]').filter((_index, elem) => {
    // Exclude links inside the recent downloads banner
    if ($(elem).closest('.js-recent-downloads-container').length > 0) {
      return false;
    }
    // Exclude links inside the partial matches section
    if ($(elem).closest('.js-partial-matches-show').length > 0) {
      return false;
    }
    return true;
  });
}

/** Extract the hash from an href such as "/md5/3b6f9c0f..."; `null` when it does not match. */
function extractMd5FromHref(href: string): string | null {
  const md5Match = href.match(/\/md5\/([a-f0-9]+)/);
  return md5Match ? md5Match[1] : null;
}

/**
 * Step 1: Search Anna's Archive by ASIN and extract MD5 hash
 * Exported for use by search-ebook processor
 *
 * Results (including "nothing found") are cached per ASIN/format/language.
 * A search that fails with an error is NOT cached, so a transient network or
 * Cloudflare failure does not block later attempts for the same ASIN.
 *
 * @param asin - ASIN placed in the `asin:` query term.
 * @param format - Extension filter; `'any'` or empty disables the filter.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional job logger.
 * @param flaresolverrUrl - Optional FlareSolverr base URL.
 * @param languageCode - Value of the `lang` query parameter.
 * @returns MD5 of the first search result, or `null` when none was found or the search failed.
 */
export async function searchByAsin(
  asin: string,
  format: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<string | null> {
  // Check cache first
  const cacheKey = `${asin}-${format}-${languageCode}`;
  if (md5Cache.has(cacheKey)) {
    const cached = md5Cache.get(cacheKey);
    if (cached) {
      await logger?.info(`Using cached MD5 for ASIN ${asin}`);
    }
    return cached ?? null; // Convert undefined to null
  }

  try {
    // Build search URL with ASIN and optional format filter
    const formatParam = format && format !== 'any' ? `ext=${format}&` : '';
    const searchUrl = `${baseUrl}/search?${formatParam}lang=${languageCode}&q=%22asin:${asin}%22`;

    moduleLogger.debug(`ASIN search URL: ${searchUrl}`);

    const html = await fetchHtml(searchUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);

    // Only look for actual search result links
    const searchResultLinks = selectSearchResultLinks($);

    // Debug logging for ASIN search
    const pageTitle = $('title').text();
    const allMd5Links = $('a[href*="/md5/"]').length;
    moduleLogger.debug('ASIN search results', {
      htmlLength: html.length,
      pageTitle,
      totalMd5Links: allMd5Links,
      searchResultLinks: searchResultLinks.length,
    });

    // Extract MD5 from first search result link
    const firstResult = searchResultLinks.first();
    const href = firstResult.attr('href');

    if (firstResult.length > 0) {
      const resultText = firstResult.text().trim().substring(0, 100);
      const parentText = firstResult.parent().text().trim().substring(0, 100);
      moduleLogger.debug('First result details', { resultText, parentText });
    }

    if (!href) {
      await logger?.warn(`No search results found for ASIN: ${asin}`);
      md5Cache.set(cacheKey, null);
      return null;
    }

    // Extract MD5 from href (e.g., "/md5/3b6f9c0f..." -> "3b6f9c0f...")
    const md5 = extractMd5FromHref(href);

    moduleLogger.debug(`Extracted MD5 from ASIN search: ${md5}`);

    // Cache result
    md5Cache.set(cacheKey, md5);

    await delay(REQUEST_DELAY_MS);

    return md5;
  } catch (error) {
    await logger?.error(
      `Search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    // Deliberately not cached: the request failed, so nothing is known about
    // whether a result exists.
    return null;
  }
}

/**
 * Search Anna's Archive by title and author (fallback method)
 * Exported for use by search-ebook processor
 *
 * Results (including "nothing found") are cached under a lower-cased
 * title/author/format/language key. A search that fails with an error is NOT
 * cached.
 *
 * @param title - Title search term (URL-encoded before use).
 * @param author - Author search term (URL-encoded before use).
 * @param format - Extension filter; `'any'` or empty disables the filter.
 * @param baseUrl - Anna's Archive base URL.
 * @param logger - Optional job logger.
 * @param flaresolverrUrl - Optional FlareSolverr base URL.
 * @param languageCode - Value of the `lang` query parameter.
 * @returns MD5 of the first search result, or `null` when none was found or the search failed.
 */
export async function searchByTitle(
  title: string,
  author: string,
  format: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string,
  languageCode: string = 'en'
): Promise<string | null> {
  // Check cache first
  const cacheKey = `title-${title}-${author}-${format}-${languageCode}`.toLowerCase();
  if (md5Cache.has(cacheKey)) {
    const cached = md5Cache.get(cacheKey);
    if (cached) {
      await logger?.info(`Using cached MD5 for title search`);
    }
    return cached ?? null;
  }

  try {
    // Build search URL using specific term types for author and title (more accurate than raw query)
    const encodedAuthor = encodeURIComponent(author);
    const encodedTitle = encodeURIComponent(title);

    // Use Anna's Archive advanced search with specific term types
    let searchUrl = `${baseUrl}/search?termtype_1=author&termval_1=${encodedAuthor}&termtype_2=title&termval_2=${encodedTitle}`;

    // Add format filter if not 'any'
    if (format && format !== 'any') {
      searchUrl += `&ext=${format}`;
    }

    // Add content type filters (books only, all fiction/nonfiction/unknown)
    searchUrl += '&content=book_nonfiction&content=book_fiction&content=book_unknown';

    // Add language filter
    searchUrl += `&lang=${languageCode}`;

    // Empty raw query (we're using specific terms instead)
    searchUrl += '&q=';

    moduleLogger.debug(`Title search URL: ${searchUrl}`);

    const html = await fetchHtml(searchUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);

    const searchResultLinks = selectSearchResultLinks($);

    const allMd5Links = $('a[href*="/md5/"]').length;
    moduleLogger.debug('Title search results', {
      totalMd5Links: allMd5Links,
      searchResultLinks: searchResultLinks.length,
    });

    // Extract MD5 from first search result link
    const firstResult = searchResultLinks.first();
    const href = firstResult.attr('href');

    if (!href) {
      await logger?.warn(`No search results found for title: "${title}" by ${author}`);
      md5Cache.set(cacheKey, null);
      return null;
    }

    // Extract MD5 from href
    const md5 = extractMd5FromHref(href);

    // Cache result
    md5Cache.set(cacheKey, md5);

    await delay(REQUEST_DELAY_MS);

    return md5;
  } catch (error) {
    await logger?.error(
      `Title search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    // Deliberately not cached: the request failed, so nothing is known about
    // whether a result exists.
    return null;
  }
}

/**
 * Step 3: Get slow download links from MD5 page (no waitlist only)
 * Exported for use by search-ebook processor
 *
 * @param md5 - Hash identifying the record page (`{baseUrl}/md5/{md5}`).
 * @param baseUrl - Anna's Archive base URL; also prefixed to relative hrefs.
 * @param logger - Optional job logger.
 * @param flaresolverrUrl - Optional FlareSolverr base URL.
 * @returns Absolute URLs of `/slow_download/` links labelled "no waitlist";
 *   an empty array when none exist or the page could not be fetched.
 */
export async function getSlowDownloadLinks(
  md5: string,
  baseUrl: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string
): Promise<string[]> {
  try {
    const md5Url = `${baseUrl}/md5/${md5}`;

    moduleLogger.debug(`Fetching MD5 page: ${md5Url}`);

    const html = await fetchHtml(md5Url, flaresolverrUrl, logger);

    moduleLogger.debug('MD5 page HTML', { length: html.length, preview: html.substring(0, 500) });

    // Check if we got a Cloudflare challenge page
    if (html.includes('challenge-running') || html.includes('cf-browser-verification')) {
      moduleLogger.warn('Appears to be Cloudflare challenge page');
    }

    const $ = cheerio.load(html);

    const slowLinks: string[] = [];

    // Debug: count all links
    const allLinks = $('a').length;
    const slowDownloadLinks = $('a[href*="/slow_download/"]').length;
    const slowDownloadLinksAlt = $('a[href*="slow_download"]').length;
    moduleLogger.debug('Link counts on page', { allLinks, slowDownloadLinks, slowDownloadLinksAlt });

    // Log all href patterns to see what we're dealing with
    const hrefPatterns: string[] = [];
    $('a[href]').each((i, elem) => {
      const href = $(elem).attr('href') || '';
      if (href.includes('download') || href.includes('slow')) {
        hrefPatterns.push(href.substring(0, 100));
      }
    });
    if (hrefPatterns.length > 0) {
      moduleLogger.debug('Download-related hrefs found', { hrefs: hrefPatterns.slice(0, 10) });
    }

    // Find all slow download links
    $('a[href*="/slow_download/"]').each((i, elem) => {
      const linkText = $(elem).text().toLowerCase();
      // Check parent element text too - "no waitlist" may be outside the link tag,
      // e.g. a list item whose link reads "Slow Partner Server #5" followed by the
      // plain text "(no waitlist, but can be very slow)".
      const parentText = $(elem).parent().text().toLowerCase();
      const href = $(elem).attr('href');

      moduleLogger.debug('Found slow_download link', {
        href,
        linkText: linkText.substring(0, 30),
        parentText: parentText.substring(0, 60),
      });

      // Check for "no waitlist" in either the link text or parent text
      if (linkText.includes('no waitlist') || parentText.includes('no waitlist')) {
        if (href) {
          // Convert relative URL to absolute
          const fullUrl = href.startsWith('http') ? href : `${baseUrl}${href}`;
          slowLinks.push(fullUrl);
          moduleLogger.debug(`Added slow link (no waitlist): ${fullUrl}`);
        }
      }
    });

    moduleLogger.debug(`Total slow links found: ${slowLinks.length}`);

    await delay(REQUEST_DELAY_MS);

    return slowLinks;
  } catch (error) {
    await logger?.error(
      `Failed to get slow links: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    moduleLogger.debug('Error getting slow links', {
      error: error instanceof Error ? error.message : String(error),
    });
    return [];
  }
}

/** Direct file URL found on a slow download page, with the e-book format it points to. */
export interface ExtractedDownload {
  url: string;
  format: string;
}

/** Escape regex metacharacters so `text` can be embedded literally in a `RegExp`. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Find the first URL in `text` that matches `pattern` and work out its format.
 * The format comes from a known e-book extension at the end of the URL; when a
 * specific format was requested and the URL ends in something outside that
 * list, the requested format is used (the pattern already guarantees the URL
 * ends with it).
 */
function findDownloadInText(
  text: string,
  pattern: RegExp,
  requestedFormat: string
): ExtractedDownload | null {
  const match = text.match(pattern);
  if (!match) {
    return null;
  }

  const url = match[1];
  const formatMatch = url.match(/\.(epub|pdf|mobi|azw3|djvu|fb2)$/i);
  if (formatMatch) {
    return { url, format: formatMatch[1].toLowerCase() };
  }
  if (requestedFormat !== 'any') {
    return { url, format: requestedFormat.toLowerCase() };
  }
  return null;
}

/**
 * Step 4: Extract actual download URL from slow download page
 * IMPORTANT: Supports dynamic file formats (not hardcoded to .epub)
 * Returns both URL and detected format
 * Exported for use by direct-download processor
 *
 * @param slowDownloadUrl - Slow download page to scrape.
 * @param baseUrl - Anna's Archive base URL (not used by the extraction itself).
 * @param format - Requested extension, or `'any'` for any common e-book extension.
 * @param logger - Optional job logger.
 * @param flaresolverrUrl - Optional FlareSolverr base URL.
 * @returns The URL and its format, or `null` when no URL was found or the fetch failed.
 */
export async function extractDownloadUrl(
  slowDownloadUrl: string,
  baseUrl: string,
  format: string,
  logger?: RMABLogger,
  flaresolverrUrl?: string
): Promise<ExtractedDownload | null> {
  try {
    const html = await fetchHtml(slowDownloadUrl, flaresolverrUrl, logger);
    const $ = cheerio.load(html);

    // Build regex pattern based on format
    // If format is 'any', match any common e-book extension
    const pattern =
      format === 'any'
        ? /(https?:\/\/[^\s]+\.(epub|pdf|mobi|azw3|djvu|fb2))/i
        : new RegExp(`(https?:\\/\\/[^\\s]+\\.${escapeRegExp(format)})`, 'i');

    let found: ExtractedDownload | null = null;

    // Method 1: Search in pre/code blocks first (most reliable)
    for (const elem of $('pre, code').toArray()) {
      found = findDownloadInText($(elem).text(), pattern, format);
      if (found) {
        break;
      }
    }

    // Method 2: Search entire body text as fallback
    if (!found) {
      found = findDownloadInText($('body').text(), pattern, format);
    }

    await delay(REQUEST_DELAY_MS);

    return found;
  } catch (error) {
    await logger?.error(
      `Failed to extract download URL: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    return null;
  }
}

/**
 * Pipe `source` into `writer` and settle once the write finishes, either
 * stream errors, or `timeoutMs` elapses (rejecting with "Download timeout").
 */
function pipeToFile(source: Readable, writer: Writable, timeoutMs: number): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const timer = setTimeout(() => reject(new Error('Download timeout')), timeoutMs);

    const fail = (error: Error): void => {
      clearTimeout(timer);
      reject(error);
    };

    // `pipe` does not forward source errors to the destination, so both ends
    // need a listener. `on` (not `once`) keeps a listener attached for errors
    // emitted after the promise has already settled.
    source.on('error', fail);
    writer.on('error', fail);
    writer.on('finish', () => {
      clearTimeout(timer);
      resolve();
    });

    source.pipe(writer);
  });
}

/**
 * Step 5: Download file from URL with streaming (handles large files)
 *
 * The whole transfer must complete within `DOWNLOAD_TIMEOUT_MS`. On any
 * failure (request error, stream error, timeout) both streams are destroyed,
 * the partial file is removed, the error is logged and `false` is returned.
 *
 * @param url - Direct file URL.
 * @param targetPath - Destination file path.
 * @param logger - Optional job logger.
 * @returns `true` when the file was fully written, `false` otherwise. Never rejects.
 */
async function downloadFile(
  url: string,
  targetPath: string,
  logger?: RMABLogger
): Promise<boolean> {
  let source: Readable | undefined;
  let writer: ReturnType<typeof createWriteStream> | undefined;

  try {
    const response = await axios.get<Readable>(url, {
      responseType: 'stream',
      timeout: DOWNLOAD_TIMEOUT_MS,
      headers: { 'User-Agent': USER_AGENT },
      maxRedirects: 5,
    });
    source = response.data;

    // Stream to file
    writer = createWriteStream(targetPath);
    await pipeToFile(source, writer, DOWNLOAD_TIMEOUT_MS);

    return true;
  } catch (error) {
    // Stop any transfer still in flight before removing the partial file
    source?.destroy();
    writer?.destroy();

    // Clean up partial file
    await fs.unlink(targetPath).catch(() => {});

    await logger?.error(
      `Download failed: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
    return false;
  }
}

/**
 * Sanitize filename for e-book
 * Format: "[Title] - [Author].[format]"
 * Note: format should be the actual detected format (e.g., 'pdf', 'epub'), not 'any'
 *
 * Title and author each have the characters `< > : " / \ | ? *` removed,
 * whitespace runs collapsed to one space, are trimmed, and cut to 100 characters.
 */
function sanitizeEbookFilename(title: string, author: string, format: string): string {
  const sanitize = (str: string): string => {
    return str
      .replace(/[<>:"/\\|?*]/g, '') // Remove invalid chars
      .replace(/\s+/g, ' ') // Collapse spaces
      .trim()
      .slice(0, 100); // Limit length
  };

  const cleanTitle = sanitize(title);
  const cleanAuthor = sanitize(author);

  // Use the actual format passed in (should already be the detected format from URL)
  const cleanFormat = format.toLowerCase();

  return `${cleanTitle} - ${cleanAuthor}.${cleanFormat}`;
}

/**
 * Retry HTTP request with exponential backoff
 *
 * Only Axios errors with code `ECONNRESET`/`ETIMEDOUT` or an HTTP status >= 500
 * are retried; anything else is rethrown immediately.
 *
 * @param requestFn - Factory that performs the request; called once per attempt.
 * @param retries - Maximum number of attempts.
 * @returns The first successful result of `requestFn`.
 * @throws The last error once attempts are exhausted or the error is not retryable.
 */
async function retryRequest<T>(
  requestFn: () => Promise<T>,
  retries: number = MAX_RETRIES
): Promise<T> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      return await requestFn();
    } catch (error) {
      lastError = error instanceof Error ? error : new Error('Unknown error');

      // Only retry on 5xx errors or network errors
      const isRetryable =
        error instanceof AxiosError &&
        (error.code === 'ECONNRESET' ||
          error.code === 'ETIMEDOUT' ||
          (error.response && error.response.status >= 500));

      if (!isRetryable || attempt === retries - 1) {
        throw lastError;
      }

      // Exponential backoff: 1s, 2s, 4s
      const delayMs = 1000 * Math.pow(2, attempt);
      await delay(delayMs);
    }
  }

  throw lastError || new Error('Request failed after retries');
}

/**
 * Delay helper
 *
 * @param ms - Milliseconds to wait before the returned promise resolves.
 */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Clear MD5 cache (useful for testing)
 */
export function clearMd5Cache(): void {
  md5Cache.clear();
}