ReadMeABook/src/lib/utils/deduplicate-audiobooks.ts

/**
 * Component: Audiobook Deduplication Utility
 * Documentation: documentation/integrations/audible.md
 *
 * Deduplicates audiobook listings that represent the same recording
 * under different ASINs (publisher re-listings, rights transfers, etc.).
 *
 * Dedup key: normalized title + normalized narrator
 * Duration tolerance: max(longerDuration * 0.05, 10) minutes
 * Missing duration treated as compatible (graceful degradation).
 */

import type { AudibleAudiobook } from '../integrations/audible.service';

// ---------------------------------------------------------------------------
// Title / narrator normalization
// ---------------------------------------------------------------------------

/**
 * Parenthesized or bracketed segments that contain an edition/format keyword,
 * e.g. "(Unabridged)" or "[Anniversary Edition]". Global and case-insensitive,
 * so every such segment in a title is matched.
 */
const EDITION_PAREN_RE = /[([][^)\]]*?(?:unabridged|abridged|edition|remaster(?:ed)?|anniversary|complete|original|version|narrat(?:ed|or)?|audio(?:book)?|full cast|dramatiz(?:ed|ation))[^)\]]*[)\]]/gi;

/**
 * Trailing subtitle patterns; normalizeTitle removes whatever they match.
 * - SUBTITLE_RE: from a colon that is followed by whitespace through end of string.
 * - LONG_DASH_SUBTITLE_RE: from a hyphen, en dash (U+2013) or em dash (U+2014) that has
 *   whitespace on both sides through end of string. Requiring the surrounding
 *   whitespace leaves hyphenated words untouched.
 */
const SUBTITLE_RE = /\s*[:]\s+.+$/;
const LONG_DASH_SUBTITLE_RE = /\s+[-\u2013\u2014]\s+.+$/;

/** Trailing descriptors like "A Novel", "A Memoir", optionally introduced by "-", ":" or ",". */
const TRAILING_DESCRIPTOR_RE = /\s*[-:,]?\s+a\s+(novel|memoir|thriller|mystery|romance|story|tale|novella)\s*$/i;

/**
 * Produce the comparison form of a title used in the dedup key.
 *
 * The title is lower-cased, then four removals run in a fixed order:
 * edition markers in parentheses/brackets, a trailing descriptor ("a novel"),
 * a colon subtitle, and finally a dash subtitle. Whitespace runs are then
 * collapsed to single spaces and the result is trimmed.
 *
 * @param title - Raw listing title.
 * @returns The normalized title (may be empty if everything was stripped).
 */
export function normalizeTitle(title: string): string {
  // Order matters: descriptors must go before subtitles so ": a novel" is not
  // treated as a subtitle boundary on its own.
  const removalPasses: RegExp[] = [
    EDITION_PAREN_RE,
    TRAILING_DESCRIPTOR_RE,
    SUBTITLE_RE,
    LONG_DASH_SUBTITLE_RE,
  ];

  const stripped = removalPasses.reduce(
    (current, pattern) => current.replace(pattern, ''),
    title.toLowerCase(),
  );

  return stripped.replace(/\s+/g, ' ').trim();
}

/**
 * Pull out the subtitle of a title: the text after a colon, or failing that,
 * after a whitespace-delimited dash.
 *
 * The title is first lower-cased and cleaned of edition markers and trailing
 * descriptors (the same first steps normalizeTitle applies), then whitespace
 * is collapsed. The colon form is tried before the dash form.
 *
 * Callers use this to avoid collapsing series entries such as
 * "Series: Book A" and "Series: Book B".
 *
 * @param title - Raw listing title.
 * @returns The trimmed subtitle, or an empty string when none is present.
 */
export function extractSubtitle(title: string): string {
  const cleaned = title
    .toLowerCase()
    .replace(EDITION_PAREN_RE, '')
    .replace(TRAILING_DESCRIPTOR_RE, '')
    .replace(/\s+/g, ' ')
    .trim();

  // Colon subtitle takes precedence over dash subtitle.
  const subtitlePatterns = [/\s*[:]\s+(.+)$/, /\s+[-\u2013\u2014]\s+(.+)$/];
  for (const pattern of subtitlePatterns) {
    const found = pattern.exec(cleaned);
    if (found) return found[1].trim();
  }

  return '';
}

/**
 * Decide whether two titles may refer to the same book as far as their
 * subtitles are concerned.
 *
 * A missing subtitle on either side never blocks a match (re-listings often
 * drop or add the subtitle). Only when BOTH titles carry a subtitle must the
 * two be identical; differing subtitles indicate different books, such as
 * separate entries of a series.
 */
function areSubtitlesCompatible(titleA: string, titleB: string): boolean {
  const left = extractSubtitle(titleA);
  const right = extractSubtitle(titleB);
  const bothPresent = left !== '' && right !== '';
  return bothPresent ? left === right : true;
}

/**
 * Canonicalize a narrator credit for comparison.
 *
 * The credit is lower-cased and trimmed, split on commas, and each name is
 * trimmed; empty pieces are dropped. Names are then sorted and re-joined with
 * ", " so that the order in which narrators are listed does not matter.
 * A missing or blank credit yields an empty string.
 */
function normalizeNarrator(narrator?: string): string {
  const lowered = (narrator || '').toLowerCase().trim();
  if (lowered === '') return '';

  const names: string[] = [];
  for (const piece of lowered.split(',')) {
    const name = piece.trim();
    if (name) names.push(name);
  }
  names.sort();
  return names.join(', ');
}

// ---------------------------------------------------------------------------
// Duration compatibility
// ---------------------------------------------------------------------------

/**
 * Decide whether two durations (in minutes) plausibly belong to the same recording.
 *
 * The allowed gap is the larger of 5% of the longer duration and 10 minutes.
 * If either duration is null/undefined the pair is considered compatible,
 * so listings with missing metadata can still be merged.
 *
 * @param a - First duration in minutes, if known.
 * @param b - Second duration in minutes, if known.
 * @returns true when the durations are within tolerance or one is unknown.
 */
export function areDurationsCompatible(a?: number, b?: number): boolean {
  if (a != null && b != null) {
    const allowedGap = Math.max(Math.max(a, b) * 0.05, 10);
    return Math.abs(a - b) <= allowedGap;
  }
  // At least one side is unknown: do not block the match.
  return true;
}

// ---------------------------------------------------------------------------
// Metadata scoring (for picking best representative)
// ---------------------------------------------------------------------------

/**
 * Count how many of seven metadata fields a listing has populated.
 * Used to choose the most complete listing among duplicates.
 *
 * String fields and the genre list count only when non-empty; rating and
 * duration count whenever they are not null/undefined (so 0 still counts).
 *
 * @returns An integer from 0 to 7.
 */
function metadataScore(book: AudibleAudiobook): number {
  const populated: boolean[] = [
    Boolean(book.coverArtUrl),
    book.rating != null,
    book.durationMinutes != null,
    Boolean(book.description),
    Boolean(book.narrator),
    Boolean(book.releaseDate),
    Boolean(book.genres && book.genres.length > 0),
  ];
  return populated.filter(Boolean).length;
}

// ---------------------------------------------------------------------------
// Dedup group types (for works-table persistence)
// ---------------------------------------------------------------------------

/** Metadata about a group of ASINs that were collapsed during dedup. */
export interface DedupGroup {
  canonicalAsin: string;     // ASIN of the "winner" (best metadata score)
  allAsins: string[];        // All ASINs in this group (including canonical)
  title: string;             // Title from the canonical entry
  author: string;            // Author from the canonical entry
  narrator?: string;         // Narrator from the canonical entry
  durationMinutes?: number;  // Duration from the canonical entry
}

/** Result of deduplication with group collection (returned by deduplicateAndCollectGroups). */
export interface DeduplicateResult {
  books: AudibleAudiobook[];  // The deduped list (same as deduplicateAudiobooks returns)
  groups: DedupGroup[];       // Only groups where 2+ ASINs were collapsed; singletons are omitted
}

// ---------------------------------------------------------------------------
// Main dedup functions
// ---------------------------------------------------------------------------

/**
 * Collapse audiobook listings that represent the same recording.
 *
 * Convenience wrapper around deduplicateAndCollectGroups that keeps only the
 * deduplicated list and discards the group metadata.
 *
 * Listings collapse when they share a normalized title and narrator and are
 * compatible in duration and subtitle. A different narrator (different
 * production) or a duration outside tolerance (e.g. abridged vs unabridged)
 * keeps listings separate.
 *
 * @param books - Listings to deduplicate.
 * @returns The deduplicated listings, in the order produced by
 *   deduplicateAndCollectGroups.
 */
export function deduplicateAudiobooks(books: AudibleAudiobook[]): AudibleAudiobook[] {
  const { books: deduped } = deduplicateAndCollectGroups(books);
  return deduped;
}

/**
 * Deduplicate audiobooks AND return grouping metadata for works-table persistence.
 *
 * Books are bucketed by dedup key (normalized title + normalized narrator).
 * Within a bucket, a book joins the first sub-group whose members are ALL
 * duration- and subtitle-compatible with it; otherwise it starts a new
 * sub-group. Every member is checked (not only the first) because a missing
 * duration or subtitle acts as a wildcard: a wildcard first member must not
 * bridge "Series: Book A" and "Series: Book B", or an abridged and an
 * unabridged cut, into a single group.
 *
 * Each sub-group is represented by its member with the highest metadata
 * score; the earliest member wins ties.
 *
 * @param books - Listings to deduplicate. The array itself is not mutated.
 * @returns `books`: one representative per sub-group, ordered by the position
 *   at which each sub-group first appeared in the input. `groups`: metadata
 *   for the sub-groups that collapsed 2+ ASINs, in that same order.
 */
export function deduplicateAndCollectGroups(books: AudibleAudiobook[]): DeduplicateResult {
  if (books.length <= 1) return { books: [...books], groups: [] };

  // Sub-groups bucketed by dedup key, so a book is only compared against
  // listings that share its normalized title + narrator.
  const subGroupsByKey = new Map<string, AudibleAudiobook[][]>();
  // Every sub-group in creation order, i.e. order of first appearance in the input.
  const orderedSubGroups: AudibleAudiobook[][] = [];

  for (const book of books) {
    const key = `${normalizeTitle(book.title)}|||${normalizeNarrator(book.narrator)}`;

    let candidates = subGroupsByKey.get(key);
    if (!candidates) {
      candidates = [];
      subGroupsByKey.set(key, candidates);
    }

    // First sub-group in which the book is compatible with every existing member.
    const home = candidates.find(sg =>
      sg.every(
        member =>
          areDurationsCompatible(member.durationMinutes, book.durationMinutes) &&
          areSubtitlesCompatible(member.title, book.title),
      ),
    );

    if (home) {
      home.push(book);
    } else {
      const fresh = [book];
      candidates.push(fresh);
      orderedSubGroups.push(fresh);
    }
  }

  const result: AudibleAudiobook[] = [];
  const dedupGroups: DedupGroup[] = [];

  for (const sg of orderedSubGroups) {
    // Pick the most complete listing; strict ">" keeps the earliest on ties.
    let best = sg[0];
    let bestScore = metadataScore(best);
    for (let i = 1; i < sg.length; i++) {
      const score = metadataScore(sg[i]);
      if (score > bestScore) {
        best = sg[i];
        bestScore = score;
      }
    }
    result.push(best);

    // Collect group metadata for works-table persistence (only multi-ASIN groups)
    if (sg.length >= 2) {
      dedupGroups.push({
        canonicalAsin: best.asin,
        allAsins: sg.map(b => b.asin),
        title: best.title,
        author: best.author,
        narrator: best.narrator,
        durationMinutes: best.durationMinutes,
      });
    }
  }

  return { books: result, groups: dedupGroups };
}