/**
 * News Scout Pipeline — Phase 1: Expanded RSS
 *
 * 22 feeds across ESPN, CBS Sports, Yahoo Sports, BBC Sport, NBC Sports,
 * SB Nation, Sky Sports, and The Guardian.
 * Extracts article images from: enclosure, media:thumbnail, media:content,
 * content:encoded img tags, and OG image scraping as fallback.
 * Fetches in batches of 5 with 500ms delay between batches.
 */

import RSSParser from 'rss-parser';
import { FeedConfig, ScoutCandidate } from './types';

// Shared rss-parser instance used for every feed fetch in this module.
// Custom fields capture image data from RSS: the Media RSS elements
// `media:content` and `media:thumbnail` are copied onto each parsed item as
// `mediaContent` and `mediaThumbnail`, which is where extractImageFromItem
// looks for them.
const parser = new RSSParser<any, any>({
  timeout: 15000, // presumably milliseconds (15s) — confirm against rss-parser docs
  headers: { 'User-Agent': 'Rainmaker/2.0 (Sports News Scout)' },
  customFields: {
    item: [
      // keepArray: false — store a single value per item rather than an array
      ['media:content', 'mediaContent', { keepArray: false }],
      ['media:thumbnail', 'mediaThumbnail', { keepArray: false }],
    ],
  },
});

/**
 * Feeds polled by runRssScout (22 in total; per-source counts are noted inline).
 * Entries whose `sport` is 'general' have each item's sport inferred from its
 * title and description via classifySport; every other entry assigns its
 * `sport` to all of its items as-is.
 */
const FEEDS: FeedConfig[] = [
  // ESPN (8)
  { url: 'https://www.espn.com/espn/rss/nfl/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'nfl' },
  { url: 'https://www.espn.com/espn/rss/nba/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'nba' },
  { url: 'https://www.espn.com/espn/rss/mlb/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'mlb' },
  { url: 'https://www.espn.com/espn/rss/nhl/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'nhl' },
  { url: 'https://www.espn.com/espn/rss/mma/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'mma' },
  { url: 'https://www.espn.com/espn/rss/soccer/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'soccer' },
  { url: 'https://www.espn.com/espn/rss/ncf/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'ncaaf' },
  { url: 'https://www.espn.com/espn/rss/ncb/news', source: 'espn', sourceDisplay: 'ESPN', sport: 'ncaab' },
  // CBS Sports (6)
  { url: 'https://www.cbssports.com/rss/headlines/nfl/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'nfl' },
  { url: 'https://www.cbssports.com/rss/headlines/nba/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'nba' },
  { url: 'https://www.cbssports.com/rss/headlines/mlb/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'mlb' },
  { url: 'https://www.cbssports.com/rss/headlines/nhl/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'nhl' },
  { url: 'https://www.cbssports.com/rss/headlines/college-basketball/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'ncaab' },
  { url: 'https://www.cbssports.com/rss/headlines/college-football/', source: 'cbssports', sourceDisplay: 'CBS Sports', sport: 'ncaaf' },
  // Yahoo Sports (1)
  { url: 'https://sports.yahoo.com/rss/', source: 'yahoo', sourceDisplay: 'Yahoo Sports', sport: 'general' },
  // BBC Sport (2)
  { url: 'https://feeds.bbci.co.uk/sport/rss.xml', source: 'bbc', sourceDisplay: 'BBC Sport', sport: 'general' },
  { url: 'https://feeds.bbci.co.uk/sport/football/rss.xml', source: 'bbc', sourceDisplay: 'BBC Sport', sport: 'soccer' },
  // NBC Sports (1)
  { url: 'https://profootballtalk.nbcsports.com/feed/', source: 'nbcsports', sourceDisplay: 'NBC Sports', sport: 'nfl' },
  // SB Nation (1)
  { url: 'https://www.sbnation.com/rss/index.xml', source: 'sbnation', sourceDisplay: 'SB Nation', sport: 'general' },
  // Sky Sports (1)
  { url: 'https://www.skysports.com/rss/12040', source: 'skysports', sourceDisplay: 'Sky Sports', sport: 'soccer' },
  // The Guardian (2)
  { url: 'https://www.theguardian.com/sport/rss', source: 'guardian', sourceDisplay: 'The Guardian', sport: 'general' },
  { url: 'https://www.theguardian.com/football/rss', source: 'guardian', sourceDisplay: 'The Guardian', sport: 'soccer' },
];

// Number of feeds fetched concurrently in each batch by runRssScout.
const BATCH_SIZE = 5;

// Keywords for auto-classifying "general" feeds.
// All entries are lowercase; classifySport matches them against lowercased text
// as whole words/phrases (optionally followed by a plural "s").
const SPORT_KEYWORDS: Record<string, string[]> = {
  nfl: ['nfl', 'quarterback', 'touchdown', 'super bowl', 'gridiron', 'draft pick', 'wide receiver', 'tight end', 'running back', 'offensive line', 'defensive end', 'linebacker', 'cornerback', 'patriots', 'chiefs', 'eagles', 'cowboys', 'packers', '49ers', 'ravens', 'bills', 'dolphins', 'jets', 'steelers', 'bengals', 'browns', 'texans', 'colts', 'jaguars', 'titans', 'broncos', 'chargers', 'raiders', 'rams', 'seahawks', 'cardinals', 'commanders', 'giants', 'vikings', 'bears', 'lions', 'falcons', 'panthers', 'saints', 'buccaneers'],
  nba: ['nba', 'basketball', 'lakers', 'celtics', 'dunk', 'three-pointer', 'hoops', 'triple-double', 'double-double', 'free throw', 'rebound', 'warriors', 'nuggets', 'suns', 'clippers', 'knicks', 'nets', '76ers', 'sixers', 'bucks', 'heat', 'cavaliers', 'cavs', 'thunder', 'timberwolves', 'pelicans', 'grizzlies', 'mavericks', 'mavs', 'rockets', 'spurs', 'kings', 'blazers', 'jazz', 'pistons', 'pacers', 'hawks', 'hornets', 'wizards', 'magic', 'raptors', 'bulls', 'wembanyama', 'lebron', 'curry', 'jokic', 'giannis', 'embiid', 'doncic', 'tatum', 'edwards', 'maxey'],
  mlb: ['mlb', 'baseball', 'home run', 'pitcher', 'world series', 'strikeout', 'batting', 'innings', 'rbi', 'era', 'bullpen', 'dugout', 'yankees', 'red sox', 'dodgers', 'mets', 'cubs', 'braves', 'astros', 'phillies', 'padres', 'orioles', 'guardians', 'rangers', 'twins', 'mariners', 'brewers', 'reds', 'diamondbacks', 'd-backs', 'pirates', 'royals', 'tigers', 'angels', 'white sox', 'marlins', 'rockies', 'rays', 'nationals', 'athletics', 'blue jays', 'cardinals', 'giants', 'ohtani'],
  nhl: ['nhl', 'hockey', 'puck', 'stanley cup', 'goalie', 'hat trick', 'power play', 'slap shot', 'goaltender', 'penalty kill', 'face-off', 'bruins', 'maple leafs', 'leafs', 'canadiens', 'habs', 'red wings', 'penguins', 'capitals', 'flyers', 'rangers', 'islanders', 'devils', 'hurricanes', 'lightning', 'panthers', 'blue jackets', 'senators', 'sabres', 'blackhawks', 'predators', 'wild', 'jets', 'flames', 'oilers', 'canucks', 'kraken', 'golden knights', 'avalanche', 'stars', 'blues', 'coyotes', 'ducks', 'sharks', 'makar', 'mcdavid', 'matthews'],
  soccer: ['soccer', 'premier league', 'epl', 'champions league', 'la liga', 'mls', 'bundesliga', 'serie a', 'ligue 1', 'world cup', 'fifa', 'arsenal', 'manchester united', 'manchester city', 'liverpool', 'chelsea', 'tottenham', 'barcelona', 'real madrid', 'bayern', 'psg', 'juventus', 'inter milan', 'ac milan', 'dortmund', 'messi', 'ronaldo', 'haaland', 'mbappe', 'goal kick', 'penalty kick', 'red card', 'offside'],
  mma: ['ufc', 'mma', 'mixed martial', 'knockout', 'submission', 'octagon', 'bellator', 'flyweight', 'bantamweight', 'featherweight', 'lightweight', 'welterweight', 'middleweight', 'heavyweight'],
  ncaab: ['ncaab', 'march madness', 'college basketball', 'ncaa basketball', 'final four', 'sweet sixteen', 'elite eight'],
  ncaaf: ['ncaaf', 'college football', 'ncaa football', 'cfp', 'college playoff', 'heisman'],
};

/** Escapes regex metacharacters so a keyword is matched literally. */
function escapeKeywordForRegExp(keyword: string): string {
  return keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * One precompiled matcher per keyword, grouped by sport in SPORT_KEYWORDS order.
 * Each pattern is `\b<keyword>s?\b`: the word boundaries stop short keywords from
 * firing inside unrelated words ("era" in "general", "reds" in "hundreds",
 * "epl" in "replay"), while the optional "s" still accepts simple plurals
 * ("quarterbacks", "touchdowns").
 */
const SPORT_MATCHERS: ReadonlyArray<readonly [string, readonly RegExp[]]> = Object.entries(
  SPORT_KEYWORDS,
).map(
  ([sport, keywords]) =>
    [sport, keywords.map((kw) => new RegExp(`\\b${escapeKeywordForRegExp(kw)}s?\\b`))] as const,
);

/**
 * Guesses the sport of a story from its title and description.
 *
 * Each sport scores one point per distinct keyword found in the lowercased
 * text; the highest score wins. Ties go to the sport listed first in
 * SPORT_KEYWORDS because only a strictly higher score replaces the leader.
 *
 * @param title Story headline.
 * @param description Story summary or snippet (may be empty).
 * @returns A key of SPORT_KEYWORDS, or 'general' when no keyword matches.
 */
function classifySport(title: string, description: string): string {
  const text = `${title} ${description}`.toLowerCase();
  let bestSport = 'general';
  let bestScore = 0;

  for (const [sport, matchers] of SPORT_MATCHERS) {
    // Matchers carry no g/y flag, so test() is stateless and safe to reuse.
    const score = matchers.reduce((hits, matcher) => (matcher.test(text) ? hits + 1 : hits), 0);
    if (score > bestScore) {
      bestScore = score;
      bestSport = sport;
    }
  }

  // No separate "football" tie-break is needed: NFL is scored before soccer and
  // ties keep the earlier sport, so soccer can only win when it strictly
  // out-scores NFL.

  // Keep unclassified stories as 'general' rather than forcing NFL
  return bestSport;
}

// Matches the src attribute of the first tag whose URL contains a common image extension.
const ITEM_IMG_SRC_PATTERN = /src=["']([^"']+\.(?:jpg|jpeg|png|webp)[^"']*)/i;

/**
 * Extract the best image URL from an RSS item.
 * Tries, in order: enclosure → media:thumbnail → media:content →
 * img in content:encoded → img in content.
 *
 * @param item Parsed feed item; the media fields are read from `mediaThumbnail`
 *   and `mediaContent`, with attributes either under `$` or directly on the object.
 * @returns The first image URL found, or undefined when the item carries none.
 */
function extractImageFromItem(item: any): string | undefined {
  // 1. enclosure (CBS Sports uses this) — only when it is declared as an image
  const enclosure = item.enclosure;
  if (enclosure?.url && typeof enclosure.type === 'string' && enclosure.type.startsWith('image/')) {
    return enclosure.url;
  }

  // 2. media:thumbnail (BBC uses this)
  const thumb = item.mediaThumbnail;
  if (thumb) {
    const url = thumb.$?.url || thumb.url;
    if (url) return url;
  }

  // 3. media:content — may describe video/audio, so skip it when its declared
  //    medium or MIME type says it is not an image. Undeclared media is accepted.
  const media = item.mediaContent;
  if (media) {
    const url = media.$?.url || media.url;
    const medium = media.$?.medium || media.medium;
    const mimeType = media.$?.type || media.type;
    const declaredNonImage =
      (typeof medium === 'string' && medium.toLowerCase() !== 'image') ||
      (typeof mimeType === 'string' && !mimeType.toLowerCase().startsWith('image/'));
    if (url && !declaredNonImage) return url;
  }

  // 4. img tag in content:encoded (Yahoo uses this), then 5. img tag in content
  for (const html of [item['content:encoded'], item.content]) {
    if (typeof html !== 'string' || !html) continue;
    const imgMatch = html.match(ITEM_IMG_SRC_PATTERN);
    // Attribute values are HTML-escaped; restore "&" so query strings stay valid.
    if (imgMatch) return imgMatch[1].replace(/&amp;/g, '&');
  }

  return undefined;
}

/**
 * Scrape OG image from article URL. Used as fallback for feeds without images (ESPN).
 *
 * Best-effort and bounded: a single 5s abort timer covers both the request and
 * the body read, and reading stops once `</head>` has been seen or roughly
 * 50,000 characters have been decoded, whichever comes first.
 *
 * @param articleUrl Absolute URL of the article page.
 * @returns The `og:image` URL (or `twitter:image` as a fallback), or undefined
 *   on any failure: non-2xx response, missing body, timeout, network error, or
 *   no matching meta tag.
 */
async function scrapeOgImage(articleUrl: string): Promise<string | undefined> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 5000);

  try {
    const res = await fetch(articleUrl, {
      headers: { 'User-Agent': 'Rainmaker/2.0 (Sports News Scout)' },
      signal: controller.signal,
      redirect: 'follow',
    });

    if (!res.ok) {
      // Release the connection instead of leaving the body unread.
      res.body?.cancel().catch(() => {});
      return undefined;
    }

    // Read only the leading chunks for og:image — don't download the full page
    const reader = res.body?.getReader();
    if (!reader) return undefined;

    // One streaming decoder for the whole body: a multi-byte UTF-8 character
    // split across two chunks is then decoded correctly.
    const decoder = new TextDecoder();
    let html = '';
    while (html.length < 50000) {
      const { done, value } = await reader.read();
      if (done) break;
      html += decoder.decode(value, { stream: true });
      // og:image lives in the <head>; stop as soon as the head is closed
      if (html.includes('</head>')) break;
    }
    reader.cancel().catch(() => {});

    // Look for og:image meta tag (either attribute order)
    const ogMatch = html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i)
      || html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:image["']/i);

    // Fallback: twitter:image
    const found = ogMatch?.[1]
      || (html.match(/<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["']/i)
        || html.match(/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']twitter:image["']/i))?.[1];

    // Attribute values are HTML-escaped; restore "&" so query strings stay valid.
    return found ? found.replace(/&amp;/g, '&') : undefined;
  } catch {
    // Deliberately best-effort: a missing image must never fail the scout run.
    return undefined;
  } finally {
    // Always clear the timer, including when fetch rejects or we return early.
    clearTimeout(timeout);
  }
}

/** Returns a promise that settles after `ms` milliseconds have elapsed. */
function sleep(ms: number) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Converts one parsed feed item into a ScoutCandidate.
 *
 * @param feed The feed the item came from; supplies source, display name and sport.
 * @param item Parsed feed item as produced by the module's RSS parser.
 * @returns The candidate, or undefined when the item lacks a title, link, or guid.
 */
function toScoutCandidate(feed: FeedConfig, item: any): ScoutCandidate | undefined {
  const title = (item.title || '').trim();
  const url = (item.link || '').trim();
  const guid = (item.guid || item.link || '').trim();
  if (!title || !url || !guid) return undefined;

  const description = (item.contentSnippet || item.content || '').trim().slice(0, 500);

  // An unparseable pubDate produces an Invalid Date whose getTime() is NaN, which
  // would make the recency sort comparator return NaN. Treat it like a missing
  // pubDate and fall back to the current time.
  const parsedDate = item.pubDate ? new Date(item.pubDate) : new Date();
  const publishedAt = Number.isNaN(parsedDate.getTime()) ? new Date() : parsedDate;

  // Feeds tagged 'general' get their sport inferred per story.
  const sport = feed.sport === 'general'
    ? classifySport(title, description)
    : feed.sport;

  return {
    guid: `rss:${guid}`,
    title,
    url,
    source: feed.source,
    sourceDisplay: feed.sourceDisplay,
    sport,
    description,
    publishedAt,
    sourceType: 'rss',
    engagementScore: 0,
    imageUrl: extractImageFromItem(item),
    isCurated: false,
    isBreaking: false,
    isFeatured: false,
  };
}

/**
 * Fetches every configured feed and returns one candidate per usable item.
 *
 * Feeds are fetched BATCH_SIZE at a time with a short pause between batches.
 * A feed that fails to load is logged and skipped; it never fails the run.
 * Afterwards, the most recent candidates that still lack an image get an
 * OG-image scrape of their article page, which mutates `imageUrl` in place.
 *
 * @returns All candidates collected from the feeds, in feed order (not deduplicated).
 */
export async function runRssScout(): Promise<ScoutCandidate[]> {
  const FEED_BATCH_DELAY_MS = 500;
  const OG_SCRAPE_LIMIT = 30; // most recent image-less candidates to scrape
  const OG_BATCH_SIZE = 5;
  const OG_BATCH_DELAY_MS = 300;

  const candidates: ScoutCandidate[] = [];
  let imagesFromRss = 0;

  // Process feeds in batches
  for (let i = 0; i < FEEDS.length; i += BATCH_SIZE) {
    const batch = FEEDS.slice(i, i + BATCH_SIZE);

    const results = await Promise.allSettled(
      batch.map(async (feed) => {
        const items: ScoutCandidate[] = [];
        try {
          const parsed = await parser.parseURL(feed.url);
          for (const item of (parsed.items || [])) {
            const candidate = toScoutCandidate(feed, item);
            if (candidate) items.push(candidate);
          }
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.error(`  RSS feed error [${feed.source}/${feed.sport}]: ${message}`);
        }
        return items;
      })
    );

    for (const result of results) {
      if (result.status === 'fulfilled') {
        candidates.push(...result.value);
        imagesFromRss += result.value.filter(c => c.imageUrl).length;
      }
    }

    // Pause between batches (not after the last one)
    if (i + BATCH_SIZE < FEEDS.length) {
      await sleep(FEED_BATCH_DELAY_MS);
    }
  }

  const totalFound = candidates.length;

  // OG image scraping for items that don't have images yet.
  // Only the OG_SCRAPE_LIMIT most recent ones are scraped, to keep the run fast.
  // filter() returns a copy, so sorting here does not reorder `candidates`.
  const needImages = candidates
    .filter(c => !c.imageUrl)
    .sort((a, b) => b.publishedAt.getTime() - a.publishedAt.getTime())
    .slice(0, OG_SCRAPE_LIMIT);

  if (needImages.length > 0) {
    console.log(`[rss-scout] Scraping OG images for ${needImages.length} articles...`);
    let ogFound = 0;
    for (let i = 0; i < needImages.length; i += OG_BATCH_SIZE) {
      const batch = needImages.slice(i, i + OG_BATCH_SIZE);
      await Promise.allSettled(
        batch.map(async (c) => {
          const ogUrl = await scrapeOgImage(c.url);
          if (ogUrl) {
            c.imageUrl = ogUrl;
            ogFound++;
          }
        })
      );
      if (i + OG_BATCH_SIZE < needImages.length) await sleep(OG_BATCH_DELAY_MS);
    }
    console.log(`  [rss-scout] OG images found: ${ogFound}/${needImages.length}`);
  }

  const totalImages = candidates.filter(c => c.imageUrl).length;
  console.log(`[rss-scout] Found ${totalFound} items from ${FEEDS.length} feeds (${imagesFromRss} RSS images, ${totalImages} total with images)`);
  return candidates;
}
