/**
 * News Scout Pipeline — Phase 1b: Google News RSS
 *
 * Google News RSS aggregates from 80,000+ sources. Acts as a meta-source
 * catching stories direct feeds miss. 10 sport-specific queries.
 * Batched with 1.5s delay to respect rate limits. 48-hour max age filter.
 */

import RSSParser from 'rss-parser';
import { GoogleNewsQueryConfig, ScoutCandidate } from './types';

// Single rss-parser instance shared by every query in this module.
// Feed and item shapes are left untyped (`any`), so item fields read below are unchecked.
const parser = new RSSParser<any, any>({
  // Request timeout passed to rss-parser (the library interprets this as milliseconds).
  timeout: 15000,
  // Custom User-Agent sent with every feed request.
  headers: { 'User-Agent': 'Rainmaker/2.0 (Sports News Scout)' },
});

// Google News searches to run, one feed request per entry.
//  - `query`: search text, URL-encoded into the feed URL's `q` parameter.
//  - `sport`: copied onto every candidate produced by that query
//             (note that two entries share 'soccer').
//  - `label`: short name used only in the error log when a query fails.
const QUERIES: GoogleNewsQueryConfig[] = [
  { query: 'NBA breaking news', sport: 'nba', label: 'NBA breaking' },
  { query: 'NFL breaking news', sport: 'nfl', label: 'NFL breaking' },
  { query: 'MLB breaking news', sport: 'mlb', label: 'MLB breaking' },
  { query: 'NHL breaking news', sport: 'nhl', label: 'NHL breaking' },
  { query: 'Premier League transfer news', sport: 'soccer', label: 'EPL transfers' },
  { query: 'UFC MMA fight news', sport: 'mma', label: 'UFC/MMA' },
  { query: 'college basketball NCAA news', sport: 'ncaab', label: 'NCAAB' },
  { query: 'college football NCAA news', sport: 'ncaaf', label: 'NCAAF' },
  { query: 'Champions League La Liga Bundesliga', sport: 'soccer', label: 'Euro soccer' },
  { query: 'sports trending viral', sport: 'general', label: 'Trending sports' },
];

// Number of queries fetched concurrently in one batch.
const BATCH_SIZE = 3;
// Items whose publish time is more than this many hours before the run started are skipped.
const MAX_AGE_HOURS = 48;

/**
 * Split a Google News title of the form "Headline - Source Name" into its parts.
 *
 * The split happens at the LAST " - " so headlines that themselves contain
 * " - " stay intact. If there is no separator, or either side is empty once
 * trimmed, the title is returned unchanged as the headline and the source
 * defaults to "Google News".
 *
 * @param title Raw item title from the feed.
 * @returns The headline and the publisher name parsed from the title.
 */
function parseSourceFromTitle(title: string): { headline: string; sourceName: string } {
  const separator = ' - ';
  const lastDash = title.lastIndexOf(separator);

  if (lastDash > 0) {
    const headline = title.substring(0, lastDash).trim();
    const sourceName = title.substring(lastDash + separator.length).trim();
    // Check the trimmed parts, not raw lengths: a whitespace-only segment
    // must not produce an empty headline or an empty source name.
    if (headline && sourceName) {
      return { headline, sourceName };
    }
  }

  return { headline: title, sourceName: 'Google News' };
}

/** Query parameters that only carry campaign tracking and never identify content. */
const TRACKING_PARAMS = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'] as const;

/**
 * Build a dedupe key for an article URL.
 *
 * The key is `host + path [+ "?" + query]` where:
 *  - scheme, port, credentials and fragment are dropped,
 *  - a leading `www.` is removed from the host,
 *  - a single trailing slash is removed from the path,
 *  - `utm_*` tracking params are removed, and the remaining params are kept
 *    (sorted, so parameter order does not matter). Keeping them matters:
 *    URLs that differ only by a meaningful param such as `?id=` are
 *    different articles and must not collapse to the same key.
 *
 * @param url Absolute URL to normalize.
 * @returns The dedupe key, or `url` unchanged if it cannot be parsed as a URL.
 */
function normalizeUrl(url: string): string {
  try {
    const u = new URL(url);

    for (const param of TRACKING_PARAMS) {
      u.searchParams.delete(param);
    }
    u.searchParams.sort();

    const host = u.hostname.replace(/^www\./, '');
    const path = u.pathname.replace(/\/$/, '');
    const query = u.searchParams.toString();

    return query ? `${host}${path}?${query}` : `${host}${path}`;
  } catch {
    // Not an absolute URL; use the raw string as the key.
    return url;
  }
}

/**
 * Pause helper: returns a promise that resolves (with no value) once a
 * `setTimeout` of `ms` milliseconds fires.
 */
function sleep(ms: number) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/** Pause between consecutive batches of feed requests, in milliseconds. */
const BATCH_DELAY_MS = 1500;

/**
 * Turn a feed item's `pubDate` into a valid Date.
 *
 * A missing OR unparseable value falls back to the current time, so callers
 * never receive an Invalid Date (which would slip past a numeric age check,
 * because comparisons against NaN are always false).
 */
function resolvePublishedAt(pubDate: string | undefined): Date {
  if (pubDate) {
    const parsed = new Date(pubDate);
    if (!Number.isNaN(parsed.getTime())) {
      return parsed;
    }
  }
  return new Date();
}

/**
 * Run every configured Google News query and collect fresh, de-duplicated candidates.
 *
 * Queries are fetched in batches of `BATCH_SIZE` concurrent requests with a
 * pause between batches. For each feed item the function:
 *  - skips items missing a title, link, or guid (guid falls back to the link),
 *  - skips items published more than `MAX_AGE_HOURS` before the run started,
 *  - skips items whose normalized URL was already seen in this run,
 *  - splits the title into headline and source name.
 *
 * A failing query is logged and contributes no items; it never aborts the run.
 *
 * @returns All candidates gathered across the queries, in batch order.
 */
export async function runGnewsScout(): Promise<ScoutCandidate[]> {
  const candidates: ScoutCandidate[] = [];
  const seenUrls = new Set<string>();
  const maxAgeMs = MAX_AGE_HOURS * 60 * 60 * 1000;
  const now = Date.now();

  for (let i = 0; i < QUERIES.length; i += BATCH_SIZE) {
    const batch = QUERIES.slice(i, i + BATCH_SIZE);

    const results = await Promise.allSettled(
      batch.map(async (q) => {
        const items: ScoutCandidate[] = [];
        try {
          const encodedQuery = encodeURIComponent(q.query);
          const feedUrl = `https://news.google.com/rss/search?q=${encodedQuery}&ceid=US:en&hl=en-US&gl=US`;
          const parsed = await parser.parseURL(feedUrl);

          for (const item of parsed.items ?? []) {
            const rawTitle = (item.title || '').trim();
            const url = (item.link || '').trim();
            const guid = (item.guid || item.link || '').trim();

            if (!rawTitle || !url || !guid) continue;

            // Missing/invalid dates are treated as "just published" rather than
            // propagating an Invalid Date to downstream consumers.
            const publishedAt = resolvePublishedAt(item.pubDate);
            if (now - publishedAt.getTime() > maxAgeMs) continue;

            // Dedupe by normalized URL. The set is shared across the concurrent
            // queries of a batch; this loop has no await, so each check-and-add
            // pair runs without interleaving.
            const normalizedUrl = normalizeUrl(url);
            if (seenUrls.has(normalizedUrl)) continue;
            seenUrls.add(normalizedUrl);

            const { headline, sourceName } = parseSourceFromTitle(rawTitle);
            const description = (item.contentSnippet || item.content || '').trim().slice(0, 500);

            items.push({
              guid: `gnews:${guid}`,
              title: headline,
              url,
              // Machine-friendly source id: lowercase alphanumerics only.
              source: sourceName.toLowerCase().replace(/[^a-z0-9]/g, ''),
              sourceDisplay: sourceName,
              sport: q.sport,
              description,
              publishedAt,
              sourceType: 'gnews',
              engagementScore: 0,
              isCurated: false,
              isBreaking: false,
              isFeatured: false,
            });
          }
        } catch (err: unknown) {
          // Thrown values are not guaranteed to be Error instances.
          const reason = err instanceof Error ? err.message : String(err);
          console.error(`  [gnews-scout] Query "${q.label}" failed: ${reason}`);
        }
        // Items gathered before a failure are still returned.
        return items;
      })
    );

    // Each task catches its own errors, so results are expected to be
    // fulfilled; the status check is kept as a safeguard.
    for (const result of results) {
      if (result.status === 'fulfilled') {
        candidates.push(...result.value);
      }
    }

    // Pause between batches (not after the last one) to respect rate limits.
    if (i + BATCH_SIZE < QUERIES.length) {
      await sleep(BATCH_DELAY_MS);
    }
  }

  console.log(`[gnews-scout] Found ${candidates.length} items from ${QUERIES.length} queries`);
  return candidates;
}
