#!/usr/bin/env python3
"""
Script to reprocess all PDFs with enhanced metadata extraction.
This will delete existing chunks and vector store, then reprocess everything.
"""

import json
import sys
from pathlib import Path

# Add the script's directory to the import path so the local modules
# below resolve no matter where the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))

from pdf_processor import PDFProcessor
from vector_store import VectorStore
from config import PDF_DIR, EMBEDDINGS_DIR, DOCUMENTS_DIR, METADATA_DIR

def clear_existing_data():
    """Delete existing chunks and vector store"""
    print("Clearing existing data...")
    
    # Delete vector store files
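    # NOTE: these filenames are assumed to match what VectorStore.save_index()
    # writes; update them here if the store's on-disk layout changes.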
    index_file = EMBEDDINGS_DIR / "faiss_index.index"
    metadata_file = EMBEDDINGS_DIR / "documents_metadata.json"
    
    if index_file.exists():
        index_file.unlink()
        print(f"  ✓ Deleted {index_file}")
    
    if metadata_file.exists():
        metadata_file.unlink()
        print(f"  ✓ Deleted {metadata_file}")
    
    # Delete chunk JSON files
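    # Assumes PDFProcessor writes one <pdf_stem>_chunks.json file per PDF.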
    chunk_files = list(DOCUMENTS_DIR.glob("*_chunks.json"))
    for chunk_file in chunk_files:
        chunk_file.unlink()
        print(f"  ✓ Deleted {chunk_file.name}")
    
    print(f"  Deleted {len(chunk_files)} chunk files")
    
    # Clear processed files metadata (force reprocessing)
    processed_metadata = METADATA_DIR / "processed_files.json"
    if processed_metadata.exists():
        processed_metadata.unlink()
        print(f"  ✓ Cleared processed files metadata")
    
    print("✓ All existing data cleared\n")

def reprocess_all_pdfs():
    """Reprocess all PDFs with enhanced metadata"""
    print("Reprocessing all PDFs with enhanced metadata...")
    print("=" * 60)
    
    processor = PDFProcessor()
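    # With processed_files.json deleted above, process_all_new() should treat
    # every PDF in the source directory as unprocessed and re-chunk it.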
    chunks = processor.process_all_new()
    
    if not chunks:
        print(f"No PDFs found to process in {PDF_DIR}.")
        return
    
    print(f"\n✓ Processed {len(chunks)} chunks from all PDFs")
    
    # Rebuild vector store
    print("\nRebuilding vector store...")
    vector_store = VectorStore()
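    # add_documents() is assumed to embed each chunk and add it to the FAISS
    # index; save_index() then persists the index and its metadata under
    # EMBEDDINGS_DIR (the same files deleted in clear_existing_data()).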
    vector_store.add_documents(chunks)
    vector_store.save_index()
    
    print(f"✓ Vector store rebuilt with {len(chunks)} chunks")
    
    # Show sample metadata (chunks is guaranteed non-empty by the early
    # return above).
    sample = chunks[0]
    print("\nSample chunk metadata:")
    print(json.dumps(sample['metadata'], indent=2))
    
    print("\n✅ Reprocessing complete!")

if __name__ == "__main__":
    print("=" * 60)
    print("RAG System Reprocessing Script")
    print("=" * 60)
    print("\nThis will:")
    print("  1. Delete existing vector store and chunks")
    print("  2. Reprocess all PDFs with enhanced metadata")
    print("  3. Rebuild vector store with new chunks")
    print()
    
    response = input("Continue? (yes/no): ")
    if response.lower() != 'yes':
        print("Cancelled.")
        sys.exit(0)
    
    clear_existing_data()
    reprocess_all_pdfs()

