"""
Script to clear all processed data and start fresh.
This removes:
- All chunk files (*_chunks.json)
- All enriched cards (*_cards.json)
- Vector store index and metadata
- Processing metadata files
- Digests and capsules
"""
import shutil
from pathlib import Path
from config import (
    DOCUMENTS_DIR,
    NOTES_DIR,
    EMBEDDINGS_DIR,
    METADATA_DIR,
    DIGESTS_DIR,
    CAPSULES_DIR
)

def _remove_files(paths, label):
    """Delete every existing path in *paths*.

    Returns a list of "<label>: <name>" summary strings, one per file
    actually deleted. Paths that no longer exist are skipped silently
    (the file may have been removed by an earlier step or another process).
    """
    removed = []
    for f in paths:
        if f.exists():
            f.unlink()
            removed.append(f"{label}: {f.name}")
    return removed


def clear_all_processed_data():
    """Clear all processed data to start fresh.

    Irreversibly deletes chunk files, enriched cards, the vector store
    (FAISS index + metadata), processing metadata, digests/capsules, and
    any stray index/pickle files, then prints a summary. The caller is
    responsible for confirming with the user before invoking this.
    """
    print("=" * 60)
    print("🗑️  CLEARING ALL PROCESSED DATA")
    print("=" * 60)

    cleared_items = []

    # Clear chunk files
    chunk_files = list(DOCUMENTS_DIR.glob("*_chunks.json"))
    cleared_items += _remove_files(chunk_files, "Chunk file")
    print(f"✅ Cleared {len(chunk_files)} chunk files")

    # Clear enriched cards
    card_files = list(NOTES_DIR.glob("*_cards.json"))
    cleared_items += _remove_files(card_files, "Card file")
    print(f"✅ Cleared {len(card_files)} card files")

    # Clear vector store: FAISS index plus both metadata formats
    vector_store_files = [
        EMBEDDINGS_DIR / "faiss_index.index",
        EMBEDDINGS_DIR / "documents_metadata.json",
        EMBEDDINGS_DIR / "documents_metadata.pkl",
    ]
    cleared_items += _remove_files(vector_store_files, "Vector store")
    print("✅ Cleared vector store")

    # Clear processing metadata
    metadata_files = [
        METADATA_DIR / "processed_files.json",
        METADATA_DIR / "processing_status.json",
        METADATA_DIR / "enrichment_status.json",
    ]
    cleared_items += _remove_files(metadata_files, "Metadata")
    print("✅ Cleared processing metadata")

    # Clear digests and capsules. Label each file by its source directory
    # rather than by a filename substring: checking "'digest' in f.name"
    # mislabeled any digest whose name lacked the word (and any capsule
    # that happened to contain it).
    digest_files = list(DIGESTS_DIR.glob("*.json"))
    capsule_files = list(CAPSULES_DIR.glob("*.json"))
    cleared_items += _remove_files(digest_files, "Digest")
    cleared_items += _remove_files(capsule_files, "Capsule")
    print(f"✅ Cleared {len(digest_files)} digests and {len(capsule_files)} capsules")

    # Clear any stray index/pickle files left in the embeddings dir. The
    # named vector-store files above may already be gone, which is why the
    # helper re-checks exists() before unlinking.
    stray_files = list(EMBEDDINGS_DIR.glob("*.index")) + list(EMBEDDINGS_DIR.glob("*.pkl"))
    cleared_items += _remove_files(stray_files, "Stray file")

    print("\n" + "=" * 60)
    print(f"✅ CLEARED {len(cleared_items)} ITEMS")
    print("=" * 60)
    print("\n📋 Summary:")
    print(f"  • Chunk files: {len(chunk_files)}")
    print(f"  • Card files: {len(card_files)}")
    print("  • Vector store: cleared")
    print("  • Metadata files: cleared")
    print(f"  • Digests/Capsules: {len(digest_files) + len(capsule_files)}")
    print("\n✅ Ready to reprocess all PDFs!")

if __name__ == "__main__":
    import sys

    # Non-interactive mode: a "--yes" flag skips the confirmation prompt
    # (useful for scripts/CI). Otherwise require the user to type YES.
    non_interactive = len(sys.argv) > 1 and sys.argv[1] == "--yes"
    confirm = (
        "YES"
        if non_interactive
        else input("⚠️  This will delete ALL processed data. Type 'YES' to confirm: ")
    )

    if confirm != "YES":
        print("❌ Cancelled. No data was deleted.")
    else:
        clear_all_processed_data()

