#!/usr/bin/env python3
"""
Tesseract OCR Fallback Service
Provides OCR capabilities for PDFs that fail normal text extraction.
Runs as a Flask service on 127.0.0.1:5002 by default (override with OCR_HOST / OCR_PORT).
"""

import os
import sys
import tempfile
from pathlib import Path
from flask import Flask, request, jsonify
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import logging

# Configure root logging at INFO so the OCR progress messages logged by the
# functions in this module are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# Flask application object; the @app.route handlers in this module register on it.
app = Flask(__name__)

def extract_text_from_image(image):
    """OCR a single PIL image with Tesseract.

    Returns the recognised text with leading/trailing whitespace removed.
    Any exception raised during OCR is logged and an empty string is
    returned instead, so callers never see an exception from this function.
    """
    # English language data; --psm 1 asks for automatic page segmentation and
    # --oem 3 leaves the engine choice to Tesseract's default selection.
    ocr_options = {'lang': 'eng', 'config': '--psm 1 --oem 3'}
    try:
        raw = pytesseract.image_to_string(image, **ocr_options)
        return raw.strip()
    except Exception as err:
        logger.error(f"Tesseract OCR error: {err}")
        return ""

def extract_text_from_pdf(pdf_path, max_pages=50):
    """Convert PDF pages to images and run OCR on each.

    Args:
        pdf_path: Path of the PDF file to rasterise.
        max_pages: Number of the last page (1-based) to convert; later pages
            are ignored.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` contains the OCR output
        of every page that produced text, each prefixed with a
        ``--- Page N ---`` header and separated by blank lines, and ``error``
        is ``None``. On failure ``text`` is ``None`` and ``error`` is a
        human-readable message. This function does not raise; unexpected
        exceptions are logged and reported through ``error``.
    """
    try:
        # Convert PDF to images (200 DPI for balance of speed/quality)
        images = convert_from_path(pdf_path, dpi=200, first_page=1, last_page=max_pages)

        if not images:
            return None, "No pages could be extracted from PDF"

        logger.info(f"Extracted {len(images)} pages from PDF")

        all_text = []
        for page_number, img in enumerate(images, start=1):
            try:
                text = extract_text_from_image(img)
            finally:
                # Release the rasterised page as soon as it has been OCR'd so
                # pixel data for already-processed pages is not kept alive.
                img.close()

            if text:
                all_text.append(f"--- Page {page_number} ---\n{text}")
                logger.info(f"Page {page_number} OCR complete: {len(text)} chars")
            else:
                logger.warning(f"Page {page_number} OCR returned no text")

        if all_text:
            return '\n\n'.join(all_text), None
        return None, "OCR produced no text from any page"

    except Exception as e:
        # logger.exception records the traceback through the logging system
        # instead of printing it straight to stderr.
        logger.exception(f"PDF OCR error: {e}")
        return None, str(e)

@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint.

    Returns a JSON object with a fixed ``status`` of ``'ok'``, the OCR engine
    name, the Tesseract version string (``None`` if it could not be
    determined) and the list of installed Tesseract languages (empty if the
    version is unknown or the language query fails).
    """
    tesseract_version = None
    try:
        tesseract_version = pytesseract.get_tesseract_version().public
    except Exception as e:
        logger.warning(f"Could not get tesseract version: {e}")

    languages = []
    if tesseract_version:
        # Guarded separately: a failure while listing languages must not turn
        # the health check itself into an unhandled 500.
        try:
            languages = pytesseract.get_languages(config='')
        except Exception as e:
            logger.warning(f"Could not list tesseract languages: {e}")

    return jsonify({
        'status': 'ok',
        'ocr_engine': 'Tesseract',
        'tesseract_version': tesseract_version,
        'languages': languages
    })

@app.route('/ocr', methods=['POST'])
def ocr_pdf():
    """
    Run OCR on a PDF file.
    Expects multipart form with 'file' field and an optional integer
    'max_pages' field (default 50).
    Returns extracted text as JSON; responds 400 when no file is supplied
    and 422 when OCR reports an error.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'Empty filename'}), 400

    # A non-integer value falls back to the default of 50.
    max_pages = request.form.get('max_pages', 50, type=int)

    # Save uploaded file temporarily. delete=False because the file must
    # outlive the handle: it is re-read by path after being closed.
    tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        # Write through the already-open handle and close it before OCR so the
        # file is never opened twice at once (which fails on Windows).
        with tmp:
            file.save(tmp)
        text, error = extract_text_from_pdf(tmp.name, max_pages=max_pages)
    finally:
        # Runs even if saving the upload fails, so the temp file never leaks.
        os.unlink(tmp.name)

    if error:
        return jsonify({'error': error, 'text': None}), 422

    return jsonify({
        'success': True,
        'text': text,
        'chars': len(text) if text else 0,
        'filename': file.filename
    })

@app.route('/ocr_image', methods=['POST'])
def ocr_image():
    """
    Run OCR on a single image file.
    Expects multipart form with 'file' field.
    Returns extracted text as JSON; responds 400 when no file is supplied
    and 422 when the upload cannot be opened as an image.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'Empty filename'}), 400

    # Keep the upload's extension on the temp file, defaulting to '.jpg'.
    suffix = Path(file.filename).suffix or '.jpg'

    # Save uploaded file temporarily. delete=False because the file must
    # outlive the handle: it is re-read by path after being closed.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    try:
        # Write through the already-open handle and close it before reading so
        # the file is never opened twice at once (which fails on Windows).
        with tmp:
            file.save(tmp)

        try:
            # The context manager closes the image's file handle before the
            # temp file is unlinked below.
            with Image.open(tmp.name) as img:
                text = extract_text_from_image(img)
        except OSError as e:
            # Pillow raises OSError (incl. UnidentifiedImageError) for files
            # it cannot open; report it like /ocr reports OCR errors.
            logger.error(f"Could not open uploaded image: {e}")
            return jsonify({'error': 'Could not read image file', 'text': None}), 422
    finally:
        # Runs even if saving the upload fails, so the temp file never leaks.
        os.unlink(tmp.name)

    return jsonify({
        'success': True,
        'text': text,
        'chars': len(text) if text else 0,
        'filename': file.filename
    })

if __name__ == '__main__':
    # Bind address and port come from the environment, falling back to
    # loopback and 5002 when OCR_HOST / OCR_PORT are unset.
    host = os.getenv('OCR_HOST', '127.0.0.1')
    port = int(os.getenv('OCR_PORT', 5002))

    logger.info(f"Starting Tesseract OCR service on port {port}")

    # Start Flask's built-in server with the debugger disabled.
    app.run(debug=False, host=host, port=port)
