import os
import json
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
from ocr.preproc import preprocess
from ocr.engines import OCREngine
from ocr.pdf_helpers import process_pdf_pages
from utils.fs import iter_files, ensure_dir
from utils.classify import classify_text
from extractors import extract_aadhaar, extract_voter, extract_land

# load_dotenv() runs first so that anything it adds to the environment
# (presumably from a local .env file -- confirm against python-dotenv) is
# visible to the lookups below.
load_dotenv()

# Input location and output destinations, each overridable via the environment.
INPUT_DIR = Path(os.environ.get('INPUT_DIR', 'data/input'))
CSV_PATH = Path(os.environ.get('CSV_PATH', 'data/output/csv/ids.csv'))
JSONL_PATH = Path(os.environ.get('JSONL_PATH', 'data/output/jsonl/ids.jsonl'))

# Debug images are switched on only by the (case-insensitive) value "true".
# DEBUG_MAX must parse as an integer, otherwise int() raises ValueError here.
SAVE_DEBUG = 'true' == os.environ.get('SAVE_DEBUG_IMAGES', 'false').lower()
DEBUG_MAX = int(os.environ.get('DEBUG_MAX', '50'))

# Single OCR engine instance built at import time. An unset or empty
# TESSERACT_CMD is normalised to None before being handed to the engine.
engine = OCREngine(
    prefer=os.environ.get('OCR_ENGINE', 'tesseract'),
    lang=os.environ.get('PADDLE_LANG', 'multi'),
    tesseract_cmd=os.environ.get('TESSERACT_CMD') or None,
)

# Accumulators filled while files are processed: `results` becomes the CSV
# table, `jsonl_out` is written out one JSON object per line.
results = []
jsonl_out = []

def ocr_image(path: Path, idx: int):
    """Preprocess one image file and run the shared OCR engine on it.

    Args:
        path: Image file handed to ``preprocess``.
        idx: Index of the file in the current run. It is forwarded to
            ``preprocess`` and also gates debug output: debug saving is
            requested only when ``SAVE_DEBUG`` is set and ``idx`` is below
            ``DEBUG_MAX``.

    Returns:
        Whatever ``engine.run`` returns for the four preprocessed variants.
    """
    want_debug = SAVE_DEBUG and idx < DEBUG_MAX
    # preprocess() yields four variants of the image; each one is passed to
    # the engine under its own keyword.
    rgb_img, bw_img, rgb_arr, gray_arr = preprocess(path, save_debug=want_debug, idx=idx)
    return engine.run(pil_rgb=rgb_img, pil_bw=bw_img, np_rgb=rgb_arr, np_gray=gray_arr)

# OCR settings forwarded to the PDF helper. They do not change while the loop
# runs, so they are read from the environment once instead of once per PDF.
_PDF_OCR_ENGINE = os.getenv('OCR_ENGINE', 'tesseract')
_PDF_OCR_LANG = os.getenv('PADDLE_LANG', 'multi')


def _extract_fields(text):
    """Classify OCR text and run the matching extractor on it.

    Text classified as 'aadhaar' or 'voter' goes to the corresponding
    extractor; every other classification falls through to the land
    extractor. Returns the extractor's result object.
    """
    doc_type = classify_text(text)
    if doc_type == 'aadhaar':
        return extract_aadhaar(text)
    if doc_type == 'voter':
        return extract_voter(text)
    return extract_land(text)


def _record(source, ext, page=None):
    """Append one extraction result to both output accumulators.

    Args:
        source: File the text came from; stored as ``str(source)``.
        ext: Extraction result exposing ``doc_type``, ``confidence`` and
            ``fields``.
        page: 1-based page number for PDF pages; the 'page' key is omitted
            entirely when this is None (plain image files).
    """
    row = {'source': str(source)}
    if page is not None:
        row['page'] = page
    row['doc_type'] = ext.doc_type
    row['confidence'] = round(ext.confidence, 3)
    # Extracted fields are merged last, so a field that shares a name with one
    # of the keys above replaces it.
    row.update(ext.fields)
    results.append(row)
    # The JSONL list gets its own copy so the two outputs never share a dict.
    jsonl_out.append(dict(row))


# Process every input file. PDFs are OCR'd page by page through the PDF
# helper; anything else is treated as a single image.
for i, file in enumerate(iter_files(INPUT_DIR)):
    try:
        print(f"Processing {file}...")
        if file.suffix.lower() == '.pdf':
            page_texts = process_pdf_pages(
                file,
                dpi=300,
                max_workers=4,
                prefer=_PDF_OCR_ENGINE,
                lang=_PDF_OCR_LANG,
                save_debug=(SAVE_DEBUG and i < DEBUG_MAX),
            )
            for page_idx, page_text in enumerate(page_texts, start=1):
                ext = _extract_fields(page_text)
                _record(file, ext, page=page_idx)
                print(f" --> page {page_idx}: {ext.doc_type} (conf {ext.confidence:.2f})")
        else:
            ext = _extract_fields(ocr_image(file, i).text)
            _record(file, ext)
            print(f" --> {file.name}: {ext.doc_type} (conf {ext.confidence:.2f})")
    except Exception as e:
        # Best-effort batch run: a failing file is reported and skipped so the
        # remaining files are still processed. Rows already recorded for
        # earlier pages of the same PDF are kept.
        print(f"ERROR processing {file}: {e}")

# Prepare both output locations before anything is written. ensure_dir is
# given the file paths themselves; presumably it creates the missing parent
# directories -- confirm in utils.fs.
ensure_dir(CSV_PATH)
ensure_dir(JSONL_PATH)

# CSV output: one row per collected result, without the DataFrame index.
df = pd.DataFrame(results)
df.to_csv(CSV_PATH, index=False)

# JSONL output: one JSON object per line, non-ASCII characters kept as-is.
with JSONL_PATH.open('w', encoding='utf-8') as f:
    f.writelines(json.dumps(r, ensure_ascii=False) + '\n' for r in jsonl_out)

# Tell the user where the results were written.
print('Done. Outputs:')
print(' CSV ->', CSV_PATH)
print(' JSONL ->', JSONL_PATH)
