import os
import re
import json
from difflib import SequenceMatcher

# -------------------- CONFIG --------------------
# Number of leading pages read from each PDF: extract_text_fast slices the
# page list to this length (only the first 2 pages, for speed).
MAX_PAGES = 2
# Pass threshold used by process_pdfs when the payload carries no
# "threshold" entry. It is compared against score / 100, i.e. on a 0-1 scale.
THRESHOLD_DEFAULT = 0.7


# --------------- Utility Functions ---------------
def normalize_text(s: str) -> str:
    """Trim *s* and squeeze every run of whitespace into one space.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    # str.split() with no separator drops leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with a single space gives the
    # trimmed, collapsed form.
    return " ".join(s.split()) if s else ""


def bengali_digits_to_ascii(s: str) -> str:
    """Replace Bengali digits (০-৯) in *s* with their ASCII counterparts.

    All other characters are left untouched. Falsy input (``None`` or an
    empty string) is returned unchanged.
    """
    if s:
        # The position of each Bengali digit in the literal is its value.
        digit_table = {
            ord(bengali): str(value)
            for value, bengali in enumerate("০১২৩৪৫৬৭৮৯")
        }
        return s.translate(digit_table)
    return s


def fuzzy_score(a, b):
    """Return the similarity of *a* and *b* as a percentage (0.0-100.0).

    Both values are converted to text, Bengali digits are mapped to ASCII
    and the comparison is case-insensitive. The score is
    ``difflib.SequenceMatcher(...).ratio() * 100``.

    Args:
        a: First value (the extracted text in this file's callers).
        b: Second value (the expected text). May be a non-string such as a
            number coming from JSON; it is compared by its ``str()`` form.

    Returns:
        ``0.0`` when *a* is falsy or *b* is ``None``; otherwise the
        similarity percentage as a float.
    """
    # Guard both sides: previously only `a` was checked, so a missing `b`
    # crashed on `.lower()` instead of scoring as "no match".
    if not a or b is None:
        return 0.0
    # str() lets numeric expected values (e.g. 519 instead of "519") be
    # compared instead of failing inside str.translate().
    a_norm = bengali_digits_to_ascii(str(a)).lower()
    b_norm = bengali_digits_to_ascii(str(b)).lower()
    return float(SequenceMatcher(None, a_norm, b_norm).ratio() * 100)


# --------------- Regex Patterns ---------------
# Compiled label patterns, keyed by the field name used in the extraction
# result. Every pattern has the same shape: a Bengali label, then any run of
# separators (colon, whitespace, hyphen, en dash), then capture group 1 with
# the value, which runs up to the next newline, carriage return, comma or
# semicolon.
# NOTE(review): the khatian and rayetar labels are spelled with vowel signs in
# a non-dictionary order -- presumably to match the character order produced
# by PDF text extraction. Confirm against real documents before "correcting"
# the spelling.
patterns = {
    "daag": re.compile(r'দাগ\s*নং[:\s\-–]*([^\n\r,;]+)'),
    "khatian": re.compile(r'খিত(?:য়া|য়|িয়)?ন\s*নং[:\s\-–]*([^\n\r,;]+)'),
    "rayetar": re.compile(r'রায়েত(?:র)?\s*নাম[:\s\-–]*([^\n\r,;]+)')
}


# --------------- PDF Text Extractor ---------------
def _extract_with_pdfplumber(pdf_path):
    """Return the text of the first MAX_PAGES pages using pdfplumber."""
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        return [page.extract_text() or "" for page in pdf.pages[:MAX_PAGES]]


def _extract_with_pypdf2(pdf_path):
    """Return the text of the first MAX_PAGES pages using PyPDF2."""
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    pages_text = []
    for page in reader.pages[:MAX_PAGES]:
        try:
            txt = page.extract_text() or ""
        except Exception:
            # Best effort: one unreadable page must not discard the others.
            txt = ""
        pages_text.append(txt)
    return pages_text


def extract_text_fast(pdf_path):
    """Extract text from the first MAX_PAGES pages of a PDF.

    pdfplumber is tried first; if it is not installed or fails for any
    reason, PyPDF2 is used instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one string per page read (an empty string for a page
        without extractable text). If both libraries fail, the list is
        ``[""]``.
    """
    # Each backend builds its own fresh list, so pages collected by a backend
    # that fails part-way are never mixed with (and duplicated by) the
    # fallback's pages.
    for extractor in (_extract_with_pdfplumber, _extract_with_pypdf2):
        try:
            return extractor(pdf_path)
        except Exception:
            # Deliberate best effort: fall through to the next backend.
            # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
            continue
    return [""]


# --------------- Core Extraction Logic ---------------
def extract_fields_from_text(pages_text):
    """Pull the daag, khatian and rayetar values out of extracted page text.

    Args:
        pages_text: Iterable of per-page text strings; ``None`` or empty
            entries are tolerated.

    Returns:
        Dict containing any of the keys ``"daag"``, ``"khatian"`` and
        ``"rayetar"`` whose label pattern matched. A field that is not found
        is omitted. Values are whitespace-normalized; the daag value also has
        Bengali digits converted to ASCII.
    """
    # Normalize line by line so line breaks survive. The patterns end their
    # capture at a newline; collapsing a whole page to one line (as was done
    # before) made every captured value run on to the next comma/semicolon or
    # the end of the page.
    lines = []
    for page in pages_text:
        for line in (page or "").splitlines():
            cleaned = normalize_text(line)
            if cleaned:
                lines.append(cleaned)
    combined = "\n".join(lines)

    found = {}
    for field in ("daag", "khatian", "rayetar"):
        m = patterns[field].search(combined)
        if not m:
            continue
        value = normalize_text(m.group(1))
        if field == "daag":
            # Only the daag value is digit-normalized here; the other fields
            # are stored as they appear in the document.
            value = bengali_digits_to_ascii(value)
        found[field] = value

    return found


# --------------- Main Processor ---------------
# One row per comparison:
# (payload key with the expected value, key in the extracted fields,
#  key under which the comparison is reported in entry["matches"]).
_MATCH_SPECS = (
    ("daag", "daag", "daag"),
    ("khatian", "khatian", "khatian"),
    ("farmer", "rayetar", "farmer_vs_rayetar"),
)


def _first_expected(value):
    """Return the expected value: the first item of a list/tuple, else *value*."""
    if isinstance(value, (list, tuple)):
        return value[0]
    # A bare string must not be indexed: "519"[0] would compare only "5".
    return value


def _build_match(expected, extracted_value, threshold):
    """Score *extracted_value* against *expected* and build the report entry."""
    # Score against the text form so numeric expected values are accepted;
    # the report keeps the expected value exactly as it was supplied.
    expected_text = "" if expected is None else str(expected)
    score = fuzzy_score(extracted_value, expected_text)
    return {
        "expected": expected,
        "extracted": extracted_value,
        "score": score,
        "pass": score / 100.0 >= threshold,
    }


def process_pdfs(payload):
    """Extract fields from each PDF in *payload* and fuzzy-match them.

    Args:
        payload: Dict with the optional keys
            ``"files"`` (list of PDF paths),
            ``"daag"``, ``"khatian"``, ``"farmer"`` (expected value, either a
            list whose first item is used or a single value), and
            ``"threshold"`` (pass mark on a 0-1 scale; THRESHOLD_DEFAULT when
            missing or ``None``).

    Returns:
        ``{"results": [...]}`` with one entry per file, in input order. Each
        entry has ``"pdf"``, ``"pages"`` (list of ``{"page", "text"}`` with a
        0-based page index), ``"extracted"`` and ``"matches"``. An entry for
        a path that does not exist has ``"error": "File not found"`` and
        empty pages/extracted/matches.
    """
    raw_threshold = payload.get("threshold")
    threshold = float(THRESHOLD_DEFAULT if raw_threshold is None else raw_threshold)

    # The expected values do not depend on the file, so resolve them once.
    expectations = []
    for payload_key, extracted_key, report_key in _MATCH_SPECS:
        raw_expected = payload.get(payload_key)
        if raw_expected:
            expectations.append(
                (_first_expected(raw_expected), extracted_key, report_key)
            )

    results = []
    for pdf in payload.get("files") or []:
        entry = {
            "pdf": pdf,
            "pages": [],
            "extracted": {},
            "matches": {}
        }

        if not os.path.exists(pdf):
            entry["error"] = "File not found"
            results.append(entry)
            continue

        pages_text = extract_text_fast(pdf)
        entry["pages"] = [
            {"page": i, "text": t} for i, t in enumerate(pages_text)
        ]

        extracted = extract_fields_from_text(pages_text)
        entry["extracted"] = extracted

        for expected, extracted_key, report_key in expectations:
            entry["matches"][report_key] = _build_match(
                expected, extracted.get(extracted_key, ""), threshold
            )

        results.append(entry)

    return {"results": results}


# ----------------- CLI Test -----------------
if __name__ == "__main__":
    # Manual smoke run: check a fixed set of sample PDFs against one expected
    # daag / khatian / farmer triple and dump the report as JSON.
    sample_files = [
        "/mnt/data/371(Plot No)_Suahanta Mondal.pdf",
        "/mnt/data/348__South Gobindapur__SAINTHIA.pdf",
        "/mnt/data/345__South Gobindapur__SAINTHIA.pdf",
        "/mnt/data/1764746873181-519NetureeSAINTHIA.pdf",
    ]
    demo_payload = {
        "daag": ["519"],
        "khatian": ["152/1"],
        "farmer": ["Indrani Mondal"],
        "files": sample_files,
        "threshold": 0.7,
    }

    report = process_pdfs(demo_payload)

    # ensure_ascii=False keeps Bengali text readable in the output file.
    with open("fast_output.json", "w", encoding="utf-8") as out_file:
        json.dump(report, out_file, ensure_ascii=False, indent=2)

    print("Done. Results written to fast_output.json")
