# main.py - OpenCV ROI-based table extraction merged into the full matching API
# - Set environment var LANDDOC_LOG_DEBUG=1 to include OCR text previews in logs
# - No GPU required; uses opencv-python-headless.

import os
import re
import shutil
import tempfile
import asyncio
from typing import List, Dict, Any, Optional
from functools import lru_cache

from fastapi import FastAPI, HTTPException, Body
from pydantic import BaseModel
import aiohttp
import aiofiles
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from pytesseract import Output
from PIL import Image
import numpy as np
from rapidfuzz import fuzz
from rapidfuzz.distance import Levenshtein

# OpenCV (headless) - optional; if missing we fall back to text-layer/OCR extraction
try:
    import cv2
    _HAS_CV2 = True
except Exception:
    cv2 = None
    _HAS_CV2 = False

# optional phonetic helper
try:
    import jellyfish
    _HAS_JELLYFISH = True
except Exception:
    _HAS_JELLYFISH = False

# indic-transliteration
try:
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate as it_transliterate
    _HAS_INDIC_TRANS = True
except Exception:
    _HAS_INDIC_TRANS = False

# ---------- Bengali digits map ----------
_BN_DIGITS_TRANS = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

def bn_digits_to_ascii(s: Optional[str]) -> str:
    if not s:
        return ""
    s2 = str(s).strip()
    s2 = s2.replace("\u00a0", " ").replace("\u200c", "")
    s2 = s2.translate(_BN_DIGITS_TRANS)
    s2 = re.sub(r'\s*\/\s*', '/', s2)
    s2 = re.sub(r'\s+', '', s2)
    s2 = re.sub(r'^[^\w\/\-]+|[^\w\/\-]+$', '', s2)
    return s2
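
# Worked examples (illustrative inputs, traced through the rules above):
#   bn_digits_to_ascii("১২৩ / ৪")  -> "123/4"   (digits mapped, slash tightened)
#   bn_digits_to_ascii("  ৫৬৭৮  ") -> "5678"
#   bn_digits_to_ascii(None)       -> ""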

# ---------- Config ----------
DOWNLOAD_DIR = "downloads"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

app = FastAPI(title="LandDoc OCR - OpenCV ROI Table Extraction + Matcher")

# ---------- Logging configuration ----------
import logging, json
from logging.handlers import RotatingFileHandler
from datetime import datetime
import uuid

LOG_PATH = os.environ.get("LANDDOC_LOG", "landdoc_extraction.log")
LOG_MAX_BYTES = 10 * 1024 * 1024
LOG_BACKUP_COUNT = 5
VERBOSE_LOG_OCR = os.environ.get("LANDDOC_LOG_DEBUG", "0") == "1"

logger = logging.getLogger("landdoc")
logger.setLevel(logging.INFO)
fh = RotatingFileHandler(LOG_PATH, maxBytes=LOG_MAX_BYTES, backupCount=LOG_BACKUP_COUNT, encoding="utf-8")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s', "%Y-%m-%dT%H:%M:%S%z")
fh.setFormatter(formatter)
logger.addHandler(fh)
if os.environ.get("LANDDOC_DEBUG_CONSOLE", "0") == "1":
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)

def _safe_json(obj):
    try:
        return json.dumps(obj, ensure_ascii=False)
    except Exception:
        return json.dumps(str(obj), ensure_ascii=False)

def log_file_extraction(file_url: str, recs: List[Dict[str,Any]], ocr_text: Optional[str]=None, extra: Optional[Dict]=None, debug: bool=False):
    payload = {
        "event": "file_extraction",
        "file": file_url,
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "recs_count": len(recs) if recs is not None else 0,
        "recs": recs,
    }
    if extra:
        payload.update(extra)
    if debug and ocr_text:
        payload["ocr_text_preview"] = (ocr_text[:2000] + "...") if len(ocr_text) > 2000 else ocr_text
    logger.info(_safe_json(payload))

def log_farmer_match(request_id: str, farmer: str, top_candidates: List[Dict[str,Any]], best: Dict[str,Any], decision: str, debug: bool=False):
    payload = {
        "event": "farmer_match",
        "request_id": request_id,
        "farmer": farmer,
        "decision": decision,
        "top_candidates": top_candidates,
        "best": best,
        "timestamp": datetime.utcnow().isoformat() + "Z"
    }
    logger.info(_safe_json(payload))
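
# A logged farmer_match line looks roughly like this (all values illustrative,
# not real data):
#   2024-01-01T00:00:00+0000 INFO {"event": "farmer_match", "request_id": "...",
#     "farmer": "...", "decision": "matched", "top_candidates": [...],
#     "best": {...}, "timestamp": "2024-01-01T00:00:00Z"}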

# ---------- Regexes ----------
BENGALI_RE = re.compile(r'[\u0980-\u09FF]')
# "দাগ" label (OCR may split it as "দা গ"), optional "নং", optional ":"/"-"/"ঃ"
# separator. The capture class accepts Bengali digits as well as ASCII, since
# the captured values are normalized downstream with bn_digits_to_ascii.
DAAG_RE = re.compile(r'দা\s*গ\s*(?:নং)?\s*[:\-ঃ]?\s*([0-9০-৯A-Za-z\/\-]+)', flags=re.I)
# "খত" plus up to five trailing letters/signs covers খতি/খতিয়া/খতিয়ান in both
# the precomposed and decomposed encodings of য়; the capture accepts Bengali
# digits for the same reason as above.
KHATIAN_RE = re.compile(
    r'(?:খত[\u09BF\u09AF\u09BC\u09BE\u09A8\u09DF]{0,5}\s*(?:নং|ন|no\.?|number)?'
    r'|khatian\s*(?:no\.?|number))\s*[:\-ঃ]?\s*([0-9০-৯A-Za-z\/\-]+)',
    flags=re.I)
RAITER_LABEL_RE = re.compile(r'(?:রায়ত(?:ের)?\s*নাম|মালিক(?:ের)?\s*নাম)', flags=re.I)

# ---------- Download utilities ----------
MAX_CONCURRENCY = 6

async def download_file(session, url, dest_path, headers=None):
    timeout = aiohttp.ClientTimeout(total=60)
    async with session.get(url, headers=headers, timeout=timeout) as resp:
        resp.raise_for_status()
        # context manager guarantees the file is closed even if a chunk read fails
        async with aiofiles.open(dest_path, 'wb') as f:
            async for chunk in resp.content.iter_chunked(1024 * 32):
                await f.write(chunk)

async def download_all(urls: List[str], dest_dir: str, headers: Optional[Dict[str,str]] = None) -> List[str]:
    os.makedirs(dest_dir, exist_ok=True)
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    async with aiohttp.ClientSession() as sess:
        tasks = []
        for i, url in enumerate(urls):
            ext = os.path.splitext(url.split('?')[0])[1] or ".pdf"
            fn = f"doc_{i}{ext}"
            path = os.path.join(dest_dir, fn)
            async def _dl(u=url, p=path):
                async with sem:
                    await download_file(sess, u, p, headers=headers)
                return p
            tasks.append(asyncio.create_task(_dl()))
        res = await asyncio.gather(*tasks)
    return res
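
# Minimal standalone usage sketch (hypothetical URL; in production the
# /quick_check endpoint below awaits download_all inside its own event loop):
#   paths = asyncio.run(download_all(["https://example.com/a.pdf"], "downloads"))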

# ---------- Text extraction ----------
def extract_text_from_pdf(path: str) -> str:
    # Returns the PDF's text layer, or "" when it is absent or too short to be
    # a real text layer (i.e. scanned pages), signalling the caller to OCR.
    try:
        t = extract_text(path)
        if t and len(t.strip()) > 80:
            return t
    except Exception:
        pass
    return ""

def ocr_pdf(path: str) -> str:
    parts = []
    images = convert_from_path(path, dpi=300)
    for img in images:
        parts.append(pytesseract.image_to_string(img, lang='ben+eng', config='--oem 1 --psm 6'))
    return "\n".join(parts)

# ---------- OCR concurrency wrapper ----------
OCR_SEM = asyncio.Semaphore(2)  # at most two concurrent full-document OCR jobs

async def ocr_pdf_async(path: str) -> str:
    loop = asyncio.get_running_loop()
    def _sync():
        try:
            return ocr_pdf(path)
        except Exception:
            return ""
    async with OCR_SEM:
        return await loop.run_in_executor(None, _sync)

# ---------- Bengali normalization ----------
COMMON_REPLACEMENTS = {"মন্ডল": "মণ্ডল", "\u00a0": " ", "\u200c": ""}

def apply_common_replacements(s: str) -> str:
    for k, v in COMMON_REPLACEMENTS.items():
        s = s.replace(k, v)
    return s

def normalize_bengali_name(name: str) -> str:
    if not name:
        return ""
    s = re.sub(r'[\x00-\x1F]+', ' ', name)
    s = apply_common_replacements(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# ---------- Roman-to-Bengali fallback maps (digraphs first, then single letters) ----------
_DOUBLE_MAP = {
    "kh": "খ", "gh": "ঘ", "ch": "চ", "jh": "ঝ", "th": "থ", "dh": "ধ",
    "ph": "ফ", "bh": "ভ", "sh": "শ", "shh": "ষ", "ss": "স", "ng": "ঙ",
    "ny": "ঞ", "rr": "ঢ়", "tr": "ত্র", "gn": "ঞ"
}

_SINGLE_MAP = {
    "a": "অ", "b": "ব", "c": "ক", "d": "দ", "e": "এ", "f": "ফ", "g": "গ", "h": "হ",
    "i": "ই", "j": "জ", "k": "ক", "l": "ল", "m": "ম", "n": "ন", "o": "ও", "p": "প",
    "q": "ক", "r": "র", "s": "স", "t": "ত", "u": "উ", "v": "ভ", "w": "ও", "x": "ক্স", "y": "য়", "z": "জ"
}

# ---------- Indic transliteration helpers ----------
@lru_cache(maxsize=4096)
def transliterate_forward_indic_cached(bn_name: str, scheme_out: str = "KOLKATA") -> str:
    return transliterate_forward_indic(bn_name, scheme_out=scheme_out)

def transliterate_forward_indic(bn_name: str, scheme_out: str = "KOLKATA") -> str:
    if not bn_name: return ""
    bn = normalize_bengali_name(bn_name)
    if not _HAS_INDIC_TRANS: return re.sub(r'\s+', ' ', bn)
    scheme_map = {"KOLKATA": sanscript.KOLKATA, "ITRANS": sanscript.ITRANS, "IAST": sanscript.IAST, "WX": sanscript.WX}
    out = ""
    try:
        out = it_transliterate(bn, sanscript.BENGALI, scheme_map.get(scheme_out, sanscript.KOLKATA))
    except Exception:
        try: out = it_transliterate(bn, sanscript.BENGALI, sanscript.KOLKATA)
        except Exception: out = ""
    return re.sub(r'\s+', ' ', out).strip()

def generate_forward_variants(bn_name: str, max_variants: int = 12) -> List[str]:
    variants = []; seen = set()
    def add(x):
        if not x: return
        x2 = re.sub(r'\s+', ' ', x.strip())
        if x2 not in seen: seen.add(x2); variants.append(x2)
    k = transliterate_forward_indic_cached(bn_name, scheme_out="KOLKATA")
    add(k); add(k.lower()); add(k.replace(" ", "")); add(k.replace(" ", "").lower())
    if _HAS_INDIC_TRANS:
        try:
            itr = it_transliterate(normalize_bengali_name(bn_name), sanscript.BENGALI, sanscript.ITRANS)
            add(itr); add(itr.lower()); add(itr.replace(" ", ""))
        except Exception: pass
    for v in list(variants):  # snapshot: add() appends to variants while we iterate
        add(v.replace("sh", "s"))
        if len(v) > 3: add(v[:-1]); add(v[1:])
    return variants[:max_variants]
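
# Illustrative call (the exact romanizations depend on the installed
# indic-transliteration version, so treat the outputs below as assumptions):
#   generate_forward_variants("রহিম মণ্ডল")
#   -> roughly ["rahim mandal", "rahimmandal", ...ITRANS forms...,
#      plus the truncated and "sh"->"s" variants added by the final loop]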

@lru_cache(maxsize=4096)
def reverse_transliterate_indic_cached(en_name: str, max_variants: int = 8) -> List[str]:
    # note: the cached value is a shared list; callers must treat it as read-only
    return reverse_transliterate_indic(en_name, max_variants=max_variants)

def reverse_transliterate_indic(en_name: str, max_variants: int = 8) -> List[str]:
    if not en_name: return []
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', en_name).strip().lower()
    s = re.sub(r'\s+', ' ', s)
    variants = []; seen = set()
    if not _HAS_INDIC_TRANS: return _fallback_reverse_variants(en_name, max_variants)
    styles = [sanscript.ITRANS, sanscript.KOLKATA, sanscript.IAST, sanscript.WX]
    for src in styles:
        if len(variants) >= max_variants: break
        try:
            bn = it_transliterate(s, src, sanscript.BENGALI)
            bn = normalize_bengali_name(bn)
            if bn and bn not in seen: seen.add(bn); variants.append(bn)
        except Exception: continue
    toks = s.split()
    if len(variants) < max_variants and len(toks) > 1:
        try:
            joined = it_transliterate("".join(toks), sanscript.ITRANS, sanscript.BENGALI)
            joined = normalize_bengali_name(joined)
            if joined and joined not in seen: seen.add(joined); variants.append(joined)
        except Exception: pass
    return variants[:max_variants]

def _fallback_reverse_variants(en_name: str, max_variants:int=8) -> List[str]:
    s = re.sub(r'[^a-z0-9\s]', ' ', en_name.lower()).strip()
    toks = s.split()
    if not toks: return []
    def roman_to_bn(tok: str) -> str:
        i = 0; out = []
        while i < len(tok):
            if i + 3 <= len(tok) and tok[i:i+3] in _DOUBLE_MAP: out.append(_DOUBLE_MAP[tok[i:i+3]]); i += 3; continue
            if i + 2 <= len(tok) and tok[i:i+2] in _DOUBLE_MAP: out.append(_DOUBLE_MAP[tok[i:i+2]]); i += 2; continue
            ch = tok[i]; out.append(_SINGLE_MAP.get(ch, "")); i += 1
        return "".join(out)
    bn_tokens = [roman_to_bn(t) for t in toks]
    spaced = " ".join([t for t in bn_tokens if t]); joined = "".join([t for t in bn_tokens if t])
    out = []
    if spaced: out.append(spaced)
    if joined and joined != spaced: out.append(joined)
    return out[:max_variants]
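
# Illustrative fallback (used only when indic-transliteration is missing; the
# letter maps above are crude and make no claim of linguistic accuracy):
#   _fallback_reverse_variants("rahim") -> ["রঅহইম"]
#   (r->র, a->অ, h->হ, i->ই, m->ম, letter by letter via _SINGLE_MAP)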

# ---------- Similarity ----------
def normalize_bn_for_compare(bn: str) -> str:
    s = normalize_bengali_name(bn or "")
    if not s: return ""
    s = s.replace("ঃ", ":").replace("।", "").replace("–", "-").replace("—", "-")
    s = s.replace("্র", "র").replace("\u09cd\u09b0", "র")
    s = s.translate(_BN_DIGITS_TRANS)
    s = re.sub(r'[^ঀ-৾0-9\s]', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def bn_string_similarity(a: str, b: str) -> float:
    a = normalize_bn_for_compare(a); b = normalize_bn_for_compare(b)
    if not a or not b: return 0.0
    try: lev = Levenshtein.normalized_similarity(a, b) * 100.0
    except Exception: lev = fuzz.ratio(a, b)
    part = fuzz.partial_ratio(a, b)
    score = 0.7 * lev + 0.3 * part
    return score
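
# Worked example of the 70/30 blend (numbers illustrative): with a normalized
# Levenshtein similarity of 80 and a partial ratio of 90, the score is
# 0.7 * 80 + 0.3 * 90 = 83.0 on the 0-100 scale.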

EN_REPLACE_MAP = {".":" ", ",":" ", "-":" ", "'":"", "  ":" "}
def normalize_api_name(api_name: str) -> str:
    s = (api_name or "").strip().lower()
    for k, v in EN_REPLACE_MAP.items(): s = s.replace(k, v)
    return re.sub(r'\s+', ' ', s).strip()

def token_phonetic_score(a: str, b: str) -> float:
    a = (a or "").strip().lower(); b = (b or "").strip().lower()
    if not a or not b: return 0.0
    if not _HAS_JELLYFISH: return 100.0 if a == b else 0.0
    try:
        ma = jellyfish.metaphone(a) or ""; mb = jellyfish.metaphone(b) or ""
        if not ma or not mb: return 0.0
        return fuzz.ratio(ma, mb)
    except Exception: return 0.0

def token_score_en(api_tok: str, cand_tok: str) -> float:
    api_tok = (api_tok or "").strip(); cand_tok = (cand_tok or "").strip()
    if not api_tok or not cand_tok: return 0.0
    t1 = fuzz.ratio(api_tok, cand_tok)
    t2 = fuzz.partial_ratio(api_tok, cand_tok)
    t3 = fuzz.token_sort_ratio(api_tok, cand_tok)
    try: lev = Levenshtein.normalized_similarity(api_tok, cand_tok) * 100.0
    except Exception: lev = 0.0
    phon = token_phonetic_score(api_tok, cand_tok)
    return 0.20 * t1 + 0.15 * t2 + 0.15 * t3 + 0.35 * lev + 0.15 * phon

# ---------- Candidate generation ----------
def generate_bn_candidates_from_filemap(file_map: Dict[str, Any]) -> List[Dict[str, Any]]:
    out = []
    for file_url, fmap in file_map.items():
        for kh, val in fmap.get("khatian_map", {}).items():
            if isinstance(val, dict):
                bn = val.get("bn") or val.get("raw", "")
            else:
                bn = val or ""
            daag = None
            if fmap.get("daags"):
                daag = fmap["daags"][0]
            out.append({"bn": bn, "file": file_url, "daag_no": daag, "khatian_no": kh})
    return out

# ---------- Matching directions ----------
def match_forward(api_name: str, bn: str) -> float:
    api_norm = normalize_api_name(api_name)
    if not api_norm: return 0.0
    variants = generate_forward_variants(bn, max_variants=12)
    best = 0.0
    for v in variants:
        v_norm = re.sub(r'[^0-9a-z\s]', ' ', v.lower()).strip()
        ts = fuzz.token_set_ratio(api_norm, v_norm)
        pr = fuzz.partial_ratio(api_norm, v_norm)
        try: lev = Levenshtein.normalized_similarity(api_norm, v_norm) * 100.0
        except Exception: lev = 0.0
        combined = 0.55 * ts + 0.25 * pr + 0.20 * lev
        if combined > best: best = combined
    return best / 100.0

def match_reverse(api_name: str, bn: str) -> float:
    if not api_name: return 0.0
    rv = reverse_transliterate_indic_cached(api_name, max_variants=12)
    best = 0.0; bn_norm = normalize_bn_for_compare(bn)
    for cand in rv:
        sc = bn_string_similarity(bn_norm, cand)
        if sc > best: best = sc
    return best / 100.0

def surname_similarity_bn(bn1: str, bn2: str) -> float:
    if not bn1 or not bn2: return 0.0
    a = normalize_bn_for_compare(bn1).strip(); b = normalize_bn_for_compare(bn2).strip()
    if not a or not b: return 0.0
    return bn_string_similarity(a.split()[-1], b.split()[-1]) / 100.0

def first_name_similarity(api_name: str, bn: str) -> float:
    bn_norm = normalize_bn_for_compare(bn)
    bn_tokens = bn_norm.split()
    bn_first = bn_tokens[0] if bn_tokens else ""
    api_norm = normalize_api_name(api_name)
    api_first = api_norm.split()[0] if api_norm.split() else ""
    best_bn_space = 0.0
    try:
        rev_cands = reverse_transliterate_indic_cached(api_name, max_variants=8)
        for rc in rev_cands:
            rc_first = rc.split()[0] if rc.split() else ""
            if rc_first and bn_first:
                sc = bn_string_similarity(bn_first, rc_first) / 100.0
                if sc > best_bn_space: best_bn_space = sc
    except Exception:
        best_bn_space = 0.0
    best_en_space = 0.0
    try:
        fwd = transliterate_forward_indic_cached(bn, scheme_out="KOLKATA") or ""
        fwd_first = re.sub(r'[^0-9a-z\s]', ' ', fwd.lower()).strip().split()[0] if fwd else ""
        if api_first and fwd_first: best_en_space = token_score_en(api_first, fwd_first) / 100.0
    except Exception:
        best_en_space = 0.0
    return max(best_bn_space, best_en_space)

def hybrid_match(api_name: str, bn: str, surname_boost: float = 0.0, surname_threshold: float = 0.6) -> Dict[str, Any]:
    f = match_forward(api_name, bn); r = match_reverse(api_name, bn)
    base_best = f if f >= r else r; best_dir = "forward" if f >= r else "reverse"
    surname_sim = 0.0
    try:
        rev_cands = reverse_transliterate_indic_cached(api_name, max_variants=6)
        best_rev_s = 0.0
        for rc in rev_cands:
            ssim = surname_similarity_bn(bn, rc)
            if ssim > best_rev_s: best_rev_s = ssim
        surname_sim = best_rev_s
    except Exception: surname_sim = 0.0
    boosted = base_best
    if surname_boost and surname_sim >= surname_threshold: boosted = min(1.0, base_best + surname_boost)
    fn_sim = first_name_similarity(api_name, bn)
    return {"forward_score": f, "reverse_score": r, "best_score": boosted, "best_direction": best_dir, "surname_similarity": surname_sim, "first_name_similarity": fn_sim}

# ---------- Cleaning utilities ----------
def strip_control_chars(s: Optional[str]) -> str:
    if not s: return ""
    s = re.sub(r'[\x00-\x1F\x7F]+', ' ', str(s))
    s = s.replace('\ufeff',' ').replace('\u200b',' ').replace('\u200c','').replace('\xa0',' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def is_valid_raiter_text(s: str) -> bool:
    if not s: return False
    s2 = strip_control_chars(s)
    if re.search(r'\bদাগ\b|\bদাগ\s*নং\b|\bদােগর\b', s2): return False  # third alternative is likely an OCR-garbled form of "দাগের"
    bengali_letters = re.findall(r'[\u0980-\u09FF]', s2)
    if len(bengali_letters) < 2: return False
    return True

def normalize_khatian_raw(kh_raw: Optional[str]) -> str:
    if not kh_raw: return ""
    k = strip_control_chars(str(kh_raw))
    k = re.sub(r'\s+', '', k)
    k = k.translate(_BN_DIGITS_TRANS)
    m = re.search(r'([0-9]{1,6}(?:\/[0-9]{1,4})?)', k)
    if m: return m.group(1)
    return k
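
# Illustrative examples:
#   normalize_khatian_raw("খতিয়ান নং ১২৩/৪") -> "123/4"
#   normalize_khatian_raw(" 0045 ")           -> "0045"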

def pick_best_raiter_for_khatian(candidates: List[Dict[str,Any]]) -> Optional[Dict[str,Any]]:
    best = None; best_score = -1
    for r in candidates:
        raw = r.get("raiter") or ""
        cleaned = strip_control_chars(raw)
        if not is_valid_raiter_text(cleaned): continue
        beng = len(re.findall(r'[\u0980-\u09FF]', cleaned))
        score = beng * 10 + len(cleaned)
        if score > best_score:
            best_score = score; best = {**r, "raiter_clean": cleaned}
    return best

def clean_recs_list(recs: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
    if not recs: return []
    for r in recs:
        r["raiter_raw"] = r.get("raiter") or ""
        r["raiter"] = strip_control_chars(r.get("raiter") or "")
        r["khatian_raw"] = r.get("khatian") or ""
        r["khatian"] = normalize_khatian_raw(r["khatian_raw"])
        r["daag_raw"] = r.get("daag") or ""
        r["daag"] = bn_digits_to_ascii(str(r["daag_raw"])) if r["daag_raw"] else r.get("daag") or ""
    by_kh = {}
    for r in recs:
        kh = r.get("khatian") or ""
        by_kh.setdefault(kh, []).append(r)
    cleaned = []
    for kh, group in by_kh.items():
        best = pick_best_raiter_for_khatian(group)
        if best:
            cleaned.append({"daag": best.get("daag"), "khatian": kh or None, "raiter": best.get("raiter_clean"), "source_count": len(group)})
        else:
            fallback = max(group, key=lambda x: len(re.findall(r'[\u0980-\u09FF]', x.get("raiter") or "")))
            cleaned.append({"daag": fallback.get("daag"), "khatian": kh or None, "raiter": (fallback.get("raiter") or "").strip(), "source_count": len(group)})
    seen = set(); out = []
    for c in cleaned:
        key = f"{c.get('daag')}::{c.get('khatian')}::{c.get('raiter')}"
        if key not in seen: seen.add(key); out.append(c)
    return out
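
# Illustrative before/after (assumed noisy input rows sharing khatian "12"):
#   [{"daag": "৫", "khatian": "১২", "raiter": "রহিম মণ্ডল"},
#    {"daag": "৫", "khatian": "12", "raiter": "দাগ নং"}]
# collapses to one cleaned row; "দাগ নং" is rejected by is_valid_raiter_text:
#   [{"daag": "5", "khatian": "12", "raiter": "রহিম মণ্ডল", "source_count": 2}]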

# ---------- OpenCV-based table extraction ----------
def image_to_gray_cv(img_pil: Image.Image) -> Any:
    arr = np.array(img_pil)
    if arr.ndim == 3:
        if arr.shape[2] == 4:
            # drop the alpha channel if the source image carries one
            arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2RGB)
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
    else:
        gray = arr
    return gray

def detect_table_rois_cv(img_pil: Image.Image, debug: bool=False) -> List[Dict[str,int]]:
    if not _HAS_CV2:
        return []
    gray = image_to_gray_cv(img_pil)
    blur = cv2.GaussianBlur(gray, (3,3), 0)
    th = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 9)
    # isolate long horizontal/vertical strokes (table rulings) with morphology
    horiz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(10, gray.shape[1]//40), 1))
    vert_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(10, gray.shape[0]//80)))
    horiz_lines = cv2.morphologyEx(th, cv2.MORPH_OPEN, horiz_kernel, iterations=1)
    vert_lines = cv2.morphologyEx(th, cv2.MORPH_OPEN, vert_kernel, iterations=1)
    mask = cv2.add(horiz_lines, vert_lines)
    # dilate so broken rulings connect into whole-table blobs, then take their
    # outer contours as candidate table regions
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    connected = cv2.dilate(mask, kernel, iterations=2)
    contours, _ = cv2.findContours(connected, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    h, w = gray.shape[:2]
    for cnt in contours:
        x,y,ww,hh = cv2.boundingRect(cnt)
        if ww < 40 or hh < 20: continue
        if ww < w*0.1 and hh < h*0.02: continue
        pad_x = int(min(20, ww*0.05))
        pad_y = int(min(12, hh*0.1))
        rx = max(0, x-pad_x); ry = max(0, y-pad_y)
        rw = min(w-rx, ww+pad_x*2); rh = min(h-ry, hh+pad_y*2)
        rois.append({"x": rx, "y": ry, "w": rw, "h": rh})
    rois = sorted(rois, key=lambda r: r["y"])
    # merge ROIs that overlap vertically (row fragments of the same table)
    merged = []
    for r in rois:
        if not merged:
            merged.append(r); continue
        last = merged[-1]
        if r["y"] < last["y"] + last["h"]*0.6:
            nx = min(last["x"], r["x"]); ny = min(last["y"], r["y"])
            nw = max(last["x"]+last["w"], r["x"]+r["w"]) - nx
            nh = max(last["y"]+last["h"], r["y"]+r["h"]) - ny
            merged[-1] = {"x": nx, "y": ny, "w": nw, "h": nh}
        else:
            merged.append(r)
    return merged
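
# Dev-only sketch (not called anywhere in this module): render the detected
# ROIs onto the page image so the morphology thresholds above can be tuned
# visually. The output path and colour choice are arbitrary assumptions.
def _debug_draw_rois(img_pil: Image.Image, rois: List[Dict[str, int]], out_path: str) -> None:
    if not _HAS_CV2:
        return
    arr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)  # imwrite expects BGR
    for r in rois:
        cv2.rectangle(arr, (r["x"], r["y"]), (r["x"] + r["w"], r["y"] + r["h"]), (0, 0, 255), 2)
    cv2.imwrite(out_path, arr)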

def ocr_roi_text(img_pil: Image.Image, roi: Dict[str,int], lang: str='ben+eng', psm: int=6) -> str:
    x, y, w, h = roi["x"], roi["y"], roi["w"], roi["h"]
    crop = img_pil.crop((x, y, x + w, y + h))
    cfg = f'--oem 1 --psm {psm}'
    txt = pytesseract.image_to_string(crop, lang=lang, config=cfg)
    # Strip control characters but preserve newlines: callers split the ROI
    # text into lines, so collapsing all whitespace would destroy the rows.
    txt = txt.replace('\r', '\n')
    txt = re.sub(r'[\x00-\x09\x0b-\x1f]+', ' ', txt)
    txt = txt.replace('\ufeff', ' ').replace('\u200b', ' ').replace('\xa0', ' ')
    lines = [re.sub(r'\s+', ' ', ln).strip() for ln in txt.splitlines()]
    return "\n".join(ln for ln in lines if ln)

def extract_tables_via_opencv(pdf_path: str, dpi: int = 300, debug: bool=False) -> Dict[str, List[Dict[str,Any]]]:
    if not _HAS_CV2:
        return {"daags": [], "khatians": []}
    pages = convert_from_path(pdf_path, dpi=dpi)
    daag_items = []
    khatian_items = []
    for pidx, img in enumerate(pages):
        rois = detect_table_rois_cv(img, debug=debug)
        if debug:
            logger.info(f"page {pidx} rois: {len(rois)}")
        for roi in rois:
            txt = ocr_roi_text(img, roi, lang='ben+eng', psm=6)
            if not txt or len(txt.strip()) < 3: continue
            lines = [ln.strip() for ln in txt.splitlines() if ln.strip()]
            for ln in lines:
                if 'দাগ' in ln or 'দা গ' in ln:  # the longer label variants all contain 'দাগ'
                    m = re.search(r'([0-9০১২৩৪৫৬৭৮৯\/\-]{1,8})', ln.replace(' ', ''))
                    if m:
                        val = m.group(1).translate(_BN_DIGITS_TRANS)
                        daag_items.append({"daag_label_line": ln, "daag_value": val, "page": pidx})
                        continue
            for i, ln in enumerate(lines):
                if 'খত' in ln or 'খতিয়ান' in ln or 'khatian' in ln.lower():
                    m = re.search(r'([0-9০১২৩৪৫৬৭৮৯\/\-]{1,8})', ln.replace(' ', ''))
                    kh = m.group(1).translate(_BN_DIGITS_TRANS) if m else ""
                    raiter = ""
                    if i+1 < len(lines):
                        cand = lines[i+1]
                        if len(re.findall(r'[\u0980-\u09FF]', cand)) >= 3:
                            raiter = cand
                    if not raiter:
                        parts = re.split(r'[:\-]', ln, maxsplit=1)
                        if len(parts) > 1 and len(re.findall(r'[\u0980-\u09FF]', parts[1]))>=3:
                            raiter = parts[1].strip()
                    khatian_items.append({"khatian": kh, "raiter": raiter, "page": pidx, "roi": roi})
            # Fallback within the same ROI: treat short, Bengali-heavy lines as
            # candidate raiter names and pair each with the first number found
            # anywhere in the ROI text (txt, not just the current line).
            for ln in lines:
                if len(re.findall(r'[\u0980-\u09FF]', ln)) >= 4 and len(ln.split()) <= 6:
                    m = re.search(r'([0-9০১২৩৪৫৬৭৮৯\/\-]{1,8})', txt.replace(' ', ''))
                    kh = m.group(1).translate(_BN_DIGITS_TRANS) if m else ""
                    khatian_items.append({"khatian": kh, "raiter": ln, "page": pidx, "roi": roi})
    def dedupe_list(lst, keys):
        seen = set(); out = []
        for it in lst:
            key = tuple(it.get(f, "") for f in keys)  # avoid shadowing the loop variable
            if key not in seen:
                seen.add(key); out.append(it)
        return out
    daag_items = dedupe_list(daag_items, ["daag_value", "page"])
    khatian_items = dedupe_list(khatian_items, ["khatian", "raiter", "page"])
    return {"daags": daag_items, "khatians": khatian_items}

# ---------- Extraction heuristics (text-layer fallback) ----------
def extract_daag_khatian_raiter_from_text(text: str):
    text_norm = re.sub(r'\r','\n', text or "")
    lines = [ln.strip() for ln in text_norm.splitlines() if ln.strip()]
    daags_raw = [m.group(1).strip() for m in DAAG_RE.finditer(text or "")]
    daags = [bn_digits_to_ascii(d) for d in daags_raw if d]
    results = []
    for m in RAITER_LABEL_RE.finditer(text or ""):
        start = m.end(); snippet = text[start:start+300]
        for ln in snippet.splitlines():
            ln = ln.strip()
            if BENGALI_RE.search(ln) and 2 <= len(ln) <= 140:
                daag = daags[0] if daags else None
                kh = None
                back = text[max(0,m.start()-200): m.end()+200]
                km = KHATIAN_RE.search(back)
                if km:
                    kh_raw = km.group(1).strip()
                    kh = bn_digits_to_ascii(kh_raw)
                results.append({"daag": daag, "khatian": kh, "raiter": ln})
    for i, ln in enumerate(lines):
        mnum = re.search(r'\b([0-9০-৯]{1,6}(?:\/[0-9০-৯]{1,4})?)\b', ln)
        if mnum:
            kid_raw = mnum.group(1)
            kid = bn_digits_to_ascii(kid_raw)
            for j in range(1,4):
                if i+j < len(lines):
                    nxt = lines[i+j]
                    if BENGALI_RE.search(nxt) and 2 <= len(nxt) <= 120:
                        daag = daags[0] if daags else None
                        results.append({"daag": daag, "khatian": kid, "raiter": nxt})
                        break
    seen = set(); out = []
    for r in results:
        key = f"{r.get('daag')}::{r.get('khatian')}::{r.get('raiter')}"
        if key not in seen: seen.add(key); out.append(r)
    return out
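
# Illustrative input/output (assumed text-layer content; real OCR output is
# noisier):
#   text = "দাগ নং ১২৩\nরায়তের নাম\nরহিম মণ্ডল\nখতিয়ান নং ৪৫"
#   extract_daag_khatian_raiter_from_text(text)
#   -> includes {"daag": "123", "khatian": "45", "raiter": "রহিম মণ্ডল"}
#      (plus noisier rows that clean_recs_list later collapses)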

# ---------- API models ----------
class SimpleCheckRequest(BaseModel):
    daag: List[str]
    khatian: List[str]
    farmer: List[str]
    files: List[str]
    threshold: float
    force_ocr: Optional[bool] = False
    request_id: Optional[str] = None
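
# Example request body (all values hypothetical):
#   POST /quick_check
#   {"daag": ["123"], "khatian": ["45"], "farmer": ["Rahim Mandal"],
#    "files": ["https://example.com/porcha.pdf"], "threshold": 0.75}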

# ---------- /quick_check endpoint ----------
@app.post("/quick_check")
async def quick_check(req: SimpleCheckRequest = Body(...)):
    try:
        threshold = float(req.threshold)
    except Exception:
        threshold = 0.75
    threshold = max(0.0, min(1.0, threshold))
    request_id = getattr(req, "request_id", None) or str(uuid.uuid4())

    # match-decision thresholds used below
    SURNAME_REQUIRED = 0.60
    FIRSTNAME_REQUIRED = 0.45
    SURNAME_BOOST = 0.10
    SURNAME_THRESHOLD = 0.60
    FIRSTNAME_SAFETY_FLOOR = 0.25

    tempd = tempfile.mkdtemp(prefix="quickcheck_")
    try:
        try:
            downloaded = await download_all(req.files, tempd)
        except Exception as e:
            shutil.rmtree(tempd, ignore_errors=True)
            raise HTTPException(status_code=400, detail=f"download_error: {e}")

        file_map = {}
        global_daags = set()
        global_khatians = set()
        use_ocr = bool(getattr(req, "force_ocr", False))

        for file_url, local_path in zip(req.files, downloaded):
            # Prefer OpenCV table extraction when available
            tables = {"daags": [], "khatians": []}
            try:
                if _HAS_CV2:
                    # run the CPU-heavy page rendering + ROI OCR off the event
                    # loop, mirroring what ocr_pdf_async does for full-page OCR
                    loop = asyncio.get_running_loop()
                    tables = await loop.run_in_executor(None, extract_tables_via_opencv, local_path)
            except Exception as e:
                logger.exception("extract_tables_via_opencv failed: %s", e)
                tables = {"daags": [], "khatians": []}

            fmap = {"daags": [], "khatian_map": {}, "bn_to_khatian": {}}
            used_table_extraction = False

            if tables and (tables.get("daags") or tables.get("khatians")):
                # map daags
                for d in tables.get("daags", []):
                    v = d.get("daag_value") or ""
                    v = bn_digits_to_ascii(v)
                    if v and v not in fmap["daags"]:
                        fmap["daags"].append(v); global_daags.add(v)
                # map khatians
                for k in tables.get("khatians", []):
                    kh = bn_digits_to_ascii(k.get("khatian") or "")
                    bn_raw = k.get("raiter") or ""
                    bn_norm = normalize_bengali_name(bn_raw)
                    if kh:
                        fmap["khatian_map"][kh] = {"bn": bn_norm or bn_raw or "", "raw": bn_raw, "orig_khatian": k.get("khatian")}
                        global_khatians.add(kh)
                        if fmap["khatian_map"][kh].get("bn"):
                            fmap["bn_to_khatian"][fmap["khatian_map"][kh]["bn"]] = kh
                used_table_extraction = True

            if not used_table_extraction:
                # fall back to text-layer + OCR
                text = ""
                if use_ocr:
                    text = await ocr_pdf_async(local_path)
                else:
                    text = extract_text_from_pdf(local_path)
                    if not text or len(text.strip()) < 80:
                        text = await ocr_pdf_async(local_path)

                raw_recs = extract_daag_khatian_raiter_from_text(text)
                recs = clean_recs_list(raw_recs)
                # log both raw and cleaned
                try:
                    log_file_extraction(file_url, raw_recs, ocr_text=text if VERBOSE_LOG_OCR else None, extra={"cleaned_recs": recs}, debug=VERBOSE_LOG_OCR)
                except Exception:
                    logger.exception("log_file_extraction failed for %s", file_url)

                for r in recs:
                    daag = bn_digits_to_ascii(str(r.get("daag") or "")) if r.get("daag") else None
                    kh = r.get("khatian") or None
                    bn_norm = normalize_bengali_name(r.get("raiter") or "")
                    if daag and daag not in fmap["daags"]:
                        fmap["daags"].append(daag); global_daags.add(daag)
                    if kh:
                        fmap["khatian_map"][kh] = {"bn": bn_norm or r.get("raiter") or "", "raw": r.get("raiter")}
                        global_khatians.add(kh)
                        if fmap["khatian_map"][kh].get("bn"):
                            fmap["bn_to_khatian"][fmap["khatian_map"][kh]["bn"]] = kh
            else:
                # if we used table extraction, also log the extracted tables
                try:
                    log_file_extraction(file_url, tables.get("khatians", []) + tables.get("daags", []), ocr_text=None, extra={"method":"opencv_table"}, debug=VERBOSE_LOG_OCR)
                except Exception:
                    logger.exception("log_file_extraction (table) failed for %s", file_url)

            file_map[file_url] = fmap

        missing_daag = []
        missing_khatian = []
        for dq in req.daag:
            if bn_digits_to_ascii(str(dq)) not in global_daags:
                missing_daag.append({"daag": str(dq), "files_checked": req.files})
        for kq in req.khatian:
            if bn_digits_to_ascii(str(kq)) not in global_khatians:
                missing_khatian.append({"khatian": str(kq), "files_checked": req.files})

        bn_candidates = generate_bn_candidates_from_filemap(file_map)
        missing_farmer = []
        matches_found = []

        for farmer_q in req.farmer:
            best_overall = {"best_score": 0.0, "best_bn": None, "best_file": None,
                            "best_daag": None, "best_khatian": None, "direction": None,
                            "forward_score": 0.0, "reverse_score": 0.0,
                            "surname_similarity": 0.0, "first_name_similarity": 0.0}
            cands_scores = []
            for cand in bn_candidates:
                bn = cand.get("bn")
                if not bn:
                    continue
                res = hybrid_match(farmer_q, bn, surname_boost=SURNAME_BOOST, surname_threshold=SURNAME_THRESHOLD)
                score = res["best_score"]
                cands_scores.append({
                    "bn": bn, "file": cand.get("file"), "daag": cand.get("daag_no"), "khatian": cand.get("khatian_no"),
                    "score": score, "forward_score": res.get("forward_score"), "reverse_score": res.get("reverse_score"),
                    "surname_similarity": res.get("surname_similarity"), "first_name_similarity": res.get("first_name_similarity")
                })
                if score > best_overall["best_score"]:
                    fn_sim = res.get("first_name_similarity")
                    if fn_sim is None: fn_sim = first_name_similarity(farmer_q, bn)
                    best_overall.update({
                        "best_score": score, "best_bn": bn, "best_file": cand.get("file"),
                        "best_daag": cand.get("daag_no"), "best_khatian": cand.get("khatian_no"),
                        "direction": res["best_direction"], "forward_score": res["forward_score"],
                        "reverse_score": res["reverse_score"], "surname_similarity": res.get("surname_similarity", 0.0),
                        "first_name_similarity": fn_sim
                    })
            top_candidates = sorted(cands_scores, key=lambda x: x["score"], reverse=True)[:3]
            is_high_score = (best_overall["best_score"] >= threshold)
            is_strict_component_match = (best_overall["surname_similarity"] >= SURNAME_REQUIRED and best_overall["first_name_similarity"] >= FIRSTNAME_REQUIRED)
            structured_match = is_high_score or is_strict_component_match
            # flag matches that cleared the overall threshold despite a very weak first name
            weak_firstname_flag = is_high_score and best_overall["first_name_similarity"] < FIRSTNAME_SAFETY_FLOOR
            try:
                log_farmer_match(request_id, farmer_q, top_candidates, best_overall, "matched" if structured_match else "missing", debug=VERBOSE_LOG_OCR)
            except Exception:
                logger.exception("log_farmer_match failed for farmer %s", farmer_q)

            if structured_match:
                matches_found.append({
                    "farmer": farmer_q,
                    "matched_file": best_overall["best_file"],
                    "found_bengali": best_overall["best_bn"],
                    "daag_no": best_overall["best_daag"],
                    "khatian_no": best_overall["best_khatian"],
                    "score": best_overall["best_score"],
                    "direction": best_overall["direction"],
                    "forward_score": best_overall["forward_score"],
                    "reverse_score": best_overall["reverse_score"],
                    "surname_similarity": best_overall["surname_similarity"],
                    "first_name_similarity": best_overall["first_name_similarity"],
                    "weak_firstname": weak_firstname_flag,
                    "top_candidates": top_candidates
                })
            else:
                missing_farmer.append({
                    "farmer": farmer_q,
                    "best_score": best_overall["best_score"],
                    "found_bengali": best_overall["best_bn"],
                    "file": best_overall["best_file"],
                    "daag_no": best_overall["best_daag"],
                    "khatian_no": best_overall["best_khatian"],
                    "direction": best_overall["direction"],
                    "forward_score": best_overall["forward_score"],
                    "reverse_score": best_overall["reverse_score"],
                    "surname_similarity": best_overall["surname_similarity"],
                    "first_name_similarity": best_overall["first_name_similarity"],
                    "top_candidates": top_candidates
                })

        summary = {"total_mismatches": len(missing_daag) + len(missing_khatian) + len(missing_farmer),
                   "daag_mismatches": len(missing_daag),
                   "khatian_mismatches": len(missing_khatian),
                   "farmer_mismatches": len(missing_farmer)}
        status = "ACCEPTED" if summary["total_mismatches"] == 0 else "REJECTED"

        try:
            logger.info(_safe_json({"event":"request_summary","request_id":request_id,"missing_daag":len(missing_daag),
                                    "missing_khatian":len(missing_khatian),"missing_farmer":len(missing_farmer),"status":status,
                                    "timestamp": datetime.utcnow().isoformat() + "Z"}))
        except Exception:
            logger.exception("Failed to log request_summary for %s", request_id)

        return {"missing_daag": missing_daag, "missing_khatian": missing_khatian, "missing_farmer": missing_farmer,
                "summary": summary, "status": status, "matches": matches_found}
    finally:
        shutil.rmtree(tempd, ignore_errors=True)

# End of file