from typing import List, Optional, Dict, Any
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import asyncio
import aiohttp
import tempfile
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("land_pdf_extractor")

app = FastAPI(title="Land PDF Extraction API")

# ---- Pydantic models ----
class ExtractRequest(BaseModel):
    daag: Optional[List[str]] = Field(default_factory=list)
    khatian: Optional[List[str]] = Field(default_factory=list)
    farmer: Optional[List[str]] = Field(default_factory=list)
    files: List[str]
    status: Optional[str] = None  # e.g. "accepted" or "rejected"

class FileResult(BaseModel):
    url: str
    ok: bool
    reason: Optional[str] = None
    extracted: Dict[str, Any] = Field(default_factory=dict)

class ExtractResponse(BaseModel):
    overall_status: str
    results: List[FileResult]
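
# Example payload for POST /extract (all values are illustrative only):
# {
#   "daag": ["371"],
#   "khatian": ["123/4"],
#   "farmer": ["Example Farmer Name"],
#   "files": ["https://example.com/record-of-rights.pdf"],
#   "status": "accepted"
# }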

# ---- Helpers ----
async def fetch_file(session: aiohttp.ClientSession, url: str, dest_path: Path) -> None:
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as resp:
        if resp.status != 200:
            raise RuntimeError(f"HTTP {resp.status}")
        with open(dest_path, "wb") as f:
            while True:
                chunk = await resp.content.read(1024 * 32)
                if not chunk:
                    break
                f.write(chunk)


def extract_text_from_pdf(path: Path) -> List[str]:
    """Return list of page texts using pdfplumber (fast)"""
    pages = []
    try:
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                pages.append(p.extract_text() or "")
    except Exception as e:
        logger.exception("pdfplumber failed: %s", e)
    return pages


def ocr_pdf(path: Path) -> List[str]:
    """Fallback OCR using pdf2image + pytesseract; heavier but robust"""
    texts = []
    try:
        images = convert_from_path(str(path), dpi=200)
        for img in images:
            try:
                texts.append(pytesseract.image_to_string(img, lang="ben+eng"))  # use Bengali traineddata if installed
            except pytesseract.TesseractError:
                texts.append(pytesseract.image_to_string(img))
    except Exception as e:
        logger.exception("OCR failed: %s", e)
    return texts


def find_khatian_in_pages(pages: List[str], khatian_values: List[str]) -> Optional[int]:
    """Return page index where any khatian string appears (case-insensitive substring match)"""
    if not khatian_values:
        return None
    lowered = [k.lower() for k in khatian_values]
    for i, text in enumerate(pages):
        t = (text or "").lower()
        for k in lowered:
            if k in t:
                return i
    return None
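

# ---- Assumed helper sketches ------------------------------------------------
# extract_tables_bn() below calls several helpers that are not defined in this
# module (similar code only exists in the commented-out legacy version at the
# bottom of the file). The following are minimal sketches so the module can run;
# the exact cleaning, number-guessing and transliteration rules are assumptions
# and should be replaced by the original implementations if they exist elsewhere.

# Bengali digits -> ASCII digits
NUM_MAP = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

# Simple rule-based Bengali -> Latin map (copied from the legacy version below)
BENG_TO_LAT_MAP = {
    'অ': 'o', 'আ': 'a', 'ই': 'i', 'ঈ': 'i', 'উ': 'u', 'এ': 'e', 'ঐ': 'oi', 'ও': 'o', 'ঔ': 'ou',
    'ক': 'k', 'খ': 'kh', 'গ': 'g', 'ঘ': 'gh', 'ঙ': 'ng', 'চ': 'ch', 'ছ': 'chh', 'জ': 'j', 'ঝ': 'jh', 'ঞ': 'n',
    'ট': 't', 'ঠ': 'th', 'ড': 'd', 'ঢ': 'dh', 'ণ': 'n', 'ত': 't', 'থ': 'th', 'দ': 'd', 'ধ': 'dh', 'ন': 'n',
    'প': 'p', 'ফ': 'ph', 'ব': 'b', 'ভ': 'bh', 'ম': 'm', 'য': 'y', 'র': 'r', 'ল': 'l', 'শ': 'sh', 'ষ': 'sh', 'স': 's', 'হ': 'h',
    '্': '', 'া': 'a', 'ি': 'i', 'ী': 'i', 'ু': 'u', 'ূ': 'u', 'ে': 'e', 'ৈ': 'oi', 'ো': 'o', 'ৌ': 'ou', 'ঁ': 'n', 'ঃ': ''
}


def extract_tables_from_pdf_all_pages(path: str) -> List[Dict[str, Any]]:
    """Extract every table on every page via pdfplumber (mirrors the legacy version)."""
    tables: List[Dict[str, Any]] = []
    try:
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                for tix, t in enumerate(p.extract_tables() or []):
                    rows = [[(c or "").strip() for c in row] for row in t]
                    rows = [row for row in rows if any(cell for cell in row)]
                    if rows:
                        tables.append({"page": p.page_number, "table_index": tix, "rows": rows})
    except Exception as e:
        logger.exception("table extraction failed: %s", e)
    return tables


def repair_bengali_spacing(s: str) -> str:
    """Sketch: collapse the stray whitespace pdfplumber tends to insert inside Bengali words."""
    return re.sub(r"\s+", " ", s or "").strip()


def remove_noise_tokens(s: str) -> str:
    """Sketch: drop obvious header/boilerplate tokens from a cell."""
    out = s or ""
    for tok in ("Click Here", "Live Data", "Banglarbhumi", "Remarks", "Nil"):
        out = out.replace(tok, " ")
    return re.sub(r"\s+", " ", out).strip()


def guess_khatian_or_daag_from_cell(cell: str) -> str:
    """Sketch: return the first plot/khatian-like number in a cell, Bengali digits normalised."""
    ascii_cell = (cell or "").translate(NUM_MAP)
    m = re.search(r"[0-9]{1,6}\s*/\s*[0-9]{0,6}|[0-9]{1,6}", ascii_cell)
    return m.group(0).replace(" ", "") if m else ""


def transliterate_name_pretty(name_bn: str) -> str:
    """Sketch: rule-based Bengali -> Latin transliteration of a name, title-cased."""
    out = "".join(BENG_TO_LAT_MAP.get(ch, ch if ch.isascii() else " ") for ch in (name_bn or ""))
    return re.sub(r"\s+", " ", out).strip().title()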


def extract_tables_bn(path: str) -> List[Dict[str, Any]]:
    """
    Extract Daag table (Table-1) and Khatian/Name table (Table-2),
    remove only header rows, keep all data rows.
    Returns a flat list of rows with table_type in {'daag','khatian','other'}.
    """
    tables = extract_tables_from_pdf_all_pages(path)
    flat = []

    for t in tables:
        rows = t.get("rows", [])
        page = t.get("page")
        t_idx = t.get("table_index")

        # Determine table type by header keywords (inspect first row)
        header_row = rows[0] if rows else []
        header_text = " ".join([str(c) for c in header_row])
        is_daag_table = "দাগ" in header_text and "রায়েতর" not in header_text
        is_khatian_table = "খত" in header_text or "রায়েতর" in header_text

        for ridx, row in enumerate(rows):
            # Skip only the first (header) row of the table
            if ridx == 0:
                continue

            row_text = " ".join([str(c) for c in row if c])

            # Clean cells
            cleaned_cells = []
            for c in row:
                cs = str(c).strip()
                cs = repair_bengali_spacing(cs)
                cs = remove_noise_tokens(cs)
                cleaned_cells.append(cs)

            daag = ""
            khatian = ""
            name_bn = ""
            name_en = ""

            # --------------------
            # TABLE-1: DAAG TABLE
            # --------------------
            if is_daag_table:
                # Expect first column is Daag number
                if cleaned_cells:
                    daag = guess_khatian_or_daag_from_cell(cleaned_cells[0])

            # --------------------
            # TABLE-2: KHATIAN + NAME TABLE
            # --------------------
            if is_khatian_table:
                for cell in cleaned_cells:
                    # extract khatian
                    kh = guess_khatian_or_daag_from_cell(cell)
                    if kh:
                        # prefer slash-containing patterns (e.g. ১২৩/৪) as khatian
                        if '/' in cell:
                            khatian = kh
                        elif not daag:
                            # otherwise keep the bare number as a daag candidate
                            daag = kh
                    # name detection (Bengali letters)
                    if not name_bn and re.search(r'[ঀ-৿]', cell):
                        name_bn = cell

                name_bn = name_bn.strip()
                name_en = transliterate_name_pretty(name_bn) if name_bn else ""

            flat.append({
                "page": page,
                "table_index": t_idx,
                "row_index": ridx,
                "table_type": "daag" if is_daag_table else "khatian" if is_khatian_table else "other",
                "cells": cleaned_cells,
                "row_text": row_text,
                "daag": daag,
                "khatian": khatian,
                "name_bn": name_bn,
                "name_en": name_en
            })

    return flat
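
# Note: extract_tables_bn() is not wired into the /extract endpoint yet; it can be
# exercised directly on a local file, e.g. (path is illustrative):
#
#     rows = extract_tables_bn("/path/to/record_of_rights.pdf")
#     daag_rows = [r for r in rows if r["table_type"] == "daag"]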


def parse_land_details_from_text(text: str) -> Dict[str, Any]:
    """Simple heuristic parser tuned for Bengali land records.
    Detects Daag, Khatian, and Rayetor Nam (রায়েতর নাম).
    """
    res = {}
    lines = [l.strip() for l in (text or "").splitlines() if l.strip()]
    for ln in lines[:300]:
        low = ln.lower()

        # ---- DAAG ----
        if "দাগ" in low:
            res.setdefault("daag", ln)

        # ---- KHATIAN ----
        if "খতিয়ান" in low or "খত" in low:
            res.setdefault("khatian", ln)

        # ---- RAYETOR NAM (OWNER / FARMER NAME) ----
        if "রায়েতর নাম" in low:
            res.setdefault("owner", ln)

    return res


async def process_single_file(url: str, khatian_values: List[str]) -> FileResult:
    tmpdir = Path(tempfile.mkdtemp(prefix="landpdf_"))
    dest = tmpdir / "doc.pdf"
    try:
        try:
            async with aiohttp.ClientSession() as session:
                await fetch_file(session, url, dest)
        except Exception as e:
            return FileResult(url=url, ok=False, reason=f"download_failed: {e}")

        # Try fast text extraction first
        pages = extract_text_from_pdf(dest)
        page_idx = find_khatian_in_pages(pages, khatian_values)
        used_ocr = False
        if page_idx is None:
            # Fall back to OCR
            logger.info("khatian not found in text; trying OCR for %s", url)
            pages = ocr_pdf(dest)
            page_idx = find_khatian_in_pages(pages, khatian_values)
            used_ocr = True

        if page_idx is None:
            # Last resort: parse the combined full text
            combined = "\n".join(pages)
            parsed = parse_land_details_from_text(combined)
            return FileResult(
                url=url,
                ok=bool(parsed),
                reason=None if parsed else "khatian_not_found",
                extracted=parsed,
            )

        # A matching page was found; parse only that page
        parsed = parse_land_details_from_text(pages[page_idx])
        parsed["_meta"] = {"page_index": page_idx, "ocr_used": used_ocr}
        return FileResult(url=url, ok=True, extracted=parsed)
    finally:
        # Clean up the temporary download directory on every path
        try:
            for p in tmpdir.iterdir():
                p.unlink()
            tmpdir.rmdir()
        except Exception:
            pass

# ---- API Endpoints ----
@app.post("/extract", response_model=ExtractResponse)
async def extract(req: ExtractRequest):
    # If user explicitly marks status rejected, return a 400 (or choose 422/409 as you prefer)
    if req.status and req.status.lower() == "rejected":
        # non-200 response as requested
        raise HTTPException(status_code=400, detail={"error": "request_status_rejected"})

    # Process files concurrently but limit concurrency to avoid resource exhaustion
    semaphore = asyncio.Semaphore(4)

    async def sem_task(url):
        async with semaphore:
            return await process_single_file(url, req.khatian or [])

    tasks = [asyncio.create_task(sem_task(u)) for u in req.files]
    results = await asyncio.gather(*tasks)

    overall = "partial_success"
    if all(r.ok for r in results):
        overall = "success"
    elif all(not r.ok for r in results):
        overall = "failure"

    return ExtractResponse(overall_status=overall, results=results)

# For local debugging (the module string assumes this file is saved as fastapi_land_pdf_extractor.py)
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("fastapi_land_pdf_extractor:app", host="0.0.0.0", port=8900, log_level="info")
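
# Example request once the server is running (values are illustrative only):
#   curl -X POST http://localhost:8900/extract \
#     -H "Content-Type: application/json" \
#     -d '{"khatian": ["123/4"], "files": ["https://example.com/record-of-rights.pdf"]}'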

# #!/usr/bin/env python3
# """
# Merged FastAPI app: your original concurrent-download script + Bengali table-first PDF extraction.
# Save as: app_concurrent_downloads_merged.py
# Run:
#     uvicorn app_concurrent_downloads_merged:app --host 0.0.0.0 --port 8000
# """
# from fastapi import FastAPI, UploadFile, File, Form, HTTPException
# from fastapi.responses import JSONResponse
# from pydantic import BaseModel
# from typing import List, Optional, Dict, Any
# from pathlib import Path
# import shutil
# import re
# import json
# from difflib import SequenceMatcher
# import requests
# from urllib.parse import urlparse
# import time
# import concurrent.futures
# import logging

# # optional: rapidfuzz if available
# try:
#     from rapidfuzz import fuzz
#     HAVE_RAPIDFUZZ = True
# except Exception:
#     HAVE_RAPIDFUZZ = False

# # table/text extraction libs
# import pdfplumber
# try:
#     from pdf2image import convert_from_path
#     import pytesseract
#     HAVE_OCR = True
# except Exception:
#     HAVE_OCR = False

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger("land_doc_matcher")

# app = FastAPI(title="Land-Doc Table Matcher (Concurrent Downloads) - Merged")

# # ---------------- default FILES (edit if you want) ----------------
# DEFAULT_FILES = [
#     "/mnt/data/371(Plot No)_Suahanta Mondal.pdf",
#     "/mnt/data/348__South Gobindapur__SAINTHIA.pdf",
#     "/mnt/data/345__South Gobindapur__SAINTHIA.pdf",
#     "/mnt/data/1764746873181-519NetureeSAINTHIA.pdf"
# ]

# # ---------------- Matching config (tweak if needed) ----------------
# REQUIRE_KHATIAN_FOR_FARMER_MATCH = True
# FARMER_FUZZY_THRESHOLD = 85.0
# KHATIAN_CONFIDENT_MIN_SCORE = 60.0

# # ---------------- Download config ----------------
# DOWNLOAD_DIR = Path("./downloads"); DOWNLOAD_DIR.mkdir(exist_ok=True)
# UPLOAD_DIR = Path("./uploads"); UPLOAD_DIR.mkdir(exist_ok=True)
# DOWNLOAD_TIMEOUT = 20  # seconds per request
# DOWNLOAD_WORKERS = 8   # ThreadPool max workers (tweak for your environment)
# DOWNLOAD_RETRIES = 1   # simple retry count

# # ---------------- Pydantic models ----------------
# class MatchPayload(BaseModel):
#     daag: List[str]
#     khatian: List[str]
#     farmer: List[str]
#     files: Optional[List[str]] = None

# # ---------------- Helper functions for URL detection + download ----------------
# def is_url(s: str) -> bool:
#     try:
#         p = urlparse(s)
#         return p.scheme in ("http", "https")
#     except Exception:
#         return False


# def download_pdf_once(url: str, dest_dir: Path, timeout: int = DOWNLOAD_TIMEOUT) -> Optional[str]:
#     try:
#         r = requests.get(url, stream=True, timeout=timeout)
#         r.raise_for_status()
#         parsed = urlparse(url)
#         fname = Path(parsed.path).name
#         if not fname or not fname.lower().endswith(".pdf"):
#             fname = f"download_{int(time.time()*1000)}.pdf"
#         dest = dest_dir / fname
#         if dest.exists():
#             base = dest.stem
#             suf = 1
#             while (dest_dir / f"{base}_{suf}.pdf").exists():
#                 suf += 1
#             dest = dest_dir / f"{base}_{suf}.pdf"
#         with dest.open("wb") as f:
#             for chunk in r.iter_content(chunk_size=8192):
#                 if chunk:
#                     f.write(chunk)
#         return str(dest.resolve())
#     except Exception:
#         return None


# def download_with_retries(url: str, dest_dir: Path, retries: int = DOWNLOAD_RETRIES) -> Optional[str]:
#     attempt = 0
#     while attempt <= retries:
#         path = download_pdf_once(url, dest_dir)
#         if path:
#             return path
#         attempt += 1
#     return None


# def download_all_urls_concurrent(urls: List[str], dest_dir: Path, max_workers: int = DOWNLOAD_WORKERS) -> Dict[str, Optional[str]]:
#     results: Dict[str, Optional[str]] = {}
#     if not urls:
#         return results
#     seen = set()
#     uniq_urls = []
#     for u in urls:
#         if u not in seen:
#             seen.add(u)
#             uniq_urls.append(u)

#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_url = {executor.submit(download_with_retries, url, dest_dir): url for url in uniq_urls}
#         for fut in concurrent.futures.as_completed(future_to_url):
#             url = future_to_url[fut]
#             try:
#                 local = fut.result()
#             except Exception:
#                 local = None
#             results[url] = local
#     return results

# # ---------------- Matching implementation (original) ----------------
# PREFIXES = [
#     r'দখলদার মন্তব', r'দখলদার', r'মন্তব',
#     r'ব \s* া \s* ক্ত', r'ব া ক্ত', r'বাক্তি', r'বাক্ত', r'বক্ত', r'দখ'
# ]
# PREFIX_RE = re.compile(r'^(?:' + "|".join(PREFIXES) + r')\s*', flags=re.I)

# HEADER_NOISE_RE = re.compile(
#     r'(জ\.এল|জ\.এল নং|দাগর|দােগর|ম \s* াপ|ম াপ|Click Here|Live Data|Banglarbhumi|উল্লিখিত|খিত|Remarks|Nil|থানা|ব্লক|মৌজা)',
#     flags=re.I
# )

# SURNAME_HINTS = ["মন্ডল","মণ্ডল","হোসেন","ইসলাম","চৌধুরী","দাস","কুমার","রায়","রাজ","শর্মা"]
# PAIR_RE = re.compile(r'([\u0980-\u09FF\u0020\.\-]{2,160}?)\s+([০-৯0-9]{1,6}\s*/\s*[০-৯0-9]{0,6}|[০-৯0-9]{2,6})')

# # ---- Bengali helpers: digits + transliteration ----
# NUM_MAP = str.maketrans("০১২৩৪৫৬৭৮৯","0123456789")

# def beng_to_ascii(s: str) -> str:
#     if not s:
#         return ""
#     return (s or "").translate(NUM_MAP)

# BENG_TO_LAT_MAP = {
#     'অ':'o','আ':'a','ই':'i','ঈ':'i','উ':'u','এ':'e','ঐ':'oi','ও':'o','ঔ':'ou',
#     'ক':'k','খ':'kh','গ':'g','ঘ':'gh','ঙ':'ng','চ':'ch','ছ':'chh','জ':'j','ঝ':'jh','ঞ':'n',
#     'ট':'t','ঠ':'th','ড':'d','ঢ':'dh','ণ':'n','ত':'t','থ':'th','দ':'d','ধ':'dh','ন':'n',
#     'প':'p','ফ':'ph','ব':'b','ভ':'bh','ম':'m','য':'y','র':'r','ল':'l','শ':'sh','ষ':'sh','স':'s','হ':'h',
#     '্':'','া':'a','ি':'i','ী':'i','ু':'u','ূ':'u','ে':'e','ৈ':'oi','ো':'o','ৌ':'ou','ঁ':'n','ঃ':''
# }

# def transliterate_bengali_to_latin(s: str) -> str:
#     """Simple rule-based transliteration for Bengali -> Latin-ish ASCII."""
#     if not s:
#         return ""
#     out = []
#     for ch in s:
#         if ch in BENG_TO_LAT_MAP:
#             out.append(BENG_TO_LAT_MAP[ch])
#         else:
#             if re.match(r'[A-Za-z0-9 ]', ch):
#                 out.append(ch.lower())
#             else:
#                 out.append(' ')
#     txt = "".join(out)
#     txt = re.sub(r'\s+', ' ', txt).strip()
#     return txt

# def clean_ctrl(s: str) -> str:
#     if not s:
#         return ""
#     s = re.sub(r'[\x00-\x1F\x7F]+', ' ', s)
#     s = re.sub(r'\s+', ' ', s)
#     return s.strip()


# def transliterate_bengali_to_latin_if_needed(s: str) -> str:
#     if not s:
#         return ""
#     if re.search(r'[\u0980-\u09FF]', s):
#         return transliterate_bengali_to_latin(s)
#     return s.lower()

# def fuzzy_score(a: str, b: str) -> float:
#     if HAVE_RAPIDFUZZ:
#         try:
#             return float(fuzz.token_set_ratio(a,b))
#         except Exception:
#             return float(fuzz.ratio(a,b))
#     else:
#         return SequenceMatcher(None, a, b).ratio() * 100.0


# def normalize_candidate_name(raw: str) -> str:
#     s = (raw or "").strip()
#     s = PREFIX_RE.sub('', s)
#     s = HEADER_NOISE_RE.sub('', s)
#     s = re.sub(r'[^ \u0980-\u09FFA-Za-z]', ' ', s)
#     s = re.sub(r'\s{2,}', ' ', s).strip()
#     return s

# # ---------------- New: table-first extraction + Bengali-aware helpers ----------------

# def normalize_text(s: str) -> str:
#     if not s:
#         return ""
#     return re.sub(r"\s+", " ", s.strip()).lower()


# def extract_tables_from_pdf_all_pages(path: str) -> List[Dict[str, Any]]:
#     tables = []
#     try:
#         with pdfplumber.open(path) as pdf:
#             for p in pdf.pages:
#                 page_tables = p.extract_tables()
#                 if not page_tables:
#                     continue
#                 for tix, t in enumerate(page_tables):
#                     normed = [[(c or "").strip() for c in row] for row in t]
#                     normed = [row for row in normed if any(cell for cell in row)]
#                     if normed:
#                         tables.append({"page": p.page_number, "table_index": tix, "rows": normed})
#     except Exception as e:
#         logger.exception("table extraction failed: %s", e)
#     return tables


# def build_flat_list_from_tables(tables: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
#     flat = []
#     for t in tables:
#         rows = t.get('rows', [])
#         for ridx, r in enumerate(rows):
#             flat.append({
#                 'page': t.get('page'),
#                 'table_index': t.get('table_index'),
#                 'row_index': ridx,
#                 'cells': r,
#                 'row_text': ' '.join([str(c) for c in r if c])
#             })
#     return flat


# def classify_tables(flat_records: List[Dict[str,Any]]) -> Dict[str, List[Dict[str,Any]]]:
#     daag_re = re.compile(r"\b[0-9]{1,6}\b")
#     khatian_re = re.compile(r"\b[0-9]{1,6}\s*[/-]\s*[0-9]{0,6}\b")
#     buckets = {'daag': [], 'khatian': [], 'other': []}
#     for rec in flat_records:
#         text = normalize_text(rec.get('row_text',''))
#         # translate bengali digits in text to ascii for classification
#         text_digits = text.translate(NUM_MAP)
#         if khatian_re.search(text_digits):
#             buckets['khatian'].append(rec)
#         elif daag_re.search(text_digits):
#             buckets['daag'].append(rec)
#         else:
#             buckets['other'].append(rec)
#     return buckets


# def fuzzy_in(cell: str, candidates: List[str], cutoff: float = 0.8) -> bool:
#     """Return True if any candidate fuzzy-matches the cell text at or above cutoff."""
#     if not cell or not candidates:
#         return False
#     cell_n = normalize_text(cell)
#     for c in candidates:
#         c_n = normalize_text(c)
#         if not c_n:
#             continue
#         # exact substring check first
#         if c_n in cell_n:
#             return True
#         # fallback fuzzy ratio
#         ratio = SequenceMatcher(None, cell_n, c_n).ratio()
#         if ratio >= cutoff:
#             return True
#     return False


# def match_against_user_input(flat_records: List[Dict[str,Any]], khatian_values: List[str], daag_values: List[str], farmer_values: List[str]) -> Dict[str,Any]:
#     matches = {'daag_matches': [], 'khatian_matches': [], 'farmer_matches': []}
#     for rec in flat_records:
#         for cell in rec.get('cells', []):
#             if not cell:
#                 continue
#             cell_str = str(cell)
#             # normalize for Bengali digits comparison
#             cell_digits = cell_str.translate(NUM_MAP)
#             # exact checks
#             for d in daag_values or []:
#                 d_digits = d.translate(NUM_MAP)
#                 if d and re.search(r"\b" + re.escape(d_digits) + r"\b", cell_digits):
#                     matches['daag_matches'].append({**rec, 'matched_value': d, 'matched_on': 'daag_exact', 'cell': cell_str})
#             for k in khatian_values or []:
#                 k_digits = k.translate(NUM_MAP)
#                 if k and re.search(r"\b" + re.escape(k_digits) + r"\b", cell_digits):
#                     matches['khatian_matches'].append({**rec, 'matched_value': k, 'matched_on': 'khatian_exact', 'cell': cell_str})
#             for f in farmer_values or []:
#                 # compare transliterated lowercase versions to be robust
#                 f_norm = transliterate_bengali_to_latin_if_needed(f)
#                 cell_norm = transliterate_bengali_to_latin_if_needed(cell_str)
#                 if f and f_norm and f_norm in cell_norm:
#                     matches['farmer_matches'].append({**rec, 'matched_value': f, 'matched_on': 'farmer_exact', 'cell': cell_str})
#             # fuzzy checks
#             if fuzzy_in(cell_str, khatian_values, cutoff=0.9):
#                 for k in khatian_values:
#                     if fuzzy_in(cell_str, [k], cutoff=0.9):
#                         matches['khatian_matches'].append({**rec, 'matched_value': k, 'matched_on': 'khatian_fuzzy', 'cell': cell_str})
#             if fuzzy_in(cell_str, daag_values, cutoff=0.88):
#                 for d in daag_values:
#                     if fuzzy_in(cell_str, [d], cutoff=0.88):
#                         matches['daag_matches'].append({**rec, 'matched_value': d, 'matched_on': 'daag_fuzzy', 'cell': cell_str})
#             if fuzzy_in(cell_str, farmer_values, cutoff=0.75):
#                 for f in farmer_values:
#                     if fuzzy_in(cell_str, [f], cutoff=0.75):
#                         matches['farmer_matches'].append({**rec, 'matched_value': f, 'matched_on': 'farmer_fuzzy', 'cell': cell_str})
#     # dedupe
#     for k in list(matches.keys()):
#         seen = set()
#         uniq = []
#         for m in matches[k]:
#             key = (m.get('matched_value'), m.get('page'), m.get('table_index'), m.get('row_index'))
#             if key not in seen:
#                 seen.add(key)
#                 uniq.append(m)
#         matches[k] = uniq
#     return matches


# def ocr_pdf_to_pages(path: str) -> List[str]:
#     if not HAVE_OCR:
#         return []
#     texts = []
#     try:
#         images = convert_from_path(str(path), dpi=200)
#         for img in images:
#             try:
#                 # prefer ben+eng if tesseract has Bengali traineddata installed
#                 texts.append(pytesseract.image_to_string(img, lang='ben+eng', config='--psm 6'))
#             except Exception:
#                 texts.append(pytesseract.image_to_string(img, config='--psm 6'))
#     except Exception as e:
#         logger.exception("OCR failed: %s", e)
#     return texts


# def extract_text_from_pdf(path: str) -> List[str]:
#     pages = []
#     try:
#         with pdfplumber.open(path) as pdf:
#             for p in pdf.pages:
#                 pages.append(p.extract_text() or "")
#     except Exception as e:
#         logger.exception("pdfplumber failed: %s", e)
#     return pages


# def find_khatian_in_pages(pages: List[str], khatian_values: List[str]) -> Optional[int]:
#     """Return page index where any khatian string appears (handles Bengali digits & separators)."""
#     if not khatian_values:
#         return None
#     # Normalize target khatians to ascii digits (remove spaces)
#     targets = [re.sub(r"\s+", "", str(k).translate(NUM_MAP)) for k in khatian_values]
#     for i, text in enumerate(pages):
#         if not text:
#             continue
#         t = normalize_text(text)
#         t_digits = t.translate(NUM_MAP)
#         t_compact = re.sub(r"[^0-9/\-]", " ", t_digits)
#         for tgt in targets:
#             if not tgt:
#                 continue
#             if re.search(r"\b" + re.escape(tgt) + r"\b", t_compact):
#                 return i
#             if tgt in t_compact:
#                 return i
#     return None


# def extract_pairs_from_pdf(pdf_path: str) -> Dict[str, Any]:
#     """Table-first extractor. Returns raw_pairs (khatian/name) plus flat table info for debug."""
#     pdf_p = str(pdf_path)
#     try:
#         tables = extract_tables_from_pdf_all_pages(pdf_p)
#     except Exception as e:
#         logger.exception("table extract error: %s", e)
#         tables = []

#     flat = build_flat_list_from_tables(tables)
#     buckets = classify_tables(flat)

#     raw_pairs = []
#     # prefer khatian bucket to build pairs: try to extract khatian and name from same row
#     for rec in buckets.get('khatian', []):
#         row_text = rec.get('row_text','')
#         cells = rec.get('cells', [])
#         # 1) try explicit 'রায়েতর নাম' pattern in the row_text (handles রায়েতর/রায়েতর)
#         name = ''
#         kh = ''
#         m_name = re.search(r'র[াা]য়েতর নাম\s*[:\-–]?\s*(.+)', row_text)
#         if m_name:
#             name = m_name.group(1).strip()
#         # 2) find khatian pattern in concatenated row (support bengali digits)
#         m = re.search(r'([০-৯0-9]{1,6}\s*[/-]\s*[০-৯0-9]{1,6})', row_text)
#         if not m:
#             # try after digit translation
#             t = row_text.translate(NUM_MAP)
#             m = re.search(r'([0-9]{1,6}\s*[/-]\s*[0-9]{1,6})', t)
#         kh = beng_to_ascii(m.group(1)) if m else ''
#         # 3) if name not found yet, try to pick a name-like cell from the row (prefer Bengali letters)
#         if not name:
#             for c in cells:
#                 if c and re.search(r'[\u0980-\u09FF]', str(c)):
#                     # candidate name
#                     name = re.sub(r'[^\u0980-\u09FFA-Za-z\s\-\.]', ' ', str(c)).strip()
#                     break
#         # transliterate name as well
#         name_en = transliterate_bengali_to_latin(name) if name else ''
#         raw_pairs.append({'name': name, 'name_en': name_en, 'khatian': kh})

#     # if no khatian rows found, fallback to first-page pair regex (legacy behaviour)
#     if not raw_pairs:
#         try:
#             with pdfplumber.open(pdf_p) as pdf:
#                 page = pdf.pages[0]
#                 text = page.extract_text() or ''
#         except Exception:
#             return {"pdf": Path(pdf_p).name, "text_sample": "", "raw_pairs": [], "error": "pdf_open_failed", "flat": flat, "buckets": {k: len(v) for k,v in buckets.items()}}
#         text = clean_ctrl(text)
#         for m in PAIR_RE.finditer(text):
#             raw_name = m.group(1).strip()
#             raw_kh = m.group(2).strip()
#             name = re.sub(r'\s{2,}', ' ', raw_name).strip(" -,.")
#             kh = beng_to_ascii(raw_kh).replace(' ', '')
#             name_en = transliterate_bengali_to_latin(name) if name else ''
#             raw_pairs.append({"name": name, "name_en": name_en, "khatian": kh})
#         return {"pdf": Path(pdf_p).name, "text_sample": text, "raw_pairs": raw_pairs, "flat": flat, "buckets": {k: len(v) for k,v in buckets.items()}}

#     # also include daag-only rows (as separate items with blank name)
#     for rec in buckets.get('daag', []):
#         rt = rec.get('row_text','')
#         m = re.search(r'([০-৯0-9]{1,6})', rt)
#         if m:
#             raw_pairs.append({'name': '', 'name_en': '', 'khatian': beng_to_ascii(m.group(1))})

#     # build text sample from first page if available
#     text_sample = ''
#     try:
#         with pdfplumber.open(pdf_p) as pdf:
#             if pdf.pages:
#                 text_sample = (pdf.pages[0].extract_text() or '')
#     except Exception:
#         text_sample = ''

#     return {"pdf": Path(pdf_p).name, "text_sample": text_sample, "raw_pairs": raw_pairs, "flat": flat, "buckets": {k: len(v) for k,v in buckets.items()}}


# def filter_and_normalize_pairs(raw_pairs):
#     out = []
#     for p in raw_pairs:
#         name = p.get("name","")
#         kh = p.get("khatian","")
#         name_norm = normalize_candidate_name(name)
#         if not name_norm and not kh:
#             continue
#         if len(name_norm) < 2 and kh:
#             # keep kh only if no name but reasonable kh present
#             out.append({"name": name_norm, "khatian": kh})
#             continue
#         out.append({"name": name_norm, "khatian": kh})
#     return out

# # ---------------- rest of original matching code unchanged ----------------

# def match_user_input(pairs, target_khatian):
#     return any(p.get("khatian") == target_khatian for p in pairs)


# def match_any_user_khatian(pairs, target_khatian_list):
#     if not target_khatian_list:
#         return False
#     for kh in target_khatian_list:
#         if any(p.get("khatian") == kh for p in pairs):
#             return True
#     return False


# def match_any_user_daag(pairs, text_sample, target_daag_list):
#     if not target_daag_list:
#         return False
#     for d in target_daag_list:
#         if any(p.get("khatian") == d for p in pairs):
#             return True
#     for d in target_daag_list:
#         if re.search(r'\b' + re.escape(d) + r'\b', text_sample):
#             return True
#     return False


# def fuzzy_match_any_farmer(expected_farmers, candidates, khatian_present):
#     best_overall = {"expected": "", "candidate": "", "score": 0.0, "translit": ""}
#     if not expected_farmers or not candidates:
#         return {**best_overall, "pass": False}
#     for expected in expected_farmers:
#         expected_norm = expected.strip()
#         tnorm = re.sub(r'[^\w\s]', ' ', expected_norm).strip().lower()
#         t_tokens = [tok for tok in re.split(r'\s+', tnorm) if tok]
#         best_for_expected = {"candidate": "", "score": 0.0, "translit": ""}
#         for cand in candidates:
#             c = (cand or "").strip()
#             if not c:
#                 continue
#             cand_translit = transliterate_bengali_to_latin(c) if re.search(r'[\u0980-\u09FF]', c) else c.lower()
#             cand_translit = re.sub(r'\s+', ' ', cand_translit).strip()
#             c_tokens = [tok for tok in re.split(r'\s+', cand_translit) if tok]
#             full_score = fuzzy_score(tnorm, cand_translit)
#             token_best = 0.0
#             for tt in t_tokens:
#                 for ct in c_tokens:
#                     s = fuzzy_score(tt, ct)
#                     if s > token_best:
#                         token_best = s
#             surname_boost = 0.0
#             if c_tokens:
#                 last = c_tokens[-1]
#                 for tt in t_tokens:
#                     if last and (tt == last or fuzzy_score(tt, last) > 85):
#                         surname_boost += 35.0
#             hint_boost = 0.0
#             for s in SURNAME_HINTS:
#                 if s in c:
#                     hint_boost += 10.0
#             combined = (0.6 * full_score) + (0.35 * token_best) + surname_boost + hint_boost
#             combined += min(8, len(c_tokens))
#             if combined > best_for_expected["score"]:
#                 best_for_expected = {"candidate": c, "score": float(combined), "translit": cand_translit}
#         if best_for_expected["score"] > best_overall["score"]:
#             best_overall = {"expected": expected_norm,
#                             "candidate": best_for_expected["candidate"],
#                             "score": best_for_expected["score"],
#                             "translit": best_for_expected["translit"]}
#     effective_threshold = KHATIAN_CONFIDENT_MIN_SCORE if khatian_present else FARMER_FUZZY_THRESHOLD
#     best_overall["pass"] = best_overall["score"] >= effective_threshold
#     return best_overall


# def strict_all_matched(payload_list, matched_list):
#     if not payload_list:
#         return True
#     matched_set = set(matched_list or [])
#     return all(item in matched_set for item in payload_list)


# def process_all(files: List[str], payload: Dict):
#     out = {"results": [], "summary": {}}
#     payload_daag_list = payload.get("daag") or []
#     payload_khatian_list = payload.get("khatian") or []
#     payload_farmer_list = payload.get("farmer") or []

#     daag_matched_items = []
#     khatian_matched_items = []
#     farmer_matched_items = []

#     urls = [f for f in files if is_url(f)]
#     non_urls = [f for f in files if not is_url(f)]

#     url_to_local = download_all_urls_concurrent(urls, DOWNLOAD_DIR, max_workers=DOWNLOAD_WORKERS) if urls else {}

#     final_sources: List[Dict] = []
#     for f in files:
#         if is_url(f):
#             local = url_to_local.get(f)
#             final_sources.append({"orig": f, "local": local, "is_url": True})
#         else:
#             final_sources.append({"orig": f, "local": f, "is_url": False})

#     for src in final_sources:
#         orig = src["orig"]
#         local = src["local"]

#         if src["is_url"]:
#             if not local:
#                 out["results"].append({
#                     "pdf": Path(orig).name,
#                     "source": orig,
#                     "error": "download_failed",
#                     "path_or_url": orig
#                 })
#                 continue

#         if not Path(local).exists():
#             out["results"].append({
#                 "pdf": Path(orig).name,
#                 "source": local,
#                 "error": "file_not_found",
#                 "path_or_url": orig
#             })
#             continue

#         rec = extract_pairs_from_pdf(local)
#         if rec.get("error"):
#             out["results"].append({
#                 "pdf": rec.get("pdf"),
#                 "source": local,
#                 "error": rec.get("error")
#             })
#             continue

#         raw_pairs = rec.get("raw_pairs", [])
#         pairs = filter_and_normalize_pairs(raw_pairs)
#         text_sample = rec.get("text_sample","")

#         daag_present = match_any_user_daag(pairs, text_sample, payload_daag_list)
#         khatian_present = match_any_user_khatian(pairs, payload_khatian_list)
#         candidates = [p["name"] for p in pairs if p.get("name")]
#         fam_best = fuzzy_match_any_farmer(payload_farmer_list, candidates, khatian_present)

#         if daag_present:
#             for d in payload_daag_list:
#                 if d not in daag_matched_items:
#                     if any(p.get("khatian") == d for p in pairs) or re.search(r'\b'+re.escape(d)+r'\b', text_sample):
#                         daag_matched_items.append(d)

#         if khatian_present:
#             for k in payload_khatian_list:
#                 if k not in khatian_matched_items and any(p.get("khatian")==k for p in pairs):
#                     khatian_matched_items.append(k)

#         if fam_best.get("pass"):
#             em = fam_best.get("expected")
#             if em and em not in farmer_matched_items:
#                 farmer_matched_items.append(em)

#         # include per-pdf diagnostic in results
#         out["results"].append({
#             "pdf": rec.get("pdf"),
#             "source": local,
#             "pairs": pairs,
#             "flat_count": len(rec.get("flat", [])),
#             "buckets": rec.get("buckets", {}),
#             "matches": {
#                 "daag_present": daag_present,
#                 "khatian_present": khatian_present,
#                 "farmer_best": fam_best
#             }
#         })

#     strict_daag_ok = strict_all_matched(payload_daag_list, daag_matched_items)
#     strict_khatian_ok = strict_all_matched(payload_khatian_list, khatian_matched_items)
#     strict_farmer_ok = strict_all_matched(payload_farmer_list, farmer_matched_items)

#     status = "ACCEPTED" if (strict_daag_ok and strict_khatian_ok and strict_farmer_ok) else "REJECTED"

#     missing_daag = [d for d in payload_daag_list if d not in daag_matched_items]
#     missing_khatian = [k for k in payload_khatian_list if k not in khatian_matched_items]
#     missing_farmer = [f for f in payload_farmer_list if f not in farmer_matched_items]

#     overall = {
#         "daag_matched_items": daag_matched_items,
#         "khatian_matched_items": khatian_matched_items,
#         "farmer_matched_items": farmer_matched_items
#     }

#     out["summary"] = {
#         "files_checked": len(files),
#         "payload": payload,
#         "overall_match": overall,
#         "missing": {
#             "daag": missing_daag,
#             "khatian": missing_khatian,
#             "farmer": missing_farmer
#         },
#         "strict_checks": {
#             "strict_daag_ok": strict_daag_ok,
#             "strict_khatian_ok": strict_khatian_ok,
#             "strict_farmer_ok": strict_farmer_ok
#         },
#         "status": status
#     }
#     return out

# # ---------------- FastAPI endpoints ----------------
# @app.get("/health")
# def health():
#     return {"status": "ok"}

# @app.post("/match")
# async def match_endpoint(
#     payload_json: Optional[str] = Form(None),
#     upload_files: Optional[List[UploadFile]] = File(None)
# ):
#     """
#     Multipart form entrypoint:
#     - payload_json: JSON string
#     - upload_files: optional list of files
#     """
#     if payload_json:
#         try:
#             payload_obj = json.loads(payload_json)
#         except Exception as e:
#             raise HTTPException(status_code=400, detail=f"invalid payload_json: {e}")
#     else:
#         raise HTTPException(status_code=400, detail="Please POST JSON to /match_json or send payload_json form field.")

#     files_to_process = []
#     if upload_files:
#         for up in upload_files:
#             dest = UPLOAD_DIR / up.filename
#             with dest.open("wb") as f:
#                 shutil.copyfileobj(up.file, f)
#             files_to_process.append(str(dest.resolve()))

#     files_from_payload = payload_obj.get("files") or []
#     for p in files_from_payload:
#         files_to_process.append(p)

#     if not files_to_process:
#         files_to_process = DEFAULT_FILES.copy()

#     try:
#         pay = MatchPayload(
#             daag=payload_obj.get("daag", []),
#             khatian=payload_obj.get("khatian", []),
#             farmer=payload_obj.get("farmer", []),
#             files=[]
#         )
#     except Exception as e:
#         raise HTTPException(status_code=400, detail=f"payload validation error: {e}")

#     payload_for_proc = {
#         "daag": pay.daag,
#         "khatian": pay.khatian,
#         "farmer": pay.farmer
#     }

#     results = process_all(files_to_process, payload_for_proc)

#     http_status = 200 if results.get("summary", {}).get("status") == "ACCEPTED" else 422
#     return JSONResponse(content=results, status_code=http_status)

# @app.post("/quick_check")
# async def match_json(payload: MatchPayload):
#     files_to_process = []
#     if payload.files:
#         for p in payload.files:
#             files_to_process.append(p)
#     if not files_to_process:
#         files_to_process = DEFAULT_FILES.copy()

#     payload_for_proc = {
#         "daag": payload.daag,
#         "khatian": payload.khatian,
#         "farmer": payload.farmer
#     }

#     results = process_all(files_to_process, payload_for_proc)

#     http_status = 200 if results.get("summary", {}).get("status") == "ACCEPTED" else 422
#     return JSONResponse(content=results, status_code=http_status)
















