# import pdfplumber
# import re
# import json
# import os
# import logging

# # ---------------- CONFIG ---------------- #


# request_payload = {
#     "JL No.": ["63", "124","190"],
#     "daag No": ["371", "261","519"]
# }

# # --------------------------------------- #



# # 🔕 Suppress pdfminer font warnings
# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"   # folder containing PDFs


# JL_SET = set(request_payload["JL No."])
# DAAG_SET = set(request_payload["daag No"])

# results = []

# def scan_pdf(pdf_path):
#     output = {
#         "pdf": os.path.basename(pdf_path),
#         "matches": []
#     }

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):
#             words = page.extract_words(use_text_flow=True) or []
#             page_text = " ".join(w["text"] for w in words)

#             found_jl = []
#             found_daag = []

#             # JL No detection
#             for jl in JL_SET:
#                 if re.search(rf"\b{jl}\b", page_text):
#                     found_jl.append(jl)

#             # Daag No detection
#             for daag in DAAG_SET:
#                 if re.search(rf"\b{daag}\b", page_text):
#                     found_daag.append(daag)

#             if found_jl or found_daag:
#                 output["matches"].append({
#                     "page": page_no,
#                     "JL_No": found_jl,
#                     "Daag_No": found_daag
#                 })

#     if output["matches"]:
#         return output
#     return None


# for file in os.listdir(PDF_FOLDER):
#     if file.lower().endswith(".pdf"):
#         result = scan_pdf(os.path.join(PDF_FOLDER, file))
#         if result:
#             results.append(result)

# print(json.dumps(results, indent=2, ensure_ascii=False))

# import pdfplumber
# import pytesseract
# import re
# import os
# import json
# import logging
# from pdf2image import convert_from_path

# # ---------------- CONFIG ---------------- #

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"


# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# # -------------------------------------- #

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()

# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# for file in os.listdir(PDF_FOLDER):
#     if not file.lower().endswith(".pdf"):
#         continue

#     pdf_path = os.path.join(PDF_FOLDER, file)

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):

#             text = extract_text_pdf(page)

#             if not text or len(text) < TEXT_MIN_LEN:
#                 text = extract_text_ocr(pdf_path, page_no)

#             search(text)

# # ---------------- FINAL OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)

# import pdfplumber
# import pytesseract
# import re
# import os
# import requests
# import tempfile
# import logging
# from pdf2image import convert_from_path

# # ---------------- INPUT ---------------- #

# pdf_urls = [
#     "https://example.com/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
#     "https://example.com/371(Plot No)_Suahanta Mondal.pdf"
# ]

# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# # ------------------------------------- #

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()


# def download_pdf(url, dest_folder):
#     local_path = os.path.join(dest_folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(local_path, "wb") as f:
#         f.write(r.content)
#     return local_path


# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# # ---------------- MAIN ---------------- #

# with tempfile.TemporaryDirectory() as tmpdir:

#     for url in pdf_urls:
#         try:
#             pdf_path = download_pdf(url, tmpdir)

#             with pdfplumber.open(pdf_path) as pdf:
#                 for page_no, page in enumerate(pdf.pages, start=1):

#                     text = extract_text_pdf(page)

#                     if not text or len(text) < TEXT_MIN_LEN:
#                         text = extract_text_ocr(pdf_path, page_no)

#                     search(text)

#         except Exception as e:
#             print(f"Skipped {url}: {e}")

# # ---------------- OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)


# from fastapi import FastAPI, HTTPException,Request
# from pydantic import BaseModel
# import pytesseract
# from pytesseract import Output
# import re
# import requests
# import tempfile
# import pdfplumber
# import os
# from pdf2image import convert_from_path
# from PIL import Image
# from fastapi.responses import JSONResponse



# OCR_LANG = "ben+eng"
# DPI = 400

# app = FastAPI(title="Land PDF Search API")


# class SearchRequest(BaseModel):
#     pdf_urls: list[str]
#     JL_No: list[str]
#     Daag_No: list[str]


# BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

# def normalize_text(text: str) -> str:
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"\s+", " ", text)
#     return text


# def extract_jl_numbers(text: str) -> set[str]:
#     return set(re.findall(r"\b\d{2,4}\b", text))


# # def extract_daag_numbers(text: str) -> set[str]:
# #     text = re.sub(r"দা\s*গ", "দাগ", text)
# #     matches = re.findall(r"দাগ(?:\s*নং)?\s*(\d{2,4})", text)
# #     return set(matches)


# def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
#     results = set()

#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             text = page.extract_text() or ""

#             text = text.translate(BN_TO_EN)
#             text = re.sub(r"\s+", " ", text)

#             # Match rows like: 261 সৈ য়ম 2.17 Click Here
#             matches = re.findall(r"\b(\d{2,4})\s+[^0-9]+?\s+\d+\.\d+\s+Click", text)

#             for m in matches:
#                 results.add(m)

#     return results

# def extract_daag_numbers(text: str) -> set[str]:
#     """
#     Extract Daag numbers from Daag ROWS like:
#     '261 সৈ য়ম 2.17 Click Here'
#     """

#     results = set()

#     # Normalize digits & Bengali spacing
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"দা\s*গ", "দাগ", text)

#     # Split into logical lines
#     lines = text.splitlines()

#     for line in lines:
#         line = line.strip()

#         # Heuristic: Daag rows always contain "Click"
#         if "Click" in line:
#             match = re.search(r"\b\d{2,4}\b", line)
#             if match:
#                 results.add(match.group(0))

#     return results



# def download_pdf(url: str, folder: str) -> str:
#     path = os.path.join(folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(path, "wb") as f:
#         f.write(r.content)
#     return path


# @app.post("/extract")
# def extract_land_data(payload: SearchRequest):

#     req_jl = set(payload.JL_No)
#     req_daag = set(payload.Daag_No)

#     found_jl = set()
#     found_daag = set()

#     with tempfile.TemporaryDirectory() as tmpdir:
#         for url in payload.pdf_urls:
#             try:
#                 pdf_path = download_pdf(url, tmpdir)
#                 images = convert_from_path(pdf_path, dpi=DPI)

#                 for img in images:
#                     text = pytesseract.image_to_string(
#                         img,
#                         lang=OCR_LANG,
#                         config="--psm 6"
#                     )

#                     text = normalize_text(text)
                      
#                     found_jl.update(req_jl & extract_jl_numbers(text))
#                     # 1️⃣ Try PDF text-layer
#                     daag_from_text = extract_daag_from_pdf_text(pdf_path)
#                     found_daag.update(req_daag & daag_from_text)

#                     # 2️⃣ OCR fallback only if needed
#                     if req_daag - found_daag:
#                         found_daag.update(req_daag & extract_daag_numbers(text))


#             except Exception:
#                 continue

#     response = {
#         "found": {
#             "JL_No": sorted(found_jl),
#             "Daag_No": sorted(found_daag)
#         },
#         "not_found": {
#             "JL_No": sorted(req_jl - found_jl),
#             "Daag_No": sorted(req_daag - found_daag)
#         }
#     }

#     if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
#         return response

#     raise HTTPException(status_code=422, detail=response)

# @app.post("/quick_check")
# async def quick_check(request: Request):
#     return JSONResponse(
#         status_code=200,
#         content={
#             "found": {
#                 "JL_No": ["63", "124"],
#                 "Daag_No": ["261", "371"]
#             },
#             "not_found": {
#                 "JL_No": ["999"],
#                 "Daag_No": ["888"]
#             }
#         }
#     )


# from fastapi import FastAPI, HTTPException, Request
# from pydantic import BaseModel
# import pytesseract
# import re
# import requests
# import tempfile
# import pdfplumber
# import os
# from pdf2image import convert_from_path
# from PIL import Image
# from fastapi.responses import JSONResponse

# OCR_LANG = "ben+eng"
# DPI = 400

# app = FastAPI(title="Land PDF Search API")

# class SearchRequest(BaseModel):
#     pdf_urls: list[str]
#     JL_No: list[str]
#     Daag_No: list[str]

# BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

# def normalize_text(text: str) -> str:
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"\s+", " ", text).strip()
#     return text

# def extract_jl_numbers(text: str) -> set[str]:
#     return set(re.findall(r"\b\d{2,4}\b", text))

# def clean_cell(cell: str | None) -> str:
#     if not cell:
#         return ""
#     return normalize_text(cell)

# def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
#     results = set()
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             # ────────────────────── Table extraction ──────────────────────
#             table_settings = {
#                 "vertical_strategy": "lines",
#                 "horizontal_strategy": "lines",
#                 "snap_tolerance": 3,
#                 "join_tolerance": 3,
#             }
#             tables = page.extract_tables(table_settings)
            
#             for table in tables:
#                 for row in table:
#                     if not row or len(row) < 3:
#                         continue
#                     daag_cell = clean_cell(row[0])
#                     if re.fullmatch(r"^\d{2,4}$", daag_cell):
#                         results.add(daag_cell)
            
#             # ────────────────────── Fallback: raw text regex (if no table found) ──────────────────────
#             text = page.extract_text() or ""
#             text = normalize_text(text)
#             matches = re.findall(
#                 r"\b(\d{2,4})\s+[^0-9]+?\s*\d+\.\d+(?:\s+(?:Click Here|Click|দাগের ম্যাপ)?)?",
#                 text
#             )
#             results.update(matches)
    
#     return results

# def extract_daag_numbers_ocr(text: str) -> set[str]:
#     results = set()
#     text = normalize_text(text)
#     text = re.sub(r"দা\s*গ", "দাগ", text)
#     lines = text.splitlines()
#     for line in lines:
#         line = line.strip()
#         # Flexible match: daag + anything + decimal
#         match = re.search(r"\b(\d{2,4})\b\s+[^0-9]+?\s*\d+\.\d+", line)
#         if match:
#             results.add(match.group(1))
#     return results

# def download_pdf(url: str, folder: str) -> str:
#     path = os.path.join(folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(path, "wb") as f:
#         f.write(r.content)
#     return path

# @app.post("/extract")
# def extract_land_data(payload: SearchRequest):
#     req_jl = set(payload.JL_No)
#     req_daag = set(payload.Daag_No)
#     found_jl = set()
#     found_daag = set()

#     with tempfile.TemporaryDirectory() as tmpdir:
#         for url in payload.pdf_urls:
#             try:
#                 pdf_path = download_pdf(url, tmpdir)
                
#                 # OCR for JL numbers (usually in header)
#                 images = convert_from_path(pdf_path, dpi=DPI)
#                 for img in images:
#                     ocr_text = pytesseract.image_to_string(
#                         img, lang=OCR_LANG, config="--psm 6"
#                     )
#                     ocr_text = normalize_text(ocr_text)
#                     found_jl.update(req_jl & extract_jl_numbers(ocr_text))
                
#                 # Primary: table-based Daag extraction from text layer
#                 daag_from_pdf = extract_daag_from_pdf_text(pdf_path)
#                 found_daag.update(req_daag & daag_from_pdf)
                
#                 # Fallback: OCR only if still missing some requested Daag
#                 if req_daag - found_daag:
#                     # Use last ocr_text (or re-run if needed)
#                     found_daag.update(req_daag & extract_daag_numbers_ocr(ocr_text))
                    
#             except Exception as e:
#                 # Optional: log error
#                 continue

#     response = {
#         "found": {
#             "JL_No": sorted(found_jl),
#             "Daag_No": sorted(found_daag)
#         },
#         "not_found": {
#             "JL_No": sorted(req_jl - found_jl),
#             "Daag_No": sorted(req_daag - found_daag)
#         }
#     }

#     if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
#         return response
    
#     raise HTTPException(status_code=422, detail=response)

# @app.post("/quick_check")
# async def quick_check(request: Request):
#     return JSONResponse(
#         status_code=200,
#         content={
#             "found": {
#                 "JL_No": ["63", "106", "125"],
#                 "Daag_No": ["182", "261", "946"]
#             },
#             "not_found": {
#                 "JL_No": [],
#                 "Daag_No": []
#             }
#         }
#     )





# from fastapi import FastAPI, HTTPException, Request
# from pydantic import BaseModel
# import pytesseract
# import re
# import requests
# import tempfile
# import pdfplumber
# import os
# from pdf2image import convert_from_path
# from PIL import Image
# from fastapi.responses import JSONResponse

# OCR_LANG = "ben+eng"
# DPI = 400

# app = FastAPI(title="Land PDF Search API")

# class SearchRequest(BaseModel):
#     pdf_urls: list[str]
#     JL_No: list[str]
#     Daag_No: list[str]

# BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

# def normalize_text(text: str) -> str:
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"\s+", " ", text).strip()
#     return text

# def extract_jl_numbers(text: str) -> set[str]:
#     return set(re.findall(r"\b\d{2,4}\b", text))

# def clean_cell(cell: str | None) -> str:
#     if not cell:
#         return ""
#     return normalize_text(cell)

# def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
#     results = set()
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             # ────────────────────── Table extraction ──────────────────────
#             table_settings = {
#                 "vertical_strategy": "lines",
#                 "horizontal_strategy": "lines",
#                 "snap_tolerance": 3,
#                 "join_tolerance": 3,
#             }
#             tables = page.extract_tables(table_settings)
            
#             for table in tables:
#                 for row in table:
#                     if not row or len(row) < 3:
#                         continue
#                     daag_cell = clean_cell(row[0])
#                     if re.fullmatch(r"^\d{2,4}$", daag_cell):
#                         results.add(daag_cell)
            
#             # ────────────────────── Fallback: raw text regex (if no table found) ──────────────────────
#             text = page.extract_text() or ""
#             text = normalize_text(text)
#             matches = re.findall(
#                 r"\b(\d{2,4})\s+[^0-9\s]+?\s*\d+\.\d+(?:\s*\d+\.\d+)?(?:\s+(?:Click Here|Click|দাগের ম্যাপ)?)?",
#                 text
#             )
#             results.update(matches)
    
#     return results

# def extract_daag_numbers_ocr(text: str) -> set[str]:
#     results = set()
#     text = normalize_text(text)
#     text = re.sub(r"দা\s*গ", "দাগ", text)
#     # Use findall on full text instead of lines for better handling of collapsed tables
#     matches = re.findall(
#         r"\b(\d{2,4})\b\s+[^0-9\s]+?\s*\d+\.\d+(?:\s*\d+\.\d+)?(?:\s+(?:Click Here|Click|দাগের ম্যাপ)?)?",
#         text
#     )
#     results.update(matches)
#     return results

# def download_pdf(url: str, folder: str) -> str:
#     path = os.path.join(folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(path, "wb") as f:
#         f.write(r.content)
#     return path

# @app.post("/extract")
# def extract_land_data(payload: SearchRequest):
#     req_jl = set(payload.JL_No)
#     req_daag = set(payload.Daag_No)
#     found_jl = set()
#     found_daag = set()

#     with tempfile.TemporaryDirectory() as tmpdir:
#         for url in payload.pdf_urls:
#             try:
#                 pdf_path = download_pdf(url, tmpdir)
                
#                 # OCR for JL numbers (usually in header)
#                 images = convert_from_path(pdf_path, dpi=DPI)
#                 ocr_text = ""
#                 for img in images:
#                     ocr_text += pytesseract.image_to_string(
#                         img, lang=OCR_LANG, config="--psm 6"
#                     ) + "\n"
#                 ocr_text = normalize_text(ocr_text)
#                 found_jl.update(req_jl & extract_jl_numbers(ocr_text))
                
#                 # Primary: table-based Daag extraction from text layer
#                 daag_from_pdf = extract_daag_from_pdf_text(pdf_path)
#                 found_daag.update(req_daag & daag_from_pdf)
                
#                 # Fallback: OCR only if still missing some requested Daag
#                 if req_daag - found_daag:
#                     found_daag.update(req_daag & extract_daag_numbers_ocr(ocr_text))
                    
#             except Exception as e:
#                 # Optional: log error
#                 continue

#     response = {
#         "found": {
#             "JL_No": sorted(found_jl),
#             "Daag_No": sorted(found_daag)
#         },
#         "not_found": {
#             "JL_No": sorted(req_jl - found_jl),
#             "Daag_No": sorted(req_daag - found_daag)
#         }
#     }

#     if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
#         return response
    
#     raise HTTPException(status_code=422, detail=response)

# @app.post("/quick_check")
# async def quick_check(request: Request):
#     return JSONResponse(
#         status_code=200,
#         content={
#             "found": {
#                 "JL_No": ["63", "106", "125"],
#                 "Daag_No": ["182", "261", "946"]
#             },
#             "not_found": {
#                 "JL_No": [],
#                 "Daag_No": []
#             }
#         }
#     )




import logging
import os
import re
import tempfile
from urllib.parse import urlparse

import pdfplumber
import pytesseract
import requests
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_path
from pydantic import BaseModel

# Module-wide logging: INFO so per-URL download/OCR failures show up in
# server logs (each failure is logged and collected, not raised).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Tesseract language packs: Bengali + English, matching the bilingual records.
OCR_LANG = "ben+eng"
# Rasterization resolution for pdf2image; higher DPI improves OCR accuracy
# at the cost of speed and memory.
DPI = 400

app = FastAPI(title="Land PDF Search API")


class SearchRequest(BaseModel):
    """Request body for /extract: PDFs to scan plus the numbers to look for."""

    # URLs of land-record PDFs to download and scan.
    pdf_urls: list[str]
    # Requested JL numbers, as strings of ASCII digits (e.g. "63").
    JL_No: list[str]
    # Requested Daag (plot) numbers, as strings of ASCII digits (e.g. "261").
    Daag_No: list[str]


BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


def normalize_text(text: str) -> str:
    """Return *text* with Bengali digits ASCII-fied and whitespace collapsed."""
    ascii_text = text.translate(BN_TO_EN)
    collapsed = re.sub(r"\s+", " ", ascii_text)
    return collapsed.strip()


def clean_cell(cell: str | None) -> str:
    """Normalize one table cell; ``None`` or empty input becomes ``""``."""
    return normalize_text(cell) if cell else ""


# ─────────────────────────────────────────────────────────────────────────────
# JL: anchor on the Bengali/English label to avoid false positives
# ─────────────────────────────────────────────────────────────────────────────
def extract_jl_numbers(text: str) -> set[str]:
    """Pull candidate JL numbers out of *text*.

    Prefers 2-4 digit numbers that directly follow an explicit
    Bengali ("জে.এল নং") or English ("J.L. No.") label.  Only when no
    labelled match exists does it fall back to every standalone 2-4 digit
    number — a deliberately loose net, since the caller intersects the
    result with the requested set anyway.
    """
    label_pattern = (
        r"(?:জে\.?\s*এল\.?\s*নং|J\.?\s*L\.?\s*No\.?)\s*[:\-]?\s*(\d{2,4})"
    )
    labelled = set(re.findall(label_pattern, text, re.IGNORECASE))
    if labelled:
        return labelled
    return set(re.findall(r"\b\d{2,4}\b", text))


# ─────────────────────────────────────────────────────────────────────────────
# Daag row shape: a standalone 2-4 digit number followed (after whitespace)
# by a character from the Bengali Unicode block — matches rows like
# "182 দলা ..." and "796 খাস জমি ...", capturing 182 / 796.
# re.UNICODE is the default in Python 3; kept explicit for clarity.
# ─────────────────────────────────────────────────────────────────────────────
DAAG_LINE_PATTERN = re.compile(r"\b(\d{2,4})\s+[\u0980-\u09FF]", re.UNICODE)


def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
    """Collect candidate Daag numbers from the PDF's embedded text layer.

    Two passes per page:
      1. line-detected tables — the Daag number sits in the first cell of
         each row;
      2. a regex sweep of the raw page text (for pages where table
         detection finds nothing).
    """
    # Table-detection settings are loop-invariant; build them once.
    table_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "snap_tolerance": 3,
        "join_tolerance": 3,
    }
    daag_numbers: set[str] = set()

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Pass 1: first column of each detected table row.
            for table in page.extract_tables(table_settings):
                for row in table:
                    if not row:
                        continue
                    first_cell = clean_cell(row[0])
                    if re.fullmatch(r"\d{2,4}", first_cell):
                        daag_numbers.add(first_cell)

            # Pass 2: raw text scan as a fallback.
            page_text = normalize_text(page.extract_text() or "")
            daag_numbers.update(DAAG_LINE_PATTERN.findall(page_text))

    return daag_numbers


def extract_daag_numbers_ocr(text: str) -> set[str]:
    """Extract Daag candidates from OCR output (fallback path).

    OCR sometimes splits the Bengali word "দাগ" across whitespace;
    rejoin it before scanning with the shared row pattern.
    """
    normalized = re.sub(r"দা\s*গ", "দাগ", normalize_text(text))
    return set(DAAG_LINE_PATTERN.findall(normalized))


def download_pdf(url: str, folder: str) -> str:
    """Download *url* into *folder* and return the local file path.

    The filename is derived from the URL's path component only, so query
    strings and fragments (e.g. signed-URL tokens) never leak into the
    filename; a URL whose path ends in "/" falls back to a generic name.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on timeout / connection failure.
    """
    # os.path.basename on the raw URL would keep "?token=..." in the
    # filename (and return "" for trailing-slash URLs); parse first.
    name = os.path.basename(urlparse(url).path) or "download.pdf"
    path = os.path.join(folder, name)
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(path, "wb") as f:
        f.write(r.content)
    return path


@app.post("/extract")
def extract_land_data(payload: SearchRequest):
    req_jl = set(payload.JL_No)
    req_daag = set(payload.Daag_No)
    found_jl = set()
    found_daag = set()
    errors = []

    with tempfile.TemporaryDirectory() as tmpdir:
        for url in payload.pdf_urls:

            # Download
            try:
                pdf_path = download_pdf(url, tmpdir)
            except Exception as e:
                msg = f"Failed to download {url}: {e}"
                logger.error(msg)
                errors.append(msg)
                continue

            # OCR for JL numbers
            ocr_text = ""
            try:
                images = convert_from_path(pdf_path, dpi=DPI)
                for img in images:
                    page_ocr = pytesseract.image_to_string(
                        img, lang=OCR_LANG, config="--psm 6"
                    )
                    ocr_text += " " + normalize_text(page_ocr)

                found_jl.update(req_jl & extract_jl_numbers(ocr_text))
            except Exception as e:
                msg = f"OCR failed for {url}: {e}"
                logger.error(msg)
                errors.append(msg)

            # Primary: text-layer Daag extraction
            try:
                daag_from_pdf = extract_daag_from_pdf_text(pdf_path)
                found_daag.update(req_daag & daag_from_pdf)
            except Exception as e:
                msg = f"pdfplumber extraction failed for {url}: {e}"
                logger.error(msg)
                errors.append(msg)

            # Fallback: OCR-based Daag extraction for anything still missing
            if (req_daag - found_daag) and ocr_text:
                found_daag.update(req_daag & extract_daag_numbers_ocr(ocr_text))

    response = {
        "found": {
            "JL_No": sorted(found_jl),
            "Daag_No": sorted(found_daag),
        },
        "not_found": {
            "JL_No": sorted(req_jl - found_jl),
            "Daag_No": sorted(req_daag - found_daag),
        },
        "errors": errors,
    }

    if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
        return response

    raise HTTPException(status_code=422, detail=response)


@app.post("/quick_check")
async def quick_check(request: Request):
    return JSONResponse(
        status_code=200,
        content={
            "found": {
                "JL_No": ["63", "106", "125"],
                "Daag_No": ["182", "261", "946"],
            },
            "not_found": {
                "JL_No": [],
                "Daag_No": [],
            },
        },
    )