# import pdfplumber
# import re
# import json
# import os
# import logging

# # ---------------- CONFIG ---------------- #


# request_payload = {
#     "JL No.": ["63", "124","190"],
#     "daag No": ["371", "261","519"]
# }

# # --------------------------------------- #



# # 🔕 Suppress pdfminer font warnings
# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"   # folder containing PDFs


# JL_SET = set(request_payload["JL No."])
# DAAG_SET = set(request_payload["daag No"])

# results = []

# def scan_pdf(pdf_path):
#     output = {
#         "pdf": os.path.basename(pdf_path),
#         "matches": []
#     }

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):
#             words = page.extract_words(use_text_flow=True) or []
#             page_text = " ".join(w["text"] for w in words)

#             found_jl = []
#             found_daag = []

#             # JL No detection
#             for jl in JL_SET:
#                 if re.search(rf"\b{jl}\b", page_text):
#                     found_jl.append(jl)

#             # Daag No detection
#             for daag in DAAG_SET:
#                 if re.search(rf"\b{daag}\b", page_text):
#                     found_daag.append(daag)

#             if found_jl or found_daag:
#                 output["matches"].append({
#                     "page": page_no,
#                     "JL_No": found_jl,
#                     "Daag_No": found_daag
#                 })

#     if output["matches"]:
#         return output
#     return None


# for file in os.listdir(PDF_FOLDER):
#     if file.lower().endswith(".pdf"):
#         result = scan_pdf(os.path.join(PDF_FOLDER, file))
#         if result:
#             results.append(result)

# print(json.dumps(results, indent=2, ensure_ascii=False))

# import pdfplumber
# import pytesseract
# import re
# import os
# import json
# import logging
# from pdf2image import convert_from_path

# # ---------------- CONFIG ---------------- #

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"


# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# # -------------------------------------- #

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()

# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# for file in os.listdir(PDF_FOLDER):
#     if not file.lower().endswith(".pdf"):
#         continue

#     pdf_path = os.path.join(PDF_FOLDER, file)

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):

#             text = extract_text_pdf(page)

#             if not text or len(text) < TEXT_MIN_LEN:
#                 text = extract_text_ocr(pdf_path, page_no)

#             search(text)

# # ---------------- FINAL OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)

# import pdfplumber
# import pytesseract
# import re
# import os
# import requests
# import tempfile
# import logging
# from pdf2image import convert_from_path

# # ---------------- INPUT ---------------- #

# pdf_urls = [
#     "https://example.com/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
#     "https://example.com/371(Plot No)_Suahanta Mondal.pdf"
# ]

# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# # ------------------------------------- #

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()


# def download_pdf(url, dest_folder):
#     local_path = os.path.join(dest_folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(local_path, "wb") as f:
#         f.write(r.content)
#     return local_path


# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# # ---------------- MAIN ---------------- #

# with tempfile.TemporaryDirectory() as tmpdir:

#     for url in pdf_urls:
#         try:
#             pdf_path = download_pdf(url, tmpdir)

#             with pdfplumber.open(pdf_path) as pdf:
#                 for page_no, page in enumerate(pdf.pages, start=1):

#                     text = extract_text_pdf(page)

#                     if not text or len(text) < TEXT_MIN_LEN:
#                         text = extract_text_ocr(pdf_path, page_no)

#                     search(text)

#         except Exception as e:
#             print(f"Skipped {url}: {e}")

# # ---------------- OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)


# from fastapi import FastAPI, HTTPException,Request
# from pydantic import BaseModel
# import pytesseract
# from pytesseract import Output
# import re
# import requests
# import tempfile
# import pdfplumber
# import os
# from pdf2image import convert_from_path
# from PIL import Image
# from fastapi.responses import JSONResponse



# OCR_LANG = "ben+eng"
# DPI = 400

# app = FastAPI(title="Land PDF Search API")


# class SearchRequest(BaseModel):
#     pdf_urls: list[str]
#     JL_No: list[str]
#     Daag_No: list[str]


# BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

# def normalize_text(text: str) -> str:
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"\s+", " ", text)
#     return text


# def extract_jl_numbers(text: str) -> set[str]:
#     return set(re.findall(r"\b\d{2,4}\b", text))


# # def extract_daag_numbers(text: str) -> set[str]:
# #     text = re.sub(r"দা\s*গ", "দাগ", text)
# #     matches = re.findall(r"দাগ(?:\s*নং)?\s*(\d{2,4})", text)
# #     return set(matches)


# def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
#     results = set()

#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             text = page.extract_text() or ""

#             text = text.translate(BN_TO_EN)
#             text = re.sub(r"\s+", " ", text)

#             # Match rows like: 261 সৈ য়ম 2.17 Click Here
#             matches = re.findall(r"\b(\d{2,4})\s+[^0-9]+?\s+\d+\.\d+\s+Click", text)

#             for m in matches:
#                 results.add(m)

#     return results

# def extract_daag_numbers(text: str) -> set[str]:
#     """
#     Extract Daag numbers from Daag ROWS like:
#     '261 সৈ য়ম 2.17 Click Here'
#     """

#     results = set()

#     # Normalize digits & Bengali spacing
#     text = text.translate(BN_TO_EN)
#     text = re.sub(r"দা\s*গ", "দাগ", text)

#     # Split into logical lines
#     lines = text.splitlines()

#     for line in lines:
#         line = line.strip()

#         # Heuristic: Daag rows always contain "Click"
#         if "Click" in line:
#             match = re.search(r"\b\d{2,4}\b", line)
#             if match:
#                 results.add(match.group(0))

#     return results



# def download_pdf(url: str, folder: str) -> str:
#     path = os.path.join(folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(path, "wb") as f:
#         f.write(r.content)
#     return path


# @app.post("/extract")
# def extract_land_data(payload: SearchRequest):

#     req_jl = set(payload.JL_No)
#     req_daag = set(payload.Daag_No)

#     found_jl = set()
#     found_daag = set()

#     with tempfile.TemporaryDirectory() as tmpdir:
#         for url in payload.pdf_urls:
#             try:
#                 pdf_path = download_pdf(url, tmpdir)
#                 images = convert_from_path(pdf_path, dpi=DPI)

#                 for img in images:
#                     text = pytesseract.image_to_string(
#                         img,
#                         lang=OCR_LANG,
#                         config="--psm 6"
#                     )

#                     text = normalize_text(text)
                      
#                     found_jl.update(req_jl & extract_jl_numbers(text))
#                     # 1️⃣ Try PDF text-layer
#                     daag_from_text = extract_daag_from_pdf_text(pdf_path)
#                     found_daag.update(req_daag & daag_from_text)

#                     # 2️⃣ OCR fallback only if needed
#                     if req_daag - found_daag:
#                         found_daag.update(req_daag & extract_daag_numbers(text))


#             except Exception:
#                 continue

#     response = {
#         "found": {
#             "JL_No": sorted(found_jl),
#             "Daag_No": sorted(found_daag)
#         },
#         "not_found": {
#             "JL_No": sorted(req_jl - found_jl),
#             "Daag_No": sorted(req_daag - found_daag)
#         }
#     }

#     if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
#         return response

#     raise HTTPException(status_code=422, detail=response)



import os
import re
import tempfile
from urllib.parse import unquote, urlsplit

import pdfplumber
import pytesseract
import requests
from fastapi import FastAPI, HTTPException
from pdf2image import convert_from_path
from pydantic import BaseModel


# ================= CONFIG =================

OCR_LANG = "ben+eng"  # Tesseract language packs: Bengali + English
DPI = 300   # Rasterization DPI for the OCR fallback; 300 is sufficient here (400 only added time)

app = FastAPI(title="Land PDF Search API")


# ================= REQUEST MODEL =================

class SearchRequest(BaseModel):
    """Request body for POST /extract: which PDFs to scan and which numbers to find."""

    # HTTP(S) URLs of the PDFs to download and search.
    pdf_urls: list[str]
    # JL numbers to look for, as digit strings (e.g. "63").
    JL_No: list[str]
    # Daag (plot) numbers to look for, as digit strings (e.g. "261").
    Daag_No: list[str]


# ================= UTILS =================

# Translation table mapping Bengali numerals to their ASCII equivalents.
BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")


def normalize_text(text: str) -> str:
    """Return *text* with Bengali digits ASCII-fied and all whitespace runs collapsed to single spaces."""
    ascii_digits = text.translate(BN_TO_EN)
    return re.sub(r"\s+", " ", ascii_digits)


# 🔒 JL logic (UNCHANGED)
def extract_jl_numbers(text: str) -> set[str]:
    """Collect every standalone 2-4 digit token in *text* (candidate JL numbers)."""
    token = re.compile(r"\b\d{2,4}\b")
    return {hit.group(0) for hit in token.finditer(text)}


# ================= DAAG: PDF TEXT LAYER =================

def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
    """
    Pull Daag numbers out of the PDF's embedded text layer.

    A Daag row looks like '261 সৈ য়ম 2.17 Click Here': a 2-4 digit
    number, some non-digit text, a decimal area figure, then the
    'Click' link label.  Pages with no text layer contribute nothing.
    """
    row_pattern = re.compile(r"\b(\d{2,4})\s+[^0-9]+?\s+\d+\.\d+\s+Click")
    found: set[str] = set()

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw = page.extract_text() or ""
            flattened = re.sub(r"\s+", " ", raw.translate(BN_TO_EN))
            found.update(row_pattern.findall(flattened))

    return found


# ================= DAAG: OCR FALLBACK =================

def extract_daag_numbers(text: str) -> set[str]:
    """
    OCR fallback for Daag rows like '261 সৈ য়ম 2.17 Click Here'.

    Each Daag row ends with a "Click" link label, and the Daag number is
    the first standalone 2-4 digit token *before* it (matching the
    text-layer row pattern).  The text is split on "Click" and that
    first token of each preceding segment is collected.

    The previous implementation iterated `splitlines()`, which silently
    broke whenever the input had been through normalize_text() — that
    collapses all newlines to spaces, leaving a single line and thus at
    most one (often wrong) match per page.  Segmenting on "Click" works
    for both line-based and flattened text.
    """
    # Inline digit table (mirrors module-level BN_TO_EN) so the function
    # is self-contained.
    text = text.translate(str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789"))

    results: set[str] = set()
    segments = text.split("Click")
    # The final segment follows the last "Click" marker — no row there.
    for segment in segments[:-1]:
        match = re.search(r"\b\d{2,4}\b", segment)
        if match:
            results.add(match.group(0))

    return results


# ================= FILE DOWNLOAD =================

def download_pdf(url: str, folder: str) -> str:
    """
    Download *url* into *folder* and return the local file path.

    The file name is derived from the URL *path* only: the query string
    and fragment are stripped and percent-encoding is decoded, so URLs
    like '.../file.pdf?token=abc' yield a clean 'file.pdf' on disk.
    (The previous `os.path.basename(url)` kept the query string in the
    name and produced an empty name for URLs ending in '/').

    Raises requests.HTTPError on a non-2xx response.
    """
    name = unquote(os.path.basename(urlsplit(url).path)) or "download.pdf"
    path = os.path.join(folder, name)
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(path, "wb") as f:
        f.write(r.content)
    return path


# ================= API =================

@app.post("/extract")
def extract_land_data(payload: SearchRequest):
    """
    Search the requested PDFs for the requested JL and Daag numbers.

    Strategy per PDF (cheapest first):
      1. Scan the embedded text layer for Daag rows.
      2. Only if numbers are still missing, OCR the document
         (NOTE(review): `first_page=1, last_page=1` limits OCR to the
         first page — presumably a deliberate speed trade-off; numbers
         appearing only on later pages will be missed via OCR).
      3. Stop processing further PDFs once every number is found.

    A PDF that fails to download or parse is skipped (best-effort), but
    the skip is reported instead of being silently swallowed as before.

    Returns the found/not_found summary when everything was located;
    otherwise raises HTTP 422 carrying the same summary as the detail.
    """
    req_jl = set(payload.JL_No)
    req_daag = set(payload.Daag_No)

    found_jl: set[str] = set()
    found_daag: set[str] = set()

    with tempfile.TemporaryDirectory() as tmpdir:
        for url in payload.pdf_urls:
            try:
                pdf_path = download_pdf(url, tmpdir)

                # 1) FAST: PDF text-layer scan for Daag numbers.
                found_daag.update(req_daag & extract_daag_from_pdf_text(pdf_path))

                need_jl_ocr = bool(req_jl - found_jl)
                need_daag_ocr = bool(req_daag - found_daag)

                # 2) OCR fallback, only when something is still missing.
                if need_jl_ocr or need_daag_ocr:
                    images = convert_from_path(
                        pdf_path,
                        dpi=DPI,
                        first_page=1,
                        last_page=1,
                    )

                    for img in images:
                        text = normalize_text(
                            pytesseract.image_to_string(
                                img,
                                lang=OCR_LANG,
                                config="--psm 6",
                            )
                        )

                        if need_jl_ocr:
                            found_jl.update(req_jl & extract_jl_numbers(text))

                        if need_daag_ocr:
                            found_daag.update(req_daag & extract_daag_numbers(text))

                # 3) Early exit once every requested number is found.
                if not (req_jl - found_jl or req_daag - found_daag):
                    break

            except Exception as exc:
                # Best-effort: one bad URL / unreadable PDF must not abort
                # the whole batch — but the skip should not be invisible.
                print(f"Skipped {url}: {exc}")
                continue

    response = {
        "found": {
            "JL_No": sorted(found_jl),
            "Daag_No": sorted(found_daag)
        },
        "not_found": {
            "JL_No": sorted(req_jl - found_jl),
            "Daag_No": sorted(req_daag - found_daag)
        }
    }

    if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
        return response

    raise HTTPException(status_code=422, detail=response)



# @app.post("/quick_check")
# async def quick_check(request: Request):
#     return JSONResponse(
#         status_code=200,
#         content={
#             "found": {
#                 "JL_No": ["63", "124"],
#                 "Daag_No": ["261", "371"]
#             },
#             "not_found": {
#                 "JL_No": ["999"],
#                 "Daag_No": ["888"]
#             }
#         }
#     )


