# import pdfplumber
# import pandas as pd
# import re

# def clean_bengali_name(text):
#     if not text: return ""
#     # Specifically target the noise found in your document
#     noise = ['ব্যক্তি', 'ব্যাক্তি', 'ব া ক্ত', ' Nil', 'Remarks', 'ক্ত', 'ক্তি']
#     for n in noise:
#         text = text.replace(n, '')
#     # Strip English and digits to keep only the Bengali name
#     text = re.sub(r'[0-9A-Za-z]', '', text)
#     return re.sub(r'\s+', ' ', text).strip()

# def extract_from_hardcoded_zones(pdf_path):
#     all_results = []
    
#     with pdfplumber.open(pdf_path) as pdf:
#         for page_num, page in enumerate(pdf.pages):
#             # SAFE HEIGHT: Dynamic variable to prevent Bounding Box errors
#             h = page.height 
            
#             # HARDCODED X-ZONES (Your Red Lines)
#             # Zone 1: Khatian (x=25 to x=85)
#             # Zone 2: Name (x=90 to x=285)
#             k_strip = page.crop((25, 0, 85, h))
#             n_strip = page.crop((90, 0, 285, h))
            
#             # Extract word objects with precise coordinates
#             k_words = k_strip.extract_words()
#             n_words = n_strip.extract_words()
            
#             # Map name fragments by vertical position
#             name_lines = {}
#             for nw in n_words:
#                 y_key = round(nw['top'] / 3) * 3  # 3px tolerance for line grouping
#                 name_lines.setdefault(y_key, []).append(nw['text'])

#             for kw in k_words:
#                 # Only grab pure digits (Khatian numbers)
#                 if re.fullmatch(r'\d+', kw['text']):
#                     k_val = kw['text']
#                     k_y = kw['top']
                    
#                     # Search for names within the vertical bounds of this Khatian
#                     row_names = []
#                     for ny in sorted(name_lines.keys()):
#                         # Match name lines within 15 pixels of the Khatian number
#                         if abs(ny - k_y) < 15:
#                             line_text = " ".join(name_lines[ny])
#                             # Split if multiple people are stacked in one row
#                             parts = re.split(r'ব্যক্তি|ব্যাক্তি|ব\s*া\s*ক\s*্ত|ক্ত', line_text)
#                             for p in parts:
#                                 cleaned = clean_bengali_name(p)
#                                 if len(cleaned) > 1:
#                                     row_names.append(cleaned)
                    
#                     for final_name in row_names:
#                         all_results.append({
#                             "Khatian": k_val,
#                             "Rayater_Name": final_name,
#                             "Page": page_num + 1
#                         })

#     return pd.DataFrame(all_results).drop_duplicates()

# # EXECUTION
# pdf_file = "/var/www/html/land-tabula/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf"
# try:
#     df = extract_from_hardcoded_zones(pdf_file)
#     df.to_csv("Khatian_Final_Hardcoded_Fixed.csv", index=False, encoding='utf-8-sig')
#     print(f"Success! Extracted {len(df)} entries.")
#     print(df.head(10))
# except Exception as e:
#     print(f"Error: {e}")

# import pdfplumber
# import re
# import json
# from indic_transliteration import sanscript
# from indic_transliteration.sanscript import transliterate

# def ml_transliterate(text):
#     if not text: return ""
#     # Standardize Bengali text (Fixes PDF ligature splits)
#     text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text)
#     # Remove Null bytes often found in PDF extractions
#     text = text.replace('\u0000', '')
    
#     # Transliterate: BENGALI -> ITRANS
#     english_phonetic = transliterate(text, sanscript.BENGALI, sanscript.ITRANS)
    
#     # Clean up phonetic markers to make it readable English
#     eng_clean = english_phonetic.lower().capitalize()
    
#     # Standard Land Record Spellings
#     replacements = {
#         "maNDala": "Mandal", "mandala": "Mandal", 
#         "phulamAlI": "Phulmali", "daphadara": "Dafadar",
#         "nandI": "Nandi", "dAsa": "Das"
#     }
#     for k, v in replacements.items():
#         eng_clean = eng_clean.replace(k, v)
        
#     return eng_clean

# def process_batch(pdf_files):
#     batch_results = []
    
#     # Strict Red-Zone Coordinates for Khatian and Name columns
#     K_ZONE = (20, 85)
#     N_ZONE = (82, 220)

#     for pdf_path in pdf_files:
#         with pdfplumber.open(pdf_path) as pdf:
#             for page in pdf.pages:
#                 h = page.height
#                 k_strip = page.crop((K_ZONE[0], 0, K_ZONE[1], h))
#                 n_strip = page.crop((N_ZONE[0], 0, N_ZONE[1], h))
                
#                 k_words = k_strip.extract_words()
#                 n_words = n_strip.extract_words()
                
#                 name_map = {}
#                 for nw in n_words:
#                     y_key = round(nw['top'] / 2) * 2
#                     name_map.setdefault(y_key, []).append(nw['text'])

#                 for kw in k_words:
#                     if re.fullmatch(r'\d+', kw['text']):
#                         k_val = kw['text']
                        
#                         for ny in name_map.keys():
#                             if abs(ny - kw['top']) < 12:
#                                 raw_bn = " ".join(name_map[ny])
                                
#                                 # --- IMPROVED CLEANING LAYER ---
#                                 # 1. Remove Nulls and English/Digits
#                                 bn_clean = raw_bn.replace('\u0000', '')
#                                 bn_clean = re.sub(r'[A-Za-z0-9]', '', bn_clean).strip()
                                
#                                 # 2. Filter Systemic Junk (Common PDF artifacts in land records)
#                                 junk_patterns = ['ব্যাক্তি', 'ব্যক্তি', 'ক্ত', 'িশ', 'ান', 'Nil']
#                                 for junk in junk_patterns:
#                                     bn_clean = bn_clean.replace(junk, '')
                                
#                                 bn_clean = bn_clean.strip()

#                                 # 3. ML Transliterate only if it's a valid name (not a fragment)
#                                 if len(bn_clean) > 3: 
#                                     en_name = ml_transliterate(bn_clean)
                                    
#                                     batch_results.append({
#                                         "khatian": int(k_val),
#                                         "owner_bn": bn_clean,
#                                         "owner_en": en_name,
#                                         "source_file": pdf_path.split('/')[-1]
#                                     })
#     return batch_results

# # Execution
# current_batch = ["/var/www/html/land-tabula/input_pdfs/1764746873181-519NetureeSAINTHIA.pdf"]
# extracted_data = process_batch(current_batch)
# print(json.dumps(extracted_data, indent=4, ensure_ascii=False))



# import pdfplumber
# import re
# import json
# from indic_transliteration import sanscript
# from indic_transliteration.sanscript import transliterate

# def ml_transliterate(text):
#     """
#     ML-based phonetic conversion. 
#     Converts Bengali script to English (Latin) based on phonetics.
#     """
#     if not text: return ""
    
#     # Standardize Bengali text (Fixes PDF ligature splits)
#     text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text)
    
#     # Transliterate: BENGALI -> ITRANS (Indian Language Transliteration Standard)
#     # This captures the sound of the name accurately.
#     english_phonetic = transliterate(text, sanscript.BENGALI, sanscript.ITRANS)
    
#     # Clean up phonetic markers to make it readable English
#     eng_clean = english_phonetic.lower().capitalize()
#     # Handle common phonetic overlaps in land records
#     replacements = {"maNDala": "Mandal", "mandal": "Mandal", "phulamAlI": "Phulmali"}
#     for k, v in replacements.items():
#         eng_clean = eng_clean.replace(k, v)
        
#     return eng_clean

# def process_batch(pdf_files):
#     """Process up to 5 PDFs and return an array of objects."""
#     batch_results = []
    
#     # Strict Red-Zone Coordinates
#     K_ZONE = (20, 85)
#     N_ZONE = (82, 220)

#     for pdf_path in pdf_files:
#         with pdfplumber.open(pdf_path) as pdf:
#             for page in pdf.pages:
#                 h = page.height
#                 k_strip = page.crop((K_ZONE[0], 0, K_ZONE[1], h))
#                 n_strip = page.crop((N_ZONE[0], 0, N_ZONE[1], h))
                
#                 # Extract words with coordinate mapping
#                 k_words = k_strip.extract_words()
#                 n_words = n_strip.extract_words()
                
#                 # Map names to Y-coordinates
#                 name_map = {}
#                 for nw in n_words:
#                     y_key = round(nw['top'] / 2) * 2
#                     name_map.setdefault(y_key, []).append(nw['text'])

#                 for kw in k_words:
#                     if re.fullmatch(r'\d+', kw['text']):
#                         k_val = kw['text']
                        
#                         # Find names on the same line (12px tolerance)
#                         for ny in name_map.keys():
#                             if abs(ny - kw['top']) < 12:
#                                 raw_bn = " ".join(name_map[ny])
                                
#                                 # 1. Clean Bengali
#                                 bn_clean = re.sub(r'[A-Za-z0-9]', '', raw_bn).strip()
                                
#                                 # 2. ML Transliterate to English
#                                 en_name = ml_transliterate(bn_clean)
                                
#                                 if len(en_name) > 2:
#                                     batch_results.append({
#                                         "khatian": int(k_val),
#                                         "owner_bn": bn_clean,
#                                         "owner_en": en_name,
#                                         "source_file": pdf_path.split('/')[-1]
#                                     })
#     return batch_results

# # Example Usage for 5 files
# current_batch = ["/var/www/html/land-tabula/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf"]    #, "file2.pdf", "file3.pdf", "file4.pdf", "file5.pdf"]
# extracted_data = process_batch(current_batch)

# # Now you can compare 'owner_en' with your list of values
# print(json.dumps(extracted_data, indent=4, ensure_ascii=False))













# import pdfplumber
# import pandas as pd
# import re

# def clean_bengali_accurate(text):
#     if not text: return ""
#     # 1. Remove markers and common OCR noise
#     noise = ['ব্যক্তি', 'ব্যাক্তি', 'ব া ক্ত', ' Nil', 'Remarks', 'ক্তি', 'ক্ত']
#     for n in noise:
#         text = text.replace(n, '')
    
#     # 2. Fix spacing issues (common in Bengali PDF extraction)
#     # Join vowel signs that got separated: 'ি জপদ' -> 'দ্বিজপদ'
#     text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text)
    
#     # 3. Clean stray English and extra whitespace
#     text = re.sub(r'[0-9A-Za-z]', '', text)
#     return re.sub(r'\s+', ' ', text).strip()

# def extract_exact_red_zones(pdf_path):
#     all_results = []
    
#     # PRECISE HARDCODED COORDINATES
#     # Khatian Zone: 20 to 82 (Left side)
#     # Name Zone: 82 to 225 (Starts exactly where Khatian ends; ends before Father's column)
#     K_ZONE = (20, 82)
#     N_ZONE = (82, 225) 

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_num, page in enumerate(pdf.pages):
#             h = page.height
            
#             # Crop the page into two vertical strips
#             k_strip = page.crop((K_ZONE[0], 0, K_ZONE[1], h))
#             n_strip = page.crop((N_ZONE[0], 0, N_ZONE[1], h))
            
#             # Extract words with a small x_tolerance to keep characters together
#             k_words = k_strip.extract_words(x_tolerance=2)
#             n_words = n_strip.extract_words(x_tolerance=2)

#             # Group words by their vertical position (Y-coordinate)
#             # We use a 3-pixel tolerance for line alignment
#             lines = {}
#             for nw in n_words:
#                 y_key = round(nw['top'] / 3) * 3
#                 lines.setdefault(y_key, []).append(nw['text'])

#             for kw in k_words:
#                 # Target the Khatian Number (digits)
#                 if re.fullmatch(r'\d+', kw['text']):
#                     k_val = kw['text']
#                     k_y = kw['top']
                    
#                     # Match names that appear on the same horizontal line as the Khatian
#                     row_names = []
#                     for ny in lines.keys():
#                         if abs(ny - k_y) < 12: # Vertical window for pairing
#                             full_text = " ".join(lines[ny])
#                             # Split by Individual marker to handle multiple owners in one row
#                             parts = re.split(r'ব্যক্তি|ব্যাক্তি|ব\s*া\s*ক\s*্ত|ক্ত', full_text)
#                             for p in parts:
#                                 cleaned = clean_bengali_accurate(p)
#                                 if len(cleaned) > 2 and "নাম" not in cleaned:
#                                     row_names.append(cleaned)
                    
#                     for name in row_names:
#                         all_results.append({
#                             "Khatian": k_val,
#                             "Rayater_Name": name,
#                             "Page": page_num + 1
#                         })

#     return pd.DataFrame(all_results).drop_duplicates()

# # EXECUTION
# pdf_file = "/var/www/html/land-tabula/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf"
# try:
#     df = extract_exact_red_zones(pdf_file)
#     df.to_csv("Khatian_Final_Accurate_Zones.csv", index=False, encoding='utf-8-sig')
#     print(f"Success! Captured {len(df)} entries from all pages.")
#     print(df.head(15))
# except Exception as e:
#     print(f"An error occurred: {e}")



# import pdfplumber
# import pandas as pd
# import re

# def clean_bengali_final(text):
#     if not text: return ""
#     # Remove markers and common "leaked" noise
#     noise = ['ব্যক্তি', 'ব্যাক্তি', 'ব া ক্ত', ' Nil', 'Remarks', 'ক্তি', 'ক্ত']
#     for n in noise:
#         text = text.replace(n, '')
#     # Join broken Bengali vowel signs (Ligature Stitching)
#     text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text)
#     # Remove English/Digits
#     text = re.sub(r'[0-9A-Za-z]', '', text)
#     return re.sub(r'\s+', ' ', text).strip()

# def extract_owners_only(pdf_path):
#     all_results = []
    
#     # EXTREMELY STRICT COORDINATES
#     # Khatian Column: 20 to 80
#     # Owner Name Column: 85 to 210 (Strict cut-off before Father's column)
#     K_ZONE = (20, 0, 80, 842)
#     N_ZONE = (85, 0, 210, 842) 

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_num, page in enumerate(pdf.pages):
#             h = page.height
            
#             # Crop exactly where the Red Borders are
#             k_strip = page.crop((K_ZONE[0], 0, K_ZONE[2], h))
#             n_strip = page.crop((N_ZONE[0], 0, N_ZONE[2], h))
            
#             # Extract words with x_tolerance to keep Bengali characters joined
#             k_words = k_strip.extract_words(x_tolerance=2)
#             n_words = n_strip.extract_words(x_tolerance=2)

#             # Group Names by their Vertical (Y) coordinate
#             name_map = {}
#             for nw in n_words:
#                 y_key = round(nw['top'] / 3) * 3
#                 name_map.setdefault(y_key, []).append(nw['text'])

#             for kw in k_words:
#                 # Ensure we only pick up the Khatian Number
#                 if re.fullmatch(r'\d+', kw['text']):
#                     k_val = kw['text']
#                     k_y = kw['top']
                    
#                     # Find owner names on the same line as the Khatian
#                     row_owner_names = []
#                     for ny in name_map.keys():
#                         if abs(ny - k_y) < 12: # Vertical pairing
#                             full_line = " ".join(name_map[ny])
#                             # Split by Individual marker to handle stacked owners
#                             parts = re.split(r'ব্যক্তি|ব্যাক্তি|ব\s*া\s*ক\s*্ত|ক্ত', full_line)
#                             for p in parts:
#                                 cleaned = clean_bengali_final(p)
#                                 if len(cleaned) > 2 and "নাম" not in cleaned:
#                                     row_owner_names.append(cleaned)
                    
#                     for owner in row_owner_names:
#                         all_results.append({
#                             "Khatian": k_val,
#                             "Rayater_Name": owner,
#                             "Page": page_num + 1
#                         })

#     return pd.DataFrame(all_results).drop_duplicates()

# # EXECUTION
# pdf_file = "/var/www/html/land-tabula/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf"
# df = extract_owners_only(pdf_file)
# df.to_csv("Khatian_Owners_Verified.csv", index=False, encoding='utf-8-sig')

# print("Extraction Finished. Preview of captured Owners:")
# print(df.head(15))


# Run the script
# pdf_input = "/var/www/html/land-tabula/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf"


# from fastapi import FastAPI, HTTPException
# from pydantic import BaseModel
# from typing import List
# import pdfplumber
# import re
# from indic_transliteration import sanscript
# from indic_transliteration.sanscript import transliterate
# from thefuzz import fuzz

# app = FastAPI()

# # --- Utility Functions (From previous steps) ---

# def ml_transliterate(text):
#     if not text: return ""
#     text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text).replace('\u0000', '')
#     english_phonetic = transliterate(text, sanscript.BENGALI, sanscript.ITRANS)
#     eng_clean = english_phonetic.lower().capitalize()
    
#     replacements = {"maNDala": "Mandal", "mandala": "Mandal", "phulamAlI": "Phulmali"}
#     for k, v in replacements.items():
#         eng_clean = eng_clean.replace(k, v)
#     return eng_clean

# def extract_data_from_pdf(file_path):
#     extracted = []
#     K_ZONE, N_ZONE = (20, 85), (82, 220) # Your specific red-zone coordinates
    
#     try:
#         with pdfplumber.open(file_path) as pdf:
#             for page in pdf.pages:
#                 k_strip = page.crop((K_ZONE[0], 0, K_ZONE[1], page.height))
#                 n_strip = page.crop((N_ZONE[0], 0, N_ZONE[1], page.height))
                
#                 name_map = {}
#                 for nw in n_strip.extract_words():
#                     y_key = round(nw['top'] / 2) * 2
#                     name_map.setdefault(y_key, []).append(nw['text'])

#                 for kw in k_strip.extract_words():
#                     # Handle numeric and slashed khatians (e.g., 470/1)
#                     if re.fullmatch(r'[\d/]+', kw['text']):
#                         for ny, texts in name_map.items():
#                             if abs(ny - kw['top']) < 12:
#                                 bn = re.sub(r'[A-Za-z0-9\u0000]', '', " ".join(texts)).strip()
#                                 if len(bn) > 3:
#                                     extracted.append({
#                                         "khatian": kw['text'],
#                                         "owner_en": ml_transliterate(bn)
#                                     })
#         return extracted
#     except Exception as e:
#         return []

# # --- API Models ---

# class MatchRequest(BaseModel):
#     khatian: List[str]
#     owner: List[str]
#     source_file: str

# # --- Endpoints ---

# @app.post("/match-land-records")
# async def match_records(request: MatchRequest):
#     # 1. Path to your PDF storage folder
#     file_path = f"/var/www/html/land-tabula/input_pdfs/{request.source_file}"
    
#     # 2. Extract current content from PDF
#     pdf_content = extract_data_from_pdf(file_path)
#     if not pdf_content:
#         raise HTTPException(status_code=404, detail="File not found or empty")

#     results = []
    
#     # 3. Compare Input vs PDF Content
#     for req_khatian, req_owner in zip(request.khatian, request.owner):
#         match_found = False
#         highest_score = 0
#         matched_name = ""

#         for record in pdf_content:
#             # Match Khatian exactly, then Fuzzy Match Name
#             if str(record['khatian']) == str(req_khatian):
#                 # Using token_set_ratio to handle name order differences
#                 score = fuzz.token_set_ratio(req_owner, record['owner_en'])
                
#                 # 40% tolerance means score must be >= 60
#                 if score >= 60:
#                     match_found = True
#                     if score > highest_score:
#                         highest_score = score
#                         matched_name = record['owner_en']

#         results.append({
#             "input_khatian": req_khatian,
#             "input_owner": req_owner,
#             "is_matched": match_found,
#             "confidence_score": highest_score,
#             "matched_with": matched_name
#         })

#     return {"status": "success", "matches": results}

# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8000)


from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import pdfplumber
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from thefuzz import fuzz

# FastAPI application instance; exposes the /match-land-records endpoint below.
app = FastAPI()

def ml_transliterate(text):
    """Phonetically transliterate Bengali text to readable English.

    Runs the BENGALI -> ITRANS scheme, then normalizes common land-record
    surnames to their conventional English spellings.

    Args:
        text: Bengali-script string, possibly with PDF extraction artifacts.

    Returns:
        Capitalized English rendering of the name, or "" for empty input.
    """
    if not text:
        return ""
    # Clean null bytes and re-attach dependent vowel signs / virama
    # (U+09BE-U+09CD) that PDF extraction detaches from their consonant.
    text = text.replace('\u0000', '')
    text = re.sub(r'\s+([\u09BE-\u09CD])', r'\1', text)

    # ML phonetic transliteration: BENGALI -> ITRANS.
    english_phonetic = transliterate(text, sanscript.BENGALI, sanscript.ITRANS)

    # Work on a fully lowercased copy so the spelling table can match.
    # BUG FIX: the previous version lowercased the text *before* the
    # replacement loop but kept mixed-case keys ("maNDala", "phulamAlI",
    # "nandI", "dAsa") that could therefore never match; and capitalize()
    # broke a "mandala" match at the start of the string.
    eng_clean = english_phonetic.lower()

    # Standardization for land records (keys must be lowercase to match).
    replacements = {
        "mandala": "Mandal",
        "phulamali": "Phulmali",
        "daphadara": "Dafadar",
        "nandi": "Nandi",
        "dasa": "Das",
        "shekha": "Sheikh",
    }
    for src, dst in replacements.items():
        eng_clean = eng_clean.replace(src, dst)

    # Uppercase only the first character so replacement spellings keep
    # their casing (str.capitalize would lowercase them again).
    return eng_clean[:1].upper() + eng_clean[1:]

def extract_data_from_pdf(file_path):
    """Pull (khatian, transliterated owner) pairs out of a land-record PDF.

    Crops every page into two fixed vertical strips (khatian column and
    owner-name column), buckets owner-column words by rounded Y position,
    and pairs each khatian number with owner text on roughly the same line.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        List of {"khatian": str, "owner_en": str} dicts; empty list on any
        extraction failure.
    """
    # Red-zone column x-coordinates (khatian column, owner-name column).
    khatian_x = (20, 85)
    name_x = (82, 220)
    records = []

    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                height = page.height
                khatian_col = page.crop((khatian_x[0], 0, khatian_x[1], height))
                name_col = page.crop((name_x[0], 0, name_x[1], height))

                # Bucket owner-column words by Y rounded to the nearest 2px
                # so fragments of one printed line share a key.
                rows = {}
                for word in name_col.extract_words():
                    bucket = round(word['top'] / 2) * 2
                    rows.setdefault(bucket, []).append(word['text'])

                for word in khatian_col.extract_words():
                    khatian = word['text'].strip()
                    # Accept plain or sub-divided khatians (e.g. 774, 470/1).
                    if not re.fullmatch(r'[\d/]+', khatian):
                        continue
                    for row_y, fragments in rows.items():
                        # 12px vertical window pairs a name line with its khatian.
                        if abs(row_y - word['top']) >= 12:
                            continue
                        # Strip stray Latin letters, digits and nulls from the
                        # Bengali text before transliteration.
                        bengali = re.sub(
                            r'[A-Za-z0-9\u0000]', '', " ".join(fragments)
                        ).strip()
                        if len(bengali) > 2:
                            records.append({
                                "khatian": khatian,
                                "owner_en": ml_transliterate(bengali),
                            })
        return records
    except Exception as exc:
        print(f"Extraction Error: {exc}")
        return []

class MatchRequest(BaseModel):
    """Request body for POST /match-land-records.

    khatian[i] is matched against owner[i] (index-wise pairing); source_file
    is the PDF filename expected to exist in the input_pdfs directory.
    """
    khatian: List[str]  # Khatian numbers to verify, e.g. ["774", "470/1"]
    owner: List[str]  # Owner names in English, paired index-wise with khatian
    source_file: str  # PDF filename resolved under input_pdfs/

@app.post("/match-land-records")
async def match_records(request: MatchRequest):
    """Match user-supplied (khatian, owner) pairs against a land-record PDF.

    For each input pair, finds PDF rows with the same (zero-stripped)
    khatian number and fuzzy-matches the owner name; a token_set_ratio
    score of 60 or higher (i.e. 40% tolerance) counts as a match.

    Raises:
        HTTPException 400: khatian/owner lists differ in length, or
            source_file contains path separators.
        HTTPException 404: PDF missing or nothing could be extracted.
    """
    # Robustness fix: zip() silently truncates to the shorter list, which
    # would drop inputs without any signal to the caller.
    if len(request.khatian) != len(request.owner):
        raise HTTPException(
            status_code=400,
            detail="khatian and owner lists must have the same length",
        )

    # Security fix: source_file comes from the client and is interpolated
    # into a filesystem path — reject traversal attempts like "../../etc".
    if ("/" in request.source_file or "\\" in request.source_file
            or ".." in request.source_file):
        raise HTTPException(status_code=400, detail="Invalid source_file name")

    # Ensure this path matches your server structure.
    file_path = f"/var/www/html/land-tabula/input_pdfs/{request.source_file}"

    # 1. Extract PDF content once per API call.
    pdf_content = extract_data_from_pdf(file_path)
    if not pdf_content:
        raise HTTPException(status_code=404, detail="File not found or extraction failed")

    results = []

    # 2. Compare each user pair with every extracted PDF record.
    for req_khat, req_own in zip(request.khatian, request.owner):
        best_match = {"is_matched": False, "score": 0, "matched_as": ""}

        # Normalize khatian so e.g. '0774' matches '774'.
        norm_req_khat = str(req_khat).lstrip('0')

        for record in pdf_content:
            norm_pdf_khat = str(record['khatian']).lstrip('0')
            if norm_pdf_khat != norm_req_khat:
                continue

            # 40% tolerance logic: fuzzy score must reach 60.
            score = fuzz.token_set_ratio(req_own, record['owner_en'])
            if score >= 60 and score > best_match["score"]:
                best_match = {
                    "is_matched": True,
                    "score": score,
                    "matched_as": record['owner_en'],
                }

        results.append({
            "input_khatian": req_khat,
            "input_owner": req_own,
            "is_matched": best_match["is_matched"],
            "confidence": best_match["score"],
            "matched_name_in_pdf": best_match["matched_as"],
        })

    return {"status": "success", "results": results}

if __name__ == "__main__":
    import uvicorn
    # Development entry point: serve on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)