# extractors/voter.py
import re
from extractors.base import ExtractResult
from utils.text import EPIC_RX, DOB_RX

# Label patterns for voter-card fields, in English and Bengali (plus a few
# transliteration / OCR variants). Each entry is a regex fragment; within this
# module NAME_LABELS and FATHER_LABELS are applied with re.search against the
# lower-cased line, so an entry matches anywhere inside the line.
# NOTE: r"name" / r"নাম" are deliberately broad and therefore also occur inside
# relation labels such as "father's name" / "পিতার নাম".
NAME_LABELS = [
    r"elector'?s name",
    r"electors name",
    r"elector name",
    r"name",
    r"নাম",
    r"নামঃ",  # was listed twice; the duplicate entry is dropped
    r"নামের",
    r"নাম:",
    r"নাম।",
]

# Labels introducing the relative (father or husband) printed on the card.
FATHER_LABELS = [
    r"father'?s name",
    r"husband'?s name",
    r"father name",
    r"পিতা|পিতার নাম",
    r"স্বামীর নাম",
    r"স্বামী",
]

# Labels introducing the date of birth.
DOB_LABELS = [
    r"dob",
    r"date of birth",
    r"birth",
    r"জন্ম",
    r"জন্ম তারিখ",
]

# Gender value tokens. The duplicated "পুরুষ" alternative is replaced by "নারী"
# so this list agrees with the tokens extract_voter searches for.
GENDER_LABELS = [r"male|female|transgender|মহিলা|পুরুষ|নারী"]

def _clean_token(s: str) -> str:
    """Tidy a raw OCR fragment.

    Falsy input (``None`` or ``""``) is returned unchanged. Otherwise the text is
    stripped, zero-width characters and underscore / U+FFFD runs are deleted,
    leading and trailing separators (space, tab, newline, ``: . - ,``) are
    trimmed, and runs of two or more whitespace characters become one space.
    """
    if not s:
        return s
    cleaned = s.strip()
    # First the zero-width space/joiners, then underscore and replacement-char runs.
    for junk_pattern in (r"[\u200b\u200c\u200d]", r"[_\uFFFD]+"):
        cleaned = re.sub(junk_pattern, "", cleaned)
    cleaned = cleaned.strip(" \t\n\r:.-,")
    # Only runs of 2+ whitespace characters are collapsed; a lone tab is kept.
    return re.sub(r"\s{2,}", " ", cleaned)

def _looks_like_name(s: str) -> bool:
    """Heuristic: does *s* look like a person's full name?

    Only ASCII letters, digits, spaces and characters from the Bengali block
    (U+0980-U+09FF) are considered. The text qualifies when at least two
    words remain and digits make up less than 40% of the remaining characters.
    """
    if not s:
        return False
    # Everything else (punctuation, tabs, other scripts) is dropped, not spaced out.
    kept = re.sub(r"[^A-Za-z\u0980-\u09FF0-9 ]", "", s)
    if len(kept.split()) < 2:
        return False
    digit_total = sum(1 for ch in kept if ch.isdigit())
    return digit_total < len(kept) * 0.4

def _join_split_label(lines):
    """Re-attach label tails that OCR pushed onto their own line.

    A line that consists only of an optional possessive fragment (up to three
    of ``'``, ``"``, ``’``, ``s``, ``S``), the word "name" (any case) or "নাম",
    and optional trailing spaces / ``:`` / ``-`` is treated as the tail of a
    label such as "Father" + "'s Name". It is appended to the line before it,
    separated by one space. The previous line's content is not inspected.

    Returns a new list; *lines* itself is not modified.
    """
    merged = []
    skip_next = False
    for idx, line in enumerate(lines):
        if skip_next:
            # This line was already consumed as the tail of the previous one.
            skip_next = False
            continue
        tail = lines[idx + 1] if idx + 1 < len(lines) else None
        is_tail = tail is not None and bool(
            re.match(r"^[\'\"\u2019sS]{0,3}\s*name[\s\:\-]*$", tail.strip(), re.I)
            or re.match(r"^[\'\"\u2019sS]{0,3}\s*নাম[\s\:\-]*$", tail.strip(), re.I)
        )
        if is_tail:
            merged.append((line + " " + tail).strip())
            skip_next = True
        else:
            merged.append(line)
    return merged

def _label_key(line: str) -> str:
    """Return *line* lower-cased and normalised for label matching.

    Curly apostrophes become straight ones and whitespace before a detached
    "'s" is removed, so "Father ’s Name" (the shape _join_split_label produces)
    matches the ``father'?s name`` pattern.
    """
    key = line.lower().replace("\u2019", "'")
    return re.sub(r"\s+(?='s\b)", "", key)


def _has_label(labels, key: str) -> bool:
    """Return True if any regex in *labels* matches somewhere in *key*."""
    return any(re.search(lbl, key) for lbl in labels)


def _value_after_label(lines, idx: int) -> str:
    """Return the cleaned value belonging to the label on ``lines[idx]``.

    The value is whatever follows the first ':' or '-' on that line; when there
    is no separator, or nothing after it, the next line (if any) is used.
    """
    parts = re.split(r"[:\-]\s*", lines[idx], maxsplit=1)
    value = _clean_token(parts[1]) if len(parts) > 1 else ""
    if not value and idx + 1 < len(lines):
        value = _clean_token(lines[idx + 1])
    return value


def _unlabelled_name(line: str, key: str):
    """Return the cleaned *line* if it could be a bare person name, else None.

    Lines carrying a name/relation label and card-header lines are rejected so
    the positional heuristics never return label text or the card title.
    """
    if _has_label(NAME_LABELS, key) or _has_label(FATHER_LABELS, key):
        return None
    cand = _clean_token(line)
    # Word boundaries keep names such as "Ricardo" from matching "card".
    if re.search(r"\b(?:election|commission|identity|card)\b|ভোট|নির্বাচন", cand, re.I):
        return None
    return cand if _looks_like_name(cand) else None


def _strip_label_residue(value: str) -> str:
    """Remove a leading possessive fragment ("'s", "'s Name:") and stray quotes.

    Only a quote followed by a stand-alone "s" is treated as residue; ordinary
    leading or trailing letters of the name are left untouched.
    """
    value = re.sub(r"^['\"\u2019]\s*s\b(?:\s*name\b)?[\s:\-]*", "", value, flags=re.I)
    return value.strip(" \t'\"\u2019")


def extract_voter(text: str) -> ExtractResult:
    """Extract voter-card fields from OCR text.

    Steps:
      1. Split into non-empty stripped lines and re-join split labels.
      2. EPIC number: first line matching ``EPIC_RX`` (group 1).
      3. Date of birth: first match of ``DOB_RX`` (group 1), cleaned.
      4. Gender: the first English or Bengali gender token found (the token
         itself, not the whole line).
      5. Name and father/husband via labels. The value is the text after the
         first ':' or '-' on the label line, else the next line. Lines whose
         label part is a father/husband label are not treated as the elector's
         name label, although they contain "name".
      6. If no name was found and an EPIC line exists, scan up to 4 lines above
         and 3 lines below it for unlabelled name-like lines (first becomes the
         name, a later different one the relative if still missing).
      7. If still no name, take the first unlabelled name-like line among the
         first six lines.

    Args:
        text: Raw OCR text of the card; may be empty or None.

    Returns:
        ``ExtractResult("voter", fields, confidence)`` where *fields* has the keys
        ``epic``, ``name``, ``father_or_spouse``, ``dob``, ``gender`` and
        ``address`` (``address`` is never populated here). Confidence is 0.0 for
        empty input, otherwise ``min(0.98, 0.35 + 0.12 * filled_field_count)``.
    """
    if not text:
        return ExtractResult("voter", {"epic": None, "name": None, "father_or_spouse": None, "dob": None, "gender": None, "address": None}, 0.0)

    # Normalize newlines and whitespace, then repair labels split across lines.
    lines = [raw.strip() for raw in text.splitlines() if raw.strip()]
    lines = _join_split_label(lines)
    # Normalised lower-case copies used only for label detection.
    keys = [_label_key(line) for line in lines]

    epic = None
    name = None
    father = None
    dob = None
    gender = None
    address = None  # not extracted; key kept so the result shape stays stable

    # EPIC first; its line index anchors the positional heuristics below.
    epic_idx = None
    for i, line in enumerate(lines):
        m = EPIC_RX.search(line)
        if m:
            epic = m.group(1)
            epic_idx = i
            break

    # DOB anywhere in the text.
    for line in lines:
        m = DOB_RX.search(line)
        if m:
            dob = _clean_token(m.group(1))
            break

    # Gender: keep just the matched token so the field does not carry label text.
    for line in lines:
        m = re.search(r"\b(?:male|female|transgender)\b", line, re.I) or re.search(r"মহিলা|পুরুষ|নারী", line)
        if m:
            gender = _clean_token(m.group(0))
            break

    # Label-based name extraction (preferred).
    for i, key in enumerate(keys):
        # The generic "name"/"নাম" patterns also occur in relation labels; judge
        # by the text before the first separator so values are not inspected.
        label_part = re.split(r"[:\-]", key, maxsplit=1)[0]
        if _has_label(FATHER_LABELS, label_part):
            continue
        if _has_label(NAME_LABELS, key):
            candidate = _value_after_label(lines, i)
            if _looks_like_name(candidate):
                name = candidate
                break

    # Label-based father/husband extraction.
    for i, key in enumerate(keys):
        if _has_label(FATHER_LABELS, key):
            candidate = _value_after_label(lines, i)
            if candidate:
                father = candidate
                break

    # Positional heuristic: unlabelled name-like lines near the EPIC line.
    if not name and epic_idx is not None:
        nearby = list(range(max(0, epic_idx - 4), epic_idx)) + list(range(epic_idx + 1, min(len(lines), epic_idx + 4)))
        for i in nearby:
            cand = _unlabelled_name(lines[i], keys[i])
            if cand is None:
                continue
            if not name:
                # Never reuse the relative's name as the elector's name.
                if cand != father:
                    name = cand
            elif not father and cand != name:
                father = cand
            if name and father:
                break

    # Fallback: first unlabelled name-like line in the title area.
    if not name:
        for line, key in zip(lines[:6], keys[:6]):
            cand = _unlabelled_name(line, key)
            if cand is not None:
                name = cand
                break

    # Final cleanup: drop leftovers of a split label such as "'s Name".
    if name:
        name = _strip_label_residue(name)
    if father:
        father = _strip_label_residue(father)

    fields = {
        "epic": epic,
        "name": name if name else None,
        "father_or_spouse": father if father else None,
        "dob": dob,
        "gender": gender,
        "address": address,
    }

    # Simple confidence: base 0.35 plus 0.12 per populated field, capped at 0.98.
    filled = sum(1 for v in fields.values() if v)
    conf = min(0.98, 0.35 + 0.12 * filled)
    return ExtractResult("voter", fields, conf)
