o
    ҆i                     @   s   d dl Z d dlmZ d dlmZmZ g dZg dZg dZdgZ	de
d	e
fd
dZde
d	efddZdd Zde
d	efddZdS )    N)ExtractResult)EPIC_RXDOB_RX)
zelector'?s namezelectors namezelector namenameu	   নাম   নামঃu   নামেরr   u
   নাম:u   নাম।)zfather'?s namezhusband'?s namezfather nameu&   পিতা|পিতার নামu   স্বামীর নামu   স্বামী)dobzdate of birthbirthu   জন্মu   জন্ম তারিখuG   male|female|transgender|মহিলা|পুরুষ|পুরুষsreturnc                 C   sH   | s| S |   } tdd| } tdd| } |  d} tdd| } | S )zUBasic cleanup of OCR tokens: trim, remove extra punctuation and common OCR artifacts.z[\u200b\u200c\u200d] z
[_\uFFFD]+z 	
:.-,z\s{2,} )stripresub)r	    r   6/var/www/html/id_ocr_project_final/extractors/voter.py_clean_token   s   
r   c                 C   sR   | sdS t dd| }dd | D }t|dko(tdd |D t|d	 k S )
uG   Rudimentary check — a name is usually 2+ words and not mostly digits.Fz[^A-Za-z\u0980-\u09FF0-9 ]r   c                 S   s   g | ]}|  r|qS r   r   ).0wr   r   r   
<listcomp>%   s    z$_looks_like_name.<locals>.<listcomp>   c                 s   s    | ]}|  V  qd S N)isdigit)r   chr   r   r   	<genexpr>&       z#_looks_like_name.<locals>.<genexpr>g?)r   r   splitlensum)r	   s2wordsr   r   r   _looks_like_name    s
   *r"   c                 C   s   g }d}|t | k rQ| | }|d t | k rB| |d  }td| tjs0td| tjrB|d |  }|| |d7 }q|| |d7 }|t | k s
|S )z
    Fix cases where OCR split a label across lines and produced "'s Name" or similar.
    If a line looks like " 's Name" or "Name" preceded by something like "Father", join them.
    r      z%^[\'\"\u2019sS]{0,3}\s*name[\s\:\-]*$u*   ^[\'\"\u2019sS]{0,3}\s*নাম[\s\:\-]*$r   r   )r   r   matchr   Iappend)linesouticur	next_linecombinedr   r   r   _join_split_label(   s   (

r-   textc              	      s  | st dddddddddS dd |  D }t|}dd |D }d}d}d}d}d}d}d}	t|D ]\}
}t|}|rI|d}|
}	 nq5|D ]}t|}|r^t|d} nqL|D ]}t	d	|t	j
sqt	d
|rwt|} nqat|D ]W\}
}|  t fddtD rzt	jd|ddd }W n ty   |
d t|k r||
d  nd}Y nw t|}|s|
d t|k rt||
d  }t|r|} nq|t|D ][\}
}|  t fddtD r3zt	jd|ddd }W n ty   |
d t|k r||
d  nd}Y nw t|}|s,|
d t|k r,t||
d  }|r3|} nq|r<t|s|	durtttd|	d |	tt|	d tt||	d  }|D ]&}
t||
 }t|r|sr|}n
|s|||kr||}|r|r nq_|sttdt|D ]$}
t||
 }t| dkrt	d|t	j
st|r|} nq|rt	dd| }|rt	dd| }||r|nd|r|nd|||d}tdd | D }tddd|  }t d||S )a:  
    Improved voter extractor:
     - Uses EPIC regex to find card id
     - Searches neighbor lines for name and father/husband name
     - Supports English + Bengali labels and fixes common split-label artifacts
     - Tries several heuristics to guess name (lines around photo/EPIC, label-based extraction)
    voterN)epicr   father_or_spouser   genderaddressg        c                 S   s   g | ]
}|  r|  qS r   r   r   lr   r   r   r   J   s    z!extract_voter.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )lowerr4   r   r   r   r   N   s    r#   z-\b(male|female|transgender|male\b|female\b)\bu<   মহিলা|পুরুষ|নারী|পুরুষc                 3       | ]	}t | V  qd S r   r   searchr   lbllowr   r   r   q       z extract_voter.<locals>.<genexpr>z[:\-]\s*)maxsplitr   c                 3   r7   r   r8   r:   r<   r   r   r      r>   r         r   uD   election|commission|identity|card|ভোট|নির্বাচনz)(^[\'\"\u2019sS]+\s*|[\'\"\u2019sS]+\s*$)c                 s   s    | ]}|rd V  qdS )r#   Nr   )r   vr   r   r   r      r   g\(\?gffffff?gQ?)r   
splitlinesr-   	enumerater   r9   groupr   r   r   r%   r6   anyNAME_LABELSr   	Exceptionr   r"   FATHER_LABELSlistrangemaxminr   r   r   values)r.   r'   llines_lowerr0   r   fatherr   r2   r3   epic_idxr)   r5   mname_candidate	candidatesearch_rangecandfieldsfilledconfr   r<   r   extract_voter>   s   


$&8
$

rZ   )r   extractors.baser   
utils.textr   r   rG   rI   
DOB_LABELSGENDER_LABELSstrr   boolr"   r-   rZ   r   r   r   r   <module>   s   