o
    i0                     @   s   d dl mZ d dlmZmZmZmZmZ d dlZ	z
d dl
mZ dZW n ey-   dZY nw zd dlZd dlmZ dZW n eyG   dZY nw h dZd	ee d
efddZdee d
efddZeG dd dZG dd dZdS )    )	dataclass)ListTupleOptionalDictAnyN)	PaddleOCRTF)Output>	   chenkatatejapanlatinkoreanchinese_chtlangreturnc                 C   sr   | sdS t |   }|dv rdS |dv rdS d|v r1dd |dD D ]
}|tv r0|  S q&|tv r7|S dS )z
    Normalize a PADDLE_LANG value into a PaddleOCR-supported token.
    If user provided language hints like 'multi', 'hi', 'bn' -> fallback to 'en'
    (PaddleOCR stable release doesn't necessarily ship hi/bn models).
    r   )multiallmixed)hihindibnbengalibenzbn-inbn_in,c                 S   s   g | ]}|  qS  )strip).0cr   r   1/var/www/html/id_ocr_project_final/ocr/engines.py
<listcomp>(   s    z*_normalize_paddle_lang.<locals>.<listcomp>)strr    lowersplit_VALID_PADDLE_LANGS)r   l	candidater   r   r#   _normalize_paddle_lang   s   r+   	user_hintc                    s6   | sdS t |   t fdddD }|rdS dS )z
    Build a tesseract language token string.
    If user hint includes 'bn' or 'bengali' include Bengali; always include English.
    Examples: 'eng+ben' or just 'eng'.
    eng+benc                 3   s    | ]}| v V  qd S Nr   )r!   xur   r#   	<genexpr>8   s    z(_tesseract_lang_token.<locals>.<genexpr>)r   r   r   eng)r%   r&   any)r,   wants_bnr   r0   r#   _tesseract_lang_token/   s   r6   c                   @   sR   e Zd ZU eed< eeeeeeef   ed< eed< dZ	ee
eef  ed< dS )	OcrResulttextboxesengineNraw)__name__
__module____qualname__r%   __annotations__r   r   r   intr;   r   r   r   r   r   r#   r7   >   s
   
 r7   c                   @   sh   e Zd ZdZddededee fdd	Zd
ejde	fddZ
ddee de	fddZdde	fddZdS )	OCREnginea  
    Hybrid OCR engine:
     - Prefer PaddleOCR (layout & rotation robust) when configured.
     - Always fallback to Tesseract using combined 'eng+ben' when Paddle returns nothing or fails.
     - Defensive around missing libs / unsupported paddle languages.
    	tesseractr   Npreferr   tesseract_cmdc                 C   s   |pd  | _t|| _t|| _|r'z
dd l}||j_W n	 ty&   Y nw d | _	| jdkrot
rqzt| jddd| _	W d S  tyb   zd| _tdddd| _	W Y d S  tya   d | _	Y Y d S w  tyn   d | _	Y d S w d S d S )NrB   r   paddleT)r   use_angle_clsrecr   )r&   rC   r+   r   r6   _tess_lang_hintpytesseractrD   	Exception_paddle_HAS_PADDLEr   AssertionError)selfrC   r   rD   rI   r   r   r#   __init__L   s2   

zOCREngine.__init__np_rgbr   c              
   C   sN  t r| jd u rtdd ddS zz| jj|dd}g }g }|D ]W}|D ]R}z|r:t|dkr:t|d dkr:|d d nd}W n tyH   d}Y nw |rP|| |d }dd	 |D }	d
d	 |D }
|t|	t|
t|	t|
f q!qtd	|
 |r|nd dd|idW S  ty } ztdd ddt|idW  Y d }~S d }~ww )N zpaddle-unavailabler8   r9   r:   T)cls   r   c                 S      g | ]}t |d  qS )r   r@   r!   pr   r   r#   r$   y       z)OCREngine._run_paddle.<locals>.<listcomp>c                 S   rU   )rT   rV   rW   r   r   r#   r$   y   rY   
rE   
paddle_rawr8   r9   r:   r;   zpaddle-errorerror)rL   rK   r7   ocrlenrJ   appendminmaxjoinr    r%   )rN   rP   reslinesr9   pagedettxtptsxsyser   r   r#   _run_paddleh   s0   4
$(
$zOCREngine._run_paddle
lang_tokenc              
   C   sl  t s	tdd ddS z|p| jpd}tj|tj|d}g }g }t|dg }t	|D ]U}|d | p3d
 }	|	s9q*||	 t|ddg| | }
t|d	dg| | }t|d
dg| | }t|ddg| | }||
||
| || f q*td|
 |r|nd d| dd|idW S  ty } ztdd ddt|idW  Y d }~S d }~ww )NrQ   ztesseract-unavailablerR   r-   )output_typer   r8   leftr   topwidthheightrZ   z
tesseract()tesseract_datar\   ztesseract-errorr]   )	_HAS_TESSr7   rH   rI   image_to_datar	   DICTr_   getranger    r`   r@   rc   rJ   r%   )rN   	pil_imagern   	tess_langdtextsr9   nisrp   rq   rr   rs   rl   r   r   r#   _run_tesseract   s.   
0$zOCREngine._run_tesseractc                 C   s   |d u r|d urddl m} ||}|d u r&|d ur&ddl m} ||}| jdkr_tr_| jd ur_|d u r?|d ur?t|}| |}|j	rI|S | j
|pN|| jd}|j	r]|j d|_|S | j
|pd|| jdS )Nr   )ImagerE   )rn   z|fallback_from_paddle)PILr   	fromarrayrC   rL   rK   nparrayrm   r8   r   rH   r:   )rN   pil_rgbpil_bwrP   np_grayr   rd   fallbackr   r   r#   run   s"   



zOCREngine.run)rB   r   Nr.   )NNNN)r<   r=   r>   __doc__r%   r   rO   r   ndarrayr7   rm   r   r   r   r   r   r#   rA   E   s    rA   )dataclassesr   typingr   r   r   r   r   numpyr   	paddleocrr   rL   rJ   rI   r	   rv   r(   r%   r+   r6   r7   rA   r   r   r   r#   <module>   s,   