
    j2iE                        S SK JrJrJrJr  S SKJrJrJr  S SK	J
r
Jr  S SKrS SKrS SKrS SKrS SKrS SKJr  S SKrS SKJr  S SKr\R.                  " \R0                  S9  \R2                  " S5      r\" S	S
9r " S S\
5      r " S S\
5      r " S S\
5      rS\R>                  S\ S\SS4S jr!S\S\\    4S jr"S\S\\    4S jr#S\\    S\\    S\\$   4S jr%S\ S\\\ \4      4S jr&S\ S\\ \4   4S jr'S\ S\\    S\4S jr(\RS                  S \S!9S"\4S# j5       r*\+S$:X  a  S SK,r,\,RZ                  " S%S&S'S(S)9  gg)*    )ListOptionalDictAny)FastAPIHTTPExceptionBackgroundTasks)	BaseModelFieldN)convert_from_path)Path)levelland_pdf_extractorzLand PDF Extraction API)titlec                       \ rS rSr% \" \S9r\\\	      \
S'   \" \S9r\\\	      \
S'   \" \S9r\\\	      \
S'   \\	   \
S'   Sr\\	   \
S'   S	rg)
ExtractRequest   default_factorydaagkhatianfarmerfilesNstatus )__name__
__module____qualname____firstlineno__r   listr   r   r   str__annotations__r   r   r   __static_attributes__r       */var/www/html/land-doc-ocr/app_ocr_land.pyr   r      sb     %d ;D(49
;#(#>GXd3i >"'"=FHT#Y=9 FHSM r$   r   c                   ^    \ rS rSr% \\S'   \\S'   Sr\\   \S'   \	" \
S9r\\\4   \S'   Srg)	
FileResult   urlokNreasonr   	extractedr   )r   r   r   r   r!   r"   boolr+   r   r   dictr,   r   r   r#   r   r$   r%   r'   r'      s1    	HH FHSM  %d ;ItCH~;r$   r'   c                   0    \ rS rSr% \\S'   \\   \S'   Srg)ExtractResponse!   overall_statusresultsr   N)	r   r   r   r   r!   r"   r   r'   r#   r   r$   r%   r0   r0   !   s    *r$   r0   sessionr)   	dest_pathreturnc                   #    U R                  USS9 IS h  vN nUR                  S:w  a  [        SUR                   35      e[        US5       n UR                  R                  S5      I S h  vN nU(       d  OUR                  U5        M>  S S S 5        S S S 5      IS h  vN   g  N N:! , (       d  f       N#= f N! , IS h  vN  (       d  f       g = f7f)N<   )timeout   zHTTP wbi   )getr   RuntimeErroropencontentreadwrite)r4   r)   r5   respfchunks         r%   
fetch_filerE   &   s     {{3{++t;;#t{{m455)T"a"ll//	::	  # ,++
 ; #" ,+++sn   CB)C5C  B-0B+1B-C C#B>$C+B--
B;	7C >C CC	CCpathc                 @   / n [         R                  " U 5       nUR                   H+  nUR                  UR	                  5       =(       d    S5        M-     SSS5        U$ ! , (       d  f       U$ = f! [
         a!  n[        R                  SU5         SnAU$ SnAff = f)z1Return list of page texts using pdfplumber (fast) Nzpdfplumber failed: %s)
pdfplumberr>   pagesappendextract_text	Exceptionlogger	exception)rF   rJ   pdfpes        r%   extract_text_from_pdfrS   2   s    E5__T"cYYQ^^-34  #
 L #"
 L  50!44L5s4   A2 <A A2  
A/*A2 /A2 2
B<BBc                     / n [        [        U 5      SS9nU H(  nUR                  [        R                  " U5      5        M*     U$ ! [
         a!  n[        R                  SU5         SnAU$ SnAff = f)z>Fallback OCR using pdf2image + pytesseract; heavier but robustr:   )dpizOCR failed: %sN)r   r!   rK   pytesseractimage_to_stringrM   rN   rO   )rF   textsimagesimgrR   s        r%   ocr_pdfr[   >   so    E."3t9#6CLL44S9:  L  .)1--L.s   AA 
A2A--A2rJ   khatian_valuesc                     U(       d  gU Vs/ s H  o"R                  5       PM     nn[        U 5       H2  u  pEU=(       d    SR                  5       nU H  nX&;   d  M
  Us  s  $    M4     gs  snf )zUReturn page index where any khatian string appears (case-insensitive substring match)NrH   )lower	enumerate)rJ   r\   klowereditextts          r%   find_khatian_in_pagesre   J   sa    "01.Qwwy.G1U#ZR Av  $
  2s   A*c                    [        U 5      n/ nU GHX  nUR                  S/ 5      nUR                  S5      nUR                  S5      nU(       a  US   O/ nSR                  U Vs/ s H  n[        U5      PM     sn5      n	SU	;   =(       a    SU	;  =(       a    SU	;  n
S	U	;   =(       d    SU	;   =(       d    SU	;   n[	        U5       GH  u  pSR                  U Vs/ s H  o(       d  M  [        U5      PM     sn5      nUS:X  a  MA  / nU HC  n[        U5      R                  5       n[        U5      n[        U5      nUR                  U5        ME     S
nS
nS
nS
nU
(       a  U(       a  [        US   5      nU(       a  U H  n[        U5      nU(       aQ  SU;   d6  [        R                  SU5      (       d  [        R                  SU5      (       a  UnOU(       d  U=(       d    UnU(       a  Mo  [        R                  SU5      (       d  M  UnM     UR                  5       nU(       a  [        U5      OS
nUR                  UUUU
(       a  SO
U(       a  SOSUUUUUUS.
5        GM     GM[     U$ s  snf s  snf )z
Extract Daag table (Table-1) and Khatian/Name table (Table-2),
remove only header rows, keep all data rows.
Returns a flat list of rows with table_type in {'daag','khatian','other'}.
rowspagetable_indexr    	   দাগu   রায়েতরu   রায়েতর   খতrH   /u
   [০-৯]/z[0-9]/u	   [ঀ-৿]r   r   other)
rh   ri   	row_index
table_typecellsrow_textr   r   name_bnname_en)!extract_tables_from_pdf_all_pagesr<   joinr!   r_   striprepair_bengali_spacingremove_noise_tokensrK   guess_khatian_or_daag_from_cellresearchtransliterate_name_pretty)rF   tablesflatrd   rg   rh   t_idx
header_rowcheader_textis_daag_tableis_khatian_tableridxrowrr   cleaned_cellscsr   r   rs   rt   cellkhs                          r%   extract_tables_bnr   W   s_    /t4FDuuVR uuV}m$ !%T!W"
hh
;
1A
;<$3  O8LT_8_  Od{  DO  eO$3  E7K{7Z  E^u  zE  _E"4IDxx :AQ :;Hqy MV\\^+B/(,$$R(	  DGGG
  :=;KLD
  )D8>B$;"))M4*H*HBIIV_aeLfLf&(G!%#':2D"7ryyt'D'D"& * "--/@G3G<RKK$!(5fHX9^e&$""" c ) R KE  < !;s   %I/

I4)I4rc   c                    0 nU =(       d    SR                  5        Vs/ s H)  o"R                  5       (       d  M  UR                  5       PM+     nnUSS  Ho  nUR                  5       nSU;   a  UR                  SU5        SU;   d  SU;   d  SU;   a  UR                  S	U5        S
U;   d  SU;   d  M]  UR                  SU5        Mq     U$ s  snf )u   Simple heuristic parser tuned for Bengali land records.
Detects Daag, Khatian, and Rayetor Nam (রায়েতর নাম).
rH   Ni,  rk   r   u   খতিয়ানu   খতিয়ানrl   r   u   রায়েতর নামu   রায়েতর নামowner)
splitlinesrw   r^   
setdefault)rc   resllineslnlows         r%   parse_land_details_from_textr      s     C!% 7 7 9G 91WWYYQWWY 9EGDSkhhj #NN62& #c)-AS-HHX[ONN9b) *S04UY\4\NN7B'  J! Hs
   C
C
c                 B  #    [        [        R                  " SS95      nUS-  n [        R                  " 5        IS h  vN n[        X@U5      I S h  vN   S S S 5      IS h  vN   [        U5      n[        Xa5      nSnUc.  [        R                  SU 5        [        U5      n[        Xa5      nSnUc>  S	R                  U5      n	[        U	5      n
S
n[        U [        U
5      U
(       d  XS9$ S U
S9$ [        Xg   5      n
XxS.U
S'    UR!                  5        H  nUR#                  5         M     UR%                  5         [        U SU
S9$  GN
 N N! , IS h  vN  (       d  f       N= f! [         a  n[        U SSU 3S9s S nA$ S nAff = f! [         a     N]f = f7f)Nlandpdf_)prefixzdoc.pdfFzdownload_failed: )r)   r*   r+   z,khatian not found in text; trying OCR for %sT
khatian_not_found)r)   r*   r+   r,   )
page_indexocr_used_meta)r)   r*   r,   )r   tempfilemkdtempaiohttpClientSessionrE   rM   r'   rS   re   rN   infor[   rv   r   r-   iterdirunlinkrmdir)r)   r\   tmpdirdestr4   rR   rJ   page_idxused_ocrcombinedparsedr+   rQ   s                r%   process_single_filer      s    (""*56FIDM((**gW4000 +* "$'E$U;HHBCH(?99U#-h7$cd6l6llUYekll *%/:F%-DF7O!AHHJ " #$&99C +0 +*** Mce6Gs4KLLM:  s   #FE* E	 E* EEEE* #E$E* (BF7F >F	E* EE* E'EE'#E* &F'E* *
F4FFFFF
FFFFz/extract)response_modelreqc                   ^ ^#    T R                   (       a*  T R                   R                  5       S:X  a  [        SSS0S9e[        R                  " S5      mU U4S jnT R
                   Vs/ s H  n[        R                  " U" U5      5      PM!     nn[        R                  " U6 I S h  vN nSn[        S	 U 5       5      (       a  S
nO[        S U 5       5      (       a  Sn[        XTS9$ s  snf  NG7f)Nrejectedi  errorrequest_status_rejected)status_codedetail   c                    >#    T IS h  vN   [        U TR                  =(       d    / 5      I S h  vN sS S S 5      IS h  vN   $  N< N N	! , IS h  vN  (       d  f       g = f7fN)r   r   )r)   r   	semaphores    r%   sem_taskextract.<locals>.sem_task   s8     9,S#++2CDD 99D 999sS   A'AA'#AA	AA'AA'	AA'A$AA$ A'partial_successc              3   8   #    U  H  oR                   v   M     g 7fr   r*   .0rs     r%   	<genexpr>extract.<locals>.<genexpr>   s     
!A44s   successc              3   B   #    U  H  oR                   (       + v   M     g 7fr   r   r   s     r%   r   r      s     'w!XXws   failure)r2   r3   )
r   r^   r   asyncio	Semaphorer   create_taskgatherallr0   )r   r   utasksr3   overallr   s   `     @r%   extractr      s      zzcjj&&(J6W>W4XYY !!!$IE 8;yyAy!W  !-yEANNE**GG

!
!!!	'w'	'	''CC B*s   A'C3+&C,C3)C1*AC3__main__zfastapi_land_pdf_extractor:appz0.0.0.0i"  r   )hostport	log_level).typingr   r   r   r   fastapir   r   r	   pydanticr
   r   r   r   r   osrI   	pdf2imager   rV   pathlibr   loggingbasicConfigINFO	getLoggerrN   appr   r'   r0   r   r!   rE   rS   r[   intre   r   r   r   postr   r   uvicornrunr   r$   r%   <module>r      s   , , ; ; %    	  '      ',, '			/	0-.!Y !< <i 
	g33 	# 	$ 	SW 		 	c 		$ 	49 	
c 
DI 
(SV- 
RC RDc3h$8 Rjs tCH~ 0%:3 %:S	 %:j %:P *_5D~ D 6D2 zKK0ytW]^ r$   