
    9iA@                     v   S SK r S SKrS SKrS SKJrJrJr  S SKrS SKJ	r	  SSK
Jr  SSKJr  SSKJr   " S S	\5      r S!S
 jrS\4S jrS\S\S\S\S\4
S jrS\S\4S jrS\S\4S jr  S"S\S\S\S\\\4   S\\   S\S\S\4S jjr     S#S\S\S\S\\\4   S\\   S\S\S\S\S\S\4S jjrS  rg)$    N)AnyDictList)ModeKeys   )OfaBasePreprocessor)get_database_matches)dump_db_json_schemac                      ^  \ rS rSrSr\R                  4U 4S jjrS\\	\
4   S\\	\
4   4S jrS\\	\
4   S\\	\
4   4S jrS\\	\
4   S\\	\
4   4S jrS	rU =r$ )
OfaTextToSqlPreprocessor   z(
OFA preprocessor for text to sql tasks
c                 p  > [         [        U ]
  " XU/UQ70 UD6  U R                  R                  R                  SS5      U l        U R                  R                  SS5      U l        SU l        0 U l	        [        R                  R                  [        R                  R                  U5      S5      U l        g)zpreprocess the data

Args:
    cfg(modelscope.utils.config.ConfigDict) : model config
    model_dir (str): model path,
    mode: preprocessor mode (model mode)
promptz . generating sql code.max_struct_length   	databaseN)superr   __init__cfgmodelgetinstruction_textr   	separatordb_schema_cacheospathjoinabspathdatabase_path)selfr   	model_dirmodeargskwargs	__class__s         e/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/preprocessors/ofa/text2sql.pyr   !OfaTextToSqlPreprocessor.__init__   s     	&6st 	H8<	H@F	H !% 2 283L!N!%.A3!G!WW\\GGOOI&
4    datareturnc                     U R                   [        R                  :X  a  U R                  U5      $ U R	                  U5      $ N)r#   r   TRAIN_build_train_sample_build_infer_sample)r!   r*   s     r'   __call__!OfaTextToSqlPreprocessor.__call__-   s4    99&++D11++D11r)   c           	         SU R                   ;   a  SU;   d   S5       eXR                   S      nUR                  U R                  5      n[        U5      S:X  d   S5       eUu  pEnX`R                  ;  a2  [        U R                  S-   U-   S-   U-   S-   U5      U R                  U'   SR                  UR                  5       R                  5       SU R                   5      n[        XEX`R                  U R                  U   U R                  R                  S	5      nUS
   nUS   nUS   n	US   n
SR                  X(5      U R                  -   nU R                  X R                  -   5      nUSU R                  U R                   -   S-    nU R                  SR                  U	5      SSS9SU R"                   n[$        R&                  " XR(                  /5      n[$        R&                  " U R*                  U/5      nSUUUU
S.nU$ )z
build sample for training tasks.

step 1. Get the input question and database id from text input
step 2. Get the database structure input
step 3. Add a pseudo ids for every input.
step 4. Calculate the target and previous output items.
text;there must be `text` column in task key map and source data   z=invalid input, should contain query, question and database id/.sqlite NT	struct_intext_inseq_out	db_struct{} ; structured knowledge: {}    {}F)add_bosadd_eos        )idsourcetargetprev_output_tokensr=   )
column_mapsplitr   lenr   r
   r    r   stripmax_src_lengthseq2seq_inputr   r   formatr   tokenize_textr   max_tgt_lengthtorchcateos_itembos_item)r!   r*   r4   textsqueryquestiondb_id
seq_inputsr:   r<   r=   src_itemtgt_itemtarget_itemprev_output_itemsamples                   r'   r/   ,OfaTextToSqlPreprocessor._build_train_sample3   s7    (Vt^ 	JI	J;OOF+,

4>>*
 	PO	P  "' ,,,*=""S(5036>J+D  ' 88HNN,2245Id6I6IJK"5E;M;M#'#7#7#>#')
 {+	)$Y'{+	.55#445%%d-B-B&BC $d11D4J4JJ "# % %%LL!5 & /D//1 ii== 9: 99dmmX%>? !"2"
 r)   c                    SU R                   ;   a  SU;   d   S5       eXR                   S      nUR                  U R                   S   S5      nUR                  5       nX0R                  ;  a2  [	        U R
                  S-   U-   S-   U-   S-   U5      U R                  U'   SR                  UR                  5       R                  5       SU R                   5      n[        SX#U R
                  U R                  U   U R                  R                  5      nUS	   nUS
   nSR                  X%5      U R                  -   nU R                  X R                  -   5      nUSU R                  U R                  -   S-    nSXvS.nSU R                   ;   a6  U R                   S   U;   a#  SR                  XR                   S      5      US'   U$ )z
build sample for inference tasks.

step 1. Get the input question and database id from text input
step 2. Get the database structure input
step 3. Add a pseudo ids for every input.
r4   r5   r   culture_companyr7   r8   r9   Nr:   r=   r>   r?   rC   )rD   rE   r=   solutionr@   label)rH   r   rK   r   r
   r    r   rI   rL   rM   r   r   rN   r   rO   r   )	r!   r*   r4   rX   rY   r:   r=   rZ   r^   s	            r'   r0   ,OfaTextToSqlPreprocessor._build_infer_samplek   s    (Vt^ 	JI	J;OOF+,46GH ,,,*=""S(5036>J+D  ' xx

**,-Ad.A.ABC"4d6H6H#'#7#7#>P
{+	{+	.55#445%%d-B-B&BC $d11D4J4JJ "# % xH(T__.#.$#ll4
0K+LMF7Or)   )r    r   r   r   r   )__name__
__module____qualname____firstlineno____doc__r   	INFERENCEr   r   strr   r1   r/   r0   __static_attributes____classcell__)r&   s   @r'   r   r      s     ((402T#s(^ 2S#X 26S#X 64S> 6p%S#X %4S> % %r)   r   c                     [        XX#U5      n[        Xu5      S   R                  5       nU(       d  UUUS.$ [        Xu5      u  pUUU	US.$ )Nserialized_schema)r:   r;   r=   )r:   r;   r<   r=   )form_input_for_constructionspider_add_serialized_schemarK   spider_pre_process_one_function)
rV   rW   rX   db_pathschemar$   is_trainexro   r<   s
             r'   rM   rM      sl     
%Uef	MB4
%'',uw *
 	

 8AH&	 r)   itemc                 l    Sn[        U S   U S   SUR                  S9nX S   R                  5       -   U4$ )N rV   rX   T)rV   rX   normalize_querytarget_with_db_idrW   )spider_get_targetr{   rK   )rw   r$   prefixr<   s       r'   rr   rr      sI    F7m7m00	G $**,,g55r)   rV   rX   rz   r{   r+   c                 Z    U(       a  [         OS nU(       a  U SU" U 5       3$ U" U 5      $ )Nc                     U $ r-    )xs    r'   <lambda>#spider_get_target.<locals>.<lambda>   s    ar)   z | )	normalize)rV   rX   rz   r{   
_normalizes        r'   r|   r|      s8     .KJ/@eWC
5)*+ jG r)   c                 <    S nS nS nU" U" U" U 5      5      5      $ )Nc                 &    U R                  SS5      $ )N , , )replacess    r'   	comma_fixnormalize.<locals>.comma_fix   s    yy%%r)   c                 @    SR                  U R                  5       5      $ )Nr9   )r   rI   r   s    r'   white_space_fix"normalize.<locals>.white_space_fix   s    xx	""r)   c                 4    [         R                  " SS U 5      $ )Nz\b(?<!['\"])(\w+)(?!['\"])\bc                 @    U R                  S5      R                  5       $ )Nr   )grouplower)matchs    r'   r   *normalize.<locals>.lower.<locals>.<lambda>   s    EKKN$8$8$:r)   )resubr   s    r'   r   normalize.<locals>.lower   s    vv5:A? 	?r)   r   )rV   r   r   r   s       r'   r   r      s'    &#?
 _U5\233r)   rv   c                     [        US5      (       a4  [        U S   U S   U S   U S   U S   U S   U S   UR                  S	S
9	nSU0$ [        U S   U S   U S   U S   U S   SSS	UR                  S	S9
nSU0$ )Nschema_serialization_with_nlrW   rs   rX   db_column_namesdb_table_namesdb_primary_keysdb_foreign_keysT)	rW   rs   rX   r   r   r   r   $schema_serialization_with_db_contentrz   peteshawF)
rW   rs   rX   r   r   schema_serialization_typeschema_serialization_randomizedschema_serialization_with_db_idr   rz   ro   )getattr!serialize_schema_natural_languager   serialize_schema)rv   r$   ro   s      r'   rq   rq      s    t344=
^yMW+01./01011500 
4  !233 -
^yMW+01./&0,1,01500 
  !233r)   rW   rs   r   r   r   c	           	         U SSR                  U V	s/ s H  o(       a  U	R                  5       OU	PM     sn	5       S3n
S nS nS nS nUS   n[        [        US   US	   5      5      nU
/n/ n/ nSn[	        U5       GH}  u  nnU(       a  UR                  5       OUnUR                  U5        / n/ n/ n[	        [        US
   US   5      5       H  u  nu  nnUS:X  a  M  U(       a  UR                  5       OUnUR                  U5        UU:X  d  MC  UR                  U5        UU;   a  UR                  U5        U(       d  Mt  [        U UUUS-   U-   S-   U-   S-   S9nU(       d  M  UR                  UUR                  U5      45        M     U" UU5      nUR                  U5        U" SR                  U5      5      nUR                  U5        [        U5      S:  d  GMd  U" U5      n UR                  U 5        GM     U HB  u  nnUUS
   U      n!UU   n"UUS
   U      n#UU   n$U" U!U"U#U$5      n%UR                  U%5        MD     SR                  U5      $ s  sn	f )Nz contains tables such as r   .c                     U  S3$ )Nz is the primary key.r   )primary_keys    r'   &table_description_primary_key_templateQserialize_schema_natural_language.<locals>.table_description_primary_key_template  s    233r)   c                 2    SU  SSR                  U5       S3$ )NzTable z has columns such as r   r   )r   )namecolumn_namess     r'   table_description<serialize_schema_natural_language.<locals>.table_description  s"    v2499\3J2K1MMr)   c           
      z    SR                  U  VVs/ s H  u  pSR                  X5      PM     snn5       $ s  snnf )Nry   z"The {} contains values such as {}.)r   rN   )cv_pairscolumnvalues      r'   value_description<serialize_schema_natural_language.<locals>.value_description	  s9    ''kstksZgZ`?FFvUkstuvwts   7
c           	           SU SU  SU SU S3	$ )NzThe z of z is the foreign key of r   r   )table_1column_1table_2column_2s       r'   foreign_key_descriptionBserialize_schema_natural_language.<locals>.foreign_key_description  s'    hZtG9,CH:TRYQZZ[\\r)   	column_idother_column_idtable_idcolumn_namer   r7   r8   rW   
table_namer   rs   r9   )r   r   listzip	enumerateappendr	   rJ   )&rW   rs   rX   r   r   r   r   r   rz   r   overall_descriptionr   r   r   r   descriptionsdb_table_name_strsdb_column_name_strs	value_sepr   r   table_name_strcolumnscolumn_value_pairsprimary_keysr   r   y
column_strmatchestable_description_columns_str!table_description_primary_key_strvalue_description_strx_table_namex_column_namey_table_namey_column_nameforeign_key_description_strs&                                         r'   r   r      s    #G#<!YY^l'm^lVZ

T(Q^l'mnoopr4Nx] &k2OOK(/:K*LMOO ((LI ). 9*/>))+J!!.1!*OJ/#M24"5Iv1 A~&51J&&z2H}z*/ ''
3772!)#-$%!(3!6!<u!D#,"-	G w*11')@AC)"5. ):G)%%9:,RIIl#-%)=>!"Q&$56H$I! 56K !:N  1)/**Ea*HI+A.)/**Ea*HI+A.&=-}'F#78   88L!!O (ns   "Ir   r   r   c
                   ^ ^^^^	^^^^^^ US:X  a  Sn
SnSnSnSmSmSmOUS:X  a  S	n
S
nSnSnSmSmSmO[         eS[        S[        S[        4UUUUU	U UU4S jjm[        U5       V^V^s/ s Hi  u  mmUR                  T	(       a  TR	                  5       OTUR                  [        UU4S j[        U4S j[        US   US   5      5      5      5      S9PMk     nnnU(       a  [        R                  " U5        U(       a#  U
R                  TS9UR                  U5      -   nU$ UR                  U5      nU$ s  snnf )NverbosezDatabase: {db_id}. z. z"Table: {table}. Columns: {columns}r   z{column} ({values})z{column}r   z
 | {db_id}ry   z | {table} : {columns}r   z{column} ( {values} )r   r   r+   c                   > T(       a  UR                  5       OUnT
(       aP  [        T	U UTS-   T-   S-   T-   S-   S9nU(       a  TR                  UTR                  U5      S9$ TR                  US9$ TR                  US9$ )Nr7   r8   r   )r   values)r   )r   r	   rN   r   )r   r   column_name_strr   column_str_with_valuescolumn_str_without_valuesrX   rs   rz   rW   r   r   s       r'   get_column_str(serialize_schema.<locals>.get_column_strk  s     &++ 
"- 	/*!%' 3.4u<yH	G -44*9>>'3J 5 L L 1777OO,33?3KKr)   c                    > T" TU S   S9$ )Nr   )r   r   r   )r   r   r   s    r'   r   "serialize_schema.<locals>.<lambda>  s    n#-1Q4Ar)   c                    > U S   T:H  $ )Nr   r   )r   r   s    r'   r   r     s    !A$("2r)   r   )tabler   )rX   )NotImplementedErrorrk   r   rN   r   r   mapfilterr   randomshuffle)rW   rs   rX   r   r   r   r   r   r   rz   	db_id_str	table_sep	table_str
column_sepr   r   tablesro   r   r   r   r   s   ```     ``    ``  @@@@r'   r   r   K  s|    !I-)		8	
!6$.!		"j	0 		,	
!8$.!	!!L3 LS LS L LD '0&? '@"h
 	(7*""$ZOOA2+J7+M:
 	 	
 '@  " 'v&%,, - $>>&12  &NN621s   -A0D:c                 X   U UUUUS   US    VVs/ s H  u  pVUPM	     snnUS    VVs/ s H  u  pVUPM	     snnS.US   US    Vs/ s H  nSU0PM	     snUS    VVs/ s H  u  pxUPM	     snnUS    VVs/ s H  u  pxUPM	     snnS.S	.	$ s  snnf s  snnf s  snf s  snnf s  snnf )
Ntable_names_originalcolumn_names_original)r   r   column_typesr   r   foreign_keys)r   r   )	rV   rW   rX   rs   r   r   db_column_typesr   r   r   )	rV   rW   rX   rs   rt   r   r   r   r   s	            r'   rp   rp     s    	%& .44K-L-L)H -L .44K-L-L)H -L	
 	~ ".131i 
13 392H2H.I 2H 392H 2H.I  2H 	
5$ $3 s   BBB B 9B&)F)FT)r   FTFT)r   r   r   typingr   r   r   rQ   modelscope.utils.constantr   baser   utils.bridge_content_encoderr	   utils.get_tablesr
   r   rM   dictrr   rk   boolr|   r   rq   r   r   rp   r   r)   r'   <module>r     s   
  	 " "  . % > 1@2 @R !2
6$ 
6  	
 	4S 4S 4$4T 4D 4N 27 S"S"S" S" #s(^	S"
 IS" +/S" S" 	S"x &0,1,016 KKK K #s(^	K
 IK  #K &*K &*K +/K K 	K\%r)   