
    9iSS                     |   S SK r S SKJrJr  S SKrS SKrS SKrS SKJ	r	J
r
JrJrJrJr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJ r J!r!  S SK"J#r#  S SK$J%r%  S SK&J'r'J(r(J)r)  S SK*J+r+  \+" 5       r,Sr-Sr. " S S\R^                  5      r0 " S S\05      r1 " S S\R^                  5      r2g)    N)DictUnion)ArrowBasedBuilderDatasetDatasetDictGeneratorBasedBuilderIterableDatasetIterableDatasetDict)is_remote_filesystem)DatasetInfo)camelcase_to_snakecase)csv)FileLock)
map_nested)HubApi)DatasetContextConfig)ExternalDatasetNativeIterableDataset)DataStreamingDownloadManager)get_subdir_hash_from_split)DEFAULT_DATASET_NAMESPACEDatasetPathNameDownloadMode)
get_logger	delimiter,c                      ^  \ rS rSrS\4U 4S jjr\4S jrSS\4S\4S jjr	S r
S	 rS
 rS rS rS\4S jrSrU =r$ )CsvDatasetBuilder&   dataset_context_configc           
        > UR                   U l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  R                  U l        UR                  R                  U l        UR                  U l
        [        0 5      U l        [        R                  R                  U R                  U R                  U R                   U R                  [         R"                  5      U l        [&        U l        [*        U R                  ;   a  U R                  [*           U l        U R
                  =(       d-    [-        UR                  R.                  R1                  5       5      n[3        X R                  S9nSSKJnJn  U R                  R;                  5        VVs0 s H  u  pgXe" U/S S9_M     nnnUR=                  U5      n[>        T	U ]  " SU R$                  U R                  UUS.U R                  D6  U R                   U RB                  l"        [G        U R                   5      U l$        [        0 5      U l%        g s  snnf )Nsplitversionr   )DataFilesDictDataFilesList)origin_metadata)	cache_dirconfig_namehash
data_files )&dataset_namecache_root_dir	namespacer$   subset_namer#   data_meta_configmeta_data_fileszip_data_filesconfig_kwargsinput_config_kwargsdictsplit_path_dictospathjoinr   	META_NAMEcache_build_dirDEFAULT_CSV_DELIMITERcsv_delimiterDELIMITER_NAMElisttarget_dataset_structurekeysr   datasets.data_filesr%   r&   itemsfrom_local_or_remotesuper__init__infobuilder_namer   namelocal_meta_csv_paths)
selfr    r#   sub_dir_hashr%   r&   kvr+   	__class__s
            n/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/msdatasets/download/dataset_builder.pyrG   CsvDatasetBuilder.__init__(   s   2??4CC/99-551==+11
5FFVV4EETT#9#G#G #Bx!ww||D,?,?,0NND<M<M,0LL,;,E,E G 3T555!%!9!9.!ID

 Dd#9#J#J#;#;DDFD1/ 	E ,,224
4 }aS$774 	 
 #77
C
 	(**!		(
 &&	( "&!2!2		*4+<+<=	*.r(!!
s   I-c           	      x    [         R                  R                  U R                  U R	                  SSUS95      nU$ )NFT)with_version	with_hashr/   )r8   r9   r:   _cache_dir_root_relative_data_dir)rL   r/   builder_data_dirs      rQ   _build_cache_dir"CsvDatasetBuilder._build_cache_dirU   sB    77<<  ##"di $ IJ
      Treturnc                 
   Uc  U R                   R                  OU SU R                   R                   3nU R                  nU R                  nU(       a)  [        R
                  R                  X@R                  5      nU(       a=  [        R
                  R                  U[        U R                  R                  5      5      nU(       a;  U(       a4  [        U[        5      (       a  [        R
                  R                  XF5      nU$ )zRelative path of this dataset in cache_dir:
Will be:
    self.name/self.config.version/self.hash/
or if a namespace has been specified:
    self.namespace___self.name/self.config.version/self.hash/
___)rH   rI   configr*   r8   r9   r:   	config_idstrr$   
isinstance)rL   rT   rU   r/   rX   builder_configr*   s          rQ   rW   $CsvDatasetBuilder._relative_data_dir]   s     6?5F49911ykY\]a]f]f]s]s\tLuyy!ww||,<nnM!ww||,<,/0C0C,D F*T3"7"7!ww||,<Cr[   c           
         U R                   R                  (       d  [        S5      eUR                  U R                   R                  5      nUR                  U R                  5      n/ nUR                  5        Hb  u  pV[        U[        5      (       a  U/nUR                  [        R                  " UUR                  U5      UR                  U5      S.S95        Md     U$ )Nz7At least one data file must be specified, but got none.)filesbase_dirrJ   
gen_kwargs)r_   r+   
ValueErrordownload_and_extractr3   rD   rb   ra   appenddatasetsSplitGenerator
iter_filesget)rL   
dl_managerr+   r3   splits
split_namerf   s          rQ   _split_generators#CsvDatasetBuilder._split_generatorss   s    {{%%IK K44T[[5K5KL
#889L9LM!+!1!1!3J%%%MM''#!+!6!6u!=$2$6$6z$B  "4 r[   c           
   #     ^#    U R                   R                  b4  [        R                  " U R                   R                  R                  5      OS nU(       aC  [        UR                  UR                  5       VVs0 s H  u  pEXER                  5       _M     snnOS n[        U5       H  u  pg[        R                  " USXPR                  S9n/ n	UR                  R                   H,  n
U
R                  S5      (       d  M  U	R                  U
5        M.      [        U5       HT  u  pU	 H%  n
T(       d  M  X   R!                  U4S j5      X'   M'     [        R"                  R%                  XS9nXk4U4v   MV     M     g s  snnf ! [&         a-  n[(        R+                  SU S[	        U5       SU 35        e S nAff = f7f)	NT)iteratordtyper   :FILEc                 D   > [         R                  R                  TU 5      $ Nr8   r9   r:   )xrg   s    rQ   <lambda>4CsvDatasetBuilder._generate_tables.<locals>.<lambda>   s    "'',,x*Cr[   )schemazFailed to read file 'z' with error z: )r_   featurespar   typezipnamestypesto_pandas_dtype	enumeratepdread_csvr>   _engineendswithrl   applyTablefrom_pandasrj   loggererror)rL   rf   rg   r   rJ   rx   file_idxfilecsv_file_readertransform_fields
field_name	batch_idxdfpa_tablees     `            rQ   _generate_tables"CsvDatasetBuilder._generate_tables   s     $ 4 4 @ 4;;//44 FJ 	
   #6<<>
> ''))>
  	 (.NH kkt5<N<NPO!-55;;
&&w//$++J7 <%.%?MI&6
#8-/^-A-A C.EBN '7  "xx33B3FH#/99 &@ /	
&  +D6tAwir!MOsD   A8F>;E>A%F> F>F4AF9F>
F;(F66F;;F>c                 b   UR                   R                  nUR                   R                  nU(       d  [        R                  nUR                   R
                  nU(       d  [        R                  nU R                  nU(       d  [        R                  n/ nUR                  [        R                  5        UR                  UR                   R                  5        UR                  U5        UR                  U5        UR                  U5        [        R                  R                  U5      n	[        R                  R                  UR                  [        R                  5      U	S-   5      n
[        U
5         [        R                  R!                  U5      nU(       aD  U["        R$                  R&                  :X  a&  [(        R+                  SU R,                   SU S35        [(        R/                  SU R,                   SU S35        U R1                  X!S9  S S S 5        g ! , (       d  f       g = f)N.lockReusing dataset  ()Generating dataset )rq   download_mode)download_configr(   r#   r   LOCK_FILE_NAME_ANYr$   r0   rl   DATA_FILES_NAMEr-   LOCK_FILE_NAME_DELIMITERr:   r8   r9   stripr   existsr   REUSE_DATASET_IF_EXISTSvaluer   warningrJ   rH   _download_and_prepare)rL   r   rq   download_kwargstarget_cache_dirrs   version_namer0   lock_file_nameslock_file_name	lock_pathdata_existss               rQ   download_and_prepare&CsvDatasetBuilder.download_and_prepare   s    &55??//55
(;;J!1199*==L&&)<<K >>?z99FFG|,{+z*(AAFF GGLL""?#B#BCW$&	 i ''..)9:K}0T0T0Z0ZZ&tyyk4D3EQGIKK-dii[;K:LANO&&% ' D !  s   7B H  
H.c           	         SS K nUR                  R                  nU[        R                  R
                  :X  a%  UR                  USS9  [        R                  " USS9  U R                  R                  5        VVs0 s H  u  pVU[        R                  " Xd5      _M     snnU l        UR                  U R                  5      U l        g s  snnf )Nr   T)ignore_errors)exist_ok)shutilr   r(   r   FORCE_REDOWNLOADr   rmtreer8   makedirsr2   rD   r   fetch_meta_files_from_urlrK   rk   r3   r7   )rL   rq   r   r   r   rN   rO   s          rQ   r   'CsvDatasetBuilder._download_and_prepare   s    %55??L99???MM*$M?KK(48 ,,224%
4 v//DD4%
!
  *>> !%
s   ;#Cc                   ^ [         R                  " USU R                  S9n/ nUR                  R	                  5        H,  nUR                  S5      (       d  M  UR                  U5        M.     U R                  R                  US5      mU H  n[        T[        5      (       ah  [        T5      S:  aY  UR                  S   [        T5      :w  a7  [        R                  SU SUR                  S    S[        T5       S	35        Mz  TX5'   M  [        T[        5      (       a"  T(       a  X5   R!                  U4S
 j5      X5'   M  [        R#                  SU 35        M     [$        R&                  R)                  U5      n[+        US9$ )NFrw   r   ry    r   z,Number of lines in meta-csv file for split 'z' (z&) does not match number of data-files(z)!c                 D   > [         R                  R                  TU 5      $ r{   r|   )r}   base_extracted_dirs    rQ   r~   ;CsvDatasetBuilder._convert_csv_to_dataset.<locals>.<lambda>   s    bggll+=qAr[   zNothing to do for field )arrow_table)r   r   r>   columnstolistr   rl   r7   rp   rb   r@   lenshaper   r   ra   r   r   r   r   r   r   )rL   rs   csv_file_pathr   r   r   pa_datar   s          @rQ   _convert_csv_to_dataset)CsvDatasetBuilder._convert_csv_to_dataset   sh   [[ET5G5GI **++-J""7++ ''
3 . 04/C/C/G/G0*J,   $'(:$;a$?88A;#&8"99LLFzlRUVXV^V^_`VaUb c??BCU?V>WWY[
 &8BN.449K!#!5!5A"C !9*FG +  ((&&r*7++r[   c                     [        U R                  R                  5        VVs0 s H  u  pXR                  X5      _M     snn5      $ s  snnf r{   )r   rK   rD   r   )rL   rN   rO   s      rQ   
as_datasetCsvDatasetBuilder.as_dataset   sN    11779
9 ++A119
  	 
s   A
)r<   r.   r>   r-   r5   rK   r2   rJ   r/   r#   r7   r0   r$   r3   )__name__
__module____qualname____firstlineno__r   rG   r   rY   ra   rW   rt   r   r   r   r   r   r   __static_attributes____classcell__rP   s   @rQ   r   r   &   sd    +3/C +3Z *C   )-%)%>  DG ,&6%DN! ,>K  r[   r   c                   4    \ rS rSrS\4S jrS rS rS rSr	g)	TaskSpecificDatasetBuilder   r    c                    UR                   U l        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  =(       d-    [        UR                  R                  R                  5       5      n[        X R
                  S9U l        UR                  R                  U l        UR                  R                  U l        S U l        S U l        ["        R$                  " SUR                   05      U l        [(        R*                  R-                  UR.                  5      U l        U R3                  5       U l        UR                  R6                  U l        g )Nr"   rI   )r-   rJ   r0   r/   r#   r$   r@   r1   rA   rB   r   r*   r2   r+   r3   r7   r_   r   	from_dictrH   r8   r9   
expanduserr.   rV   rY   
_cache_dirmeta_args_map_config_kwargs)rL   r    r#   s      rQ   rG   #TaskSpecificDatasetBuilder.__init__  s!    +77	1==/99+11
-55

 Dd#9#J#J#;#;DDFD./	0AAQQ4EETT#))3@@AC	!ww11"11 3//14EESSr[   c                 r   [         R                  R                  U R                  U R                  R                  [         R                  S5      S-   5      n[        U5         [         R                  R                  U R                  5      nU(       aN  U[        R                  :X  a:  [        R                  SU R                   SU R                   S35         S S S 5        g [        R                  SU R                   SU R                   S35        S S S 5        U R                  US9  g ! , (       d  f       N= f)N_r   r   r   r   r   )rq   )r8   r9   r:   rV   r   replacesepr   r   r   r   r   r   rJ   rH   r   )rL   r   rq   r   r   r   s         rQ   r   /TaskSpecificDatasetBuilder.download_and_prepare  s     GGLL  OO##BFFC07:<	 i ''..9K}0T0TT&tyykDOO3DAFH !  KK-dii[4??:K1MN ! 	""j"9 ! s   !A6D( 0D((
D6c                 D    UR                  U R                  5      U l        g r{   )rk   r3   r7   )rL   rq   s     rQ   r   0TaskSpecificDatasetBuilder._download_and_prepare'  s    )>> !r[   c                 B    [        U R                  U R                  5      $ r{   )r   r7   r   )rL   s    rQ   r   %TaskSpecificDatasetBuilder.as_dataset+  s    t33T5H5HIIr[   )r   rV   r   r_   r+   r*   rH   rJ   r/   r#   r7   r0   r$   r3   N)
r   r   r   r   r   rG   r   r   r   r   r,   r[   rQ   r   r      s!    T/C T.:!Jr[   r   c                      ^  \ rS rSrS\4U 4S jjr\S\S\R                  4S j5       r	S\
S\\\\4   \4   4S jrS\
4S jrS\4S	 jrS
 rS\SS4S jr\S\S\S\4S j5       rSrU =r$ )IterableDatasetBuilderi/  r    c           	      |  > UR                   U l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  R                  U l        UR                  R                  U l        UR                  U l
        UR                  U l        [        R                  R                  U R                  U R                  U R                   U R                  [        R                   5      U l        [$        U l        [(        U R                  ;   a  U R                  [(           U l        U R
                  =(       d-    [+        UR                  R,                  R/                  5       5      n[1        X R                  S9n[2        TU ]h  " SU R"                  U R                   U R                  US S.U R                  D6  U R                   U R6                  l        [;        U R                   5      U l        S U l        UR                  R@                  U l         g )Nr"   )r(   r-   r)   r*   r+   r,   )!r-   r.   r/   r$   r0   r#   r1   r2   r3   r4   r5   stream_batch_sizer8   r9   r:   r   r;   r<   r=   r>   r?   r@   rA   rB   r   rF   rG   rH   rI   r   rJ   meta_csv_dfmeta_cache_dir)rL   r    r#   rM   rP   s       rQ   rG   IterableDatasetBuilder.__init__1  s   2??4CC/99-551==+11
5FFVV4EETT#9#G#G !7!I!I!ww||D,?,?,0NND<M<M,0LL,;,E,E G 3T555!%!9!9.!ID

 Dd#9#J#J#;#;DDFD1/ 	 	(****	( &&	( "&!2!2		*4+<+<=	4EETTr[   r\   c                     [        U S9nU$ )N)r    )r   )r    builder_instances     rQ   get_builder_instance+IterableDatasetBuilder.get_builder_instanceY  s     2#9;r[   rq   c                 v   [        U [        [        45      (       d  [        SU R                   S35      e[        U R                  5      (       + nU(       d,  [        S[        U R                  5      R                   S35      eU R                  U5        U R                  U5       Vs0 s H  nUR                  U_M     nnUR                  R                  nUc  UnO$XT;   a  XE   nO[        SU S[        U5       35      e[        U R                   USS9n[        U["        5      (       a  [%        U5      nU$ s  snf )	NzBuilder z is not streamable.z(Loading a streaming dataset cached in a z is not supported yet.zBad split: z. Available splits: T)	map_tuple)rb   r   r   rj   rJ   r   _fsNotImplementedErrorr   r   _check_manual_downloadrt   r   r#   r@   r   _as_streaming_dataset_singler6   r
   )rL   rq   is_localsgsplits_generatorsr#   splits_generatorstreaming_datasetss           rQ   as_streaming_dataset+IterableDatasetBuilder.as_streaming_dataset`  sH    $!68I JKKx		{2EFGG+DHH55%:4>;R;R:SSij  	##J/ ,,Z8
8 GGRK8 	 
 **00=0'07eW$8>O9P8QR 
 (--

 ($//!45G!H!!1
s   &D6c           
         / nSnSnU R                   (       a,  [        [        U R                   R                  5       5      5      nU R                  (       a,  [        [        U R                  R                  5       5      5      nU(       aT  U(       dM  U R                   R                  5        H-  u  pVUR                  [        R                  " UU/ US.S95        M/     U$ U(       a  U(       a  U R                  R                  5        H`  u  pW[        U[        5      (       a  U/nU R                   R                  U5      nUR                  [        R                  " UUUUS.S95        Mb     U$ U(       dl  U(       ae  U R                  R                  5        HE  u  pW[        U[        5      (       a  U/nUR                  [        R                  " USUUS.S95        MG     U$ SU R                   S3e)Nr   )metarf   rq   rh   +Neither column meta nor data file found in z#.json, specify at least one column.)r2   nextitervaluesr3   rD   rl   rm   rn   rb   ra   rp   r-   )rL   rq   rr   meta_data_filezip_data_filers   meta_file_urlrf   s           rQ   rt   (IterableDatasetBuilder._split_generators  s   !$t';';'B'B'D"EFN d&9&9&@&@&B!CDM--1-A-A-G-G-I)
++'$1%'*4$ .JP = %)%8%8%>%>%@!
eS))"GE $ 4 4 8 8 D++'$1%**4$	 &A: !  M%)%8%8%>%>%@!
eS))"GE++'$&%**4$ &A  @@Q@Q?RRuvvr[   c                 x    U R                  U5      n[        UU R                  UR                  U R                  S9$ )N)rH   r#   r   ) _get_examples_iterable_for_splitr   rH   rJ   r   )rL   r   ex_iterables      rQ   r   3IterableDatasetBuilder._as_streaming_dataset_single  s>    
 ;;<LM$"''"44	6 	6r[   c              +     #    UR                  S5      nUR                  S5      nUR                  S5      n[        5       nSnSnU(       aW  [        [        [	        U5      5      5      nUR                  S5      (       a$  Sn[        R                  R                  U5      S   nU(       aH  U(       dA  U R                  U5        [        R                  R                  U R                  5      n	SU	4v   g U(       a  U(       a  U R                  U5        U(       aB  UR                  U R                  U R                   U R"                  U5      n
XR$                  l        [        R                  R                  U R                  5      n	SU	4v   g U(       d/  U(       a(  [        R                  R)                  S	U05      n	SU	4v   g S
U R                   S3e7f)Nr  rf   rq   Fr   z.zipTr   z
Input:FILEr  z.json .)rp   r   ra   r  r  r   r8   r9   splitext_get_meta_csv_dfr   r   r   r   &get_dataset_access_config_for_unzippedr-   r/   r$   r   
oss_configfrom_pydict)rL   ri   r  rf   rq   hub_apiis_zipzip_file_namezip_filer   oss_config_for_unzippeds              rQ   r   'IterableDatasetBuilder._generate_tables  sp    "v.w'^^L1
(4U,-H  (( " 0 0 :1 =!!-0xx++D,<,<=HX+u!!-0*1*X*X%%t~~t||!+#' 9P**5xx++D,<,<=HX+5xx++\5,ABHX+ @@Q@Q?RRYZZs   GGr  Nc                     U R                   b  U R                   R                  (       aF  [        R                  " XR                  5      n[
        R                  " USU R                  S9U l         g g )NFr   )r   emptyr   r   r   r   r   r>   )rL   r  meta_csv_file_paths      rQ   r  'IterableDatasetBuilder._get_meta_csv_df  s[    #t'7'7'='=!'!A!A22"4!{{",, .D (>r[   headerstextsr   c                     0 nU R                  U5      n [        S[        U 5      5       H7  n/ nU H&  nUR                  UR                  U5      U   5        M(     XSX   '   M9     U$ )Nr   )r#   ranger   rl   )r"  r#  r   residxcol_listlines          rQ   trans_data_to_mapping,IterableDatasetBuilder.trans_data_to_mapping  sd    --	*CL)CH

9 5c :;  (	 *
 
r[   )r<   r.   r>   r-   r5   r   r   r2   rJ   r/   r#   r   r0   r$   r3   )r   r   r   r   r   rG   staticmethodr   Csvr   r   r   r   ra   r	   r  rt   r   r   r   r  r@   r*  r   r   r   s   @rQ   r   r   /  s    &U/C &UP  $8 =@WW   &"6&"	tC()?:	;&"P1,H 1f
6 

6'[R.c .d . s 4 C  r[   r   )3r8   typingr   r   rm   pandasr   pyarrowr   r   r   r   r   r	   r
   datasets.filesystemsr   datasets.infor   datasets.namingr   datasets.packaged_modulesr   datasets.utils.filelockr   datasets.utils.py_utilsr   modelscope.hub.apir   4modelscope.msdatasets.context.dataset_context_configr   !modelscope.msdatasets.dataset_clsr   r   /modelscope.msdatasets.download.download_managerr   )modelscope.msdatasets.utils.dataset_utilsr   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   r   r?   r=   r-  r   r   r   r,   r[   rQ   <module>r>     s    
    + + 6 % 2 ) , . %F!F F .	 V Vr-J!2 -J`SSWW Sr[   