
    9iA0                         S SK r S SKrS SKrS SKJr  S SKrS SKrS SKJr  S SK	J
r
  S SKJr  S SKJrJrJrJr  S SKJr  S SKJrJr  \" 5       r " S	 S
\5      r " S S\5      r " S S\5      rg)    N)islice)IterableDataset)tqdm)MaxComputeUtil)DEFAULT_MAXCOMPUTE_ENDPOINTEXTENSIONS_TO_LOADMaxComputeEnvsVirgoDatasetConfig)
get_logger)fetch_csv_with_url	valid_urlc                   0    \ rS rSrSrS rS rS rS rSr	g)	ExternalDataset   z"Dataset class for custom datasets.c           	         Xl         [        R                  " U5      U l        U R                  R	                  SU R                   05        S U l        U R                   R                  5        VVs0 s H  u  p4U/ _M
     snnU l        0 U l        SnU R                   R                  5        GHU  u  pg[        U[        5      (       d  M  [        R                  R                  U5      (       d  MC  [        R                  " U5      n[        U V	s/ s H4  n	[        R                  R!                  U	5      S   R#                  S5      PM6     sn	5      n
SU
;   a  M  [%        U
5      S:w  aB  SR'                  [(        R*                  " 5       5      n[,        R/                  SU SU S	35        M  [1        U
5      S
   nU[(        ;  a  GM  U V	s/ s H"  n	[        R                  R'                  Xy5      PM$     nn	XR                  U'   GMX     U(       a?  [(        R2                  " U5      n[4        R6                  " U4SU R                  0UD6U l        g g s  snnf s  sn	f s  sn	f )Nsplit_config .   ,zSplit-z has been ignored, please flatten your folder structure, and make sure these files have same extensions. Supported extensions: z .r   
data_files)split_path_dictcopydeepcopyconfig_kwargsupdatespec_extension_datasetitemssplit_data_files
custom_map
isinstancestrospathisdirlistdirsetsplitextstriplenjoinr   keysloggererrorlistgetdatasetsload_dataset)selfr   r   k_file_ext
split_name	split_dirsplit_file_names	file_nameset_files_extssupported_extssplit_file_pathss                i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/msdatasets/dataset_cls/dataset.py__init__ExternalDataset.__init__   s(   .!]]=9!!>43G3G"HI&*# ,,224!
4 rE4!
  %)%9%9%?%?%A!J)S))bggmmI.F.F#%::i#8 !$%5&%5	 GG$$Y/399#>%5& " '~&!+%(XX.@.E.E.G%HNLL  -11?0@DE /2#55 &6$%5	 GGLL6%5 ! $ 5E%%j13 &B6 )--h7H*2*?*?+M%)%:%:+M>K+MD' G!
&$$s   -I	
;I
)Ic                     U R                   (       d  [        U R                  5      $ U R                   R                  5       $ N)r   r+   r   __len__r4   s    r?   rD   ExternalDataset.__len__H   s>     ,,   
 	
262M2M2U2U 3
	
    c                     U R                   (       d  U R                  R                  U5      $ U R                   R                  U5      $ rC   )r   r   r1   __getitem__)r4   items     r?   rI   ExternalDataset.__getitem__N   s9    **''++D11..::4@@rG   c              #      #    U R                   (       d)  U R                  R                  5        H
  u  pX4v   M     g U R                   R                  5        H
  u  pX4v   M     g 7frC   )r   r   r   )r4   r5   vs      r?   __iter__ExternalDataset.__iter__T   sT     **,,224d
 5 3399;d
 <s   A$A&)r   r!   r   r    r   N)
__name__
__module____qualname____firstlineno____doc__r@   rD   rI   rN   __static_attributes__ rG   r?   r   r      s    ,,M\
ArG   r   c                   R   ^  \ rS rSrSrS
U 4S jjrS rS rS rS r	SS jr
S	rU =r$ )NativeIterableDataset]   z&The modelscope iterable dataset class.c                 .   > [         TU ]  XUS9  X@l        g )N)ex_iterableinfosplit)superr@   stream_batch_size)r4   r[   r\   r]   r_   	__class__s        r?   r@   NativeIterableDataset.__init__`   s    [5I!2rG   c              #      #    [        U R                  U R                  SS9SU R                  SS9 H  nU R	                  U5      nUv   M     g 7f)NF
batch_sizedrop_last_batchzOverall progressT)desctotaldynamic_ncols)r   iterr_   n_shards_download_item)r4   rJ   rets      r?   rN   NativeIterableDataset.__iter__d   sU     		#55u  N'mm"$D %%d+CI$s   AAc                     U R                   $ rC   )rj   rE   s    r?   rD   NativeIterableDataset.__len__o   s    }}rG   c           	   #   :  #    [        U[        5      (       a
  UnUS-   nSnO$UR                  nUR                  nUR                  nUb  US::  a  [        S5      e[        [        U R                  SSS9UX45      SSS	9 H  nU R                  U5      nUv   M     g7f)
zP
Returns the item at index `index` in the dataset. Slice indexing is supported.
r   Nr   zstep must be positiveFrc   zSlicing progressT)rf   rh   )
r"   intstartstopstep
ValueErrorr   r   ri   rk   )r4   indexrr   rs   rt   rJ   rl   s          r?   rI   !NativeIterableDataset.__getitem__r   s      eS!!E19DDKKE::D::D	455IIEIBE  ("$D %%d+CI$s   BBc                    0 n[        U[        5      (       a   UR                  5        H  u  p4XBU'   UR                  S5      (       d  M!  U R                  R
                  R                  S5      nUR                  U5      n[        U[        5      (       a  U/nXbU'   XBUR                  S5      '   M     U$ UnU$ ! [         a"  n[        R                  U5        Un S nAU$ S nAff = f)Nz:FILE
dl_manager)r"   dictr   endswith_ex_iterablekwargsr1   download_and_extractr#   r*   	Exceptionr.   r/   )r4   rJ   rl   r5   rM   ry   ex_cache_pathes           r?   rk   $NativeIterableDataset._download_item   s    dD!! JJLDAFzz'**%)%6%6%=%=%A%A,%O
(2(G(G(J%mS99-:OM!.A01AGGG,- )  
 C
  Q 
s   -B9 
A)B9 9
C%C  C%c                     / nUS::  a  U$ SnU R                  5        H!  nX1:  a    U$ UR                  U5        US-  nM#     U$ )z
Returns the first n rows of the dataset.

Args:
    n (int): Number of rows to return.

Returns:
    list: The list of results, e.g. [{'id': 'abc123', 'text': 'hello world'}, ...]
r   r   )rN   append)r4   nresiter_numrJ   s        r?   headNativeIterableDataset.head   sW     6JMMOD} 
 JJtMH	 $
 
rG   )r_   )r   )   )rP   rQ   rR   rS   rT   r@   rN   rD   rI   rk   r   rU   __classcell__)r`   s   @r?   rX   rX   ]   s)    03	4, rG   rX   c                       \ rS rSrSrS rS rS rS r\	S\
R                  4S j5       rS	 r\S\4S
 j5       rS rSrg)VirgoDataset   a  Dataset class for Virgo.

Attributes:
    _meta_content (str): Virgo meta data content, could be a url that contains csv file.
    _data_type (int): Virgo dataset type, 0-Standard virgo dataset; Others-User define dataset (to be supported)

Examples:
    >>> from modelscope.msdatasets.dataset_cls.dataset import VirgoDataset
    >>> input_kwargs = {'metaContent': 'http://xxx-xxx/xxx.csv', 'samplingType': 0}
    >>> virgo_dataset = VirgoDataset(**input_kwargs)
    >>> print(virgo_dataset[1])
    >>> print(len(virgo_dataset))
    >>> for line in virgo_dataset:
    >>>     print(line)

    Note: If you set `download_virgo_files` to True by using
        MsDataset.load(dataset_name='your-virgo-dataset-id', hub=Hubs.virgo, download_virgo_files=True),
        you can get the cache file path of the virgo dataset, the column name is `cache_file`.
    >>> if virgo_dataset.download_virgo_files:
    >>>     print(virgo_dataset[1].get('cache_file'))
c                 0   SU l         SU l        SU l        S U l        S U l        Xl        [        R                  " 5       U l        U R
                  R                  [        R                  S5      U l         U R
                  R                  [        R                  S5      U l        U R                  5         U R                  5         SU l        SU l        SU l        S U l        S U l        U R
                  R                  SS5      U l        U R
                  R                  SS 5      U l        U R
                  R                  SS5      U l        U R                  (       a>  U R                  R/                  U R                  U R                  5      u  U l        U l        g g )Nr   r   Fodps_batch_sized   
odps_limitodps_drop_last)_meta_content	data_typeodps_table_nameodps_table_partition_odps_utilsr   pd	DataFrame_metapopr
   meta_contentsampling_type_check_variables_parse_metameta_content_cache_filevirgo_cache_dirdownload_virgo_filesodps_table_insodps_reader_insr   r   r   get_table_reader_ins)r4   r}   s     r?   r@   VirgoDataset.__init__   sX   "$$&)-!+/##%<<>
!//33++R1++//,,a1 	')$!*/!"##11556GM,,00tD"00445EuM8<8H8H8]8]$$d&?&?9A5D!5 rG   c           
      n   U R                   (       a~  [        R                  " U R                   UU R                  U R                  U R
                  U R                  R                  R                  U R                  R                  R                  S9$ U R                  R                  U   R                  5       $ )N)readerrv   batch_size_inlimit_indrop_last_in
partitionscolumns)r   r   gen_reader_itemr   r   r   r   table_schemar   namesr   ilocto_dict)r4   rv   s     r?   rI   VirgoDataset.__getitem__   s    !11++"22!00..;;FF++88>>@ @ zzu%--//rG   c                     [        U R                  [        5      (       a  U R                  R                  SS5      $ [	        U R                  5      $ )N	odpsCountr   )r"   r   rz   r1   r+   rE   s    r?   rD   VirgoDataset.__len__   s6    djj$''::>>+q114::rG   c           	   #     #    U R                   (       a  [        R                  " U R                   U R                  U R                  U R
                  U R                  R                  R                  U R                  R                  R                  S9nU H  nUv   M	     g U R                  R                  5        H  u  p4UR                  5       v   M     g 7f)N)r   r   r   r   r   r   )r   r   gen_reader_batchr   r   r   r   r   r   r   r   iterrowsr   )r4   odps_batch_databatchr6   rows        r?   rN   VirgoDataset.__iter__  s     ,==++"22!00..;;FF++88>>@O ) ) **--/kkm# 0s   CCreturnc                     U R                   $ )z
Virgo meta data. Contains columns: id, meta_info, analysis_result, external_info and
    cache_file (if download_virgo_files is True).
)r   rE   s    r?   metaVirgoDataset.meta  s     zzrG   c                    [        U R                  [        5      (       a6  [        U R                  5      (       a  [	        U R                  5      nXl        g [        U R                  [        5      (       ai  U R                  U l        U R
                  R                  SS5      U l        U R
                  R                  SS 5      U l	        U R                  5       U l        g Se)NodpsTableNamer   odpsTablePartitionz%The meta content must be url or dict.)r"   r   r#   r   r   r   rz   r1   r   r   _get_odps_infor   )r4   meta_content_dfs     r?   r   VirgoDataset._parse_meta  s    d((#..9""4$ 4$01C1CDO(J**D11++DJ#'::>>/2#FD (,

7K7;)=D%#224D99rG   c                  \   [         R                  R                  [        R                  S5      n [         R                  R                  [        R
                  S5      n[         R                  R                  [        R                  S5      n[         R                  R                  [        R                  [        5      nU (       a  U(       a  U(       dP  [        S[        R                   S[        R
                   S[        R                   S[        R                   S3	5      e[        XX#5      $ )zU
Get MaxComputeUtil instance.

Args:
    None

Returns:
    MaxComputeUtil instance.
r   z&Please set MaxCompute envs for Virgo: z, z6(default: http://service-corp.odps.aliyun-inc.com/api))r$   environr1   r	   	ACCESS_IDACCESS_SECRET_KEYPROJECT_NAMEENDPOINTr   ru   r   )	access_id
access_key	proj_nameendpoints       r?   r   VirgoDataset._get_odps_info'  s     JJNN>#;#;R@	ZZ^^N$D$DbI
JJNN>#>#>C	::>>."9"9"=? 
)89Q9Q8RRT!334B~7R7R6SSU!**++ac  iYIIrG   c                 
   U R                   (       d  SeU R                  S;  a  SeU R                  S:X  a  [        U R                   5      (       d  SeU R                  S:X  a"  [        U R                   [        5      (       d  Segg)	zCheck member variables in this class.
1. Condition-1: self._meta_content cannot be empty
2. Condition-2: self._meta_content must be url when self._data_type is 0
z"Them meta content cannot be empty.)r   r   zFSupported samplingType should be 0 or 1, others are not supported yet.r   z1The meta content must be url when data type is 0.r   z2The meta content must be dict when data type is 1.N)r   r   r   r"   rz   rE   s    r?   r   VirgoDataset._check_variablesA  st    
 !!66>>'ZZ>>Qy1C1C'D'DEE>>Qz$2D2Dd'K'KFF (LrG   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   N)rP   rQ   rR   rS   rT   r@   rI   rD   rN   propertyr   r   r   r   staticmethodr   r   r   rU   rV   rG   r?   r   r      se    ,A@
0
$ bll  : JN J J2GrG   r   )r   mathr$   	itertoolsr   r2   pandasr   r   	tqdm.autor   ,modelscope.msdatasets.utils.maxcompute_utilsr   modelscope.utils.constantr   r   r	   r
   modelscope.utils.loggerr   modelscope.utils.url_utilsr   r   r.   objectr   rX   r   rV   rG   r?   <module>r      si      	    $  G; ; / D	Cf CLYO YxTG6 TGrG   