
    9i                        S SK r S SKrS SKJrJrJrJrJrJrJ	r	J
r
Jr  S SKrS SKJrJrJrJrJr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJrJrJ r   S S	K!J"r"J#r#  S S
K$J%r%  S SK&J'r'  S SK(J)r)  S SK*J+r+  S SK,J-r-  S SK.J/r/J0r0  S SK1J2r2  S SK3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;J<r<J=r=  S SK>J?r?J@r@  S SKAJBrB  \B" 5       rCS\4S jrD " S S5      rEg)    N)	AnyCallableDictIterableListMappingOptionalSequenceUnion)DatasetDatasetDictFeaturesIterableDatasetIterableDatasetDict)_PACKAGED_DATASETS_MODULES)is_relative_path)DatasetRepository)DatasetContextConfig)LocalDataLoaderManagerLocalDataLoaderTypeRemoteDataLoaderManagerRemoteDataLoaderType)ExternalDatasetNativeIterableDataset)build_custom_dataset)DatasetDeleteManager)load_dataset_with_ctx)DatasetUploadManager)build_preprocessor)Config
ConfigDict)MS_DATASETS_CACHE)
DEFAULT_DATASET_NAMESPACEDEFAULT_DATASET_REVISIONREPO_TYPE_DATASETConfigFieldsDatasetFormationsDownloadModeHubsModeKeysTasks
UploadMode)is_tf_availableis_torch_available)
get_loggerreturnc                     U c  / n U $ [        U [        5      (       a  U /n U $ [        [        U 5      5      [        U 5      :  a  [	        SU  35      eU $ )Nz"List columns contains duplicates: )
isinstancestrlenset
ValueError)paras    `/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/msdatasets/ms_dataset.pyformat_listr9   )   s^    |
 K	 
D#		v K 
SY#d)	#=dVDEEK    c            +       d   \ rS rSr% SrSrSr\\S'    SLS\	\
\\\4   S\\   4S jjrS rS	 rS
 r\S 5       r\S 5       r\ SLS\	\
\\4   S\S\	\S 4   4S jj5       r\ SLS\	\
\\\\\4   S\S\	\S 4   4S jj5       r\\S\\ RB                  SSSS\"RF                  \$SSS\%" 5       SSS4S\	\\&4   S\\   S\\   S\\   S\\    S\\   S\\   S\\   S\\	\\'\   \(\\	\\'\   4   4   4      S\\"   S\\   S\\)   S\\*   S\\+   S \\%   S!\\   S"\\*   S#\\*   S\	\S \4   4&S$ jj5       r,\\\SSS%\-R\                  4S&\S'\S\S\\   S\\   S(\\+   S)\\+   S*\\*   S+\\-   SS4S, jj5       r/\\SS4S-\S.\S/\\   S0\\   S1\\   SS4S2 jj5       r0\\SSS4S-\S3\S/\\   S0\\   S1\\   S4\*SS4S5 jj5       r1\\\4S&\S\S\\   S\\   S\4
S6 jj5       r2     SMS7\	\\3\   4   S8\	\4\3\4   4   S9\S:\5S;\*4
S< jjr6       SNS=\+S>\*S8\	\4\3\4   4   S7\	\\3\   4   S?\4S@\*SA\7\\84   SB\	\\3\   4   SC\*4SD jjr9S\
4SE jr:SF\7\\4   S\
4SG jr;  SOS8\	\4\3\4   4   S7\	\\3\   4   S;\*4SH jjr<    SPS=\+S>\*S8\	\4\3\4   4   S@\*SC\*SB\	\\3\   4   S7\	\\3\   4   4SI jjr=  SQS \%4SJ jjr>SKr?g)R	MsDataset3   a  
ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
provide efficient data access and local storage managements. On top of
that, MsDataset supports the data integration and interactions with multiple
remote hubs, particularly, ModelScope's own Dataset-hub. MsDataset also
abstracts away data-access details with other remote storage, including both
general external web-hosted data and cloud storage such as OSS.
N_dataset_context_configds_instancetargetc                     Xl         UbU  X R                   R                  ;  a<  [        S[        U R                   R                  R	                  5       5       SU 35      eX l        SU l        g )Nz)"target" must be a column of the dataset(z
, but got F)_hf_dsfeatures	TypeErrorlistkeysr@   	is_custom)selfr?   r@   s      r8   __init__MsDataset.__init__@   se     "&0D0D"D;DAUAUAZAZA\<];^^hiohpq  r:   c              #   x   #    U R                    H&  nU R                  b  XR                     v   M"  Uv   M(     g 7fN)rB   r@   )rH   items     r8   __iter__MsDataset.__iter__L   s/     KKD{{&;;''
	  s   8:c                      U R                   U   $ rL   rB   )rH   keys     r8   __getitem__MsDataset.__getitem__S   s    {{3r:   c                 ,    [        U R                  5      $ rL   )r4   rB   rH   s    r8   __len__MsDataset.__len__V   s    4;;r:   c                     U R                   $ rL   rQ   rV   s    r8   r?   MsDataset.ds_instanceY   s    {{r:   c                 n    [        U R                  [        5      (       a  U R                  R                  $ g rL   )r2   rB   r   config_kwargsrV   s    r8   r\   MsDataset.config_kwargs]   s&    dkk?33;;,,,r:   hf_dsr0   c           	         [         R                  " S[        5        [        U[        5      (       a  U " X5      $ [        U[
        5      (       aq  [        UR                  5       5      S:X  a)  U " [        [        UR                  5       5      5      U5      $ UR                  5        VVs0 s H  u  p4X0" XB5      _M     snn$ [        U[        5      (       a  U " U5      $ [        S[        U5       35      es  snnf )zt
@deprecated
This method is deprecated and may be removed in future releases, please use `to_ms_dataset()` instead.
z@from_hf_dataset is deprecated, please use to_ms_dataset instead.   z2"hf_ds" must be a Dataset or DatasetDict, but got )warningswarnDeprecationWarningr2   r   r   r4   rF   nextitervaluesitemsr   rD   type)clsr^   r@   kvs        r8   from_hf_datasetMsDataset.from_hf_datasetd   s     	N	  eW%%u%%{++5::< A%4U\\^ 45v>>27++-@-$!As1~%-@@//u:DT%[MR 	 As   'C3c           	      H   [        U[        5      (       a  U " X5      $ [        U[        5      (       aq  [        UR	                  5       5      S:X  a)  U " [        [        UR                  5       5      5      U5      $ UR                  5        VVs0 s H  u  p4X0" XB5      _M     snn$ [        U[        5      (       a  U " U5      $ [        U[        5      (       a  U " U5      $ [        U[        5      (       a  U " U5      $ [        U[        5      (       aq  [        UR	                  5       5      S:X  a)  U " [        [        UR                  5       5      5      U5      $ UR                  5        VVs0 s H  u  p4X0" XB5      _M     snn$ [        S[        U5       35      es  snnf s  snnf )z&Convert input to `MsDataset` instance.r`   z8"ds_instance" must be a Dataset or DatasetDict, but got )r2   r   r   r4   rF   rd   re   rf   rg   r   r   r   r   rD   rh   )ri   r?   r@   rj   rk   s        r8   to_ms_datasetMsDataset.to_ms_dataset|   sl    k7++{++[11;##%&!+4[%7%7%9 :;VDD2=2C2C2EF2E$!As1~%2EFF_55{##%:;;{##_55{##%899;##%&!+4[%7%7%9 :;VDD2=2C2C2EF2E$!As1~%2EFFJ4P[K\J]^  G Gs   F)FFr`   dataset_name	namespaceversionhubsubset_namesplitdata_dir
data_filesdownload_mode	cache_dirrC   use_streamingstream_batch_size
custom_cfgtokendataset_info_onlytrust_remote_codec                 *
   U(       a  SSK Jn  U" 5       nUR                  U5        [        U	=(       d    [        R                  5      n	[        U=(       d    [
        R                  5      nU[
        R                  :H  n[        U [        5      (       d,  [        U [        5      (       d  [        S[        U 5       35      e[        U [        5      (       a0  Uc  Sn[        R                  " X 05      n[        R!                  UUS9$ ["        R$                  R'                  U 5      n ["        R$                  R)                  U 5      n[+        U 5      (       aj  U R-                  S5      S:X  aU  U(       dN  U(       dG  U R/                  S5      nUS   R1                  5       nUS   R1                  5       n U(       a  U (       d  S	eU(       a  [2        R5                  S
U  S35        [7        SU UUUUUUUUU	U
UUUS.UD6nU [8        ;   dH  ["        R$                  R;                  U 5      (       d$  ["        R$                  R=                  U 5      (       a|  [?        U5      RA                  [B        RD                  5      n[        R!                  UUS9n[        U[        5      (       a)  UUl#        U(       a  URH                  " SSU0UD6  SUl%        U$ U[
        R                  :X  a  SSK&J n  U" U 4UUUU	RN                  US.UD6$ U[
        R                  :X  Ga)  SSK Jn  U" 5       nURQ                  US-   U -   [R        S9nURU                  U UUS9u  nn[        U5      [        [V        RX                  RN                  5      :X  a6  [[        SUS-   U -   UUUUU
USU	RN                  UUUUUS.UD6 nUsSSS5        $ []        U5      n U RA                  [^        R`                  5      n[        R!                  UUS9n[        U[        5      (       a3  U Rb                  Ul#        U(       a  URH                  " SSU0UD6  SUl%        U$ U[
        Rd                  :X  a  [f        Rh                  " S[j        5        SSK6J7n!  SSK8J9n"  U[t        :X  a  U"Rv                  Ul<        U[z        :X  a  U"R|                  Ul?        U
[        :X  a/  SSKAJBn#  ["        R$                  R                  U#SSS5      n
U
UlD        U!" U5      n$U$R                  5         U$R                  $ Se! , (       d  f       g= f)a  Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

Args:
    dataset_name (str): Path or name of the dataset.
        The form of `namespace/dataset_name` is also supported.
    namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
        from Hubs.modelscope,
    namespace (str, optional):
        Namespace of the dataset. It should not be None if you load a remote dataset
        from Hubs.modelscope,
    target (str, optional): Name of the column to output.
    version (str, optional): Version of the dataset script to load:
    subset_name (str, optional): Defining the subset_name of the dataset.
    data_dir (str, optional): Defining the data_dir of the dataset configuration. I
    data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
    split (str, optional): Which split of the data to load.
    hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
    download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                   DownloadMode.REUSE_DATASET_IF_EXISTS
    cache_dir (str, Optional): User-define local cache directory.
    use_streaming (bool, Optional): If set to True, no need to download all data files.
                                    Instead, it streams the data progressively, and returns
                                    NativeIterableDataset or a dict of NativeIterableDataset.
    stream_batch_size (int, Optional): The batch size of the streaming data.
    custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
                               see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
    token (str, Optional): SDK token of ModelScope.
    dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
    trust_remote_code (bool, Optional): If set to True, trust the remote code. Default to `False`.
    **config_kwargs (additional keyword arguments): Keyword arguments to be passed

Returns:
    MsDataset (MsDataset): MsDataset object for a certain dataset.
r   )HubApiz.dataset_name must be `str` or `list`, but got Nr@   )r@   /r`   zUThe dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.z3Use trust_remote_code=True. Will invoke codes from z9. Please make sure that you can trust the external codes.)rq   rr   rs   ru   rv   r@   rt   rw   rx   ry   cache_root_dirr{   r|   r   r}   T)load_dataset)namerv   	streamingry   r   )repo_id	repo_type)rq   rr   endpoint)pathr   rw   rx   rv   rz   rC   download_configry   revisionr~   r   r   r   zMThe option `Hubs.virgo` is deprecated, will be removed in the future version.)VirgoDownloader)VirgoDatasetConfig)
CACHE_HOMEvirgort   datasetszPlease adjust input args to specify a loading mode, we support following scenes: loading from local disk, huggingface hub and modelscope hub. )Gmodelscope.hub.apir   loginr(   REUSE_DATASET_IF_EXISTSr)   
modelscopehuggingfacer2   r3   rE   rD   rh   r   	from_dictr<   ro   osr   
expanduserexistsr   countrv   striploggerwarningr   r   isdirisfiler   r   r   HF_DATA_LOADERr>   to_custom_datasetrG   r   valueget_endpoint_for_readr%   get_dataset_id_and_typer'   generalr   r   r   MS_DATA_LOADERdataset_context_configr   ra   rb   rc   -modelscope.msdatasets.data_loader.data_loaderr   modelscope.utils.constantr   r#   default_virgo_namespacerr   r$   default_dataset_versionrs   r"   modelscope.utils.config_dsr   joinr   processdataset)%rq   rr   r@   rs   rt   ru   rv   rw   rx   ry   rz   rC   r{   r|   r}   r~   r   r   r\   r   apiis_huggingface_hubdataset_instis_local_pathdataset_name_splitr   r   _apir   dataset_id_on_hubdataset_typedataset_resremote_dataloader_managerr   r   r   virgo_downloaders%                                        r8   loadMsDataset.load   s   x 1(CIIe$] &M(4(L(LN3)$//*!T%5%55,,,Zd6$ 6$@lAS@TU  lD))~!",,f-CDL**<*GGww)),7|4L))l.@.@//"/8J!-!3!3C!8*1-335I-a0668LLmmNNEl^ T4 45
 "6 "%#!'$'//" "$ 55: :!#!=!=1&((4'66)8  %22<2OL,	227M4 22 @#-@1>@-1L*D$$$-! '+11"3!  ! ! DOO#18D11!C,6+ 2 -H /3.J.J)#! /K /#+| < C(9(A(A(G(G$HH* )&_|;(!)#-#"+!)(,&3&9&9!(#"/*;*;) () -8&#) )* -D*-,)8EE(77 9(66   7  1lI66;T;k;kL8!$66 D'1D5BD15.##DJJMM9:LN VD553E3]3]&0221C1[1[&.--AGGLLWe)35	8A&5./EF$$&#+++Q Qu) )s    T
TTobject_namelocal_file_pathnum_processes	chunksizefilter_hidden_filesupload_modec	           	          [         R                  " S[        5        U (       d  [        S5      e[	        X#US9n	[        U=(       d    [
        R                  5      n[        R                  R                  U5      (       a  U	R                  U UUS9  g[        R                  R                  U5      (       a  U	R                  U UUUUUS9  g[        U S35      e)zs
@deprecated
This method is deprecated and may be removed in future releases, please use git command line instead.
zThe function `upload` is deprecated, please use git command or modelscope.hub.api.HubApi.upload_folder or modelscope.hub.api.HubApi.upload_file.zobject_name cannot be empty!rq   rr   rs   )r   r   r   )object_dir_namelocal_dir_pathr   r   r   r   z& is not a valid file path or directoryN)ra   rb   rc   r6   r   r,   	OVERWRITEr   r   r   uploadr   
upload_dir)
r   r   rq   rr   rs   r   r   r   r   _upload_managers
             r8   r   MsDataset.uploadq  s    	. 	8 :L		M ;<<.%GM !!D
0D0DE77>>/**""' /' # ) WW]]?++&& +.+#$7' ' ) "##IJL Lr:   dataset_work_dir
dataset_idr   
auth_tokengit_pathc                    [         R                  " S[        5        [        U UUUUS9nUR	                  5       nU(       a%  [
        R                  SR                  U5      5        g[
        R                  SR                  U5      5        g)a  Clone meta-file of dataset from the ModelScope Hub.

Args:
    dataset_work_dir (str): Current git working directory.
    dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
    revision (str, optional):
        revision of the model you want to clone from. Can be any of a branch, tag or commit hash
    auth_token (str, optional):
        token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
        as the token is already saved when you login the first time, if None, we will use saved token.
    git_path (str, optional):
        The git command line path, if None, we use 'git'
Returns:
    None
zWThe function `clone_meta` is deprecated, please use git command line to clone the repo.repo_work_dirr   r   r   r   zAlready cloned repo to: {}zRepo dir already exists: {}N)	ra   rb   rc   r   cloner   infoformatr   )r   r   r   r   r   _repoclone_work_dirs          r8   
clone_metaMsDataset.clone_meta  st    , 	e	  "*!! KK4;;NKLNN-44^DFr:   commit_messageforcec                 t    [         R                  " S[        5        [        U SUUUS9nUR	                  XUS9  g)a  Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.

Args:
    dataset_work_dir (str): Current working directory.
    commit_message (str): Commit message.
    revision(`Optional[str]`):
        revision of the model you want to clone from. Can be any of a branch, tag or commit hash
    auth_token(`Optional[str]`):
        token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
        as the token is already saved when you log in the first time, if None, we will use saved token.
    git_path:(`Optional[str]`):
        The git command line path, if None, we use 'git'
    force (Optional[bool]): whether to use forced-push.

Returns:
    None

zuThe function `upload_meta` is deprecated, please use git command or CLI `modelscope upload owner_name/repo_name ...`. r   )r   branchr   N)ra   rb   rc   r   push)r   r   r   r   r   r   r   s          r8   upload_metaMsDataset.upload_meta  sH    2 	C 		  "*! 	

.
Or:   c                 j    [        XUS9nUR                  U S9n[        R                  SU  S35        U$ )a"  Delete object of dataset. Please log in first and make sure you have permission to manage the dataset.

Args:
    object_name (str): The object name of dataset to be deleted. Could be a name of file or directory. If it's
        directory, then ends with `/`.
        For example: your-data-name.zip, train/001/img_001.png, train/, ...
    dataset_name (str): Path or name of the dataset.
    namespace(str, optional): Namespace of the dataset.
    version (str, optional): Version of the dataset.

Returns:
    res_msg (str): Response message.

r   )r   zObject z successfully removed!)r   deleter   r   )r   rq   rr   rs   _delete_managerresp_msgs         r8   r   MsDataset.delete  sB    & /%GM"))k)Bgk]*@ABr:   columnspreprocessors	task_namedata_config	to_tensorc                    [        5       (       d  [        S5      e[        U R                  [        5      (       aC  UR                  SU05        UR                  U R                  R                  5        [        XC5      $ Ub  U R                  X!US9$ U R                  R                  5         U R                  R                  SXS9  U R                  $ )a  Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
   torch.utils.data.DataLoader.

Args:
    preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
        every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
        will be used as a field of torch.utils.data.Dataset.
    columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
        `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
        If the `preprocessors` is not None, the output fields of processors will also be added.
    task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
    data_config (ConfigDict, default None): config dict for model object.
        Attributes of ConfigDict:
            `preprocessor` (Callable, List[Callable], optional): preprocessors to deal with dataset
            `type` (str): the type of task
            `split_config` (dict, optional): get the split config for ExternalDataset
            `test_mode` (bool, optional): is test mode or not
    to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not.
    format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
    :class:`torch.utils.data.Dataset`

z>The function to_torch_dataset requires pytorch to be installedpreprocessor)r   r   torch)rh   r   format_kwargs)r.   ImportErrorr2   rB   r   updater\   r   !_to_torch_dataset_with_processorsreset_format
set_format)rH   r   r   r   r   r   r   s          r8   to_torch_datasetMsDataset.to_torch_dataset  s    B "##P  dkk?33>?t{{889'??$99) : E E KK$$&KK""g # L;;r:   
batch_sizeshuffle
collate_fndrop_remaindercollate_fn_args
label_colsprefetchc
                    [        5       (       d  [        S5      eUb  U R                  UUUUU	UUS9$ Uc  [        R	                  S5        gU R
                  R                  5         U R
                  R                  UUUUUUUU	S9$ )ah  Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
   model.fit() or model.predict().

Args:
    batch_size (int): Number of samples in a single batch.
    shuffle(bool): Shuffle the dataset order.
    preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
        every sample of the dataset. The output type of processors is dict, and each field of the dict will be
        used as a field of the tf.data. Dataset. If the `preprocessors` is None, the `collate_fn`
        shouldn't be None.
    columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
        the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
        processors will also be added.
    collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If
        the `preprocessors` is None, the `collate_fn` shouldn't be None.
    drop_remainder(bool, default None): Drop the last incomplete batch when loading.
    collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`.
    label_cols (str or List[str], default None): Dataset column(s) to load as labels.
    prefetch (bool, default True): Prefetch data.

Returns:
    :class:`tf.data.Dataset`

z?The function to_tf_dataset requires Tensorflow to be installed.N)r   r   r   r   z?The `preprocessors` and the `collate_fn` should`t be both None.)r   r   r   r   )r-   r   _to_tf_dataset_with_processorsr   errorrB   r   to_tf_dataset)
rH   r   r   r   r   r   r   r   r   r   s
             r8   r  MsDataset.to_tf_datasetO  s    H   Q  $66-!% 7 ! ! LLQ   "{{(()+! )  	r:   c                 N    U R                   R                  5         U R                   $ rL   )rB   r   rV   s    r8   to_hf_datasetMsDataset.to_hf_dataset  s      "{{r:   column_mappingc                 l    U R                   R                  5         U R                   R                  U5      $ )z
Rename columns and return the underlying hf dataset directly
TODO: support native MsDataset column rename.
Args:
    column_mapping: the mapping of the original and new column names
Returns:
    underlying hf dataset
)rB   r   rename_columns)rH   r	  s     r8   remap_columnsMsDataset.remap_columns  s)     	  "{{)).99r:   c                   ^ [        U[        5      (       a  UOU/n[        U5      nU R                  R                  R                  5        Vs/ s H  oUU;   d  M
  UPM     nn/ n/ nU(       Ga  [        [        U R                  5      5      nU V	s0 s H  o[        R                  " X   5      _M     n
n	U HR  nU
R                  U" U5      R                  5        V	Vs0 s H  u  pU	[        R                  " U5      _M     snn	5        MT     S nU
R                  5        HO  n	U" X   5      (       d,  [        R                  SU	 S35        UR                  U	5        M>  UR                  U	5        MQ     SS Km " U4S jSTR                   R"                  R$                  5      nU" U R                  UUXrU5      $ s  snf s  sn	f s  snn	f )Nc                     [         R                  " U R                  [         R                  5      =(       d/    [         R                  " U R                  [         R                  5      $ rL   )np
issubdtypedtypeintegerfloating)r   s    r8   is_numpy_numberDMsDataset._to_torch_dataset_with_processors.<locals>.is_numpy_number  s;    }}U[["**= .KKB. .r:   zData of column z  is non-numeric, will be removedr   c                   J   >^  \ rS rSrS\4U 4S jjrS rU4S jrS rSr	U =r
$ )AMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataseti  r   c                    > [        [        5      R                  5         Xl        X l        X`l        X0l        X@l        XPl        g rL   )	superr<   rI   r   preprocessor_listr   retained_numeric_columnsretained_unumeric_columnsr   )rH   r   r  r  r  r   r   	__class__s          r8   rI   JMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__init__  s7     i ))+&):&!*0H-1J.&r:   c                 ,    [        U R                  5      $ rL   )r4   r   rV   s    r8   rW   IMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__len__  s    4<<((r:   c                 L   > U R                   (       a  TR                  U5      $ U$ rL   )r   	as_tensor)rH   xr   s     r8   type_converterPMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.type_converter  s    >> ??1--Hr:   c                    U R                   U   nU R                   Vs0 s H7  o0R                  (       a  X0R                  ;   d  M$  X0R	                  X#   5      _M9     nnU R
                   Hl  nU" U5      R                  5        HO  u  p6U R                  (       a  X0R                  ;   a  U R	                  U5      XC'   M:  X0R                  ;   d  MK  XdU'   MQ     Mn     U$ s  snf rL   )r   r   r   r  r%  r  rg   r  )rH   index	item_dictrj   resr   rk   s          r8   rS   MMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__  s     LL/	 "\\)..999 9A**9<88)  
 %)$:$:L ,Y 7 = = ? $ !%B%B B%)%8%8%;CF"@"@@%&F !@ %; 
s   #CC)r   r   r  r  r  r   )__name__
__module____qualname____firstlineno__r   rI   rW   r%  rS   __static_attributes____classcell__)r  r   s   @r8   MsMapDatasetr    s#    	' 	') r:   r2  )r2   rE   r9   rB   rC   rF   rd   re   r  arrayr   rg   r   r   appendr   utilsdatar   )rH   r   r   r   r  rR   r  r  samplerj   
sample_res	processorrk   r  r2  r   s                  @r8   r   +MsDataset._to_torch_dataset_with_processors  s    .84.! .!M'4o 	 g&  ;;//446
6C.C6 	 
 $& $&!$t{{+,F:AB'QRXXfi00'JB.	!!!*6!2!8!8!:<!: ^!:<= /
.  __&&z}55NN)!,LMO-44Q7(//2 ' 	$	5;;++33 $	L DKK):45	K 	KE
 C<s   	F8F8"F=#Gc                   ^ ^^^^^^ [        U[        5      (       a  UOU/m[        T5      m[        U5      n[        [        TU-   5      5      nT R                  R
                  R                  5        V	s/ s H  oU;   d  M
  U	PM     sn	mSS KmTR                  R                  R                  [        R                  " [        T R                  5      [        R                  S95      n
U(       a"  U
R                  [        T R                  5      S9n
SUUU 4S jjmT" SS5      mTR!                  TR#                  S TR                  5      /S9UUU4S j5       nSSKJn  U
R)                  XS	9n
T(       a  U4S
 jnU
R)                  U5      n
O![        U5      S:X  a  U
R)                  S 5      n
US:  a  U
R+                  XS9n
U(       a  U
R-                  U5      n
U
$ s  sn	f )Nr   )r  )buffer_sizec                   > [        U 5      n T Vs0 s H)  o"[        R                  " TR                  U    U   5      _M+     nnT H_  nUR	                  U" TR                  U    5      R                  5        VVs0 s H  u  p%U[        R                  " U5      _M     snn5        Ma     U(       a  U$ [        [        UR                  5       5      5      $ s  snf s  snnf rL   )	intr  r3  rB   r   rg   tuplerE   rf   )	ireturn_dictrj   r*  r   rk   r  retained_columnsrH   s	         r8   func6MsDataset._to_tf_dataset_with_processors.<locals>.func  s    AA;KL;Kabhht{{1~a011;KCL 1

 ,T[[^ < B B D D rxx{N D  !2 
cjjl+,, Ms   0C9#CT)input_signaturec           
        > TR                  TU /TR                  5        Vs/ s H(  nTR                  R                  UR                  5      PM*     snS9n[        T5       V Vs0 s H
  u  pX2U    _M     snn $ s  snf s  snn f )N)inpTout)numpy_functionrf   dtypesas_dtyper  	enumerate)r@  valoutputrR   rC  r8  tfs       r8   fetch_function@MsDataset._to_tf_dataset_with_processors.<locals>.fetch_function  s    &&C  *0022 II&&syy12 ' F 2;:1FG1FvqCN1FGG
 Hs   /A9$A>)AUTOTUNE)num_parallel_callsc                 2  > U R                  5        VVs0 s H  u  pUT;   d  M  X_M     nnn[        U 5      S:X  a"  [        [        U R	                  5       5      5      n [        U5      S:X  a"  [        [        UR	                  5       5      5      nX4$ s  snnf )Nr`   )rg   r4   rd   re   rf   )input_batchrR   tensorlabelsr   s       r8   split_features_and_labelsKMsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labels+  s     (3'8'8':':cZ>O  CK':   {#q("&tK,>,>,@'A"BKv;!#!$v}}"78F"**s
   BBr`   c                 F    [        [        U R                  5       5      5      $ rL   )rd   re   rf   )r$  s    r8   <lambda>:MsDataset._to_tf_dataset_with_processors.<locals>.<lambda>9  s    $tAHHJ7G2Hr:   )r   )F)r2   rE   r9   r5   rB   rC   rF   
tensorflowr6  r   from_tensor_slicesr  aranger4   int64r   function
TensorSpectensorflow.data.experimentalrR  mapbatchr   )rH   r   r   r   r   r   r   r   cols_to_retainrR   
tf_datasetrP  rR  rX  rC  r  rB  r8  rO  s   `     `       @@@@@r8   r  (MsDataset._to_tf_dataset_with_processors  s    .84.! .!M'4o 	 !,
g&c*w"678;;//446
6C:OC6
 	 WW__77IIc$++&bhh79
#++DKK8H+IJ	- 	- !T]
	bmmD"((&C%D	E		H 
F		H 	:^^ $ 9
	+ $(ABJ\Q#(HIJ>#)) * ;J #,,X6J}
s   5	GGc                 J   [        5       (       d  [        S5      eU(       d  gSU l        Uc  SU;   a  UR                  S5      nU[        R
                  :X  a  SOSnUR                  SU 35      nUcE  [        U[        R                  5      (       a  [        UR                  R                  S9O[        SS9nUR                  [        US	95        UR                  nS
U;   a  UR                  S
5      n[         R"                  " U5      nSU;   a  UR                  S5      nUc/  [        US5      (       a  UR$                  n	U	(       a  ['        X5      n[)        U R*                  [,        5      (       aV  UR                  [        US95        UR                  U R*                  R.                  5        [1        XaR                  S9U l        gUb'  UR                  SS5      n
U R3                  X*S9U l        gU R*                  R5                  5         U R*                  R7                  SS9  g)aX  Convert the input datasets to specific custom datasets by given model configuration and preprocessor.

Args:
    custom_cfg (Config): The model configuration for custom datasets.
    preprocessor (Preprocessor, Optional): Preprocessor for data samples.
    mode (str, Optional): See modelscope.utils.constant.ModeKeys

Returns:
    `MsDataset`
z?The function to_custom_dataset requires pytorch to be installedNTmodetrainrM  zdataset.)rh   )rj  taskfieldr   )r   )cfgr   r   )r   r   r   )r.   r   rG   getr*   TRAINsafe_gethasattrr&   modelr!   rh   r   dictrl  popr+   find_field_by_taskr   r   r2   rB   r   r\   r   r   r   r   )rH   r}   r   rj  kwargs
ds_cfg_keydata_cfgr   
field_namepreprocessor_cfgr   s              r8   r   MsDataset.to_custom_datasetB  s     "##Q    <zz&) !% 6WE
&&*'>?AHL..B0 B0zz'7'7'<'<=5?T5J $( OO	V

6*I--i8
fG,JGJ$G$G)6612BO dkk?33OODl;<OODKK556.9DK#

;5I@@* A ADK
 	 KK$$&KK"""0r:   )rB   rG   r@   rL   )NNNNT)NNNNNNT)NT)NTNN)NN)@r,  r-  r.  r/  __doc__rB   r>   r   __annotations__r   r   r   r   r   r	   r3   rI   rN   rS   rW   propertyr?   r\   classmethodr   rt  rl   r   ro   staticmethodr#   r$   r)   r   r(   r   r"   r    rE   r
   r   r   boolr>  r   r,   r   r   r   r   r   r   r   r!   r   r   r   r  r  r  r   r  r   r0  r   r:   r8   r<   r<   3   s!    F4818
 *.
#G_o$9%: ;
 "#
        '+$Wk?%JK #/4T;5F/G . 
 %)	#(+)>)8:M*N $O "	 .343D-E	 6  $= $!9"oo%)#"& KO0<#4'+(-+,'-x#,1,1+VQCI&VQC=VQ VQ #	VQ
 d^VQ c]VQ }VQ 3-VQ U3#*3c6>sm7D 1E ,E $F$F G HVQ  -VQ C=VQ 8$VQ   ~!VQ" $C=#VQ$ V$%VQ& }'VQ( $D>)VQ* $D>+VQ. 
t["77	8/VQ VQp 
 (A%=+/'(260:0D0DBLBL BL BL  }	BL
 c]BL $C=BL  }BL "*$BL "*-BL JNBL BLH  .F/3-1	$FS $F"$F%c]$F  (}$F &c]	$F 7;	$F $FL  /G04.2"'$Pc $P$'$P&sm$P !)$P 'sm	$P
  $P
 -1$P $PL  +D(@C  "3- ! FI 4 *.9="&0sDI~&0 XtH~560 	0
  0 0l :>)-##*.,0@@ @ XtH~56	@
 sDI~&@ @ @ c3h@ #tCy.)@ @Dw 
:DcN 
:w 
: *.	OKXtH~56OK sDI~&OK 	OKl  $,0)-NN N XtH~56	N
 N N #tCy.)N sDI~&Nd (,#B&,B Br:   r<   )Fr   ra   typingr   r   r   r   r   r   r	   r
   r   numpyr  r   r   r   r   r   r   datasets.packaged_modulesr   datasets.utils.file_utilsr   modelscope.hub.repositoryr   4modelscope.msdatasets.context.dataset_context_configr   5modelscope.msdatasets.data_loader.data_loader_managerr   r   r   r   !modelscope.msdatasets.dataset_clsr   r   9modelscope.msdatasets.dataset_cls.custom_datasets.builderr   (modelscope.msdatasets.utils.delete_utilsr   ,modelscope.msdatasets.utils.hf_datasets_utilr   (modelscope.msdatasets.utils.upload_utilsr   modelscope.preprocessorsr   modelscope.utils.configr    r!   r   r"   r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   modelscope.utils.import_utilsr-   r.   modelscope.utils.loggerr/   r   r9   r<   r   r:   r8   <module>r     s    
 % % % + + @ 6 7 F I N I 7 6 8D D D
 N .	 Q Qr:   