
    9i                      B   S SK r S SKJr  S SKJrJr  S SKrS SKJ	r	  S SK
Jr  S SKJrJr  S SKJr  \" 5       rS r  SS	\S
\\   S\\   4S jjrS\	S\S\S\S\S\S\4S jrS\4S jrS\\\4   S\S\4S jrS\\\4   S\4S jrS r\4S\S\S\S\S\\   4
S jjrg)    N)defaultdict)OptionalUnion)HubApi)DatasetContextConfig)DEFAULT_DATASET_REVISIONMetaDataFields)
get_loggerc                     U R                  5        VVs0 s H5  u  pUR                  S5      (       d  UR                  S5      (       d  M3  X_M7     snn$ s  snnf )Nmetafile)itemsget)dataset_structurekvs      i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/msdatasets/utils/dataset_utils.pyformat_dataset_structurer      sO     &++--DAEE&MMQUU6] 	-  s   2A
Ar   subset_namesplitc                    U(       a  X;  d$  U(       d<  [        U R                  5       5      S:  a  [        SU SU R                  5        35      eUnU(       d:  [        [	        U R                  5       5      5      n[
        R                  SU 35        [        X   5      nU(       a$  X$;  a  [        SU SUR                  5        35      eU(       a  X$U   0nX44$ )a  
Args:
    dataset_structure (dict): Dataset Structure, like
     {
        "default":{
            "train":{
                "meta":"my_train.csv",
                "file":"pictures.zip"
            }
        },
        "subsetA":{
            "test":{
                "meta":"mytest.csv",
                "file":"pictures.zip"
            }
        }
    }
    subset_name (str, optional): Defining the subset_name of the dataset.
    split (str, optional): Which split of the data to load.
Returns:
       target_subset_name (str): Name of the chosen subset.
       target_dataset_structure (dict): Structure of the chosen split(s), like
       {
           "test":{
                    "meta":"mytest.csv",
                    "file":"pictures.zip"
                }
        }
   zsubset_name z not found. Available: z,No subset_name specified, defaulting to the zsplit )lenkeys
ValueErrornextiterloggerinfor   )r   r   r   target_subset_nametarget_dataset_structures        r   get_target_dataset_structurer"      s    B 	<$5$:$:$< = A;-'>?P?U?U?W>XY
 	
 %!$'8'='='?"@A:;M:NO	
  8- /6UG23K3P3P3R2ST
 	
 $)E+J#K 77    hub_api	max_limitis_recursivedataset_name	namespaceversionreturnc           	          / nU R                  UUUUSUS9nU H.  nUR                  S5      n	U	(       d  M  UR                  U	5        M0     U$ )a  
List all objects for specific dataset.

Args:
    hub_api (class HubApi): HubApi instance.
    max_limit (int): Max number of objects.
    is_recursive (bool): Whether to list objects recursively.
    dataset_name (str): Dataset name.
    namespace (str): Namespace.
    version (str): Dataset version.
Returns:
    res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...]
T)r'   r(   r%   r&   is_filter_dirrevisionKey)list_oss_dataset_objectsr   append)
r$   r%   r&   r'   r(   r)   resobjectsitem
object_keys
             r   list_dataset_objectsr5   R   sb      C..!! / G XXe_


:	  Jr#   c                     SnU R                  5        H7  u  p#[        U[        5      (       d  M  UR                  S5      (       a  M4  Sn  U$    U$ )z
To check whether input contains at least one directory.

Args:
    file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'}
Returns:
    True if input contains at least one directory, False otherwise.
Fz.zipT)r   
isinstancestrendswith)file_mapr1   r   r   s       r   contains_dirr;   t   sK     C aajj&8&8CJ	 ! Jr#   c                     [        U [        5      (       a  U /n [        R                  R	                  USR	                  U 5      5      $ )N_)r7   r8   ospathjoin)r   r)   s     r   get_subdir_hash_from_splitrA      s3    %77<<%11r#   c                 ~    [        U [        5      (       a  U /$ [        U [        5      (       a  U $ S[        U 5       S3e)z Unify the split to list-format. z/Expected format of split: str or list, but got .)r7   r8   listtype)r   s    r   get_split_listrF      s;    %w	E4	 	 ?U}ANNr#   c                    0 nU R                  5        H	  u  p4/ X#'   M     U HY  nU R                  5        HB  u  p4UR                  UR                  S5      S-   5      (       d  M/  X#   R                  U5        MD     M[     U$ )a  
Get the map between dataset split and oss objects.

Args:
    file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val
        are dirs.
    objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']
Returns:
    A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
        'validation':['val/003/3_38.png']}
/)r   
startswithrstripr0   )r:   r2   r1   r   r   obj_keys         r   get_split_objects_maprL      su     C  ! NN$DA!!!((3-#"566g& % 
 Jr#   subset_split_intocontext_configr-   c           	         [        [        5      n[        [        5      n[        [        5      n[        [        5      n[        5       n	UR                  R                  n
U R                  5        Hn  u  pUR                  SS5      X'   U	R                  UR                  SS5      XU5      X['   UR                  S5      (       a  US   Xk'   UR                  S5      X{'   Mp     / nUR                  5        GH  u  pU(       d  M  UR                  [        R                  5      (       d  M5  X[   n[        R                  " X5      nUR                  R                  SS5      n[        R                  " USUS	S
9nUR                  UR                  R                  R!                  S5         R#                  5       n[%        U5      S:X  a)  [&        R)                  SU S35        UR                  S   nOUS   nUU   R#                  5       nXU'   GM     U(       d)  [+        U	SSUUUS9n[-        U5      (       a  [/        Xm5      nXVXx4$ )z
Return:
    meta_map: Structure of meta files (.csv), the meta file name will be replaced by url, like
    {
       "test": "https://xxx/mytest.csv"
    }
    file_map: Structure of data files (.zip), like
    {
        "test": "pictures.zip"
    }
custom r   r   args	delimiter,F\)iteratorrS   
escapecharz:FILEr   zNo column contains ":FILE" in rC   T)r$   r%   r&   r'   r(   r)   )r   dictr   data_meta_configmeta_cache_dirr   r   get_dataset_file_url_originr	   ARGS_BIG_DATAfetch_meta_files_from_urlconfig_kwargspdread_csvcolumnsr8   containsto_listr   r   errorr5   r;   rL   )rM   r'   r(   rN   r-   meta_mapr:   args_mapcustom_type_mapmodelscope_apir[   r   r   r2   	args_dictmeta_csv_file_urlmeta_csv_file_pathcsv_delimitercsv_df
target_cols                       r   get_dataset_filesrp      s     4 H4 H4 H!$'OXN#44CCN(..0!%(B!7(DDHHVR ,8E88F"6lHO((6* 1 G$NN,9~'C'CDD (!'!A!A!"3 +88<<[#NM[["'	!F
  (:(:(C(C) !') :!#45G4HJL#^^A.
']
Z(002G%UO/ -2 &"% !!,X?Hx88r#   )NN)r>   collectionsr   typingr   r   pandasr`   modelscope.hub.apir   4modelscope.msdatasets.context.dataset_context_configr   modelscope.utils.constantr   r	   modelscope.utils.loggerr
   r   r   rY   r8   r"   intboolrD   r5   r;   rA   rF   rL   rp    r#   r   <module>r{      s)   
 # "  % N .	 ?C8<58D 58.6sm58(058p& S  '*7:"%*.Dd "2eCI&6 2 2 2O%T	* Ot O8 1I	E9 E9$'E9!$E9 ';E9 !)	E9r#   