
    9i2                     2   S SK r S SKJrJr  S SKJrJr  S SKJrJ	r	J
r
JrJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJrJrJrJr  S SK J!r!  S SK"J#r#  \!" 5       r$ " S S\5      r% " S S\%5      r& " S S\%5      r' " S S\%5      r(g)    N)ABCabstractmethod)OptionalUnion)DatasetDatasetBuilderDatasetDictIterableDatasetIterableDatasetDict)load_dataset)ModelScopeConfig)OssAuthConfig)DatasetContextConfig)DataFilesManager)ExternalDataset)DataMetaManager)DatasetFormationsDatasetPathNameDownloadModeVirgoDatasetConfig)
get_logger)	valid_urlc                   v    \ rS rSrSrS\4S jr\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
rg)BaseDownloader   z%Base dataset downloader to load data.dataset_context_configc                     Xl         g Nr   )selfr   s     m/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/msdatasets/data_loader/data_loader.py__init__BaseDownloader.__init__   s    &<#    c                 <    [        S[        R                   S35      e)z6The entity processing pipeline for fetching the data. 'No default implementation provided for z	.process.NotImplementedErrorr   __name__r    s    r!   processBaseDownloader.process!   s$     "5n6M6M5NiX
 	
r$   c                 <    [        S[        R                   S35      e)Nr&   z._authorize.r'   r*   s    r!   
_authorizeBaseDownloader._authorize(   s"    !5n6M6M5Nl[
 	
r$   c                 <    [        S[        R                   S35      e)Nr&   z._build.r'   r*   s    r!   _buildBaseDownloader._build.   s"    !5n6M6M5NhW
 	
r$   c                 <    [        S[        R                   S35      e)Nr&   z._prepare_and_download.r'   r*   s    r!   _prepare_and_download$BaseDownloader._prepare_and_download4   s#    !5n6M6M5NNef
 	
r$   c                 <    [        S[        R                   S35      e)Nr&   z._post_process.r'   r*   s    r!   _post_processBaseDownloader._post_process:   s"    !5n6M6M5No^
 	
r$   r   N)r)   
__module____qualname____firstlineno____doc__r   r"   r   r+   r.   r1   r4   r7   __static_attributes__ r$   r!   r   r      sv    /=/C = 
 
 
 

 
 

 
 

 
 
r$   r   c                   b   ^  \ rS rSrS\4U 4S jjrS
S jrS
S jrS
S jrS
S jr	S
S jr
S	rU =r$ )OssDownloaderA   r   c                 \   > [         TU ]  U5        S U l        S U l        S U l        S U l        g r   )superr"   data_files_builderdatasetbuilderdata_files_managerr    r   	__class__s     r!   r"   OssDownloader.__init__C   s6    /0>B :> 	 26>Br$   c                     U R                  5         U R                  5         U R                  5         U R                  5         g)zSequential data fetching process: authorize -> build -> prepare_and_download -> post_process,
to keep dataset_context_config updated. Nr.   r1   r4   r7   r*   s    r!   r+   OssDownloader.processM   /     	""$r$   c                 >   [         R                  " 5       n[         R                  " 5       n[         R                  " 5       nU R                  R
                  (       d  [        XUS9nO(U R                  R
                  nXl        X$l        X4l	        X@R                  l        g)zjAuthorization of target dataset.
Get credentials from cache and send to the modelscope-hub in the future. cookies	git_token	user_infoN)
r   get_cookies	get_tokenget_user_infor   auth_configr   rQ   rR   rS   )r    rQ   rR   rS   rW   s        r!   r.   OssDownloader._authorizeV   s~     #..0$..0	$224	**66'	KK 55AAK")$-!$-!2=##/r$   c                     [        U R                  5      nUR                  5         UR                  5         UR                  U l        [	        U R                  S9U l        U R
                  R                  5       U l        g)ziSequential data files building process: build_meta -> build_data_files , to keep context_config updated. r   N)r   r   fetch_meta_filesparse_dataset_structurer   rG   get_data_files_builderrF   )r    meta_managers     r!   r1   OssDownloader._buildh   si     't'B'BC%%',,.&2&I&I# #3#'#>#>#@..EEGr$   c                    U R                   R                  R                  nU R                   R                  R                  nU R                   R                  nU R                   R
                  nU R                   R                  nU R                   R                  nU R                   R                  nU R                   R                  nU R                   R                  n	U R                   R                  n
U R                   R                  nU R                   R                  nU R                  c  U(       d  SU S3eU(       aY  U[        R                   :X  aE  U(       a  ["        R%                  SU S35        ['        U4UUUUUU	U
R(                  US.UD6U l        gU R,                  R/                  U R                  5      U l        g)z.Fetch data-files from modelscope dataset-hub. Nzmeta-file: z$.py not found on the modelscope hub.z3Use trust_remote_code=True. Will invoke codes from z9. Please make sure that you can trust the external codes.)namerevisionsplitdata_dir
data_files	cache_dirdownload_modetrust_remote_code)r   data_meta_configdataset_py_scriptdataset_formationdataset_namesubset_nameversionrb   rc   rd   cache_root_dirrf   config_kwargsrg   rF   r   hf_compatibleloggerwarninghf_load_datasetvaluerE   rG   fetch_data_files)r    ri   rj   rk   rl   rm   rb   rc   rd   re   rf   input_kwargsrg   s                r!   r4   #OssDownloader._prepare_and_downloadu   s    77HHZZ 77HHZZ22??11==--55++11..7700;;
//>>	33AA22@@ 77II<<(9~-QRR!26G6U6U!U I, XB BC +!
   !%#+11"3
  
 DL  22CCDLr$   c                     [        U R                  [        5      (       a0  U R                  R                  R
                  U R                  l        g g r   )
isinstancerE   r   r   rh   meta_type_map
custom_mapr*   s    r!   r7   OssDownloader._post_process   s6    dllO44&*&A&A&R&R&`&`DLL# 5r$   )rF   rD   rG   rE   r   )returnN)r)   r9   r:   r;   r   r"   r+   r.   r1   r4   r7   r=   __classcell__rI   s   @r!   r@   r@   A   s4    C/C C>$H%Na ar$   r@   c                   R   ^  \ rS rSrSrS\4U 4S jjrS rS rS r	S r
S	 rS
rU =r$ )VirgoDownloader   z&Data downloader for Virgo data source.r   c                 2   > [         TU ]  U5        S U l        g r   rC   r"   rE   rH   s     r!   r"   VirgoDownloader.__init__       /0r$   c                     U R                  5         U R                  5         U R                  5         U R                  5         g)zl
Sequential data fetching virgo dataset process: authorize -> build -> prepare_and_download -> post_process
NrL   r*   s    r!   r+   VirgoDownloader.process   rN   r$   c                    SSK Jn  [        R                  " 5       n[        R                  " 5       nU R
                  R                  (       d	  U" USUS9nO)U R
                  R                  nX$l        SUl        X4l	        X@R
                  l        g)zAuthorization of virgo dataset.r   )VirgoAuthConfig rP   N)
&modelscope.msdatasets.auth.auth_configr   r   rT   rV   r   rW   rQ   rR   rS   )r    r   rQ   rS   rW   s        r!   r.   VirgoDownloader._authorize   su    J"..0$224	**66)2DK 55AAK")$&K!$-!2=##/r$   c                    SSK Jn  SSKn[        U R                  5      nUR                  5         UR                  U l        U" S
0 U R                  R                  D6U l        [        R                  R                  U R                  R                  U R                  R                  U R                  R                  U R                  R                  5      n[        R                  " [        R                  R                  U[         R"                  5      SS9  [        R                  R                  U[         R"                  S5      n[%        U R                  R&                  UR(                  5      (       a_  U R                  R&                  nUR+                  USS9  XPR                  l        X@R                  l        [0        R3                  S	U 35        gg)z+
Fetch virgo meta and build virgo dataset.
r   )VirgoDatasetNTexist_okzmeta_content.csvF)indexzVirgo meta content saved to r>   ))modelscope.msdatasets.dataset_cls.datasetr   pandasr   r   fetch_virgo_metaro   rE   ospathjoinrn   	namespacerk   rm   makedirsr   	META_NAMEry   meta	DataFrameto_csvmeta_content_cache_filevirgo_cache_dirrq   info)r    r   pdr]   r   r   meta_content_dfs          r!   r1   VirgoDownloader._build   sd    	K&t'B'BC%%'&2&I&I## 9))779 '',,''66''11''44''//	1
 	GGLL/*C*CD	 #%'',,/>/H/H/A#C dll''66"ll//O""#:%"H3JLL0+:LL(KK./F.GHJ 7r$   c                   ^^^^^	^
 U R                   R                  R                  SS5      nU R                  R                  S:X  Ga  U(       Ga  SSKm	SSKmSSKnSSKJ	m
  SSK
Jm  UU	U
4S jmSU R                  l        U R                   R                  n[        R                  R!                  U R                  R"                  [$        R&                  5      mU[(        R*                  :X  a  UR-                  TSS	9  SS
KJn  UR3                  SS9  U R                  R4                  R7                  UUU4S jSS9U R                  R4                  [8        R:                  '   ggg)z;
Fetch data-files from oss-urls in the virgo meta content.
download_virgo_filesr   r   N)urlparse)partialc                   > / n/ n TR                  U 5      n U R                  SS5      nU(       a  UR                  U5        OGU R                  SS5      nU H/  nUR                  SS5      nU(       d  M  UR                  U5        M1     U H  n[        U5      nU(       a2  T" U5      n	[        R
                  R                  U	R
                  5      n
O[        SU 35      e[        R
                  R                  X5      nUR                  X{45        M     U H  u  pU(       d  M  [        R
                  R                  U5      (       a  M4  [        R                  SU 35        [        R                  " USS9  [        US	5       nUR!                  TR                  U5      R"                  5        S S S 5        M     U$ ! [         a$  n[        R                  SU 35        / n S nANS nAff = f! , (       d  f       M  = f)
Nurlr   	inner_urlzUnsupported url: zparse virgo meta info error: zDownloading file to Tr   wb)loadsgetappendr   r   r   basename
ValueErrorr   	Exceptionrq   errorexistsr   r   openwritecontent)meta_info_valrc   file_url_listfile_path_listfile_urltmp_inner_member_listitemone_file_urlis_urlurl_parse_res	file_name	file_pathefile_url_itemfile_path_itemfjsonrequestsr   s                   r!   download_file<VirgoDownloader._prepare_and_download.<locals>.download_file   s    "!#($(JJ}$=M,00;H%,,X60=0A0A'1--$9D'+xxr':H'x - 4 4X > %:
 )6!*<!8!,4X,>M(*(8(89K9K(LI",/@
-K"LL$&GGLL$E	&--|.GH )6 6D1M%~bggnn^.L.L&:>:J$KLHt<!.$71GGHLL$?$G$GH 87	 6D &% ! (LL#@!DE%'N( 87s+   A*F> 5B!F> +G/>
G,G''G,/
G>	T)ignore_errors)tqdmzapply download_file)descc                 2   > T" TTS9" U R                   5      $ )N)rc   )	meta_info)rowdata_files_dirr   r   s    r!   <lambda>7VirgoDownloader._prepare_and_download.<locals>.<lambda>(  s    %!@@C!Or$      )axis)r   ro   poprE   	data_typer   r   shutilurllib.parser   	functoolsr   r   rf   r   r   r   r   r   DATA_FILES_NAMEr   FORCE_REDOWNLOADrmtree	tqdm.autor   r   r   progress_applyr   col_cache_file)r    r   r   rf   r   r   r   r   r   r   r   s        @@@@@@r!   r4   %VirgoDownloader._prepare_and_download   s   
  $::HHLL"B ( <<!!Q&+?-)&&P 15DLL- 77EEMWW\\$,,*F*F*9*I*IKN  = ==nDA&KK2K3 #',,"3"3"B"BO #C # LL" s ,@&r$   c                     g r   r>   r*   s    r!   r7   VirgoDownloader._post_process,      r$   )rE   r   r)   r9   r:   r;   r<   r   r"   r+   r.   r1   r4   r7   r=   r~   r   s   @r!   r   r      s5    0/C >$JBFP r$   r   c                   R   ^  \ rS rSrSrS\4U 4S jjrS rS rS r	S r
S	 rS
rU =r$ )MaxComputeDownloaderi0  z+Data downloader for MaxCompute data source.r   c                 2   > [         TU ]  U5        S U l        g r   r   rH   s     r!   r"   MaxComputeDownloader.__init__4  r   r$   c                     g r   r>   r*   s    r!   r+   MaxComputeDownloader.process8  r   r$   c                     g r   r>   r*   s    r!   r.   MaxComputeDownloader._authorize;  r   r$   c                     g r   r>   r*   s    r!   r1   MaxComputeDownloader._build>  r   r$   c                     g r   r>   r*   s    r!   r4   *MaxComputeDownloader._prepare_and_downloadA  r   r$   c                     g r   r>   r*   s    r!   r7   "MaxComputeDownloader._post_processD  r   r$   )rE   r   r   s   @r!   r   r   0  s1    5/C  r$   r   ))r   abcr   r   typingr   r   datasetsr   r   r	   r
   r   r   rs   modelscope.hub.apir   r   r   4modelscope.msdatasets.context.dataset_context_configr   3modelscope.msdatasets.data_files.data_files_managerr   !modelscope.msdatasets.dataset_clsr   ,modelscope.msdatasets.meta.data_meta_managerr   modelscope.utils.constantr   r   r   r   modelscope.utils.loggerr   modelscope.utils.url_utilsr   rq   r   r@   r   r   r>   r$   r!   <module>r      s    
 # "+ + 4 / @ = HI I . 0	#
S #
L]aN ]a@Ln L^> r$   