
    9iB                        S SK r S SKrS SKrS SKJr  S SKJr  S SKJrJ	r	J
r
JrJrJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJ r   S SK!J"r"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)  S SK*J+r+J,r,J-r-  S SK.J/r/  S SK0J1r1  SSK2J3r3  Sr4\1" \45      r5S\ Rl                  S'   \%Rn                  " \Rp                  S9 " S S\"5      5       r9g)    N)defaultdict)ceil)CallableDictListOptionalTupleUnion)distributed)nnDataset)Trainers)
TorchModel)	MsDataset)pipeline)Preprocessor)EpochBasedTrainerNlpEpochBasedTrainer)TRAINERS)build_optimizer)Config)DEFAULT_MODEL_REVISIONModeKeysTasks)func_receive_dict_inputs)
get_logger   )is_paralleltrueTOKENIZERS_PARALLELISM)module_namec                   x  ^  \ rS rSrSSSSSSS\SSSSS4S	\\\\R                  \
4      S
\\
   S\\   S\\\\4      S\\\\4      S\\\\\
\4   4      S\\R$                  R&                  \R$                  R(                  R*                  4   S\\
   S\4U 4S jjjr S#S\\R0                  R2                  R                  \\\R0                  R2                  R                     4   S\S\
S\\   4U 4S jjjrS rS rS rS r S$S jr!S r" S#S\\
   S\\
\#4   4S jjr$S\\\
\4      4S  jr%S! r&S"r'U =r($ )%SiameseUIETrainer%   N)NN*      i`  i     modelcfg_filecfg_modify_fntrain_dataseteval_datasetpreprocessor
optimizersmodel_revisionseedc                    > [        S5        Xl        Xl        Xl        Xl        [
        TU ]  " SUUUU R                  UUUUUU	S.
UD6  g)a/  Epoch based Trainer, a training helper for PyTorch.

Args:
    model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir
        or a model id. If model is None, build_model method will be called.
    cfg_file(str): The local config file.
    cfg_modify_fn (function): Optional[Callable] = None, config function
    train_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*):
        The dataset to use for training.

        Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
        distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a
        `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
        manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
        sets the seed of the RNGs used.
    eval_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation.
    preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor.
        NOTE: If the preprocessor has been called before the dataset fed into this
        trainer by user's custom code,
        this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file.
        Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and
        this preprocessing action will be executed every time the dataset's __getitem__ is called.
    optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
        containing the optimizer and the scheduler to use.
    model_revision (str): The model version to use in modelhub.
    negative_sampling_rate (float): The rate to do negative sampling.
    slide_len (int): The length to slide.
    max_len (int): The max length of prompt + text.
    hint_max_len (int): The max length of prompt.
    seed (int): The optional random seed for torch, cuda, numpy and random.
z*******************)
r)   r*   r+   data_collatorr,   r-   r.   r/   r0   r1   N )print	slide_lenmax_lenhint_max_lennegative_sampling_ratesuper__init___nn_collate_fn)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r9   r6   r7   r8   kwargs	__class__s                  k/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/trainers/nlp/siamese_uie_trainer.pyr;   SiameseUIETrainer.__init__(   se    d 	#$"(&<# 	'--'%%!)	 	    datasets	model_cfgmodec                    > U[         R                  :X  a  U R                  U5      n[        [        U ]  " SUU R                  UUS.UD6$ )N)rC   rD   rE   r.   r4   )r   TRAINload_datasetr:   r$   build_datasetcfg)r=   rC   rD   rE   r.   r>   r?   s         r@   rI   SiameseUIETrainer.build_datasetm   sU     8>>!((2H&; hh%	
  	rB   c                     SU R                   l        U R                  " U R                   4U R                  U R                  U R
                  S.U R                  R                  R                  S0 5      D6nU$ )a  Builder torch dataloader for training.

We provide a reasonable default that works well. If you want to use something else, you can change
the config for data.train in configuration file, or subclass and override this method
(or `get_train_dataloader` in a subclass.
N)distr1   
collate_fn
dataloader)	r,   r.   _build_dataloader_with_dataset_dist_seedtrain_data_collatorrJ   trainget)r=   data_loaders     r@   get_train_dataloader&SiameseUIETrainer.get_train_dataloader}   sn     +/'994//	4
 hhnn  r24 rB   c           	          U(       d  g U HM  nU[        UU/-   5      ==   U Vs/ s H  oUU:w  d  M
  UPM     sn-  ss'   U R                  X   UX4/-   5        MO     g s  snf N)tupleget_brother_type_map)r=   schemabrother_type_mapprefix_typeskvs         r@   r\   &SiameseUIETrainer.get_brother_type_map   sn    AU<&'S$) * +:@.K&QFq&.KL +%%fi1A&2S&8: .Ks
   	AAc                    / n[        U5       GHJ  u  p4[        R                  " US   5      US'   [        R                  " US   5      US'   [        [        5      nUS    HH  nSnU H=  nXxS    S3-  nUS   US   S.n	XU   ;  a  XW   R                  U	5        XxS    S	3-  nM?     MJ     [        [        5      n
U R                  US   U
/ 5        US    H  nSn[        U5       H  u  p[        [        US
-   5       Vs/ s H
  oU   S   PM     sn5      nU
R                  U/ 5       H:  nX~ S3-   nX;  d  M  [        R                  " 5       U R                  :  d  M6  / X_'   M<     XxS    S3-  nXxS    S	3-  nM     M     US    H8  nU S3nX;  d  M  [        R                  " 5       U R                  :  d  M4  / X_'   M:     [        U5       GH  u  pUS    SU 3UUS   XW   S.nUS   nUS   nU R                  U/5      S   nU R                  U/U R                  SS9S   nUR                  US'   UR                  S/ 5      nU R                  UUUS   U5      u  nn[!        U5      U R"                  :  a1  [%        [!        U5      U R"                  -
  U R&                  -  5      S
-   OS
n[        U5       H  nXR&                  -  XR&                  -  U R"                  -   nnUUUR(                  UU UR*                  UU UR(                  UR*                  UR,                  UU UR,                  UUU UUU S.
nUR                  U5        M     GM     GMM     SSKJn  UR2                  " U5      n[        R4                  " [        [!        U5      5      S5       H"  n[6        R9                  SU SUU    S35        M$     U$ s  snf )N	info_listr]    typez: spanoffset)rg   rh   z, r'   id-text)ri   hintrk   spansr   T)
max_length
truncationoffsetsrm   )
ri   shifttokens	token_idshint_tokenshint_token_idsattention_maskscross_attention_maskshead_labelstail_labelsr      zSample z of the training set: .)	enumeratejsonloadsr   listappendr\   r[   rangerU   randomr9   train_preprocessorr8   rp   _get_labelslenr7   r   r6   rr   idsattention_maskrC   r   	from_listsampleloggerinfo)r=   raw_datasetdatanum_line
raw_samplehint_spans_mapr   rl   itemrg   r^   ijkeystneg_hintr`   r   uuidrk   tokenized_inputtokenized_hintentitiesrx   ry   	split_numabr   r,   indexs                                  r@   rH   SiameseUIETrainer.load_dataset   s+   $-k$: H&*jjK1H&IJ{##'::j.B#CJx (.N";/ DF|nB//D$(LDNKD$#77&,33D9F|nB//D ! 0  +40%%j&:<L&(* #;/(GA%A, G,Qa, GHC.223;#'D)#3#9fmm ? 77?879N4	 <
 F|nB//DF|nB//D  / 0  )S81fmm 7//70/1N,	 * %^4'-.as3 &v.+1	 d|f~"&"9"94&"A!"D!%!8!8Ft'8'8T "9 "KKL"N$3$;$;y!!::gr2+/+;+;/6)+<h,H([
 _-< !)DLL8DNNJBC  y)A~~-q>>/ADLL/PqA"!""1"8"81"=%4%8%81%='5'<'<*8*<*<+:+I+I!A+N1?1N1N'21Q'7'21Q'7D KK% *) 5I %;P 	%))$/]]5]);#<a@EKK% 6}U7K6LANP A q !Hs    Oc           
      &   [        U5      nS/U-  nS/U-  n0 n[        [        U5      5       H$  n	X9   n
[        U
S   U
S   5       H  nXU'   M	     M&     U H  nUS   u  pUS-  nX;  a>  US-  nU[        U5      :  a#  [        SUS   US   XS   S   US   S    5        OX;  a  M>  X;  a5  US-  nUS:  a#  [        SUS   US   XS   S   US   S    5        OX;  a  M5  U[        U5      :  d  US:  a  M  X   nX   nSXo'   SUU'   M     Xg4$ )Nr   r'   rh   hrg   t)r   r   r5   )r=   rk   r   rp   r   
num_tokensrx   ry   char_index_to_token_index_mapr   rh   r   er   r   
token_head
token_tails                    r@   r   SiameseUIETrainer._get_labels   sl   )
cJ&cJ&(*%s7|$AZF6!9fQi034a0 1 % AX;DAFA8Qs4y=#q{AfI{1~ak!n=? 8 8Qq5#q{AfI{1~ak!n=? 8 3t9}A69J69J&'K#&'K
#) * ''rB   c           	      v    / nU H0  nUR                  XB/U R                  [        U5      -
  -  -   5        M2     U$ rZ   )r   r7   r   )r=   r   valresseqs        r@   _paddingSiameseUIETrainer._padding  s9    CJJsUdllSX&=>>? 
rB   c           	         [         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9n[         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9n[         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9n[         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9n[         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9n[         R                  " U R                  U Vs/ s H  o"S   PM	     sn5      [         R                  S9nUR                  S5      R                  S	S
9R                  5       R                  5       n	U	SU	S-  -
  S-  -  n	[        U R                  U	5      n
US S 2S U
24   nUS S 2S U
24   nUS S 2S U
24   nUS S 2S U
24   nUR                  S5      R                  S	S
9R                  5       R                  5       n	U	SU	S-  -
  S-  -  n	[        U R                  U	5      nUS S 2S U24   nUS S 2S U24   nUUUUUUS.$ s  snf s  snf s  snf s  snf s  snf s  snf )Nrs   )dtyperu   rv   rw   rx   ry   r   )dim   )	input_idsrv   hint_idsrw   rx   ry   )torchtensorr   longfloatgtsummaxr   minr7   r8   )r=   batchr   rs   ru   rv   rw   rx   ry   batch_max_lentruncate_lenhint_truncate_lens               r@   r<    SiameseUIETrainer._nn_collate_fn  s   LLMM?,?@**	 MMeDed 01eDE**  ,,MMuEut 12uEF** !&MMUKUT 78UKL**! llMM5A54.5AB++ llMM5A54.5AB++
 "Q+++3779>>@!ma//1444<<7a,./	)!]l]*:;!!]l]"23!!]l]"23 '))!,00R08<<>CCE!ma//144 1 1=A'+=,=+=(=> 5a9K:K9K6K L #.&%:&&
 	
E @ E F L B Bs#   J&%J+*J0/J54J:9J?checkpoint_pathreturnc           	         [        [        R                  U R                  [	        U R
                  5      S9nUb;  [        R                  R                  U5      (       a  SSK	J
n  UR                  X5        U R                  R                  5         [        R                  U l        U R                   U l        S=n=pxSU R$                  l        U R$                   H  n	U	S   n
[(        R*                  " U	S   5      n[(        R*                  " U	S   5      nU" XS	9S
   n[-        U Vs/ s H  n[	        U5      PM     sn5      n[-        U Vs/ s H  n[	        U5      PM     sn5      n[/        U5      [/        U5      [/        UR1                  U5      5      nnnUU-  nUU-  nUU-  nM     U R3                  XgU5      u  nnnUUUS.$ s  snf s  snf )a  evaluate a dataset

evaluate a dataset via a specific model from the `checkpoint_path` path, if the `checkpoint_path`
does not exist, read from the config file.

Args:
    checkpoint_path (Optional[str], optional): the model path. Defaults to None.

Returns:
    Dict[str, float]: the results about the evaluation
    Example:
    {"accuracy": 0.5091743119266054, "f1": 0.673780487804878}
)deviceNr   )LoadCheckpointHook绽|=rk   r]   rd   )inputr]   output)	precisionrecallf1)r   r   siamese_uier)   strr   ospathisfilemodelscope.trainers.hooksr   load_checkpointevalr   EVAL_modetrain_dataloadereval_dataloaderr-   r.   r}   r~   setr   intersectioncompute_metrics)r=   r   argsr>   pipeline_uier   num_pred
num_recallnum_correctr   rk   r]   gold_info_listpred_info_listr   pred_info_list_setgold_info_list_setr   r   cr   r   r   s                          r@   evaluateSiameseUIETrainer.evaluate:  s   "  tzz#dkk2BD&277>>/+J+JD..E

]]
#44.333:)-&''F&>DZZx 01F!ZZ{(;<N)DXNN!$N%KNDc$iN%K!L!$N%KNDc$iN%K!L,-s3E/F"//0BCIE!qAMH!OJ1K ( !% 4 4X5@!B	62&&CC &L%Ks   #G
G
c                     U R                   $ )a  Get the metric class types.

The first choice will be the metrics configured in the config file, if not found, the default metrics will be
used.
If no metrics is found and the eval dataset exists, the method will raise an error.

Returns: The metric types.

)r   )r=   s    r@   get_metricsSiameseUIETrainer.get_metricse  s     ###rB   c                     Xs=:X  a  S:X  a   g  U[        U5      -  nU[        U5      -  nSU-  U-  XE-   -  nUS:X  a  gXEU4$ )Nr   )r'   r'   r'   r   )r   r   r   )r   )r=   r   r   r   r   r   r   s          r@   r   !SiameseUIETrainer.compute_metricsq  s^    *U* +%/1	uZ00]V#y'9:%"$$rB   )r   r   r8   r7   r9   r6   rZ   )r   ))__name__
__module____qualname____firstlineno__r   r   r
   r   r   Moduler   r   r   r   r   r   r	   r   optim	Optimizerlr_scheduler_LRSchedulerintr;   utilsr   r   r   rI   rW   r\   rH   r   r   r<   r   r   r   r   __static_attributes____classcell__)r?   s   @r@   r$   r$   %   s   
 BF&*04AE@DEIHN,B#$#CE*bii"<=>C smC $H-	C
 $E)W*<$=>C #5G);#<=C #5)-c<.?)@*A $B CC ekk33#kk66CCD EC %SMC C CT >B %ekk&6&6&>&>	&*5;;+;+;+C+C&D'E !F "(  	
 %-\$:   :Ob(@+
\ 37)D"*3-)D #3:.)DV
$T%T	"23 
$	% 	%rB   r$   ):r   r   timecollectionsr   mathr   typingr   r   r   r   r	   r
   r}   numpynpr   r   rM   r   torch.utils.datar   modelscope.metainfor   modelscope.models.baser   modelscope.msdatasetsr   modelscope.pipelinesr   modelscope.preprocessors.baser   modelscope.trainersr   r   modelscope.trainers.builderr   %modelscope.trainers.optimizer.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   r   modelscope.utils.file_utilsr   modelscope.utils.loggerr   parallel.utilsr   PATHr   environregister_modulesiamese_uie_trainerr$   r4   rB   r@   <module>r     s    
   #  ? ?    %  $ ( - + ) 6 G 0 A * M M @ . (	D	'-

# $ 
h&B&BCT%) T% DT%rB   