
    9i2K                        S SK r S SKrS SKrS SKJrJrJrJrJrJ	r	  S SK
r
S SKrS SKrS SKrS SKJrJr  S SKJr  SSKJr  SSKJrJr  SSKJrJr  SSKJrJrJ r J!r!J"r"  \\   r#\r$\	\%\\#\$4   4   r&\\\\\      4   r'\\\\\      4   r(\RR                  " \*5      r+S	\RX                  S
\%S\-S\\RX                  \\%   4   4S jr.S\R^                  S\&S\R^                  4S jr0\Rb                  " SSS9  S-S\%S\%S\\%   S\2S\4
S jj5       r3 S.S\%S\%S\\&   S\4S jjr4     S/S\\%   S\%S\\%   S\\\%      S\\%   S\\\&      S\2S\'4S  jjr5     S0S	\RX                  S
\%S!\S"\\\      S#\-S$\\-   S%\\-   S&\2S\(4S' jjr6    S1S	\RX                  S
\%S#\-S$\\-   S%\\-   S&\24S( jjr7 " S) S*\5      r8 " S+ S,\85      r9g)2    N)DictIterableListOptionalTupleUnion)UnicoreDataset
data_utils)utils   )	NumpyDict	TorchDict)process_featuresprocess_labels)add_assembly_featuresconvert_monomer_features
merge_msaspair_and_mergepost_processconfigmodenum_resreturnc                 2   [         R                  " U 5      nX1   nUR                  5          UR                  c  X$l        S S S 5        UR                  R
                  UR                  R                  -   nUR                  R                  (       a  XSR                  R                  -  nUR                  R                  (       a  XSR                  R                  -  nX1   R                  (       a  XSR                  R                  -  nX54$ ! , (       d  f       N= fN)copydeepcopyunlocked	crop_sizecommonunsupervised_featuresrecycling_featuresuse_templatestemplate_featuresis_multimermultimer_features
supervisedsupervised_features)r   r   r   cfgmode_cfgfeature_namess         i/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/science/unifold/dataset.pymake_data_configr-   !   s    
 --
CyH	%!( 
 JJ44szz7T7TTM
zz555
zz555
y;;; 
s   D
Dall_atom_positions	operationc                     US:X  a  U $ Uu  p#[         R                  " U5      R                  SS5      n[         R                  " U5      R                  S5      nXR                  -  U-   $ )NI   )nparrayreshapeT)r.   r/   rottranss       r,   process_labelr9   6   s[    C!!JC
((3-

1
%CHHUO##A&E%--       T)maxsizer   sequence_idmonomer_feature_diruniprot_msa_dir
is_monomerc                    [         R                  " [        R                  R	                  X S35      5      n[        U5      n0 UEnUb  [         R                  " [        R                  R	                  X  S35      5      nU(       a%  [        US   US   US   US   5      u  US'   US'   U$ [         R                  " U5      nS H	  nXg   XW'   M     U$ )Nz.feature.pkl.gzz.uniprot.pkl.gzmsadeletion_matrix)msa_all_seqmsa_species_identifiers_all_seqdeletion_matrix_all_seq)r   load_pickleospathjoinr   r   convert_all_seq_feature)r=   r>   r?   r@   monomer_featurechain_featureall_seq_featurekeys           r,   load_single_featurerP   @   s     ''
(M*IJLO.?O''M"++GGLLM*IJL%/!%(!"34#E*#$56	&#M% -!##   $;;OLO
 &5%9" r:   label_id	label_dirsymmetry_operationc                     [         R                  " [        R                  R	                  X S35      5      nUb  [        US   U5      US'   UR                  5        VVs0 s H  u  pEUS;   d  M  XE_M     nnnU$ s  snnf )Nz.label.pkl.gzr.   aatyper.   all_atom_mask
resolution)r   rG   rH   rI   rJ   r9   items)rQ   rR   rS   labelkvs         r,   load_single_labelr]   d   s    
 
Y*M :;=E%&3&');'="# KKM!DAQG&H 	! 
 
 Ls   A:/A:sequence_ids	label_idssymmetry_operationsc           
         U  Vs/ s H  n[        XqUU5      PM     nnUb  [        U5      [        U 5      :X  d   eUc   eUc  U V	s/ s H  n	SPM     nn	[        X55       V
Vs/ s H  u  p[        XU5      PM     nn
n[        X5       VV
s/ s H  u  pUR	                  U
5      PM       nn
[        U5      nUb)  U VVs/ s H  nS Vs0 s H  nXU   _M
     snPM     nnnOS n[        R                  " U Vs/ s H  oS   PM	     sn[        R                  S9nU(       a  US   nO[        U5      n[        U5      nUUS'   X4$ s  snf s  sn	f s  snn
f s  sn
nf s  snf s  snnf s  snf )Nr1   rU   
seq_length)dtyper   asym_len)rP   lenzipr]   updater   r3   r4   int64r   r   )r^   r>   r?   r_   rR   r`   r@   sall_chain_features_lloall_chain_labelsfr[   crd   s                    r,   loadrq   v   s    2>1=A 	AO&	(1=  
 9~\!2222$$$&09":	13	": Y<
< bQ/< 	 

 $''9#LM#L%!"#LM./AB 
 &	' &a L
 L  tGK
 &	 	 '  xx2DE2DQ<2DE hh(H/2+,>?)*<=%-z"//O #;

 	N
 ' Fs5   EE#EE?
E!	EE!9E'E!featureslabelsseed	batch_idxdata_idxis_distillationc                    US:X  a  Uc   e[         R                  " XESS9   [        R                  R	                  SU R
                  R                  S-   5      n[        R                  R                  5       X   R                  :  n	S S S 5        OU R
                  R                  nSn	[        W5      US'   [        W	5      US'   [        U5      US'   U(       a  S	U;   a  UR                  S	5        [        US
   5      n
[        XU
S9u  pUb  US   S   R                  S5      US'   [         R                  " XFSS9   [        R                  R	                  SS5      US'   [        R                  " X,S9nUR                  5        VVs0 s H  u  pU[         R"                  " U5      _M     nnn[         R$                  " 5          ['        X+R
                  X   5      nS S S 5        S S S 5        Uby  U VVVs/ s H<  oR                  5        VVs0 s H  u  pU[         R"                  " U5      _M     snnPM>     nnnn[         R$                  " 5          [)        U5      nS S S 5        X#4$ X#4$ ! , (       d  f       GN= fs  snnf ! , (       d  f       N= f! , (       d  f       N= fs  snnf s  snnnf ! , (       d  f       X#4$ = f)Ntrain	recyclingrO   r   r   num_recycling_itersuse_clamped_faperw   
msa_chainsrb   )r   r   rX   protein_featurei{  crop_and_fix_size_seed)desired_keys)r
   
numpy_seedr3   randomrandintr    max_recycling_itersranduse_clamped_fape_probintpopr-   r5   r   filterrY   torchtensorno_gradr   r   )r   r   rr   rs   rt   ru   rv   rw   	num_itersr}   r   r)   r+   r[   r\   rl   s                   r,   processr      sH    w$$$""4D		))6==44q8:I!yy~~  22 3 ED MM55	&))nH"##&'7#8H "%o"6H<83\"(<()G)&WMC!'<!8!@!@!D			t3D	E-/YY->->q%-H)*<<E3;>>3CD3C41Au||A&3CD]]_'**ciHH 	 
F HNO"((*=*$!1ell1o%*=O]]_#F+F  8C ED0 E_	 
F	E >O_ sg   A%I);AJ#I;)JJJ3J)#J#.J)J0)
I8;J
J	J
J #J)0
K c           
      |    SU;  a  UOUR                  S5      n[        S0 UDSU0D6u  p[        XXX#XE5      u  pX4$ )Nr@    )r   rq   r   )
r   r   rt   ru   rv   rw   load_kwargsr@   rr   rs   s
             r,   load_and_processr      sX     {* 	0;0M  AkAjAHvXt':Hr:   c                   z    \ rS rSr    SS jrS rSS jrS rS r\	S 5       r
\	S	\\\\   4   4S
 j5       rSrg)UnifoldDataset   Nc	                 d   X@l         S n	U	" [        R                   R                  U R                   X-   S-   5      5      n
U	" [        R                   R                  U R                   X-   S-   5      5      U l        U R	                  U R                  5      U l        0 U l        U R
                   H#  nU R
                  U   nX   U R                  U'   M%     Xl        [        R                  SR                  [        U R                  5      [        U R                  5      5      5        [        R                   R                  U R                   S5      U l        [        R                   R                  U R                   S5      U l        [        R                   R                  U R                   US-   5      nUS:X  a  [        R                   R                  U5      (       a  U(       d  U	" U5      U l        [        R                  S	R                  [        U R                  5      5      5        [        R                   R                  U R                   S
5      U l        [        R                   R                  U R                   S5      U l        OS U l        UR$                  [&        R(                  " 5       -  UR*                  S   -  U l        Ub  X`R$                  -  O[        U R                  5      U l        XPl        U R1                  U R                  5      u  U l        U l        U l        U R1                  U R                  5      u  U l        U l        U l        U R                  b.  U R1                  U R                  5      u  U l        U l         U l!        URD                  U l#        X l$        URJ                  U l%        g )Nc                 @    [         R                  " [        U SSS95      $ )Nrutf-8encoding)jsonrq   open)filenames    r,   	load_json*UnifoldDataset.__init__.<locals>.load_json   s    99T(C'BCCr:   z_sample_weight.jsonz_multi_label.jsonz$load {} chains (unique {} sequences)pdb_features
pdb_labelszsd_train_sample_weight.jsonry   z"load {} self-distillation samples.sd_features	sd_labelsr   )&rI   rH   rJ   multi_label_inverse_mapinverse_multi_labelsample_weightseq_sample_weightloggerinfoformatre   feature_path
label_pathisfilesd_sample_weightsd_feature_pathsd_label_path
batch_sizedistributed_utilsget_data_parallel_world_sizeupdate_freqdata_lenr   cal_sample_weightnum_seqseq_keysseq_sample_prob	num_chain
chain_keyssample_probsd_num_chainsd_chain_keyssd_sample_probdatar   rt   sd_prob)selfargsrt   r   	data_pathr   max_step
disable_sdjson_prefixr   r   chainentitysd_sample_weight_paths                 r,   __init__UnifoldDataset.__init__   s    		D "GGLL$+.CCEF %GGLLK$69L$LMO#'#4#4T5E5E#F --E--e4F(5(=Du% . "/:AA""#S)?)?%@B 	CGGLLNC'',,tyy,? "II{%BB!D7?rww~~% '  '/9$-.C$DD!KK<CCD))*, -#%77<<		=#ID !#dii!ED$(D!OO/LLNNq!" 	
 # &),T-?-?)@ 	 	<@<R<R""=$9dmT%9<@<R<R= 9)9  ,
 &&t'<'<=	!"#kk	||r:   c                     [        UR                  5       5      n[        UR                  5       5      nU Vs/ s H
  oAU   U-  PM     nn[	        U5      nXbU4$ s  snf r   )listkeyssumvaluesre   )r   r   	prot_keys
sum_weightr[   r   num_prots          r,   r    UnifoldDataset.cal_sample_weight.  s^    ++-.	--/0
>GHiQ'*4iHy>K// Is   Ac                    SnU R                   S:X  Gax  [        R                  " U R                  USS9   U R                  b/  [
        R                  R                  S5      S   U R                  :  OSnU(       aD  [
        R                  R                  U R                  U R                  S9nU R                  U   nUnOU(       dQ  [
        R                  R                  U R                  U R                  S9nU R                  U   nU R                   U   nOm[
        R                  R                  U R"                  U R$                  S9nU R&                  U   n[
        R                  R                  U R(                  U   5      nS S S 5        OU R                  U   nU R                   U   nWWU4$ ! , (       d  f       N= f)NFry   data_sampler{   r   r   )p)r   r
   r   rt   r   r3   r   r   r   choicer   r   r   r   r   r   r   r   r   r   r   )r   idxsample_by_seqrw   prot_idx
label_nameseq_nameseq_idxs           r,   sample_chainUnifoldDataset.sample_chain5  s   99&&tyy#=I&*&;&;&G %'IINN1$5a$84<<$G#(   #!yy//))T-@-@  0  BH!%!3!3H!=J)H(#%99#3#3 NNd.>.> $4 $@%)__X%>
#'#;#;J#G"$))"2"2 LLD,@,@ #3 #B#'==#9%'YY%5%5 ,,X6&8
' JI, -J//
;H_441 JIs   EF//
F=c                 $   U R                  USS9u  p#nU(       d  U R                  U R                  4OU R                  U R                  4u  pV[        U R                  U R                  U R                  XR                  -  UUU/US U/US SS9u  pxU$ )NT)r   
ru   rv   rw   r^   r>   r?   r_   rR   r`   r@   )
r   r   r   r   r   r   r   r   rt   r   )	r   r   r=   rQ   rw   feature_dirrR   rr   rk   s	            r,   __getitem__UnifoldDataset.__getitem__R  s    151B1Bt 2C 2%. <K $(#4#4#'??#4#'#7#79K9K"L 	 'KKIIIIoo-+% + j $
 r:   c                     U R                   $ r   )r   )r   s    r,   __len__UnifoldDataset.__len__i  s    }}r:   c                 ,    [         R                  " U SS9$ )Nr   dim)r
   collate_dict)sampless    r,   collaterUnifoldDataset.collaterl  s     &&wA66r:   mappingc           
          0 nU R                  5        H1  u  p#U H&  nXA;   a  X   nX%:X  d   SU SU SU S35       eX!U'   M(     M3     U$ )Nzmultiple entities (z, z) exist for reference .)rY   )r   inverse_mappingentrefsrefent_2s         r,   r   UnifoldDataset._inverse_mapq  sq     IC)+0EX,UG2cU:PQTPUUVWX$'*$  ) r:   )r   r   r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   r   r   r   rt   r   r   r   ry   NF )F)__name__
__module____qualname____firstlineno__r   r   r   r   r   staticmethodr   r   strr   r   __static_attributes__r   r:   r,   r   r      sf     =$~05:. 7 7 
d3S	>2 
 
r:   r   c                      ^  \ rS rSr    SS\R
                  S\S\R
                  S\S\S\\   S\	S	\4U 4S
 jjjr
S r\S 5       r\S 5       r\S 5       r\S 5       rSrU =r$ )UnifoldMultimerDataseti  r   rt   r   r   r   r   r   r   c	           
      ^  > [         T
U ]  XX4XVXx5        X@l        [        R                  " [        [        R                  R                  U R                  US-   5      SS95      U l	        U R                  U R                  5      U l        [        R                  R                  U R                  S5      U l        [        R                  R                  U R                  S5      U l        [        R                  R                  U R                  S5      U l        UR                   U l        U R"                  S:X  ax  U R%                  U R                  U R                  U R&                  U R                   5      u  U l        U l        U R)                  U R&                  5      u  U l        U l        U l        g g )Nzpdb_assembly.jsonr   r   r   pdb_uniprotsr   ry   )superr   r   r   rq   r   rH   rI   rJ   pdb_assembly
get_chainsr   
pdb_chainsmonomer_feature_pathuniprot_msa_pathr   
max_chainsr   filter_pdb_by_max_chainsr   r   r   r   r   )r   r   rt   r   r   r   r   r   r   kwargs	__class__s             r,   r   UnifoldMultimerDataset.__init__  sH    	V#	2" IIT^^(+>>@ "#
 //$*B*BC$&GGLL1?%A! "T^^^ L'',,t~~|D//99262O2O!2!2D4F4F3!/DOT/ AE@V@V""A$=DNDOT-=	  r:   c                    U R                  U5      u  p#nU(       a"  U/nU/nU R                  S U R                  pnS n
OU R                  U5      nXR                  ;   aZ  U R
                  S:X  aJ  U R                  U   S    Vs/ s H  nUS-   U-   PM     nnU R                  U   S    Vs/ s H  oPM     n
nOU R                  U   nS n
U Vs/ s H  oR                  U   PM     nnU R                  U R                  U R                  pn[        U R                  U R
                  U R                  XR                  -  UUUUUUU	U
SS9$ s  snf s  snf s  snf )Nry   chainsrk   opersFr   )r   r   r   get_pdb_namer  r   r  r   r  r  r   r   r   rt   r   )r   r   seq_idrQ   rw   r_   r^   r  r  r   r`   pdb_ididtchain_ids                  r,   r   "UnifoldMultimerDataset.__getitem__  s   ,0,=,=c,B)/!
I"8L$$"" 5? 
 #'&&x0F***tyyG/C #//7AA SL2%A  
  $008A'A!AA $ '# !OOF3	&*#CLCLx((29   ))%% 5?   KKIIIIoo-+% 4,  3
 	
''s   E	*EEc                    [        U 5      S::  a  g U  Vs/ s H  oS   PM	     nnU  Vs/ s H  oS   c  M
  US   PM     nn [        R                  " USS9nU(       d  S nX#4$ s  snf s  snf ! [         a    [	        SU5      ef = f)Nr   r   r   zcannot collate features)re   r
   r   BaseException
ValueError)r   ri   featslabss       r,   r   UnifoldMultimerDataset.collater  s     w<1&'w!1w'%:g1!g:	?++Eq9E D{ (:  	?6>>	?s   A#	A(	A(A- -Bc                 *    U R                  S5      S   $ )Nrk   r   )split)r   s    r,   r  #UnifoldMultimerDataset.get_pdb_name  s    {{3""r:   c                 ~    0 nU  H4  n[         R                  U5      nX1;  a  / X'   X   R                  U5        M6     U$ r   )r  r  append)canon_chain_mapr  r   pdbs       r,   r  !UnifoldMultimerDataset.get_chains  sF    
$E(55e<C$"$
O""5)	 %
 r:   c                    0 nU  HB  nXQ;   a  [        X   S   5      nXc::  a  X   XE'   M%  M'  [        X   5      nUS:X  d  M<  X   XE'   MD     U Vs0 s H#  n[        R                  U5      U;   d  M  XrU   _M%     nn[        R	                  S[        U 5      [        U5      -
   S[        U 5       S[        U5      [        U5      -
   S[        U5       SU 3
5        XH4$ s  snf )Nr  r   zfiltered out z / z PDBs (z chains) by max_chains )re   r  r  r   r   )	r  r  r   r  new_pdb_chainsr   sizer[   new_sample_weights	            r,   r  /UnifoldMultimerDataset.filter_pdb_by_max_chains  s    E$<.x89%,6,=N) & :,-19,6,=N)   #
"%2215G  AQ" 	 

 	C
Oc..AAB#c*oEV WM"S):%;;<CM@R?S T'L*	+ 00
s   C0	C)r   r   r   r  r  r   r  r  r   r   r  r   )r   r  r  r  mlc
ConfigDictr   r  r   boolr   r   r  r   r  r  r  r  __classcell__)r  s   @r,   r  r    s     "& $nn$ $ 	$
 $ $ 3-$ $ $ $B/
b   # #   1 1r:   r  )NFr   )NNNNF)Nr   NNF)r   NNF):r   loggingrH   typingr   r   r   r   r   r   r   ml_collectionsr3  numpyr3   r   unicore.datar	   r
   unicore.distributedr   r   r   data.data_opsr   r   data.processr   r   data.process_multimerr   r   r   r   r   RotationTranslationr  	OperationNumpyExampleTorchExample	getLoggerr   r   r4  r   r-   ndarrayr9   	lru_cacher5  rP   r]   rq   r   r   r   r  r   r:   r,   <module>rH     s     	 ? ?     3 :  / :B B H#uX{2334	Yi 99:Yi 99:			8	$NN
  3>>49$%	*.bjj .&.+-::. & &*	    c]  	 
   ' L /3 !+ 	* &*%)#5910s)1010 c]10 S	"	10
 }10 "$y/210 10 10p )-#"!.NN.
. . T)_%	.
 . }. sm. . .h #"!NN
  }	
 sm $O^ OdG1^ G1r:   