
    9i|-                     V   S r SSKrSSKJrJrJrJrJrJr  SSK	r
SSKJr  SSKJr  SSKJrJrJr  SSKJrJrJrJr  \\\
R2                  4   r\\R6                  \R8                  4   rS\S	\S
\S\4S jrS\\R@                     S\4S jr!S\S\S\S\"S\\\4   4
S jr# " S S5      r$g)z@Functions for building the input features for the unifold model.    N)AnyMappingMutableMappingOptionalSequenceUnion)logging)residue_constants)msa_identifiersparsers	templates)hhblitshhsearch	hmmsearch	jackhmmersequencedescriptionnum_resreturnc                 0   0 n[         R                  " U [         R                  SS9US'   [        R                  " U4[        R
                  S9US'   [        R                  " UR                  S5      /[        R                  S9US'   [        R                  " [        U5      [        R
                  S9US'   [        R                  " U/U-  [        R
                  S9US	'   [        R                  " U R                  S5      /[        R                  S9US
'   U$ )z/Constructs a feature dict of sequence features.T)r   mappingmap_unknown_to_xaatypedtypebetween_segment_residuesutf-8domain_nameresidue_index
seq_lengthr   )
r
   sequence_to_onehotrestype_order_with_xnpzerosint32arrayencodeobject_range)r   r   r   featuress       n/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/science/unifold/msa/pipeline.pymake_sequence_featuresr,       s     H*==!66HX
 ,.88WK:<((,DH'( hh(:(:7(C'D-/ZZ9H] "wrxx HH_XXwi'&9JH\88X__W%=$>*,**6HZO    msasc           	         U (       d  [        S5      e/ n/ n/ n[        5       n[        U 5       H  u  pVU(       d  [        SU S35      e[        UR                  5       H  u  pxX;   a  M  UR	                  U5        UR                  U V	s/ s H  n	[        R                  U	   PM     sn	5        UR                  UR                  U   5        [        R                  " UR                  U   5      n
UR                  U
R                  R                  S5      5        M     M     [        U S   R                  S   5      n[        U5      n0 n[        R                   " U[        R"                  S9US'   [        R                   " U[        R"                  S9US'   [        R                   " U/U-  [        R"                  S9US	'   [        R                   " U[        R$                  S9US
'   U$ s  sn	f )z*Constructs a feature dict of MSA features.z"At least one MSA must be provided.zMSA z$ must contain at least one sequence.r   r   r   deletion_matrix_intmsanum_alignmentsmsa_species_identifiers)
ValueErrorset	enumerate	sequencesaddappendr
   HHBLITS_AA_TO_IDdeletion_matrixr   get_identifiersdescriptions
species_idr'   lenr#   r&   r%   r(   )r.   int_msar;   species_idsseen_sequences	msa_indexr1   sequence_indexr   residentifiersr   r2   r*   s                 r+   make_msa_featuresrG   4   s   =>>GOKUN#D/	yk!EFH H(1#--(@$N)x(NNDLMHS"33C8HMO""3#6#6~#FG)99  02K{55<<WEF )A	 * $q'##A&'G\NH&(hhbhh&OH"#hhwbhh7HUO!#	7""(("4H*,((2::+'H&'O Ns   
G'input_fasta_pathmsa_out_path
msa_formatuse_precomputed_msasc                    U(       a$  [         R                  R                  U5      (       d>  U R                  U5      S   n[	        US5       nUR                  XS   5        SSS5        U$ [        R                  " SU5        [	        USSS9 nX6R                  5       0nSSS5        U$ ! , (       d  f       U$ = f! , (       d  f       W$ = f)z:Runs an MSA tool, checking if output already exists first.r   wNzReading MSA from file %srr   encoding)	ospathexistsqueryopenwriter	   warningread)
msa_runnerrH   rI   rJ   rK   resultfs          r+   run_msa_toolr\   X   s      rww~~l'C'C!!"23A6,$GGF&' % M 	2LA,g6! &&(+F 7M %$ M 76Ms   B(B:(
B7:
C	c                       \ rS rSrSr   SS\S\S\S\S\\   S\\   S	\\   S
\\   S\S\R                  S\
S\S\S\
4S jjrS\S\S\4S jrS\S\S\4S jrSrg)DataPipelinek   z:Runs the alignment tools and assembles the input features.jackhmmer_binary_pathhhblits_binary_pathuniref90_database_pathmgnify_database_pathbfd_database_pathuniclust30_database_pathsmall_bfd_database_pathuniprot_database_pathtemplate_searchertemplate_featurizeruse_small_bfdmgnify_max_hitsuniref_max_hitsrK   c                 b   Xl         [        R                  " UUS9U l        U(       a  [        R                  " UUS9U l        O[
        R                  " UXV/S9U l        [        R                  " UUS9U l        [        R                  " UUS9U l	        Xl
        Xl        Xl        Xl        Xl        g)zInitializes the data pipeline.)binary_pathdatabase_path)rn   	databasesN)_use_small_bfdr   	Jackhmmerjackhmmer_uniref90_runnerjackhmmer_small_bfd_runnerr   HHBlitshhblits_bfd_uniclust_runnerjackhmmer_mgnify_runnerjackhmmer_uniprot_runnerrh   ri   rk   rl   rK   )selfr`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rK   s                  r+   __init__DataPipeline.__init__n   s    $ ,)2)<)<-0*2& .7.A.A15/7D+ 07/,G0D, (1':':-.(0$ )2(;(;-/)1% "3#6 ..$8!r-   rH   msa_output_dirr   c                 F	   [        USS9 nUR                  5       nSSS5        [        R                  " W5      u  pV[	        U5      S:w  a  [        SU S35      eUS   nUS   n[	        U5      n	[        R                  R                  US5      n
[        U R                  UU
S	U R                  5      n[        R                  R                  US
5      n[        U R                  UUS	U R                  5      nUS	   n[        R                  " XR                  S9n[        R                  " U5      n[        R                   " U5      nU R"                  R$                  S	:X  a  U R"                  R'                  U5      nOnU R"                  R$                  S:X  a2  [        R(                  " U5      nU R"                  R'                  U5      nO"[        SU R"                  R$                   35      e[        R                  R                  USU R"                  R*                   35      n[        US5       nUR-                  U5        SSS5        [        R.                  " US	   5      nUR1                  U R                  S9n[        R.                  " US	   5      nUR1                  U R2                  S9nU R"                  R5                  XS9nU R6                  (       a]  [        R                  R                  US5      n[        U R8                  UUS	U R                  5      n[        R.                  " US	   5      nO\[        R                  R                  US5      n[        U R:                  UUSU R                  5      n[        R<                  " US   5      nU R>                  RA                  UUS9n[C        UUU	S9n[E        UUU45      n[F        RH                  " S[	        U5      5        [F        RH                  " S[	        U5      5        [F        RH                  " S[	        U5      5        [F        RH                  " SUS   S   5        [F        RH                  " SURJ                  S   RL                  S   5        0 UEUEURJ                  E$ ! , (       d  f       GNj= f! , (       d  f       GN[= f)z@Runs alignment tools on the input sequence and creates features.r   rO   N   z&More than one input sequence found in .r   zuniref90_hits.stostozmgnify_hits.sto)max_sequencesa3mz$Unrecognized template input format: z	pdb_hits.rM   max_seqs)output_stringinput_sequencezsmall_bfd_hits.stozbfd_uniclust_hits.a3m)query_sequencehits)r   r   r   z Uniref90 MSA size: %d sequences.zBFD MSA size: %d sequences.zMGnify MSA size: %d sequences.z,Final (deduplicated) MSA size: %d sequences.r2   zbTotal number of templates (NB: this can include bad templates and is later filtered to top 4): %d.template_domain_names)'rU   rX   r   parse_fastar?   r4   rQ   rR   joinr\   rs   rK   rw   truncate_stockholm_msarl   deduplicate_stockholm_msa'remove_empty_columns_from_stockholm_msarh   input_formatrT   convert_stockholm_to_a3moutput_formatrV   parse_stockholmtruncaterk   get_template_hitsrq   rt   rv   	parse_a3mri   get_templatesr,   rG   r	   infor*   shape)ry   rH   r|   r[   input_fasta_str
input_seqsinput_descsr   input_descriptionr   uniref90_out_pathjackhmmer_uniref90_resultmgnify_out_pathjackhmmer_mgnify_resultmsa_for_templatespdb_templates_resultuniref90_msa_as_a3mpdb_hits_out_pathuniref90_msa
mgnify_msapdb_template_hitsbfd_out_pathjackhmmer_small_bfd_resultbfd_msahhblits_bfd_uniclust_resulttemplates_resultsequence_featuresmsa_featuress                               r+   processDataPipeline.process   sy    "W5ffhO 6")"5"5o"F
z?a89I8J!LN N#A'Nn%GGLL9LM$0**%%%
! '',,~7HI".((%%#
 6e<#::-A-AC#==#KK !!..%7#'#9#9#?#?!$# ##00E9")"B"B!###'#9#9#?#?#$%  C $ 6 6 C CDF G G GGLLi(>(>(L(L'MNP#S)QGG() * ..%e,.#,,d6J6J,K,,-DU-KL
(($2F2F(G
 22DD. E O 77<<8LML)5// ))*& --*513G 77<<(?AL*600 ))+' ''(CE(JKG33AA)0A B C 3#)
 ),)LM7\9JK2CLA5s:G:)*1-	
 	=%%&=>DDQG	



 ''
 	
W 65b *)s   Q?+R?
R
R c                     [         R                  R                  US5      n[        U R                  UUSU R
                  5      n[        R                  " US   5      nUR                  SS9n[        U/5      nU$ )Nzuniprot_hits.stor   iP  r   )
rQ   rR   r   r\   rx   rK   r   r   r   rG   )ry   rH   r|   uniprot_pathuniprot_resultr1   all_seq_dicts          r+   process_uniprotDataPipeline.process_uniprot  su    ww||N4FG%))%%
 %%nU&;<llEl*(#/r-   )rq   rv   rw   rt   rx   rs   rk   ri   rh   rl   rK   N)i  i'  F)__name__
__module____qualname____firstlineno____doc__strr   TemplateSearcherr   TemplateHitFeaturizerboolintrz   FeatureDictr   r   __static_attributes__ r-   r+   r^   r^   k   s    D  #$%*)9")9 !)9 !$	)9
 ")9 $C=)9 #+3-)9 "*#)9  (})9 ,)9 '<<)9 )9 )9 )9 #)9Vr
 r
 #r
(3r
h (+0;r-   r^   )%r   rQ   typingr   r   r   r   r   r   numpyr#   abslr	   &modelscope.models.science.unifold.datar
   %modelscope.models.science.unifold.msar   r   r   +modelscope.models.science.unifold.msa.toolsr   r   r   r   r   ndarrayr   HHSearch	Hmmsearchr   r   r,   MsarG   r   r\   r^   r   r-   r+   <module>r      s    G 	 J J   D> >O O S"**_-**I,?,??@ S s $',7(!HW[[1 !k !H  	
  S#X&o or-   