
    9i                        S r SSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKJ	r	J
r
JrJrJrJr  SSKrSSKJr  SSKJr  SSKJrJr  SSKJr   " S S	\5      r " S
 S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\#5      r$ " S S\#5      r% " S S\#5      r& " S  S!\#5      r'\RP                  \RP                  \RP                  \RR                  \RR                  \RP                  S".r*S#\RV                  S$\\,\,4   4S% jr-S&\,S'\\,\R                  4   S(\\R                     S$\.4S) jr/S*\,S$\\,\\,   4   4S+ jr0S,\,S$\\,\R                  4   4S- jr1  S^S#\RV                  S.\,S/\,S'\\,\R                  4   S(\R                  S0\2S1\2S$\.4S2 jjr3S3\,S4\,S5\Rh                  S$\\,\,\54   4S6 jr6S7\,S3\,S5\Rh                  S8\\5\54   S9\,S$\\,\\5\54   4   4S: jr7S;\Rp                  S<\Rp                  S=\24S> jr9S5\Rh                  S?\,S=\2S$\\Rp                  \Rp                  4   4S@ jr:S5\Rh                  S&\,SA\\5\54   S4\,S/\,S3\,S9\,S$\\
\,\	4   \\,   4   4SB jr;SC\,SD\,SE\\5   SF\\5   SG\,S$\\5\54   4SH jr<\Rz                  " SISJ9 " SK SL5      5       r>\R~                  " SMSNSO9SP 5       r@ S_S/\,S#\RV                  SQ\,SR\R                  S'\\,\R                  4   SS\\,\\,   4   S9\,ST\.S$\>4SU jjrA\Rz                  " SISJ9 " SV SW5      5       rB " SX SY\R                  5      rD " SZ S[\D5      rE " S\ S]\D5      rFg)`zBFunctions for getting templates and calculating template features.    N)AnyDictMappingOptionalSequenceTuple)logging)residue_constants)mmcifparsers)kalignc                       \ rS rSrSrSrg)Error    zBase class for exceptions. N__name__
__module____qualname____firstlineno____doc____static_attributes__r       o/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/science/unifold/msa/templates.pyr   r       s    $r   r   c                       \ rS rSrSrSrg)NoChainsError$   z?An error indicating that template mmCIF didn't have any chains.r   Nr   r   r   r   r   r   $   s    Ir   r   c                       \ rS rSrSrSrg)SequenceNotInTemplateError(   zDAn error indicating that template mmCIF didn't contain the sequence.r   Nr   r   r   r   r   r   (       Nr   r   c                       \ rS rSrSrSrg)NoAtomDataInTemplateError,   zFAn error indicating that template mmCIF didn't contain atom positions.r   Nr   r   r   r   r#   r#   ,       Pr   r#   c                       \ rS rSrSrSrg)TemplateAtomMaskAllZerosError0   zFAn error indicating that template mmCIF had all atom positions masked.r   Nr   r   r   r   r'   r'   0   r%   r   r'   c                       \ rS rSrSrSrg)QueryToTemplateAlignError4   zDAn error indicating that the query can't be aligned to the template.r   Nr   r   r   r   r*   r*   4   r!   r   r*   c                       \ rS rSrSrSrg)CaDistanceError8   z@An error indicating that a CA atom distance exceeds a threshold.r   Nr   r   r   r   r-   r-   8   s    Jr   r-   c                       \ rS rSrSrSrg)MultipleChainsError<   zCAn error indicating that multiple chains were found for a given ID.r   Nr   r   r   r   r0   r0   <   s    Mr   r0   c                       \ rS rSrSrSrg)PrefilterErrorA   z/A base class for template prefilter exceptions.r   Nr   r   r   r   r3   r3   A       9r   r3   c                       \ rS rSrSrSrg)	DateErrorE   zEAn error indicating that the hit date was after the max allowed date.r   Nr   r   r   r   r7   r7   E   s    Or   r7   c                       \ rS rSrSrSrg)AlignRatioErrorI   zHAn error indicating that the hit align ratio to the query was too small.r   Nr   r   r   r   r:   r:   I   s    Rr   r:   c                       \ rS rSrSrSrg)DuplicateErrorM   zGAn error indicating that the hit was an exact subsequence of the query.r   Nr   r   r   r   r=   r=   M   s    Qr   r=   c                       \ rS rSrSrSrg)LengthErrorQ   z/An error indicating that the hit was too short.r   Nr   r   r   r   r@   r@   Q   r5   r   r@   template_aatypetemplate_all_atom_masktemplate_all_atom_positionstemplate_domain_namestemplate_sequencetemplate_sum_probshitreturnc                     [         R                  " SU R                  5      nU(       d  [        SU R                   35      eUR	                  S5      R                  S5      u  p#UR                  5       U4$ )z0Returns PDB id and chain id for an HHSearch Hit.z[a-zA-Z\d]{4}_[a-zA-Z0-9.]+z)hit.name did not start with PDBID_chain: r   _)rematchname
ValueErrorgroupsplitlower)rI   id_matchpdb_idchain_ids       r   _get_pdb_id_and_chainrW   _   sf     xx6AH7zBD 	D~~a(..s3F<<>8##r   rU   release_datesrelease_date_cutoffc                 8    Uc  [        S5      eX;   a  X   U:  $ g)aR  Checks if the template date is after the release date cutoff.

Args:
    pdb_id: 4 letter pdb code.
    release_dates: Dictionary mapping PDB ids to their structure release dates.
    release_date_cutoff: Max release date that is valid for this query.

Returns:
    True if the template release date is after the cutoff, False otherwise.
z)The release_date_cutoff must not be None.F)rP   )rU   rX   rY   s      r   _is_after_cutoffr[   j   s0     "DEE$'::: r   obsolete_file_pathc                 ~   [        U 5       n0 nU H  nUR                  5       nUR                  S5      (       d  M+  [        U5      S:  a,  USS R	                  5       nUSS R	                  5       nXRU'   Mf  [        U5      S:X  d  Mw  USS R	                  5       nSX$'   M     UsSSS5        $ ! , (       d  f       g= f)zDParses the data file from PDB that lists which pdb_ids are obsolete.OBSLTE            !   N)openstrip
startswithlenrS   )r\   fresultlinefrom_idto_ids         r   _parse_obsoleterm      s    	 	!QD::<D
 x((t9r>"2bk//1G BK--/E&+7OY"_"2bk//1G&*FO   % 
"	!	!s   ,B.AB.B..
B<pathc                    U R                  S5      (       a  0 n[        U SSS9 nU Hp  nUR                  S5      u  pEUR                  5       n[        R                  " [        USS 5      [        USS	 5      [        US
S 5      S9XR                  5       '   Mr     SSS5        U$ [        SU -  5      e! , (       d  f       U$ = f)zHParses release dates file, returns a mapping from PDBs to release dates.txtrzutf-8)encoding:N            
   )yearmonthdayz+Invalid format of the release date file %s.)endswithrd   rR   re   datetimeintrP   )rn   rX   rh   rj   rU   dates         r   _parse_release_datesr      s    }}U$g.!#zz#zz| 190A0AT"1Xd1Qi.D2J1)lln-  / FMNN /. s   A7B33
Chit_pdb_codequery_sequencemax_subsequence_ratiomin_align_ratioc                    U R                   nU[        U5      -  nU R                  R                  SS5      n	[	        [        U	5      5      [        U5      -  n
X;   =(       a    X:  n[        XU5      (       a  [        SX1    SU S35      eX::  a  [        SU S35      eU(       a  [        SU
 S35      e[        U	5      S	:  a  [        S
[        U	5       S35      eg)a  Determines if template is valid (without parsing the template mmcif file).

Args:
    hit: HhrHit for the template.
    hit_pdb_code: The 4 letter pdb code of the template hit. This might be
        different from the value in the actual hit since the original pdb might
        have become obsolete.
    query_sequence: Amino acid sequence of the query.
    release_dates: Dictionary mapping pdb codes to their structure release
        dates.
    release_date_cutoff: Max release date that is valid for this query.
    max_subsequence_ratio: Exclude any exact matches with this much overlap.
    min_align_ratio: Minimum overlap between the template and query.

Returns:
    True if the hit passed the prefilter. Raises an exception otherwise.

Raises:
    DateError: If the hit date was after the max allowed date.
    AlignRatioError: If the hit align ratio to the query was too small.
    DuplicateError: If the hit was an exact subsequence of the query.
    LengthError: If the hit was too short.
- zDate () > max template date ().z@Proportion of residues aligned to query too small. Align ratio: .zMTemplate is an exact subsequence of query with large coverage. Length ratio: rx   zTemplate too short. Length: T)
aligned_colsrg   hit_sequencereplacefloatr[   r7   r:   r=   r@   )rI   r   r   rX   rY   r   r   r   align_ratiorG   length_ratio	duplicates               r   _assess_hhsearch_hitr      s$   @ ##L^!44K((00b9./03~3FFL
 	+ 	10  5HII]01 2#$B() 	) %'=+, 	, ''3nA78 	8 "*3/@+A*B!DF 	F r   template_chain_idrG   mmcif_objectc           
      4   UR                   nUR                  R                  U 5      nU(       a2  X;   a-  [        R                  " SUU 5        UR                  U5      nX@U4$ UR                  R                  5        HB  u  pdU(       d  M  X;   d  M  [        R                  " SUU5        UR                  U5      nXFU4s  $    U Vs/ s H  owS:X  a  SOSU-  PM     nn[        R                  " SR                  U5      5      nUR                  R                  5        HP  u  pd[        R                  " X5      n	U	(       d  M$  [        R                  " SUU5        U	R                  5       nXFU4s  $    [        SU< S	U < S
U< SUR                  < 35      es  snf )a)  Tries to find the template chain in the given pdb file.

This method tries the three following things in order:
    1. Tries if there is an exact match in both the chain ID and the sequence.
         If yes, the chain sequence is returned. Otherwise:
    2. Tries if there is an exact match only in the sequence.
         If yes, the chain sequence is returned. Otherwise:
    3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
         If yes, the chain sequence is returned.
If none of these succeed, a SequenceNotInTemplateError is thrown.

Args:
    template_chain_id: The template chain ID.
    template_sequence: The template chain sequence.
    mmcif_object: The PDB object to search for the template in.

Returns:
    A tuple with:
    * The chain sequence that was found to match the template in the PDB object.
    * The ID of the chain that is being returned.
    * The offset where the template sequence starts in the chain sequence.

Raises:
    SequenceNotInTemplateError: If no match is found after the steps described
        above.
z$Found an exact template match %s_%s.z"Found a sequence-only match %s_%s.Xr   z(?:%s|X)r   z(Found a fuzzy sequence-only match %s_%s.z(Could not find the template sequence in rL   z. Template sequence: z, chain_to_seqres: )file_idchain_to_seqresgetr	   infofinditemsrM   compilejoinsearchstartr   )
r   rG   r   rU   chain_sequencemapping_offsetrV   aaregexrN   s
             r   _find_template_in_pdbr      s}   < !!F!11556GHN,>;V&	(',,->?.@@ %1$@$@$F$F$H >0BLL=v!#+001BCN!^;; %I >OO=Nr#IS:?2=NEOJJrwwu~&E$0$@$@$F$F$H 		%05LLCV!#"[[]N!^;; %I %!'):<M!-!=!=	?@ @ Ps   Fold_template_sequenceold_mappingkalign_binary_pathc                 p   [         R                  " US9nUR                  R                  US5      nU(       d~  [	        UR                  5      S:X  aI  [
        R                  " SUUR                  5        [        UR                  R                  5       5      S   nO[        SU SUR                   S35      e [        R                  " UR                  X/5      5      nUR                  u  p[
        R                  " SUU	5        0 nSnSnSn[#        UU	5       H>  u  nnUS:w  a  US-  nUS:w  a  US-  nUS:w  d  M$  US:w  d  M,  UUU'   UU:X  d  M9  US-  nM@     [%        U5      ['        [	        U 5      [	        U5      5      -  S:  a&  [        SU < SUR                  < SU< SU< S3	5      e0 nUR)                  5        H  u  nnUR                  US5      UU'   M     UR+                  SS5      nUU4$ ! [         a7  n
[        S	U < S
U< SUR                  < SU< S[!        U
5      < 3
5      eSn
A
ff = f)a0  Aligns template from the mmcif_object to the query.

In case PDB70 contains a different version of the template sequence, we need
to perform a realignment to the actual sequence that is in the mmCIF file.
This method performs such realignment, but returns the new sequence and
mapping only if the sequence in the mmCIF file is 90% identical to the old
sequence.

Note that the old_template_sequence comes from the hit, and contains only that
part of the chain that matches with the query while the new_template_sequence
is the full chain.

Args:
    old_template_sequence: The template sequence that was returned by the PDB
        template search (typically done using HHSearch).
    template_chain_id: The template chain id was returned by the PDB template
        search (typically done using HHSearch). This is used to find the right
        chain in the mmcif_object chain_to_seqres mapping.
    mmcif_object: A mmcif_object which holds the actual template data.
    old_mapping: A mapping from the query sequence to the template sequence.
        This mapping will be used to compute the new mapping from the query
        sequence to the actual mmcif_object template sequence by aligning the
        old_template_sequence and the actual template sequence.
    kalign_binary_path: The path to a kalign executable.

Returns:
    A tuple (new_template_sequence, new_query_to_template_mapping) where:
    * new_template_sequence is the actual template sequence that was found in
        the mmcif_object.
    * new_query_to_template_mapping is the new mapping from the query to the
        actual template found in the mmcif_object.

Raises:
    QueryToTemplateAlignError:
    * If there was an error thrown by the alignment tool.
    * Or if the actual template sequence differs by more than 10% from the
        old_template_sequence.
)binary_pathr      zICould not find %s in %s, but there is only 1 sequence, so using that one.r   zCould not find chain z in zR. If there are no mmCIF parsing errors, it is possible it was not a protein chain.zCould not align old template z to template z (rL   z
). Error: Nz1Old aligned template: %s
New aligned template: %sr   g?z9Insufficient similarity of the sequence in the database: z* to the actual sequence in the mmCIF file z: z. We require at least 90 % similarity wrt to the shorter of the sequences. This is not a problem unless you think this is a template that should be included.)r   Kalignr   r   rg   r	   r   r   listvaluesr*   r   	parse_a3malign	sequences	Exceptionstrzipr   minr   r   )r   r   r   r   r   alignernew_template_sequence
parsed_a3mold_aligned_templatenew_aligned_templateeold_to_new_template_mappingold_template_indexnew_template_indexnum_sameold_template_aanew_template_aanew_query_to_template_mappingquery_indexs                      r   _realign_pdb_template_to_queryr   1  s   Z mm(:;G(88<<2
 !|++,1LL"!$$	 %),,335%778%:! ,'(9':$|?S?S>T U! !" "
&&MM0HIK
5?5I5I2 LL< #%H,/0D0D-F(c!!#c!!#c!o&<&8 ("$/1A-F 	h#+,c2G.HI	J 	 ( &$$!%	
 
	 %'!+6+<+<+>''6::"B( 	&	 ,?
 299#rB "???u  	' &%$$!A 		s   74G4 4
H5>2H00H5all_positionsall_positions_maskmax_ca_ca_distancec                 *   [         R                  S   nSnSn[        [        X5      5       Hd  u  nu  px[	        X   5      n	U	(       aF  Xs   n
U(       a9  [
        R                  R                  X-
  5      nX:  a  [        SXfS-   X4-  5      eU
nU	nMf     g)z@Checks if the distance between unmasked neighbor residues is ok.CAFNz9The distance between residues %d and %d is %f > limit %f.r   )	r
   
atom_order	enumerater   boolnplinalgnormr-   )r   r   r   ca_positionprev_is_unmaskedprev_calphaicoordsmaskthis_is_unmaskedthis_calphadistances               r   _check_residue_distancesr     s     $..t4KK&s='MN>F 12 -K99>>+*CD0)S!eXBCD D &K+ Or   auth_chain_idc                   ^ [        U R                  U   5      nU R                  R                  5        Vs/ s H  oDR                  U:X  d  M  UPM     nn[        U5      S:w  a  [        SU S35      eUS   n[        R                  " U[        R                  S/5      n[        R                  " U[        R                  /[        R                  S9n[        U5       GH  n	[        R                  " [        R                  S/[        R                  S9n
[        R                  " [        R                  /[        R                  S9mU R                  U   U	   nUR                  (       Gd9  UUR                  UR                   R"                  UR                   R$                  4   nUR'                  5        H  nUR)                  5       nUR+                  5       u  nnnU[        R,                  R/                  5       ;   a1  UUU/U
[        R,                  U   '   ST[        R,                  U   '   Mz  UR1                  5       S:X  d  M  UR3                  5       S	:X  d  M  UUU/U
[        R,                  S
   '   ST[        R,                  S
   '   M     [        R,                  S   n[        R,                  S   n[        R,                  S   nUR3                  5       S:X  a  [5        U4S jUUU4 5       5      (       a  [        R6                  R9                  U
U   U
U   -
  5      [        R6                  R9                  U
U   U
U   -
  5      :  aZ  U
U   R;                  5       U
U   R;                  5       sU
U'   U
U'   TU   R;                  5       TU   R;                  5       sTU'   TU'   XU	'   TX'   GM     [=        XxU5        Xx4$ s  snf )z?Gets atom positions and mask from a list of Biopython Residues.r   z0Expected exactly one chain in structure with id r   r      dtypeg      ?SEMSESDCDNH1NH2ARGc              3   .   >#    U  H
  nTU   v   M     g 7fNr   ).0
atom_indexr   s     r   	<genexpr>&_get_atom_positions.<locals>.<genexpr>  s     N~D,~s   )rg   r   	structure
get_chainsidr0   r   zerosr
   atom_type_numint64rangefloat32seqres_to_structure
is_missinghetflagpositionresidue_numberinsertion_code	get_atomsget_name	get_coordr   keysupperget_resnameallr   r   copyr   )r   r   r   num_rescrelevant_chainschainr   r   	res_indexposres_at_positionresatom	atom_namexyzcdnh1nh2r   s                        @r   _get_atom_positionsr    sM    ,..}=>G  ))4466a$$-:O6   ?q !>}oQO
 	
 AEHHg'8'F'FJKM7,=,K,K"L(*27^	hh)77;2::Nxx*889L&::=I)))''((77((77 C
  MMO	..*1a 1 < < A A CCDEq!9C)44Y?@DGD*55i@A__&$.3??3D3M?@!QiC)44T:;?BD*55d;< ( #--d3B#..u5C#..u5C!U*NC~NNNC3r7(:;C3r7(:;<%(X]]_c#hmmo"C#c('+Cy~~'7c9I$S	49#&i (,%G $H ]/1,,es   N?N?mappingc                    U b  U R                   (       d  [        SU< SU< 35      eSn [        UUU S9u  pn
 [        X	SS9u  p[        R                  " UUR                  S
   5      n[        R                  " XR                  S
   5      n/ n/ n/ nU H|  nUR!                  [        R"                  " [$        R&                  S45      5        UR!                  [        R"                  " [$        R&                  5      5        UR!                  S5        M~     UR)                  5        H)  u  nnUU
-   nUU   S
   UU'   UU   S
   UU'   UU   UU'   M+     [        R*                  " U5      S:  aF  [-        SUU	[/        UR1                  5       5      U
-   [3        UR1                  5       5      U
-   4-  5      eSR5                  U5      n[$        R6                  " U[$        R8                  5      n[        R:                  " U5      [        R:                  " U5      UR=                  5       [        R:                  " U5      UR?                  5        SU	 3R=                  5       S.U4$ ! [         aU    Un	SU SU SU	 S3n[        R
                  " U5        [        UUU UUS9u  p[        R                  " S	UU	UU5        UnS
n
 GNf = f! [        [        4 a&  n[        SU< SU	< S[        U5      < 35      UeSnAff = f)a1  Parses atom positions in the target structure and aligns with the query.

Atoms for each residue in the template structure are indexed to coincide
with their corresponding residue in the query sequence, according to the
alignment mapping provided.

Args:
    mmcif_object: mmcif_parsing.MmcifObject representing the template.
    pdb_id: PDB code for the template.
    mapping: Dictionary mapping indices in the query sequence to indices in
        the template sequence.
    template_sequence: String describing the amino acid sequence for the
        template protein.
    query_sequence: String describing the amino acid sequence for the query
        protein.
    template_chain_id: String ID describing which chain in the structure proto
        should be used.
    kalign_binary_path: The path to a kalign executable used for template
            realignment.

Returns:
    A tuple with:
    * A dictionary containing the extra features derived from the template
        protein structure.
    * A warning message if the hit was realigned to the actual mmCIF sequence.
        Otherwise None.

Raises:
    NoChainsError: If the mmcif object doesn't contain any chains.
    SequenceNotInTemplateError: If the given chain id / sequence can't
        be found in the mmcif object.
    QueryToTemplateAlignError: If the actual template in the mmCIF file
        can't be aligned to the query.
    NoAtomDataInTemplateError: If the mmcif object doesn't contain
        atom positions.
    TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
        unmasked residues.
NzNo chains in PDB: rL   )r   rG   r   zThe exact sequence z was not found in z1. Realigning the template to the actual sequence.)r   r   r   r   r   z2Sequence in %s_%s: %s successfully realigned to %sr   g     b@)r   zCould not get atom data (z): r   r   ru   zATemplate all atom mask was all zeros: %s_%s. Residue range: %d-%dr   )rE   rD   rG   rC   rF   ) r   r   r   r   r	   warningr   r   r  r-   KeyErrorr#   r   r   rR   shapeappendr   r
   r   r   sumr'   r   r   maxr   sequence_to_onehotHHBLITS_AA_TO_IDarrayencoderS   )r   rU   r  rG   r   r   r   r  seqresrV   r   all_atom_positionsall_atom_maskexall_atom_masksoutput_templates_sequencetemplates_all_atom_positionstemplates_all_atom_masksrL   kvtemplate_indextemplates_aatypes                          r   _extract_template_featuresr,    s2   ^ <#?#?#%68 9 	9 G!+@//%,
(.DM -@u->) "4"4":":1"=?XXm-@-@-CDN "#% !$++HH'55q9:	< ''HH&445	7!((-  1^+*<^*LQ*O$Q'&4^&DQ&G #'8';!!$	   
vv&'!++OGNN$%6GNN$%6	 	 !#(A B(;;!#4#E#EG HH12&(hh/G&H!:!A!A!C!xx(89(.'7q
%C%J%J%L	
 	
 
Y &  %!"3!44Fhaz!RT 	 	 8"3/%1
 	@	
 #7D X& M')/3r7)D EJL	MMs)   I J? AJ<;J<?K5!K00K5hit_query_sequencer   indices_hitindices_queryoriginal_query_sequencec                    U (       d  0 $ U R                  SS5      nUR                  SS5      nUR                  U5      n[        S U 5       5      nU Vs/ s H  oS:  a  X-
  OSPM     n	n[        S U 5       5      nU Vs/ s H  oS:  a  X-
  OSPM     n
n0 n[        X5       H=  u  pUS:w  d  M  US:w  d  M  U[	        U5      :  d  X-   [	        U5      :  a  M7  XX-   '   M?     U$ s  snf s  snf )a  Gets mapping from indices in original query sequence to indices in the hit.

hit_query_sequence and hit_sequence are two aligned sequences containing gap
characters. hit_query_sequence contains only the part of the original query
sequence that matched the hit. When interpreting the indices from the .hhr, we
need to correct for this to recover a mapping from original query sequence to
the hit sequence.

Args:
    hit_query_sequence: The portion of the query sequence that is in the .hhr
        hit
    hit_sequence: The portion of the hit sequence that is in the .hhr
    indices_hit: The indices for each aminoacid relative to the hit sequence
    indices_query: The indices for each aminoacid relative to the original query
        sequence
    original_query_sequence: String describing the original query sequence.

Returns:
    Dictionary with indices in the original query sequence as keys and indices
    in the hit sequence as values.
r   r   c              3   4   #    U  H  oS :  d  M
  Uv   M     g7fr   Nr   r   r  s     r   r   4_build_query_to_hit_index_mapping.<locals>.<genexpr>  s     3[F!![   		r   c              3   4   #    U  H  oS :  d  M
  Uv   M     g7fr3  r   r4  s     r   r   r5    s     5]"f!!]r6  )r   r   r   r   rg   )r-  r   r.  r/  r0  hhsearch_query_sequencehhsearch_query_offsetmin_idxr  fixed_indices_hitfixed_indices_queryr  q_iq_ts                 r   !_build_query_to_hit_index_mappingr?    s   : 	 188bA''R0L388! 3[33G@KL1BB6L5]55G/</<!2v2%}  
 G+?"9c,''3+F#+K- ,-36C/0 @ N! Ms   C)C.T)frozenc                   P    \ rS rSr% \\\\4      \S'   \\   \S'   \\   \S'   Sr	g)SingleHitResulti  featureserrorr  r   N)
r   r   r   r   r   r   r   r   __annotations__r   r   r   r   rB  rB    s*    wsCx())C=c]r   rB     F)typedc                 t    [        U S5       nUR                  5       nS S S 5        U$ ! , (       d  f       W$ = f)Nrq   )rd   read)rn   rh   	file_datas      r   
_read_filerK    s1    	dCAFFH	 
 
s   (
7	mmcif_dirmax_template_dateobsolete_pdbsstrict_error_checkc                 2   [        U5      u  pX;   a  XX   c  [        SSSU S3S9$ X;  a	  X;   a  XX   n [        UUU UUS9  [        UR                  UR                  UR                  UR                  U 5      nUR                  R                  S	S
5      n[         R"                  R%                  X(S-   5      n[
        R&                  " SUU U5        [)        U5      n[*        R,                  " XS9nUR.                  b  [0        R0                  R3                  UR.                  R4                  S   S5      nUU:  aB  SU< SU< SU< S3nU(       a  [        SUSS9$ [
        R&                  " U5        [        SSSS9$  [7        UR.                  UUUU U	US9u  nnUR8                  c  S/US'   OUR8                  /US'   [        USUS9$ ! [         aq  n
SU SU	 S[	        U
5       3n[
        R                  " U5        U(       a+  [        U
[        [        45      (       a  [        SUSS9s Sn
A
$ [        SSSS9s Sn
A
$ Sn
A
ff = f! [:        [<        [>        4 al  n
U< SU	< SUR8                  < SUR@                  < S[	        U
5      < SURB                  < 3nU(       a  [        SUSS9s Sn
A
$ [        SSUS9s Sn
A
$ Sn
A
f[D         aG  n
SUU	UR8                  UR@                  [	        U
5      URB                  4-  n[        SUSS9s Sn
A
$ Sn
A
ff = f)z>Tries to extract template features from a single HHSearch hit.NzHit z is obsolete.)rC  rD  r  )rI   r   r   rX   rY   zhit rL   z did not pass prefilter: r   r   z.cifz2Reading PDB entry from %s. Query: %s, template: %s)r   mmcif_stringrelease_date%Y-%m-%dz	Template z date (r   r   )r   rU   r  rG   r   r   r   r   rH   z (sum_probs: z, rank: z): feature extracting errors: z, mmCIF parsing errors: zZ%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: %s, mmCIF parsing errors: %s)#rW   rB  r   r3   r   r	   r   
isinstancer7   r=   r?  queryr   r.  r/  r   osrn   r   debugrK  r   parser   r}   strptimeheaderr,  	sum_probsr   r#   r'   indexerrorsr   )r   rI   rL  rM  rX   rN  r   rO  r   hit_chain_idr   msgr  rG   cif_path
cif_stringparsing_resulthit_release_daterD  rC  realign_warningr  s                         r   _process_single_hitre    s5    "7s!;L $)D)L<.68 	8
 (((6LH%)' 1	
  0		3;K;K03030A0A0>@G ((00b9ww||If'<=HMM<	 H%J[[ON"".#,,55''..~>
L// !E
 "&!> > e$&D$OO4I$>'44/)*1%
!/ == ./SH)*.1mm_H)*
 T?D 	DA  H\N!L>1J3q6(SS*QN0K"L"L"DTJJD$GGHF %) O 		A%% 	 "DNN"DgNN I+		A%%. 	 E4HHIsh   G =A
I 
IAH>(I.
H>8I>ILAK,L2
K<LL<LLLc                   J    \ rS rSr% \\\4   \S'   \\   \S'   \\   \S'   Sr	g)TemplateSearchResultih  rC  r]  warningsr   N)
r   r   r   r   r   r   r   rE  r   r   r   r   r   rg  rg  h  s%    c3hSMsmr   rg  c                       \ rS rSrSr SS\S\S\S\S\\   S\\   S	\4S
 jjr	\
R                  S\S\\R                     S\4S j5       rSrg)TemplateHitFeaturizerio  zFAn abstract base class for turning template hits to template features.rL  rM  max_hitsr   release_dates_pathobsolete_pdbs_pathrO  c                    Xl         [        R                  " [        R                  R	                  U R                   S5      5      (       d9  [
        R                  " SU R                   5        [        SU R                    35      e [        R                  R                  US5      U l
        X0l        X@l        Xpl        U(       a(  [
        R                  " SU5        [        U5      U l        O0 U l        U(       a(  [
        R                  " SU5        [#        U5      U l        g0 U l        g! [         a    [        S5      ef = f)	a  Initializes the Template Search.

Args:
    mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
        is found by HHSearch, this directory is used to retrieve the template
        data.
    max_template_date: The maximum date permitted for template structures. No
        template with date higher than this date will be returned. In ISO8601
        date format, YYYY-MM-DD.
    max_hits: The maximum number of templates that will be returned.
    kalign_binary_path: The path to a kalign executable used for template
        realignment.
    release_dates_path: An optional path to a file with a mapping from PDB IDs
        to their release dates. Thanks to this we don't have to redundantly
        parse mmCIF files to get that information.
    obsolete_pdbs_path: An optional path to a file containing a mapping from
        obsolete PDB IDs to the PDB IDs of their replacements.
    strict_error_check: If True, then the following will be treated as errors:
        * If any template date is after the max_template_date.
        * If any template has identical PDB ID to the query.
        * If any template is a duplicate of the query.
        * Any feature computation errors.
z*.cifzCould not find CIFs in %szCould not find CIFs in rS  z9max_template_date must be set and have format YYYY-MM-DD.z#Using precomputed release dates %s.z#Using precomputed obsolete pdbs %s.N)
_mmcif_dirglobrV  rn   r   r	   rD  rP   r}   rY  _max_template_date	_max_hits_kalign_binary_path_strict_error_checkr   r   _release_datesrm   _obsolete_pdbs)selfrL  rM  rk  r   rl  rm  rO  s           r   __init__TemplateHitFeaturizer.__init__r  s	   B $yydoow?@@MM5tG6t6GHII	M&.&7&7&@&@!:'/D#
 "#5 #5 LL>+-"67I"JD"$DLL>+-"12D"ED"$D'  	MKM M	Ms   %D( (D>r   hitsrJ   c                     g)z0Computes the templates for given query sequence.Nr   )rw  r   rz  s      r   get_templates#TemplateHitFeaturizer.get_templates  s    r   )rs  rr  rq  ro  rv  ru  rt  NF)r   r   r   r   r   r   r~   r   r   rx  abcabstractmethodr   r   TemplateHitrg  r|  r   r   r   r   rj  rj  o  s    P $)<%<% <% 	<%
  <% %SM<% %SM<% !<%| 	?"%?7../?4H? ?r   rj  c                   H    \ rS rSrSrS\S\\R                     S\	4S jr
Srg)	HhsearchHitFeaturizeri  z@A class for turning a3m hits from hhsearch to template features.r   rz  rJ   c                    [         R                  " SU5        0 n[         H  n/ X4'   M	     Sn/ n/ n[        US SS9 GH/  nXPR                  :  a    GO[        UUU R                  U R                  U R                  U R                  U R                  U R                  S9n	U	R                  (       a  UR                  U	R                  5        U	R                  (       a  UR                  U	R                  5        U	R                  c:  [         R                  " SUR                   U	R                  U	R                  5        GM  US-  nU H#  n
X:   R                  U	R                  U
   5        M%     GM2     U HX  nUS:  a1  ["        R$                  " X;   SS	9R'                  [        U   5      X;'   M:  ["        R(                  " / [        U   S
9X;'   MZ     [+        X6US9$ )EComputes the templates for given query sequence (more details above).Searching for template for: %sr   c                     U R                   $ r   r[  r  s    r   <lambda>5HhsearchHitFeaturizer.get_templates.<locals>.<lambda>  s    akkr   Tkeyreverser   rI   rL  rM  rX   rN  rO  r   .Skipped invalid hit %s, error: %s, warning: %sr   axisr   rC  r]  rh  )r	   r   TEMPLATE_FEATURESsortedrr  re  ro  rq  ru  rv  rt  rs  rD  r  r  rC  rO   r   stackastyper  rg  )rw  r   rz  template_featurestemplate_feature_namenum_hitsr]  rh  rI   ri   r(  rO   s               r   r|  #HhsearchHitFeaturizer.get_templates  s    	5~F%6!794 &7 $$94HC>>)(-//"&"9"9"11"11#'#;#;#'#;#;	F ||fll+ ~~/&DHHLLNN	 A*A%(//0BC +E IJ &D!|*,((%++"F#4T#:; "'
 +-((/5+7!' & $&J 	Jr   r   Nr   r   r   r   r   r   r   r   r  rg  r|  r   r   r   r   r  r    s1    J>J"%>J7../>J4H>Jr   r  c                   H    \ rS rSrSrS\S\\R                     S\	4S jr
Srg)	HmmsearchHitFeaturizeri  zAA class for turning a3m hits from hmmsearch to template features.r   rz  rJ   c                    [         R                  " SU5        0 n[         H  n/ X4'   M	     [        5       n/ n/ nU(       a  US   R                  c  UnO[        US SS9nU GH\  n	[        U5      U R                  :  a    GOA[        UU	U R                  U R                  U R                  U R                  U R                  U R                  S9n
U
R                  (       a  UR!                  U
R                  5        U
R"                  (       a  UR!                  U
R"                  5        U
R$                  c:  [         R&                  " SU	R(                  U
R                  U
R"                  5        GM  U
R$                  S   nX;   a  GM"  UR+                  U5        U H#  nX<   R!                  U
R$                  U   5        M%     GM_     U(       a:  U H2  n[,        R.                  " X=   SS	9R1                  [        U   5      X='   M4     GO<[        U5      n[,        R2                  " S
U[        [4        R6                  5      4[,        R8                  5      [,        R2                  " S
U[4        R:                  4[,        R8                  5      [,        R2                  " S
U[4        R:                  S4[,        R8                  5      [,        R<                  " SR?                  5       /[,        R@                  S9[,        R<                  " SR?                  5       /[,        R@                  S9[,        R<                  " S/[,        R8                  S9S.n[C        X6US9$ )r  r  r   c                     U R                   $ r   r  r  s    r   r  6HmmsearchHitFeaturizer.get_templates.<locals>.<lambda>  s    Q[[r   Tr  r  r  rG   r  r   r   r   r   rB   r  )"r	   r   r  setr[  r  rg   rr  re  ro  rq  ru  rv  rt  rs  rD  r  r  rC  rW  rO   addr   r  r  r   r
   restypes_with_x_and_gapr   r   r  r  object_rg  )rw  r   rz  r  r  already_seenr]  rh  sorted_hitsrI   ri   already_seen_keyr(  rO   r  s                  r   r|  $HmmsearchHitFeaturizer.get_templates  s    	5~F%6!794 &7 utAw((0K +@$OKC< DNN2(-//"&"9"9"11"11#'#;#;#'#;#;	F ||fll+ ~~/&DHHLLNN	 $*??3F#G #3  !12*A%(//0BC +K P )*,((%++"F#4T#:; "' *
 .)G )AA"C DJJ !W&7&E&EF% !W&7&E&EqI% "))+bjj9"))+bjj9!BJJ/%!( $&J 	Jr   r   Nr  r   r   r   r  r    s5    KXJ"%XJ7../XJ4HXJr   r  )gffffff?g?r~  )Gr   r  dataclassesr}   	functoolsrp  rV  rM   typingr   r   r   r   r   r   numpyr   abslr	   &modelscope.models.science.unifold.datar
   %modelscope.models.science.unifold.msar   r   +modelscope.models.science.unifold.msa.toolsr   r   r   r   r   r#   r'   r*   r-   r0   r3   r7   r:   r=   r@   r   r  r  r  r   rW   r   r[   rm   r   r   r   MmcifObjectr~   r   r   ndarrayr   r  r,  r?  	dataclassrB  	lru_cacherK  re  rg  ABCrj  r  r  r   r   r   <module>r     sT   I 
     	 	 @ @   D @ >%I %JE JO OQ QQE QO OKe KN% N
:Y :P PSn SR^ R:. :
 zz jj#%::ZZ** $w22 $uS#X $3 1 112 "("3"34 
	2 Xc]8J0K .Os OwsH4E4E/E'F O4 $( ?			?? ? 3 1 112	?
 "**? !? ? 
?D>@>@36>@''>@,1#sC-,@>@BA@A@A@ ##A@ c"	A@
 A@ 3S!!"A@H,BJJ ,13,16,*8-''8-8;8-!8-&+BJJ

,B&C8-vP##PP S#XP 	P
 P P P 4S>8C=()Pf888 #8 C=	8
 !8 S#X8v d#  $ Ru% &  %DIDI			DI DI  ((	DI
 3 1 112DI 3-.DI DI DI DIN d#  $E?CGG E?PAJ1 AJH[J2 [Jr   