
    9i[              
          S r SSKrSSKrSSKrSSKrSSKrSSKJrJrJ	r	J
r
JrJrJr  \\\      r\R                   " SS9 " S S5      5       r\R                   " SS9 " S S	5      5       rS
\S\\\   \\   4   4S jrS\S\4S jrS\S\4S jrS\\   S\S\\   4S jr  S4S\S\
\   S\S\4S jjrS\S\\   S\4S jrS\S\S\4S jrS\S\4S jrS\S\4S jrS\S\S\\
\      4S  jrS!\S"\S#\	\   4S$ jrS%\\   S\4S& jr S'\S\\   4S( jr!S)\S\\\"4   4S* jr#S!\S+\S\	\   4S, jr$\R                   " SS9 " S- S.5      5       r%S/\S\%4S0 jr& S5S1\S\S2\S\\   4S3 jjr'g)6z+Functions for parsing various file formats.    N)DictIterableListOptionalSequenceSetTupleT)frozenc                   ^    \ rS rSr% Sr\\   \S'   \\S'   \\   \S'   S r	S r
S\4S	 jrS
rg)Msa   z%Class representing a parsed MSA file.	sequencesdeletion_matrixdescriptionsc           
      0   [        U R                  5      [        U R                  5      s=:X  a  [        U R                  5      :X  dP  O  [	        S[        U R                  5       S[        U R                  5       S[        U R                  5       S35      eg )Nz5All fields for an MSA must have the same length. Got z sequences, z! rows in the deletion matrix and z descriptions.)lenr   r   r   
ValueErrorselfs    m/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/science/unifold/msa/parsers.py__post_init__Msa.__post_init__!   s    DNN#s4+?+?'@ #C!!E# #4>>*+<t++,--Nt(()*.:; ;#    c                 ,    [        U R                  5      $ N)r   r   r   s    r   __len__Msa.__len__*   s    4>>""r   max_seqsc                 f    [        U R                  S U U R                  S U U R                  S U S9$ )Nr   r   r   )r   r   r   r   )r   r   s     r   truncateMsa.truncate-   s=    nnYh/ 00(;**9H5
 	
r    N)__name__
__module____qualname____firstlineno____doc__r   str__annotations__DeletionMatrixr   r   intr!   __static_attributes__r#   r   r   r   r      s5    /}##3-;#
 
r   r   c                   |    \ rS rSr% Sr\\S'   \\S'   \\S'   \\	   \S'   \\S'   \\S'   \
\   \S	'   \
\   \S
'   Srg)TemplateHit5   z"Class representing a template hit.indexnamealigned_cols	sum_probsqueryhit_sequenceindices_queryindices_hitr#   N)r$   r%   r&   r'   r(   r,   r*   r)   r   floatr   r-   r#   r   r   r/   r/   5   s>    ,J
IJ9cr   r/   fasta_stringreturnc                    / n/ nSnU R                  5        Hj  nUR                  5       nUR                  S5      (       a,  US-  nUR                  USS 5        UR                  S5        MU  U(       d  M^  X==   U-  ss'   Ml     X4$ )a=  Parses FASTA string and returns list of strings with amino-acid sequences.

Arguments:
    fasta_string: The string contents of a FASTA file.

Returns:
    A tuple of two lists:
    * A list of sequences.
    * A list of sequence descriptions taken from the comment lines. In the
        same order as the sequences.
>   N )
splitlinesstrip
startswithappend)r:   r   r   r1   lines        r   parse_fastarF   C   s     ILE'')zz|??3QJEQR)R D  * ""r   stockholm_stringc                 ,   [         R                  " 5       nU R                  5        HY  nUR                  5       nU(       a  UR	                  S5      (       a  M2  UR                  5       u  p4X1;  a  SX'   X==   U-  ss'   M[     / n/ nSn/ n[        UR                  5       5       H  u  pU	S:X  a)  Un[        U5       V
Vs/ s H  u  pUS:w  d  M  U
PM     nn
nSR                  U Vs/ s H  oU   PM	     sn5      nUR                  U5        / nSn[        XG5       H4  u  nnUS:w  d  US:w  d  M  US:X  a  US-  nM!  UR                  U5        SnM6     UR                  U5        M     [        UU[        UR                  5       5      S9$ s  snn
f s  snf )a  Parses sequences and deletion matrix from stockholm format alignment.

Args:
    stockholm_string: The string contents of a stockholm file. The first
        sequence in the file should be the query sequence.

Returns:
    A tuple of:
        * A list of sequences that have been aligned to the query. These
            might contain duplicates.
        * The deletion matrix for the alignment as a list of lists. The element
            at `deletion_matrix[i][j]` is the number of residues deleted from
            the aligned sequence i at residue position j.
        * The names of the targets matched, including the jackhmmer subsequence
            suffix.
#//r@   r   -r?   r    )collectionsOrderedDictrA   rB   rC   split	enumeratevaluesjoinrD   zipr   listkeys)rG   name_to_sequencerE   r2   sequencemsar   r5   keep_columns	seq_indexirescaligned_sequencedeletion_vecdeletion_countseq_res	query_ress                     r   parse_stockholmrc   `   s   " #..0 ++-zz|t{33'%'"(* . COEL()9)@)@)BC	>E,5e,<K,<&!s
A,<LK 77#FAQK#FG

#$ "%h"6GY#~c!1#"a'N ''7%&N #7 	|,+  D. '*//12 ' L $Gs   ?FF'F

a3m_stringc                 v   [        U 5      u  p/ nU HP  n/ nSnU H2  nUR                  5       (       a  US-  nM  UR                  U5        SnM4     UR                  U5        MR     [        R	                  SS[
        R                  5      nU V	s/ s H  oR                  U5      PM     n
n	[        U
UUS9$ s  sn	f )ag  Parses sequences and deletion matrix from a3m format alignment.

Args:
    a3m_string: The string contents of a a3m file. The first sequence in the
        file should be the query sequence.

Returns:
    A tuple of:
        * A list of sequences that have been aligned to the query. These
            might contain duplicates.
        * The deletion matrix for the alignment as a list of lists. The element
            at `deletion_matrix[i][j]` is the number of residues deleted from
            the aligned sequence i at residue position j.
        * A list of descriptions, one per sequence, from the a3m file.
r   r?   r@   r    )	rF   islowerrD   r)   	maketransstringascii_lowercase	translater   )rd   r   r   r   msa_sequencer_   r`   jdeletion_tablesaligned_sequencess              r   	parse_a3mrp      s      **5IO!Ayy{{!###N3!"  	|, " ]]2r6+A+ABN>GHi^4iH#'!  Is   B6query_non_gapssto_seqc              #      #    [        X5       H,  u  p#U(       a  Uv   M  US:w  d  M  UR                  5       v   M.     g 7f)NrL   )rS   lower)rq   rr   is_query_res_non_gapsequence_ress       r   _convert_sto_seq_to_a3mrw      s9     .1..J*S $$&&	 /Ks   "??stockholm_formatmax_sequencesremove_first_row_gapsc                   ^^ 0 m0 nSnU R                  5        Hx  nU=(       a    [        U5      U:  nUR                  5       (       d  M1  UR                  S5      (       a  MI  UR	                  SS9u  pgXc;  a  U(       a  Mh  SX6'   X6==   U-  ss'   Mz     U R                  5        Ho  nUSS S:X  d  M  UR	                  S	S9nUSS	 u  pi[        U5      S:X  a  US	   OSn
U	S
:w  a  MB  U(       a  Xc;  a  MP  U
TU'   [        T5      [        U5      :X  d  Mo    O   0 mU(       a6  [        [        UR                  5       5      5      nU Vs/ s H  oS:g  PM	     nnUR                  5        H>  u  pnUR                  SS5      nU(       a  SR                  [        WU5      5      nUTU'   M@     UU4S jT 5       nSR                  U5      S-   $ s  snf )z3Converts MSA in Stockholm format to the A3M format.FrI   r?   maxsplitr@   N   #=GS   DErL   .c              3   `   >#    U  H#  nS U STR                  US5       STU    3v   M%     g7f)r>    r@   
N)get).0ka3m_sequencesr   s     r   	<genexpr>+convert_stockholm_to_a3m.<locals>.<genexpr>  s?      ,* s!L,,Q34B}Q7G6HI*s   +.r   )rA   r   rB   rC   rO   nextiterrQ   itemsreplacerR   rw   )rx   ry   rz   r   reached_max_sequencesrE   seqnamealigned_seqcolumnsfeaturevaluequery_sequencer\   rq   sto_sequenceout_sequencefasta_chunksr   r   s                    @@r   convert_stockholm_to_a3mr      s    LI! ++- - !(#3'3(::<< < < $(::q:#9 G'(%'	"+- . !++-8v jj!j,G&q|G"%g,!"3GAJE$$)A$)L!< C	N2 .  Md9#3#3#5670>?*?!*!2#++C4 77'EGL!-g "3,*,L99\"T)) @s   G
rE   seqnamesc                 P   U R                  5       (       d  gU R                  5       S:X  a  gU R                  S5      (       a  gU R                  S5      (       a  gU SS S:X  a  U R                  SS	9u  p#nX1;   $ U R                  S
5      (       a  gU R                  S5      S   nX1;   $ )z'Function to decide which lines to keep.TrK   z# STOCKHOLM#=GC RFNr~   r      r|   rJ   Fr   r   )rB   rC   rO   	partition)rE   r   _r   s       r   
_keep_liner     s    ::<<zz|t}%%y!!BQx6

A
.A""			..%a(""r   stockholm_msac                    [        5       n/ nU R                  5        Hh  nUR                  5       (       d  M  UR                  S5      (       a  M2  UR	                  S5      S   nUR                  U5        [        U5      U:  d  Mh    O   U R                  5        H&  n[        XB5      (       d  M  UR                  U5        M(     SR                  U5      S-   $ )z<Truncates a stockholm file to a maximum number of sequences.rI   r   r   r   )
setrA   rB   rC   r   addr   r   rD   rR   )r   ry   r   filtered_linesrE   r   s         r   truncate_stockholm_msar     s    uHN((*::<< < < nnS)!,GLL!8}- + ((*d%%!!$' + 99^$t++r   c                 r  ^ 0 m0 n[        U R                  5       5       GHh  u  p#UR                  S5      (       Ga  UnUnUR                  S5      u    pg/ n[	        [        U5      5       H]  n	UR                  5        H5  u  pjU
R                  S5      u  pnX   S:w  d  M#  UR                  S5          MJ     UR                  S5        M_     UUU'   [        U5      (       d  U H  nSTU'   M
     O\UR                  5        HH  u  pU
R                  S5      u  pnSR                  [        R                  " X5      5      nU SU 3TU'   MJ     0 nGM1  UR                  5       (       a  UR                  S5      (       d  X1U'   GMc  UTU'   GMk     SR                  U4S	 j[	        [        T5      5       5       5      $ )
z9Removes empty columns (dashes-only) from a Stockholm MSA.r   r   rL   TFr@   rI   r   c              3   .   >#    U  H
  nTU   v   M     g 7fr   r#   )r   r[   processed_liness     r   r   :remove_empty_columns_from_stockholm_msa.<locals>.<genexpr>Y  s     N2MQoa(2Ms   )rP   rA   rC   
rpartitionranger   r   rD   anyrR   	itertoolscompressrB   )r   unprocessed_linesr[   rE   reference_annotation_ireference_annotation_liner   first_alignmentmaskrl   unprocessed_lineprefix	alignment
line_indexmasked_alignmentr   s                  @r   'remove_empty_columns_from_stockholm_msar   /  s   O]5578??9%%%&"(,%$(OOC$8!AqD3/0+<+B+B+D'A+;+F+Fs+K(Fy |s*D)	 ,E KK& 1 +D &(   #4J24OJ/ #4 5F4K4K4M0J+;+F+Fs+K(Fy')ww!**9;(=$ *02B1C&D $"$	 5N !#ZZ\\$//+">">#'a !%OAK 9L 99N%O8L2MNOOr   c                 *   [         R                  " [        5      nU R                  5        H`  nUR	                  5       (       d  M  UR                  S5      (       a  M2  UR	                  5       nUR                  5       u  p4X==   U-  ss'   Mb     [        5       n[        5       n[        [        UR                  5       5      5      nU Vs/ s H  oS:g  PM	     n	nUR                  5        HS  u  p4SR                  [        R                  " XI5      5      n
X;   a  M1  UR                  U
5        UR                  U5        MU     / nU R                  5        H&  n[!        X&5      (       d  M  UR#                  U5        M(     SR                  U5      S-   $ s  snf )z;Remove duplicate sequences (ignoring insertions wrt query).rI   rL   r@   r   )rM   defaultdictr)   rA   rB   rC   rO   r   r   r   rQ   r   rR   r   r   r   r   rD   )r   sequence_dictrE   r   r   seen_sequencesr   query_alignr]   r   r   r   s               r   deduplicate_stockholm_msar   \  sI   ++C0M ((* ::<< < <::<D!%G"i/" + UNuHtM00234K)*kHkD*+113779#5#5i#FG-/0LL! 4 N((*d%%!!$' + 99^$t++ +s   	Fregex_patternc                 p    [         R                  " X5      nUc  [        SU 35      eUR                  5       $ )NzCould not parse query line )rematchRuntimeErrorgroups)r   rE   r   s      r   _get_hhr_line_regex_groupsr     s4    HH])E}8?@@<<>r   rW   start_indexindices_listc                 x    UnU  H2  nUS:X  a  UR                  S5        M  UR                  U5        US-  nM4     g)zUComputes the relative indices for each residue with respect to the original sequence.rL   r=   r?   N)rD   )rW   r   r   countersymbols        r    _update_hhr_residue_indices_listr     s?     GS=#(qLG r   detailed_linesc                    [        U S   R                  5       S   5      nU S   SS nSn[        R                  " X0S   5      nUc  [	        SU < SU S   < S	35      eUR                  5        Vs/ s H  n[        U5      PM     snu      pg  phnS
n	S
n
/ n/ nSnU SS  GH  nUR                  S5      (       a  UR                  S5      (       d  UR                  S5      (       d  UR                  S5      (       d  Sn[        XSS 5      n[        US   5      S-
  nUS   n[        US   5      n[        U Vs/ s H  oUS:X  d  M
  UPM     sn5      nUU-
  U-   nU[        U5      :X  d   eU	U-  n	[        UUU5        M  UR                  S5      (       d  M  UR                  S5      (       a  GM  UR                  S5      (       a  GM*  UR                  S5      (       a  GMC  Sn[        XSS 5      n[        US   5      S-
  nUS   nU[        U5      :X  d   eU
U-  n
[        UUU5        GM     [        UU[        U5      UU	U
UUS9$ s  snf s  snf )a  Parses the detailed HMM HMM comparison section for a single Hit.

This works on .hhr files generated from both HHBlits and HHSearch.

Args:
    detailed_lines: A list of lines from a single comparison section between 2
        sequences (which each have their own HMM's)

Returns:
    A dictionary with the information from that detailed comparison section

Raises:
    RuntimeError: If a certain line cannot be processed
r   r=   r?   NzProbab=(.*)[	 ]*E-value=(.*)[	 ]*Score=(.*)[	 ]*Aligned_cols=(.*)[	 ]*Identities=(.*)%[	 ]*Similarity=(.*)[	 ]*Sum_probs=(.*)[	 ]*Template_Neff=(.*)r   zCould not parse section: z. Expected this: 
z to contain summary.r@   r   zQ z	Q ss_dsspz	Q ss_predzQ Consensusz1[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)   rL   zT z	T ss_dsspz	T ss_predzT Consensusz/[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)r1   r2   r3   r4   r5   r6   r7   r8   )r,   rO   r   r   r   r   r9   rC   r   r   r   r/   )r   number_of_hitname_hitpatternr   xr   r3   r4   r5   r6   r7   r8   length_blockrE   pattr   startdelta_queryendnum_insertionsdelta_hit_sequences                         r   _parse_hhr_hitr     s|     q)//1"56Ma $H	  HHWQ/0E}~a023 	3 #\\^
,^58^
,Q1Aq ELMKLqr"OOD!!$//+*F*F4466 HD/23i@F q	NQ&E )KfQi.C [!E[H![!EFN;7L3{#3333 [ E,[%O__T""OOK00 OOK88 OOM:: J3Drs)DF1I*%+AY"#s+='>>>>  2201CU1<>Q #V &!#	 	o -8 "Fs   /I6	I
I

hhr_stringc           
      j   U R                  5       n[        U5       VVs/ s H  u  p#UR                  S5      (       d  M  UPM!     nnn/ nU(       a\  UR                  [	        U5      5        [        [	        U5      S-
  5       H'  nUR                  [        XU   XBS-       5      5        M)     U$ s  snnf )z)Parses the content of an entire HHR file.zNo r?   )rA   rP   rC   rD   r   r   r   )r   linesr[   rE   block_startshitss         r   	parse_hhrr     s    !!#E #5))gaT__U-C)   DCJ's<(1,-AKKu!_\a%5HIJL . Ks
   B/B/tbloutc                     SS0nU R                  5        Vs/ s H  o"S   S:w  d  M  UPM     nnU H*  nUR                  5       nUS   nUS   n[        U5      X'   M,     U$ s  snf )zDParse target to e-value mapping parsed from Jackhmmer tblout string.r5   r   rJ   r~   )rA   rO   r9   )r   e_valuesrE   r   fieldse_valuetarget_names          r   parse_e_values_from_tbloutr   	  st    |H$//1D1d!W^T1ED )Qi %g	 
 O Es
   A!A!r   c                     / nUnU  HN  nUS:X  a  UR                  S5        M  UR                  5       (       a  US-  nM8  UR                  U5        US-  nMP     U$ )zHReturns indices for non-gap/insert residues starting at the given index.rL   r=   r?   )rD   rf   )rW   r   indicesr   r   s        r   _get_indicesr     s]    GGS=NN2^^qLG NN7#qLG  Nr   c                   R    \ rS rSr% \\S'   \\S'   \\S'   \\S'   \\S'   \\S'   Srg	)
HitMetadatai*  pdb_idchainr   r   lengthtextr#   N)r$   r%   r&   r'   r)   r*   r,   r-   r#   r   r   r   r   *  s     KJJ	HK
Ir   r   descriptionc           
          [         R                  " SU R                  5       5      nU(       d  [        SU  S35      e[	        US   US   [        US   5      [        US   5      [        US   5      US	   S
9$ )z3Parses the hmmsearch A3M sequence description line.zF^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$zCould not parse description: "z".r?   r   r   r~         )r   r   r   r   r   r   )r   r   rB   r   r   r,   )r   r   s     r   _parse_hmmsearch_descriptionr   4  s~     HHQE
 9+bIJJQxAh%(maM58}1X r   r   
skip_firstc                    [        [        [        U5      6 5      nU(       a  USS n[        U SS9n/ n[	        USS9 H  u  nu  pxSU;  a  M  [        U5      n	[        U V
s/ s H  oR                  5       =(       a    U
S:g  PM      sn
5      n[        XyR                  S-
  S9n[        UU	R                   SU	R                   3USU UR                  5       UUS9nUR                  U5        M     U$ s  sn
f )	a
  Parses an a3m string produced by hmmsearch.

Args:
    query_sequence: The query sequence.
    a3m_string: The a3m string produced by hmmsearch.
    skip_first: Whether to skip the first sequence in the a3m string.

Returns:
    A sequence of `TemplateHit` results.
r?   Nr   )r   zmol:proteinrL   r   r   )rT   rS   rF   r   rP   r   sumisupperr   r/   r   r   upperrD   )r   rd   r   
parsed_a3mr7   r   r[   r6   hit_descriptionmetadatarr3   r8   hits                 r   parse_hmmsearch_a3mr  J  s     c;z234J^
 q9MD.7
!.L**L//@lKlIIK4AH4lKL"<~~7IJOO$Ahnn%56% %++-'#	
 	C% /M( K Ls   !%C1
)NT)T)(r(   rM   dataclassesr   r   rh   typingr   r   r   r   r   r   r	   r,   r+   	dataclassr   r/   r)   rF   rc   rp   boolrw   r   r   r   r   r   r   r   r   r   r9   r   r   r   r   r  r#   r   r   <module>r	     s   2    	  G G G(3-( d#
 
 $
6 d#
 
 $
#c #eHSM8C=,H&I #:;c ;c ;|$# $# $N'HTN '%('-5c]' $("&6*6*C=6*  6* 		6*r#S #CH # #(,# ,c ,c ,(*P3 *P3 *PZ ,S  ,S  ,Fc %(-5hsm-D	s 	 	379	^8C= ^[ ^B# (;"7 *s tCJ/? 3 s tCy $ d#  $c k 0 ,0) )$')$()4<[4I)r   