
    9iF                        S r SSKrSSKrSSKrSSKrSSKJrJrJrJ	r	J
r
  SSKJr  SSKJr  SSKJr  SSKJr  \r\\\4   r\R,                  R,                  r\r\\\	\   4   r\R4                  " SS	9 " S
 S5      5       r\R4                  " SS	9 " S S5      5       r\R4                  " SS	9 " S S5      5       r\R4                  " SS	9 " S S5      5       r\R4                  " SS	9 " S S5      5       r\R4                  " SS	9 " S S5      5       r  " S S\!5      r"S\S\S\	\\\4      4S jr#S\S\S\S\\\\\4   4   4S jr$\RJ                  " SSS 9SS!.S"\S#\S$\&S\ 4S% jj5       r'\RJ                  " SSS 9SS!.S"\S#\S$\&S\ 4S& jj5       r(S'\S\4S( jr)S)r*S\S\4S* jr+S\S\4S+ jr,S\S\	\   4S, jr-S\\\4   S\\\	\   4   4S- jr.S.\S\&4S/ jr/g)0zParses the mmCIF file format.    N)AnyMappingOptionalSequenceTuple)logging)PDB)SCOPData)MMCIFParserT)frozenc                   *    \ rS rSr% \\S'   \\S'   Srg)Monomer"   idnum N__name__
__module____qualname____firstlineno__str__annotations__int__static_attributes__r       k/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/modelscope/models/science/unifold/msa/mmcif.pyr   r   "   s    G	Hr   r   c                   f    \ rS rSr% \\S'   \\S'   \\S'   \\S'   \\S'   \\S'   \\S'   \\S	'   S
rg)AtomSite*   residue_nameauthor_chain_idmmcif_chain_idauthor_seq_nummmcif_seq_numinsertion_codehetatm_atom	model_numr   Nr   r   r   r   r   r   *   s1    Nr   r   c                   4    \ rS rSr% \\S'   \\S'   \\S'   Srg)ResiduePosition7   chain_idresidue_numberr&   r   Nr   r   r   r   r*   r*   7   s    Mr   r*   c                   D    \ rS rSr% \\   \S'   \\S'   \\S'   \\S'   Sr	g)ResidueAtPosition>   positionname
is_missinghetflagr   N)
r   r   r   r   r   r*   r   r   boolr   r   r   r   r/   r/   >   s    ''
ILr   r/   c                       \ rS rSr% Sr\\S'   \\S'   \\S'   \	\
\4   \S'   \	\
\	\\4   4   \S'   \\S'   \	\
\
4   \S	'   \	\
\4   \S
'   Srg)MmcifObjectF   aD  Representation of a parsed mmCIF file.

Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
        files being processed.
    header: Biopython header.
    structure: Biopython structure.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
        {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
        SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,  1: ResidueAtPosition, ...}}
    raw_string: The raw string used to construct the MmcifObject.
file_idheader	structurechain_to_seqresseqres_to_structure
raw_stringmmcif_to_author_chain_idvalid_chainsr   N)r   r   r   r   __doc__r   r   	PdbHeaderPdbStructurer   ChainIdSeqResr   r/   r   r   r   r   r   r7   r7   F   sh     LWf_-- '#7H2H*I!IJJO%gw&677'3,''r   r7   c                   H    \ rS rSr% Sr\\   \S'   \\	\
\
4   \4   \S'   Srg)ParsingResult`   zReturned by the parse function.

Contains:
    mmcif_object: A MmcifObject, may be None if no chain could be successfully
        parsed.
    errors: A dict mapping (file_id, chain_id) to any exception generated.
mmcif_objecterrorsr   N)r   r   r   r   rA   r   r7   r   r   r   r   r   r   r   r   r   rG   rG   `   s,     ;''E#s(OS())r   rG   c                       \ rS rSrSrSrg)
ParseErrorn   z;An error indicating that an mmCIF file could not be parsed.r   N)r   r   r   r   rA   r   r   r   r   rL   rL   n   s    Er   rL   prefixparsed_inforeturnc           
         / n/ nUR                  5        H?  u  pEUR                  U 5      (       d  M  UR                  U5        UR                  U5        MA     [        U Vs/ s H  n[	        U5      [	        US   5      :H  PM     sn5      (       d
   SU-  5       e[        U6  Vs/ s H  n[        [        X&5      5      PM     sn$ s  snf s  snf )aB  Extracts loop associated with a prefix from mmCIF data as a list.

Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

Args:
    prefix: Prefix shared by each of the data items in the loop.
        e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
        _entity_poly_seq.mon_id. Should include the trailing period.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
        parser.

Returns:
    Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
r   z2mmCIF error: Not all loops are the same length: %s)items
startswithappendalllenzipdict)rN   rO   colsdatakeyvaluexss          r   mmcif_loop_to_listr^   r   s    " DD!'')
>>&!!KKKK *
 *.*.BB3tAw<$   F	=	DF  +.t*5*BDT*55	 6s   !$C%Cindexc                 N    [        X5      nU Vs0 s H  oDU   U_M
     sn$ s  snf )a;  Extracts loop associated with a prefix from mmCIF data as a dictionary.

Args:
    prefix: Prefix shared by each of the data items in the loop.
        e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
        _entity_poly_seq.mon_id. Should include the trailing period.
    index: Which item of loop data should serve as the key.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
        parser.

Returns:
    Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
    indexed by the index column.
)r^   )rN   r_   rO   entriesentrys        r   mmcif_loop_to_dictrc      s-    & !5G-45WE%L%W555s   "   F)typed)catch_all_errorsr9   mmcif_stringrf   c                     0 n [        SS9nUR                  nUR                  5        H!  u  pg[        U[        5      (       a  M  U/XV'   M#     [        U5      n[        US9n	U	(       d  [        SU S4S05      $ 0 n
[        U5       H.  nUR                  S:w  a  M  UR                  U
UR                  '   M0     [        U USSSUU
U	S9n[        XS	9$ ! [         a"  nXU S4'   U(       d  e [        SUS	9s SnA$ SnAff = f)
  Entry point, parses an mmcif_string.

Args:
    file_id: A string identifier for this file. Should be unique within the
        collection of files being processed.
    mmcif_string: Contents of an mmCIF file.
    catch_all_errors: If True, all exceptions are caught and error messages are
        returned as part of the ParsingResult. If False exceptions will be allowed
        to propagate.

Returns:
    A ParsingResult.
TQUIETrO   N %No protein chains found in this file.1r9   r:   r;   r<   r=   r>   r?   r@   rI   rJ   )r   _mmcif_dictrR   
isinstancelist_get_header_get_protein_chainsrG   _get_atom_site_listr(   r"   r#   r7   	Exception)r9   rg   rf   rJ   parserrO   r[   r\   r:   r@   r?   atomrI   es                 r   
fast_parser|      s.   $ F-?4( (( &++-JCeT**$)7  . [) +{C }&MNP P $& '4D~~$'+';'; %##%	 5 #  $"%=%	
 ,FF ? !}$v>>	?s*   =C 3C 9AC 
C=C82C=8C=c                     0 n [         R                  " SS9n[        R                  " U5      nUR	                  SU5      n[        U5      nUR                  nUR                  5        H!  u  p[        U
[        5      (       a  M  U
/X'   M#     [        U5      n[        US9nU(       d  [        SU S4S05      $ UR                  5        VVVs0 s H+  u  pU[        U Vs/ s H  oR                  PM     sn5      _M-     nnnn0 n0 n[        U5       GH.  nUR                   S:w  a  M  UR"                  UUR$                  '   UR$                  U;   d  MA  SnUR&                  S	:X  a"  UR(                  S
;   a  SnOSUR(                  -   nUR*                  n[-        UR*                  5      (       d  Sn[/        UR"                  [1        UR2                  5      US9n[1        UR4                  5      UUR$                     -
  nUR7                  UR"                  0 5      n[9        UUR(                  SUS9UU'   UUUR"                  '   GM1     UR                  5        HF  u  nnUU   nUU   n[;        U5       H'  u  nnUU;  d  M  [9        SUR<                  SSS9UU'   M)     MH     0 nUR                  5        Hx  u  nnUU   n/ nU HO  n[>        R@                  R7                  UR<                  S5      nURC                  [E        U5      S:X  a  UOS5        MQ     SRG                  U5      nUUU'   Mz     [I        U UUUUUUUS9n[        UUS9$ s  snf s  snnnf ! [J         a"  n U X0S4'   U(       d  e [        SUS9s Sn A $ Sn A ff = f)ri   Trj   rm   rl   Nrn   ro    HETATM)HOHWATWH_)r,   r-   r&   F)r1   r2   r3   r4   X   rp   rq   )&r	   r   ioStringIOget_structure_get_first_modelrr   rR   rs   rt   ru   rv   rG   minr   rw   r(   r"   r#   r'   r!   r&   _is_setr*   r   r$   r%   getr/   	enumerater   r
   protein_letters_3to1rT   rV   joinr7   rx   )!r9   rg   rf   rJ   ry   handlefull_structurefirst_model_structurerO   r[   r\   r:   r@   r,   seqmonomerseq_start_numr?   seq_to_structure_mappingsrz   r4   r&   r1   seq_idxcurrentseq_infoauthor_chaincurrent_mappingidxauthor_chain_to_sequencecoderI   r{   s!                                    r   parser      s   $ Fm?t,\*--b&9 0 @ (( &++-JCeT**$)7  . [) +{C }&MNP P ".!3!3!5
!5 cc:c7;;c:;;!5 	 
 $& $&!'4D~~$ (,';'; %##% ""l2##x/ ((N:"%"&):):":!%!4!4t2233%(N*!11#&t':':#;#1
 &&(*78K8K*LM377((".#4%**$#	$  CJ)$*>*>?I 5N #/"4"4"6Hh3H=L7EO )( 3Wo-+<!%$ZZ#' #	,%OC( !4 #7 $& "."4"4"6Hh3H=LC#4488SI

3t9>4s; $ ''#,C58$\2 #7 #+4 9"%=%	
 ,vFFc ;
f  ? !}$v>>	?s]   A;M 3M 7M M
M2M
=AM D'M <CM M

M 
M=M82M=8M=r;   c                 4    [        U R                  5       5      $ )z1Returns the first model in a Biopython structure.)next
get_models)r;   s    r   r   r   n  s    	$$&''r      c                 "    U S   n[        U5      $ )z!Returns the oldest revision date.*_pdbx_audit_revision_history.revision_date)r   )rO   revision_datess     r   get_release_dater   v  s     !MNN~r   c                    0 n[        SU 5      nSR                  U Vs/ s H  o3S   R                  5       PM     sn5      US'   SU ;   a  [        U 5      US'   O[        R
                  " SU S   5        S	US
'   S H   nX@;   d  M
   X   S   n[        U5      US
'   M"     U$ s  snf ! [         a    [        R                  " SX   5         MQ  f = f)zFReturns a basic header containing method, release date and resolution.z_exptl.,z_exptl.methodstructure_methodr   release_datez$Could not determine release_date: %sz	_entry.idg        
resolution)z_refine.ls_d_res_highz _em_3d_reconstruction.resolutionz_reflns.d_resolution_highr   zInvalid resolution format: %s)	r^   r   lowerr   r   warningfloat
ValueErrordebug)rO   r:   experiments
experimentres_keyraw_resolutions         r   ru   ru   |  s    F$Y<K!$?JK{O	$	*	*	,{K"MF
 4{B!1+!>~>#K0	2  F<
 !4!,!5a!8',^'<|$ M1 	L(  4=)244s   B B%%#CCc                     [        U S   U S   U S   U S   U S   U S   U S   U S   5       Vs/ s H  n[        U6 PM     sn$ s  snf )	zGReturns list of atom sites; contains data not present in the structure.z_atom_site.label_comp_idz_atom_site.auth_asym_idz_atom_site.label_asym_idz_atom_site.auth_seq_idz_atom_site.label_seq_idz_atom_site.pdbx_PDB_ins_codez_atom_site.group_PDBz_atom_site.pdbx_PDB_model_num)rW   r   )rO   sites     r   rw   rw     s{     %(231223011267./78	%
	%
D$ 	%
  s   Ac           
      <   [        SU 5      n[        R                  " [        5      nU H0  nX#S      R	                  [        US   [        US   5      S95        M2     [        SSU 5      n[        SU 5      n[        R                  " [        5      nU H   nUS	   nUS
   n	Xi   R	                  U5        M"     0 n
UR                  5        HK  u  pXi   n[        U Vs/ s H  nSXMR                     S   ;   PM     sn5      (       d  M>  U H  nXU'   M	     MM     U
$ s  snf )zExtracts polymer information for protein chains only.

Args:
    parsed_info: _mmcif_dict produced by the Biopython parser.

Returns:
    A dict mapping mmcif chain id to a list of Monomers.
z_entity_poly_seq.z_entity_poly_seq.entity_idz_entity_poly_seq.mon_idz_entity_poly_seq.num)r   r   z_chem_comp.z_chem_comp.idz_struct_asym.z_struct_asym.idz_struct_asym.entity_idpeptidez_chem_comp.type)r^   collectionsdefaultdictrt   rT   r   r   rc   rR   anyr   )rO   entity_poly_seqspolymersentity_poly_seq
chem_compsstruct_asymsentity_to_mmcif_chainsstruct_asymr,   	entity_idr@   r   	chain_idsr   s                 r   rv   rv     sE    **={K&&t,H+!=>?FF"#<=(>?@	 , $M?$/1J
 &o{CL(44T:#01 89	)00: $ L'~~/	*5	 '
'G Z

34EFF'
   &)1X& &  0 
s   D
rZ   c                     U S;  $ )zFReturns False if data is a special mmCIF character indicating 'unset'.).?r   )rZ   s    r   r   r     s    z!!r   )0rA   r   dataclasses	functoolsr   typingr   r   r   r   r   abslr   Bior	   Bio.Datar
   Bio.PDB.MMCIFParserr   r   rD   rB   	StructurerC   rE   	MmCIFDict	dataclassr   r   r*   r/   r7   rG   rx   rL   r^   rc   	lru_cacher5   r|   r   r   -_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDEr   ru   rw   rv   r   r   r   r   <module>r      s   $    	 : :    + CH	}}&&	C#&'	 d#  $ d#  $ d#  $ d#  $ d#( ( $(2 d#
* 
* $
*F F6s 6$-62:738;L2M6>666 6 S'#s(##$	6. Ru% )-???? ?? "&?? 2??? &??D Ru% $(??? !? -:? &?D( ( (
 13 -)  Y 9 BY 8H3E  1 #!$ %1)0(7:K1K)L1h"# "$ "r   