Source code for parmed.formats.pdb

"""
This package contains classes responsible for reading and writing both PDB and
PDBx/mmCIF files.
"""
from __future__ import division, print_function, absolute_import

from collections import OrderedDict, namedtuple
from contextlib import closing
try:
    from string import ascii_letters
except ImportError:
    from string import letters as ascii_letters # Python 2
import io
import ftplib
import numpy as np
from ..exceptions import PDBError, PDBWarning
from ..formats.pdbx import PdbxReader, PdbxWriter, containers
from ..formats.registry import FileFormatType
from ..periodic_table import AtomicNum, Mass, Element, element_by_name
from ..residue import AminoAcidResidue, RNAResidue, DNAResidue, WATER_NAMES
from ..modeller import StandardBiomolecularResidues
from ..structure import Structure
from ..topologyobjects import Atom, ExtraPoint, Bond, Link
from ..symmetry import Symmetry
from ..utils.io import genopen
from ..utils.six import iteritems, string_types, add_metaclass, PY3
from ..utils.six.moves import range
import re
import warnings

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

_ascii_letters_set = set(ascii_letters)

def _compare_atoms(old_atom, new_atom, resname, resid, chain, segid, inscode):
    """
    Compares two atom instances, along with the residue name, number, and chain
    identifier, to determine if two atoms are actually the *same* atom, but
    simply different conformations

    Parameters
    ----------
    old_atom : :class:`Atom`
        The original atom that has been added to the structure already
    new_atom : :class:`Atom`
        The new atom that we want to see if it is the same as the old atom
    resname : ``str``
        The name of the residue that the new atom would belong to
    resid : ``int``
        The number of the residue that the new atom would belong to
    chain : ``str``
        The chain identifier that the new atom would belong to
    segid : ``str``
        The segment identifier for the molecule
    inscode : ``str``
        The insertion code for the residue

    Returns
    -------
    True if they are the same atom, False otherwise
    """
    if old_atom.name != new_atom.name: return False
    if old_atom.residue.name != resname: return False
    if old_atom.residue.number != resid: return False
    if old_atom.residue.chain != chain.strip(): return False
    if old_atom.residue.segid != segid.strip(): return False
    if old_atom.residue.insertion_code != inscode.strip(): return False
    return True

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

def _standardize_resname(resname):
    """ Looks up a standardized residue name for the given resname """
    try:
        return AminoAcidResidue.get(resname, abbronly=True).abbr, False
    except KeyError:
        try:
            return RNAResidue.get(resname).abbr, False
        except KeyError:
            try:
                return DNAResidue.get(resname).abbr, False
            except KeyError:
                if resname.strip() in WATER_NAMES:
                    return 'HOH', True
                else:
                    return resname, True

def _is_hetatm(resname):
    """ Sees if residue name is "standard", otherwise, we need to use HETATM to
    print atom records instead of ATOM
    """
    if len(resname) != 3:
        return not (RNAResidue.has(resname) or DNAResidue.has(resname))
    return not (AminoAcidResidue.has(resname) or RNAResidue.has(resname)
                or DNAResidue.has(resname))

def _number_truncated_to_n_digits(num, digits):
    """ Truncates the given number to the specified number of digits """
    if num < 0:
        return int(-(-num % eval('1e%d' % (digits-1))))
    return int(num % eval('1e%d' % digits))

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

[docs]@add_metaclass(FileFormatType) class PDBFile(object): """ Standard PDB file format parser and writer """ #=================================================== AtomLookupKey = namedtuple( 'AtomLookupKey', ('name', 'number', 'residue_name', 'residue_number', 'chain', 'insertion_code', 'segment_id', 'alternate_location') )
[docs] @staticmethod def id_format(filename): """ Identifies the file type as a PDB file Parameters ---------- filename : str or file object Name of the file to check format for Returns ------- is_fmt : bool True if it is a PDB file """ if isinstance(filename, string_types): own_handle = True fileobject = genopen(filename, 'r') elif hasattr(filename, 'read'): own_handle = False fileobject = filename try: for line in fileobject: if line[:6] in {'CRYST1', 'END ', 'END', 'HEADER', 'NUMMDL', 'MASTER', 'AUTHOR', 'CAVEAT', 'COMPND', 'EXPDTA', 'MDLTYP', 'KEYWDS', 'OBSLTE', 'SOURCE', 'SPLIT ', 'SPRSDE', 'TITLE ', 'ANISOU', 'CISPEP', 'CONECT', 'DBREF ', 'HELIX ', 'HET ', 'LINK ', 'MODRES', 'REVDAT', 'SEQADV', 'SHEET ', 'SSBOND', 'FORMUL', 'HETNAM', 'HETSYN', 'SEQRES', 'SITE ', 'ENDMDL', 'MODEL ', 'TER ', 'JRNL ', 'REMARK', 'TER', 'DBREF ', 'DBREF2', 'DBREF1', 'DBREF', 'HET', 'LINKR '}: continue # Hack to support reduce-added flags elif line[:6] == 'USER ' and line[6:9] == 'MOD': continue elif line[:5] in ('ORIGX', 'SCALE', 'MTRIX'): if line[5] not in '123': return False elif line[:6] in ('ATOM ', 'HETATM'): atnum, atname = line[6:11], line[12:16] resname, resid = line[17:20], line[22:26] x, y, z = line[30:38], line[38:46], line[46:54] occupancy, bfactor = line[54:60], line[60:66] elem = line[76:78] # Check for various attributes. This is the first atom, so # we can assume we haven't gotten into the regime of "weird" # yet, like hexadecimal atom/residue indices. if not atnum.strip().lstrip('-').isdigit(): return False if atname.strip().isdigit(): return False if not resname.strip(): return False if not resid.strip().lstrip('-').isdigit(): return False try: float(x), float(y), float(z) except ValueError: return False if occupancy.strip(): try: float(occupancy) except ValueError: return False if bfactor.strip(): try: float(bfactor) except ValueError: return False if elem.strip(): if any(x.isdigit() for x in elem): return False return True else: return False return False finally: if own_handle: fileobject.close()
#=================================================== def __init__(self, fileobj): # Open file object that we are parsing self.fileobj = fileobj self._atom_map_from_attributes = OrderedDict() self._atom_map_from_all_attributes = OrderedDict() self._current_model_number = 1 self._residue_indices_overflow = False self._atom_indices_overflow = False self.struct = Structure() self._symmetry_lines = [] self._link_lines = [] self._coordinates = [[]] self._model_atom_counts = [0] self._anisou_records = dict() self._atom_map_from_atom_number = dict() self._atom_map_to_parent = dict() self._model1_atoms_in_structure = set() self._model_open = True self._insertion_codes = set() # Only permitted for compliant PDB files self._last_atom = None self._last_residue = None # Some writers use additional fields to hold extra digits for the residue number if it goes # more than 4 digits self._residue_number_field_extended_by = 0 # Fallback if parser fails to process residue numbers self._last_residue_number_label = None #===================================================
[docs] @staticmethod def download(pdb_id, timeout=10, saveto=None): """ Goes to the wwPDB website and downloads the requested PDB, loading it as a :class:`Structure` instance Parameters ---------- pdb_id : str The 4-letter PDB ID to try and download from the RCSB PDB database timeout : float, optional The number of seconds to wait before raising a timeout error. Default is 10 seconds saveto : str, optional If provided, this will be treated as a file name to which the PDB file will be saved. If None (default), no PDB file will be written. This will be a verbatim copy of the downloaded PDB file, unlike the somewhat-stripped version you would get by using :meth:`Structure.write_pdb <parmed.structure.Structure.write_pdb>` Returns ------- struct : :class:`Structure <parmed.structure.Structure>` Structure instance populated by the requested PDB Raises ------ socket.timeout if the connection times out while trying to contact the FTP server IOError if there is a problem retrieving the requested PDB or writing a requested ``saveto`` file ImportError if the gzip module is not available TypeError if pdb_id is not a 4-character string """ import gzip if not isinstance(pdb_id, string_types) or len(pdb_id) != 4: raise ValueError('pdb_id must be the 4-letter PDB code') pdb_id = pdb_id.lower() ftp = ftplib.FTP('ftp.wwpdb.org', timeout=timeout) ftp.login() fileobj = io.BytesIO() try: ftp_loc = '/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent.gz' % (pdb_id[1:3], pdb_id) ftp.retrbinary('RETR %s' % ftp_loc , fileobj.write) except ftplib.all_errors as err: raise IOError('Could not retrieve PDB ID %s; %s' % (pdb_id, err)) finally: ftp.close() # Rewind, wrap it in a GzipFile and send it to parse fileobj.seek(0) if PY3: fileobj = io.TextIOWrapper(gzip.GzipFile(fileobj=fileobj, mode='r')) else: fileobj = gzip.GzipFile(fileobj=fileobj, mode='r') if saveto is not None: with closing(genopen(saveto, 'w')) as f: f.write(fileobj.read()) fileobj.seek(0) return PDBFile.parse(fileobj)
#=================================================== _relatere = re.compile(r'RELATED ID: *(\w+) *RELATED DB: *(\w+)', re.I) def _add_metadata_fields(self): self.struct.experimental = '' self.struct.journal = '' self.struct.authors = '' self.struct.keywords = '' self.struct.doi = '' self.struct.pmid = '' self.struct.journal_authors = '' self.struct.volume = '' self.struct.title = '' self.struct.year = None self.struct.resolution = None self.struct.related_entries = []
[docs] @classmethod def parse(cls, filename, skip_bonds=False): """ Read a PDB file and return a populated `Structure` class Parameters ---------- filename : str or file-like Name of the PDB file to read, or a file-like object that can iterate over the lines of a PDB. Compressed file names can be specified and are determined by file-name extension (e.g., file.pdb.gz, file.pdb.bz2) skip_bonds : bool, optional If True, skip trying to assign bonds. This can save substantial time when parsing large files with non-standard residue names. However, no bonds are assigned. This is OK if, for instance, the PDB file is being parsed simply for its coordinates. This may also reduce element assignment if element information is not present in the PDB file already. Default is False. Metadata -------- The PDB parser also adds metadata to the returned Structure object that may be present in the PDB file experimental : ``str`` EXPDTA record journal : ``str`` JRNL record authors : ``str`` AUTHOR records keywords : ``str`` KEYWDS records doi : ``str`` DOI from the JRNL record pmid : ``str`` PMID from the JRNL record journal_authors : ``str`` Author info from the JRNL record volume : ``str`` Volume of the published article from the JRNL record page : ``str`` Page of the published article from the JRNL record title : ``str`` TITL section of the JRNL record year : ``int`` Year that the article was published, from the JRNL record resolution : ``float`` The X-RAY resolution in Angstroms, or None if not found related_entries : ``list of (str, str)`` List of entries in other databases Returns ------- structure : :class:`Structure` The Structure object initialized with all of the information from the PDB file. No bonds or other topological features are added by default. """ if isinstance(filename, string_types): own_handle = True fileobj = genopen(filename, 'r') else: own_handle = False fileobj = filename inst = cls(fileobj) inst._add_metadata_fields() # Support hexadecimal numbering like that printed by VMD try: inst._parse_open_file(fileobj) finally: if own_handle: fileobj.close() # Assign bonds based on standard templates and simple distances if not skip_bonds: inst.struct.assign_bonds() inst._postprocess_metadata() inst.struct.unchange() try: inst.struct.coordinates = inst._coordinates except ValueError: raise PDBError('Coordinate shape mismatch. Probably caused by different atom counts in ' 'some of the PDB models') # Postprocess features of the PDB file we couldn't resolve until the end inst._process_structure_symmetry() inst._process_link_records() inst._assign_anisou_to_atoms() if inst._residue_indices_overflow: # If we have overflows in our residue numbers, wipe out all insertion codes. CHARMM # abuses the PDB format and usurps the insertion code (and later fields) to extend the # residue number. I don't feel bad discarding insertion code information when the PDB # format is abused like this. It's really never used for overflowing PDBs in my # experience anyway for residue in inst.struct.residues: residue.insertion_code = '' return inst.struct
def _parse_open_file(self, fileobj): method_dispatch = { 'REMARK': self._parse_remarks, 'ATOM': self._parse_atom_record, 'ANISOU': self._parse_anisou_record, 'TER': self._parse_ter_record, 'HETATM': self._parse_atom_record, 'EXPDTA': self._parse_experimental, 'AUTHOR': self._parse_author, 'JRNL': self._parse_journal, 'KEYWDS': self._parse_keywords, 'CONECT': self._parse_connect_record, 'LINK': self._parse_links, 'CRYST1': self._parse_cryst1, 'MODEL': self._new_model, 'ENDMDL': self._end_model, } for line in self.fileobj: rec = line[:6].strip() if rec in method_dispatch: method_dispatch[rec](line) def _parse_remarks(self, line): """ Parse the various remarks, which may contain various metadata """ if line[6:10] == ' 290': self._symmetry_lines.append(line) elif line[6:10] == ' 900': rematch = self._relatere.match(line[11:]) if rematch: self.struct.related_entries.append(rematch.groups()) elif line[6:10] == ' 2': # Resolution if (not line[11:22].strip() or self.struct.resolution is not None or line[23:38] == 'NOT APPLICABLE.'): return elif line[11:22] != 'RESOLUTION.': warnings.warn('Unrecognized RESOLUTION record in PDB file: %s' % line.strip()) else: try: self.struct.resolution = float(line[23:30]) except ValueError: warnings.warn('Trouble converting resolution (%s) to float' % line[23:30]) def _parse_links(self, line): self._link_lines.append(line) def _parse_keywords(self, line): self.struct.keywords += '%s,' % line[10:].strip() def _parse_ter_record(self, *args): if self._current_model_number == 1: self._last_atom.residue.ter = True def _parse_experimental(self, line): if self.struct.experimental: self.struct.experimental += ' %s' % line[6:].strip() else: self.struct.experimental += line[6:].strip() def _parse_author(self, line): if self.struct.authors: self.struct.authors += ' %s' % line[10:].strip() else: self.struct.authors = line[10:].strip() def _parse_journal(self, line): part = line[12:16] if part == 'AUTH': self.struct.journal_authors += line[19:].strip() elif part == 'TITL': self.struct.title += ' %s' % line[19:].strip() elif part == 'REF ': self.struct.journal += ' %s' % line[19:47].strip() if not line[16:18].strip(): self.struct.volume = line[51:55].strip() self.struct.page = line[56:61].strip() try: self.struct.year = int(line[62:66]) except ValueError: # Shouldn't happen, but don't throw a fit pass elif part == 'PMID': self.struct.pmid = line[19:].strip() elif part == 'DOI ': self.struct.doi = line[19:].strip() def _parse_cryst1(self, line): """ Parses unit cell information from the CRYST1 record """ a = float(line[6:15]) b = float(line[15:24]) c = float(line[24:33]) try: A = float(line[33:40]) B = float(line[40:47]) C = float(line[47:54]) except (IndexError, ValueError): A = B = C = 90.0 self.struct.box = [a, b, c, A, B, C] self.struct.space_group = line[55:66].strip() @staticmethod def _parse_atom_parts_1(line): return dict( number=line[6:11], name=line[12:16].strip(), alternate_location=line[16].strip(), residue_name=line[17:21].strip(), chain=line[21].strip(), residue_number=line[22:26].strip(), insertion_code=line[26].strip(), ) @classmethod def _parse_atom_parts(cls, line): """ Pulls out atom attributes from the line and packs it into a dict """ # segment_id is CHARMM-specific parts = cls._parse_atom_parts_1(line) parts.update(dict( x=float(line[30:38]), y=float(line[38:46]), z=float(line[46:54]), occupancy=line[54:60], bfactor=line[60:66], element=line[76:78], charge=line[78:80], segment_id=line[72:76].strip(), )) elem = '%-2s' % parts['element'] # Make sure the space is at the end elem = elem[1] + ' ' if elem[0] == ' ' else elem try: elem = (elem[0].upper() + elem[1].lower()).strip() atomic_number = AtomicNum[elem] except KeyError: elem = element_by_name(parts['name']) atomic_number = AtomicNum[elem] parts['atomic_number'] = atomic_number parts['mass'] = Mass[elem] parts['bfactor'] = try_convert(parts['bfactor'], float, 0.0) parts['occupancy'] = try_convert(parts['occupancy'], float, 0.0) parts['charge'] = try_convert(parts['charge'], float, 0.0) return parts def _determine_residue_number(self, residue_number, line): if self._last_atom is not None: last_residue_number = self._last_atom.residue.number else: last_residue_number = 0 extended_by = self._residue_number_field_extended_by if self._residue_indices_overflow: if extended_by: try: self._last_residue_number_label = residue_number + line[26:26+extended_by] residue_number = int(residue_number + line[26:26+extended_by]) return residue_number except ValueError: pass else: if residue_number == self._last_residue_number_label: return last_residue_number else: self._last_residue_number_label = residue_number return last_residue_number + 1 if last_residue_number >= 9999 and residue_number == '1000' + '0' * extended_by and \ line[26+extended_by] == '0': self._residue_indices_overflow = True self._residue_number_field_extended_by += 1 self._last_residue_number_label = '1000' + '0' * self._residue_number_field_extended_by return int(self._last_residue_number_label) elif last_residue_number == 9999 and residue_number != '9999': # This is the first time we notice residue indices overflowing self._residue_indices_overflow = True self._last_residue_number_label = residue_number return last_residue_number + 1 else: self._last_residue_number_label = residue_number return int(residue_number) def _parse_anisou_record(self, line): parts = self._parse_atom_parts_1(line) try: u11 = int(line[28:35]) u22 = int(line[35:42]) u33 = int(line[42:49]) u12 = int(line[49:56]) u13 = int(line[56:63]) u23 = int(line[63:70]) except ValueError: warnings.warn('Problem parsing anisotropic factors from ANISOU record from: ' + line, PDBWarning) else: anisou = np.array([u11/1e4, u22/1e4, u33/1e4, u12/1e4, u13/1e4, u23/1e4]) key = self._make_atom_key_from_parts(parts, all_parts=True) self._anisou_records[key] = (anisou, line) def _determine_atom_number(self, atom_number): if self._last_atom is not None: self._atom_indices_overflow = (self._atom_indices_overflow or self._last_atom.number >= 99999) if self._atom_indices_overflow and self._last_atom is not None: return self._last_atom.number + 1 try: return int(atom_number) except ValueError: return self._last_atom.number + 1 if self._last_atom is not None else 1 @classmethod def _make_atom_key_from_parts(cls, atom_parts, all_parts=False): """ If all_parts is False, only key from the values that determine alternate locations """ return cls.AtomLookupKey( name=atom_parts['name'], number=atom_parts['number'] if all_parts else None, residue_name=atom_parts['residue_name'], residue_number=atom_parts['residue_number'], chain=atom_parts['chain'], insertion_code=atom_parts['insertion_code'], segment_id=atom_parts.get('segment_id', ''), alternate_location=atom_parts['alternate_location'] if all_parts else None, ) def _parse_atom_record(self, line): """ Parses an atom record from a PDB file """ atom_parts = self._parse_atom_parts(line) residue_number = self._determine_residue_number(atom_parts['residue_number'], line) atom_number = self._determine_atom_number(atom_parts['number']) AtomClass = ExtraPoint if atom_parts['name'] in ('EP', 'LP') else Atom atom = AtomClass(atomic_number=atom_parts['atomic_number'], name=atom_parts['name'], charge=atom_parts['charge'], mass=atom_parts['mass'], occupancy=atom_parts['occupancy'], bfactor=atom_parts['bfactor'], altloc=atom_parts['alternate_location'], number=atom_number) atom.xx = atom_parts['x'] atom.xy = atom_parts['y'] atom.xz = atom_parts['z'] attribute_key = self._make_atom_key_from_parts(atom_parts) all_attribute_key = self._make_atom_key_from_parts(atom_parts, all_parts=True) current_atom = self._atom_map_from_attributes.get(attribute_key, None) if (current_atom is not None and atom_parts['alternate_location'] in _ascii_letters_set and not self._atom_indices_overflow and not self._residue_indices_overflow): if self._current_model_number == 1: current_atom.other_locations[atom_parts['alternate_location']] = atom self._atom_map_to_parent[atom] = current_atom if atom_number not in self._atom_map_from_atom_number: self._atom_map_from_atom_number[atom_number] = atom # altloc atoms should be reachable in the all_attributes map self._atom_map_from_all_attributes[all_attribute_key] = atom return elif not current_atom in self._model1_atoms_in_structure: # This is if the atom is not in the structure, but in the alternate locations. We # don't currently store coordinates for those atoms beyond the first frame. Note # that this should be incredibly rare, since alt-locs are common in static structure # determination, like X-Ray crystallography. Ensemble methods like NMR won't have # alternate locations in addition to multiple frames, so this is not expected to be # an impactful limitation return current_atom_index = self._model_atom_counts[-1] self._model_atom_counts[-1] += 1 if self._current_model_number == 1: self._atom_map_from_all_attributes[all_attribute_key] = atom self._atom_map_from_attributes[attribute_key] = atom self._atom_map_from_atom_number[atom_number] = atom self.struct.add_atom(atom, atom_parts['residue_name'], residue_number, atom_parts['chain'], atom_parts['insertion_code'], atom_parts['segment_id']) self._model1_atoms_in_structure.add(atom) self._last_atom = atom else: try: atom_from_first_model = self.struct.atoms[current_atom_index] except IndexError: raise PDBError('Atom number mismatch between models') if (atom_from_first_model.residue.name != atom_parts['residue_name'] or atom_from_first_model.name != atom_parts['name']): raise PDBError('Atom/residue name mismatch in different models in model %d [%s]!' % (self._current_model_number, line.strip())) if self._current_model_number == 1 or current_atom in self._model1_atoms_in_structure: self._coordinates[-1].extend([atom.xx, atom.xy, atom.xz]) def _new_model(self, line): if self._current_model_number == 1 and len(self.struct.atoms) == 0: return # MODEL 1 if self._model_open: warnings.warn('%s begun before last model ended. Assuming it is ending' % line.strip(), PDBWarning) self._end_model(line) self._coordinates.append([]) self._model_atom_counts.append(0) self._model_open = True self._current_model_number += 1 def _end_model(self, line): """ Ends the current model and validates the processed model is the same size as the models that came before this one """ if not self._model_open: raise PDBError('No model was begun before ENDMDL was encountered') self._model_open = False if len(self._atom_map_from_attributes) == 0: raise PDBError('No atoms found in model') if len(self._coordinates[-1]) != 3 * len(self._atom_map_from_attributes): raise PDBError('Coordinate mismatch in model %d' % self._current_model_number) def _parse_connect_record(self, line): """ Parses the CONECT records and creates the bond. According to the format spec, the first two atom indexes are required. The final 3 are optional. """ origin_index = try_convert(line[6:11], int) index_1 = try_convert(line[11:16], int) index_2 = try_convert(line[16:21], int) index_3 = try_convert(line[21:26], int) index_4 = try_convert(line[26:31], int) if origin_index is None or index_1 is None: warnings.warn('Bad CONECT record -- not enough atom indexes in line: %s' % line, PDBWarning) return origin_atom = self._atom_map_from_atom_number.get(origin_index, None) atom_1 = self._atom_map_from_atom_number.get(index_1, None) atom_2 = self._atom_map_from_atom_number.get(index_2, None) atom_3 = self._atom_map_from_atom_number.get(index_3, None) atom_4 = self._atom_map_from_atom_number.get(index_4, None) if origin_atom is None or atom_1 is None: warnings.warn('CONECT record - could not find atoms %d and/or %d to connect. Line: %s' % (origin_index, index_1, line), PDBWarning) return origin_atom = self._atom_map_to_parent.get(origin_atom, origin_atom) for partner in (atom_1, atom_2, atom_3, atom_4): partner = self._atom_map_to_parent.get(partner, partner) if partner is None or partner in origin_atom.bond_partners: continue self.struct.bonds.append(Bond(origin_atom, partner)) def _process_structure_symmetry(self): if self._symmetry_lines: data = [] for line in self._symmetry_lines: if line.strip().startswith('REMARK 290 SMTRY'): data.append(line.split()[4:]) tensor = np.asarray(data, dtype='f8') self.struct.symmetry = Symmetry(tensor) def _process_link_records(self): for line in self._link_lines: atom_1_parts = self._parse_atom_parts_1(line) atom_2_parts = self._parse_atom_parts_1(line[30:]) symop1 = line[59:65].strip() symop2 = line[66:72].strip() try: length = float(line[73:78]) except ValueError: warnings.warn('Malformed LINK line (bad distance): %s' % line, PDBWarning) continue key1 = self._make_atom_key_from_parts(atom_1_parts) key2 = self._make_atom_key_from_parts(atom_2_parts) try: a1 = self._atom_map_from_attributes[key1] a2 = self._atom_map_from_attributes[key2] except KeyError: warnings.warn('Could not find link atoms %s and %s' % (key1, key2), PDBWarning) else: self.struct.links.append(Link(a1, a2, length, symop1, symop2)) def _assign_anisou_to_atoms(self): """ Assigns the ANISOU tensors to the atoms they belong to """ for key, (anisou_tensor, line) in iteritems(self._anisou_records): try: self._atom_map_from_all_attributes[key].anisou = anisou_tensor except KeyError: warnings.warn('Could not find atom belonging to anisou tensor with key %s. ' 'Line: %s' % (key, line), PDBWarning) def _postprocess_metadata(self): self.struct.keywords = [s.strip() for s in self.struct.keywords.split(',') if s.strip()] self.struct.journal = self.struct.journal.strip() self.struct.title = self.struct.title.strip()
[docs] @staticmethod def write(struct, dest, renumber=True, coordinates=None, altlocs='all', write_anisou=False, charmm=False, use_hetatoms=True, standard_resnames=False, increase_tercount=True, write_links=False): """ Write a PDB file from a Structure instance Parameters ---------- struct : :class:`Structure` The structure from which to write the PDB file dest : str or file-like Either a file name or a file-like object containing a `write` method to which to write the PDB file. If it is a filename that ends with .gz or .bz2, a compressed version will be written using either gzip or bzip2, respectively. renumber : bool, optional, default True If True, renumber the atoms and residues sequentially as they are stored in the structure. If False, use the original numbering if it was assigned previously. coordinates : array-like of float, optional If provided, these coordinates will be written to the PDB file instead of the coordinates stored in the structure. These coordinates should line up with the atom order in the structure (not necessarily the order of the "original" PDB file if they differ) altlocs : str, optional, default 'all' Keyword controlling which alternate locations are printed to the resulting PDB file. Allowable options are: - 'all' : print all alternate locations - 'first' : print only the first alternate locations - 'occupancy' : print the one with the largest occupancy. If two conformers have the same occupancy, the first one to occur is printed Input is case-insensitive, and partial strings are permitted as long as it is a substring of one of the above options that uniquely identifies the choice. write_anisou : bool, optional, default False If True, an ANISOU record is written for every atom that has one. If False, ANISOU records are not written. charmm : bool, optional, default False If True, SEGID will be written in columns 73 to 76 of the PDB file in the typical CHARMM-style PDB output. This will be omitted for any atom that does not contain a SEGID identifier. use_hetatoms: bool, optional, default True If True, certain atoms will have the HETATM tag instead of ATOM as per the PDB-standard. standard_resnames : bool, optional, default False If True, common aliases for various amino and nucleic acid residues will be converted into the PDB-standard values. increase_tercount : bool, optional, default True If True, the TER atom number field increased by one compared to atom card preceding it; this conforms to PDB standard. write_links : bool, optional, default False If True, any LINK records stored in the Structure will be written to the LINK records near the top of the PDB file. If this is True, then renumber *must* be False or a ValueError will be thrown Notes ----- If multiple coordinate frames are present, these will be written as separate models (but only the unit cell from the first model will be written, as the PDB standard dictates that only one set of unit cells shall be present). """ # Determine if we have *any* atom or residue numbers set. If none of # them are set, force renumbering no_atom_numbers_assigned = {a.number for a in struct.atoms} == {-1} no_residue_numbers_assigned = {r.number for r in struct.residues} == {-1} renumber = renumber or (no_atom_numbers_assigned and no_residue_numbers_assigned) if renumber and write_links: raise ValueError('write_links requires renumber=False AND original (not implied) ' 'numbers to be assigned') if altlocs.lower() == 'all'[:len(altlocs)]: altlocs = 'all' elif altlocs.lower() == 'first'[:len(altlocs)]: altlocs = 'first' elif altlocs.lower() == 'occupancy'[:len(altlocs)]: altlocs = 'occupancy' else: raise ValueError("Illegal value of occupancy [%s]; expected 'all', " "'first', or 'occupancy'" % altlocs) own_handle = False if not hasattr(dest, 'write'): dest = genopen(dest, 'w') own_handle = True if charmm: atomrec = ('ATOM %5d %-4s%1s%-4s%1s%4d%1s %8.3f%8.3f%8.3f%6.2f' '%6.2f %-4s%2s%-2s\n') anisourec = 'ANISOU%5d %-4s%1s%-4s%1s%4d%1s %7d%7d%7d%7d%7d%7d %2s%-2s\n' terrec = 'TER %5d %-4s%1s%4d\n' reslen = 4 else: atomrec = ('ATOM %5d %-4s%1s%-3s %1s%4d%1s %8.3f%8.3f%8.3f%6.2f' '%6.2f %-4s%2s%-2s\n') anisourec = 'ANISOU%5d %-4s%1s%-3s %1s%4d%1s %7d%7d%7d%7d%7d%7d %2s%-2s\n' terrec = ('TER %5d %-3s %1s%4d\n') reslen = 3 linkrec = ('LINK %-4s%1s%-3s %1s%4d%1s ' '%-4s%1s%-3s %1s%4d%1s %6s %6s %5.2f\n') hetatomrec = atomrec.replace('ATOM ', 'HETATM') if use_hetatoms else atomrec if struct.box is not None: dest.write('CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f %-11s%4s\n' % ( struct.box[0], struct.box[1], struct.box[2], struct.box[3], struct.box[4], struct.box[5], struct.space_group, '')) if struct.symmetry is not None: fmt = '%d%4d%10.6f%10.6f%10.6f%15.5f\n' for index, arr in enumerate(struct.symmetry.data): arr_list = [1 + index % 3, 1 + index//3] + arr.tolist() symm_line = "REMARK 290 SMTRY" + fmt % tuple(arr_list) dest.write(symm_line) if coordinates is not None: coords = np.array(coordinates, copy=False, subok=True) try: coords = coords.reshape((-1, len(struct.atoms), 3)) except ValueError: raise TypeError("Coordinates has unexpected shape") else: coords = struct.get_coordinates('all') if coords is None: raise ValueError('Cannot write PDB file with no coordinates') # Create a function to process each atom and return which one we want # to print, based on our alternate location choice if altlocs == 'all': def print_atoms(atom, coords): return atom, atom.other_locations, coords[atom.idx] elif altlocs == 'first': def print_atoms(atom, coords): return atom, dict(), coords[atom.idx] elif altlocs == 'occupancy': def print_atoms(atom, coords): occ = atom.occupancy a = atom for key, item in iteritems(atom.other_locations): if item.occupancy > occ: occ = item.occupancy a = item return a, dict(), [a.xx, a.xy, a.xz] else: assert False, 'Should not be here' if standard_resnames: standardize = lambda x: _standardize_resname(x)[:reslen] else: standardize = lambda x: (x[:reslen], _is_hetatm(x)) nmore = 0 # how many *extra* atoms have been added? last_number = 0 if write_links: for link in struct.links: rec = ( _format_atom_name_for_pdb(link.atom1), link.atom1.altloc, link.atom1.residue.name, link.atom1.residue.chain, link.atom1.residue.number, link.atom1.residue.insertion_code, _format_atom_name_for_pdb(link.atom2), link.atom2.altloc, link.atom2.residue.name, link.atom2.residue.chain, link.atom2.residue.number, link.atom2.residue.insertion_code, link.symmetry_op1, link.symmetry_op2, link.length, ) dest.write(linkrec % rec) for model, coord in enumerate(coords): if coords.shape[0] > 1: dest.write('MODEL %5d\n' % (model+1)) for res in struct.residues: if renumber: atoms = res.atoms else: atoms = sorted(res.atoms, key=lambda atom: atom.number) if charmm: segid = (res.segid or res.chain)[:4] else: segid = '' for atom in atoms: pa, others, (x, y, z) = print_atoms(atom, coord) # Figure out the serial numbers we want to print if renumber: anum = _number_truncated_to_n_digits(atom.idx + 1 + nmore, 5) rnum = _number_truncated_to_n_digits(res.idx + 1, 4) else: anum = _number_truncated_to_n_digits(pa.number, 5) rnum = _number_truncated_to_n_digits(res.number, 4) last_number = anum # Do any necessary name munging to respect the PDB spec aname = _format_atom_name_for_pdb(pa) resname, hetatom = standardize(res.name) if hetatom: rec = hetatomrec else: rec = atomrec dest.write(rec % (anum, aname, pa.altloc, resname, res.chain[:1], rnum, res.insertion_code[:1], x, y, z, pa.occupancy, pa.bfactor, segid, Element[pa.atomic_number].upper(), '')) if write_anisou and pa.anisou is not None: anisou = [int(ani*1e4) for ani in pa.anisou] dest.write(anisourec % (anum, aname, pa.altloc, resname, res.chain[:1], rnum, res.insertion_code[:1], anisou[0], anisou[1], anisou[2], anisou[3], anisou[4], anisou[5], Element[pa.atomic_number].upper(), '')) for key in sorted(others.keys()): oatom = others[key] x, y, z = oatom.xx, oatom.xy, oatom.xz if renumber: nmore += 1 anum = (pa.idx + 1 + nmore) else: anum = oatom.number or last_number + 1 anum = anum - anum // 100000 * 100000 last_number = anum aname = _format_atom_name_for_pdb(oatom) dest.write(rec % (anum, aname, key, resname, res.chain[:1], rnum, res.insertion_code[:1], x, y, z, oatom.occupancy, oatom.bfactor, segid, Element[oatom.atomic_number].upper(), '')) if write_anisou and oatom.anisou is not None: anisou = [int(ani*1e4) for ani in oatom.anisou] el = Element[oatom.atomic_number].upper() dest.write(anisourec % (anum, aname, oatom.altloc[:1], resname, res.chain[:1], rnum, res.insertion_code[:1], anisou[0], anisou[1], anisou[2], anisou[3], anisou[4], anisou[5], el, '')) if res.ter or (len(struct.bonds) > 0 and _needs_ter_card(res)): if increase_tercount: dest.write(terrec % (anum+1, resname, res.chain, rnum)) if renumber: nmore += 1 else: last_number += 1 else: dest.write(terrec % (anum, resname, res.chain, rnum)) if coords.shape[0] > 1: dest.write('ENDMDL\n') dest.write("%-80s\n" % "END") if own_handle: dest.close()
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
[docs]@add_metaclass(FileFormatType) class CIFFile(object): """ Standard PDBx/mmCIF file format parser and writer """ #===================================================
[docs] @staticmethod def id_format(filename): """ Identifies the file type as a PDBx/mmCIF file Parameters ---------- filename : str Name of the file to check format for Returns ------- is_fmt : bool True if it is a PDBx/mmCIF file """ f = genopen(filename) try: for line in f: if line.startswith('#'): continue if line[:5] == 'data_' and len(line.split()) == 1: return True else: return False return False finally: f.close()
#===================================================
[docs] @staticmethod def download(pdb_id, timeout=10, saveto=None): """ Goes to the wwPDB website and downloads the requested PDBx/mmCIF, loading it as a :class:`Structure` instance Parameters ---------- pdb_id : str The 4-letter PDB ID to try and download from the RCSB PDB database timeout : float, optional The number of seconds to wait before raising a timeout error. Default is 10 seconds saveto : str, optional If provided, this will be treated as a file name to which the PDB file will be saved. If None (default), no CIF file will be written. This will be a verbatim copy of the downloaded CIF file, unlike the somewhat-stripped version you would get by using :meth:`Structure.write_cif <parmed.structure.Structure.write_cif>` Returns ------- struct : :class:`Structure <parmed.structure.Structure>` Structure instance populated by the requested PDBx/mmCIF Raises ------ socket.timeout if the connection times out while trying to contact the FTP server IOError if there is a problem retrieving the requested PDB ImportError if the gzip module is not available TypeError if pdb_id is not a 4-character string """ import gzip if not isinstance(pdb_id, string_types) or len(pdb_id) != 4: raise ValueError('pdb_id must be the 4-letter PDB code') pdb_id = pdb_id.lower() ftp = ftplib.FTP('ftp.wwpdb.org', timeout=timeout) ftp.login() fileobj = io.BytesIO() try: ftp.retrbinary('RETR /pub/pdb/data/structures/divided/mmCIF/' '%s/%s.cif.gz' % (pdb_id[1:3], pdb_id), fileobj.write) except ftplib.all_errors as err: raise IOError('Could not retrieve PDB ID %s; %s' % (pdb_id, err)) finally: ftp.close() fileobj.seek(0) if PY3: fileobj = io.TextIOWrapper(gzip.GzipFile(fileobj=fileobj, mode='r')) else: fileobj = gzip.GzipFile(fileobj=fileobj, mode='r') if saveto is not None: with closing(genopen(saveto, 'w')) as f: f.write(fileobj.read()) fileobj.seek(0) return CIFFile.parse(fileobj)
#===================================================
[docs] @staticmethod def parse(filename, skip_bonds=False): """ Read a PDBx or mmCIF file and return a populated `Structure` class Parameters ---------- filename : ``str or file-like`` Name of PDB file to read, or a file-like object that can iterate over the lines of a PDB. Compressed file names can be specified and are determined by file-name extension (e.g., file.pdb.gz, file.pdb.bz2) skip_bonds : bool, optional If True, skip trying to assign bonds. This can save substantial time when parsing large files with non-standard residue names. However, no bonds are assigned. This is OK if, for instance, the CIF file is being parsed simply for its coordinates. Default is False. Metadata -------- The PDB parser also adds metadata to the returned Structure object that may be present in the PDB file experimental : ``str`` EXPDTA record journal : ``str`` JRNL record authors : ``str`` AUTHOR records keywords : ``str`` KEYWDS records doi : ``str`` DOI from the JRNL record pmid : ``str`` PMID from the JRNL record journal_authors : ``str`` Author info from the JRNL record volume : ``str`` Volume of the published article from the JRNL record page : ``str`` Page of the published article from the JRNL record title : ``str`` TITL section of the JRNL record year : ``str`` Year that the article was published, from the JRNL record resolution : ``float`` The X-RAY resolution in Angstroms, or None if not found related_entries : ``list of (str, str)`` List of entries in other databases Returns ------- structure1 [, structure2 [, structure3 [, ...] ] ] structure# : :class:`Structure` The Structure object initialized with all of the information from the PDBx/mmCIF file. No bonds or other topological features are added by default. If multiple structures are defined in the CIF file, multiple Structure instances will be returned as a tuple. Raises ------ ValueError if the file severely violates the PDB format specification. If this occurs, check the formatting on each line and make sure it matches the others. """ if isinstance(filename, string_types): own_handle = True fileobj = genopen(filename, 'r') else: own_handle = False fileobj = filename try: cifobj = PdbxReader(fileobj) data = [] cifobj.read(data) finally: if own_handle: fileobj.close() structures = [] for cont in data: struct = Structure() structures.append(struct) # Add metadata fields struct.experimental = struct.journal = struct.authors = '' struct.keywords = struct.doi = struct.pmid = '' struct.journal_authors = struct.volume = struct.title = '' struct.year = struct.resolution = None struct.related_entries = [] # Now we have the data. First get the metadata if it exists exptl = cont.getObj('exptl') if exptl is not None: struct.experimental = exptl.getValue('method') auth = cont.getObj('audit_author') if auth is not None: nameidx = auth.getAttributeIndex('name') if nameidx != -1: struct.authors = ', '.join([t[nameidx] for t in auth.getRowList()]) reflns = cont.getObj('reflns') if reflns is not None: res = reflns.getValue('d_resolution_high') if res != '?': try: struct.resolution = float(res) except ValueError: warnings.warn('Could not convert resolution (%s) to float' % res) cite = cont.getObj('citation_author') if cite is not None: nameidx = cite.getAttributeIndex('name') if nameidx != -1: journal_authors = [] for i in range(cite.getRowCount()): a = cite.getRow(i)[nameidx] if a not in journal_authors: journal_authors.append(a) struct.journal_authors = ', '.join(journal_authors) cite = cont.getObj('citation') if cite is not None: doiid = cite.getAttributeIndex('pdbx_database_id_DOI') pmiid = cite.getAttributeIndex('pdbx_database_id_PubMed') titlid = cite.getAttributeIndex('title') yearid = cite.getAttributeIndex('year') pageid = cite.getAttributeIndex('page_first') jrnlid = cite.getAttributeIndex('journal_abbrev') volid = cite.getAttributeIndex('journal_volume') rows = cite.getRowList() if doiid != -1: struct.doi = ', '.join([row[doiid] for row in rows if row[doiid] != '?']) if pmiid != -1: struct.pmid = ', '.join([row[pmiid] for row in rows if row[pmiid] != '?']) if titlid != -1: struct.title = '; '.join([row[titlid] for row in rows]) if yearid != -1: struct.year = ', '.join([row[yearid] for row in rows]) if pageid != -1: struct.page = ', '.join([row[pageid] for row in rows]) if jrnlid != -1: struct.journal = '; '.join([row[jrnlid] for row in rows]) if volid != -1: struct.volume = ', '.join([row[volid] for row in rows]) keywds = cont.getObj('struct_keywords') if keywds is not None: textid = keywds.getAttributeIndex('text') if textid != -1: rows = keywds.getRowList() struct.keywords = ', '.join([row[textid] for row in rows]) struct.keywords = [key.strip() for key in struct.keywords.split(',') if key.strip()] dbase = cont.getObj('pdbx_database_related') if dbase is not None: dbid = dbase.getAttributeIndex('db_id') nameid = dbase.getAttributeIndex('db_name') if dbid != -1 and nameid != -1: rows = dbase.getRowList() struct.related_entries = [(r[dbid],r[nameid]) for r in rows] # Now go through all of the atoms. Any items that do *not* exist are # given an index of -1. So we append an empty string on the end of # each row so that the default value for any un-specified value is # the empty string. This avoids needing any conditionals inside the # loop atoms = cont.getObj('atom_site') atnumid = atoms.getAttributeIndex('id') elemid = atoms.getAttributeIndex('type_symbol') atnameid = atoms.getAttributeIndex('auth_atom_id') altlocid = atoms.getAttributeIndex('label_alt_id') resnameid = atoms.getAttributeIndex('auth_comp_id') chainid = atoms.getAttributeIndex('auth_asym_id') resnumid = atoms.getAttributeIndex('auth_seq_id') inscodeid = atoms.getAttributeIndex('pdbx_PDB_ins_code') xid = atoms.getAttributeIndex('Cartn_x') yid = atoms.getAttributeIndex('Cartn_y') zid = atoms.getAttributeIndex('Cartn_z') occupid = atoms.getAttributeIndex('occupancy') bfactorid = atoms.getAttributeIndex('B_iso_or_equiv') modelid = atoms.getAttributeIndex('pdbx_PDB_model_num') origmodel = None lastmodel = None all_coords = [] xyz = [] atommap = dict() last_atom = Atom() for i in range(atoms.getRowCount()): row = atoms.getRow(i) + [''] atnum = int(row[atnumid]) elem = row[elemid] atname = row[atnameid] altloc = row[altlocid] if altloc == '.': altloc = '' resname = row[resnameid] chain = row[chainid] resnum = int(row[resnumid]) inscode = row[inscodeid] if inscode in '?.': inscode = '' model = int(row[modelid]) if origmodel is None: origmodel = lastmodel = model x, y, z = float(row[xid]), float(row[yid]), float(row[zid]) occup = float(row[occupid]) bfactor = float(row[bfactorid]) # Try to figure out the element elem = '%-2s' % elem # Make sure we have at least 2 characters if elem[0] == ' ': elem = elem[1] + ' ' try: atsym = (elem[0] + elem[1].lower()).strip() atomic_number = AtomicNum[atsym] mass = Mass[atsym] except KeyError: # Now try based on the atom name... but don't try too hard # (e.g., don't try to differentiate b/w Ca and C) try: atomic_number = AtomicNum[atname.strip()[0].upper()] mass = Mass[atname.strip()[0].upper()] except KeyError: try: sym = atname.strip()[:2] sym = '%s%s' % (sym[0].upper(), sym[1].lower()) atomic_number = AtomicNum[sym] mass = Mass[sym] except KeyError: atomic_number = 0 # give up mass = 0.0 if atname.startswith('EP') or atname.startswith('LP'): atom = ExtraPoint(atomic_number=atomic_number, name=atname, mass=mass, occupancy=occup, bfactor=bfactor, altloc=altloc, number=atnum) else: atom = Atom(atomic_number=atomic_number, name=atname, mass=mass, occupancy=occup, bfactor=bfactor, altloc=altloc, number=atnum) atom.xx, atom.xy, atom.xz = x, y, z if (_compare_atoms(last_atom, atom, resname, resnum, chain, '', inscode) and altloc): atom.residue = last_atom.residue last_atom.other_locations[altloc] = atom else: if model == origmodel: # Only add the atoms once struct.add_atom(atom, resname, resnum, chain, inscode) last_atom = atom if model == lastmodel: xyz.extend([x, y, z]) else: if all_coords and len(xyz) != len(all_coords[-1]): raise ValueError('All frames must have same number ' 'of atoms') all_coords.append(xyz) xyz = [x, y, z] lastmodel = model # Keep a mapping in case we need to go back and add attributes, # like anisotropic b-factors if model == origmodel: key = (resnum,resname,inscode,chain,atnum,altloc,atname) atommap[key] = atom # Check for unit cell parameters cell = cont.getObj('cell') if cell is not None: aid = cell.getAttributeIndex('length_a') bid = cell.getAttributeIndex('length_b') cid = cell.getAttributeIndex('length_c') alphaid = cell.getAttributeIndex('angle_alpha') betaid = cell.getAttributeIndex('angle_beta') gammaid = cell.getAttributeIndex('angle_gamma') row = cell.getRow(0) struct.box = np.array( [float(row[aid]), float(row[bid]), float(row[cid]), float(row[alphaid]), float(row[betaid]), float(row[gammaid])] ) symmetry = cont.getObj('symmetry') if symmetry is not None: spaceid = symmetry.getAttributeIndex('space_group_name_H-M') row = symmetry.getRow(0) if spaceid != -1: struct.space_group = row[spaceid] # Check for anisotropic B-factors anisou = cont.getObj('atom_site_anisotrop') if anisou is not None: atnumid = anisou.getAttributeIndex('id') atnameid = anisou.getAttributeIndex('pdbx_auth_atom_id') altlocid = anisou.getAttributeIndex('pdbx_label_alt_id') resnameid = anisou.getAttributeIndex('pdbx_auth_comp_id') chainid = anisou.getAttributeIndex('pdbx_auth_asym_id') resnumid = anisou.getAttributeIndex('pdbx_auth_seq_id') inscodeid = anisou.getAttributeIndex('pdbx_PDB_ins_code') u11id = anisou.getAttributeIndex('U[1][1]') u22id = anisou.getAttributeIndex('U[2][2]') u33id = anisou.getAttributeIndex('U[3][3]') u12id = anisou.getAttributeIndex('U[1][2]') u13id = anisou.getAttributeIndex('U[1][3]') u23id = anisou.getAttributeIndex('U[2][3]') if -1 in (atnumid, atnameid, altlocid, resnameid, chainid, resnumid, u11id, u22id, u33id, u12id, u13id, u23id): warnings.warn('Incomplete anisotropic B-factor CIF ' 'section. Skipping', PDBWarning) else: try: for i in range(anisou.getRowCount()): row = anisou.getRow(i) + [''] atnum = int(row[atnumid]) atname = row[atnameid] altloc = row[altlocid] resname = row[resnameid] chain = row[chainid] resnum = int(row[resnumid]) inscode = row[inscodeid] u11 = float(row[u11id]) u22 = float(row[u22id]) u33 = float(row[u33id]) u12 = float(row[u12id]) u13 = float(row[u13id]) u23 = float(row[u23id]) if altloc == '.': altloc = '' if inscode in '?.': inscode = '' key = (resnum, resname, inscode, chain, atnum, altloc, atname) atommap[key].anisou = np.array( [u11, u22, u33, u12, u13, u23] ) except (ValueError, KeyError): # If at least one went wrong, set them all to None for key, atom in iteritems(atommap): atom.anisou = None warnings.warn('Problem processing anisotropic ' 'B-factors. Skipping', PDBWarning) if xyz: if len(xyz) != len(struct.atoms) * 3: raise ValueError('Corrupt CIF; all models must have the ' 'same atoms') all_coords.append(xyz) if all_coords: struct._coordinates = np.array(all_coords).reshape( (-1, len(struct.atoms), 3)) # Make sure we assign bonds for all of the structures we parsed if not skip_bonds: for struct in structures: struct.assign_bonds() # Build the return value if len(structures) == 1: return structures[0] return tuple(structures)
#===================================================
[docs] @staticmethod def write(struct, dest, renumber=True, coordinates=None, altlocs='all', write_anisou=False, standard_resnames=False): """ Write a PDB file from the current Structure instance Parameters ---------- struct : :class:`Structure` The structure from which to write the PDBx/mmCIF file dest : ``str or file-like`` Either a file name or a file-like object containing a `write` method to which to write the PDB file. If it is a filename that ends with .gz or .bz2, a compressed version will be written using either gzip or bzip2, respectively. renumber : ``bool`` If True, renumber the atoms and residues sequentially as they are stored in the structure. If False, use the original numbering if it was assigned previously coordinates : ``array-like of float`` If provided, these coordinates will be written to the PDB file instead of the coordinates stored in the structure. These coordinates should line up with the atom order in the structure (not necessarily the order of the "original" PDB file if they differ) altlocs : ``str`` Keyword controlling which alternate locations are printed to the resulting PDB file. Allowable options are: - 'all' : (default) print all alternate locations - 'first' : print only the first alternate locations - 'occupancy' : print the one with the largest occupancy. If two conformers have the same occupancy, the first one to occur is printed Input is case-insensitive, and partial strings are permitted as long as it is a substring of one of the above options that uniquely identifies the choice. write_anisou : ``bool`` If True, an ANISOU record is written for every atom that has one. If False, ANISOU records are not written standard_resnames : bool, optional If True, common aliases for various amino and nucleic acid residues will be converted into the PDB-standard values. Default is False Notes ----- If multiple coordinate frames are present, these will be written as separate models (but only the unit cell from the first model will be written, as the PDBx standard dictates that only one set of unit cells shall be present). """ if altlocs.lower() == 'all'[:len(altlocs)]: altlocs = 'all' elif altlocs.lower() == 'first'[:len(altlocs)]: altlocs = 'first' elif altlocs.lower() == 'occupancy'[:len(altlocs)]: altlocs = 'occupancy' else: raise ValueError("Illegal value of occupancy [%s]; expected 'all', " "'first', or 'occupancy'" % altlocs) own_handle = False if not hasattr(dest, 'write'): dest = genopen(dest, 'w') own_handle = True # Make the main container cont = containers.DataContainer('cell') # Add cell info if applicable if struct.box is not None: cell = containers.DataCategory('cell') cell.appendAttribute('length_a') cell.appendAttribute('length_b') cell.appendAttribute('length_c') cell.appendAttribute('angle_alpha') cell.appendAttribute('angle_beta') cell.appendAttribute('angle_gamma') cell.append(struct.box[:]) cont.append(cell) # symmetry sym = containers.DataCategory('symmetry') sym.appendAttribute('space_group_name_H-M') sym.append([struct.space_group]) cont.append(sym) if coordinates is not None: coords = np.array(coordinates, copy=False, subok=True) try: coords = coords.reshape((-1, len(struct.atoms), 3)) except ValueError: raise TypeError("Coordinates has unexpected shape") else: coords = struct.get_coordinates('all') if coords is None: raise ValueError('Cannot write CIF file with no coordinates') # Create a function to process each atom and return which one we want # to print, based on our alternate location choice if altlocs == 'all': def print_atoms(atom, coords): return atom, atom.other_locations, coords[atom.idx] elif altlocs == 'first': def print_atoms(atom, coords): return atom, dict(), coords[atom.idx] elif altlocs == 'occupancy': def print_atoms(atom, coords): occ = atom.occupancy a = atom for key, item in iteritems(atom.other_locations): if item.occupancy > occ: occ = item.occupancy a = item return a, dict(), [a.xx, a.xy, a.xz] else: assert False, 'Should not be here' if standard_resnames: standardize = lambda x: _standardize_resname(x) else: standardize = lambda x: (x, _is_hetatm(x)) # Now add the atom section. Include all names that the CIF standard # usually includes, but put '?' in sections that contain data we don't # store in the Structure, Residue, or Atom classes cifatoms = containers.DataCategory('atom_site') cont.append(cifatoms) cifatoms.setAttributeNameList( ['group_PDB', 'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id', 'label_asym_id', 'label_entity_id', 'label_seq_id', 'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv', 'Cartn_x_esd', 'Cartn_y_esd', 'Cartn_z_esd', 'occupancy_esd', 'B_iso_or_equiv_esd', 'pdbx_formal_charge', 'auth_seq_id', 'auth_comp_id', 'auth_asym_id', 'auth_atom_id', 'pdbx_PDB_model_num'] ) write_anisou = write_anisou and any(atom.anisou is not None for atom in struct.atoms) if write_anisou: cifanisou = containers.DataCategory('atom_site_anisotrop') cont.append(cifanisou) cifanisou.setAttributeNameList( ['id', 'type_symbol', 'pdbx_label_atom_id', 'pdbx_label_alt_id', 'pdbx_label_comp_id', 'pdbx_label_asym_id', 'pdbx_label_seq_id', 'U[1][1]', 'U[2][2]', 'U[3][3]', 'U[1][2]', 'U[1][3]', 'U[2][3]', 'U[1][1]_esd', 'U[2][2]_esd', 'U[3][3]_esd', 'U[1][2]_esd', 'U[1][3]_esd', 'U[2][3]_esd', 'pdbx_auth_seq_id', 'pdbx_auth_comp_id', 'pdbx_auth_asym_id', 'pdbx_auth_atom_id'] ) nmore = 0 # how many *extra* atoms have been added? last_number = 0 last_rnumber = 0 for model, coord in enumerate(coords): for res in struct.residues: if renumber: atoms = res.atoms else: atoms = sorted(res.atoms, key=lambda atom: atom.number) resname, hetatom = standardize(res.name) if hetatom: atomrec = 'HETATM' else: atomrec = 'ATOM ' for atom in atoms: pa, others, (x, y, z) = print_atoms(atom, coord) # Figure out the serial numbers we want to print if renumber: anum = (atom.idx + 1 + nmore) rnum = (res.idx + 1) else: anum = (pa.number or last_number + 1) rnum = (atom.residue.number or last_rnumber + 1) last_number = anum last_rnumber = rnum cifatoms.append( [atomrec, anum, Element[pa.atomic_number].upper(), pa.name, pa.altloc, resname, res.chain, '?', rnum, res.insertion_code, x, y, z, pa.occupancy, pa.bfactor, '?', '?', '?', '?', '?', '', rnum, resname, res.chain, pa.name, str(model+1)] ) if write_anisou and pa.anisou is not None: cifanisou.append( [anum, Element[pa.atomic_number].upper(), pa.name, pa.altloc, resname, res.chain, rnum, pa.anisou[0], pa.anisou[1], pa.anisou[2], pa.anisou[3], pa.anisou[4], pa.anisou[5], '?', '?', '?', '?', '?', '?', rnum, resname, res.chain, pa.name] ) for key in sorted(others.keys()): oatom = others[key] x, y, z = oatom.xx, oatom.xy, oatom.xz if renumber: nmore += 1 anum = (pa.idx + 1 + nmore) else: anum = oatom.number or last_number + 1 last_number = anum el = Element[oatom.atomic_number].upper() cifatoms.append( [atomrec, anum, el, oatom.name, oatom.altloc, resname, res.chain, '?', rnum, res.insertion_code, x, y, z, oatom.occupancy, oatom.bfactor, '?', '?', '?', '?', '?', '', rnum, resname, res.chain, oatom.name, '1'] ) if write_anisou and oatom.anisou is not None: cifanisou.append( [anum, Element[oatom.atomic_number].upper(), oatom.name, oatom.altloc, resname, res.chain, rnum, oatom.anisou[0], oatom.anisou[1], oatom.anisou[2], oatom.anisou[3], oatom.anisou[4], oatom.anisou[5], '?', '?', '?', '?', '?', '?', rnum, resname, res.chain, oatom.name] ) # Now write the PDBx file writer = PdbxWriter(dest) writer.write([cont]) if own_handle: dest.close()
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ def _find_atom_index(struct, idx): """ Returns the atom with the given index in the structure. This is required because atom indices may not start from 1 and may contain gaps in a PDB file. This tries to find atoms quickly, assuming that indices *do* start from 1 and have no gaps. It then looks up or down, depending on whether we hit an index too high or too low. So it *assumes* that the sequence is monotonically increasing. If the atom can't be found, None is returned """ idx0 = min(max(idx - 1, 0), len(struct.atoms)-1) if struct[idx0].number == idx: return struct[idx0] if struct[idx0].number < idx: idx0 += 1 while idx0 < len(struct.atoms): if struct[idx0].number == idx: return struct[idx0] idx0 += 1 return None # not found else: idx0 -= 1 while idx0 > 0: if struct[idx0].number == idx: return struct[idx0] idx0 -= 1 return None # not found def _needs_ter_card(res): """ Determines if a TER card is needed by seeing if the residue is a polymeric residue that is *not* bonded to the next residue """ # First see if it's in the list of standard biomolecular residues. If so, # and it has no tail, no TER is needed std_resname = _standardize_resname(res.name)[0] if std_resname in StandardBiomolecularResidues: is_std_res = True if StandardBiomolecularResidues[std_resname].tail is None: return False else: is_std_res = False my_res_idx = res.idx residxs = set() for atom in res.atoms: for bond in atom.bonds: residxs |= {bond.atom1.residue.idx, bond.atom2.residue.idx} if my_res_idx + 1 in residxs: return False # It's connected to next residue elif is_std_res: return True else: # Heuristic -- add a TER if it's bonded to the previous residue, which # indicates it's polymeric. Otherwise don't. return my_res_idx - 1 in residxs def _format_atom_name_for_pdb(atom): if len(atom.name) < 4 and len(Element[atom.atomic_number]) != 2: return ' %-3s' % atom.name return atom.name[:4]
[docs]def try_convert(value, cast_type, default=None): try: return cast_type(value) except ValueError: return default