Source code for parmed.formats.mol2

"""
This module contains parsers for mol2-format files (with support for the mol3
extension described at http://q4md-forcefieldtools.org/Tutorial/leap-mol3.php
"""
from __future__ import print_function, division, absolute_import

import copy
from parmed.exceptions import Mol2Error, ParameterWarning
from parmed.formats.registry import FileFormatType
from parmed.modeller.residue import ResidueTemplate, ResidueTemplateContainer
from parmed.periodic_table import element_by_name, AtomicNum
from parmed.residue import AminoAcidResidue, RNAResidue, DNAResidue
from parmed.structure import Structure
from parmed.topologyobjects import Atom, Bond
from parmed.utils.io import genopen
from parmed.utils.six import add_metaclass, string_types
import warnings

[docs]@add_metaclass(FileFormatType) class Mol2File(object): """ Class to read and write TRIPOS Mol2 files """ BOND_ORDER_MAP = dict(ar=1.5, am=1.25) REVERSE_BOND_ORDER_MAP = {1.25 : 'am', 1.5 : 'ar'} #===================================================
[docs] @staticmethod def id_format(filename): """ Identify the file as a Mol2 (or Mol3) file format or not Parameters ---------- filename : str Name of the file to test whether or not it is a mol2 file Returns ------- is_fmt : bool True if it is a mol2 (or mol3) file, False otherwise """ f = genopen(filename, 'r') try: for line in f: if line.startswith('#'): continue if not line.strip(): continue return line.startswith('@<TRIPOS>') return False finally: f.close()
#===================================================
[docs] @staticmethod def parse(filename, structure=False): """ Parses a mol2 file (or mol3) file Parameters ---------- filename : str or file-like Name of the file to parse or file-like object to parse from structure : bool, optional If True, the return value is a :class:`Structure` instance. If False, it is either a :class:`ResidueTemplate` or :class:`ResidueTemplateContainter` instance, depending on whether there is one or more than one residue defined in it. Default is False Returns ------- molecule : :class:`Structure`, :class:`ResidueTemplate`, or :class:`ResidueTemplateContainer` The molecule defined by this mol2 file Raises ------ Mol2Error If the file format is not recognized or non-numeric values are present where integers or floating point numbers are expected. Also raises Mol2Error if you try to parse a mol2 file that has multiple @<MOLECULE> entries with ``structure=True``. """ if isinstance(filename, string_types): f = genopen(filename, 'r') own_handle = True else: f = filename own_handle = False rescont = ResidueTemplateContainer() struct = Structure() restemp = ResidueTemplate() mol_info = [] multires_structure = False try: section = None last_residue = None headtail = 'head' molecule_number = 0 for line in f: if line.startswith('#'): continue if not line.strip() and section is None: continue if line.startswith('@<TRIPOS>'): section = line[9:].strip() if section == 'MOLECULE' and (restemp.atoms or rescont): if structure: raise Mol2Error('Cannot convert MOL2 with multiple ' '@<MOLECULE>s to a Structure') # Set the residue name from the MOL2 title if the # molecule had only 1 residue and it was given a name in # the title if not multires_structure and mol_info[0]: restemp.name = mol_info[0] multires_structure = False rescont.append(restemp) restemp = ResidueTemplate() struct = Structure() last_residue = None molecule_number += 1 mol_info = [] continue if section is None: raise Mol2Error('Bad mol2 file format') if section == 'MOLECULE': # Section formatted as follows: # mol_name # num_atoms [num_bonds [num_substr [num_feat [num_sets]]]] # mol_type # charge_type # [status_bits] # [mol_comment] # TODO: Do something with the name. if len(mol_info) == 0: mol_info.append(line.strip()) elif len(mol_info) == 1: mol_info.append([int(x) for x in line.split()]) elif len(mol_info) == 2: mol_info.append(line.strip()) elif len(mol_info) == 3: mol_info.append(line.strip()) # Ignore the rest continue if section == 'ATOM': # Section formatted as follows: # atom_id -- serial number of atom # atom_name -- name of the atom # x -- X-coordinate of the atom # y -- Y-coordinate of the atom # z -- Z-coordinate of the atom # atom_type -- type of the atom # subst_id -- Residue serial number # subst_name -- Residue name # charge -- partial atomic charge # status_bit -- ignored words = line.split() id = int(words[0]) name = words[1] x = float(words[2]) y = float(words[3]) z = float(words[4]) typ = words[5] try: resid = int(words[6]) except IndexError: resid = 0 try: resname = words[7] except IndexError: resname = 'UNK' if 'NO_CHARGES' not in mol_info: try: charge = float(words[8]) except IndexError: charge = 0 else: charge = 0 if last_residue is None: last_residue = (resid, resname) restemp.name = resname atom = Atom(name=name, type=typ, number=id, charge=charge) atom.xx, atom.xy, atom.xz = x, y, z struct.add_atom(atom, resname, resid) if last_residue != (resid, resname): rescont.append(restemp) restemp = ResidueTemplate() restemp.name = resname last_residue = (resid, resname) multires_structure = True try: restemp.add_atom(copy.copy(atom)) except ValueError: # Allow mol2 files being parsed as a Structure to have # duplicate atom names if not structure: raise continue if section == 'BOND': # Section formatted as follows: # bond_id -- serial number of bond (ignored) # origin_atom_id -- serial number of first atom in bond # target_atom_id -- serial number of other atom in bond # bond_type -- string describing bond type # status_bits -- ignored words = line.split() int(words[0]) # Bond serial number... redundant and ignored a1 = int(words[1]) a2 = int(words[2]) try: order = words[3] except IndexError: order = 1.0 if order in Mol2File.BOND_ORDER_MAP: order = Mol2File.BOND_ORDER_MAP[order] else: try: order = float(order) except ValueError: warnings.warn('Mol2 bond order not recognized: %s' % order, ParameterWarning) order = 1.0 atom1 = struct.atoms.find_original_index(a1) atom2 = struct.atoms.find_original_index(a2) struct.bonds.append(Bond(atom1, atom2, order=order)) # Now add it to our residue container # See if it's a head/tail connection if atom1.residue is not atom2.residue: if atom1.residue.idx == len(rescont): res1 = restemp elif atom1.residue.idx < len(rescont): res1 = rescont[atom1.residue.idx] assert atom.residue.idx <= len(rescont), 'Bad bond!' if atom2.residue.idx == len(rescont): res2 = restemp elif atom2.residue.idx < len(rescont): res2 = rescont[atom2.residue.idx] assert atom.residue.idx <= len(rescont), 'Bad bond!' assert res1 is not res2, 'BAD identical residues' idx1 = atom1.idx - atom1.residue[0].idx idx2 = atom2.idx - atom2.residue[0].idx if atom1.residue.idx < atom2.residue.idx: res1.tail = res1[idx1] res2.head = res2[idx2] else: res1.head = res1[idx1] res2.tail = res2[idx2] elif not multires_structure: if not structure: restemp.add_bond(a1-1, a2-1, order) else: # Same residue, add the bond offset = atom1.residue[0].idx if atom1.residue.idx == len(rescont): res = restemp else: res = rescont[atom1.residue.idx] res.add_bond(atom1.idx-offset, atom2.idx-offset, order) continue if section == 'CRYSIN': # Section formatted as follows: # a -- length of first unit cell vector # b -- length of second unit cell vector # c -- length of third unit cell vector # alpha -- angle b/w b and c # beta -- angle b/w a and c # gamma -- angle b/w a and b # space group -- number of space group (ignored) # space group setting -- ignored words = line.split() box = [float(w) for w in words[:6]] if len(box) != 6: raise ValueError('%d box dimensions found; needed 6' % len(box)) struct.box = copy.copy(box) rescont.box = copy.copy(box) continue if section == 'SUBSTRUCTURE': # Section formatted as follows: # subst_id -- residue number # subst_name -- residue name # root_atom -- first atom of residue # subst_type -- ignored (usually 'RESIDUE') # dict_type -- type of substructure (ignored) # chain -- chain ID of residue # sub_type -- type of the chain # inter_bonds -- # of inter-substructure bonds # status -- ignored # comment -- ignored words = line.split() if not words: continue id = int(words[0]) resname = words[1] try: chain = words[5] except IndexError: chain = '' # Set the chain ID for res in struct.residues: if res.number == id and res.name == resname: res.chain = chain continue # MOL3 sections if section == 'HEADTAIL': atname, residx = line.split() residx = int(residx) if residx in (0, 1) or residx - 1 == len(rescont): res = restemp elif residx - 1 < len(rescont): res = rescont[residx-1] else: raise Mol2Error('Residue out of range in head/tail') for atom in res: if atom.name == atname: if headtail == 'head': res.head = atom headtail = 'tail' else: res.tail = atom headtail = 'head' break else: if headtail == 'head': headtail = 'tail' else: headtail = 'head' continue if section == 'RESIDUECONNECT': words = line.split() residx = int(words[0]) if residx - 1 == len(rescont): res = restemp elif residx - 1 < len(rescont): res = rescont[residx-1] else: raise Mol2Error('Residue out of range in ' 'residueconnect') for a in words[3:]: if a == '0': continue for atom in res: if atom.name == a: res.connections.append(atom) break else: raise Mol2Error('Residue connection atom %s not ' 'found in residue %d' % (a, residx)) if structure: for atom in struct.atoms: anum = _guess_atomic_number(atom.name, restemp) if anum == 0: anum = _guess_atomic_number(atom.type, restemp) atom.atomic_number = anum return struct elif len(rescont) > 0: if not multires_structure and mol_info[0]: restemp.name = mol_info[0] rescont.append(restemp) for res in rescont: for atom in res.atoms: anum = _guess_atomic_number(atom.name, restemp) if anum == 0: anum = _guess_atomic_number(atom.type, restemp) atom.atomic_number = anum return rescont else: for atom in restemp.atoms: anum = _guess_atomic_number(atom.name, restemp) if anum == 0: anum = _guess_atomic_number(atom.type, restemp) atom.atomic_number = anum return restemp except ValueError as e: raise Mol2Error('String conversion trouble: %s' % e) finally: if own_handle: f.close()
#===================================================
[docs] @staticmethod def write(struct, dest, mol3=False, split=False, compress_whitespace=False): """ Writes a mol2 file from a structure or residue template Parameters ---------- struct : :class:`Structure` or :class:`ResidueTemplate` or :class:`ResidueTemplateContainer` The input structure to write the mol2 file from dest : str or file-like obj Name of the file to write or open file handle to write to mol3 : bool, optional If True and ``struct`` is a ResidueTemplate or container, write HEAD/TAIL sections. Default is False split : bool, optional If True and ``struct`` is a ResidueTemplateContainer or a Structure with multiple residues, each residue is printed in a separate @<MOLECULE> section that appear sequentially in the output file compress_whitespace : bool, optional If True, seprate fields on one line with a single space instead of aligning them with whitespace. This is useful for parsers that truncate lines at 80 characters (e.g., some versions of OpenEye). However, it will not look as "neat" upon visual inspection in a text editor. Default is False. """ own_handle = False if not hasattr(dest, 'write'): own_handle = True dest = genopen(dest, 'w') if split: # Write sequentially if it is a multi-residue container or Structure if isinstance(struct, ResidueTemplateContainer): try: for res in struct: Mol2File.write(res, dest, mol3, compress_whitespace=compress_whitespace) finally: if own_handle: dest.close() return elif isinstance(struct, Structure) and len(struct.residues) > 1: try: for res in ResidueTemplateContainer.from_structure(struct): Mol2File.write(res, dest, mol3, compress_whitespace=compress_whitespace) finally: if own_handle: dest.close() return try: if isinstance(struct, ResidueTemplateContainer): natom = sum([len(c) for c in struct]) # To find the number of bonds, we need to total number of bonds # + the number of bonds that would be formed by "stitching" # together residues via their head and tail bonds = [] charges = [] bases = [1 for res in struct] for i, res in enumerate(struct): if i < len(struct) - 1: bases[i+1] = bases[i] + len(res) for i, res in enumerate(struct): for bond in res.bonds: bonds.append((bond.atom1.idx+bases[i], bond.atom2.idx+bases[i], bond.order)) if i < len(struct)-1 and (res.tail is not None and struct[i+1].head is not None): bonds.append((res.tail.idx+bases[i], struct[i+1].head.idx+bases[i+1], bond.order)) charges.extend([a.charge for a in res]) residues = struct name = struct.name or struct[0].name else: natom = len(struct.atoms) bonds = [(b.atom1.idx+1, b.atom2.idx+1, b.order) for b in struct.bonds] if isinstance(struct, ResidueTemplate): residues = [struct] name = struct.name else: residues = struct.residues name = struct.residues[0].name charges = [a.charge for a in struct.atoms] dest.write('@<TRIPOS>MOLECULE\n') dest.write('%s\n' % name) dest.write('%d %d %d 0 1\n' % (natom, len(bonds), len(residues))) if len(residues) == 1: dest.write('SMALL\n') else: for residue in residues: if AminoAcidResidue.has(residue.name): dest.write('PROTEIN\n') break if (RNAResidue.has(residue.name) or DNAResidue.has(residue.name)): dest.write('NUCLEIC\n') break else: dest.write('BIOPOLYMER\n') if not any(charges): dest.write('NO_CHARGES\n') printchg = False else: dest.write('USER_CHARGES\n') printchg = True # See if we want to print box info if hasattr(struct, 'box') and struct.box is not None: box = struct.box dest.write('@<TRIPOS>CRYSIN\n') if compress_whitespace: fmt = '%.4f %.4f %.4f %.4f %.4f %.4f 1 1\n' else: fmt = '%10.4f %10.4f %10.4f %10.4f %10.4f %10.4f 1 1\n' dest.write(fmt % tuple(box)) # Now do ATOM section dest.write('@<TRIPOS>ATOM\n') j = 1 for i, res in enumerate(residues): for atom in res: try: x = atom.xx except AttributeError: x = 0 try: y = atom.xy except AttributeError: y = 0 try: z = atom.xz except AttributeError: z = 0 if compress_whitespace: fmt = '%d %s %.4f %.4f %.4f %s %d %s' else: fmt = '%8d %-8s %10.4f %10.4f %10.4f %-8s %6d %-8s' dest.write(fmt % (j, atom.name, x, y, z, atom.type.strip() or atom.name, i+1, res.name)) if printchg: if compress_whitespace: fmt = ' %.6f\n' else: fmt = ' %10.6f\n' dest.write(fmt % atom.charge) else: dest.write('\n') j += 1 dest.write('@<TRIPOS>BOND\n') for i, bond in enumerate(bonds): if bond[2] in Mol2File.REVERSE_BOND_ORDER_MAP: order = Mol2File.REVERSE_BOND_ORDER_MAP[bond[2]] else: order = int(bond[2]) if compress_whitespace: fmt = '%d %d %d %s\n' else: fmt = '%8d %8d %8d %s\n' dest.write(fmt % (i+1, bond[0], bond[1], order)) dest.write('@<TRIPOS>SUBSTRUCTURE\n') first_atom = 0 for i, res in enumerate(residues): if not hasattr(res, 'chain') or not res.chain: chain = '****' else: chain = res.chain intresbonds = 0 if isinstance(res, ResidueTemplate): if i != len(residues)-1 and (res.tail is not None and residues[i+1].head is not None): intresbonds += 1 if i != 0 and (res.head is not None and residues[i-1].tail is not None): intresbonds += 1 else: for atom in res: for a2 in atom.bond_partners: if a2.residue is not res: intresbonds += 1 if compress_whitespace: fmt = '%d %s %d RESIDUE %d %s ROOT %d\n' else: fmt = '%8d %-8s %8d RESIDUE %4d %-4s ROOT %6d\n' dest.write(fmt % (i+1, res.name, first_atom+1, 0, chain[:4], intresbonds)) first_atom += len(res) if mol3: dest.write('@<TRIPOS>HEADTAIL\n') for i, res in enumerate(residues): if isinstance(res, ResidueTemplate): if res.head is not None: dest.write('%s %d\n' % (res.head.name, i+1)) else: dest.write('0 0\n') if res.tail is not None: dest.write('%s %d\n' % (res.tail.name, i+1)) else: dest.write('0 0\n') else: head = tail = None for atom in res: for a2 in atom.bond_partners: if a2.residue.idx == res.idx - 1: head = atom if a2.residue.idx == res.idx + 1: tail = atom if head is not None: dest.write('%s %d\n' % (head.name, i+1)) else: dest.write('0 0\n') if tail is not None: dest.write('%s %d\n' % (tail.name, i+1)) else: dest.write('0 0\n') dest.write('@<TRIPOS>RESIDUECONNECT\n') for i, res in enumerate(residues): if isinstance(res, ResidueTemplate): con = [res.head, res.tail, None, None, None, None] for i, a in enumerate(res.connections): con[i+2] = a else: con = [None, None, None, None, None, None] ncon = 2 for atom in res: for a2 in atom.bond_partners: if a2.residue.idx == res.idx - 1: con[0] = atom elif a2.residue.idx == res.idx + 1: con[1] = atom elif a2.residue.idx != res.idx: con[ncon] = atom ncon += 1 dest.write('%d' % (i+1)) for a in con: if a is not None: dest.write(' %s' % a.name) else: dest.write(' 0') dest.write('\n') finally: if own_handle: dest.close()
def _guess_atomic_number(name, residue=None): """ Guesses the atomic number """ # Special-case single-atom residues, which are almost always ions name = ''.join(c for c in name if c.isalpha()) if residue is None or len(residue.atoms) == 1: if len(name) > 1: try: return AtomicNum[name[0].upper() + name[1].lower()] except KeyError: return AtomicNum[element_by_name(name)] return AtomicNum[element_by_name(name)]