Source code for parmed.formats.pdbx.PdbxReader

##
# File:  PdbxReader.py
# Date:  2012-01-09  Jdw  Adapted from PdbxParser
#
# Updates:
#
# 2012-01-09 - (jdw) Separate reader and writer classes.
#
# 2012-09-02 - (jdw)  Revise tokenizer to better handle embedded quoting.
#
##
"""
PDBx/mmCIF dictionary and data file parser.

Acknowledgements:

 The tokenizer used in this module is modeled after the clever parser design
 used in the PyMMLIB package.
 
 PyMMLib Development Group
 Authors: Ethan Merritt: merritt@u.washington.edu  & Jay Painter: jay.painter@gmail.com
 See:  http://pymmlib.sourceforge.net/

"""

import re
from parmed.exceptions import PdbxError, PdbxSyntaxError
from parmed.formats.pdbx.PdbxContainers import (
        DataCategory, DefinitionContainer, DataContainer)

class PdbxReader(object):
    """ PDBx reader for data files and dictionaries.

    Parses PDBx/mmCIF syntax (data_ blocks, save_ frames, loop_ tables,
    key/value items) from an open text file handle and appends the resulting
    data/definition container objects to a caller-supplied list.
    """

    def __init__(self, ifh):
        """ ifh - input file handle returned by open()
        """
        #
        self.__curLineNumber = 0
        self.__ifh = ifh
        # Maps a reserved CIF word (the text before the first '_') to the
        # parser state that processes the following section.
        self.__stateDict = {"data": "ST_DATA_CONTAINER",
                            "loop": "ST_TABLE",
                            "global": "ST_GLOBAL_CONTAINER",
                            "save": "ST_DEFINITION",
                            "stop": "ST_STOP"}

    def read(self, containerList):
        """ Appends to the input list of definition and data containers.

        Raises PdbxError if the parser returns without exhausting the input
        token stream, and PdbxSyntaxError on malformed input.
        """
        self.__curLineNumber = 0
        try:
            self.__parser(self.__tokenizer(self.__ifh), containerList)
        except StopIteration:
            # Normal termination - the tokenizer ran out of input.
            pass
        except (RuntimeError, DeprecationWarning) as e:
            # PEP 479: on modern Pythons a StopIteration escaping a generator
            # surfaces as RuntimeError - treat that case as normal end-of-input.
            if 'StopIteration' not in str(e):
                raise
        else:
            # The parser is expected to consume the tokenizer to exhaustion;
            # returning without StopIteration indicates a parse failure.
            raise PdbxError()

    def __syntaxError(self, errText):
        # Raise a syntax error annotated with the current input line number.
        raise PdbxSyntaxError(self.__curLineNumber, errText)

    def __getContainerName(self, inWord):
        """ Returns the name of the data_ or save_ container """
        return str(inWord[5:]).strip()

    def __getState(self, inWord):
        """Identifies reserved syntax elements and assigns an associated state.

        Returns: (reserved word, state) where -
            reserved word - is one of CIF syntax elements:
                 data_, loop_, global_, save_, stop_
            state - the parser state required to process this next section.

        Returns (None, "ST_UNKNOWN") when inWord is not a reserved element.
        """
        i = inWord.find("_")
        if i == -1:
            return None, "ST_UNKNOWN"
        rWord = inWord[:i].lower()
        try:
            # KeyError is the only exception the lookup can raise; the
            # original bare except hid nothing else.
            return rWord, self.__stateDict[rWord]
        except KeyError:
            return None, "ST_UNKNOWN"

    def __parser(self, tokenizer, containerList):
        """ Parser for PDBx data files and dictionaries.

            Input - tokenizer() reentrant method recognizing data item names
            (_category.attribute), quoted strings (single, double and
            multi-line semi-colon delimited), and unquoted strings.

            containerList - list-type container for data and definition
            objects parsed from the input file.

            Return:
                containerList - is appended with data and definition objects -
        """
        # Working container - data or definition
        curContainer = None
        #
        # Working category container
        categoryIndex = {}
        curCategory = None
        #
        curRow = None
        state = None

        # Find the first reserved word and begin capturing data.
        #
        while True:
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
            if curWord is None:
                continue
            reservedWord, state = self.__getState(curWord)
            if reservedWord is not None:
                break

        while True:
            #
            # Set the current state -
            #
            # At this point in the processing cycle we are expecting a token
            # containing either a '_category.attribute' or a reserved word.
            #
            if curCatName is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif curWord is not None:
                reservedWord, state = self.__getState(curWord)
            else:
                self.__syntaxError("Miscellaneous syntax error")
                return

            #
            # Process _category.attribute value assignments
            #
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    curCategory = categoryIndex[curCatName]
                except KeyError:
                    # A new category is encountered - create a container and add a row
                    curCategory = categoryIndex[curCatName] = DataCategory(curCatName)
                    try:
                        curContainer.append(curCategory)
                    except AttributeError:
                        # No enclosing data_/save_ container has been opened.
                        self.__syntaxError("Category cannot be added to data_ block")
                        return
                    curRow = []
                    curCategory.append(curRow)
                else:
                    # Recover the existing row from the category
                    try:
                        curRow = curCategory[0]
                    except IndexError:
                        self.__syntaxError("Internal index error accessing category data")
                        return

                # Check for duplicate attributes and add attribute to table.
                if curAttName in curCategory.getAttributeList():
                    self.__syntaxError("Duplicate attribute encountered in category")
                    return
                else:
                    curCategory.appendAttribute(curAttName)

                # Get the data for this attribute from the next token
                tCat, tAtt, curQuotedString, curWord = next(tokenizer)

                if tCat is not None or (curQuotedString is None and curWord is None):
                    self.__syntaxError("Missing data for item _%s.%s" % (curCatName, curAttName))

                if curWord is not None:
                    #
                    # Validation check token for misplaced reserved words -
                    #
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        self.__syntaxError("Unexpected reserved word: %s" % (reservedWord))
                    curRow.append(curWord)
                elif curQuotedString is not None:
                    curRow.append(curQuotedString)
                else:
                    self.__syntaxError("Missing value in item-value pair")

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                continue

            #
            # Process a loop_ declaration and associated data -
            #
            elif state == "ST_TABLE":
                # The category name in the next curCatName,curAttName pair
                # defines the name of the category container.
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                if curCatName is None or curAttName is None:
                    self.__syntaxError("Unexpected token in loop_ declaration")
                    return

                # Check for a previous category declaration.
                if curCatName in categoryIndex:
                    self.__syntaxError("Duplicate category declaration in loop_")
                    return

                curCategory = DataCategory(curCatName)
                try:
                    curContainer.append(curCategory)
                except AttributeError:
                    self.__syntaxError("loop_ declaration outside of data_ block or save_ frame")
                    return
                curCategory.appendAttribute(curAttName)

                # Read the rest of the loop_ declaration
                while True:
                    curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                    if curCatName is None:
                        break
                    if curCatName != curCategory.getName():
                        self.__syntaxError("Changed category name in loop_ declaration")
                        return
                    curCategory.appendAttribute(curAttName)

                # If the next token is a 'word', check it for any reserved words -
                if curWord is not None:
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        if reservedWord == "stop":
                            return
                        else:
                            self.__syntaxError(
                                "Unexpected reserved word after loop declaration: %s" %
                                (reservedWord))

                # Read the table of data for this loop_ -
                while True:
                    curRow = []
                    curCategory.append(curRow)
                    for tAtt in curCategory.getAttributeList():
                        if curWord is not None:
                            curRow.append(curWord)
                        elif curQuotedString is not None:
                            curRow.append(curQuotedString)
                        curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                    # loop_ data processing ends if -
                    # A new _category.attribute is encountered
                    if curCatName is not None:
                        break
                    # A reserved word is encountered
                    if curWord is not None:
                        reservedWord, state = self.__getState(curWord)
                        if reservedWord is not None:
                            break
                continue

            elif state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
                sName = self.__getContainerName(curWord)
                if (len(sName) > 0):
                    curContainer = DefinitionContainer(sName)
                    containerList.append(curContainer)
                    categoryIndex = {}
                    curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_DATA_CONTAINER":
                #
                dName = self.__getContainerName(curWord)
                if len(dName) == 0:
                    dName = "unidentified"
                curContainer = DataContainer(dName)
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_STOP":
                return

            # BUGFIX: __stateDict maps "global" -> "ST_GLOBAL_CONTAINER", but
            # this branch previously tested for "ST_GLOBAL", making it
            # unreachable and leaving a global_ token looping forever.
            elif state == "ST_GLOBAL_CONTAINER":
                curContainer = DataContainer("blank-global")
                curContainer.setGlobal()
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_UNKNOWN":
                self.__syntaxError("Unrecognized syntax element: " + str(curWord))
                return

    def __tokenizer(self, ifh):
        """ Tokenizer method for the mmCIF syntax file -

            Each return/yield from this method returns information about
            the next token in the form of a tuple with the following structure.

            (category name, attribute name, quoted strings,
             words w/o quotes or white space)

            Differentiated the regular expression to the better handle
            embedded quotes.
        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are
        # handled outside of this regex.  Raw strings are used so \S/\s are
        # not treated as (invalid) string escapes.
        mmcifRe = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))" "|"              # _category.attribute
            r"(?:['](.*?)(?:[']\s|[']$))" "|"      # single quoted strings
            r"(?:[\"](.*?)(?:[\"]\s|[\"]$))" "|"   # double quoted strings
            r"(?:\s*#.*$)" "|"                     # comments (dumped)
            r"(\S+)"                               # unquoted words
            r")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        while True:
            line = next(fileIter)
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                while True:
                    line = next(fileIter)
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]
                #continue

            # Apply regex to the current line; consolidate the single/double
            # quoted matches into the one quoted-string slot of the tuple.
            for it in mmcifRe.finditer(line):
                tgroups = it.groups()
                if tgroups != (None, None, None, None, None):
                    if tgroups[2] is not None:
                        qs = tgroups[2]
                    elif tgroups[3] is not None:
                        qs = tgroups[3]
                    else:
                        qs = None
                    groups = (tgroups[0], tgroups[1], qs, tgroups[4])
                    yield groups

    def __tokenizerOrg(self, ifh):
        """ Tokenizer method for the mmCIF syntax file -

            Each return/yield from this method returns information about
            the next token in the form of a tuple with the following structure.

            (category name, attribute name, quoted strings,
             words w/o quotes or white space)

            NOTE(review): superseded by __tokenizer above (which better
            handles embedded quotes); retained unchanged for reference.
        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are
        # handled outside of this regex.
        mmcifRe = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))" "|"              # _category.attribute
            r"(?:['\"](.*?)(?:['\"]\s|['\"]$))" "|"  # quoted strings
            r"(?:\s*#.*$)" "|"                     # comments (dumped)
            r"(\S+)"                               # unquoted words
            r")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        while True:
            line = next(fileIter)
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                while True:
                    line = next(fileIter)
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]
                #continue

            ## Apply regex to the current line
            for it in mmcifRe.finditer(line):
                groups = it.groups()
                if groups != (None, None, None, None):
                    yield groups