Source code for parmed.formats.pdbx.PdbxReader

##
# File:  PdbxReader.py
# Date:  2012-01-09  Jdw  Adapted from PdbxParser
#
# Updates:
#
# 2012-01-09 - (jdw) Separate reader and writer classes.
#
# 2012-09-02 - (jdw)  Revise tokenizer to better handle embedded quoting.
#
##
"""
PDBx/mmCIF dictionary and data file parser.

Acknowledgements:

 The tokenizer used in this module is modeled after the clever parser design
 used in the PyMMLIB package.
 
 PyMMLib Development Group
 Authors: Ethan Merritt: merritt@u.washington.edu  & Jay Painter: jay.painter@gmail.com
 See:  http://pymmlib.sourceforge.net/

"""

import re
from parmed.exceptions import PdbxError, PdbxSyntaxError
from parmed.formats.pdbx.PdbxContainers import (
        DataCategory, DefinitionContainer, DataContainer)

[docs]class PdbxReader(object):
    """ PDBx reader for data files and dictionaries.
    
    """
    def __init__(self,ifh):
        """  ifh - input file handle returned by open()
        """
        # 
        self.__curLineNumber = 0        
        self.__ifh=ifh
        self.__stateDict={"data":   "ST_DATA_CONTAINER",
                          "loop":   "ST_TABLE",
                          "global": "ST_GLOBAL_CONTAINER",
                          "save":   "ST_DEFINITION",
                          "stop":   "ST_STOP"}
        
[docs]    def read(self, containerList):
        """
        Appends to the input list of definition and data containers.
        
        """
        self.__curLineNumber = 0
        try:
            self.__parser(self.__tokenizer(self.__ifh), containerList)
        except StopIteration:
            pass
        except (RuntimeError, DeprecationWarning) as e:
            if 'StopIteration' not in str(e):
                raise
        else:
            raise PdbxError()

    def __syntaxError(self, errText):
        """Raise PdbxSyntaxError built from the current line number and errText.

        This method never returns normally.
        """
        lineNumber = self.__curLineNumber
        raise PdbxSyntaxError(lineNumber, errText)

    def __getContainerName(self,inWord):
        """ Returns the name of the data_ or save_ container
        """
        return str(inWord[5:]).strip()
    
    def __getState(self, inWord):
        """Identifies reserved syntax elements and assigns an associated state.  

           Returns: (reserved word, state)
           where - 
              reserved word -  is one of CIF syntax elements:
                               data_, loop_, global_, save_, stop_
              state - the parser state required to process this next section.
        """
        i = inWord.find("_")
        if i == -1:
            return None,"ST_UNKNOWN"

        try:
            rWord=inWord[:i].lower()            
            return rWord, self.__stateDict[rWord]
        except:
            return None,"ST_UNKNOWN"
        
    def __parser(self, tokenizer, containerList):
        """ Parser for PDBx data files and dictionaries.

            Input - tokenizer() reentrant method recognizing data item names (_category.attribute)
                    quoted strings (single, double and multi-line semi-colon delimited), and unquoted
                    strings.

                    containerList -  list-type container for data and definition objects parsed from
                                     from the input file.

            Return:
                    containerList - is appended with data and definition objects - 
        """
        # Working container - data or definition
        curContainer = None
        #
        # Working category container 
        categoryIndex = {}
        curCategory = None
        #
        curRow = None
        state =  None

        # Find the first reserved word and begin capturing data.
        #
        while True:
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
            if curWord is None:
                continue
            reservedWord, state  = self.__getState(curWord)
            if reservedWord is not None:
                break
        
        while True:
            #
            #  Set the current state  -
            #
            #  At this point in the processing cycle we are expecting a token containing
            #  either a '_category.attribute'  or a reserved word.  
            #
            if curCatName is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif curWord is not None:
                reservedWord, state = self.__getState(curWord)
            else:
                self.__syntaxError("Miscellaneous syntax error")
                return            

            #
            # Process  _category.attribute  value assignments 
            #
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    curCategory = categoryIndex[curCatName]
                except KeyError:
                    # A new category is encountered - create a container and add a row 
                    curCategory = categoryIndex[curCatName] = DataCategory(curCatName)

                    try:
                        curContainer.append(curCategory)
                    except AttributeError:
                        self.__syntaxError("Category cannot be added to  data_ block")
                        return

                    curRow = []                    
                    curCategory.append(curRow)
                else:
                    # Recover the existing row from the category
                    try:
                        curRow = curCategory[0] 
                    except IndexError:
                        self.__syntaxError("Internal index error accessing category data")
                        return

                # Check for duplicate attributes and add attribute to table.
                if curAttName in curCategory.getAttributeList():
                    self.__syntaxError("Duplicate attribute encountered in category")
                    return
                else:
                    curCategory.appendAttribute(curAttName)


                # Get the data for this attribute from the next token
                tCat, tAtt, curQuotedString, curWord = next(tokenizer)

                if tCat is not None or (curQuotedString is None and curWord is None):
                    self.__syntaxError("Missing data for item _%s.%s" % (curCatName,curAttName))

                if curWord is not None:
                    # 
                    # Validation check token for misplaced reserved words  -  
                    #
                    reservedWord, state  = self.__getState(curWord)
                    if reservedWord is not None:
                        self.__syntaxError("Unexpected reserved word: %s" % (reservedWord))

                    curRow.append(curWord)

                elif curQuotedString is not None:
                    curRow.append(curQuotedString)

                else:
                    self.__syntaxError("Missing value in item-value pair")

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                continue

            #
            # Process a loop_ declaration and associated data -
            #
            elif state == "ST_TABLE":

                # The category name in the next curCatName,curAttName pair
                #    defines the name of the category container.
                curCatName,curAttName,curQuotedString,curWord = next(tokenizer)

                if curCatName is None or curAttName is None:
                    self.__syntaxError("Unexpected token in loop_ declaration")
                    return

                # Check for a previous category declaration.
                if curCatName in categoryIndex:
                    self.__syntaxError("Duplicate category declaration in loop_")
                    return

                curCategory = DataCategory(curCatName)

                try:
                    curContainer.append(curCategory)
                except AttributeError:
                    self.__syntaxError("loop_ declaration outside of data_ block or save_ frame")
                    return

                curCategory.appendAttribute(curAttName)

                # Read the rest of the loop_ declaration 
                while True:
                    curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                    
                    if curCatName is None:
                        break

                    if curCatName != curCategory.getName():
                        self.__syntaxError("Changed category name in loop_ declaration")
                        return

                    curCategory.appendAttribute(curAttName)


                # If the next token is a 'word', check it for any reserved words - 
                if curWord is not None:
                    reservedWord, state  = self.__getState(curWord)
                    if reservedWord is not None:
                        if reservedWord == "stop":
                            return
                        else:
                            self.__syntaxError("Unexpected reserved word after loop declaration: %s" % (reservedWord))
                    
                # Read the table of data for this loop_ - 
                while True:
                    curRow = []                    
                    curCategory.append(curRow)

                    for tAtt in curCategory.getAttributeList():
                        if curWord is not None:
                            curRow.append(curWord)
                        elif curQuotedString is not None:
                            curRow.append(curQuotedString)

                        curCatName,curAttName,curQuotedString,curWord = next(tokenizer)

                    # loop_ data processing ends if - 

                    # A new _category.attribute is encountered
                    if curCatName is not None:
                        break

                    # A reserved word is encountered
                    if curWord is not None:
                        reservedWord, state = self.__getState(curWord)
                        if reservedWord is not None:
                            break
                        
                continue


            elif state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
                sName=self.__getContainerName(curWord)
                if (len(sName) > 0):
                    curContainer = DefinitionContainer(sName)
                    containerList.append(curContainer)
                    categoryIndex = {}
                    curCategory = None

                curCatName,curAttName,curQuotedString,curWord = next(tokenizer)

            elif state == "ST_DATA_CONTAINER":
                #
                dName=self.__getContainerName(curWord)
                if len(dName) == 0:
                    dName="unidentified"
                curContainer = DataContainer(dName)
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName,curAttName,curQuotedString,curWord = next(tokenizer)

            elif state == "ST_STOP":
                return
            elif state == "ST_GLOBAL":
                curContainer = DataContainer("blank-global")
                curContainer.setGlobal()
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName,curAttName,curQuotedString,curWord = next(tokenizer)

            elif state == "ST_UNKNOWN":
                self.__syntaxError("Unrecogized syntax element: " + str(curWord))
                return
                

    def __tokenizer(self, ifh):
        """ Tokenizer method for the mmCIF syntax file - 

            Each return/yield from this method returns information about
            the next token in the form of a tuple with the following structure.

            (category name, attribute name, quoted strings, words w/o quotes or white space)

            Differentiated the regular expression to the better handle embedded quotes.

        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
        #                                     outside of this regex.
        mmcifRe = re.compile(
            r"(?:"

             "(?:_(.+?)[.](\S+))"               "|"  # _category.attribute

             "(?:['](.*?)(?:[']\s|[']$))"       "|"  # single quoted strings
             "(?:[\"](.*?)(?:[\"]\s|[\"]$))"    "|"  # double quoted strings             

             "(?:\s*#.*$)"                      "|"  # comments (dumped)

             "(\S+)"                                 # unquoted words

             ")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        while True:
            line = next(fileIter)
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue
            
            # Gobble up the entire semi-colon/multi-line delimited string and
            #    and stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                while True:
                    line = next(fileIter)
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]
                #continue

            # Apply regex to the current line consolidate the single/double
            # quoted within the quoted string category
            for it in mmcifRe.finditer(line):
                tgroups = it.groups()
                if tgroups != (None, None, None, None, None):
                    if tgroups[2] is not None:
                        qs = tgroups[2]
                    elif tgroups[3] is not None:
                        qs = tgroups[3]
                    else:
                        qs = None
                    groups = (tgroups[0],tgroups[1],qs,tgroups[4])
                    yield groups

    def __tokenizerOrg(self, ifh):
        """ Tokenizer method for the mmCIF syntax file - 

            Each return/yield from this method returns information about
            the next token in the form of a tuple with the following structure.

            (category name, attribute name, quoted strings, words w/o quotes or white space)

        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
        #                                     outside of this regex.
        mmcifRe = re.compile(
            r"(?:"

             "(?:_(.+?)[.](\S+))"               "|"  # _category.attribute

             "(?:['\"](.*?)(?:['\"]\s|['\"]$))" "|"  # quoted strings

             "(?:\s*#.*$)"                      "|"  # comments (dumped)

             "(\S+)"                                 # unquoted words

             ")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        while True:
            line = next(fileIter)
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue
            
            # Gobble up the entire semi-colon/multi-line delimited string and
            #    and stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                while True:
                    line = next(fileIter)
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]
                #continue

            ## Apply regex to the current line 
            for it in mmcifRe.finditer(line):
                groups = it.groups()
                if groups != (None, None, None, None):
                    yield groups