File: Parser.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (630 lines) | stat: -rw-r--r-- 24,697 bytes
# Copyright 2004 by Jason A. Hackney.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

from string import join
from Bio.Alphabet import IUPAC
from Bio import File
from Bio.ParserSupport import *
from Bio import Seq
from Bio.MEME import Motif
import re

class MEMERecord:
    """Container for everything parsed out of one MEME run.

    The record is a plain data holder that the parser fills in. Besides
    its attributes it offers a single lookup helper, get_motif_by_name.

    Attributes:
    motifs: list of the motif objects found, in the order they were added.
    version: MEME version string.
    datafile: name of the training-set data file.
    command: the command line reported in the output.
    alphabet: alphabet object chosen by the parser (None until set).
    sequence_names: names of the training-set sequences.
    """
    def __init__ (self):
        """Create an empty record; every field starts out blank."""
        self.motifs = []
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []

    def get_motif_by_name (self, name):
        """Return the first stored motif whose name equals name.

        Returns None when no motif carries that name.
        """
        for motif in self.motifs:
            if motif.name != name:
                continue
            return motif
        return None

class MEMEParser (AbstractParser):
    """Turn the plain-text report written by MEME into a MEMERecord.

    Methods:
    parse (handle): read MEME output from the handle and return the
        MEMERecord built from it.

    Example:

    f = open("meme.output.txt")
    parser = MEMEParser()
    meme_record = parser.parse(f)
    for motif in meme_record.motifs:
        for instance in motif.instances:
            print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    NOTE: the consumer (and therefore the record it fills) is created once
    in __init__, so calling parse again on the same parser object keeps
    adding to the same record.
    """
    def __init__ (self):
        """Create the scanner/consumer pair that parse drives."""
        self._scanner = _MEMEScanner()
        self._consumer = _MEMEConsumer()

    def parse (self, handle):
        """Feed handle through the scanner and return the consumer's record."""
        scanner = self._scanner
        scanner.feed(handle, self._consumer)
        record = self._consumer.data
        return record
    


class _MEMEScanner:
    """Scanner for MEME output.

    Walks through the text output of MEME in a fixed order and hands the
    interesting lines to a consumer; everything else is sent to the
    consumer's noevent handler.

    NOTE(review): read_and_call, read_and_call_until, read_and_call_while
    and safe_peekline are not defined in this file; they presumably come
    from the "from Bio.ParserSupport import *" import. The comments below
    describe the section each call is aimed at, based on the literal
    strings it is given -- confirm exact helper semantics in ParserSupport.

    Methods:
    feed

    """

    def feed (self, handle, consumer):
        """
        Feeds in MEME output for scanning. handle should
        implement the readline method. consumer is
        a Consumer object that can receive the salient events.

        The handle is wrapped in a File.UndoHandle unless it already is one.
        The header is scanned first, then the per-motif sections.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        self._scan_header(uhandle, consumer)
        self._scan_motifs    (uhandle, consumer)

    def _scan_header(self, uhandle, consumer):
        """Scan everything before the first motif section.

        Reports the version, data file, alphabet, training-set sequence
        names and command line to the consumer.

        Raises SyntaxError with an explanatory message when looking for the
        'MEME version' line itself raises SyntaxError.
        """
        try:
            # Note: the search uses "contains", although the error message
            # below speaks of a line *starting* with MEME version.
            read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
        except SyntaxError:
            raise SyntaxError, "Improper input file. File should contain a line starting MEME version."
        read_and_call(uhandle, consumer._version, start = 'MEME version')
        # TRAINING SET banner followed by a row of asterisks.
        read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
        read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
        read_and_call(uhandle, consumer.noevent, start = '****')
        read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
        read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
        # Column header and dashed rule of the sequence table.
        read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
        read_and_call(uhandle, consumer.noevent, start = '----')
        # Table rows go to consumer._sequence_name until the closing row of
        # asterisks is reached.
        read_and_call_until(uhandle, consumer._sequence_name, start = '***')
        read_and_call_until(uhandle, consumer.noevent, start = 'command:')
        read_and_call(uhandle, consumer._commandline, start = 'command:')
        # Stop in front of the first motif; note the two spaces in the marker.
        read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF  1')

    def _scan_motifs(self, uhandle, consumer):
        """Scan the motif sections, one per loop iteration.

        For each motif the summary line, the motif name, the site
        instances, the log-odds matrix and the letter-probability matrix
        are reported to the consumer. The loop ends when the next line to
        be read starts with "SUMMARY OF MOTIFS".
        """
        while 1:
            # "MOTIF ..." summary line: creates the motif in the consumer.
            read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
            # The "... sorted by position p-value" heading carries the name.
            read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
            read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
            # Dashed rule, column header, dashed rule, then the site rows
            # up to the next dashed rule.
            read_and_call(uhandle, consumer.noevent, start = '---')
            read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
            read_and_call(uhandle, consumer.noevent, start = '---')
            read_and_call_until(uhandle, consumer.add_instance, start = '---')
            # Log-odds matrix: one line is discarded unconditionally after
            # locating the heading, then rows are collected up to the rule.
            read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
            read_and_call(uhandle, consumer.noevent)
            read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
            # Letter-probability matrix: heading line, then rows up to the rule.
            read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
            read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')    
            read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
            # Trailer of the motif section: Time line, blank line, asterisk
            # row, any number of blank lines, asterisk row.
            read_and_call_until(uhandle, consumer.noevent, start = 'Time')
            read_and_call(uhandle, consumer.noevent, start = 'Time')
            read_and_call(uhandle, consumer.noevent, blank = 1)
            read_and_call(uhandle, consumer.noevent, start = '***')
            read_and_call_while(uhandle, consumer.noevent, blank = 1)
            read_and_call(uhandle, consumer.noevent, start = '***')
            # Look ahead: the summary section marks the end of the motifs.
            line = safe_peekline(uhandle)
            if line.startswith("SUMMARY OF MOTIFS"):
                break
    


class _MEMEConsumer:
    """
    Consumer that can receive events from MEME Scanner.
    
    This is the Consumer object that should be passed to the 
    MEME Scanner.
    """
    
    def __init__ (self):
        self.current_motif = None
        self.sequence_names = []
        self.data = MEMERecord()
    
    def _version (self, line):
        line = line.strip()
        ls = line.split()
        self.data.version = ls[2]
    
    def _datafile (self, line):
        line = line.strip()
        line = line.replace('DATAFILE= ','')
        self.data.datafile = line
    
    def _alphabet (self, line):
        line = line.strip()
        line = line.replace('ALPHABET= ','')
        if line == 'ACGT':
            al = IUPAC.unambiguous_dna
        else:
            al = IUPAC.protein
        self.data.alphabet = al
    
    def _sequence_name (self, line):
        line = line.strip()
        ls = line.split()
        self.data.sequence_names.append(ls[0])
        if len(ls) == 6:
            self.data.sequence_names.append(ls[3])
    
    def _commandline (self, line):
        line = line.strip()
        line = line.replace('command: ','')
        self.data.command = line
    
    def _add_motif_with_info (self, line):
        line = line.strip()
        ls = line.split()
        motif = Motif.MEMEMotif()
        motif._length(ls[4])
        motif._numoccurrences(ls[7])
        motif._evalue(ls[13])
        motif._alphabet(self.data.alphabet)
        self.data.motifs.append(motif)
        self.current_motif = motif
    
    def motif_name (self, line):
        line = line.strip()
        ls = line.split()
        name = join(ls[0:2], ' ')
        self.current_motif._name(name)
    
    def add_instance (self, line):
        line = line.strip()
        ls = line.split()
        if self.data.command.find('revcomp') != -1:
            seq = Seq.Seq(ls[5], self.data.alphabet)
            self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[2], pvalue = ls[3], strand = ls[1])
        else:
            seq = Seq.Seq(ls[4], self.data.alphabet)
            self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[1], pvalue = ls[2])
    
    def add_to_pssm (self, line):
        line = line.strip()
        sl = line.split()
        thisposition = tuple([float(i) for i in sl])
        self.current_motif.add_to_pssm(thisposition)
    
    def add_to_logodds (self, line):
        line = line.strip()
        sl = line.split()
        thisposition = tuple([float(i) for i in sl])    
        self.current_motif.add_to_logodds(thisposition)
    
    def noevent (self,line):
        pass
    


class _MASTConsumer:
    """
    Consumer that can receive events from _MASTScanner.
    
    A _MASTConsumer parses lines from a mast text output file.
    The motif match diagrams are parsed using line buffering. 
    Each of the buffering functions have a dummy variable that is
    required for testing using the Bio.ParserSupport.TaggingConsumer.
    If this variable isn't there, the TaggingConsumer barfs. In
    the _MASTScanner, None is passed in the place of this variable.
    """
    def __init__ (self):
        self.data = MASTRecord()
        self._current_seq = ""
        self._line_buffer = []
        self._buffer_size = 0
        self._buffered_seq_start = 0
    
    def _version (self, line):
        line = line.strip()
        ls = line.split()
        self.data._version(ls[2])
    
    def _database (self, line):
        line = line.strip()
        ls = line.split()
        self.data._database(ls[1])
        al = ""
        if ls[2] == '(nucleotide)':
            al = IUPAC.unambiguous_dna
            self.data._alphabet(al)        
        else:
            al = IUPAC.protein
            self.data._alphabet(al)
        
    def _add_motif (self, line):
        line = line.strip()
        ls = line.split()
        m = Motif.MEMEMotif()
        m._alphabet(self.data.alphabet)
        m._length(ls[1])
        name = ls[0]
        m._name(name)
        m._consensus(ls[2])
        self.data._add_motif(m)
    
    def _add_match_diagram (self, line):
        line = line.strip()
        ls = line.split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        ds = ls[1].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('[') != -1 or ds[i].find('<') != -1:
                inst = Motif.Instance()
                inst._seqname (self._current_seq)
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])            
    
    def _add_sequence_match_with_diagram (self, line):
        line = line.strip()
        ls = line.split()
        self.data._add_sequence(ls[0])
        self.data._add_diagram_for_sequence(ls[2],ls[0])
        ds = ls[2].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('+') != -1 or ds[i].find('-') != -1:
                inst = Motif.Instance()
                inst._seqname (ls[0])
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])            
    
    def _add_diagram_from_buffer (self, dummy):
        line = ""
        for l in self._line_buffer:
            line += l.strip()
        ls = line.split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        ds = ls[1].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('[') != -1 or ds[i].find('<') != -1:
                inst = Motif.Instance()
                inst._seqname (self._current_seq)
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])            
    
    def _set_current_seq (self, line):
        line = line.strip()
        self._current_seq = line
        if not self.data.sequences.count(line):
            self.data.sequences.append(line)
    
    def _add_line_to_buffer (self, line):
        line = line.strip()
        if not line.startswith('*****'):
            self._line_buffer.append(line)
        else:
            return -1
    
    def _parse_buffer (self, dummy):
        """Parses the line buffer to get e-values for each instance of a motif.
        This buffer parser is the most likely point of failure for the 
        MASTParser.
        """
        insts = self.data.get_motif_matches_for_sequence(self._current_seq)    
        if len(insts) > 0:
            
            fullSeq = self._line_buffer[self._buffer_size-1]
            pvals = self._line_buffer[1].split()
            p = 0
            lpval = len(pvals)
            while p < lpval:
                if pvals[p].count('e') > 1:
                #Break blocks up by e and parse into valid floats. This only 
                #works if there are no e-values greater than 1e-5.
                    pvs = []
                    spe = pvals[p].split('e')
                    spe.reverse()
                    dotind = spe[1].find('.')
                    if dotind == -1:
                        thispval = spe[1][-1] + 'e' + spe[0]
                    else:
                        thispval = spe[1][dotind-1:] + 'e' + spe[0]
                    pvs.append(thispval)
                    for spi in range(2,len(spe)):
                        dotind = spe[spi].find('.')
                        prevdotind = spe[spi-1].find('.')
                        if dotind != -1:
                            if prevdotind == -1:
                                thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
                            else:
                                thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
                        else:
                            if prevdotind == -1:
                                thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
                            else:
                                thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
                        pvs.append(thispval)
                    pvs.reverse()
                    if p > 0:
                        pvals = pvals[0:p] + pvs + pvals[p+1:]
                    else:
                        pvals = pvs + pvals[p+1:]
                    lpval = len(pvals)
                p += 1
            i = 0
            if len(pvals) != len(insts):
                sys.stderr.write("Failure to parse p-values for " + self._current_seq +  ":  " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
                pvals = []
#            else:
#                sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n")
            for i in range(0,len(insts)):
                inst = insts[i]
                start = inst.start - self._buffered_seq_start + 1
                thisSeq = fullSeq[start:start+inst.length]
                thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
                inst._sequence(thisSeq)
                if pvals:
                    inst._pvalue(float(pvals[i]))

    def _blank_buffer (self, dummy):
        self._line_buffer = []
        self._buffer_size = 0
    
    def _collapse_buffer(self, dummy):
        if self._buffer_size == 0:
            if len(self._line_buffer) > 0:
                self._buffer_size = len(self._line_buffer)
                ll = self._line_buffer[self._buffer_size-1].split()
                self._line_buffer[self._buffer_size-1] = ll[1]
                self._buffered_seq_start = int(ll[0])
        else:
            i = 0
            for i in range(self._buffer_size, len(self._line_buffer)-1):
                    self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
            ll = self._line_buffer[len(self._line_buffer)-1].split()
            if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
                self._line_buffer[self._buffer_size-1] += ll[1]
            else:
                differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
                self._line_buffer[self._buffer_size-1] += "N"*differ
                self._line_buffer[self._buffer_size-1] += ll[1]
            self._line_buffer = self._line_buffer[0:self._buffer_size]
    
    def _add_motif_match (self, line):
        line = line.strip()
        if line.find('[') != -1 or line.find('<') != -1:
            pass
        elif line.find('e') != -1:
            pass
        elif line.find('+') != -1:
            pass
    
    def noevent (self, line):
        pass
    


class MASTParser(AbstractParser):
    """
    Parser for the plain-text output of MAST; HTML output is not supported.

    Given a file handle on a MAST text report, the parser returns a
    MASTRecord holding the hits between motifs and sequences. Match
    diagrams are recovered through some unusual line buffering; for really
    complex diagrams this often ends in an error message and in no
    p-values being parsed for the affected line.

    Methods:
    parse (handle): parses the data from the file handle passed to it.

    Example:

    f = open("mast_file.txt")
    parser = MASTParser()
    mast_record = parser.parse(f)
    for motif in mast_record.motifs:
        for instance in motif.instances:
            print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    NOTE: the consumer (and therefore the record it fills) is created once
    in __init__, so calling parse again on the same parser object keeps
    adding to the same record.
    """
    def __init__ (self):
        """Create the consumer/scanner pair that parse drives."""
        self._consumer = _MASTConsumer()
        self._scanner = _MASTScanner()

    def parse (self, handle):
        """Feed handle through the scanner and return the consumer's record."""
        scanner = self._scanner
        scanner.feed(handle, self._consumer)
        record = self._consumer.data
        return record
    


class _MASTScanner:
    """
    Scanner for MAST text output.

    Walks through the report in three passes (header, section II match
    table, section III annotated matches) and hands the interesting lines
    to a consumer; everything else goes to the consumer's noevent handler.

    NOTE(review): read_and_call, read_and_call_until, read_and_call_while,
    attempt_read_and_call and safe_peekline are not defined in this file;
    they presumably come from the "from Bio.ParserSupport import *" import.
    The comments below describe what each call is aimed at, based on the
    literal strings it is given -- confirm exact helper semantics there.

    """
    def feed (self, handle, consumer):
        """Scan MAST output from handle, sending events to consumer.

        The handle is wrapped in a File.UndoHandle unless it already is one.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
            
        self._scan_header(uhandle, consumer)
        self._scan_matches(uhandle, consumer)
        self._scan_annotated_matches(uhandle, consumer)
    
    def _scan_header (self, uhandle, consumer):
        """Scan version, database and motif table, stopping before SECTION II.

        Raises SyntaxError with an explanatory message when looking for the
        'MAST version' line itself raises SyntaxError.
        """
        try:
            read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
        except SyntaxError:
            raise SyntaxError, "Improper input file. Does not begin with a line with 'MAST version'"
        read_and_call(uhandle, consumer._version, contains = 'MAST version')
        # DATABASE AND MOTIFS banner followed by a row of asterisks.
        read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
        read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
        read_and_call(uhandle, consumer.noevent, start = '****')
        # The DATABASE line itself: name and alphabet go to the consumer.
        read_and_call(uhandle, consumer._database, contains = 'DATABASE')
        # Motif table: header, dashed rule, then rows up to a blank line.
        read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
        read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
        read_and_call(uhandle, consumer.noevent, contains = '----')
        read_and_call_until(uhandle, consumer._add_motif, blank = 1)
        read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
    
    def _scan_matches (self, uhandle, consumer):
        """Skip over the SEQUENCE NAME table of section II.

        The table rows are currently discarded (sent to noevent); the call
        that would have parsed them is commented out below.
        """
        read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
        read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
        read_and_call(uhandle, consumer.noevent, start = '---')
#        read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1)
        read_and_call_until(uhandle, consumer.noevent, blank = 1)
        read_and_call(uhandle, consumer.noevent, blank = 1)
    
    def _scan_annotated_matches (self, uhandle, consumer):
        """Scan the per-sequence annotated matches of section III.

        For every sequence the name line, the (possibly wrapped) DIAGRAM
        and the following groups of annotation lines are pushed through the
        consumer's buffer methods. Scanning stops at a line starting with
        asterisks.
        """
        # Skip the section banner and its explanatory text, bounded by rows
        # of asterisks, plus any blank lines that follow.
        read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
        read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
        read_and_call_until(uhandle, consumer.noevent, start = '****')
        read_and_call(uhandle, consumer.noevent, start = '****')
        read_and_call_until(uhandle, consumer.noevent, start = '*****')
        read_and_call(uhandle, consumer.noevent)
        read_and_call_while(uhandle, consumer.noevent, blank = 1)
        readMatches = 1
        while readMatches == 1:
            # Finish off the previous sequence (if any) before starting a
            # new one: parse what was buffered for it, then clear the buffer.
            if consumer._current_seq:
                if consumer._buffer_size != 0:
                    consumer._parse_buffer(None)
                consumer._blank_buffer(None)
            # First line of an entry is taken as the sequence name.
            read_and_call(uhandle, consumer._set_current_seq)
            # Buffer the DIAGRAM line(s) up to the blank line, then parse
            # them as one diagram.
            read_and_call_until(uhandle, consumer.noevent, start = '  DIAGRAM')
            read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
            consumer._add_diagram_from_buffer(None)
            consumer._blank_buffer(None)
            read_and_call(uhandle, consumer.noevent, blank = 1)
            while 1:
                # A row of asterisks here ends section III altogether.
                line = safe_peekline(uhandle)
                if line.startswith('****'):
                    consumer._parse_buffer(None)
                    readMatches = 0
                    break
                # Buffer one group of annotation lines and merge it into
                # what has been collected so far for this sequence.
                read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
                read_and_call(uhandle, consumer.noevent, blank = 1)
                consumer._collapse_buffer(None)
                # A second blank line means this sequence is done; a row of
                # asterisks means the whole section is done.
                if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
                    break
                elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
                    consumer._parse_buffer(None)
                    consumer._blank_buffer(None)
                    readMatches = 0
                    break
    


class MASTRecord:
    """The class for holding the results from a MAST run.

    A MASTRecord holds data about matches between motifs and sequences.
    The motifs held by the MASTRecord are objects of the class MEMEMotif.

    Attributes:
    sequences: list of sequence names.
    version: MAST version string.
    matches: list of matches added through _add_match.
    database: name of the database that was searched.
    diagrams: dictionary mapping a sequence name to its match diagram.
    alphabet: alphabet object (None until set).
    motifs: list of motif objects.

    Methods:
    get_motif_matches_for_sequence(sequence_name): returns all of the
        motif matches within a given sequence. The matches are objects of
        the class MEME.Motif.Instance
    get_motif_matches (motif_name): returns all of the matches for a motif
        in the sequences searched. The matches returned are of class
        MEME.Motif.Instance
    get_motif_by_name (motif_name): returns a MEMEMotif with the given
        name.
    """
    def __init__ (self):
        """Create an empty record."""
        self.sequences = []
        self.version = ""
        self.matches = []
        self.database = ""
        self.diagrams = {}
        self.alphabet = None
        self.motifs = []

    def _version (self, version):
        """Set the version string."""
        self.version = version

    def _alphabet (self, alphabet):
        """Set the alphabet if it is one of the supported IUPAC alphabets.

        Accepts IUPAC.protein, IUPAC.ambiguous_dna and
        IUPAC.unambiguous_dna. Any other value leaves the record unchanged
        and -1 is returned.
        """
        if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna:
            self.alphabet = alphabet
        else:
            return -1

    def _database(self, database):
        """Set the database name."""
        self.database = database

    def get_motif_matches_for_sequence (self, seq):
        """Return all instances found in sequence seq, ordered by start.

        Instances of every motif are collected when their sequence_name
        equals seq. Instances with equal start keep the order in which
        they were collected.
        """
        insts = []
        for m in self.motifs:
            for i in m.instances:
                if i.sequence_name == seq:
                    insts.append(i)
        # A key function replaces the former cmp-style comparison function,
        # which newer Python versions no longer accept; ordering is the same.
        insts.sort(key=lambda inst: inst.start)
        return insts

    def get_motif_matches (self, motif):
        """Return the instances of the stored motif named like motif.

        The motif is looked up through motif.name; the lookup result must
        exist, otherwise accessing its instances fails.
        """
        m = self.get_motif_by_name (motif.name)
        return m.instances

    def _add_diagram_for_sequence (self, diagram, seq):
        """Remember diagram as the match diagram of sequence seq."""
        self.diagrams[seq] = diagram

    def _add_match (self, match):
        """Append match to the list of matches."""
        self.matches.append(match)

    def _add_sequence (self, sequence):
        """Append sequence to the list of sequences."""
        self.sequences.append(sequence)

    def _add_motif (self, motif):
        """Append motif to the list of motifs."""
        self.motifs.append(motif)

    def get_motif_by_name (self, name):
        """Return the first motif whose name equals name, or None."""
        for m in self.motifs:
            if m.name == name:
                return m
        return None