File: _parse_yn00.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
links: PTS, VCS
area: main
in suites: stretch
size: 46,860 kB
ctags: 13,237
sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (132 lines) | stat: -rw-r--r-- 5,743 bytes
parent folder | download | duplicates (2)
# Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com)
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

import re


def parse_ng86(lines, results):
    """ Parse the Nei & Gojobori (1986) section of the results.
    Nei_Gojobori results are organized in a lower
    triangular matrix, with the sequence names labeling
    the rows and statistics in the format:
    w (dN dS) per column
    Example row (2 columns):
    0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)"""
    sequences = []
    for line in lines:
        # Find all floating point numbers in this line
        line_floats_res = re.findall("-*\d+\.\d+", line)
        line_floats = [float(val) for val in line_floats_res]
        matrix_row_res = re.match("(.+)\s{5,15}", line)
        if matrix_row_res is not None:
            seq_name = matrix_row_res.group(1).strip()
            sequences.append(seq_name)
            results[seq_name] = {}
            for i in range(0, len(line_floats), 3):
                NG86 = {}
                NG86["omega"] = line_floats[i]
                NG86["dN"] = line_floats[i + 1]
                NG86["dS"] = line_floats[i + 2]
                results[seq_name][sequences[i // 3]] = {"NG86": NG86}
                results[sequences[i // 3]][seq_name] = {"NG86": NG86}
    return (results, sequences)


def parse_yn00(lines, results, sequences):
    """ Parse the Yang & Nielsen (2000) part of the results.
    Yang & Nielsen results are organized in a table with
    each row comprising one pairwise species comparison.
    Rows are labeled by sequence number rather than by
    sequence name."""

    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    # 2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150
    # +- 0.0151
    for line in lines:
        # Find all floating point numbers in this line
        line_floats_res = re.findall("-*\d+\.\d+", line)
        line_floats = [float(val) for val in line_floats_res]
        row_res = re.match("\s+(\d+)\s+(\d+)", line)
        if row_res is not None:
            seq1 = int(row_res.group(1))
            seq2 = int(row_res.group(2))
            seq_name1 = sequences[seq1 - 1]
            seq_name2 = sequences[seq2 - 1]
            YN00 = {}
            YN00["S"] = line_floats[0]
            YN00["N"] = line_floats[1]
            YN00["t"] = line_floats[2]
            YN00["kappa"] = line_floats[3]
            YN00["omega"] = line_floats[4]
            YN00["dN"] = line_floats[5]
            YN00["dN SE"] = line_floats[6]
            YN00["dS"] = line_floats[7]
            YN00["dS SE"] = line_floats[8]
            results[seq_name1][seq_name2]["YN00"] = YN00
            results[seq_name2][seq_name1]["YN00"] = YN00
            seq_name1 = None
            seq_name2 = None
    return results


def parse_others(lines, results, sequences):
    """Parse the results from the other methods.

    The remaining methods are grouped together. Statistics
    for all three are listed for each of the pairwise
    species comparisons, with each method's results on its
    own line.
    The stats in this section must be handled differently
    due to the possible presence of NaN values, which won't
    get caught by my typical "line_floats" method used above.
    """
    # Example:
    # 2 (Pan_troglo) vs. 1 (Homo_sapie)

    # L(i):      143.0      51.0      28.0  sum=    222.0
    # Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    # Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    # A(i):     0.0000    0.0200    0.0000
    # B(i):    -0.0000   -0.0000   -0.0000
    # LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    # LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho = -nan)
    # LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = re.match("\d+ \((.+)\) vs. \d+ \((.+)\)", line)
        if comp_res is not None:
            seq_name1 = comp_res.group(1)
            seq_name2 = comp_res.group(2)
        elif seq_name1 is not None and seq_name2 is not None:
            if "dS =" in line:
                stats = {}
                line_stats = line.split(":")[1].strip()
                # Find all of the xx = ###### values in a row
                # ie dS =  0.0227
                # For dN and dS, the values have 8 characters from the equals
                # sign, while the rest have 7 characters. On Windows,
                # NaNs take on weird values like -1.#IND, which might fill the
                # entire fixed column width.
                res_matches = re.findall("[dSNwrho]{1,3} =.{7,8}?",
                                         line_stats)
                for stat_pair in res_matches:
                    stat = stat_pair.split('=')[0].strip()
                    value = stat_pair.split('=')[1].strip()
                    try:
                        stats[stat] = float(value)
                    except:
                        stats[stat] = None
                if "LWL85:" in line:
                    results[seq_name1][seq_name2]["LWL85"] = stats
                    results[seq_name2][seq_name1]["LWL85"] = stats
                elif "LWL85m" in line:
                    results[seq_name1][seq_name2]["LWL85m"] = stats
                    results[seq_name2][seq_name1]["LWL85m"] = stats
                elif "LPB93" in line:
                    results[seq_name1][seq_name2]["LPB93"] = stats
                    results[seq_name2][seq_name1]["LPB93"] = stats
    return results