File: update_ncbi_codon_table.py

package info (click to toggle)
python-biopython 1.78%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 65,756 kB
  • sloc: python: 221,141; xml: 178,777; ansic: 13,369; sql: 1,208; makefile: 131; sh: 70
file content (127 lines) | stat: -rw-r--r-- 4,691 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Helper script to update Codon tables from the NCBI.

These tables are based on parsing the NCBI file:
ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt

More detailed information about the tables is here:
https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi

This script is used to update Bio/Data/CodonTable.py

Note that the NCBI sometimes revises the older tables,
so don't just add new tables - replace all of them
and check for any differences in the old tables.
"""

import re


def line_wrap(text, indent=0, max_len=78, string=False):
    """Return text broken into lines of at most max_len characters.

    Text that already fits within max_len is returned unchanged. Otherwise
    it is split at the last space that keeps the first line within the
    limit, and the remainder is wrapped recursively, with every continuation
    line prefixed by ``indent`` spaces.

    If ``string`` is true the text is treated as a double-quoted string
    literal being split across lines: a space and a closing double quote are
    appended to each broken line and an opening double quote is placed at
    the start of the next line, so the pieces form adjacent string literals.

    Raises ValueError if there is no space at which the text can be broken
    so that the remainder gets shorter (for example a single token longer
    than max_len). Without this check such input would recurse forever.
    """
    if len(text) <= max_len:
        return text
    # Reserve two columns for the ' "' which closes a quoted chunk.
    split_len = max_len - 2 if string else max_len
    # Everything that is put in front of each continuation line.
    prefix = " " * indent + ('"' if string else "")
    break_at = text.rfind(" ", 0, split_len)
    # A break inside the leading prefix (or no space at all, -1) would give a
    # remainder that is at least as long as the text itself.
    if break_at < len(prefix):
        raise ValueError(
            f"Cannot wrap {text!r} to {max_len} columns: no usable space found"
        )
    line = text[:break_at]
    if string:
        line += ' "'
    rest = prefix + text[break_at + 1 :]
    # The recursive call returns the remainder unchanged once it fits.
    return line + "\n" + line_wrap(rest, indent, max_len, string)


print("##########################################################################")
print("# Start of auto-generated output from Scripts/update_ncbi_codon_table.py #")
print("##########################################################################")
print()

# Read gc.prt from the current working directory line by line and print one
# register_ncbi_table(...) call for every table record that is found.
version = ""
# The context manager makes sure the input file is closed again.
with open("gc.prt") as handle:
    for line in handle:
        # Report the data version once, taken from the first version comment.
        if not version and line.startswith("--  Version"):
            version = line.split("Version", 1)[1].strip()
            print(f"# Data from NCBI genetic code table version {version}\n")
        if line[:2] == " {":
            # Start of a record: forget everything collected so far.
            names = []
            table_id = None
            aa = None
            start = None
            bases = []
        elif line[:6] == "  name":
            names.append(re.search('"([^"]*)"', line).group(1))
        elif line[:8] == "    name":
            # No closing quote is required here; keep everything after the
            # opening quote (presumably the name continues on the next line).
            names.append(re.search('"(.*)$', line).group(1))
        elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
            # Hard-coded continuation line, appended to the previous name.
            names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
        elif line[:4] == "  id":
            table_id = int(re.search(r"(\d+)", line).group(1))
        elif line[:10] == "  ncbieaa ":
            # 64 characters, one amino acid letter (or "*") per codon.
            aa = line[12 : 12 + 64]
        elif line[:10] == "  sncbieaa":
            # 64 characters, "M" marks a start codon and "*" a stop codon.
            start = line[12 : 12 + 64]
        elif line[:9] == "  -- Base":
            # One line per codon position, 64 characters each.
            bases.append(line[12 : 12 + 64])
        elif line[:2] == " }":
            # End of a record: check it is complete, then print it.
            assert names != [] and table_id is not None and aa is not None
            assert start is not None and bases != []
            if len(names) == 1:
                names.append(None)
            # Build the 64 codon triplets once, they are needed three times.
            codons = [bases[0][i] + bases[1][i] + bases[2][i] for i in range(64)]
            print("register_ncbi_table(")
            print(line_wrap(f'    name="{names[0]}",', 4, string=True))
            # repr() gives either None or the quoted name; the generated
            # code uses double quotes.
            print(line_wrap("    alt_name=%s," % (repr(names[1]).replace("'", '"'))))
            print(f"    id={table_id:d},")
            print("    table={")
            # Print the table in rows of four codons.
            for row_start in range(0, 64, 4):
                row = " " * 8
                noqa = False
                for i in range(row_start, row_start + 4):
                    if aa[i] != "*":
                        row += f'"{codons[i]}": "{aa[i]}", '
                    else:
                        # leave a space for stop codons
                        row += " " * 12
                        noqa = True
                if noqa:
                    # The gap left for a stop codon needs a flake8 exemption.
                    row += "  # noqa: E241"
                print(row.rstrip())
            print("    },")
            stop_codons = [codons[i] for i in range(64) if start[i] == "*"]
            print("    stop_codons=%s," % repr(stop_codons).replace("'", '"'))
            start_codons = [codons[i] for i in range(64) if start[i] == "M"]
            print("    start_codons=%s," % repr(start_codons).replace("'", '"'))
            print(")")
            print("")
        elif line[:2] == "--" or line in ("\n", "}\n", "Genetic-code-table ::= {\n"):
            # Comments, blank lines and the outer wrapper carry no data.
            pass
        else:
            # Fail loudly so that a format change does not go unnoticed.
            raise ValueError("Unparsed: " + repr(line))

print("########################################################################")
print("# End of auto-generated output from Scripts/update_ncbi_codon_table.py #")
print("########################################################################")