File: Proux_et_al_2002_Figure_6.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 46,856 kB
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (181 lines) | stat: -rw-r--r-- 7,381 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""GenomeDiagram script to mimic Proux et al 2002 Figure 6

You can use the Entrez module to download the 3 required GenBank files

This is an extended version of the example in the Biopython Tutorial
which produces a GenomeDiagram figure close to Proux et al 2002 Figure 6.

See http://dx.doi.org/10.1128/JB.184.21.6026-6036.2002
"""
from reportlab.lib import colors
from reportlab.lib.colors import red, grey, orange, green, brown
from reportlab.lib.colors import blue, lightblue, purple

from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink

from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation

name = "Proux Fig 6"

# As explained in the Biopython Tutorial, these are three phage genomes. The first
# two are self-containged GenBank files, but the third phage is integrated into a
# bacterial genome, thus we slice the full record and also take the reverse
# complement to match the strand orientation of the other two phage:

A_rec = SeqIO.read("NC_002703.gbk", "gb")
B_rec = SeqIO.read("AF323668.gbk", "gb")
C_rec = SeqIO.read("NC_003212.gbk",
                   "gb")[2587879:2625807].reverse_complement(name=True)
records = dict((rec.name, rec) for rec in [A_rec, B_rec, C_rec])

# Here we hard code the gene colors for simiplicity and to match the target image.
# In practice you might have an automatic mapping based on the gene annotation
# or some other classification:

A_colors = [red] * 5 + [grey] * 7 + [orange] * 2 + [grey] * 2 + [orange] \
    + [grey] * 11 + [green] * 4 + [grey] + [green] * 2 + [grey, green] \
    + [brown] * 5 + [blue] * 4 + [lightblue] * 5 + [grey, lightblue] \
    + [purple] * 2 + [grey]
B_colors = [red] * 6 + [grey] * 8 + [orange] * 2 + [grey] + [orange] \
    + [grey] * 21 + [green] * 5 + [grey] + [brown] * 4 + [blue] * 3 \
    + [lightblue] * 3 + [grey] * 5 + [purple] * 2
C_colors = [grey] * 30 + [green] * 5 + [brown] * 4 + [blue] * 2 \
    + [grey, blue] + [lightblue] * 2 + [grey] * 5

# Here we hard code a list of cross-links with percentage identity scores, based
# on a manual inspection of the target image (there could be mistakes here).
# In practice you might generate the list of cross-mappings from a BLAST report
# or similar computational analysis:

# Tuc2009 (NC_002703) vs bIL285 (AF323668)
A_vs_B = [
    (99, "Tuc2009_01", "int"),
    (33, "Tuc2009_03", "orf4"),
    (94, "Tuc2009_05", "orf6"),
    (100, "Tuc2009_06", "orf7"),
    (97, "Tuc2009_07", "orf8"),
    (98, "Tuc2009_08", "orf9"),
    (98, "Tuc2009_09", "orf10"),
    (100, "Tuc2009_10", "orf12"),
    (100, "Tuc2009_11", "orf13"),
    (94, "Tuc2009_12", "orf14"),
    (87, "Tuc2009_13", "orf15"),
    (94, "Tuc2009_14", "orf16"),
    (94, "Tuc2009_15", "orf17"),
    (88, "Tuc2009_17", "rusA"),
    (91, "Tuc2009_18", "orf20"),
    (93, "Tuc2009_19", "orf22"),
    (71, "Tuc2009_20", "orf23"),
    (51, "Tuc2009_22", "orf27"),
    (97, "Tuc2009_23", "orf28"),
    (88, "Tuc2009_24", "orf29"),
    (26, "Tuc2009_26", "orf38"),
    (19, "Tuc2009_46", "orf52"),
    (77, "Tuc2009_48", "orf54"),
    (91, "Tuc2009_49", "orf55"),
    (95, "Tuc2009_52", "orf60"),
]

# bIL285 (AF323668) vs Listeria innocua prophage 5 (in NC_003212)
B_vs_C = [
    (42, "orf39", "lin2581"),
    (31, "orf40", "lin2580"),
    (49, "orf41", "lin2579"),  # terL
    (54, "orf42", "lin2578"),  # portal
    (55, "orf43", "lin2577"),  # protease
    (33, "orf44", "lin2576"),  # mhp
    (51, "orf46", "lin2575"),
    (33, "orf47", "lin2574"),
    (40, "orf48", "lin2573"),
    (25, "orf49", "lin2572"),
    (50, "orf50", "lin2571"),
    (48, "orf51", "lin2570"),
    (24, "orf52", "lin2568"),
    (30, "orf53", "lin2567"),
    (28, "orf54", "lin2566"),
]


def get_feature(features, id, tags=("locus_tag", "gene", "old_locus_tag")):
    """Search list of SeqFeature objects for an identifier under the given tags."""
    for f in features:
        for key in tags:
            # tag may not be present in this feature
            for x in f.qualifiers.get(key, []):
                if x == id:
                    return f
    raise KeyError(id)

gd_diagram = GenomeDiagram.Diagram(name)
feature_sets = {}
max_len = 0
for i, record in enumerate([A_rec, B_rec, C_rec]):
    max_len = max(max_len, len(record))
    # Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C
    # (empty tracks 2 and 4 add useful white space to emphasise the cross links
    # and also serve to make the tracks vertically more compressed)
    gd_track_for_features = gd_diagram.new_track(5 - 2 * i,
                                                 name=record.name,
                                                 greytrack=True, height=0.5,
                                                 start=0, end=len(record))
    assert record.name not in feature_sets
    feature_sets[record.name] = gd_track_for_features.new_set()

# We add dummy features to the tracks for each cross-link BEFORE we add the
# arrow features for the genes. This ensures the genes appear on top:
for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B),
                     ("AF323668", "NC_003212", B_vs_C)]:
    features_X = records[X].features
    features_Y = records[Y].features
    set_X = feature_sets[X]
    set_Y = feature_sets[Y]
    for score, x, y in X_vs_Y:
        color = colors.linearlyInterpolatedColor(colors.white, colors.firebrick,
                                                 0, 100, score)
        border = colors.lightgrey
        f_x = get_feature(features_X, x)
        F_x = set_X.add_feature(SeqFeature(FeatureLocation(f_x.location.start,
                                                           f_x.location.end,
                                                           strand=0)),
                                color=color, border=border)
        f_y = get_feature(features_Y, y)
        F_y = set_Y.add_feature(SeqFeature(FeatureLocation(f_y.location.start,
                                                           f_y.location.end,
                                                           strand=0)),
                                color=color, border=border)
        gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))


for record, gene_colors in zip([A_rec, B_rec, C_rec],
                               [A_colors, B_colors, C_colors]):
    gd_feature_set = feature_sets[record.name]

    i = 0
    for feature in record.features:
        if feature.type != "gene":
            # Exclude this feature
            continue
        try:
            g_color = gene_colors[i]
        except IndexError:
            print("Don't have color for %s gene %i" % (record.name, i))
            g_color = grey
        gd_feature_set.add_feature(feature, sigil="BIGARROW",
                                   color=g_color, label=True,
                                   name=str(i + 1),
                                   label_position="start",
                                   label_size=6, label_angle=0)
        i += 1

gd_diagram.draw(format="linear", pagesize='A4', fragments=1,
                start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")