1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
|
#!/usr/bin/env python3
# Author: Katharina J. Hoff
# E-Mail: katharina.hoff@uni-greifswald.de
# Last modified on September 10th 2018
#
# This Python script extracts CDS features from a GTF file, excises
# corresponding sequence windows from a genome FASTA file, stitches the
# codingseq parts together, adds letters N at the ends if bases are
# annotated as missing by frame in the GTF file, makes reverse complement
# if required, and translates to protein sequence.
# Output files are:
# * file with protein sequences in FASTA format,
# * file with coding squences in FASTA format
# The script automatically checks for in-frame stop codons and prints a
# warning to STDOUT if such genes are in the GTF-file. The IDs of bad genes
# are printed to a file bad_genes.lst. Option -s allows to exclude bad genes
# from the FASTA output file, automatically.
try:
import argparse
except ImportError:
raise ImportError('Failed to import argparse. Try installing with \"pip3 install argparse\"')
try:
import re
except ImportError:
raise ImportError('Failed to import argparse. Try installing with \"pip3 install re\"')
try:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
except ImportError:
raise ImportError('Failed to import biophython modules. Try installing with \"pip3 install biopython\"')
parser = argparse.ArgumentParser(
description='Generate *.codingseq and *.aa FASTA-format files from genes \
in a GTF-file produced by AUGUSTUS auxprogs tool joingenes \
and a corresponding genomic FASTA-file.')
parser.add_argument('-g', '--genome', required=True,
type=str, help='genome sequence file (FASTA-format)')
parser.add_argument('-f', '--gtf', required=True,
type=str, help='file with CDS coordinates (GTF-format)')
parser.add_argument('-o', '--out', required=True, type=str,
help="name stem pf output file with coding sequences and \
protein sequences (FASTA-format); will be extended by \
.codingseq/.aa")
parser.add_argument('-t', '--table', dest='translation_table', default=1,
type=int, help='Translational code table number (INT)')
parser.add_argument('-s', '--filter_out_invalid_stops', dest='filter',
type=bool, help='Suppress output of protein sequences \
that contain internal stop codons.', default=False)
args = parser.parse_args()
# output file names:
codingseqFile = args.out + ".codingseq"
proteinFile = args.out + ".aa"
# Read GTF file CDS entries for transcripts
cds = {}
try:
with open(args.gtf, "r") as gtf_handle:
for line in gtf_handle:
if re.match(
r"\S+\t\S+\tCDS\t\d+\t\d+\t\S+\t\S+\t\d\t.*transcript_id \"(\S+)\";", line):
seq_id, st, en, stx, fr, tx_id = re.match(
r"(\S+)\t\S+\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t(\d)\t.*transcript_id \"(\S+)\";", line).groups()
if seq_id not in cds:
cds[seq_id] = {}
if tx_id not in cds[seq_id]:
cds[seq_id][tx_id] = []
cds[seq_id][tx_id].append(
{'start': int(st), 'end': int(en), 'strand': stx,
'frame': int(fr)})
except IOError:
print("Error: Failed to open file " + args.gtf + "!")
# Read genome file (single FASTA entries are held in memory, only), extract
# CDS sequence windows, add N when frame information states missing nucleotides
codingseq = {}
try:
with open(args.genome, "rU") as genome_handle:
for record in SeqIO.parse(genome_handle, "fasta"):
if record.id in cds:
for tx in cds[record.id]:
if tx not in codingseq:
codingseq[tx] = SeqRecord(
Seq("", generic_dna), id=tx, description=tx)
nCDS = len(cds[record.id][tx])
for i in range(0, nCDS):
cds_line = cds[record.id][tx][i]
if i == 0 and cds_line['strand'] == '+' and cds_line['frame'] != 0:
codingseq[tx].seq += Seq((3 - cds_line['frame'])
* 'N', generic_dna)
codingseq[tx].seq += record.seq[cds_line['start'] -
1:cds_line['end']]
if i == (nCDS-1):
if cds_line['strand'] == '+':
if (len(codingseq[tx].seq) % 3) != 0:
codingseq[tx].seq += Seq(
(3 - (len(codingseq[tx]) % 3)) * 'N',
generic_dna)
if i == (nCDS-1) and cds_line['strand'] == '-':
if cds_line['frame'] != 0:
codingseq[tx].seq += Seq((3 - cds_line['frame'])
* 'N', generic_dna)
if(len(codingseq[tx].seq) % 3) != 0:
codingseq[tx].seq = Seq(
(3 - (len(codingseq[tx].seq) % 3)) * 'N',
generic_dna) + codingseq[tx].seq
codingseq[tx].seq = codingseq[tx].seq.reverse_complement()
except IOError:
print("Error: Failed to open file " + args.genome + "!")
# Print coding sequences to file
try:
with open(codingseqFile, "w") as codingseq_handle:
for tx_id, seq_rec in codingseq.items():
SeqIO.write(seq_rec, codingseq_handle, "fasta")
except IOError:
print("Error: Failed to open file " + codingseqFile + "!")
# Translate coding sequences, identify sequences with in-frame stop codons,
# print proteins in FASTA format
bad_tx = {}
try:
with open(proteinFile, "w") as protein_handle:
for tx_id, seq_rec in codingseq.items():
is_good = True
seq_rec.seq = seq_rec.seq.translate(table=args.translation_table)
if re.search(r"\*\w", str(seq_rec.seq)):
bad_tx[tx_id] = 1
if ((args.filter == True) and (tx_id not in bad_tx)) or args.filter == False:
SeqIO.write(seq_rec, protein_handle, "fasta")
except IOError:
print("Error: Failed to open file " + proteinFile + "!")
# Print IDs of genes with in-frame stop codons if any were found
if len(bad_tx) > 0:
try:
with open("bad_genes.lst", "w") as bad_handle:
for tx in bad_tx:
bad_handle.write(tx + "\n")
except IOError:
print("Error: Failed to open file bad_genes.lst!")
print("WARNING: The GTF file contained " + str(len(bad_tx)) + " gene(s) " +
"with internal Stop codons (see file bad_genes.lst).")
if args.filter:
print("Note: Translations of the bad genes have already been exclude " +
"from the output file " + args.gtf + ".")
else:
print("Note: Translations of the bad genes have been included in the " +
"output file. If you wish to exclude them, run " +
"getAnnoFastaFromJoingenes.py with option --filter!")
|