1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
#!/usr/bin/env python3
#
# This is script was contributed by Philip Ashton (https://github.com/flashton2003)
# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format
# supported by bcftools csq. Please expand as necessary and send a pull request.
#
# See also
# https://github.com/samtools/bcftools/issues/530#issuecomment-268278248
# https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831
# https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373
#
import sys
import gffutils
class PseudoFeature():
def __init__(self, chrom, start, stop, strand):
self.chrom = chrom
self.start = start
self.stop = stop
self.strand = strand
class FeatureGroup():
def __init__(self, gene_id):
## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603
self.id = gene_id
## the below should be lists of instances of Feature class.
self.gene = None # one feature group should have a single gene
self.transcript = None
## assuming here that ncRNA is more like transcript than exon
## i.e. there is only one ncRNA feature per feature group.
self.ncRNA = None
self.exons = []
self.CDSs = []
def add_gene(self, feature):
self.gene = feature
def add_transcript(self, feature):
self.transcript = feature # one feature group should have a single gene
def add_exon(self, feature):
self.exons.append(feature)
def add_cds(self, feature):
self.CDSs.append(feature)
def add_ncRNA(self, feature):
self.ncRNA = feature
def get_args():
if len(sys.argv) != 3:
print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>')
sys.exit()
else:
return sys.argv[1], sys.argv[2]
def group_features(db):
feature_groups = {}
for feature in db.all_features():
if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'):
if feature.featuretype == 'gene':
## for any particular feature group, the gene doesn't necassarily come
## before the CDS in the gff, so need to have the if/else logic
gene_id = feature.id.split('.')[0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_gene(feature)
else:
feature_groups[gene_id].add_gene(feature)
## in this gff, the transcripts are referred to as mRNA
## not all genes have mRNA records
elif feature.featuretype == 'mRNA':
gene_id = feature.id.split('.')[0]
# print(feature.id)
# print(gene_id)
feature_groups[gene_id].add_transcript(feature)
elif feature.featuretype == 'exon':
gene_id = feature.attributes['locus_tag'][0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_exon(feature)
else:
feature_groups[gene_id].add_exon(feature)
elif feature.featuretype == 'CDS':
gene_id = feature.attributes['locus_tag'][0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_cds(feature)
else:
feature_groups[gene_id].add_cds(feature)
elif feature.featuretype == 'ncRNA':
gene_id = feature.attributes['locus_tag'][0]
# print(gene_id)
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_ncRNA(feature)
else:
feature_groups[gene_id].add_ncRNA(feature)
return feature_groups
def make_transcript_pseudofeature(CDSs):
## takes list of CDSs, list len will often be 1, but will sometimes be 2
start = min([x.start for x in CDSs])
stop = max([x.stop for x in CDSs])
chrom = CDSs[0].chrom
## check that both on the same strand
assert len(set([x.strand for x in CDSs])) == 1
strand = CDSs[0].strand
pf = PseudoFeature(chrom, start, stop, strand)
return pf
def check_feature_groups(feature_groups):
## checking that feature group has a gene, at least one CDS, and a transcript
## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript
## only checking that have transcript for everything
for gene_id in feature_groups:
## don't want to analyse ncRNAs here, but need to include them up
## to this point so that we know that that gene is an ncRNA gene
if feature_groups[gene_id].ncRNA != None:
continue
assert feature_groups[gene_id].gene != None
assert len(feature_groups[gene_id].CDSs) > 0
if feature_groups[gene_id].transcript == None:
## using pseudofeature because the gffutils feature says it's not really intended
## for direct instantiation by users
feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs)
def print_features(feature_groups):
for gene_id in feature_groups:
feature_group = feature_groups[gene_id]
if feature_group.ncRNA != None:
continue
print('###')
name = feature_group.gene.attributes['Name'][0]
gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}'
print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t')
transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}'
print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t')
cds_attributes = f'Parent=transcript:{gene_id};Name={name}'
for c in feature_group.CDSs:
print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t')
def main():
'''
Ensembl gff should have one gene and one transcript per "feature group"
Then, can have multiple CDS/exons
read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially)
multiple CDS/exon
Exons aren't printed, as not needed bt bcftools csq
'''
gff_handle, db_handle = get_args()
fn = gffutils.example_filename(gff_handle)
db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)
feature_groups = group_features(db)
check_feature_groups(feature_groups)
print_features(feature_groups)
## gff input is generated from a genbank file by bp_genbank2gff3.pl
if __name__ == '__main__':
main()
|