File: gff2gff.py

package info (click to toggle)
bcftools 1.16-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 20,252 kB
  • sloc: ansic: 60,589; perl: 5,818; python: 587; sh: 333; makefile: 284
file content (172 lines) | stat: -rwxr-xr-x 7,404 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
#
# This is script was contributed by Philip Ashton (https://github.com/flashton2003)
# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format
# supported by bcftools csq. Please expand as necessary and send a pull request.
# 
# See also
#   https://github.com/samtools/bcftools/issues/530#issuecomment-268278248
#   https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831
#   https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373
#
import sys
import gffutils

class PseudoFeature():
    def __init__(self, chrom, start, stop, strand):
        self.chrom = chrom
        self.start = start
        self.stop = stop
        self.strand = strand
        

class FeatureGroup():
    def __init__(self, gene_id):
        ## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603
        self.id = gene_id
        ## the below should be lists of instances of Feature class.
        self.gene = None # one feature group should have a single gene
        self.transcript = None
        ## assuming here that ncRNA is more like transcript than exon
        ## i.e. there is only one ncRNA feature per feature group.
        self.ncRNA = None
        self.exons = []
        self.CDSs = []

    def add_gene(self, feature):
        self.gene = feature

    def add_transcript(self, feature):
        self.transcript = feature # one feature group should have a single gene

    def add_exon(self, feature):
        self.exons.append(feature)

    def add_cds(self, feature):
        self.CDSs.append(feature)

    def add_ncRNA(self, feature):
        self.ncRNA = feature


def get_args():
    if len(sys.argv) != 3:
        print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>')
        sys.exit()
    else:
        return sys.argv[1], sys.argv[2]

def group_features(db):
    feature_groups = {}
    for feature in db.all_features():
        if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'):
            if feature.featuretype == 'gene':
                ## for any particular feature group, the gene doesn't necassarily come
                ## before the CDS in the gff, so need to have the if/else logic
                gene_id = feature.id.split('.')[0]
                if gene_id not in feature_groups:
                    feature_groups[gene_id] = FeatureGroup(gene_id)
                    feature_groups[gene_id].add_gene(feature)
                else:
                    feature_groups[gene_id].add_gene(feature)
            ## in this gff, the transcripts are referred to as mRNA
            ## not all genes have mRNA records
            elif feature.featuretype == 'mRNA':
                gene_id = feature.id.split('.')[0]
                # print(feature.id)
                # print(gene_id)
                feature_groups[gene_id].add_transcript(feature)
            elif feature.featuretype == 'exon':
                gene_id = feature.attributes['locus_tag'][0]
                if gene_id not in feature_groups:
                    feature_groups[gene_id] = FeatureGroup(gene_id)
                    feature_groups[gene_id].add_exon(feature)
                else:
                    feature_groups[gene_id].add_exon(feature)

            elif feature.featuretype == 'CDS':
                gene_id = feature.attributes['locus_tag'][0]
                if gene_id not in feature_groups:
                    feature_groups[gene_id] = FeatureGroup(gene_id)
                    feature_groups[gene_id].add_cds(feature)
                else:
                    feature_groups[gene_id].add_cds(feature)

            elif feature.featuretype == 'ncRNA':
                gene_id = feature.attributes['locus_tag'][0]
                # print(gene_id)
                if gene_id not in feature_groups:
                    feature_groups[gene_id] = FeatureGroup(gene_id)
                    feature_groups[gene_id].add_ncRNA(feature)
                else:
                    feature_groups[gene_id].add_ncRNA(feature)
    return feature_groups

def make_transcript_pseudofeature(CDSs):
    ## takes list of CDSs, list len will often be 1, but will sometimes be 2
    start = min([x.start for x in CDSs])
    stop = max([x.stop for x in CDSs])
    chrom = CDSs[0].chrom
    ## check that both on the same strand
    assert len(set([x.strand for x in CDSs])) == 1
    strand = CDSs[0].strand
    pf = PseudoFeature(chrom, start, stop, strand)
    return pf


def check_feature_groups(feature_groups):
    ## checking that feature group has a gene, at least one CDS, and a transcript
    ## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript
    ## only checking that have transcript for everything
    for gene_id in feature_groups:
        ## don't want to analyse ncRNAs here, but need to include them up 
        ## to this point so that we know that that gene is an ncRNA gene
        if feature_groups[gene_id].ncRNA != None:
            continue
        assert feature_groups[gene_id].gene != None
        assert len(feature_groups[gene_id].CDSs) > 0
        if feature_groups[gene_id].transcript == None:
            ## using pseudofeature because the gffutils feature says it's not really intended
            ## for direct instantiation by users
            feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs)
       
def print_features(feature_groups):
    for gene_id in feature_groups:
        feature_group = feature_groups[gene_id]
        if feature_group.ncRNA != None:
            continue
        print('###')
        name = feature_group.gene.attributes['Name'][0]
        gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}'
        print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t')
        
        transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}'
        print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t')
        
        cds_attributes =  f'Parent=transcript:{gene_id};Name={name}'

        for c in feature_group.CDSs:
            print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t')

def main():
    '''
    Ensembl gff should have one gene and one transcript per "feature group"
    Then, can have multiple CDS/exons
    read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially)
    multiple CDS/exon
    Exons aren't printed, as not needed bt bcftools csq
    '''
    gff_handle, db_handle = get_args()
    fn = gffutils.example_filename(gff_handle)
    db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)
    feature_groups = group_features(db)
    check_feature_groups(feature_groups)
    print_features(feature_groups)



## gff input is generated from a genbank file by bp_genbank2gff3.pl

if __name__ == '__main__':
    main()