File: extractAnno.py

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (102 lines) | stat: -rw-r--r-- 4,617 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/python3
import argparse
import fileinput
import sys
import os
import numpy as np

# author Giovanna Migliorelli
# version beta 02.06.2020

parser = argparse.ArgumentParser(description='Extract transcripts from some Ensembl annoation on the base of given chunks.')
parser.add_argument('-c', '--chunks',
                    #required=True,
                    nargs='+',
                    help='a list of one or more positive integers indicating the chunk/s for which we want to build a minimal annotation from given Ensembl.') 
args = parser.parse_args()


# helper function to extract values for some key (e.g. gene_id, transcript_id) from ENSEMBL like 8th field in some gff line
def extractTransId(str, key):
    value = ''
    for x in str.split(';'):
        pos = x.find(key)
        if pos>-1:
            value = x.split("\"")[1]
    return value

# reads in some annotation file and filters in those transcripts which fall in the interval [left,right]
def extractAnno(infile, outfile, mode, left, right, overlap, subset):
    transcripts = {}
    with open(infile, 'r') as input:
        while True:
            line = input.readline()
            if not line:
                break
            if line.find('#')==-1 and line!='\n':
                split = [x for x in line.split("\t") if x is not ""]

                if split[0] == '1':
                    trans_id = extractTransId(split[8], 'transcript_id')
                    if trans_id != '':
                        if trans_id not in subset:
                            # just in case gff contains some unordered pair
                            start = min(int(split[3]), int(split[4]))
                            end = max(int(split[3]), int(split[4]))

                            if trans_id in transcripts:
                                transcripts[trans_id][0] = min(transcripts[trans_id][0], start)
                                transcripts[trans_id][1] = max(transcripts[trans_id][1], end)
                            else:
                                transcripts[trans_id] = [start, end]
                                                
              
    # read once again and filter (not smart but helps saving mem)
    with open(infile, 'r') as input, open(outfile, mode) as output:
        while True:
            line = input.readline()
            if not line:
                break
            if line.find('#')==-1 and line!='\n':                       # here comments and blank lines are purged, we can change it though
                split = [x for x in line.split("\t") if x is not ""]
                trans_id = extractTransId(split[8], 'transcript_id')
                if trans_id != '':
                    if trans_id in transcripts:
                        if overlap == 'full':
                            if transcripts[trans_id][0]>=left and transcripts[trans_id][0]<=right:
                                output.write(line)
                                subset[trans_id] = True
                        elif overlap == 'partial':
                            if not(transcripts[trans_id][0]>=right or transcripts[trans_id][0]<=left):
                                output.write(line)
                                subset[trans_id] = True
                            
if __name__ == '__main__':
    if args.chunks is None:
        print('No chunks specified, please make use of --chunks to pass a non empty list of positive integers...')
        sys.exit()

    chunks = [int(x) for x in list(dict.fromkeys(args.chunks))]
    chunks = [x for x in chunks if x>0 and x<126]          # range valid for chr1 chunk size 2.5 Mb, chunk overlap 0.5 Mb

    if len(chunks) == 0:
        sys.exit()

    # should be synced with chunk_size, chunk_overlap, dir names used in the project see executeTestCGP.py for details
    chunk_size = 2500000
    chunk_overlap = 500000
    infile = '../examples/cgp12way/ENSEMBL/ensembl.ensembl_and_ensembl_havana.chr1.CDS.gtf.dupClean.gtf'
    outfile = '../examples/cgp12way/ENSEMBL/ensembl.ensembl_and_ensembl_havana.chr1.CDS.gtf.dupClean.FILTERED.gtf'
    
    subset = {} # used to keep track of those transcripts which have been already included and avoid duplicates
    
    print('Cleaning up old', outfile)
    if os.path.exists(outfile):
        os.remove(outfile)

    for chunk in chunks:
        left = (chunk_size-chunk_overlap)*(chunk-1)
        right = left + chunk_size - 1

        print('Including transcripts entirely contained within chunk', chunk, '[', left, ',', right, ']')
        extractAnno(infile, outfile, 'a+', left, right, 'full', subset)