File: seq-reg.py

package info (click to toggle)
aegean 0.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 8,444 kB
  • sloc: ansic: 12,050; python: 465; sh: 315; makefile: 308; perl: 151
file content (60 lines) | stat: -rwxr-xr-x 1,728 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python3

# Copyright (c) 2010-2015, Daniel S. Standage and CONTRIBUTORS
#
# The AEGeAn Toolkit is distributed under the ISC License. See
# the 'LICENSE' file in the AEGeAn source code distribution or
# online at https://github.com/standage/AEGeAn/blob/master/LICENSE.


import argparse
import os
import re
import subprocess
import sys


def parse_fasta(fp):
    """Stolen shamelessly from http://stackoverflow.com/a/7655072/459780."""
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith('>'):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name:
        yield (name, ''.join(seq))


def seq_len(fp):
    for defline, seq in parse_fasta(fp):
        seqid = defline[1:].split(" ")[0]
        yield seqid, len(seq)


def parse_gff3(fp):
    for line in fp:
        if line.startswith('##gff-version') or \
           line.startswith('##sequence-region'):
            continue
        yield line.rstrip()


if __name__ == '__main__':
    desc = 'Correct a GFF3\'s sequence-region entries with Fasta sequences'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('gff3', type=argparse.FileType('r'),
                        help='Annotation in GFF3 format; use - for stdin')
    parser.add_argument('fasta', type=argparse.FileType('r'),
                        help='Corresponding sequences in Fasta format')
    args = parser.parse_args()

    print('##gff-version   3')
    for seqid, seqlen in seq_len(args.fasta):
        print('##sequence-region     %s 1 %d' % (seqid, seqlen))

    for entry in parse_gff3(args.gff3):
        print(entry)