File: GobyCompactToFasta.py

package info (click to toggle)
libgoby-java 3.3.1%2Bdfsg2-11
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 58,108 kB
  • sloc: java: 78,105; cpp: 5,011; xml: 3,170; python: 2,108; sh: 1,575; ansic: 277; makefile: 114
file content (104 lines) | stat: -rwxr-xr-x 3,239 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python3

#
# Copyright (C) 2010 Institute for Computational Biomedicine,
#                    Weill Medical College of Cornell University
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import getopt
import struct
import sys
import textwrap

from goby.Reads import CompactReads, decodeSequence


def usage():
    print("usage:", sys.argv[0],
          "[-h|--help] [-v|--verbose] [-f|--format <fasta|fastq>] [-o|--output <output-filename>] <filename>")


def main():
    verbose = False
    format = "fasta"
    output = sys.stdout

    fake_quality_score = 40

    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:o:hv", ["format=", "output=", "help", "verbose"])
    except getopt.GetoptError as err:
        print( str(err), file=sys.stderr)
        usage()
        sys.exit(1)

    # Collect options
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-f", "--format"):
            format = arg
        elif opt in ("-o", "--output"):
            output = open(arg, "w")

    if format != "fasta" and format != "fastq":
        print( "Format", format, "is not supported", file=sys.stderr)
        usage()
        sys.exit(3)

    if len(args) != 1:
        usage()
        sys.exit(2)

    filename = args[0]
    if verbose:
        print("Processing file =", filename)

    if format == "fasta":
        new_entry_character = ">"
    else:
        new_entry_character = "@"

    for entry in CompactReads(filename):
        # if the entry has a description, use it otherwise use the index
        if (entry.HasField("description")):
            description = entry.description
        else:
            description = entry.read_index
        print ("%c%s" % (new_entry_character, description), file=output)

        for line in textwrap.wrap(text=decodeSequence(entry.sequence), width=60):
            print(  line, file=output)

        if format == "fastq":
            print("+", file=output)
            quality_string = ""
            if entry.HasField("qualityScores"):
                for quality_score in entry.quality_scores:
                    quality_string += chr(struct.unpack("B", quality_score)[0] + 64)  # Assume Illumina encoding

            else:
                # fill the string with a constant quality code
                quality_string = quality_string.ljust(len(entry.sequence), chr(fake_quality_score + 64))

            for line in textwrap.wrap(quality_string, 60):
                print( line, file=output)

if __name__ == "__main__":
    main()