File: filetest.py

package info (click to toggle)
snap-aligner 1.0.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,988 kB
  • sloc: cpp: 36,500; ansic: 5,239; python: 227; makefile: 85; sh: 28
file content (175 lines) | stat: -rw-r--r-- 8,443 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# filetest.py
#
# Run file i/o tests on SNAP
#
# There are 4x2 possibilities for input:
# FQ | FQZ | SAM | BAM
# Single | paired
#
# There is a compressed reference in data/xx.fa.gz
#
# There are 3 input files stored:
#   xx_in1.fq.gz xx_in2.fq.gz xx_in12.bam
# The others are created from these using gunzip and samtools
# FQ/FQZ single : in1
# FQ/FQZ paired : in1 & in2
# BAM/SAM single/paired : in12
#
# There are 2x2 possibilities for output:
# SAM | BAM
# Unsorted | sorted
#
# These are validated against 4 output files using samtools & diff
#   xx_ref1.bam xx_sorted_ref1.bam xx_ref12.bam xx_sorted_ref12.bam
#
# Input & output files are stored in data
# Temp files are put in data/temp
#

import sys
import os
import shutil
import subprocess

if (len(sys.argv) < 5 or len(sys.argv) > 6):
    print "usage: %s data_dir file_base snap-aligner bin_dir [-quick]" % sys.argv[0]
    exit(1)

data = sys.argv[1]
template = sys.argv[2]
snap = sys.argv[3]
bin = sys.argv[4]
quick = len(sys.argv) == 6

run = "" # declare global

def _f(name):
    return os.path.normpath(data + "/" + name.replace("^", template).replace("#", run))

def _ff(names):
    return [_f(x) for x in names]
    
def runit(args, tag, strict=False, stdout=None, stdin=None):
    print "$ %s%s%s" % (' '.join(args), "" if stdin == None else " < "+stdin, "" if stdout == None else " > "+stdout)
    fout = _f(("temp/stdout-%s" % tag)) if stdout == None else stdout
    ferr = _f("temp/stderr-%s" % tag)
    retcode = subprocess.call(args, stdout=open(fout, "w"), stderr=open(ferr, "w"), stdin=(None if stdin==None else open(stdin, "r")))
    if retcode != 0:
        print "Run %s exited with %d" % (tag.replace("#", run), retcode)
        print open(ferr, "r").read(),
        if strict:
            exit(1)
        return False
    else:
        return True

# setup data & temp directories with all needed input files

temp = os.path.normpath(data + "/temp")
if not quick:
	if os.path.exists(temp):
		shutil.rmtree(temp)
	os.mkdir(temp)

	runit([bin + "gzip", "-d", "-c", _f("^.fa.gz")], "gunzip-ref", strict=True, stdout=_f("temp/^.fa"))
	runit([bin + "gzip", "-d", "-c", _f("^_in1.fq.gz")], "gunzip1", strict=True, stdout=_f("temp/^_in1.fq"))
	runit([bin + "gzip", "-d", "-c", _f("^_in2.fq.gz")], "gunzip2", strict=True, stdout=_f("temp/^_in2.fq"))
	runit([bin + "samtools", "view", "-h", _f("^_in12.bam"), "-o", _f("temp/^_in12.sam")], "samtools1", strict=True)
	runit([bin + "samtools", "view", _f("^_ref1.bam"), "-o", _f("temp/^_ref1u.sam")], "samtools2", strict=True)
	runit([bin + "sort", _f("temp/^_ref1u.sam")], "namesort-ref1", stdout=_f("temp/^_ref1.sam"), strict=True)
	os.remove(_f("temp/^_ref1u.sam"))
	runit([bin + "samtools", "view", _f("^_sorted_ref1.bam"), "-o", _f("temp/^_sorted_ref1.sam")], "samtools3", strict=True)
	runit([bin + "samtools", "view", _f("^_ref12.bam"), "-o", _f("temp/^_ref12u.sam")], "samtools4", strict=True)
	runit([bin + "sort", _f("temp/^_ref12u.sam")], "namesort-ref12", stdout=_f("temp/^_ref12.sam"), strict=True)
	os.remove(_f("temp/^_ref12u.sam"))
	runit([bin + "samtools", "view", _f("^_sorted_ref12.bam"), "-o", _f("temp/^_sorted_ref12.sam")], "samtools5", strict=True)
	# create index
	runit([snap, "index", _f("temp/^.fa"), _f("temp/^.idx")], "snap-index", strict=True)

inputs = {
    "fq": [["temp/^_in1.fq"], ["temp/^_in1.fq", "temp/^_in2.fq"]],
    "fq.gz": [["^_in1.fq.gz"], ["^_in1.fq.gz", "^_in2.fq.gz"]],
    "sam": [["temp/^_in12.sam"], ["temp/^_in12.sam"]],
    "bam": [["^_in12.bam"], ["^_in12.bam"]]
}

formats = {
    "fq": "-fastq",
    "fq.gz": "-compressedFastq",
    "sam": "-sam",
    "bam": "-bam"
}

runs = 0
failed = 0
for input_format in ["fq", "fq.gz", "bam", "sam"]:
    for paired in [0, 1]:
        for input_format_2 in ["", "fq", "fq.gz", "bam", "sam"]:
            test_output_format = ["sam", "bam"] if input_format == "fq" and input_format_2 == "" else ["bam"]
            test_sorted = [0, 1] if (input_format == "fq" or input_format == "bam") and input_format_2 == "" else [0]
            for output_format in test_output_format:
                for sorted in test_sorted:
                    test_pipe = [0, 1] if sorted == 0 and (input_format == "bam" or input_format == "sam" or paired == 0) and input_format_2 == "" else [0]
                    for pipe in test_pipe:
                        test_threads = ["1", "4"] if input_format == "fq" and output_format == "bam" and sorted == 1 and pipe == 0 else ["4"]
                        for threads in test_threads:
                            runs += 1
                            temps = [] # temporary files, deleted on success
                            # build & run snap command line
                            args = [snap, ["single", "paired"][paired], _f("temp/^.idx")]
                            args = args + ["-t", threads, "-b", "-="]
                            if pipe == 0:
                                args = args + _ff(inputs[input_format][paired])
                                if input_format_2 != "":
                                    args = args + _ff(inputs[input_format_2][paired])
                            else:
                                args = args + [formats[input_format], "-"]
                            if sorted:
                                args.append("-so")
                            run = "%s-%s-%s-%s-%s-%s-%s" % (input_format, input_format_2, ["single", "paired"][paired], output_format, ["unsorted", "sorted"][sorted], threads, ["file", "pipe"][pipe])
                            outfile = _f("temp/output-#." + output_format)
                            if pipe == 0:
                                args = args + ["-o", outfile]
                            else:
                                args = args + ["-o", formats[output_format], "-"]
                            ok = runit(args, "snap-#") if pipe ==0 else runit(args, "snap-#", stdin=_f(inputs[input_format][paired][0]), stdout=outfile)
                            temps.append(outfile)
                            if not ok:
                                failed += 1
                                continue
                            # validate output
                            ok = runit(["java", "-jar", _f("ValidateSamFile.jar"), "input=" + outfile, "output=" + _f("temp/validate-#")], "validate-#")
                            temps.append(_f("temp/validate-#"))
                            if not ok:
                                failed += 1
                                continue
                            if False:
                                # todo: need more sophisticated diff
                                # remove header, convert to SAM if needed
                                samfile = _f("temp/output-nh-#.sam")
                                ok = runit([bin + "samtools", "view"] + {"bam":[], "sam":["-S"]}[output_format] + [outfile, "-o", samfile], "samtools-view-#")
                                temps.append(samfile)
                                if not ok:
                                    failed += 1
                                    continue
                                outfile = samfile
                                # sort by name if not sorted by coordinate
                                if sorted == 0:
                                    sortfile = _f("temp/output-namesort-#.sam")
                                    ok = runit([bin + "sort", outfile], "namesort-#", stdout=sortfile)
                                    temps.append(sortfile)
                                    if not ok:
                                        failed += 1
                                        continue
                                    outfile = sortfile
                                # compare with reference file
                                use12 = "12" if paired or input_format == "sam" or input_format == "bam" else "1"
                                reffile = _f("temp/^%s_ref%s.sam" % (["", "_sorted"][sorted], use12))
                                ok = runit([bin + "diff", outfile, reffile], "diff-#")
                                if not ok:
                                    failed += 1
                                    continue
                            # delete temp files
                            for f in temps:
                                os.remove(f)
print "completed %d runs, %d failures" % (runs, failed)