#!/usr/bin/env python3
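"""Split a fastq file into Illumina-style per-tile output below a BaseCalls
directory: one s_<lane>_<tile>.bcl file per cycle, plus a per-tile .filter
file and, optionally, a .names file listing the sequence IDs.

Example invocation (script and file names are illustrative):

    python fastq_to_tiles.py reads.fastq -o /tmp/run --tiles 4 --filtered 0.05 --add_noise
"""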
import sys
import os
import struct
import argparse
import random
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
DNAdict = {'A':0, 'C':1, 'G':2, 'T':3}
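# These are the 2-bit base codes used by the BCL format; bases other than
# A/C/G/T are written as a zero byte (no-call).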
def tiles():
    """Generate the tile numbers: surface (1-2), swath (1-3) and tile
    index (01-16), i.e. 1101 .. 2316 (96 tiles per lane)."""
for i1 in range(2):
for i2 in range(3):
for i3 in range(16):
yield 1000*(i1+1) + 100*(i2+1) + i3+1
def open_file_handles(n, lane, tile, base_path):
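    """Open one BCL file per cycle for the given lane/tile and write a 4-byte
    placeholder where the cluster count is filled in once it is known."""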
handles = []
for i in range(n):
h = open(base_path + "/BaseCalls/L%03i/C%i.1/s_%i_%i.bcl"%(lane,(i+1),lane,tile), "w+b")
h.write(struct.pack("<i",0)) # write placeholder 32bits
handles.append(h)
return handles
def create_directories(n, lane, base_path):
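    """Create the BaseCalls/L<lane>/C<cycle>.1 directories, one per cycle."""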
for i in range(n):
os.makedirs(base_path + "/BaseCalls/L%03i/C%i.1"%(lane,(i+1)))
# Wrapper around the SeqIO.parse generator that yields the original
# read_num fastq reads randomly interleaved with random_num simulated
# reads. Each item is a (passed_filter, record) tuple.
def SeqIO_addRandom_fastq(SeqIO_fastq_parser, read_num, random_num, read_length):
def RandomDNA():
return ''.join(random.choice('ACGT') for _ in range(read_length))
total = read_num + random_num
for i in range(total):
# probability of yielding a true read
p = float(read_num)/float(read_num + random_num)
# yield true read from file
if random.random() < p:
read_num = read_num - 1
            yield (1, next(SeqIO_fastq_parser))
# yield simulated read
else:
random_num = random_num - 1
qual_annotation = {"phred_quality": [0]*read_length}
            random_record = SeqRecord(Seq(RandomDNA()),
                                      id="random",
                                      letter_annotations=qual_annotation)
yield (0, random_record)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("file", help="The fastq-file to generate the tile files from.")
parser.add_argument("-o", "--output", help="Output directory for BaseCalls folder."
" (default: current directory)", default=os.getcwd())
    parser.add_argument("--id", help="Write sequence IDs into text files.",
                        action="store_true", default=False)
parser.add_argument("--tiles", help="Number of tiles to generate. (Default: 96)",
type=int, choices=range(1,97), metavar="[1-96]", default=96)
parser.add_argument("--filtered", help="Fraction of reads that did not pass the filter."
" (Default: 0)", type=float, default=0)
parser.add_argument("--add_noise", help="If true, filtered reads are randomly generated.",
action="store_true", default=False)
    parser.add_argument("--lane", help="Number of the lane. (Default: 1)",
                        type=int, choices=range(1,1000), metavar="[1-999]", default=1)
args = parser.parse_args()
return args
def main(argv):
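    """Split the input fastq file into per-tile BCL, .filter and (optionally)
    .names files laid out like an Illumina BaseCalls directory."""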
args = parse_args()
# Count the number of sequences to ensure equal distribution among tiles.
num_lines = sum(1 for line in open(args.file))
    if num_lines % 4 != 0:
        sys.exit("Error: line count of %s is not a multiple of 4; not a valid fastq file." % args.file)
    read_count = num_lines // 4
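    # Number of simulated reads needed so that they make up the --filtered
    # fraction of the combined output (read_count/(1 - filtered) reads in total).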
random_size = int((args.filtered*read_count)/(1-args.filtered))
count = read_count
# add the number of false noise reads to the count, if we add those
if args.add_noise:
count = read_count + random_size
# Read the length of the fastq reads (we only read one record)
    handle = open(args.file)
for record in SeqIO.parse(handle,"fastq"):
length = len(record.seq)
break
handle.close()
# Set tile number generator and reads per tile.
    tile_size = count // args.tiles
tile = enumerate(tiles())
# Format: All 0, Version number (3), All 0 placeholder for num. of clusters
filter_header = struct.pack("<iii", 0x00000000, 0x00000003, 0x00000000)
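    # After this 12-byte header the .filter file stores one byte per
    # cluster: 1 if the read passed the filter, 0 otherwise.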
# Create the BaseCalls/LYY/CX.1 subdirectory structure
create_directories(length, args.lane, args.output)
    t_i, active_tile = next(tile)
handles = open_file_handles(length, args.lane, active_tile, args.output)
read_records = 0
input_handle = open(args.file, "r")
    filter_handle = open(args.output + "/BaseCalls/L%03i/s_%i_%i.filter"%(args.lane,args.lane,active_tile), "wb")
filter_handle.write(filter_header)
if args.id:
name_handle = open(args.output + "/BaseCalls/L%03i/s_%i_%i.names"%(args.lane,args.lane,active_tile), "w")
# Output progress status.
sys.stdout.write("Writing tile: %d" % (active_tile))
sys.stdout.flush()
SeqIO_iter = SeqIO.parse(input_handle, "fastq")
# wrap the SeqIO parser to generate random reads (none if no noise should be added)
if args.add_noise:
SeqIO_iter = SeqIO_addRandom_fastq(SeqIO_iter, read_count, random_size, length)
else:
SeqIO_iter = SeqIO_addRandom_fastq(SeqIO_iter, read_count, 0, length)
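    # The wrapped iterator yields (passed_filter, record) pairs: real reads
    # are flagged 1, simulated noise reads 0.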
for filtered, record in SeqIO_iter:
if (read_records == tile_size and (t_i+1) < args.tiles):
            # Fill in the cluster count at the start of every BCL file for
            # the finished tile, then close it.
            for handle in handles:
                handle.seek(0)
                handle.write(struct.pack("<i", read_records))
                handle.close()
# Update read count
filter_handle.seek(8)
filter_handle.write(struct.pack("<i", read_records))
filter_handle.close()
# Prepare next iteration.
read_records = 0
            t_i, active_tile = next(tile)
handles = open_file_handles(length, args.lane, active_tile, args.output)
            filter_handle = open(args.output + "/BaseCalls/L%03i/s_%i_%i.filter"%(args.lane,args.lane,active_tile), "wb")
filter_handle.write(filter_header)
            if args.id:
                name_handle.close()
                name_handle = open(args.output + "/BaseCalls/L%03i/s_%i_%i.names"%(args.lane,args.lane,active_tile), "w")
# Output progress status.
sys.stdout.write("\rWriting tile: %d" % (active_tile))
sys.stdout.flush()
qual = record.letter_annotations["phred_quality"]
# Write sequence to tile files.
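        # One byte per base: Phred quality in the upper six bits, the 2-bit
        # base code in the lower two.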
for i,c in enumerate(record.seq):
if c in DNAdict:
data = struct.pack("B", (qual[i] << 2) | DNAdict[c])
else:
data = struct.pack("B", 0)
handles[i].write(data)
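        # Filter flag: with --add_noise real reads pass (1) and simulated
        # reads fail (0); otherwise a random --filtered fraction is marked
        # as failed.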
if args.add_noise:
filter_handle.write(struct.pack("B", filtered))
else:
filter_handle.write(struct.pack("B", random.random() >= args.filtered))
if args.id:
name_handle.write(record.id + "\n")
read_records += 1
    # Fill in the cluster count for the last tile and close its BCL files.
    for handle in handles:
        handle.seek(0)
        handle.write(struct.pack("<i", read_records))
        handle.close()
# Update read count
filter_handle.seek(8)
filter_handle.write(struct.pack("<i", read_records))
    filter_handle.close()
    if args.id:
        name_handle.close()
    input_handle.close()
if __name__=="__main__":
main(sys.argv)
# TODO: Add .stats etc. files to output.