1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
#!/usr/bin/python3 -O
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################
#Calculate coverage from raw file
import sys
def coverage(in_filename, out_filename, maxLen, bar, kmer):
inFile = open(in_filename)
outFile = open(out_filename, 'w')
hist = [0] * (maxLen + 1)
for line in inFile:
coords = line.split()
stpos = int(coords[0])
rlen = int(coords[1])
for i in range(0, rlen - kmer + 1):
cpos = stpos + i
if cpos <= maxLen:
hist[cpos] += 1
covered = 0.0
for i in range(0, maxLen + 1):
if (hist[i] > 0):
covered += 1.0
# print("Coverage: " + str(covered/maxLen) + "\n")
newHist = [0 for i in range((maxLen + 1) / bar + 2)]
for i in range(maxLen + 1):
newHist[int(i/bar)] += hist[i]
for i in range((maxLen + 1) / bar + 1):
outFile.write(str(i) + ' ' + str(newHist[i] / bar) + '\n')
inFile.close()
outFile.close()
return covered/maxLen
def read_genome(filename):
res_seq = []
seq = ''
for line in open(filename):
if line[0] == '>':
pass
else:
seq += line.strip()
return seq
def write_fasta(data, filename):
outFile = open(filename, 'w')
for seq in data:
outFile.write('>' + seq[0] + '\n');
i = 0
while i < len(seq[1]):
outFile.write(seq[1][i:i+60] + '\n')
i += 60
outFile.close()
def analyze_gaps(in_filename, out_filename, reference, out_ref, kmer):
inFile = open(in_filename)
outFile = open(out_filename, 'w')
gaps_stat = {101:0, 501:0, 1001:0}
chunks_stat = {101:0, 501:0, 1001:0}
current = 0
#[start,end)
gaps = []
chunks = []
line = inFile.readline()
while line:
cov = int(line.split()[1])
end = current
while cov == 0 and line:
end += 1
line = inFile.readline()
if line:
cov = int(line.split()[1])
if end != current:
gaps.append((current, end));
length = end - current
for key in gaps_stat:
if length < key:
gaps_stat[key] += 1
current = end
while cov > 0 and line:
end += 1
line = inFile.readline()
if line:
cov = int(line.split()[1])
if end != current:
chunks.append((current, end + kmer - 1));
length = end - current
for key in chunks_stat:
if length < key:
chunks_stat[key] += 1
current = end
outFile.write("Total chunks: " + str(len(chunks)) + "\n")
for key in chunks_stat:
outFile.write("Chunks < " + str(key) + ": " + str(chunks_stat[key])+ "\n")
outFile.write("Total gaps: " + str(len(gaps)) + "\n")
for key in gaps_stat:
outFile.write("Gaps < " + str(key) + ": " + str(gaps_stat[key])+ "\n")
outFile.write("Gaps:\n")
for gap in gaps:
outFile.write(str(gap[0]) + ' ' + str(gap[1]) + "\n")
outFile.close()
# genome = read_genome(reference, chunks)
genome = read_genome(reference)
ref_chunks = []
i = 0
for chunk in chunks:
ref_chunks.append(("PART_" + str(i) + "_from_" + str(chunk[0]) + "_to_" + str(chunk[1]), genome[chunk[0]:chunk[1]]))
i += 1
write_fasta(ref_chunks, out_ref)
return chunks, gaps
def main():
if len(sys.argv) < 5:
print("Usage: <coverage file> <output> <genome length> <bar width> [k = 1]");
exit(0)
kmer = 1
if len(sys.argv) > 5:
kmer = int(sys.argv[5])
cov = coverage(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]), kmer)
print("Coverage: " + str(cov) + "\n")
if __name__ == '__main__':
main()
|