1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
|
from . import Fast5File
import matplotlib
#matplotlib.use('Agg') # Must be called before any other matplotlib calls
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#logging
import logging
logger = logging.getLogger('poretools')
logger.setLevel(logging.INFO)
def plot_collectors_curve(args, start_times, read_lengths):
"""
collectors curve of the run
"""
# set t_0 as the first measured time for the read.
t_0 = start_times[0]
# adjust times to be relative to t_0
start_times = [float(t - t_0) / float(3600) + 0.00000001 for t in start_times]
# compute the cumulative based on reads or total base pairs
if args.plot_type == 'reads':
y_label = "Total reads"
cumulative = np.cumsum(list(range(len(start_times))))
elif args.plot_type == 'basepairs':
y_label = "Total base pairs"
cumulative = np.cumsum(read_lengths)
step = args.skip
# make a data frame of the lists
d = {'start': [start_times[n] for n in range(0, len(start_times), step)],
'lengths': [read_lengths[n] for n in range(0, len(read_lengths), step)],
'cumul': [cumulative[n] for n in range(0, len(cumulative), step)]}
df = pd.DataFrame(d)
if args.savedf:
df.to_csv(args.savedf, sep="\t")
# title
total_reads = len(read_lengths)
total_bp = sum(read_lengths)
plot_title = "Yield: " \
+ str(total_reads) + " reads and " \
+ str(total_bp) + " base pairs."
if args.theme_bw:
sns.set_style("whitegrid")
# plot
if args.saveas is not None:
plt.figure(figsize=(8.5,8.5))
plt.plot(df['start'], df['cumul'])
plt.xlabel('Time (hours)')
plt.ylabel(y_label)
plt.title(plot_title)
if args.saveas is not None:
plot_file = args.saveas
plt.savefig(plot_file)
else:
plt.show()
def run(parser, args):
start_times = []
read_lengths = []
files_processed = 0
for fast5 in Fast5File.Fast5FileSet(args.files):
if fast5.is_open:
fq = fast5.get_fastq()
start_time = fast5.get_start_time()
if start_time is None:
logger.warning("No start time for %s!" % (fast5.filename))
fast5.close()
continue
start_times.append(start_time)
if fq is not None:
read_lengths.append(len(fq.seq))
else:
read_lengths.append(0)
fast5.close()
files_processed += 1
if files_processed % 100 == 0:
logger.info("%d files processed." % files_processed)
# sort the data by start time
start_times, read_lengths = (list(t) for t in zip(*sorted(zip(start_times, read_lengths))))
plot_collectors_curve(args, start_times, read_lengths)
|