1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pauvre - just a pore PhD student's plotting package
# Copyright (c) 2016-2017 Darrin T. Schultz. All rights reserved.
#
# This file is part of pauvre.
#
# pauvre is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pauvre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with pauvre. If not, see <http://www.gnu.org/licenses/>.
# TODO: make a function to nicely print out the pandas dataframes
"""
Program: pauvre stats
example usage/output:
- For example, if I want to know the number of bases and reads with AT LEAST
PHRED score 5 and AT LEAST length of 500, I can run the program as you
see below and look at the cells highlighted with <braces>.
- We find that there are 968,653 basepairs contained in 859 reads that
fit meanReadQuality >= Q5, readLen >= 500.
> pauvre marginplot --fastq miniDSMN15.fastq
>numReads: 1000
>numBasepairs: 1029114
>meanLen: 1029.114
>medianLen: 875.5
>minLen: 11
>maxLen: 5337
>N50: 1278
>L50: 296
> Basepairs >= bin by mean PHRED and length
>minLen Q0 Q5 Q10 Q15 Q17.5 Q20 Q21.5 Q25 Q25.5 Q30
> 0 1029114 1010681 935366 429279 143948 25139 3668 2938 2000 0
> 500 984212 <968653> 904787 421307 142003 24417 3668 2938 2000 0
> 1000 659842 649319 616788 300948 103122 17251 2000 2000 2000 0
> et cetera...
> Number of reads >= bin by mean Phred+Len
>minLen Q0 Q5 Q10 Q15 Q17.5 Q20 Q21.5 Q25 Q25.5 Q30
> 0 1000 969 865 366 118 22 3 2 1 0
> 500 873 <859> 789 347 113 20 3 2 1 0
> 1000 424 418 396 187 62 11 1 1 1 0
> et cetera...
"""
from pauvre.functions import parse_fastq_length_meanqual
import os
import pandas as pd
import numpy as np
def stats(df, fastqName, histogram=False):
"""
arguments:
<df>
- a pandas dataframe with cols ['length', 'meanQual'] that contains read
length and quality information
<fastqName>
- just the name of the fastq file. This is used for printing out headers
for the data tables.
purpose:
Calculate and output various stats about this fastq file. Currently
this program reports:
- Total number of reads
- Total number of basepairs
- mean
- median
- min
- max
- N50
- A table of num basepairs filtered by min mean PHRED and length
- A table of num reads filtered by the same as above ^
example usage/output:
- For example, if I want to know the number of bases and reads with AT LEAST
PHRED score 5 and AT LEAST length of 500, I can run the program as you
see below and look at the cells highlighted with <braces>.
- We find that there are 968,653 basepairs contained in 859 reads that
fit meanReadQuality >= Q5, readLen >= 500.
> pauvre marginplot --fastq miniDSMN15.fastq
>numReads: 1000
>numBasepairs: 1029114
>meanLen: 1029.114
>medianLen: 875.5
>minLen: 11
>maxLen: 5337
>N50: 1278
>L50: 296
> Basepairs >= bin by mean PHRED and length
>minLen Q0 Q5 Q10 Q15 Q17.5 Q20 Q21.5 Q25 Q25.5 Q30
> 0 1029114 1010681 935366 429279 143948 25139 3668 2938 2000 0
> 500 984212 <968653> 904787 421307 142003 24417 3668 2938 2000 0
> 1000 659842 649319 616788 300948 103122 17251 2000 2000 2000 0
> et cetera...
> Number of reads >= bin by mean Phred+Len
>minLen Q0 Q5 Q10 Q15 Q17.5 Q20 Q21.5 Q25 Q25.5 Q30
> 0 1000 969 865 366 118 22 3 2 1 0
> 500 873 <859> 789 347 113 20 3 2 1 0
> 1000 424 418 396 187 62 11 1 1 1 0
> et cetera...
"""
fastqBase = os.path.basename(fastqName)
analysis_sizes = [0, 1000, 5000, 10000]
print_string = ""
for this_size in analysis_sizes:
these_lengths = df.loc[df["length"] >= this_size, "length"]
if len(these_lengths) > 0:
print_string += "# Fastq stats for {}, reads >= {}bp\n".format(fastqBase, this_size)
print_string += "numReads: {}\n".format(len(these_lengths))
print_string += "%totalNumReads: {0:.2f}\n".format((len(these_lengths) / len(df)) * 100)
print_string += "numBasepairs: {}\n".format(sum(these_lengths))
print_string += "%totalBasepairs: {0:.2f}\n".format(
(sum(these_lengths) / sum(df["length"])) * 100)
print_string += "meanLen: {}\n".format(np.mean(these_lengths))
print_string += "medianLen: {}\n".format(np.median(these_lengths))
print_string += "minLen: {}\n".format(min(these_lengths))
maxLen = max(these_lengths)
print_string += "maxLen: {}\n".format(maxLen)
# calculate the N50
fiftypercent = 0.5 * sum(these_lengths)
lenSum = 0
count = 0
for val in sorted(these_lengths, reverse=True):
lenSum += val
count += 1
if lenSum >= fiftypercent:
print_string += "N50: {}\n".format(int(val))
print_string += "L50: {}\n".format(count)
break
print_string += "\n"
print_string += lengthQual_table(df)
if histogram: # now make a histogram with read lengths
histogram_lengths(df["length"], fastqBase.split('.')[0])
print(print_string)
def lengthQual_table(df):
"""Create a table with lengths/basepairs on columns and qualities on rows"""
# This block calculates the number of length bins for this data set
lengthBinList = []
size_map = [(1000, 250),
(10000, 500),
(40000, 1000),
(100000, 5000),
(500000, 20000),
(1000000, 50000),
(10000000000, 100000)]
# first, figure out where we will start the table
minlen = min(df["length"])
current_val = 0
firstDone = False
for this_max_size, this_size_step in size_map:
for this_bin in range(current_val, this_max_size, this_size_step):
if minlen < this_bin:
if not firstDone:
lengthBinList.append(prev)
firstDone = True
lengthBinList.append(this_bin)
prev = this_bin
current_val = this_max_size
# now figure out the largest bin
maxLen = df["length"].max()
first_index_gt_maxLen = next(i for i, v in enumerate(lengthBinList) if v > maxLen) + 1
lengthBinList = lengthBinList[0:first_index_gt_maxLen]
qualBinList = []
increment_by = 1
while len(qualBinList) == 0 or len(qualBinList) > 15:
# now set up the bins for mean PHRED
minQual = int(np.floor(min(df["meanQual"])))
maxQual = int(np.ceil(max(df["meanQual"])))
qualBinList = list(np.arange(minQual, maxQual + increment_by, increment_by))
increment_by += 0.25
# now make a table of read lengths
bpTots = []
readnumTots = []
for row in range(len(lengthBinList)):
dataNums = []
readNums = []
for column in range(len(qualBinList)):
thisQuery = df.query("length >= {} and meanQual >= {}".format(
lengthBinList[row], qualBinList[column]))
dataNums.append(sum(thisQuery['length']))
readNums.append(len(thisQuery.index))
bpTots.append(dataNums)
readnumTots.append(readNums)
tables = {"Basepairs >= bin by mean PHRED and length": bpTots,
"Number of reads >= bin by mean Phred+Len": readnumTots}
print_table = ""
for key in sorted(tables):
# make a dataframe of our basepair distribution table
dataDf = pd.DataFrame(tables[key], columns=["Q{}".format(x) for x in qualBinList])
# add the min lengths as a column
dataDf.insert(0, 'minLen', lengthBinList)
print_table += pretty_print_table(dataDf, key)
return print_table
def histogram_lengths(length, name_prefix):
"""Create a histogram of read counts per length."""
counts = length.value_counts().to_frame(name="readCount")
counts.index.rename('readLen', inplace=True)
counts.sort_index(inplace=True)
counts.to_csv("{}.hist.csv".format(name_prefix), index=True)
def pretty_print_table(df, title):
print_string = ""
dataframeStr = df.to_string(index=False)
# this is the char width of the whole table printed
lendataframeStr = len(dataframeStr.split('\n')[0])
# this is the char width of the minlen portion of the printed table
minLenLen = len(dataframeStr.split()[0])
blank = " " * minLenLen
# center the text on this offset as the table title
txtoffset = lendataframeStr - minLenLen
print_string += "\n{}{:^{offset}}\n".format(
blank, title, offset=txtoffset)
print_string += dataframeStr + "\n"
return print_string
def run(args):
"""This just opens the fastq file and passes the info to the stats() function.
This is a wrapper function that is accessed by pauvre_main.
Useful since we can call stats() independently from other pauvre programs."""
df = parse_fastq_length_meanqual(args.fastq)
stats(df, args.fastq, histogram=args.histogram)
|