File: stats.py

package info (click to toggle)
python-pauvre 0.2.3-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 616 kB
  • sloc: python: 3,947; sh: 44; makefile: 20
file content (250 lines) | stat: -rw-r--r-- 9,968 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# pauvre - just a pore PhD student's plotting package
# Copyright (c) 2016-2017 Darrin T. Schultz. All rights reserved.
#
# This file is part of pauvre.
#
# pauvre is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pauvre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with pauvre.  If not, see <http://www.gnu.org/licenses/>.

# TODO: make a function to nicely print out the pandas dataframes

"""
Program: pauvre stats

example usage/output:
 - For example, if I want to know the number of bases and reads with AT LEAST
   PHRED score 5 and AT LEAST length of 500, I can run the program as you
   see below and look at the cells highlighted with <braces>.
 - We find that there are 968,653 basepairs contained in 859 reads that
   fit meanReadQuality >= Q5, readLen >= 500.
 > pauvre marginplot --fastq miniDSMN15.fastq

 >numReads: 1000
 >numBasepairs: 1029114
 >meanLen: 1029.114
 >medianLen: 875.5
 >minLen: 11
 >maxLen: 5337
 >N50: 1278
 >L50: 296

 >                      Basepairs >= bin by mean PHRED and length
 >minLen       Q0       Q5     Q10     Q15   Q17.5    Q20  Q21.5   Q25  Q25.5  Q30
 >     0  1029114  1010681  935366  429279  143948  25139   3668  2938   2000    0
 >   500   984212  <968653> 904787  421307  142003  24417   3668  2938   2000    0
 >  1000   659842   649319  616788  300948  103122  17251   2000  2000   2000    0
 > et cetera...
 >              Number of reads >= bin by mean Phred+Len
 >minLen    Q0   Q5  Q10  Q15  Q17.5  Q20  Q21.5  Q25  Q25.5  Q30
 >     0  1000  969  865  366    118   22      3    2      1    0
 >   500   873 <859> 789  347    113   20      3    2      1    0
 >  1000   424  418  396  187     62   11      1    1      1    0
 > et cetera...
"""


from pauvre.functions import parse_fastq_length_meanqual
import os
import pandas as pd
import numpy as np


def stats(df, fastqName, histogram=False):
    """
    arguments:
     <df>
      - a pandas dataframe with cols ['length', 'meanQual'] that contains read
         length and quality information
     <fastqName>
      - just the name of the fastq file. This is used for printing out headers
         for the data tables.

    purpose:
     Calculate and output various stats about this fastq file. Currently
     this program reports:
       - Total number of reads
       - Total number of basepairs
       - mean
       - median
       - min
       - max
       - N50
       - A table of num basepairs filtered by min mean PHRED and length
       - A table of num reads filtered by the same as above ^

    example usage/output:
     - For example, if I want to know the number of bases and reads with AT LEAST
       PHRED score 5 and AT LEAST length of 500, I can run the program as you
       see below and look at the cells highlighted with <braces>.
     - We find that there are 968,653 basepairs contained in 859 reads that
       fit meanReadQuality >= Q5, readLen >= 500.
     > pauvre marginplot --fastq miniDSMN15.fastq

     >numReads: 1000
     >numBasepairs: 1029114
     >meanLen: 1029.114
     >medianLen: 875.5
     >minLen: 11
     >maxLen: 5337
     >N50: 1278
     >L50: 296

     >                      Basepairs >= bin by mean PHRED and length
     >minLen       Q0       Q5     Q10     Q15   Q17.5    Q20  Q21.5   Q25  Q25.5  Q30
     >     0  1029114  1010681  935366  429279  143948  25139   3668  2938   2000    0
     >   500   984212  <968653> 904787  421307  142003  24417   3668  2938   2000    0
     >  1000   659842   649319  616788  300948  103122  17251   2000  2000   2000    0
     > et cetera...
     >              Number of reads >= bin by mean Phred+Len
     >minLen    Q0   Q5  Q10  Q15  Q17.5  Q20  Q21.5  Q25  Q25.5  Q30
     >     0  1000  969  865  366    118   22      3    2      1    0
     >   500   873 <859> 789  347    113   20      3    2      1    0
     >  1000   424  418  396  187     62   11      1    1      1    0
     > et cetera...
    """
    fastqBase = os.path.basename(fastqName)

    analysis_sizes = [0, 1000, 5000, 10000]
    print_string = ""
    for this_size in analysis_sizes:
        these_lengths = df.loc[df["length"] >= this_size, "length"]
        if len(these_lengths) > 0:
            print_string += "# Fastq stats for {}, reads >= {}bp\n".format(fastqBase, this_size)
            print_string += "numReads: {}\n".format(len(these_lengths))
            print_string += "%totalNumReads: {0:.2f}\n".format((len(these_lengths) / len(df)) * 100)
            print_string += "numBasepairs: {}\n".format(sum(these_lengths))
            print_string += "%totalBasepairs: {0:.2f}\n".format(
                (sum(these_lengths) / sum(df["length"])) * 100)
            print_string += "meanLen: {}\n".format(np.mean(these_lengths))
            print_string += "medianLen: {}\n".format(np.median(these_lengths))
            print_string += "minLen: {}\n".format(min(these_lengths))
            maxLen = max(these_lengths)
            print_string += "maxLen: {}\n".format(maxLen)

            # calculate the N50
            fiftypercent = 0.5 * sum(these_lengths)
            lenSum = 0
            count = 0
            for val in sorted(these_lengths, reverse=True):
                lenSum += val
                count += 1
                if lenSum >= fiftypercent:
                    print_string += "N50: {}\n".format(int(val))
                    print_string += "L50: {}\n".format(count)
                    break
            print_string += "\n"

    print_string += lengthQual_table(df)

    if histogram:  # now make a histogram with read lengths
        histogram_lengths(df["length"], fastqBase.split('.')[0])
    print(print_string)


def lengthQual_table(df):
    """Create a table with lengths/basepairs on columns and qualities on rows"""
    # This block calculates the number of length bins for this data set
    lengthBinList = []
    size_map = [(1000, 250),
                (10000, 500),
                (40000, 1000),
                (100000, 5000),
                (500000, 20000),
                (1000000, 50000),
                (10000000000, 100000)]
    # first, figure out where we will start the table
    minlen = min(df["length"])
    current_val = 0
    firstDone = False
    for this_max_size, this_size_step in size_map:
        for this_bin in range(current_val, this_max_size, this_size_step):
            if minlen < this_bin:
                if not firstDone:
                    lengthBinList.append(prev)
                    firstDone = True
                lengthBinList.append(this_bin)
            prev = this_bin
        current_val = this_max_size
    # now figure out the largest bin
    maxLen = df["length"].max()
    first_index_gt_maxLen = next(i for i, v in enumerate(lengthBinList) if v > maxLen) + 1
    lengthBinList = lengthBinList[0:first_index_gt_maxLen]

    qualBinList = []
    increment_by = 1
    while len(qualBinList) == 0 or len(qualBinList) > 15:
        # now set up the bins for mean PHRED
        minQual = int(np.floor(min(df["meanQual"])))
        maxQual = int(np.ceil(max(df["meanQual"])))
        qualBinList = list(np.arange(minQual, maxQual + increment_by, increment_by))
        increment_by += 0.25

    # now make a table of read lengths
    bpTots = []
    readnumTots = []
    for row in range(len(lengthBinList)):
        dataNums = []
        readNums = []
        for column in range(len(qualBinList)):
            thisQuery = df.query("length >= {} and meanQual >= {}".format(
                lengthBinList[row], qualBinList[column]))
            dataNums.append(sum(thisQuery['length']))
            readNums.append(len(thisQuery.index))
        bpTots.append(dataNums)
        readnumTots.append(readNums)

    tables = {"Basepairs >= bin by mean PHRED and length": bpTots,
              "Number of reads >= bin by mean Phred+Len": readnumTots}
    print_table = ""
    for key in sorted(tables):
        # make a dataframe of our basepair distribution table
        dataDf = pd.DataFrame(tables[key], columns=["Q{}".format(x) for x in qualBinList])
        # add the min lengths as a column
        dataDf.insert(0, 'minLen', lengthBinList)
        print_table += pretty_print_table(dataDf, key)
    return print_table


def histogram_lengths(length, name_prefix):
    """Create a histogram of read counts per length."""
    counts = length.value_counts().to_frame(name="readCount")
    counts.index.rename('readLen', inplace=True)
    counts.sort_index(inplace=True)
    counts.to_csv("{}.hist.csv".format(name_prefix), index=True)


def pretty_print_table(df, title):
    print_string = ""
    dataframeStr = df.to_string(index=False)
    # this is the char width of the whole table printed
    lendataframeStr = len(dataframeStr.split('\n')[0])
    # this is the char width of the minlen portion of the printed table
    minLenLen = len(dataframeStr.split()[0])
    blank = " " * minLenLen
    # center the text on this offset as the table title
    txtoffset = lendataframeStr - minLenLen
    print_string += "\n{}{:^{offset}}\n".format(
        blank, title, offset=txtoffset)
    print_string += dataframeStr + "\n"
    return print_string


def run(args):
    """This just opens the fastq file and passes the info to the stats() function.
    This is a wrapper function that is accessed by pauvre_main.
    Useful since we can call stats() independently from other pauvre programs."""
    df = parse_fastq_length_meanqual(args.fastq)
    stats(df, args.fastq, histogram=args.histogram)