File: pycoQC_parse.py

Package: pycoqc 2.5.2+dfsg-1 (Debian bullseye)
# -*- coding: utf-8 -*-

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Standard library imports
from collections import Counter, OrderedDict
import warnings

# Third party imports
import numpy as np
import pandas as pd
import pysam as ps

# Local lib import
from pycoQC.common import get_logger, expand_file_names, merge_files_to_df, dict_to_str, pycoQCError

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~GLOBAL SETTINGS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Silence FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class pycoQC_parse ():

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~INIT METHOD~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    def __init__ (self,
        summary_file:str,
        barcode_file:str="",
        bam_file:str="",
        runid_list:list=[],
        filter_calibration:bool=False,
        filter_duplicated:bool=False,
        min_barcode_percent:float=0.1,
        cleanup:bool=True,
        verbose:bool=False,
        quiet:bool=False):
        """
        Parse Albacore sequencing_summary.txt file and clean-up the data
        * summary_file
            Path to the sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / Guppy 2.1.3+ (guppy_basecaller).
            One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
        * barcode_file
            Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. This is not a required file.
            One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
        * bam_file
            Path to a Bam file corresponding to reads in the summary_file. Preferably aligned with Minimap2
            One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
        * runid_list
            Select only specific runids to be analysed. Can also be used to force pycoQC to order the runids for
            temporal plots, if the sequencing_summary file contain several sucessive runs. By default pycoQC analyses
            all the runids in the file and uses the runid order as defined in the file.
        * filter_calibration
            If True read flagged as calibration strand by the software are removed
        * filter_duplicated
            If True duplicated read_ids are removed but the first occurence is kept (Guppy sometimes outputs the same read multiple times)
        * min_barcode_percent
            Minimal percent of total reads to retain barcode label. If below the barcode value is set as `unclassified`.
        """

        # Set logging level
        self.logger = get_logger(name=__name__, verbose=verbose, quiet=quiet)

        # Save self variables
        self.runid_list = runid_list
        self.filter_calibration = filter_calibration
        self.filter_duplicated = filter_duplicated
        self.min_barcode_percent = min_barcode_percent
        self.cleanup = cleanup

        # Init object counter
        self.counter = OrderedDict()

        # Check input files
        self.logger.warning ("Check input data files")

        # Expand file names and test readability
        self.summary_files_list = expand_file_names(summary_file)
        self.logger.debug ("\t\tSequencing summary files found: {}".format(" ".join(self.summary_files_list)))
        self.counter["Summary files found"] = len(self.summary_files_list)

        if barcode_file:
            self.barcode_files_list = expand_file_names(barcode_file)
            self.logger.debug ("\t\tBarcode files found: {}".format(" ".join(self.barcode_files_list)))
            self.counter["Barcode files found"] = len(self.barcode_files_list)
        else:
            self.barcode_files_list =[]

        if bam_file:
            self.bam_file_list = expand_file_names(bam_file, bam_check=True)
            self.logger.debug ("\t\tBam files found: {}".format(" ".join(self.bam_file_list)))
            self.counter["Bam files found"] = len(self.bam_file_list)
        else:
            self.bam_file_list =[]

        self.logger.warning ("Parse data files")
        summary_reads_df = self._parse_summary()
        barcode_reads_df = self._parse_barcode()
        bam_reads_df, self.alignments_df, self.ref_len_dict = self._parse_bam()

        self.logger.warning ("Merge data")
        self.reads_df = self._merge_reads_df(summary_reads_df, barcode_reads_df, bam_reads_df)

        # Cleanup data
        if self.cleanup:
            self.logger.warning("Cleaning data")
            self.reads_df = self._clean_reads_df(self.reads_df)

    def __str__(self):
        return dict_to_str(self.counter)

    def __repr__(self):
        return "[{}]\n".format(self.__class__.__name__)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    def _parse_summary (self):
        """"""
        self.logger.debug ("\tParse summary files")
        df = merge_files_to_df (self.summary_files_list)

        if self.cleanup:
            # Standardise col names for all types of files
            self.logger.debug ("\tRename summary sequencing columns")
            rename_colnames = {
                "sequence_length_template":"read_len",
                "sequence_length_2d":"read_len",
                "sequence_length":"read_len",
                "mean_qscore_template":"mean_qscore",
                "mean_qscore_2d":"mean_qscore",
                "calibration_strand_genome_template":"calibration",
                "barcode_arrangement":"barcode"}
            df = df.rename(columns=rename_colnames)

            # Verify the required and optional columns and drop unused fields
            self.logger.debug ("\tVerifying fields and discarding unused columns")
            df = self._select_df_columns (
                df = df,
                required_colnames = ["read_id", "run_id", "channel", "start_time", "read_len", "mean_qscore"],
                optional_colnames = ["calibration", "barcode"])

        # Collect stats
        n = len(df)
        self.logger.debug ("\t\t{:,} reads found in initial file".format(n))
        self.counter["Initial reads"] = n

        return df

    def _parse_barcode (self):
        """"""
        if not self.barcode_files_list:
            return pd.DataFrame()

        self.logger.debug ("\tParse barcode files")
        df = merge_files_to_df (self.barcode_files_list)

        # Check for the presence of barcode details
        if "read_id" in df and "barcode_arrangement" in df:
            self.logger.debug ("\t\tFound valid Guppy barcode file")
            df = df [["read_id", "barcode_arrangement"]]
            df = df.rename(columns={"barcode_arrangement":"barcode"})

        elif "read_ID" in df and "barcode_call" in df:
            self.logger.debug ("\t\tFound valid Deepbinner barcode file")
            df = df [["read_ID", "barcode_call"]]
            df = df.rename(columns={"read_ID":"read_id", "barcode_call":"barcode"})
            df['barcode'].replace("none", "unclassified", inplace=True)
        else:
            raise pycoQCError ("File {} does not contain required barcode information".format(fp))

        n = len(df[df['barcode']!="unclassified"])
        self.logger.debug ("\t\t{:,} reads with barcodes assigned".format(n))
        self.counter["Reads with barcodes"] = n

        return df
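
    # Expected barcode file columns (illustrative; only the columns checked
    # above are required):
    #   Guppy (guppy_barcoder): read_id, barcode_arrangement
    #   Deepbinner:             read_ID, barcode_call ("none" -> "unclassified")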

    def _parse_bam (self):
        """"""
        if not self.bam_file_list:
            return (pd.DataFrame(), pd.DataFrame(), OrderedDict())

        # Init collections
        ref_len_dict = OrderedDict()
        alignments_dict = Counter()
        read_dict = OrderedDict ()

        for bam_fn in self.bam_file_list:
            with ps.AlignmentFile(bam_fn, "rb") as bam:

                # Save reference lengths information
                for ref_id, ref_len in zip(bam.references, bam.lengths):
                    if not ref_id in ref_len_dict:
                        ref_len_dict[ref_id] = ref_len

                # Parse reads
                for read in bam:
                    if read.is_unmapped:
                        alignments_dict["Unmapped"]+=1
                    elif read.is_secondary:
                        alignments_dict["Secondary"]+=1
                    elif read.is_supplementary:
                        alignments_dict["Suplementary"]+=1
                    elif read.query_name in read_dict:
                        alignments_dict["Duplicated"]+=1
                    else:
                        alignments_dict["Primary"]+=1
                        read_dict[read.query_name] = self._get_read_stats(read)

        # Convert alignments_dict to a DataFrame
        if alignments_dict:
            alignments_df = pd.DataFrame.from_dict(alignments_dict, orient="index")
            alignments_df.reset_index(inplace=True)
            alignments_df.columns=["Alignments", "Counts"]
            alignments_df["Percents"] = (alignments_df["Counts"]/alignments_df["Counts"].sum()*100).round(2)
        else:
            alignments_df = pd.DataFrame()

        # Convert read_dict to df
        if read_dict:
            read_df = pd.DataFrame.from_dict(read_dict, orient="index")
            read_df.index.name="read_id"
            read_df.reset_index(inplace=True)
        else:
            read_df = pd.DataFrame()

        return (read_df, alignments_df, ref_len_dict)
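
    # Illustrative alignments_df content (hypothetical counts):
    #   Alignments     Counts  Percents
    #   Primary          9000     90.00
    #   Unmapped          500      5.00
    #   Secondary         300      3.00
    #   Supplementary     150      1.50
    #   Duplicated         50      0.50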

    def _merge_reads_df(self, summary_reads_df, barcode_reads_df, bam_reads_df):
        """"""
        df = summary_reads_df

        # Merge df and fill in missing barcode values
        if not barcode_reads_df.empty:
            df = pd.merge(df, barcode_reads_df, on="read_id", how="left")
            df['barcode'].fillna('unclassified', inplace=True)

        # Merge df with the per-read alignment information from the BAM file
        if not bam_reads_df.empty:
            df = pd.merge(df, bam_reads_df, on="read_id", how="left")

        return df

    def _clean_reads_df (self, df):
        """"""
        # Drop lines containing NA values
        self.logger.info ("\tDiscarding lines containing NA values")
        l = len(df)
        df = df.dropna(subset=["read_id", "run_id", "channel", "start_time", "read_len", "mean_qscore"])
        n=l-len(df)
        self.logger.info ("\t\t{:,} reads discarded".format(n))
        self.counter["Reads with NA values discarded"] = n
        if len(df) <= 1:
            raise pycoQCError("No valid read left after NA values filtering")

        # Filter out zero length reads
        self.logger.info ("\tFiltering out zero length reads")
        l = len(df)
        df = df[(df["read_len"] > 0)]
        n=l-len(df)
        self.logger.info ("\t\t{:,} reads discarded".format(n))
        self.counter["Zero length reads discarded"] = n
        if len(df) <= 1:
            raise pycoQCError("No valid read left after zero_len filtering")

        # Filter out reads with duplicated read_id
        if self.filter_duplicated:
            self.logger.info ("\tFiltering out duplicated reads")
            l = len(df)
            df = df[~df.duplicated(subset="read_id", keep='first')]
            n=l-len(df)
            self.logger.info ("\t\t{:,} reads discarded".format(n))
            self.counter["Duplicated reads discarded"] = n
            if len(df) <= 1:
                raise pycoQCError("No valid read left after calibration strand filtering")

        # Filter out calibration strand reads if the "calibration_strand_genome_template" field is available
        if self.filter_calibration and "calibration" in df:
            self.logger.info ("\tFiltering out calibration strand reads")
            l = len(df)
            df = df[(df["calibration"].isin(["filtered_out", "no_match", "*"]))]
            n=l-len(df)
            self.logger.info ("\t\t{:,} reads discarded".format(n))
            self.counter["Calibration reads discarded"] = n
            if len(df) <= 1:
                raise pycoQCError("No valid read left after calibration strand filtering")

        # Filter and reorder based on runid_list list if passed by user
        if self.runid_list:
            self.logger.info ("\tSelecting run_ids passed by user")
            l = len(df)
            df = df[(df["run_id"].isin(self.runid_list))]
            n=l-len(df)
            self.logger.debug ("\t\t{:,} reads discarded".format(n))
            self.counter["Excluded runid reads discarded"] = n
            if len(df) <= 1:
                raise pycoQCError("No valid read left after run ID filtering")
            runid_list = self.runid_list

        # Otherwise sort the run IDs by throughput (reads per unit of time), assuming throughput decreases over successive runs
        else:
            self.logger.info ("\tSorting run IDs by decreasing throughput")
            d = {}
            for run_id, sdf in df.groupby("run_id"):
                d[run_id] = len(sdf)/np.ptp(sdf["start_time"])
            runid_list = [i for i, j in sorted (d.items(), key=lambda t: t[1], reverse=True)]
            self.logger.info ("\t\tRun-id order {}".format(runid_list))

        # Offset the start times per run ID so that runs follow the runid_list order
        self.logger.info ("\tReordering runids")
        increment_time = 0
        runid_start = OrderedDict()
        for runid in runid_list:
            self.logger.info ("\t\tProcessing reads with Run_ID {} / time offset: {}".format(runid, increment_time))
            max_val = df['start_time'][df["run_id"] == runid].max()
            df.loc[df["run_id"] == runid, 'start_time'] += increment_time
            runid_start[runid] = increment_time
            increment_time += max_val+1
        df = df.sort_values ("start_time")
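        # Illustrative offsets: if the first run spans 0-3600 s, reads from the
        # second run are shifted by +3601 s, so successive runs appear
        # end-to-end on a single time axis.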

        # Unset low-frequency barcodes
        if "barcode" in df and self.min_barcode_percent:
            self.logger.info ("\tCleaning up low frequency barcodes")
            l = (df["barcode"]=="unclassified").sum()
            barcode_counts = df["barcode"][df["barcode"]!="unclassified"].value_counts()
            cutoff = int(barcode_counts.sum()*self.min_barcode_percent/100)
            low_barcode = barcode_counts[barcode_counts<cutoff].index
            df.loc[df["barcode"].isin(low_barcode), "barcode"] = "unclassified"
            n= int((df["barcode"]=="unclassified").sum()-l)
            self.logger.info ("\t\t{:,} reads with low frequency barcode unset".format(n))
            self.counter["Reads with low frequency barcode unset"] = n

        # Cast values to required types
        self.logger.info ("\tCast value to appropriate type")
        df = df.astype({'channel':"uint16","start_time":"float32","read_len":"uint32","mean_qscore":"float32"})

        # Reindex final df
        self.logger.info ("\tReindexing dataframe by read_ids")
        df = df.reset_index (drop=True)
        df = df.set_index ("read_id")
        self.logger.info ("\t\t{:,} Final valid reads".format(len(df)))

        # Save final df
        self.counter["Valid reads"] = len(df)
        if len(df) < 500:
            self.logger.warning ("WARNING: Low number of reads found. This is likely to lead to errors when trying to generate plots")

        return df

    def _get_read_stats(self, read):
        """"""
        d = OrderedDict()

        # Extract general stats
        d["ref_id"] = read.reference_name
        d["ref_start"] = read.reference_start
        d["ref_end"] = read.reference_end
        d["align_len"] = read.query_alignment_length
        d["mapq"] = read.mapping_quality

        # Extract indel and soft_clip from cigar
        c_stat = read.get_cigar_stats()[0]
        d["insertion"] = c_stat[1]
        d["deletion"] = c_stat[2]
        d["soft_clip"] = c_stat[4]

        # Compute mismatches and identity frequency from the NM tag if available
        if read.has_tag("NM"):
            edit_dist = read.get_tag("NM")
            d["mismatch"] = edit_dist-(d["deletion"]+d["insertion"])
            try:
                d["identity_freq"] = (d["align_len"]-edit_dist)/d["align_len"]
            except ZeroDivisionError:
                d["identity_freq"] = 0

        # If NM is absent, try to compute them from the MD tag
        elif read.has_tag("MD"):
            md_err = 0
            for i in read.get_tag("MD"):
                if i in ["A","T","C","G","a","t","c","g"]:
                    md_err += 1
            d["mismatch"] = md_err-d["deletion"]
            edit_dist = d["mismatch"]+d["insertion"]+d["deletion"]
            try:
                d["identity_freq"] = (d["align_len"]-edit_dist)/d["align_len"]
            except ZeroDivisionError:
                d["identity_freq"] = 0

        return d
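
    # Worked example (hypothetical read): for a 1,000 bp alignment with NM=50,
    # 20 insertions and 10 deletions, mismatch = 50 - (10 + 20) = 20 and
    # identity_freq = (1000 - 50) / 1000 = 0.95.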

    def _select_df_columns(self, df, required_colnames, optional_colnames):
        """"""
        col_found = []
        # Verify the presence of the columns required for pycoQC
        for col in required_colnames:
            if col in df:
                col_found.append(col)
            else:
                raise pycoQCError("Column {} not found in the provided sequence_summary file".format(col))
        for col in optional_colnames:
            if col in df:
                col_found.append(col)

        return df[col_found]