File: Barcode_split.py

package info (click to toggle)
pycoqc 2.5.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 98,704 kB
  • sloc: python: 2,295; sh: 165; makefile: 5
file content (117 lines) | stat: -rw-r--r-- 4,621 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Standard library imports
from collections import *
import warnings
import datetime
import os

# Third party imports
import pandas as pd

# Local lib import
from pycoQC import __name__ as package_name
from pycoQC import __version__ as package_version
from pycoQC.common import *
from pycoQC.pycoQC_parse import pycoQC_parse

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~GLOBAL SETTINGS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Silence futurewarnings
warnings.filterwarnings("ignore", category=FutureWarning)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
def Barcode_split (
    summary_file:str,
    barcode_file:str="",
    output_dir:str="",
    output_unclassified:bool=False,
    min_barcode_percent:float=0.1,
    verbose:bool=False,
    quiet:bool=False):
    """
    Parse Albacore sequencing_summary.txt file and split per barcode
    By default, data for low frequency barcodes and unclassified reads are not written in the output directory
    * summary_file
        Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / Guppy 2.1.3+ (guppy_basecaller).
        One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
    * barcode_file
        Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. This is not a required file.
        One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
    * output_dir
        Folder where to output split barcode data
    * output_unclassified
        If True unclassified barcodes are also written in a file. By default they are skiped
    * min_barcode_percent
        Minimal percent of total reads to write barcode reads in file.
    * verbose
        Increase verbosity
    * quiet
        Reduce verbosity
    """

    # Save args and init options in dict for report
    options_d = locals()
    info_d = {"package_name":package_name, "package_version":package_version, "timestamp":str(datetime.datetime.now())}

    # Set logging level
    logger = get_logger (name=__name__, verbose=verbose, quiet=quiet)

    # Print debug info
    logger.debug("General info")
    logger.debug(dict_to_str(info_d))
    logger.debug("Runtime options")
    logger.debug(dict_to_str(options_d))

    # Process data
    logger.warning ("Import data from sequencing summary file(s) and cleanup")
    logger.info ("\tRead files and import in a dataframe")
    pps = pycoQC_parse(summary_file=summary_file, barcode_file=barcode_file, cleanup=False, verbose=verbose, quiet=quiet)
    df = pps.reads_df

    # Rename barcode field and check if present
    df = df.rename(columns={"barcode_arrangement":"barcode"})
    if not "barcode" in df:
        raise pycoQCError ("No barcode information found in provided file(s)")

    # Quick data cleanup
    logger.info ("\tCleanup missing barcodes values")
    df = df.fillna({"barcode":"unclassified"})

    # List barcode with low frequency
    if min_barcode_percent:
        logger.info ("\tCleaning up low frequency barcodes")
        barcode_counts = df["barcode"][df["barcode"]!="unclassified"].value_counts()
        cutoff = int(barcode_counts.sum()*min_barcode_percent/100)
        low_barcode = barcode_counts[barcode_counts<cutoff].index
        # df.loc[df["barcode"].isin(low_barcode), "barcode"] = "unclassified"

    bc = namedtuple ("bc", ["Counts", "Write"])
    bc_l = OrderedDict()

    logger.warning ("Split data per barcode")
    for barcode, barcode_df in df.groupby("barcode"):
        logger.info ("\tProcessing data for Barcode {}".format(barcode))

        # Skip unclassified if required
        if not output_unclassified and barcode == "unclassified":
            bc_l[barcode] = bc(len(barcode_df), False)

        # Skip low frequency if required
        elif min_barcode_percent and barcode in low_barcode:
            bc_l[barcode] = bc(len(barcode_df), False)

        # Cleanup and write barcode to file
        else:
            bc_l[barcode] = bc(len(barcode_df), True)
            # Remove any field containing barcode in it
            barcode_df = barcode_df[[i for i in df.columns if not i.startswith("barcode")]]
            # Write to file
            file_name = os.path.join(output_dir, "sequencing_summary_{}.txt".format(barcode))
            barcode_df.to_csv(file_name, sep="\t", index=False)

    logger.info("Barcode Counts")
    bc_df = pd.DataFrame.from_dict(bc_l, orient="index")
    logger.info(bc_df)