1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
|
# -*- coding: utf-8 -*-
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Standard library imports
from collections import *
import warnings
import datetime
import os
# Third party imports
import pandas as pd
# Local lib import
from pycoQC import __name__ as package_name
from pycoQC import __version__ as package_version
from pycoQC.common import *
from pycoQC.pycoQC_parse import pycoQC_parse
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~GLOBAL SETTINGS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Silence futurewarnings
warnings.filterwarnings("ignore", category=FutureWarning)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
def Barcode_split (
summary_file:str,
barcode_file:str="",
output_dir:str="",
output_unclassified:bool=False,
min_barcode_percent:float=0.1,
verbose:bool=False,
quiet:bool=False):
"""
Parse Albacore sequencing_summary.txt file and split per barcode
By default, data for low frequency barcodes and unclassified reads are not written in the output directory
* summary_file
Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / Guppy 2.1.3+ (guppy_basecaller).
One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
* barcode_file
Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. This is not a required file.
One can also pass multiple space separated file paths or a UNIX style regex matching multiple files
* output_dir
Folder where to output split barcode data
* output_unclassified
If True unclassified barcodes are also written in a file. By default they are skiped
* min_barcode_percent
Minimal percent of total reads to write barcode reads in file.
* verbose
Increase verbosity
* quiet
Reduce verbosity
"""
# Save args and init options in dict for report
options_d = locals()
info_d = {"package_name":package_name, "package_version":package_version, "timestamp":str(datetime.datetime.now())}
# Set logging level
logger = get_logger (name=__name__, verbose=verbose, quiet=quiet)
# Print debug info
logger.debug("General info")
logger.debug(dict_to_str(info_d))
logger.debug("Runtime options")
logger.debug(dict_to_str(options_d))
# Process data
logger.warning ("Import data from sequencing summary file(s) and cleanup")
logger.info ("\tRead files and import in a dataframe")
pps = pycoQC_parse(summary_file=summary_file, barcode_file=barcode_file, cleanup=False, verbose=verbose, quiet=quiet)
df = pps.reads_df
# Rename barcode field and check if present
df = df.rename(columns={"barcode_arrangement":"barcode"})
if not "barcode" in df:
raise pycoQCError ("No barcode information found in provided file(s)")
# Quick data cleanup
logger.info ("\tCleanup missing barcodes values")
df = df.fillna({"barcode":"unclassified"})
# List barcode with low frequency
if min_barcode_percent:
logger.info ("\tCleaning up low frequency barcodes")
barcode_counts = df["barcode"][df["barcode"]!="unclassified"].value_counts()
cutoff = int(barcode_counts.sum()*min_barcode_percent/100)
low_barcode = barcode_counts[barcode_counts<cutoff].index
# df.loc[df["barcode"].isin(low_barcode), "barcode"] = "unclassified"
bc = namedtuple ("bc", ["Counts", "Write"])
bc_l = OrderedDict()
logger.warning ("Split data per barcode")
for barcode, barcode_df in df.groupby("barcode"):
logger.info ("\tProcessing data for Barcode {}".format(barcode))
# Skip unclassified if required
if not output_unclassified and barcode == "unclassified":
bc_l[barcode] = bc(len(barcode_df), False)
# Skip low frequency if required
elif min_barcode_percent and barcode in low_barcode:
bc_l[barcode] = bc(len(barcode_df), False)
# Cleanup and write barcode to file
else:
bc_l[barcode] = bc(len(barcode_df), True)
# Remove any field containing barcode in it
barcode_df = barcode_df[[i for i in df.columns if not i.startswith("barcode")]]
# Write to file
file_name = os.path.join(output_dir, "sequencing_summary_{}.txt".format(barcode))
barcode_df.to_csv(file_name, sep="\t", index=False)
logger.info("Barcode Counts")
bc_df = pd.DataFrame.from_dict(bc_l, orient="index")
logger.info(bc_df)
|