1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~#
# Standard library imports
import argparse
import sys
import textwrap
from pkg_resources import resource_filename
# Third party imports
from jinja2 import Environment, PackageLoader, Template
# Local imports
from pycoQC.pycoQC import pycoQC
from pycoQC.Fast5_to_seq_summary import Fast5_to_seq_summary
from pycoQC.Barcode_split import Barcode_split
from pycoQC.common import get_logger
from pycoQC import __version__ as package_version
from pycoQC import __name__ as package_name
#~~~~~~~~~~~~~~pycoQC CLI ENTRY POINT~~~~~~~~~~~~~~#
def _build_pycoQC_parser():
    """Build and return the argparse parser of the pycoQC command line tool."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent("""
            pycoQC computes metrics and generates interactive QC plots from the sequencing summary
            report generated by Oxford Nanopore technologies basecallers\n
            * Minimal usage
            pycoQC -f sequencing_summary.txt -o pycoQC_output.html
            * Including Guppy barcoding file + html output + json output
            pycoQC -f sequencing_summary.txt -b barcoding_sequencing.txt -o pycoQC_output.html -j pycoQC_output.json
            * Including Bam file + html output
            pycoQC -f sequencing_summary.txt -a alignment.bam -o pycoQC_output.html"""))
    parser.add_argument(
        '--version', action='version',
        version="{} v{}".format(package_name, package_version))

    # Input/output options
    parser_io = parser.add_argument_group('Input/output options')
    parser_io.add_argument(
        "--summary_file", "-f", default=[], nargs='*',
        help=(
            "Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / "
            "Guppy 2.1.3+ (guppy_basecaller). "
            "One can also pass multiple space separated file paths or a UNIX style regex matching "
            "multiple files (Required)"))
    parser_io.add_argument(
        "--barcode_file", "-b", default=[], nargs='*',
        help=(
            "Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. "
            "This is not a required file. "
            "One can also pass multiple space separated file paths or a UNIX style regex matching "
            "multiple files (optional)"))
    parser_io.add_argument(
        "--bam_file", "-a", default=[], nargs='*',
        help=(
            "Path to a Bam file corresponding to reads in the summary_file. Preferably aligned with Minimap2 "
            "One can also pass multiple space separated file paths or a UNIX style regex matching "
            "multiple files (optional)"))
    parser_io.add_argument(
        "--html_outfile", "-o", default="", type=str,
        help="Path to an output html file report (required if json_outfile not given)")
    parser_io.add_argument(
        "--json_outfile", "-j", default="", type=str,
        help="Path to an output json file report (required if html_outfile not given)")

    # Filtering options
    parser_filt = parser.add_argument_group('Filtering options')
    parser_filt.add_argument(
        "--min_pass_qual", default=7, type=float,
        help="Minimum quality to consider a read as 'pass' (default: %(default)s)")
    parser_filt.add_argument(
        "--min_pass_len", default=0, type=int,
        help="Minimum read length to consider a read as 'pass' (default: %(default)s)")
    parser_filt.add_argument(
        "--filter_calibration", default=False, action='store_true',
        help="If given, reads flagged as calibration strand by the basecaller are removed (default: %(default)s)")
    parser_filt.add_argument(
        "--filter_duplicated", default=False, action='store_true',
        help=(
            "If given, duplicated read_ids are removed but the first occurence is kept "
            "(Guppy sometimes outputs the same read multiple times) (default: %(default)s)"))
    parser_filt.add_argument(
        "--min_barcode_percent", default=0.1, type=float,
        help="Minimal percent of total reads to retain barcode label. If below, the barcode value is set as `unclassified` (default: %(default)s)")

    # HTML report options
    parser_html = parser.add_argument_group('HTML report options')
    parser_html.add_argument(
        "--report_title", default="PycoQC report", type=str,
        help="Title to use in the html report (default: %(default)s)")
    parser_html.add_argument(
        "--template_file", type=str, default="",
        help="Jinja2 html template for the html report (default: %(default)s)")
    parser_html.add_argument(
        "--config_file", type=str, default="",
        help=(
            "Path to a JSON configuration file for the html report. "
            "If not provided, looks for it in ~/.pycoQC and ~/.config/pycoQC/config. "
            "If it's still not found, falls back to default parameters. "
            "The first level keys are the names of the plots to be included. "
            "The second level keys are the parameters to pass to each plotting function (default: %(default)s)"))
    parser_html.add_argument(
        "--skip_coverage_plot", default=False, action='store_true',
        help="Skip the coverage plot in HTML report. Useful when using a reference file containing many sequences, i.e. transcriptome (default: %(default)s)")

    # Other options
    parser_other = parser.add_argument_group('Other options')
    parser_other.add_argument(
        "--sample", default=100000, type=int,
        help=(
            "If not None a n number of reads will be randomly selected instead of the entire dataset for ploting function "
            "(deterministic sampling) (default: %(default)s)"))
    parser_other.add_argument(
        "--default_config", "-d", action='store_true',
        help="Print default configuration file. Can be used to generate a template JSON file (default: %(default)s)")

    # --verbose and --quiet cannot be combined
    parser_verbosity = parser.add_mutually_exclusive_group()
    parser_verbosity.add_argument(
        "-v", "--verbose", action="store_true", default=False, help="Increase verbosity")
    parser_verbosity.add_argument(
        "-q", "--quiet", action="store_true", default=False, help="Reduce verbosity")
    return parser


def main_pycoQC(args=None):
    """Command line entry point for pycoQC.

    Parses the command line, validates the option combination and forwards
    the parsed values to `pycoQC`.

    Args:
        args: List of command line tokens (without the program name). When
            None, argparse falls back to `sys.argv[1:]`.

    Exits:
        * status 0 after printing the default configuration (`--default_config`)
        * status 1 when `--summary_file` is missing or when neither
          `--html_outfile` nor `--json_outfile` is given
    """
    parser = _build_pycoQC_parser()
    # Pass `args` through so that programmatic callers are honoured;
    # parse_args(None) reads sys.argv[1:] by itself.
    opts = parser.parse_args(args)

    # Set logging level
    logger = get_logger(name=__name__, verbose=opts.verbose, quiet=opts.quiet)

    # Print the default config parameters and exit
    if opts.default_config:
        config_file = resource_filename("pycoQC", "templates/pycoQC_config.json")
        with open(config_file) as fp:
            sys.stdout.write(fp.read())
        sys.exit()

    # Validate required options; a failed validation is reported with a non-zero status
    if not opts.summary_file:
        logger.warning("ERROR: `--summary_file` is a required argument")
        parser.print_help()
        sys.exit(1)
    if not opts.html_outfile and not opts.json_outfile:
        logger.warning("ERROR: At least one output file required `--html_outfile` or `--json_outfile`")
        parser.print_help()
        sys.exit(1)

    # Run pycoQC
    pycoQC(
        summary_file=opts.summary_file,
        barcode_file=opts.barcode_file,
        bam_file=opts.bam_file,
        filter_calibration=opts.filter_calibration,
        filter_duplicated=opts.filter_duplicated,
        min_barcode_percent=opts.min_barcode_percent,
        min_pass_qual=opts.min_pass_qual,
        min_pass_len=opts.min_pass_len,
        sample=opts.sample,
        html_outfile=opts.html_outfile,
        report_title=opts.report_title,
        config_file=opts.config_file,
        skip_coverage_plot=opts.skip_coverage_plot,
        template_file=opts.template_file,
        json_outfile=opts.json_outfile,
        verbose=opts.verbose,
        quiet=opts.quiet)
#~~~~~~~~~~~~~~Fast5_to_seq_summary CLI ENTRY POINT~~~~~~~~~~~~~~#
def main_Fast5_to_seq_summary(args=None):
    """Command line entry point for Fast5_to_seq_summary.

    Parses the command line and forwards the parsed values to
    `Fast5_to_seq_summary`.

    Args:
        args: List of command line tokens (without the program name). When
            None, argparse falls back to `sys.argv[1:]`.
    """
    # Define parser object
    parser = argparse.ArgumentParser(
        description="Fast5_to_seq_summary generate a sequencing summary like file from a directory containing Fast5 files")
    parser.add_argument(
        '--version', '-v', action='version',
        version="{} v{}".format(package_name, package_version))

    # Define arguments
    parser.add_argument(
        "--fast5_dir", "-f", required=True, type=str,
        help="Directory containing fast5 files. Can contain multiple subdirectories")
    parser.add_argument(
        "--seq_summary_fn", "-s", required=True, type=str,
        help="path of the summary sequencing file where to write the data extracted from the fast5 files")
    parser.add_argument(
        "--max_fast5", type=int, default=0,
        help="Maximum number of file to try to parse. 0 to deactivate (default: %(default)s)")
    parser.add_argument(
        "--threads", "-t", type=int, default=4,
        help="Total number of threads to use. 1 thread is used for the reader and 1 for the writer. Minimum 3 (default: %(default)s)")
    parser.add_argument(
        "--basecall_id", type=int, default=0,
        help=(
            "id of the basecalling group. By default leave to 0, but if you perfome multiple basecalling on the same fast5 files, "
            "this can be used to indicate the corresponding group (1, 2 ...) (default: %(default)s)"))
    parser.add_argument(
        "--fields", type=str, nargs="+",
        default=[
            "read_id", "run_id", "channel", "start_time", "sequence_length_template",
            "mean_qscore_template", "calibration_strand_genome_template", "barcode_arrangement"],
        help="list of field names corresponding to attributes to try to fetch from the fast5 files (default: %(default)s)")
    parser.add_argument(
        "--include_path", action='store_true', default=False,
        help="If given, the absolute path to the corresponding file is added in an extra column (default: %(default)s)")
    parser.add_argument(
        "--verbose_level", type=int, default=0,
        help="Level of verbosity, from 2 (Chatty) to 0 (Nothing) (default: %(default)s)")

    # Pass `args` through so that programmatic callers are honoured;
    # parse_args(None) reads sys.argv[1:] by itself.
    opts = parser.parse_args(args)

    # Run main function
    Fast5_to_seq_summary(
        fast5_dir=opts.fast5_dir,
        seq_summary_fn=opts.seq_summary_fn,
        max_fast5=opts.max_fast5,
        threads=opts.threads,
        basecall_id=opts.basecall_id,
        fields=opts.fields,
        include_path=opts.include_path,
        verbose_level=opts.verbose_level)
#~~~~~~~~~~~~~~Barcode_split CLI ENTRY POINT~~~~~~~~~~~~~~#
def main_Barcode_split(args=None):
    """Command line entry point for Barcode_split.

    Parses the command line and forwards the parsed values to
    `Barcode_split`.

    Args:
        args: List of command line tokens (without the program name). When
            None, argparse falls back to `sys.argv[1:]`.
    """
    # Define parser object
    parser = argparse.ArgumentParser(
        description="Barcode_split is a simple tool to split sequencing summary report in per barcodes")
    parser.add_argument(
        '--version', action='version',
        version="{} v{}".format(package_name, package_version))

    # Define arguments
    parser.add_argument(
        "--summary_file", "-f", required=True, nargs='*',
        help=(
            "Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / "
            "Guppy 2.1.3+ (guppy_basecaller). "
            "One can also pass multiple space separated file paths or a UNIX style regex matching multiple files"))
    parser.add_argument(
        "--barcode_file", "-b", default=[], nargs='*',
        help=(
            "Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. "
            "One can also pass multiple space separated file paths or a UNIX style regex matching multiple files"))
    parser.add_argument(
        "--output_dir", "-o", type=str, default="",
        help="Folder where to output split barcode data (default: current dir)")
    parser.add_argument(
        "--output_unclassified", "-u", action='store_true', default=False,
        help="If given, unclassified barcodes are also written in a file. By default they are skiped")
    parser.add_argument(
        "--min_barcode_percent", "-p", default=0.1, type=float,
        help="Minimal percent of total reads to retain barcode label. If below, the barcode value is set as `unclassified` (default: %(default)s)")

    # --verbose and --quiet cannot be combined
    parser_verbosity = parser.add_mutually_exclusive_group()
    parser_verbosity.add_argument(
        "-v", "--verbose", action="store_true", default=False, help="Increase verbosity")
    parser_verbosity.add_argument(
        "-q", "--quiet", action="store_true", default=False, help="Reduce verbosity")

    # Pass `args` through so that programmatic callers are honoured;
    # parse_args(None) reads sys.argv[1:] by itself.
    opts = parser.parse_args(args)

    # Run main function
    Barcode_split(
        summary_file=opts.summary_file,
        barcode_file=opts.barcode_file,
        output_dir=opts.output_dir,
        output_unclassified=opts.output_unclassified,
        min_barcode_percent=opts.min_barcode_percent,
        verbose=opts.verbose,
        quiet=opts.quiet)
|