File: __main__.py

package info (click to toggle)
pycoqc 2.5.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 98,704 kB
  • sloc: python: 2,295; sh: 165; makefile: 5
file content (215 lines) | stat: -rw-r--r-- 12,573 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~#
# Standard library imports
import argparse
import sys
import textwrap
from pkg_resources import resource_filename

# Third party imports
from jinja2 import Environment, PackageLoader, Template

# Local imports
from pycoQC.pycoQC import pycoQC
from pycoQC.Fast5_to_seq_summary import Fast5_to_seq_summary
from pycoQC.Barcode_split import Barcode_split
from pycoQC.common import get_logger
from pycoQC import __version__ as package_version
from pycoQC import __name__ as package_name


#~~~~~~~~~~~~~~pycoQC CLI ENTRY POINT~~~~~~~~~~~~~~#
def main_pycoQC (args=None):
    """
    Command line entry point for pycoQC.
    Builds the argument parser, checks that the mandatory options were given and
    forwards the parsed values to `pycoQC` as keyword arguments.
    * args
        List of command line tokens to parse. If None, `sys.argv[1:]` is used.
    The function exits through `sys.exit` in 3 situations:
    * `--default_config` was given: the packaged default JSON configuration is written to stdout (exit status 0)
    * no `--summary_file` was given (exit status 1)
    * neither `--html_outfile` nor `--json_outfile` was given (exit status 1)
    """
    if args is None:
        args = sys.argv[1:]

    # Define parser object
    parser = argparse.ArgumentParser(
        formatter_class = argparse.RawDescriptionHelpFormatter,
        description = textwrap.dedent("""
            pycoQC computes metrics and generates interactive QC plots from the sequencing summary
            report generated by Oxford Nanopore technologies basecallers\n
            * Minimal usage
                pycoQC -f sequencing_summary.txt -o pycoQC_output.html
            * Including Guppy barcoding file + html output + json output
                pycoQC -f sequencing_summary.txt -b barcoding_sequencing.txt -o pycoQC_output.html -j pycoQC_output.json
            * Including Bam file + html output
                pycoQC -f sequencing_summary.txt -a alignment.bam -o pycoQC_output.html"""))
    parser.add_argument('--version', action='version', version="{} v{}".format(package_name, package_version))

    # Define arguments
    parser_io = parser.add_argument_group('Input/output options')
    parser_io.add_argument("--summary_file", "-f", default=[], nargs='*',
        help=textwrap.dedent("""Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / Guppy 2.1.3+ (guppy_basecaller).
            One can also pass multiple space separated file paths or a UNIX style regex matching multiple files (Required)"""))
    parser_io.add_argument("--barcode_file", "-b", default=[], nargs='*',
        help=textwrap.dedent("""Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+. This is not a required file.
        One can also pass multiple space separated file paths or a UNIX style regex matching multiple files (optional)"""))
    parser_io.add_argument("--bam_file", "-a", default=[], nargs='*',
        help=textwrap.dedent("""Path to a Bam file corresponding to reads in the summary_file. Preferably aligned with Minimap2
          One can also pass multiple space separated file paths or a UNIX style regex matching multiple files (optional)"""))
    parser_io.add_argument("--html_outfile", "-o", default="", type=str,
        help="Path to an output html file report (required if json_outfile not given)")
    parser_io.add_argument("--json_outfile", "-j", default="", type=str,
        help="Path to an output json file report (required if html_outfile not given)")
    parser_filt = parser.add_argument_group('Filtering options')
    parser_filt.add_argument("--min_pass_qual", default=7, type=float,
        help="Minimum quality to consider a read as 'pass' (default: %(default)s)")
    parser_filt.add_argument("--min_pass_len", default=0, type=int,
        help="Minimum read length to consider a read as 'pass' (default: %(default)s)")
    parser_filt.add_argument("--filter_calibration", default=False, action='store_true',
        help="If given, reads flagged as calibration strand by the basecaller are removed (default: %(default)s)")
    parser_filt.add_argument("--filter_duplicated", default=False, action='store_true',
        help=textwrap.dedent("""If given, duplicated read_ids are removed but the first occurence is kept
        (Guppy sometimes outputs the same read multiple times) (default: %(default)s)"""))
    parser_filt.add_argument("--min_barcode_percent", default=0.1, type=float,
        help="Minimal percent of total reads to retain barcode label. If below, the barcode value is set as `unclassified` (default: %(default)s)")
    parser_html = parser.add_argument_group('HTML report options')
    parser_html.add_argument("--report_title", default="PycoQC report", type=str,
        help="Title to use in the html report (default: %(default)s)")
    parser_html.add_argument("--template_file", type=str, default="",
        help="Jinja2 html template for the html report (default: %(default)s)")
    parser_html.add_argument("--config_file", type=str, default="",
        help=textwrap.dedent("""Path to a JSON configuration file for the html report.
            If not provided, looks for it in ~/.pycoQC and ~/.config/pycoQC/config. If it's still not found, falls back to default parameters.
            The first level keys are the names of the plots to be included.
            The second level keys are the parameters to pass to each plotting function (default: %(default)s)"""))
    parser_html.add_argument("--skip_coverage_plot", default=False, action='store_true',
        help="Skip the coverage plot in HTML report. Useful when using a reference file containing many sequences, i.e. transcriptome (default: %(default)s)")
    parser_other = parser.add_argument_group('Other options')
    parser_other.add_argument("--sample", default=100000, type=int,
        help=textwrap.dedent("""If not None a n number of reads will be randomly selected instead of the entire dataset for ploting function
        (deterministic sampling) (default: %(default)s)"""))
    parser_other.add_argument("--default_config", "-d", action='store_true',
        help="Print default configuration file. Can be used to generate a template JSON file (default: %(default)s)")
    # -v and -q cannot be combined
    parser_verbosity = parser.add_mutually_exclusive_group()
    parser_verbosity.add_argument("-v", "--verbose", action="store_true", default=False, help="Increase verbosity")
    parser_verbosity.add_argument("-q", "--quiet", action="store_true", default=False, help="Reduce verbosity")

    # Parse the token list resolved above instead of implicitly re-reading sys.argv,
    # so that a list explicitly passed by the caller is honoured
    args = parser.parse_args(args)

    # Set logging level
    logger = get_logger (name=__name__, verbose=args.verbose, quiet=args.quiet)

    # Print the default config parameters and exit
    if args.default_config:
        config_file = resource_filename("pycoQC", "templates/pycoQC_config.json")
        with open (config_file) as fp:
            sys.stdout.write(fp.read())
        sys.exit()

    # The summary file is declared optional in the parser so that --default_config
    # can be used alone. Its presence is checked by hand here
    elif not args.summary_file:
        logger.warning ("ERROR: `--summary_file` is a required argument")
        parser.print_help()
        # Non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    elif not args.html_outfile and not args.json_outfile:
        logger.warning ("ERROR: At least one output file required `--html_outfile` or `--json_outfile`")
        parser.print_help()
        # Non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    # Run pycoQC
    pycoQC (
        summary_file = args.summary_file,
        barcode_file = args.barcode_file,
        bam_file = args.bam_file,
        filter_calibration = args.filter_calibration,
        filter_duplicated = args.filter_duplicated,
        min_barcode_percent = args.min_barcode_percent,
        min_pass_qual = args.min_pass_qual,
        min_pass_len = args.min_pass_len,
        sample = args.sample,
        html_outfile = args.html_outfile,
        report_title = args.report_title,
        config_file = args.config_file,
        skip_coverage_plot = args.skip_coverage_plot,
        template_file = args.template_file,
        json_outfile = args.json_outfile,
        verbose = args.verbose,
        quiet = args.quiet)

#~~~~~~~~~~~~~~Fast5_to_seq_summary CLI ENTRY POINT~~~~~~~~~~~~~~#
def main_Fast5_to_seq_summary (args=None):
    """
    Command line entry point for Fast5_to_seq_summary.
    Builds the argument parser and forwards the parsed values to
    `Fast5_to_seq_summary` as keyword arguments.
    * args
        List of command line tokens to parse. If None, `sys.argv[1:]` is used.
    Note that for this command `-v` is the short form of `--version`, and
    verbosity is controlled by the integer `--verbose_level` option.
    """
    if args is None:
        args = sys.argv[1:]

    # Define parser object
    parser = argparse.ArgumentParser(
        description ="Fast5_to_seq_summary generate a sequencing summary like file from a directory containing Fast5 files")
    parser.add_argument('--version', '-v', action='version', version="{} v{}".format(package_name, package_version))
    # Define arguments
    parser.add_argument("--fast5_dir", "-f", required=True, type=str,
        help="""Directory containing fast5 files. Can contain multiple subdirectories""")
    parser.add_argument("--seq_summary_fn", "-s", required=True, type=str,
        help="""path of the summary sequencing file where to write the data extracted from the fast5 files""")
    parser.add_argument("--max_fast5", type=int, default=0,
        help="Maximum number of file to try to parse. 0 to deactivate (default: %(default)s)")
    parser.add_argument("--threads", "-t", type=int, default=4,
        help="Total number of threads to use. 1 thread is used for the reader and 1 for the writer. Minimum 3 (default: %(default)s)")
    parser.add_argument("--basecall_id", type=int, default=0,
        help=textwrap.dedent("""id of the basecalling group. By default leave to 0, but if you perfome multiple basecalling on the same fast5 files,
        this can be used to indicate the corresponding group (1, 2 ...) (default: %(default)s)"""))
    parser.add_argument("--fields", type=str, nargs="+",
        default=["read_id", "run_id", "channel", "start_time", "sequence_length_template",
        "mean_qscore_template", "calibration_strand_genome_template", "barcode_arrangement"],
        help="list of field names corresponding to attributes to try to fetch from the fast5 files (default: %(default)s)")
    parser.add_argument("--include_path", action='store_true', default=False,
        help="If given, the absolute path to the corresponding file is added in an extra column (default: %(default)s)")
    parser.add_argument("--verbose_level", type=int, default=0,
        help="Level of verbosity, from 2 (Chatty) to 0 (Nothing) (default: %(default)s)")

    # Parse the token list resolved above instead of implicitly re-reading sys.argv,
    # so that a list explicitly passed by the caller is honoured
    args = parser.parse_args(args)

    # Run main function
    Fast5_to_seq_summary (
        fast5_dir = args.fast5_dir,
        seq_summary_fn = args.seq_summary_fn,
        max_fast5 = args.max_fast5,
        threads = args.threads,
        basecall_id = args.basecall_id,
        fields = args.fields,
        include_path = args.include_path,
        verbose_level = args.verbose_level)

#~~~~~~~~~~~~~~Barcode_split CLI ENTRY POINT~~~~~~~~~~~~~~#
def main_Barcode_split (args=None):
    """
    Command line entry point for Barcode_split.
    Builds the argument parser and forwards the parsed values to
    `Barcode_split` as keyword arguments.
    * args
        List of command line tokens to parse. If None, `sys.argv[1:]` is used.
    """
    if args is None:
        args = sys.argv[1:]

    # Define parser object
    parser = argparse.ArgumentParser(
        description ="Barcode_split is a simple tool to split sequencing summary report in per barcodes")
    parser.add_argument('--version', action='version', version="{} v{}".format(package_name, package_version))
    # Define arguments
    parser.add_argument("--summary_file", "-f", required=True, nargs='*',
        help=textwrap.dedent("""Path to a sequencing_summary generated by Albacore 1.0.0 + (read_fast5_basecaller.py) / Guppy 2.1.3+ (guppy_basecaller).
        One can also pass multiple space separated file paths or a UNIX style regex matching multiple files"""))
    parser.add_argument("--barcode_file", "-b", default=[], nargs='*',
        help=textwrap.dedent("""Path to the barcode_file generated by Guppy 2.1.3+ (guppy_barcoder) or Deepbinner 0.2.0+.
        One can also pass multiple space separated file paths or a UNIX style regex matching multiple files"""))
    parser.add_argument("--output_dir", "-o", type=str, default="",
        help="Folder where to output split barcode data (default: current dir)")
    parser.add_argument("--output_unclassified", "-u", action='store_true', default=False,
        help="If given, unclassified barcodes are also written in a file. By default they are skiped")
    parser.add_argument("--min_barcode_percent", "-p", default=0.1, type=float,
        help="Minimal percent of total reads to retain barcode label. If below, the barcode value is set as `unclassified` (default: %(default)s)")
    # -v and -q cannot be combined
    parser_verbosity = parser.add_mutually_exclusive_group()
    parser_verbosity.add_argument("-v", "--verbose", action="store_true", default=False, help="Increase verbosity")
    parser_verbosity.add_argument("-q", "--quiet", action="store_true", default=False, help="Reduce verbosity")

    # Parse the token list resolved above instead of implicitly re-reading sys.argv,
    # so that a list explicitly passed by the caller is honoured
    args = parser.parse_args(args)

    # Run main function
    Barcode_split (
        summary_file=args.summary_file,
        barcode_file=args.barcode_file,
        output_dir=args.output_dir,
        output_unclassified=args.output_unclassified,
        min_barcode_percent=args.min_barcode_percent,
        verbose=args.verbose,
        quiet=args.quiet)