File: fastq_masker.c

package info (click to toggle)
fastx-toolkit 0.0.14-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,304 kB
  • sloc: ansic: 1,802; xml: 1,503; cpp: 1,393; sh: 525; perl: 360; makefile: 304
file content (124 lines) | stat: -rw-r--r-- 4,061 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
    Copyright (C) 2009-2013  A. Gordon (assafgordon@gmail.com)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <errno.h>
#include <err.h>

#include <config.h>

#include "fastx.h"
#include "fastx_args.h"

/*
 * Command-line help text for fastq_masker.
 * Not referenced anywhere else in this file; presumably consumed by the
 * shared command-line parser (fastx_args) -- confirm there.
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are needed.
 */
const char *usage =
	"usage: fastq_masker [-h] [-v] [-q N] [-r C] [-z] [-i INFILE] [-o OUTFILE]\n"
	"Part of " PACKAGE_STRING " by A. Gordon (assafgordon@gmail.com)\n"
	"\n"
	"   [-h]         = This helpful help screen.\n"
	"   [-q N]       = Quality threshold - nucleotides with lower quality will be masked\n"
	"                  Default is 10.\n"
	"   [-r C]       = Replace low-quality nucleotides with character C. Default is 'N'\n"
	"   [-z]         = Compress output with GZIP.\n"
	"   [-i INFILE]  = FASTQ input file. default is STDIN.\n"
	"   [-o OUTFILE] = FASTQ output file. default is STDOUT.\n"
	"   [-v]         = Verbose - report number of sequences.\n"
	"                  If [-o] is specified,  report will be printed to STDOUT.\n"
	"                  If [-o] is not specified (and output goes to STDOUT),\n"
	"                  report will be printed to STDERR.\n"
	"   [-Q N]       = FASTQ ASCII offset. Default is 33.\n"
	"\n";

/* Quality cutoff set by -q: positions whose quality is below it get masked. */
int min_quality_threshold = 10;

/* Replacement character set by -r, written over each masked position. */
char mask_character = 'N';

/* Shared FASTX state: main() initialises both the reader and the writer on it. */
FASTX fastx;

/*
 * Handles the options specific to fastq_masker. main() passes this function
 * to fastx_parse_cmdline() together with the option string "q:r:".
 *
 *   optind - unused.
 *   optc   - option character to process ('q' or 'r').
 *   optarg - argument text supplied for that option.
 *
 * 'q' stores the quality threshold in min_quality_threshold. The value must
 *     be a complete decimal integer in the range [-40, INT_MAX].
 * 'r' stores the masking character in mask_character. The value must be
 *     exactly one character long.
 *
 * Returns 1 on success. A missing or invalid value, or an unknown option,
 * terminates the program through errx() with exit status 1.
 */
int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
{
	switch(optc) {
	case 'q': {
		char *end = NULL;
		long value;

		if (optarg==NULL)
			errx(1, "[-q] parameter requires an argument value");

		/* strtol (unlike atoi) lets us reject non-numeric text,
		   trailing garbage and out-of-range values. */
		errno = 0;
		value = strtol(optarg, &end, 10);
		if (end==optarg || *end!='\0' || errno==ERANGE
		    || value<-40 || value>INT_MAX)
			errx(1,"Invalid minimum quality value (-q %s)", optarg);

		min_quality_threshold = (int)value;
		break;
	}

	case 'r':
		if (optarg==NULL)
			errx(1, "[-r] parameter requires an argument value");
		if (strlen(optarg)!=1)
			errx(1, "[-r] parameter requires a single character as value");
		mask_character = optarg[0];
		break;

	default:
		errx(1, __FILE__ ":%d: Unknown argument (%c)", __LINE__, optc ) ;
	}
	return 1;
}

/*
 * Program entry point.
 *
 * Parses the command line, opens the FASTQ reader and writer on the global
 * `fastx` object, then for every input record replaces each nucleotide whose
 * quality is below min_quality_threshold with mask_character and writes the
 * record out. With the verbose flag set, a summary is printed to the report
 * file.
 *
 * Always returns 0.
 */
int main(int argc, char* argv[])
{
	size_t masked_reads_count=0;
	size_t masked_nucleotides_count=0;

	fastx_parse_cmdline(argc, argv, "q:r:", parse_program_args);

	fastx_init_reader(&fastx, get_input_filename(),
		FASTQ_ONLY, ALLOW_N, REQUIRE_UPPERCASE,
		get_fastq_ascii_quality_offset() );

	fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());

	while ( fastx_read_next_record(&fastx) ) {

		/* Measure the sequence once per record instead of on every
		   loop iteration. The length cannot change while masking:
		   mask_character is never NUL (the -r handler requires a
		   one-character argument). */
		const size_t length = strlen(fastx.nucleotides);
		int masked = 0;

		//Scan the sequence from first to last position
		for ( size_t i=0; i<length; ++i ) {
			if ( fastx.quality[i] < min_quality_threshold ) {
				fastx.nucleotides[i] = mask_character ;
				masked = 1;
				++masked_nucleotides_count;
			}
		}
		//NOTE(review): get_reads_count() presumably returns how many
		//reads this record stands for - confirm in fastx.h
		if (masked)
			masked_reads_count += get_reads_count(&fastx);

		fastx_write_record(&fastx);
	}

	//Print verbose report
	if ( verbose_flag() ) {
		fprintf(get_report_file(), "Minimum Quality Threshold: %d\n", min_quality_threshold);
		fprintf(get_report_file(), "Low-quality nucleotides replaced with '%c'\n", mask_character);

		fprintf(get_report_file(), "Input: %zu reads.\n", num_input_reads(&fastx) ) ;
		fprintf(get_report_file(), "Output: %zu reads.\n", num_output_reads(&fastx) ) ;

		fprintf(get_report_file(), "Masked reads: %zu\n", masked_reads_count ) ;
		fprintf(get_report_file(), "Masked nucleotides: %zu\n", masked_nucleotides_count ) ;
	}

	return 0;
}