File: fastQ_to_fastA.pl

package info (click to toggle)
trinityrnaseq 2.15.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 468,004 kB
  • sloc: perl: 49,905; cpp: 17,993; java: 12,489; python: 3,282; sh: 1,989; ansic: 985; makefile: 717; xml: 62
file content (119 lines) | stat: -rwxr-xr-x 3,048 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env perl

use strict;
use warnings;

use lib ("/usr/lib/trinityrnaseq/PerlLib");
use Nuc_translator;
use IO::Uncompress::Gunzip;

use Getopt::Long qw(:config no_ignore_case bundling);


my $usage = <<_EOUSAGE_;

##########################################################
#
#  -I <string>     input.fq  or "input1.fq,input2.fq,input3.fq"
#
#  --ignoreDirty   ignores poorly formed entries
#
#  -a <int>        append "/num" to the accession name.
#
#  --rev           reverse complement nucleotide sequence.
#
###########################################################

_EOUSAGE_
	;

my $inputFile;
my $ignore_dirty = 0;
my $append_num;
my $revcomp_flag = 0;

&GetOptions( 'I=s'          => \$inputFile,
             'ignore_dirty' => \$ignore_dirty,
             'a=i'          => \$append_num,
             'rev'          => \$revcomp_flag,
	);


unless ($inputFile) {
	die $usage;
}

main: {
    my @files = split(/,/, $inputFile);
    foreach my $file (@files) {
        $file =~ s/\s//g;
        if ($file =~ /\w/) {
            &fastQ_to_fastA($file);
        }
    }
    exit(0);
}


sub fastQ_to_fastA {
    my ($file) = @_;

    my $fh = new IO::Uncompress::Gunzip($file) or die "Error, cannot open file $file";

    my $counter = 0;
    my $num_clean = 0;
    my $num_dirty = 0;

    while (my $line = <$fh>) {
        $line =~ s/\cM//g; # remove any cntrl-M characters (sometimes derived from MS-windows text files)

        if ($line =~ /^\@/) {
            $counter++;

            # print STDERR "\r[$counter] [$num_clean clean] [$num_dirty dirty]       " if ($counter % 10000 == 0);

            my $header = $line;
            my $seq = <$fh>;
            my $qual_header = <$fh>;
            my $qual_line   = <$fh>;

            chomp $header;
            chomp $seq if $seq;
            chomp $qual_header if $qual_header;
            chomp $qual_line if $qual_line;

            if ($header && $seq && $qual_header && $qual_line =~ /\S/ &&
                $qual_header =~ /^\+/ && length($seq) == length($qual_line)) {

                # can do some more checks here if needed to be sure that the lines are formatted as expected.

                substr($header,0,1,''); # strip beginning "@"

                my @header_parts = split(/\s+/, $header);
                $header = shift @header_parts;

                if (@header_parts && $header !~ m|/[12]$| && $header_parts[0] =~ /^([12])\:/) {
                    $header .= "/$1";
                }
                if (defined $append_num) {
                    $header .= "/$append_num";
                }
                if ($revcomp_flag) {
                    $seq = &reverse_complement($seq);
                }

                print ">$header\n$seq\n";
                $num_clean++;
            }
            else {
                $num_dirty++;
                unless ($ignore_dirty) {
                    die "Error, improperly formatted entry:\n\n$header\n$seq\n$qual_header\n$qual_line\n";
                }

            }
        }
    }

    return;
}