File: combined_nameSorted_to_dup_pairs_removed.pl

package info (click to toggle)
trinityrnaseq 2.2.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 212,452 kB
  • ctags: 5,067
  • sloc: perl: 45,552; cpp: 19,678; java: 11,865; sh: 1,485; makefile: 613; ansic: 427; python: 313; xml: 83
file content (122 lines) | stat: -rwxr-xr-x 2,798 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env perl

use strict;
use warnings;

my $usage = "usage: $0 combined_nameSorted.sam\n\n";

my $combined_nameSorted_sam = $ARGV[0] or die $usage;


main: {

    my $conglom_file = "$combined_nameSorted_sam.__conglom_tmp";
    
    unless (-s $conglom_file) {
        
        open (my $ofh, ">$conglom_file") or die "Error, cannot write to $conglom_file";
        
        my @paired;
        
        my $prev_acc = "";
        
        open (my $fh, $combined_nameSorted_sam) or die $!;
        while (<$fh>) {
            chomp;
            my @x = split(/\t/);
            if ($x[0] ne $prev_acc) {
                if (scalar(@paired) == 2) {
                    &process_pair($ofh, \@paired);
                }
                @paired = ();
            }
            $prev_acc = $x[0];
            push (@paired, [@x]);
        }
        
        ## get last ones
        if (scalar(@paired) == 2) {
            &process_pair($ofh, \@paired);
        }
        
        close $ofh;
    }

    
    
    my $cmd = "sort -k1,1 -k2,2n -k3,3 -k4,4n $conglom_file > $conglom_file.sorted";
    &process_cmd($cmd) unless (-s "$conglom_file.sorted");

    my $dups_removed_file = "$conglom_file.dups_removed.sam";
    unless (-s $dups_removed_file) {
        open (my $ofh, ">$dups_removed_file") or die $!;
        open (my $fh, "$conglom_file.sorted") or die $!;
        print STDERR "-writing $dups_removed_file\n";
        my $prev_contig_info = "";
        while (<$fh>) {
            chomp;
            my @x = split(/\t/);
            my $contig_info = join("\t", @x[0..3]);
            
            my $read_A_info = $x[4];
            my $read_B_info = $x[5];
            if ($contig_info ne $prev_contig_info) {
                $read_A_info =~ s/$;/\t/g;
                $read_B_info =~ s/$;/\t/g;

                print $ofh "$read_A_info\n$read_B_info\n";
            }
            $prev_contig_info = $contig_info;
        }
        close $fh;
        close $ofh;
    }
    
    
    
    exit(0);
    

}


####
sub process_pair {
    my ($ofh, $pairs_aref) = @_;
    
    my ($read_A, $read_B) = @$pairs_aref;
    
    if ($read_A->[2] gt $read_B->[2]) {
        ## swap 'em
        ($read_A, $read_B) = ($read_B, $read_A);
    }
    
    my $contig_A = $read_A->[2];
    my $pos_A = $read_A->[3];

    my $contig_B = $read_B->[2];
    my $pos_B = $read_B->[3];
    
    my $read_A_text = join("$;", @$read_A);
    my $read_B_text = join("$;", @$read_B);

    print $ofh join("\t", $contig_A, $pos_A, $contig_B, $pos_B, $read_A_text, $read_B_text) . "\n";
    
    return;
    
}


####
sub process_cmd {
    my ($cmd) = @_;

    print STDERR "CMD: $cmd\n";
    my $ret = system($cmd);

    if ($ret) { 
        die "Error, cmd: $cmd died with ret $ret";
    }

    return;
}