File: eland2sga.pl

package info (click to toggle)
chip-seq 1.5.5-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 21,320 kB
  • sloc: ansic: 10,053; perl: 1,493; makefile: 103; sh: 91
file content (119 lines) | stat: -rwxr-xr-x 4,403 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/local/bin/perl 

# transform ChIP-seq eland mappings via hash variable into sorted sga
# usage: eland2sga.pl  -a feature -s species -f eland_file

use strict;
use Storable qw (retrieve nstore);         # package to store persistently variables in files [http://search.cpan.org/author/AMS/Storable-2.07/Storable.pm]
use Getopt::Std;

my %opts;
getopt('asf:', \%opts);  # -a, -s, & -f take arg. Values in %opts, Hash keys will be the switch names

die("\nusage: eland2sga.pl  -a feature -s species -f eland_file\n\n") unless (($opts{'f'})&&($opts{'a'})&&($opts{'s'}));

# hash defining chromosome from SV of chromosomes in current genome assemblies
my $DB = "/usr/share/chip-seq/";
#my $DB = "/home/local/db/genome/";
my $chr2AC = retrieve($DB."chro_idx.nstorage");
# eventually add older assemblies?

my $species=$opts{'s'};

die ("Sorry, $species is not a supported species!\n\n") unless ($chr2AC->{$species});


my ($score_ref);
open(FI, "$opts{'f'}") or die ("Cannot open $opts{'f'}: $!\n") ;
    while(<FI>){
    my $lin=$_;
    chomp $lin;
    my @ar=split(/\t/,$lin);
    $ar[6] =~ s/^.+hromosome\.//;
    $ar[6] =~ s/\.fa$//;
#    my ($chr)= $ar[6] =~ /hromosome.([0-9RLYX]+).fa/;
    my $chr = 'chr'.$ar[6];
    if (exists($chr2AC->{$species}->{$chr})){
# attribute each tag to its start position and increment counter (multiple column formats)
# >EAS38_3_1_526_564      TGTTCTCTGCATGCTATTTTTTAGTATTGCCGTAA     U0      1       0       0       Mus_musculus.NCBIM37.48.dna.chromosome.15.fa    15876793        R       ..
#.bed:           chr12   119248213       119248236       U0      0       +
	if ($ar[8] eq 'F'){
	    $score_ref->{$chr2AC->{$species}->{$chr}}->{'+'}->{$ar[7]} +=1;
	}
	elsif ($ar[8] eq 'R'){
	    my $pos=$ar[7] + length($ar[1]);
	    $score_ref->{$chr2AC->{$species}->{$chr}}->{'-'}->{$pos} +=1;
	}
	else{
	    warn "parsing error of orientation in following line:\n$lin\n";
	}
# attribute counts to its center position and record in hash (force into numeric)
#		$score_ref->{$chr2AC{$ar[0]}}->{int(($ar[1]+$ar[2])/2)}=eval($ar[3]);
    }
    else{
#	warn "parsing error of chromosome number in following line:\n$lin\n";
    }
}
close(FI);



#nstore (\$score_ref, "/scratch/frt/cschmid/EMO-4_bed_ori.nstorage");  

my @chr;
my $spec='T';   # flag
foreach my $ac(keys %$score_ref){
    unless (($spec eq 'T')||($spec eq $chr2AC->{'assembly'}->{$ac})){
            die "The file $opts{'f'} contains entries from both $spec and from ",$chr2AC->{'assembly'}->{$ac}," (chromosome $ac)\n";
    }
    $species = $chr2AC->{'assembly'}->{$ac};
    $chr2AC->{$ac} =~ s/chr//;
    push(@chr, $chr2AC->{$ac});
}
# order chromosome array as in sga files
my @chrb = sort {$a<=>$b} @chr;
my (@spl,$sc);
foreach $sc('X','Y'){
   if ($chrb[0] eq $sc){
      push(@spl, shift(@chrb));
   }
}
push(@chrb,@spl);

foreach my $chro(@chrb){

# foreach position with data, print out counts in sga format:
#   versioned genome sequence ID
#   feature type
#   position
#   strand (+, -, or 0 for un-oriented features)
#   tag count (or signal intensity, positive int)

    my @arr=keys %{$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}};
    push @arr,keys %{$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}};

    my %double; # hash to control for positions with tags in both ori
    foreach my $pos(sort {$a <=> $b} @arr){
        if ((exists($score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}->{$pos}))&&(!$double{$pos})){
             print $chr2AC->{$species}->{'chr'.$chro},"\t$opts{'a'}\t$pos\t+\t",$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}->{$pos},"\n";
             $double{$pos}='T';
        }
        elsif (exists($score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}->{$pos})){
             print $chr2AC->{$species}->{'chr'.$chro},"\t$opts{'a'}\t$pos\t-\t",$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}->{$pos},"\n";
        }
        else{
             print "trouble for position $pos on chro 'chr'.$chro: no tags in either orientation??\n";
             die;
        }
    }
# read length of chromosome from /db/genome/eukaryote.ptr
    my $line = `grep $chr2AC->{$species}->{'chr'.$chro} /db/genome/eukaryote.ptr`;
    my @ar = split(/\s+/, $line);
    print "$ar[0]\tEND\t$ar[2]\t0\t1\n";
# grep 'Mm/ch' /db/genome/eukaryote.ptr| perl -ane 'if (/^\w+\_\d+\.\d+\s+/){@ar=split/\s+/;print "$ar[0]\tEND\t$ar[2]\t0\t1\n";}'
}


1;