1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
|
#!/usr/local/bin/perl
# transform ChIP-seq eland mappings via hash variable into sorted sga
# usage: eland2sga.pl -a feature -s species -f eland_file
use strict;
use Storable qw (retrieve nstore); # package to store persistently variables in files [http://search.cpan.org/author/AMS/Storable-2.07/Storable.pm]
use Getopt::Std;
my %opts;
getopt('asf:', \%opts); # -a, -s, & -f take arg. Values in %opts, Hash keys will be the switch names
die("\nusage: eland2sga.pl -a feature -s species -f eland_file\n\n") unless (($opts{'f'})&&($opts{'a'})&&($opts{'s'}));
# hash defining chromosome from SV of chromosomes in current genome assemblies
my $DB = "/usr/share/chip-seq/";
#my $DB = "/home/local/db/genome/";
my $chr2AC = retrieve($DB."chro_idx.nstorage");
# eventually add older assemblies?
my $species=$opts{'s'};
die ("Sorry, $species is not a supported species!\n\n") unless ($chr2AC->{$species});
my ($score_ref);
open(FI, "$opts{'f'}") or die ("Cannot open $opts{'f'}: $!\n") ;
while(<FI>){
my $lin=$_;
chomp $lin;
my @ar=split(/\t/,$lin);
$ar[6] =~ s/^.+hromosome\.//;
$ar[6] =~ s/\.fa$//;
# my ($chr)= $ar[6] =~ /hromosome.([0-9RLYX]+).fa/;
my $chr = 'chr'.$ar[6];
if (exists($chr2AC->{$species}->{$chr})){
# attribute each tag to its start position and increment counter (multiple column formats)
# >EAS38_3_1_526_564 TGTTCTCTGCATGCTATTTTTTAGTATTGCCGTAA U0 1 0 0 Mus_musculus.NCBIM37.48.dna.chromosome.15.fa 15876793 R ..
#.bed: chr12 119248213 119248236 U0 0 +
if ($ar[8] eq 'F'){
$score_ref->{$chr2AC->{$species}->{$chr}}->{'+'}->{$ar[7]} +=1;
}
elsif ($ar[8] eq 'R'){
my $pos=$ar[7] + length($ar[1]);
$score_ref->{$chr2AC->{$species}->{$chr}}->{'-'}->{$pos} +=1;
}
else{
warn "parsing error of orientation in following line:\n$lin\n";
}
# attribute counts to its center position and record in hash (force into numeric)
# $score_ref->{$chr2AC{$ar[0]}}->{int(($ar[1]+$ar[2])/2)}=eval($ar[3]);
}
else{
# warn "parsing error of chromosome number in following line:\n$lin\n";
}
}
close(FI);
#nstore (\$score_ref, "/scratch/frt/cschmid/EMO-4_bed_ori.nstorage");
my @chr;
my $spec='T'; # flag
foreach my $ac(keys %$score_ref){
unless (($spec eq 'T')||($spec eq $chr2AC->{'assembly'}->{$ac})){
die "The file $opts{'f'} contains entries from both $spec and from ",$chr2AC->{'assembly'}->{$ac}," (chromosome $ac)\n";
}
$species = $chr2AC->{'assembly'}->{$ac};
$chr2AC->{$ac} =~ s/chr//;
push(@chr, $chr2AC->{$ac});
}
# order chromosome array as in sga files
my @chrb = sort {$a<=>$b} @chr;
my (@spl,$sc);
foreach $sc('X','Y'){
if ($chrb[0] eq $sc){
push(@spl, shift(@chrb));
}
}
push(@chrb,@spl);
foreach my $chro(@chrb){
# foreach position with data, print out counts in sga format:
# versioned genome sequence ID
# feature type
# position
# strand (+, -, or 0 for un-oriented features)
# tag count (or signal intensity, positive int)
my @arr=keys %{$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}};
push @arr,keys %{$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}};
my %double; # hash to control for positions with tags in both ori
foreach my $pos(sort {$a <=> $b} @arr){
if ((exists($score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}->{$pos}))&&(!$double{$pos})){
print $chr2AC->{$species}->{'chr'.$chro},"\t$opts{'a'}\t$pos\t+\t",$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'+'}->{$pos},"\n";
$double{$pos}='T';
}
elsif (exists($score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}->{$pos})){
print $chr2AC->{$species}->{'chr'.$chro},"\t$opts{'a'}\t$pos\t-\t",$score_ref->{$chr2AC->{$species}->{'chr'.$chro}}->{'-'}->{$pos},"\n";
}
else{
print "trouble for position $pos on chro 'chr'.$chro: no tags in either orientation??\n";
die;
}
}
# read length of chromosome from /db/genome/eukaryote.ptr
my $line = `grep $chr2AC->{$species}->{'chr'.$chro} /db/genome/eukaryote.ptr`;
my @ar = split(/\s+/, $line);
print "$ar[0]\tEND\t$ar[2]\t0\t1\n";
# grep 'Mm/ch' /db/genome/eukaryote.ptr| perl -ane 'if (/^\w+\_\d+\.\d+\s+/){@ar=split/\s+/;print "$ar[0]\tEND\t$ar[2]\t0\t1\n";}'
}
1;
|