1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
|
#!/usr/bin/perl
#
# convert phastcons track to a file with hints to AUGUSTUS
#
# input file format:
# chromosome begin end name score
# e.g.
# chr1 21102591 21102633 lod=35 321
#
# Each phastcons interval gives rise to 5 CDSpart hints of different regins.
# If the score is above a threshold and the CDSpart interval is not a the ends
# then the grade is 2, otherwise the grade is 1. This can be used in the
# extrinsic.cfg file to consider the internal parts more reliable than the regions at the
# margins of the conserved region.
#
# Mario Stanke, mstanke@gwdg.de
use strict;
use Getopt::Long;
my ($usage, $priority, $grp);
my ($chrom, $name, $chromStart, $blockCount, $blockSizes, $chromStarts, $score, $len, $grade);
my $source = 'X';
my $cutends = 5;
my $marginA = 10;
my $marginB = 10;
my $minscore = 260;
my $thresh = 400;
my $gradeoffset = 1; # to use together with exoniphy (grade 0)
#
# raw region: --------------------------------------------------------------------------------------
# cutends | marginA | marginB | rest | marginB | marginA| cutends
# | CDSpart| CDSpart | CDSpart | CDSpart | CDSpart|
# if raw region is shorter than 2(cutends+marginA+marginB), scale down to all 1/5
#
#default values
$priority = 4;
$usage .= "$0 -- convert phastcons conserved regions to hints\n";
$usage .= "Usage: $0 [--priority=p --minscore=a --source=s --cutends=n]\n";
$usage .= "reads from stdin writes to stdout.\n";
$usage .= "takes input in tab separated format:\n";
$usage .= "chrom\tstart\tend\tname\tscore\n";
GetOptions('priority:i'=>\$priority,
'source:s' =>\$source,
'minscore:i' =>\$minscore,
'cutends:i' =>\$cutends);
my ($from, $to, $newfrom, $newto);
while (<>) {
chomp;
my @f = split /\t/, $_;
my @p;
next unless (@f > 4);
$chrom = $f[0];
$name = $f[3];
$score = $f[4];
$from = $f[1];
$to = $f[2];
$len = $to - $from;
next unless ($score >= $minscore);
next unless ($len-2*$cutends>0);
if ($len > 2*($cutends+$marginA+$marginB)){
@p = ($from+$cutends, $from+$cutends+$marginA, $from+$cutends+$marginA+$marginB, $to - $marginB-$marginA-$cutends, $to-$marginA-$cutends, $to-$cutends);
} else {
my $fifth = int(($len-2*$cutends)/5);
@p = ($from+$cutends, $from+$cutends+$fifth, $from+$cutends+2*$fifth, $to-$cutends-2*$fifth, $to-$cutends-$fifth, $to-$cutends);
}
for (my $i=0; $i<@p-1; $i++){
$newfrom = $p[$i];
$newto = $p[$i+1]-1;
if ($i != 0 && $i != 4 && $score > $thresh) {
$grade = 2;
} else {
$grade = 1;
}
print "$chrom\tphast\tCDSpart\t$newfrom\t$newto\t$grade\t\.\t.\tsrc=$source;pri=$priority\n";
}
}
|