File: phastconsDB2hints.pl

package info (click to toggle)
augustus 3.3.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 486,188 kB
  • sloc: cpp: 51,969; perl: 20,926; ansic: 1,251; makefile: 935; python: 120; sh: 118
file content (85 lines) | stat: -rwxr-xr-x 2,761 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/perl
#
# convert phastcons track to a file with hints to AUGUSTUS
# 
# input file format:
# chromosome  begin  end  name  score
# e.g.
# chr1    21102591        21102633        lod=35  321
#
# Each phastcons interval gives rise to 5 CDSpart hints of different regins. 
# If the score is above a threshold and the CDSpart interval is not a the ends
# then the grade is 2, otherwise the grade is 1. This can be used in the 
# extrinsic.cfg file to consider the internal parts more reliable than the regions at the 
# margins of the conserved region.
#
# Mario Stanke, mstanke@gwdg.de

use strict;
use Getopt::Long;
my ($usage, $priority, $grp);
my ($chrom, $name, $chromStart, $blockCount, $blockSizes, $chromStarts, $score, $len, $grade);
my $source = 'X';
my $cutends = 5;
my $marginA = 10;
my $marginB = 10;
my $minscore = 260;
my $thresh = 400;
my $gradeoffset = 1; # to use together with exoniphy (grade 0)

#
# raw region:   --------------------------------------------------------------------------------------
#               cutends | marginA | marginB |             rest            | marginB | marginA| cutends    
#                       |  CDSpart| CDSpart |           CDSpart           | CDSpart | CDSpart|
# if raw region is shorter than 2(cutends+marginA+marginB), scale down to all 1/5
#
#default values
$priority = 4;

$usage .= "$0 -- convert phastcons conserved regions to hints\n";
$usage .= "Usage: $0 [--priority=p --minscore=a --source=s --cutends=n]\n";
$usage .= "reads from stdin writes to stdout.\n";
$usage .= "takes input in tab separated format:\n";
$usage .= "chrom\tstart\tend\tname\tscore\n";

GetOptions('priority:i'=>\$priority,
	   'source:s' =>\$source,
	   'minscore:i' =>\$minscore,
	   'cutends:i' =>\$cutends);

my ($from, $to, $newfrom, $newto);
while (<>) {
    chomp;
    my @f = split /\t/, $_;
    my @p;
    next unless (@f > 4);

    $chrom = $f[0];
    $name = $f[3];
    $score = $f[4];
    $from = $f[1];
    $to = $f[2];
    $len = $to - $from;

    next unless ($score >= $minscore);
    next unless ($len-2*$cutends>0);
    
    if ($len > 2*($cutends+$marginA+$marginB)){
	@p = ($from+$cutends, $from+$cutends+$marginA, $from+$cutends+$marginA+$marginB, $to - $marginB-$marginA-$cutends, $to-$marginA-$cutends, $to-$cutends);
    } else {
	my $fifth = int(($len-2*$cutends)/5);
	@p = ($from+$cutends, $from+$cutends+$fifth, $from+$cutends+2*$fifth, $to-$cutends-2*$fifth, $to-$cutends-$fifth, $to-$cutends);
    }
    for (my $i=0; $i<@p-1; $i++){
	$newfrom = $p[$i];
	$newto = $p[$i+1]-1;
	if ($i != 0 && $i != 4 && $score > $thresh) {
	    $grade = 2;
	} else {
	    $grade = 1;
	}
	print "$chrom\tphast\tCDSpart\t$newfrom\t$newto\t$grade\t\.\t.\tsrc=$source;pri=$priority\n";
    }
}