File: randomSplit.pl

package info (click to toggle)
augustus 3.5.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 777,052 kB
  • sloc: cpp: 80,066; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,141; sh: 171; javascript: 32
file content (82 lines) | stat: -rwxr-xr-x 1,971 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/perl

##############################################################################
# randomSplit
# randomly split a genbank file
#
# usage: randomSplit dbfile size 
#
# dbfile: genbank file containing the genes
# size: number of genes (LOCUS records) to place in the test set
# output: two files are created with names ending in .train 
#         and .test, being a training set and test set, respectively.
#
# This script is used by the braker.pl pipeline.
# Please be extremely careful when changing this script because the braker.pl
# pipeline may fail upon custom modification of this script.
# In case of doubt, contact katharina.hoff@uni-greifswald.de
#
# Mario Stanke, 24.06.2002
#############################################################################

use strict;
use warnings;

# Overview
#   1. Scan dbfile line by line and collect the name on every LOCUS line.
#   2. Draw `size` names without replacement; these form the test set.
#   3. Re-read dbfile record by record (records end with a "//" line) and
#      copy each record to dbfile.test if its LOCUS name was drawn,
#      otherwise to dbfile.train.
#
# Arguments (from @ARGV): dbfile size
# Output: dbfile.train and dbfile.test, created next to dbfile.
# Dies (non-zero exit) on unreadable/unwritable files, a size that is not a
# non-negative integer, or duplicate LOCUS names.

# Fixed seed: the same input file and size give the same sequence of draws,
# so the split is reproducible from run to run.
srand 4;

if ($#ARGV != 1) {
    print "$0: randomly split a genbank file in two subsets of given sizes\n";
    print "usage: randomSplit dbfile size\n";
    # Message on stdout and exit status 0 are kept exactly as before so that
    # existing callers (the header mentions braker.pl) see no change.
    exit;
}
my ($dbfilename, $size) = @ARGV;

# A negative or fractional size would never count down to exactly zero, so
# the original selection loop could not terminate. Reject it up front.
if ($size !~ /\A\d+\z/) {
    die "ERROR in randomSplit.pl: size must be a non-negative integer, "
      . "got '$size'\n";
}

# ---- pass 1: collect LOCUS names in file order -----------------------------
open(my $in, '<', $dbfilename)
    or die "ERROR in randomSplit.pl: cannot open $dbfilename for reading: $!\n";

my @names;
while (my $line = <$in>) {
    # "LOCUS" followed by at least three blanks, then the name. The name is
    # taken as a run of non-whitespace so it is the same string that pass 2
    # extracts from the record header.
    push @names, $1 if $line =~ /^LOCUS   +(\S+)/;
}
close($in)
    or die "ERROR in randomSplit.pl: error reading $dbfilename: $!\n";

if ($size > @names) {
    print "size $size is greater than the number of genes in file\n",
        "$dbfilename. Aborting.\n\n";
    exit;    # status 0 kept as before, see the usage branch above
}

# Names are the only key used to route records in pass 2, so they must be
# unique.
my %seen;
for my $name (@names) {
    if ($seen{$name}++) {
        die( "ERROR in randomSplit.pl line 47: LOCUS names in genbank file are not unique!\n");
    }
}

# ---- draw the test set ------------------------------------------------------
# Draw without replacement: pick a random index, remember the name, remove it
# from the pool. The order of rand() calls matches the original script, so a
# given seed selects the same names.
my %testnames;
for (1 .. $size) {
    my $pick = int(rand(scalar @names));
    $testnames{ $names[$pick] } = 1;
    splice @names, $pick, 1;
}

# ---- pass 2: route every record to .train or .test --------------------------
my $trainfile = "${dbfilename}.train";
my $testfile  = "${dbfilename}.test";

open(my $train_fh, '>', $trainfile)
    or die "ERROR in randomSplit.pl: cannot open $trainfile for writing: $!\n";
open(my $test_fh, '>', $testfile)
    or die "ERROR in randomSplit.pl: cannot open $testfile for writing: $!\n";
open($in, '<', $dbfilename)
    or die "ERROR in randomSplit.pl: cannot open $dbfilename for reading: $!\n";

{
    # Read one GenBank record at a time: a record ends with a line
    # consisting of "//". Localised so the separator is restored afterwards.
    local $/ = "\n//\n";

    while (my $record = <$in>) {
        my $out = $train_fh;    # default destination

        # /m lets the LOCUS line be found even if blank lines precede it.
        # The match result is checked so a header-less chunk can never
        # inherit the name captured from the previous record.
        if ($record =~ /^LOCUS +(\S+)/m) {
            $out = $test_fh if exists $testnames{$1};
        }
        elsif ($record =~ /\S/) {
            warn "WARNING in randomSplit.pl: record without LOCUS line "
               . "written to $trainfile\n";
        }

        print {$out} $record
            or die "ERROR in randomSplit.pl: write failed: $!\n";
    }
}

close($in)
    or die "ERROR in randomSplit.pl: error reading $dbfilename: $!\n";
# Buffered write errors (e.g. disk full) only surface on close.
close($train_fh)
    or die "ERROR in randomSplit.pl: error writing $trainfile: $!\n";
close($test_fh)
    or die "ERROR in randomSplit.pl: error writing $testfile: $!\n";