1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595
|
#!/usr/bin/env perl
=pod
=head1 NAME
make-table - create a tabular data file with random data suitable for parsing by parse-table
=head1 SYNOPSIS
make-table [-rows INT] [-cols INT]
[-rule_set STR]
[-positive_only]
[-non_negative_only]
[-negative_is_missing]
[-missing_data STR]
[-unique_labels]
[-brief]
=head1 DESCRIPTION
This script creates a table with random values, sampled from a normal
distribution, in a format suitable for use with parse-table and the
Circos tableviewer. Use this script to generate random tables to learn
how the tableviewer suite works
=head1 OUTPUT
By default the output will show the mean, standard deviation and sample values for each cell
mean data A B C
mean A 100 500 100
mean B 20 4 20
mean C 100 500 100
sd data A B C
sd A 10 100 10
sd B 2 1 2
sd C 10 100 10
table data A B C
table A 106 509 96
table B 15 2 22
table C 98 416 99
If you only want to show the table values without the leading row identifier, use -brief
data A B C
A 106 509 96
B 15 2 22
C 98 416 99
=head1 CONFIGURATION
Nearly all configuration parameters are defined within the
configuration file. Look in etc/make-table.conf for a sample
configuration file - there are plenty of comments to get you oriented.
The following can be adjusted on the command line
=head2 -rows INT, -cols INT
Set the number of rows and columns. Keep it sane. If only one of the
values is set (e.g. rows or columns) the table will be square.
=head2 -rule_set STR
Set the name of the rule set to use to populate the table cells with
values. Values are generated randomly from a normal distribution with
the parameters MEAN and SD for mean and standard deviation.
Rules are defined in <rules NAME> blocks, like this
<rules SETNAME>
rule = RXROW RXCOL MEAN SD
rule = RXROW RXCOL MEAN SD
rule = RXROW RXCOL MEAN SD
...
</rules>
Here the rule set named SETNAME is defined (use -rule_set SETNAME to
use these rules) and each rule affects rows and cols that pass regular
expressions in RXROW and RXCOL, respectively.
For those cells whose corresponding rows and column names pass the
regular expression, values are sampled randomly from a normal
distribution with mean MEAN and standard deviation SD.
For example,
rule = . . 100 10
sets the values of all rows and columns (the regular expression is
. which matches any character - therefore all labels match) to a
normally distributed value with mean 100 and standard deviation 10.
For another example,
rule = A [BCD] 200 10
sets values for cells A,B A,C A,D (row,col) to mean 200 and standard deviation 10.
Rules are applied to cells in order of increasing specificity. Thus,
rules that affect a larger number of cells are applied before rules
that affect a smaller number of cells.
You can specify the mean and/or standard deviation as a relative value thus
r0.5 10 # mean is relative
10 r0.5 # sd is relative
r0.5 r0.5 # both are relative
The relative notation will be accepted if the rule refines the value of a cell which was populated by a previous rule. For example
. . 100 10 # sets all cels to mean 100 sd 10
A B r.5 r.1 # adjusts cell row=A col=B to mean 50 sd 1
However, if the following is your first rule
. . r.5 r.1
then an error will result because there is no value to which a relative adjustment can be made.
=head2 -non_negative_only
Random values will be re-sampled until each cell has a non-negative
value (zero is allowed).
=head2 -positive_only
Random values will be re-sampled until each cell has a positive value
(zero is not allowes).
=head2 -negative_is_missing
Any negative values in the table are interpreted as missing data and
the cells are populated with the value of the "missing_data"
parameter.
=head2 -brief
Only table data will be reported, without any line headers or mean and
standard deviation reports for each cell.
data A B C
A 106 509 96
B 15 2 22
C 98 416 99
=head2 -show_totals
In addition to cell values, the totals for each row and column will be reported.
data all tot 219 927 217
data all tot A B C
A 930 711 106 509 96
B 966 39 15 2 22
C 830 613 98 416 99
The first column of totals shows the total for any row/column with the label. The second column shows the row total for that label.
If labels are not unique (e.g. rows and columns have the same label) then the first column total is useful. Otherwise, it is the same as the row total.
data all tot 219 927 217
data all tot D E F
A 711 711 106 509 96
B 39 39 15 2 22
C 613 613 98 416 99
=head2 -unique_labels
Each row and column will have a unique label.
data D E F
A 106 509 96
B 15 2 22
C 98 416 99
Each distinct label is associated with its own segment in the table view image. When a row and column have the same label, they share a segment. Therefore, when a table has unique labels each row and column has its own segment.
=head2 -seed NUM
Set the seed of the random number generator. Useful for obtaining reproducible runs for debugging.
=head1 HISTORY
=over
=item * 22 Nov 2012
Updated documentation.
=item * 27 Apr 2010
Fixed column and row labels for rectangular tables.
=item * 21 Jan 2009 v0.11
Repackaged for local use. Added -seed, -unique_labels and show_totals.
=item * 27 May 2008 v0.1
Started and versioned.
=back
=head1 BUGS
=head1 AUTHOR
Martin Krzywinski
=head1 CONTACT
Martin Krzywinski
Genome Sciences Centre
Vancouver BC Canada
www.bcgsc.ca
martink@bcgsc.ca
=cut
################################################################
#
# Copyright 2002-2012 Martin Krzywinski
#
# This file is part of the Genome Sciences Centre Perl code base.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this script; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
################################################################
use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
#use Math::VecStat qw(sum min max average);
use Math::Random qw(random_normal random_set_seed_from_phrase random_get_seed);
use Pod::Usage;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);
################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################
GetOptions(\%OPT,
"unique_labels",
"brief",
"rows=i",
"cols=i",
"rule_set=s",
"positive_only",
"non_negative_only",
"negative_is_missing",
"missing_data=s",
"show_totals",
"show_sizes",
"seed=i",
"cdump",
"configfile=s",
"help","man","debug+");
pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
if($CONF{cdump}) {
print Dumper(\%CONF);
exit;
}
validateconfiguration();
if($CONF{debug} > 1) {
$Data::Dumper::Pad = "debug parameters";
$Data::Dumper::Indent = 1;
$Data::Dumper::Quotekeys = 0;
$Data::Dumper::Terse = 1;
print Dumper(\%CONF);
}
random_set_seed_from_phrase($CONF{seed}) if defined $CONF{seed};
my @labelscol;
my @labelsrow;
for my $row (0..$CONF{rows}-1) {
push @labelsrow, get_label($row);
}
if($CONF{unique_labels}) {
for my $col (0..$CONF{cols}-1) {
push @labelscol, get_label($col + $CONF{rows});
}
} elsif ($CONF{rows} != $CONF{cols}) {
if($CONF{rows} > $CONF{cols}) {
@labelscol = @labelsrow[ 0 .. $CONF{cols} - 1];
} elsif ($CONF{cols} > $CONF{rows}) {
@labelscol = @labelsrow;
for my $col ($CONF{rows} + 1 .. $CONF{cols}) {
push @labelscol, get_label($col - 1);
}
}
} else {
@labelscol = @labelsrow;
}
my $table;
# collect value parameters
for my $rule (sort {num_rule_hits($b) <=> num_rule_hits($a)} make_list($CONF{rules}{$CONF{rule_set}}{rule})) {
my @rows = rule_rows($rule);
my @cols = rule_cols($rule);
my @params = rule_params($rule);
for my $row (@rows) {
for my $col (@cols) {
if(@params) {
for my $i (0..@params-1) {
my $p = $params[$i];
if ($p =~ /^r(.+)/) {
die "you've asked for a relative value [$p] but this cell [$row,$col] has no present value" unless exists $table->{$row}{$col}{param}[$i];
$table->{$row}{$col}{param}[$i] *= $1;
} else {
$table->{$row}{$col}{param}[$i] = $p;
}
}
$table->{$row}{$col}{param}[1] *= $table->{$row}{$col}{param}[0] if $table->{$row}{$col}{param}[1] < 1;
} else {
$table->{$row}{$col}{param} = undef;
}
}
}
}
# populate values
for my $row (@labelsrow) {
for my $col (@labelscol) {
if(defined $table->{$row}{$col}{param}) {
my $params = $table->{$row}{$col}{param};
if(! $params) {
$table->{$row}{$col}{value} = $CONF{missing_data};
} else {
my $value = get_value( @$params );
if($value < 0 && $CONF{negative_is_missing}) {
$table->{$row}{$col}{value} = $CONF{missing_data};
} else {
$value = sprintf($CONF{format}{data},$value) if $CONF{format}{data};
$table->{$row}{$col}{value} = $value;
}
if ($value == 0 && $CONF{zero_is_missing}) {
$table->{$row}{$col}{value} = $CONF{missing_data};
}
if ($value > 0) {
$value = sprintf($CONF{format}{data},$value) if $CONF{format}{data};
$table->{$row}{$col}{value} = $value;
}
}
} else {
$table->{$row}{$col}{value} = $CONF{missing_data};
}
}
}
# determine basic statistics
my $stats;
for my $row (@labelsrow) {
for my $col (@labelscol) {
my $value = $table->{$row}{$col}{value};
next if $value eq $CONF{missing_data};
$stats->{total}{$row}{out} += $value;
$stats->{total}{$col}{in} += $value;
$stats->{total}{$row}{all} += $value;
$stats->{total}{$col}{all} += $value;
}
}
################################################################
# print parameters for each cell
# mean
! $CONF{brief} && printinfo("mean",sprintf($CONF{format}{label},"data"),
map { sprintf($CONF{format}{label},$_) } @labelscol);
for my $row (@labelsrow) {
my @rowdata = ("mean",
sprintf($CONF{format}{label},$row));
for my $col (@labelscol) {
push @rowdata, sprintf($CONF{format}{data},$table->{$row}{$col}{param}[0]);
}
! $CONF{brief} && printinfo(@rowdata);
}
# sd
! $CONF{brief} && printinfo("sd",sprintf($CONF{format}{label},"data"),
map { sprintf($CONF{format}{label},$_) } @labelscol);
for my $row (@labelsrow) {
my @rowdata = ("sd",
sprintf($CONF{format}{label},$row));
for my $col (@labelscol) {
push @rowdata, sprintf($CONF{format}{data},$table->{$row}{$col}{param}[1]);
}
! $CONF{brief} && printinfo(@rowdata);
}
my @coltotal = (
sprintf($CONF{format}{label},"data"),
sprintf($CONF{format}{totalh},"all"),
sprintf($CONF{format}{totalh},"tot"),
map { sprintf($CONF{format}{total},$stats->{total}{$_}{in}) } @labelscol);
if($CONF{brief}) {
printinfo(@coltotal) if $CONF{show_totals};
} else {
printinfo("table",@coltotal) if $CONF{show_totals};
}
my @header;
if($CONF{show_totals}) {
@header = (
sprintf($CONF{format}{label},"data"),
sprintf($CONF{format}{totalh},"all"),
sprintf($CONF{format}{totalh},"tot"),
map {sprintf($CONF{format}{label},$_)} @labelscol);
} else {
@header = (
sprintf($CONF{format}{label},"data"),
map {sprintf($CONF{format}{label},$_)} @labelscol);
}
if($CONF{brief}) {
printinfo(@header);
} else {
printinfo("table",@header);
}
for my $row (@labelsrow) {
my @rowdata = ("table",
sprintf($CONF{format}{label},$row),
sprintf($CONF{format}{total},$stats->{total}{$row}{all}),
sprintf($CONF{format}{total},$stats->{total}{$row}{out}));
shift @rowdata if $CONF{brief};
splice(@rowdata,-2,2) if ! $CONF{show_totals};
for my $col (@labelscol) {
push @rowdata, $table->{$row}{$col}{value};
}
printinfo(@rowdata);
}
# print links
for my $row (@labelsrow) {
for my $col (@labelscol) {
my $linkid = sprintf("link_%s_%s",$row,$col);
#printinfo($linkid,$row);
#printinfo($linkid,$col);
}
}
sub rule_params {
my $rule = shift;
my ($rx1,$rx2,@params) = split(" ",$rule);
return @params;
}
sub rule_rows {
my $rule = shift;
my ($rx1,$rx2) = split(" ",$rule);
return grep($_ =~ /$rx1/, @labelsrow);
}
sub rule_cols {
my $rule = shift;
my ($rx1,$rx2) = split(" ",$rule);
return grep($_ =~ /$rx2/, @labelscol);
}
sub num_rule_hits {
my $rule = shift;
return rule_cols($rule) * rule_rows($rule);
}
sub get_value {
my ($m,$sd) = @_;
my $x;
do {
$x = random_normal(1,$m,$sd);
} while( ($x < 0 && $CONF{positive_only}) || ($x <= 0 && $CONF{non_negative_only}) );
return $x;
}
sub make_list {
my $value = shift;
if(ref($value)) {
return @$value;
} else {
return ($value);
}
}
sub get_label {
my $i = shift;
if($i < 26) {
return uc chr(97+$i);
} else {
my $k1 = chr(96 + int($i/26));
my $k2 = chr(97 + ($i % 26));
return uc $k1.$k2;
}
}
sub validateconfiguration {
$CONF{missing_data} = sprintf($CONF{format}{missing},$CONF{missing_data});
$CONF{rule_set} ||= "default";
if(! $CONF{rows} && ! $CONF{cols}) {
die "ERROR - you must define the number of rows and columns using -row NUM and -col NUM"
} elsif (! $CONF{rows}) {
$CONF{rows} = $CONF{cols};
} elsif (! $CONF{cols}) {
$CONF{cols} = $CONF{rows};
}
}
################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################
sub populateconfiguration {
foreach my $key (keys %OPT) {
$CONF{$key} = $OPT{$key};
}
# any configuration fields of the form __XXX__ are parsed and replaced with eval(XXX). The configuration
# can therefore depend on itself.
#
# flag = 10
# note = __2*$CONF{flag}__ # would become 2*10 = 20
parseconfiguration(\%CONF);
sub parseconfiguration {
my $root = shift;
for my $key (keys %$root) {
my $value = $root->{$key};
if(ref($value) eq "HASH") {
parseconfiguration($value);
} else {
while($value =~ /__(.+?)__/g) {
my $source = "__" . $1 . "__";
my $target = eval $1;
$value =~ s/\Q$source\E/$target/g;
#printinfo($source,$target,$value);
}
$root->{$key} = $value;
}
}
}
}
sub loadconfiguration {
my $file = shift;
my ($scriptname) = fileparse($0);
if(-e $file && -r _) {
# great the file exists
} elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/$scriptname.conf";
} elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/etc/$scriptname.conf";
} elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
$file = "$FindBin::RealBin/../etc/$scriptname.conf";
} else {
return undef;
}
print $file;
$OPT{configfile} = $file;
my $conf = new Config::General(-ConfigFile=>$file,
-AllowMultiOptions=>"yes",
-LowerCaseNames=>1,
-AutoTrue=>1);
%CONF = $conf->getall;
}
sub printdebug {
printinfo("debug",@_) if $CONF{debug};
}
sub printinfo {
printf("%s\n",join(" ",@_));
}
|