1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
|
#!/usr/bin/perl
# Original code: uniqtbl,v 2.4 1993/03/29 13:34:46 hobbs
#
# RCS identification string; it is interpolated into the help text below.
$RCS_ID = '$Id: nsq-uniq,v 0.9 1998/03/04 09:12:49 carlos Exp $' ;
# Remove everything up to and including the last '/' from the program name,
# so that messages built from $0 show only the basename.
$0 =~ s-.*/-- ;
# Help text printed by the -help option. The heredoc terminator is unquoted,
# so $0 and $RCS_ID are interpolated at this point.
$HelpInfo = <<EOH ;
NoSQL operator: $0
Usage: $0 [options] column ...
Options:
-D Diagnostic output. Prints number of rows removed on STDERR.
-g Group option. Instead of producing unique rows, produce only
groups of repeated rows, with a blank row between. In this case
the '-D' option shows the number of groups.
-help Print this help info.
-v Inverse option. Selects all columns except those named.
Reads the input rdbtable and compares adjacent rows. The second and succeeding
copies of repeated rows, considering only the selected columns, are removed.
That is, adjacent rows are considered equal if the data values in all of the
selected columns are equal. The remaining rows are written to the output
rdbtable. Note that repeated rows must be adjacent in order to be found.
This NoSQL operator reads an rdbtable from STDIN and writes an rdbtable to
STDOUT. Options may be abbreviated.
$RCS_ID
----------------------
NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.
This program comes with ABSOLUTELY NO WARRANTY; for details
refer to the GNU General Public License.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave., Cambridge, MA 02139, USA.
----------------------
EOH
# Consume the leading option switches from @ARGV. Whatever is left in
# @ARGV afterwards is taken to be the list of column names.
while ( ($_ = $ARGV[0]) =~ /^-/ ) {
    if ( /^-D.*/ ) {                    # diagnostics: count removed rows/groups
        $DIG++ ;
        $remov = 0 ;
    }
    elsif ( /^-g.*/ ) {                 # group mode
        $GRP++ ;
    }
    elsif ( /^-h.*/ ) {                 # help: print the usage text and stop
        print $HelpInfo ;
        exit 1 ;
    }
    elsif ( /^-v.*/ ) {                 # inverse column selection
        $INV++ ;
    }
    else {
        die "\nBad arg: $_\n", "For help type \"$0 -help\".\n" ;
    }
    shift ;                             # drop the switch just handled
}
# Read the table header and, in the regular (non -g) case, the first data
# row as well. Every line read here is saved in @f3 so that it can be
# printed unchanged once the column arguments have been validated.
#
#   @H    column names taken from the first non-comment line
#   $nrf  number of columns
#   @P    fields of the first data row (regular case only); this is the
#         "previous row" that chksame compares the next row against
while(<STDIN>){
    push( @f3, $_ ) ;
    next if /^\s*#/ ;                       # comment: kept, but not counted
    # chomp rather than chop: chop removes the last character even when the
    # line has no trailing newline, which would corrupt the last field.
    chomp ;
    if( ++$lln == 1 ){
        @H = split( /\t/, $_ ) ;            # column names
        $nrf = @H ;                         # nr of fields
    }
    elsif( $GRP && $lln == 2 ){
        last ;                              # group case: stop after line 2
    }
    elsif( $lln == 3 ){
        # Split with the same field limit used for every later row, so that
        # an identical following row compares equal field by field.
        @P = split( /\t/, $_, $nrf ) ;      # 1st data line
        last ;
    }
}
# Resolve the requested columns (dies on an unknown name), echo the lines
# saved while reading the header, then process the rest of the input.
get_col_x() ;
print @f3 ;
if( $GRP ){
    do_grp() ;                  # read data, group case
}
else{
    do_reg() ;                  # read data, regular case
}
# do_reg -- regular case.
# Reads the remaining rows from STDIN and prints each row that differs from
# the row before it on the selected columns; rows that chksame reports as
# equal to the previous row are dropped and counted in $remov.
# With -D ($DIG set) the number of dropped rows is reported on STDERR.
sub do_reg {
    while(<STDIN>){                         # read the data
        # chomp rather than chop: a final row without a trailing newline
        # must not lose its last character.
        chomp ;
        @D = split( /\t/, $_, $nrf ) ;
        if( &chksame ){
            $remov++ ;
            next ;
        }
        print $_, "\n" ;
    }
    print STDERR "Rows Removed: $remov\n" if $DIG ;
}
# do_grp -- group case (-g).
# Reads the data rows from STDIN and prints only runs of adjacent rows that
# are equal on the selected columns. Each run is preceded by a blank row
# (tabs only) and includes the row that started it.
# With -D ($DIG set) the number of groups is reported on STDERR.
sub do_grp {
    # First data row: remember its text, and load it into @P, the array that
    # chksame compares against. It must be @P (upper case); storing it
    # in a differently named array leaves the first row out of the very
    # first comparison, so a group starting on row 1 is missed.
    chomp( $_sav = <STDIN> ) ;
    @P = split( /\t/, $_sav, $nrf ) ;
    while(<STDIN>){                         # read the data
        # chomp rather than chop: a final row without a trailing newline
        # must not lose its last character.
        chomp ;
        @D = split( /\t/, $_, $nrf ) ;
        if( &chksame ){
            if( ! $ingrp ){                 # first repeat: open a new group
                $remov++ ;
                $x = @D - 1 ;
                print "\t" x $x, "\n" ;     # blank row
                print $_sav, "\n" ;         # the row that started the group
            }
            $ingrp++ ;
            print $_, "\n" ;
        }
        else{
            $_sav = $_ ;                    # candidate start of the next group
            $ingrp = 0 ;
        }
    }
    print STDERR "Nr Groups: $remov\n" if $DIG ;
}
# chksame -- compare the current row with the previous one.
# Looks only at the column indexes listed in @n and compares the fields of
# @D (current row) with those of @P (previous row) as strings.
# Side effect: @P is always replaced by @D, so the next call compares
# against the row just examined.
# Returns 1 if all selected fields are equal, else 0.
sub chksame {
    $val = 1 ;
    foreach my $col (@n) {
        next if $D[$col] eq $P[$col] ;
        $val = 0 ;                          # first difference decides
        last ;
    }
    @P = @D ;
    return $val ;
}
# get_col_x -- turn the column names left in @ARGV into column indexes.
# Uses @H (column names) and $INV; appends the resulting indexes to @n.
# Each name is matched against @H and the index of its first match is used.
# Dies with "Bad column name" if a name is not found in @H.
# With the inverse option ($INV) @n is then replaced by the indexes of all
# columns that were NOT selected, in ascending order.
sub get_col_x {
    for my $name (@ARGV) {
        my $found = 0 ;
        for my $idx ( 0 .. $#H ) {
            next unless $name eq $H[$idx] ;     # match existing column
            push( @n, $idx ) ;
            $found = 1 ;
            last ;
        }
        die "\n$0: Bad column name: $name\n" if ! $found ;
    }
    if( $INV ){                                 # inverse option
        my %picked = map { $_ => 1 } @n ;
        @n = grep { ! $picked{$_} } 0 .. $#H ;
    }
}
|