File: nsq-uniq

package info (click to toggle)
nosql 0.9-0
links: PTS
area: main
in suites: hamm
size: 1,364 kB
ctags: 225
sloc: perl: 3,766; sh: 476; makefile: 41
file content (126 lines) | stat: -rwxr-xr-x 3,654 bytes
#!/usr/bin/perl
# Original code: uniqtbl,v 2.4 1993/03/29 13:34:46 hobbs
#
# nsq-uniq: reads an rdbtable on STDIN and writes it to STDOUT with adjacent
# rows that are equal on the selected columns removed (or, with -g, with only
# the groups of repeated rows kept).
#
# RCS revision string; it is interpolated into the help text below.
$RCS_ID = '$Id: nsq-uniq,v 0.9 1998/03/04 09:12:49 carlos Exp $' ;
# Strip any leading directory part from $0 ('-' is used as the s/// delimiter)
# so that the help text and error messages show only the program's basename.
$0 =~ s-.*/-- ;
# Help text printed for the -help option. The here-document is interpolating,
# so $0 and $RCS_ID are expanded when this assignment runs.
$HelpInfo = <<EOH ;

	    NoSQL operator: $0

Usage:  $0  [options]  column ...

Options:
    -D	     Diagnostic output. Prints number of rows removed on STDERR.
    -g       Group option. Instead of producing unique rows, produce only
	     groups of repeated rows, with a blank row between. In this case
	     the '-D' option shows the number of groups.
    -help    Print this help info.
    -v	     Inverse option. Selects all columns except those named.

Reads the input rdbtable and compares adjacent rows. The second and succeeding
copies of repeated rows, considering only the selected columns, are removed.
That is, adjacent rows are considered equal if the data values in all of the
selected columns are equal.  The remaining rows are written to the output
rdbtable. Note that repeated rows must be adjacent in order to be found.

This NoSQL operator reads an rdbtable from STDIN and writes an rdbtable to
STDOUT.  Options may be abbreviated.

$RCS_ID

			----------------------
NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.
This program comes with ABSOLUTELY NO WARRANTY; for details
refer to the GNU General Public License.

You should have received a copy of the GNU General Public License
along with this program;  if not, write to the Free Software
Foundation, Inc., 675 Mass Ave., Cambridge, MA 02139, USA.
			----------------------

EOH
# Consume the leading option switches from @ARGV. Parsing stops at the first
# argument that does not begin with '-'; whatever remains in @ARGV afterwards
# is treated as the list of column names.
#   -D  sets $DIG and zeroes the $remov counter reported on STDERR
#   -g  sets $GRP (group mode)
#   -h  prints the help text and exits with status 1
#   -v  sets $INV (inverse column selection)
# Any other switch aborts with a "Bad arg" message.
while ( @ARGV && $ARGV[0] =~ /^-/ ) {
    my $switch = shift @ARGV;
    if ( $switch =~ /^-D.*/ ) {
        $DIG++;
        $remov = 0;
    }
    elsif ( $switch =~ /^-g.*/ ) {
        $GRP++;
    }
    elsif ( $switch =~ /^-h.*/ ) {
        print $HelpInfo;
        exit 1;
    }
    elsif ( $switch =~ /^-v.*/ ) {
        $INV++;
    }
    else {
        die "\nBad arg: $switch\n", "For help type \"$0 -help\".\n";
    }
}
# Read the head of the table from STDIN. Every line read here (comments
# included) is stored verbatim in @f3 and echoed to STDOUT below.
# Non-comment lines are counted in $lln:
#   line 1 holds the column names (@H); $nrf is the number of columns;
#   line 2 is passed through untouched (presumably the rdbtable dash/definition
#          line -- confirm against the table format); in group mode (-g)
#          reading stops here and do_grp picks up the first data row itself;
#   line 3 (regular mode only) is the first data row: it is always kept and
#          becomes @P, the "previous row" that chksame compares against.
while ( defined( my $line = <STDIN> ) ) {
    push( @f3, $line );             # keep the raw line for the echo below
    next if $line =~ /^\s*#/;       # comment lines are echoed but not counted

    # chomp, not chop: chop would eat a real character from a line that has
    # no trailing newline (e.g. truncating the last column name).
    chomp $line;
    $lln++;

    if ( $lln == 1 ) {
        @H   = split( /\t/, $line );    # column names
        $nrf = @H;                      # nr of fields
    }
    elsif ( $GRP && $lln == 2 ) {
        last;
    }
    elsif ( $lln == 3 ) {
        # Same field limit as do_reg uses, so the first data row is split
        # exactly like the rows it will be compared with.
        @P = split( /\t/, $line, $nrf );
        last;
    }
}

get_col_x();    # resolve the column names in @ARGV to indexes in @n (may die)
print @f3;      # echo header (plus the first data row in regular mode)

if ($GRP) {
    do_grp();   # read data, group case
}
else {
    do_reg();   # read data, regular case
}

# do_reg -- regular (non-group) mode.
#
# Reads the remaining data rows from STDIN and prints each row unless
# chksame() reports it equal to the previous row on the selected columns.
# The first data row has already been printed and stored in @P by the caller.
#
# Globals read:    $nrf (field count), $DIG (diagnostic flag)
# Globals written: @D (current row's fields), $remov (rows removed);
#                  @P is updated by chksame()
# With -D the number of removed rows is reported on STDERR at the end.
sub do_reg {
    while ( defined( my $row = <STDIN> ) ) {
        # chomp rather than chop: a final row without a trailing newline
        # must not lose its last data character.
        chomp $row;
        @D = split( /\t/, $row, $nrf );
        if ( chksame() ) {
            $remov++;
            next;
        }
        print $row, "\n";
    }
    print STDERR "Rows Removed: $remov\n" if $DIG;
}
# do_grp -- group mode (-g).
#
# Reads all data rows from STDIN and prints only runs of two or more adjacent
# rows that chksame() reports equal on the selected columns. Each run is
# preceded by a blank row (tabs only, one fewer than the row's field count).
#
# Globals read:    $nrf (field count), $DIG (diagnostic flag)
# Globals written: @P (seeded with the first data row, then maintained by
#                  chksame()), @D (current row's fields), $remov (group count)
# With -D the number of groups is reported on STDERR at the end.
sub do_grp {
    # The first data row seeds @P, the row chksame() compares against.
    # (It must be @P, not @p: otherwise the first row can never start a group.)
    my $prev_row = <STDIN>;
    chomp $prev_row if defined $prev_row;
    @P = defined $prev_row ? split( /\t/, $prev_row, $nrf ) : ();

    my $in_group = 0;
    while ( defined( my $row = <STDIN> ) ) {
        # chomp rather than chop: keep the last character of a final row
        # that has no trailing newline.
        chomp $row;
        @D = split( /\t/, $row, $nrf );
        if ( chksame() ) {
            unless ($in_group) {
                # A new group starts: count it, emit the separator row and
                # the group's first member, which was held back in $prev_row.
                $remov++;
                print "\t" x ( @D - 1 ), "\n";    # blank row
                print $prev_row, "\n";
                $in_group = 1;
            }
            print $row, "\n";
        }
        else {
            # Not a repeat: remember it as a potential first member of the
            # next group.
            $prev_row = $row;
            $in_group = 0;
        }
    }
    print STDERR "Nr Groups: $remov\n" if $DIG;
}
# chksame -- compare the current row with the previous one.
#
# Returns 1 when @D and @P hold equal (string-compared) values in every
# column whose index is listed in @n, otherwise 0. As a side effect the
# current row @D always becomes the new previous row @P.
sub chksame {
    my @differing = grep { $D[$_] ne $P[$_] } @n;
    @P = @D;
    return @differing ? 0 : 1;
}
# get_col_x -- translate the column names in @ARGV into column indexes.
#
# Uses @H (header column names) and $INV (the -v flag). For each name in
# @ARGV the index of its first occurrence in @H is appended to @n; an
# unknown name aborts with a "Bad column name" message. When $INV is set,
# @n is then replaced by the indexes of all columns that were NOT named,
# in ascending order.
sub get_col_x {
    # Position of the first occurrence of every header name.
    my %first_index;
    for my $pos ( 0 .. $#H ) {
        $first_index{ $H[$pos] } = $pos
          unless exists $first_index{ $H[$pos] };
    }

    foreach my $wanted (@ARGV) {
        die "\n$0: Bad column name: $wanted\n"
          unless exists $first_index{$wanted};
        push( @n, $first_index{$wanted} );
    }

    if ($INV) {    # inverse option: keep every column that was not named
        my %named = map { $_ => 1 } @n;
        @n = grep { !$named{$_} } 0 .. $#H;
    }
    return;
}