File: findimagedupes

package info (click to toggle)
findimagedupes 0.1.3-5
links: PTS
area: main
in suites: sarge
size: 72 kB
ctags: 10
sloc: perl: 295; makefile: 41
file content (530 lines) | stat: -rwxr-xr-x 15,364 bytes
#!/usr/bin/perl -w

# findimagedupes (name to be changed later)
# copyright 2001 rob kudla
# licensed under the GNU Public License version 2.0 or later
# 
# strengths: recognizes similar pictures with 98% accuracy when
#            the pictures actually have unique features; generates
#            collection files for easy managing of dupes with
#            gqview (hopefully pixie someday)
# weaknesses: lots of false positives on contact sheets and
#             shots of things like ocean horizons, which all 
#             reduce to basically this:
#
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             1111111111111111
#             0000000000000000
#             0000000000000000
#             0000000000000000
#             0000000000000000
#             0000000000000000
#             0000000000000000
#             0000000000000000
#             0000000000000000; slows down geometrically with
#             larger image collections (>18 hours for 25144
#             images, ~15 minutes for 2500)
#
# i've tried moving to an 8x8x8 bit array (twice the size but allows
# greyscale comparisons) and it actually causes more false positives,
# probably because 8x8 pixels just provides too little detail.

use Image::Magick;

my $VERSION = "0.1.3";
my $prog = $0 . '';
$prog = substr($prog,rindex($prog,'/') + 1) if rindex($prog,'/') >= 0;

# check args

while ($arg = shift) {
    if ($arg eq '-rescan') {
	$OPT{'rescan'} = 1;
    } elsif ($arg eq '-t') {
	$OPT{'threshold'} = shift;
    } elsif ($arg eq '-f') {
	$OPT{'dbfile'} = shift;
    } elsif ($arg eq '-?' || $arg eq '-h' || $arg eq '--help') {
	$OPT{'help'} = 1;
    } elsif ($arg eq '-d') {
	$OPT{'scandir'} = shift;
    } elsif ($arg eq '-v') {
	$OPT{'viewpgm'} = shift;
    } elsif ($arg eq '-c') {
	$OPT{'gqvfile'} = shift;
    } elsif ($arg eq '-p') {
	$OPT{'printfp'} = 1;
    } elsif ($arg eq '-g') {
	$OPT{'guimode'} = 1;
    } else { # assume it's a filename
	if (!defined($OPT{'file1'})) {
	    $OPT{'file1'} = $arg;
	} else {
	    $OPT{'file2'} = $arg;
	}
    }
}

# slap user if only one filename specified
$OPT{'help'} = 1 if (defined $OPT{'file1'} && !defined $OPT{'file2'});

# print help message if needed

if ($OPT{'help'}) {
    print "$prog - Copyright 2001 Rob Kudla - http://www.kudla.org/raindog
This program is distributed under the terms of the GNU Public License;
see the file COPYING for details.

Usage: $prog [options] [<file1> <file2>]
Options:
       -rescan         = rescan fingerprints of all files in directory
       -f <file>       = use <file> as image fingerprint database
       -d <dir>        = scan <dir> instead of current directory
       -t <num>        = use <num> as threshold% of similarity (default 90)
       -v <program>    = launch <program> (in bg) to view each set of dupes
       -c <file>       = create GQView collection <file>.gqv of duplicates
       <file1> <file2> = diff just those two files, using -v if present
                         (other options ignored if files are specified)
       -p              = only valid when files specified; prints the
                         hex of the actual fingerprint of each file.
       -g              = GUI mode: produce only machine-friendly output.\n";

    exit 0;
}

# set up defaults

$OPT{'scandir'} = '.' unless defined $OPT{'scandir'};
$OPT{'dbfile'} = 'imagedupes-db.txt' unless defined $OPT{'dbfile'};
$OPT{'dbfile'} = "$OPT{'scandir'}/$OPT{'dbfile'}" unless $OPT{'dbfile'} =~ "/";
$OPT{'threshold'} = 90 unless defined $OPT{'threshold'} && $OPT{'threshold'} > 0;
$OPT{'curdir'} = `pwd`;
chop $OPT{'curdir'};

# set up gqvfile if needed

if (defined $OPT{'gqvfile'} && defined $OPT{'file1'}) {
    $OPT{'gqvfile'} = "$OPT{'gqvfile'}.gqv" if $OPT{'gqvfile'} !~ /\.gqv$/;
    open GQV, ">$OPT{'gqvfile'}";
    print GQV "#GQView collection\n#Created with $prog version $VERSION\n";
    close GQV;
}

# set up countbits array

for ($i = 0; $i < 256; $i++) {
  $countbits_arr[$i] = _countbits (chr($i));
}

# get columns if we can

if (!defined $OPT{'guimode'}) {
    $cols = `tput cols`;
    $cols += 0;
    $cols = 80 if $cols == 0;
}

# scan files in if the user wants or there's no database now

if ((defined $OPT{'rescan'} && ($OPT{'rescan'} > 0) || 
     ((!-e $OPT{'dbfile'}) || (-s $OPT{'dbfile'} == 0)) && 
     !defined $OPT{'file1'})) {

    print "Scanning fingerprints from $OPT{'scandir'} into $OPT{'dbfile'}.\n"
	if !(defined $OPT{'guimode'});

    # get whole tree
    $list = `find $OPT{'scandir'} -type f | sort`;

    # get imagemagick object
    $image = Image::Magick->new;

    # create dbfile
    open IMGFP, ">$OPT{'dbfile'}";

    # put tree into array
    @list = split "\n", $list;

    # max value for our lame little statusbar
    $numfiles = $#list + 1;

    # traverse the array.
    foreach $file (@list) {

	$curfile++;

	# erase current screen line
	system("tput el") if (!defined $OPT{'guimode'}); 

	# build lame little status bar
	if (defined $OPT{'guimode'}) {
	    $outputline = "Status::" . sprintf("%04d", $curfile) . "::" . 
		sprintf("%04d", $numfiles) . "::" . 
		    sprintf("%03.2f", ($curfile/$numfiles) * 100);
	    print "$outputline\n";
	} else {
	    $outputline = "[" . sprintf("%04d", $curfile) . "/" . sprintf("%04d", $numfiles) . "] 0%" . statusbar($curfile, $numfiles) . "100% ";
	    $outputline .= substr($file, 0, $cols - length($outputline) - 1);
	    print "$outputline\n";
	}

	# move cursor back up a line
	# I couldn't just use \r - it only updated like every 15 iterations
	# apparently the linux console only refreshes on a \n
	system("tput cuu1") if (!defined $OPT{'guimode'}); 

	# check what file thinks the file is
	$format = '';
	$filetype = `file "$file"`;

#	# check for file types imagemagick is stupid about, which is
#	# basically anything but a bitmap image
#
#  	unless ($file =~ /\.txt$/i || $file =~ /\.html$/i ||
#  		$filetype =~ /zip/i || $filetype =~ /mp3/i ||
#  		$filetype =~ /link/i || $filetype =~ /rpm/i ||
#  		$filetype =~ /execut/i || $filetype =~ /socket/i ||
#  		$filetype =~ /pipe/i || $filetype =~ /postscript/i ||
#  		$filetype =~ /pdf/i || $filetype =~ /mpeg/i ||
#  		$file =~ /\.man$/i ||
#  		$filetype =~ /text/i || $file =~ /\.htm$/i ) {

	# or we could just assume the user has a good magic file.
	# Of course imagemagick will still crap out on some animated GIFs.

	# imagedups-db.txt =~ /image/ ..., so we want to avoid that.
	if ($filetype =~ /\:.*image/i || $filetype =~ /\:.*bitmap/i) {

	    # Adjusted script to work with both old and new perlmagick
	    # Old perlmagick returns a CSV string, new returns an array

		my @pingstring = $image->Ping($file);
		if(@pingstring) {
		    if ($pingstring[0] =~ /,/) {
			@pingstring = split (',', $pingstring[0]);
		    }
                    $format = $pingstring[3];
		} # else leave $format == ''
	}
	
	# oh yeah, and just in case a text file slips through (crash!!)
	if ($format ne '' && $format ne 'TXT') {

	    $img = &getfingerprint($image, $file);

	    # quote percents and colons in our db file.
	    $filename = $file;
	    $filename =~ s/\%/\%25/g;
	    $filename =~ s/\:/\%3A/g;

	    # only save if the image made a valid pbm.
	    if (defined($img) && length($img) > 0) {
		print IMGFP "$filename:";
		for ($i = 0; $i < length($img); $i++) {
		    # convert each byte of pbm to a hex pair.
		    print IMGFP sprintf("%02x", ord(substr($img,$i,1)));
		}
		print IMGFP "\n";
	  } elsif (!defined($img)) {
		warn "warning: unable to get fingerprint of $file.\n";
	  }

	}
    }

    close IMGFP;
    print "\n" if !(defined $OPT{'guimode'});

}

# find dupes

if (defined $OPT{'file1'}) {

# do file1 and file2

    $image = Image::Magick->new;

    $fp1 = getfingerprint($image, $OPT{'file1'});
    $fp2 = getfingerprint($image, $OPT{'file2'});
    # xor the two binary strings to find differences
    $fpdiff = $fp1 ^ $fp2;

    # print fingerprints if -p specified.
    if ($OPT{'printfp'}) {
	my $i;
	print "$OPT{'file1'}:";
	for ($i = 0; $i < length($fp1); $i++) {
	    # convert each byte of pbm to a hex pair.
	    print sprintf("%02x", ord(substr($fp1,$i,1)));
	}
	print "\n";
	print "$OPT{'file2'}:";
	for ($i = 0; $i < length($fp2); $i++) {
	    # convert each byte of pbm to a hex pair.
	    print sprintf("%02x", ord(substr($fp2,$i,1)));
	}
	print "\n";

	print "Difference:";
	for ($i = 0; $i < length($fpdiff); $i++) {
	    # convert each byte of pbm to a hex pair.
	    print sprintf("%02x", ord(substr($fpdiff,$i,1)));
	}
	print "\n";

    }

    # how many bits are different?  number and %
    $diffbits = countbits($fpdiff);
    $diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);
    if (defined $OPT{'guimode'}) {
	print "Dupe::$OPT{'file1'}::$OPT{'file2'}::$diffpct\n";
    } else {
	print "$OPT{'file1'} $OPT{'file2'}: seem to be $diffpct\% similar.\n";
    }

    # launch the viewer if the user wanted us to
    if (defined($OPT{'viewpgm'})) {
	system("$OPT{'viewpgm'} $key &");
	system("$OPT{'viewpgm'} $keys[$j] &");
	print "Press enter when done viewing. " if !(defined $OPT{'guimode'});
	<STDIN>;
    }


} else {

# do whole tree

    open IMGFP, "<$OPT{'dbfile'}";
    print "Finding duplicates in $OPT{'scandir'}, threshold $OPT{'threshold'}%.\n" if !(defined $OPT{'guimode'});

# load db into hash
    while ($line = <IMGFP>) {
	chop $line;
	($key,$fp) = split(":",$line);

	# remember, : and % are escaped
	$key =~ s/\%3A/\:/g;
	$key =~ s/\%25/\%/g;

	$PFP{$key} = pack("H*", $fp);
    }

    @keys = keys %PFP;

    $i = 0;

	my $bits_that_can_differ = 256 * (1 - $OPT{'threshold'} / 100 );

# traverse the hash
    foreach $key (@keys) { 
	# generate lame little status bar
	if (defined $OPT{'guimode'}) {
	    $outputline = "Status::" . sprintf("%04d", $i) . "::" . 
		sprintf("%04d", $#keys) . "::" . 
		    sprintf("%03.2f", ($i/$#keys) * 100);
	    print "$outputline\n";
	} else {
	    print "[" . sprintf("%04d", $i) . "/" . sprintf("%04d", $#keys) . 
		"] 0%" . statusbar($i, $#keys) . "100%\n";
	}

	# move the cursor up a line, see -rescan section
	system("tput cuu1") if (!defined $OPT{'guimode'});

	# check remainder of hash for close matches
	for ($j = $i + 1; $j <= $#keys; $j++) {
	    # read pbm data for both entries and unhex
	    $fp1 = $PFP{$key};
	    $fp2 = $PFP{$keys[$j]};

	    # xor the two binary strings to find differences
	    $fpdiff = $fp1 ^ $fp2;

	    # how many bits are different?  number and %
	    $diffbits = countbits($fpdiff);

	    if ($diffbits <= $bits_that_can_differ) {
		$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);

		# blank line, we're going to tell the user something
		if (defined $OPT{'guimode'}) {
		    print "Dupe:\:$key:\:$keys[$j]:\:$diffpct\n";
		} else {
		    system("tput el");
		    print "$key $keys[$j]: seem to be $diffpct\% similar.\n";
		}

		# originally this was a log, now it writes out the gqvfile
		&difflog($key) if not defined($ALREADYDIFF{$key});
		&difflog($keys[$j]) if not defined($ALREADYDIFF{$keys[$j]});

		# and makes sure to only write out each file once
		$ALREADYDIFF{$key} = 1;
		$ALREADYDIFF{$keys[$j]} = 1;

		# launch the viewer if the user wanted us to
		if (defined($OPT{'viewpgm'})) {
		    system("$OPT{'viewpgm'} $key &");
		    system("$OPT{'viewpgm'} $keys[$j] &");
		    print "Press enter when done viewing. ";
		    <STDIN>;
		}
	    }
	}

	$i++;
    }

# write out end of gqvfile, dunno if it's required but GQView does it
    if (defined $OPT{'gqvfile'}) {
	open GQV, ">>$OPT{'gqvfile'}";
	print GQV "#end\n";
	close GQV;
    }

# close db file
    close IMGFP;	
    print "\n";
}

undef $image;

sub base2 {

    # base2: converts binary string to list of 1's and 0's not unlike 
    #        pbm used to provide in text mode

    my $inval = shift;
    my $outval;
    for (my $i = 0; $i < length($inval); $i++) {
	for (my $j = 7; $j >= 0; $j--) {
	    $outval .= (ord(substr($inval,$i,1)) and (2 ** $j) ? 1 : 0);
	}
    }
    $outval;
}

sub _countbits {

    # countbits: counts the 1 bits in a binary string (doesn't use base2)

    my $inval = shift;
    my $outval = 0;    

    for (my $i = 0; $i < length($inval); $i++) {
	for (my $j = 7; $j >= 0; $j--) {
	    my $bit = (ord(substr($inval,$i,1)) & (2 ** $j) ? 1 : 0);
	    $outval += $bit;
	}
    }
    $outval;
}

sub countbits {
  my $inval = shift;
  my $outval = 0;

  for (my $i = 0; $i < length($inval); $i++) {
	$outval += $countbits_arr[ord(substr($inval,$i,1))];
  }
  $outval;
}

sub statusbar {

    # statusbar: prints 0 to 50 dots based on $cur/$fin (arg0/arg1)

    my $cur = shift;
    my $fin = shift;
    my $dots = int(($cur/$fin)*50);
    my $blks = 50 - $dots;
    my $outline = ("." x ($dots)) . (" " x $blks);
    $outline;
}

sub debuglog {

    # debuglog: writes to debug log.

    my $arg = shift;
    open DEBUGLOG, ">>findimagedupes-debug.txt";
    print DEBUGLOG "$arg\n";
    close DEBUGLOG;
}

sub difflog {

    # difflog: used to be a debug thing, now handles the gqvfile output.

    return undef if not defined($OPT{'gqvfile'});
    my $arg = shift;
    $arg =~ s/^\.\//$OPT{'curdir'}\//;
    open DIFFLOG, ">>$OPT{'gqvfile'}";
    print DIFFLOG qq^"$arg"\n^;
    close DIFFLOG;
}

sub getfingerprint {

#  here's a good a place as any to document the algorithm.  it's not
#  so much an algorithm as a philosophy, it's kind of too lame to be
#  an algorithm.  suggestions for improvement are very welcome.

#  1. read file.
#  2. standardize size by resampling to 160x160.
#  3. grayscale it. (reducing saturation seems faster than quantize.)
#  4. blur it a lot. (gets rid of noise.  we're going down 10x more anyway)
#     adding this nudges down false dupes about 10% and makes marginal
#     dupes (e.g. big gamma difference) show up about 10% higher.
#  5. spread the intensity out as much as possible (normalize.)
#  6. make it as contrasty as possible (equalize.)
#     this is for those real dark pictures that someone has slapped
#     a pure white logo on.  yes, i tested this thoroughly on pr0n!
#  7. resample again down to 16x16.  I wanted to use a mosaic/pixelate
#     kind of thing but hopefully imagemagick's resample function works
#     roughly the same way.
#  8. reduce to 1bpp (threshold using defaults)
#  9. convert to pbm, er, um, raw mono
#  10. save out to database as hex string containing raw image data
#  11. when comparing, convert each file pair's thumbprints back to
#      binary and xor them.
#  12. count the 1 bits in the result to approximate similarity.

    my $image = shift;
    my $file = shift;
    my (@blobs, $img);

    $image->Read($file);
    $#$image = 0;
    $image->Sample("160x160!");
    $image->Modulate(saturation=>-100);
    $image->Blur(radius=>5);
    $image->Normalize();
    $image->Equalize();
    $image->Sample("16x16");
    $image->Threshold();
    $image->Set(magick=>'mono');
    @blobs = $image->ImageToBlob();
    if(not defined ($blobs[0])) {
	warn("got undefined blobs for $file\n");
    } else {
	# we used to discard the pbm header, but now we use raw mono
	# so we'll discard all but the first 32 bytes
	$img = substr($blobs[0],0,32);
    }

    # free image but don't delete object.
    undef @$image;

    $img;

}