File: cssdiff.c

package info (click to toggle)
crm114 20050415-4
links: PTS
area: main
in suites: sarge
size: 4,288 kB
ctags: 2,574
sloc: ansic: 25,130; sh: 9,439; makefile: 452; lisp: 208; python: 36; sed: 16
file content (213 lines) | stat: -rwxr-xr-x 5,988 bytes
//  cssutil.c - utility for munging css files, version X0.1
//  Copyright 2001-2004  William S. Yerazunis, all rights reserved.
//
//  2005-05-02 Milan Zamazal <pdm@debian.org>: Patch by
//    Paolo <oopla-iA+eEnwkJgzk1uMJSBkQmQ@public.gmane.org> applied not to
//    crash when the compared files are read-only.
//
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.0.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org .  Other licenses may be negotiated; contact the 
//  author for details.  
//

//  include some standard files

#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

int main (int argc, char **argv)
{

  long i,j,k;    //  some random counters, when we need a loop
  long hfsize, hfsize1, hfsize2;

  long f1, f2;
  long sim, diff, dom1, dom2, hclash, kclash;

  {
    struct stat statbuf;    //  filestat buffer
    FEATUREBUCKET_TYPE *h1, *h2;              //  the text of the hash file
    //   filename is argv [1]
    //             and stat it to get it's length
    if(!argv[1] || !argv[2])
      {
        fprintf (stdout, "Usage: cssdiff <cssfile1> <cssfile2>\n");
        return (EXIT_SUCCESS);
      };
    //             quick check- does the first file even exist?
    k = stat (argv[1], &statbuf);
    if (k != 0)
      {
	fprintf (stderr, "\n CSS file '%s' not found. \n", argv[1]);
	exit (EXIT_FAILURE);
      };
    //    
    hfsize = statbuf.st_size;
    //         mmap the hash file into memory so we can bitwhack it
    h1 = (FEATUREBUCKET_TYPE *) crm_mmap_file_1 (0, argv[1], hfsize, 1);
    if (h1 == NULL)
      {
        fprintf (stderr, "\n Could not open file %s\n",
                 argv[1]);
        exit (EXIT_FAILURE);
      };    
    if (h1 == MAP_FAILED)
      {
	fprintf (stderr, "\n MMAP failed on file %s\n",
		 argv[1]);
	exit (EXIT_FAILURE);
      };
    hfsize1 = statbuf.st_size / sizeof (FEATUREBUCKET_TYPE);

    //
    //  and repeat the process for the second file:
    k = stat (argv[2], &statbuf);
    //             quick check- does the file even exist?
    if (k != 0)
      {
	fprintf (stderr, "\n.CSS file '%s' not found.\n", argv[2]);
	exit (EXIT_FAILURE);
      };

    hfsize2 = statbuf.st_size;
    //         mmap the hash file into memory so we can bitwhack it
    h2 = (FEATUREBUCKET_TYPE *) crm_mmap_file_1 (0, argv[2], hfsize2, 1);
    if (h2 == NULL)
      {
        fprintf (stderr, "\n Could not open file %s\n",
                 argv[2]);
        exit (EXIT_FAILURE);
      };    
    if (h2 == MAP_FAILED)
      {
	fprintf (stderr, "\n MMAP failed on file %s\n",
		 argv[2]);
	exit (EXIT_FAILURE);
      };

    hfsize2 = hfsize2 / sizeof (FEATUREBUCKET_TYPE);

    fprintf (stderr, "Sparse spectra file %s has %ld bins total\n",
	     argv[1], hfsize1);
    

    fprintf (stdout, "Sparse spectra file %s has %ld bins total\n",
	     argv[2], hfsize2);

    //
    //
    if (hfsize1 != hfsize2)
      {
	fprintf (stderr, 
		 "\n.CSS files %s, %s :\n lengths differ: %ld vs %ld.\n",
		 argv[1],argv[2], hfsize1, hfsize2);
	fprintf (stderr, "\n This is not a fatal error, but be warned.\n");
      };

    f1 = 0;
    f2 = 0;
    sim  = 0;
    diff = 0;
    dom1 = 0;
    dom2 = 0;
    hclash = 0;
    kclash = 0;
    //  
    //   The algorithm - for each file, 
    //                      for each bucket in each file
    //                          find corresponding bucket in other file
    //                              increment dom1 or dom2 as appropriate
    //                              always increment sim and diff
    //                          end
    //                      end
    //                      divide sim and diff by 2, as they are doublecounted
    //                      print statistics and exit.
    //
    // start at 1 - no need to check bin 0 (version).
    for ( i = 1; i < hfsize1; i++) 
      {
	if (   h1[i].key != 0 )
	  {
	    f1 += h1[i].value;
	    k = h1[i].hash % hfsize2;
	    if (k == 0) 
	      k = 1;
	    while (h2[k].value != 0 && 
		   (h2[k].hash != h1[i].hash 
		    || h2[k].key != h1[i].key))
	      {
		k++;
		if (k >= hfsize2) k = 1;
	      };
	    
	    //   Now we've found the corresponding (or vacant) slot in
	    //   h2.  Do our tallies...
	    j = h1[i].value ;
	    if (j > h2[k].value ) j = h2[k].value;
	    sim +=  j;
	    
	    j = h1[i].value - h2[k].value;
	    if (j < 0) j = -j;
	    diff += j;
	    
	    j = h1[i].value - h2[k].value;
	    if (j < 0) j = 0;
	    dom1 += j;
	  };
      };
    //
    //      And repeat for file 2.
    for ( i = 1; i < hfsize2; i++) 
      {
	if (   h2[i].key != 0 )
	  {
	    f2 += h2[i].value;
	    k = h2[i].hash % hfsize1;
	    if (k == 0) 
		k = 1;
	      while (h1[k].value != 0 && 
		     (h1[k].hash != h2[i].hash 
		      || h1[k].key != h2[i].key))
		{
		  k++;
		  if (k >= hfsize1) k = 1;
		};

	      //   Now we've found the corresponding (or vacant) slot in
	      //   h1.  Do our tallies...
	      j = h2[i].value ;
	      if (j > h1[k].value ) j = h1[k].value;
	      sim +=  j;
	      
	      j = h1[k].value - h2[i].value;
	      if (j < 0) j = -j;
	      diff += j;
	      
	      j = h2[i].value - h1[k].value;
	      if (j < 0) j = 0;
	      dom2 += j;
	  };
      };
    
    fprintf (stdout, "\n File 1 total features            : %12ld", f1);
    fprintf (stdout, "\n File 2 total features            : %12ld\n", f2);

    fprintf (stdout, "\n Similarities between files       : %12ld", sim/2);
    fprintf (stdout, "\n Differences between files        : %12ld\n", diff/2);

    fprintf (stdout, "\n File 1 dominates file 2          : %12ld", dom1);
    fprintf (stdout, "\n File 2 dominates file 1          : %12ld\n", dom2);
    
  }
  return 0;
}