File: merge.c

package info (click to toggle)
swish-e 2.4.7-1
links: PTS
area: main
in suites: squeeze
size: 7,224 kB
ctags: 8,194
sloc: ansic: 51,637; sh: 8,895; perl: 3,018; makefile: 591; xml: 9
file content (1089 lines) | stat: -rw-r--r-- 36,228 bytes
parent folder | download | duplicates (6)
/*

$Id: merge.c 1945 2007-10-22 14:54:07Z karpet $


    This file is part of Swish-e.

    Swish-e is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Swish-e is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along  with Swish-e; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    
    See the COPYING file that accompanies the Swish-e distribution for details
    of the GNU GPL and the special exception available for linking against
    the Swish-e library.
    
** Mon May  9 15:07:32 CDT 2005
** added GPL
    
**-----------------------------------------------------------------
**
**  rewritten from scratch - moseley Oct 17, 2001
**
*/

#include <assert.h>             /* for bug hunting */
#include "swish.h"
#include "mem.h"
#include "swstring.h"
#include "merge.h"
#include "error.h"
#include "search.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "docprop.h"
#include "list.h"
#include "compress.h"
#include "metanames.h"
#include "db.h"
#include "dump.h"
#include "result_sort.h"
#include "swish_qsort.h"
#include "result_output.h"
#include "parse_conffile.h"
#include "stemmer.h"
#include "headers.h"

/* Prototypes for the file-local (static) helpers that merge_indexes() calls. */
static void dup_header( SWISH *sw_input, SWISH *sw_output );
static void check_header_match( IndexFILE *in_index, SWISH *sw_output );
static void make_meta_map( IndexFILE *in_index, SWISH *sw_output);
static void load_filename_sort( SWISH *sw, IndexFILE *cur_index );
static IndexFILE *get_next_file_in_order( SWISH *sw_input );
static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_output );
static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output );
static void write_word_pos( IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, unsigned int posdata );


// #define DEBUG_MERGE

/****************************************************************************
*  merge_indexes -- reads from input indexes, and outputs a new index
*
*
*****************************************************************************/

/*
 * Merges every index on sw_input->indexlist into the output index owned by
 * sw_output.  The work is done in four passes:
 *
 *   1. Per input index: copy (first index) or validate (other indexes) the
 *      header, build the metaID map, and build the path-ordered file list.
 *   2. Walk all input files in path order via get_next_file_in_order() and
 *      hand each one to add_file(), which also appends a record to a
 *      temporary file describing where each output file came from.
 *   3. Call dump_index_words() for each input index, build the per-index
 *      file number map from the temporary file, then walk the output word
 *      hash and re-add every position of every word with mapped file
 *      numbers and metaIDs.
 *   4. Unlink words whose tfrequency ended up as zero.
 *
 * Progress and summary messages are printed to stdout.
 */
void merge_indexes( SWISH *sw_input, SWISH *sw_output )
{
    IndexFILE   *cur_index;
    FILE        *filenum_map;
    char        *tmpfilename;
    struct MOD_Index *idx_output = sw_output->Index;
    ENTRY       *e, *prev;
    int          hash,
                 sz_worddata,
                 saved_bytes,
                 tmpval,
                 filenum,
                 metaID = 0,
                 frequency,
                 loc_count = 0,
                 word_count = 0;
    sw_off_t     wordID;
    int          metadata_length = 0;
    unsigned char   *worddata;
    unsigned char   *s, *start;
    unsigned char   flag;
    unsigned int          local_posdata[MAX_STACK_POSITIONS];
    unsigned int         *posdata;
    int          i;

    /*******************************************************************************
    * Get ready to merge the indexes.  For each index:
    *   - check that it has the correct headers
    *   - create meta entries in output index, and create a map to convert metas
    *   - load an array of file numbers sorted by filename so can merge sort the filenames
    *   - set some initial defaults.
    *********************************************************************************/

    cur_index = sw_input->indexlist;
    while( cur_index  )
    {
        printf("Input index '%s' has %d files and %d words\n", cur_index->line, cur_index->header.totalfiles, cur_index->header.totalwords);

        if ( cur_index == sw_input->indexlist )
            /* Duplicate the first index's header into the output index */
            dup_header( sw_input, sw_output );
        else
            check_header_match( cur_index, sw_output );  // errors if headers don't match - don't really need to check first one since it was the one that was dupped


        make_meta_map( cur_index, sw_output);        // add metas to new index, and create map

        load_filename_sort( sw_input, cur_index );   // so can read in filename order

        cur_index->current_file = 0;
        cur_index->cur_prop = NULL;

#ifdef DEBUG_MERGE
        dump_metanames( sw_input, cur_index, 1 );
        dump_metanames( sw_output, sw_output->indexlist, 0 );
#endif

        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Output Header (requires -H9) ----------\n");
    print_index_headers( sw_output->indexlist );
    printf("\n\n");
#endif



    /****************************************************************************
    *  Now, read in filename order (so can throw out duplicates)
    *  - read properties and write out to new index
    *  - write a temporary file of records to identify
    *       - indexfile
    *       - old filenum to new filenum mapping
    *       - total words per file, if set
    ****************************************************************************/

    /* place to store file number map and total words per file */
    filenum_map = create_tempfile(sw_input, F_WRITE_BINARY, "fnum", &tmpfilename, 0 );

    while( (cur_index = get_next_file_in_order( sw_input )) )
        add_file( filenum_map, cur_index, sw_output );



    /* Don't need the pre-sorted indexes any more */
    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        efree( cur_index->path_order );
        cur_index->path_order = NULL;
    }

    /* Close the write handle and reopen the same file for reading */
    fclose( filenum_map );

    if ( !(filenum_map = fopen( tmpfilename, F_READ_BINARY )) )
        progerrno("failed to reopen '%s' :", tmpfilename );



    /****************************************************************************
    *  Finally, read the indexes one-by-one to read word and position data
    *  - reads through the temp file for each index to build a filenumber map
    *
    ****************************************************************************/

    /* 08/2002 jmruiz
    ** First of all, get all the words
    */
    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        dump_index_words(sw_input, cur_index, sw_output);
        /* Get file_num_map for later processing */
        cur_index->merge_file_num_map = get_map( filenum_map, cur_index );
        cur_index = cur_index->next;
    }

    /* At this point we have all the words. Now we have to get worddata
    * and merge it
    */
    word_count = 0;
    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
    fflush(stdout);
    /* walk the hash list to merge worddata */
    for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
    {
        if (idx_output->hashentriesdirty[hash])
        {
            idx_output->hashentriesdirty[hash] = 0;
            for (e = idx_output->hashentries[hash]; e; e = e->next)
            {
                word_count++;
                /* Search the word in all indexes and get worddata */
                cur_index = sw_input->indexlist;
                while( cur_index )
                {
                    DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);
                    /* If word exists in the index */
                    if(wordID)
                    {

                        DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, &saved_bytes, cur_index->DB);
                        uncompress_worddata(&worddata,&sz_worddata,saved_bytes);

                        /* Now, parse word's data */
                        s = worddata;
                        tmpval = uncompress2(&s);     /* tfrequency */
                        metaID = uncompress2(&s);     /* metaID */

                        if (metaID)
                        {
                            metadata_length = uncompress2(&s);
                        }

                        filenum = 0;
                        start = s;

                        while(1)
                        {                   /* Read on all items */
                            uncompress_location_values(&s,&flag,&tmpval,&frequency);
                            /* tmpval is added to the running file number, i.e. it is stored as a delta */
                            filenum += tmpval;
                            /* Use stack array when possible to avoid malloc/free overhead */
                            if(frequency > MAX_STACK_POSITIONS)
                                posdata = (unsigned int *) emalloc(frequency * sizeof(int));
                            else
                                posdata = local_posdata;

                            /* Read the positions */
                            uncompress_location_positions(&s,flag,frequency,posdata);


                            /* now we have the word data */
                            for (i = 0; i < frequency; i++, loc_count++)
                                write_word_pos( cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);

                            if(e->tfrequency)
                            {
                                /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
                                ** to time to help addentry.
                                ** If we do not do this, addentry routine will have to run linked lists
                                ** of positions with thousands of elements and makes the merge process
                                ** very slow
                                */
                                if(!(loc_count % 100))
                                    CompressCurrentLocEntry(sw_output, e);
                            }


                            if(posdata != local_posdata)
                                efree(posdata);

                            /* Check for end of worddata */
                            if ((s - worddata) == sz_worddata)
                                break;   /* End of worddata */

                            /* Check for end of current metaID data */
                            if ( metadata_length == (s - start))
                            {
                                filenum = 0;
                                metaID = uncompress2(&s);
                                metadata_length = uncompress2(&s);
                                start = s;
                            }
                        }

                        if(e->tfrequency)
                            CompressCurrentLocEntry(sw_output, e);

                        efree(worddata);
                    }
                    cur_index = cur_index->next;
                }
                /* Let's coalesce locations for each word to save memory
                ** This makes use of the -e feature
                ** Because we are processing one word at a time we can
                ** coalesce its data just once
                */
                coalesce_word_locations(sw_output,e);

                if(!(word_count % 1000))
                {
                    /* Make zone available for reuse and save memory */
                    Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
                    sw_output->Index->freeLocMemChain = NULL;
                    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
                    /* The line ends in '\r', so flush or the progress never shows on a buffered stdout */
                    fflush(stdout);
                }
            }
        }
    }

    printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
    fflush(stdout);

    cur_index = sw_input->indexlist;
    while( cur_index )
    {
        /* free the maps, and don't leave dangling pointers behind */
        efree( cur_index->merge_file_num_map );
        cur_index->merge_file_num_map = NULL;
        efree( cur_index->meta_map );
        cur_index->meta_map = NULL;
        cur_index = cur_index->next;
    }


#ifdef DEBUG_MERGE
    printf("----- Final Output Header (requires -H9) ----------\n");
    print_index_headers( sw_output->indexlist );
#endif

    /* Close the read handle before deleting: it was previously leaked, and
    ** removing a file that is still open is not portable */
    fclose( filenum_map );
    remove( tmpfilename );
    efree( tmpfilename );


    /* 2002/09 MERGE fix jmruiz */
    /* Finally, remove words from the hash array with tfrequency == 0 */
    /* walk the hash list and unlink those entries; word_count is reused as the removed-word counter */
    for (word_count = 0, hash = 0; hash < VERYBIGHASHSIZE; hash++)
    {
        for (prev = NULL, e = idx_output->hashentries[hash]; e; e = e->next)
        {
            if( ! e->tfrequency )
            {
                word_count++;
                if( ! prev)   /* First in list */
                {
                    idx_output->hashentries[hash] = e->next;
                }
                else
                {
                    prev->next = e->next;
                }
                /* Adjust counters */
                idx_output->entryArray->numWords--;
                sw_output->indexlist->header.totalwords--;
            }
            else
            {
                prev = e;
            }
        }
    }
    printf("Removed %6d words no longer present in docs for index '%s'\n",
       word_count, sw_output->indexlist->line);

    /* 2002/09 MERGE FIX end */



}

/****************************************************************************
*  dup_header -- duplicates a header
*
*  rereads the header from the data base, and clears out some values
*
*****************************************************************************/

static void dup_header( SWISH *sw_input, SWISH *sw_output )
{
    IndexFILE       *first_input = sw_input->indexlist;
    INDEXDATAHEADER *hdr = &sw_output->indexlist->header;

    // probably need to free the sw_output header from what's created in swishnew.

    /* Seed the output header with the header stored in the first input index */
    read_header( sw_input, hdr, first_input->DB );

    /* Reset the counters -- they are rebuilt while merging */
    /* $$$ This needs to be fixed */
    hdr->totalwords = 0;
    hdr->totalfiles = 0;
    hdr->removedfiles = 0;
    hdr->removed_word_positions = 0;

    /* Drop the meta entries that came along with the header */
    freeMetaEntries( hdr );

    /* The output index must not carry the input's index date */
    if ( hdr->indexedon != NULL )
    {
        efree( hdr->indexedon );
        hdr->lenindexedon = 0;
        hdr->indexedon = NULL;
    }
}

/****************************************************************************
*  check_header_match -- makes sure that the important settings match
*
*
*****************************************************************************/

// This assumes that the size will always precede the content.
// HEAD_CMP overlays an adjacent (int length, char *string) pair of header
// fields; callers pass the address of the length field.
typedef struct
{
    int     len;
    char    *str;
} *HEAD_CMP;

/*
 * Compares one length/string header setting of an input index against the
 * same setting of the output index and reports a mismatch through progerr().
 *
 *   index - name of the input index (used in the error message)
 *   name  - name of the header setting (used in the error message)
 *   in    - address of the length field in the input header
 *   out   - address of the length field in the output header
 */
static void compare_header( char *index, char *name, void *in, void *out )
{
    HEAD_CMP    in_item = (HEAD_CMP)in;
    HEAD_CMP    out_item = (HEAD_CMP)out;

    if ( in_item->len != out_item->len )
        progerr("Header %s in index %s doesn't match in length with output header", name, index );

    /* Both strings unset: nothing to compare (strcmp on NULL is undefined) */
    if ( !in_item->str && !out_item->str )
        return;

    /* Exactly one unset, or different content, is a mismatch */
    if ( !in_item->str || !out_item->str
         || strcmp( (const char *)in_item->str, (const char *)out_item->str ))
        progerr("Header %s in index %s doesn't match output header", name, index );
}


/*
 * Verifies that the header settings of an input index that affect how words
 * were indexed agree with the output header.  Any mismatch is reported
 * through progerr().
 *
 *   in_index  - input index whose header is being checked
 *   sw_output - output handle; its first index carries the reference header
 */
static void check_header_match( IndexFILE *in_index, SWISH *sw_output )
{
    INDEXDATAHEADER *out_header = &sw_output->indexlist->header;
    INDEXDATAHEADER *in_header = &in_index->header;

    /* Each call passes the address of a length field; see compare_header() */
    compare_header( in_index->line, "WordCharacters", &in_header->lenwordchars,  &out_header->lenwordchars );
    compare_header( in_index->line, "BeginCharacters", &in_header->lenbeginchars,  &out_header->lenbeginchars );
    compare_header( in_index->line, "EndCharacters", &in_header->lenendchars,  &out_header->lenendchars );

    compare_header( in_index->line, "IgnoreLastChar", &in_header->lenignorelastchar,  &out_header->lenignorelastchar );
    compare_header( in_index->line, "IgnoreFirstChar", &in_header->lenignorefirstchar,  &out_header->lenignorefirstchar );

    compare_header( in_index->line, "BumpPositionChars", &in_header->lenbumpposchars,  &out_header->lenbumpposchars );


    if ( fuzzy_mode_value(in_header->fuzzy_data) != fuzzy_mode_value(out_header->fuzzy_data) )
        progerr("FuzzyIndexingMode in index %s of '%s' doesn't match '%s'",
            in_index->line,
            fuzzy_string( in_header->fuzzy_data ),
            fuzzy_string( out_header->fuzzy_data ));

    if ( in_header->ignoreTotalWordCountWhenRanking != out_header->ignoreTotalWordCountWhenRanking )
        progerr("ignoreTotalWordCountWhenRanking Rules doesn't match for index %s", in_index->line );

    /* memcmp() takes a size in bytes.  The size used to be divided by
     * sizeof(int), so only the leading part of the table was compared. */
    if ( memcmp( &in_header->translatecharslookuptable, &out_header->translatecharslookuptable, sizeof(in_header->translatecharslookuptable) ) )
        progerr("TranslateChars header doesn't match for index %s", in_index->line );


    //??? need to compare stopword lists

    //??? need to compare buzzwords

}

/****************************************************************************
*  make_meta_map - adds metanames to output index and creates map
*
*
*****************************************************************************/

static void make_meta_map( IndexFILE *in_index, SWISH *sw_output)
{
    INDEXDATAHEADER *src_hdr = &in_index->header;
    INDEXDATAHEADER *dst_hdr = &sw_output->indexlist->header;
    int             *map;
    int              n;

    /* One slot per input meta entry plus one spare, all starting out as zero */
    map = emalloc( sizeof( int ) * (src_hdr->metaCounter + 1) );
    memset( map, 0, sizeof( int ) * (src_hdr->metaCounter + 1) );

    for ( n = 0; n < src_hdr->metaCounter; n++ )
    {
        struct metaEntry *src = src_hdr->metaEntryArray[n];
        struct metaEntry *dst;
        struct metaEntry *src_root;
        struct metaEntry *dst_root;

        /* Look for an entry of the same name and kind in the output header */
        if ( is_meta_index( src ) )
            dst = getMetaNameByNameNoAlias( dst_hdr, src->metaName );
        else
            dst = getPropNameByNameNoAlias( dst_hdr, src->metaName );

        /* Not there yet, so create it from the input entry (can't fail) */
        if ( !dst )
            dst = cloneMetaEntry( dst_hdr, src );

        /* The two entries must describe the same thing. */
        /* This should be done in metanames.c, but error messages are harder */
        if ( dst->metaType != src->metaType )
            progerr("meta name %s in index %s is different type than in output index", src->metaName, in_index->line );

        if ( dst->sort_len != src->sort_len )
            progerr("meta name %s in index %s has different sort length than in output index", src->metaName, in_index->line );

        if ( dst->rank_bias != src->rank_bias )
            progerr("meta name %s in index %s is different rank bias than in output index", src->metaName, in_index->line );

        /* Record input ID -> output ID */
        map[ src->metaID ] = dst->metaID;

        /*
         * Alias handling.  An alias in the input index must resolve to the
         * same target in the output index: if "title" is an alias for
         * "swishtitle" here, the output index must either already say so or
         * be told so now.  Another input index mapping "title" to a different
         * target is an error.
         */
        if ( !src->alias )
            continue;

        /* Entry the alias points at, in the input header */
        if ( is_meta_index( src ) )
            src_root = getMetaNameByID( src_hdr, src->alias );
        else
            src_root = getPropNameByID( src_hdr, src->alias );

        /* This should not happen -- it would be a very broken input header */
        if ( !src_root )
            progerr("Failed to lookup alias for %s in index %s", src->metaName, in_index->line );

        /* Same target, looked up by name in the output header */
        if ( is_meta_index( src_root ) )
            dst_root = getMetaNameByNameNoAlias( dst_hdr, src_root->metaName );
        else
            dst_root = getPropNameByNameNoAlias( dst_hdr, src_root->metaName );

        /* Real metas are added before their aliases, so it should exist */
        if ( !dst_root )
            progerr("Failed to lookup alias for %s in output index", dst->metaName );

        if ( !dst->alias )
        {
            /* Not an alias in the output yet: just point it at the target */
            dst->alias = dst_root->metaID;
        }
        else if ( dst->alias != dst_root->metaID )
        {
            /* Already an alias, but for something else */
            struct metaEntry *prior = is_meta_index( dst )
                                    ? getMetaNameByID( dst_hdr, dst->alias )
                                    : getPropNameByID( dst_hdr, dst->alias );

            progerr("In index %s metaname '%s' is an alias for '%s'(%d).  But another input index already mapped '%s' to '%s'(%d)",
                    in_index->line, src->metaName, src_root->metaName, src_root->metaID,
                    dst->metaName,
                    prior->metaName,
                    dst->alias
                    );
        }
    }

    in_index->meta_map = map;


#ifdef DEBUG_MERGE
    printf(" %s   ->   %s  ** Meta Map **\n", in_index->line, sw_output->indexlist->line );
    for ( n = 0; n < src_hdr->metaCounter + 1; n++ )
        printf("%4d  ->  %3d\n", n, map[n] );
#endif

}

/****************************************************************************
*  load_filename_sort - creates an array for reading in filename order
*
*
*****************************************************************************/

static int  *sorted_data;  /* File-scope so the qsort callback can reach it without extra arguments */

/*
 * qsort-style comparison of two file numbers.  Each argument points at a
 * 1-based file number; the numbers are ordered by the value stored for them
 * in sorted_data[].  Returns -1, 0 or 1.
 */
static int     compnums(const void *s1, const void *s2)
{
    const int  *lhs = (const int *)s1;
    const int  *rhs = (const int *)s2;
    int         rank_l = sorted_data[ *lhs - 1 ];
    int         rank_r = sorted_data[ *rhs - 1 ];

    /* (a > b) - (a < b) yields exactly -1, 0 or 1 */
    return ( rank_l > rank_r ) - ( rank_l < rank_r );
}

/******************************************************************************
* load_filename_sort -
*
*   Creates an array used for sorting file names.
*   Uses the pre-sorted array, if available, otherwise, creates one.
*
*******************************************************************************/

/*
 * Builds cur_index->path_order: an array of the index's file numbers
 * (1-based) arranged in path order, for the merge step that reads files
 * in filename order.
 *
 * Also stores the path and last-modified property entries on cur_index
 * (path_meta, modified_meta) and clears META_IGNORE_CASE on the path
 * property.  Calls progerr() if the index has no path property or if the
 * sorted path data can't be loaded or created.
 *
 *   sw        - not used by this function
 *   cur_index - input index to prepare
 */
static void load_filename_sort( SWISH *sw, IndexFILE *cur_index )
{
    struct metaEntry *path_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_DOCPATH );
    int         i;
    int         *sort_array;
    int         totalfiles = cur_index->header.totalfiles;

    if ( !path_meta )
        progerr("Can't merge index %s.  It doesn't contain the property %s", cur_index->line, AUTOPROPERTY_DOCPATH );


    /* Save for looking up pathname when sorting */
    cur_index->path_meta = path_meta;

    /* Case is important for most OS when comparing file names */
    cur_index->path_meta->metaType &= ~META_IGNORE_CASE;



    /* May be NULL if the index has no last-modified property */
    cur_index->modified_meta = getPropNameByName( &cur_index->header, AUTOPROPERTY_LASTMODIFIED );


    /*
     * Since USE_PRESORT_ARRAY has a different internal format than what is generated
     * by CreatePropSortArray() we must ALWAYS create an actual integer
     * array total_files long.
     * 
     * $$$ The problem is that with USE_PRESORT_ARRAY the format is different
     *     before and after saving the array to disk
     */

#ifdef USE_PRESORT_ARRAY
    if ( 1 )
#else
    if ( !LoadSortedProps( cur_index, path_meta ) )
#endif

    {
        FileRec fi;
        memset( &fi, 0, sizeof( FileRec ));
        path_meta->sorted_data = CreatePropSortArray( cur_index, path_meta, &fi, 1 );
    }


    /* So the qsort compare function can read it */
    sorted_data = path_meta->sorted_data;


    if ( !sorted_data )
        progerr("failed to load or create sorted properties for index %s", cur_index->line );


    /* No need to zero the array: every slot is assigned just below */
    sort_array = emalloc(  totalfiles * sizeof( int ) );


    /* build an array with file numbers and sort into filename order */
    for ( i = 0; i < totalfiles; i++ )
        sort_array[i] = i+1;  // file numbers start at one


    swish_qsort( sort_array, totalfiles, sizeof( int ), &compnums);

    cur_index->path_order = sort_array;

    /* $$$ can this be freed when using BTREE??? */
    efree( path_meta->sorted_data );
    path_meta->sorted_data = NULL;

    /* The file-scope alias pointed at the buffer just freed; don't leave it dangling */
    sorted_data = NULL;
}

/****************************************************************************
*  get_next_file_in_order -- grabs the next file entry from all the indexes
*  in filename (and then modified date) order
*
*
*****************************************************************************/

/* This isn't really accurate, as some other file may come and replace the newer */

/*
 * Prints a "Replaced file" line naming the file that lost (older) and the
 * one that replaced it (newer).  Path and date are decoded from each index's
 * current path property and the supplied date properties; an empty decoded
 * string is shown as a "not defined" placeholder.
 */
static void print_file_removed(IndexFILE *older, propEntry *op, IndexFILE *newer, propEntry *np )
{
    static const char no_name[] = "(file name not defined)";
    static const char no_date[] = "(date not defined)";

    char *old_path = DecodeDocProperty( older->path_meta, older->cur_prop );
    char *old_date = DecodeDocProperty( older->modified_meta, op );
    char *new_path = DecodeDocProperty( newer->path_meta, newer->cur_prop );
    char *new_date = DecodeDocProperty( newer->modified_meta, np );

    printf("Replaced file '%s:%s %s' with '%s:%s %s'\n",
         older->line,
         old_path[0] != '\0' ? old_path : no_name,
         old_date[0] != '\0' ? old_date : no_date,
         newer->line,
         new_path[0] != '\0' ? new_path : no_name,
         new_date[0] != '\0' ? new_date : no_date
    );

    /* The decoded strings are owned here */
    efree( old_path );
    efree( old_date );
    efree( new_path );
    efree( new_date );
}


/*
 * Picks the input index whose next unread file sorts first by path.
 *
 * When two indexes offer the same path, their last-modified properties are
 * compared: the newer one is kept as the candidate and the other index is
 * advanced past its copy (a "Replaced file" line is printed).
 *
 * On return the winning index has ->filenum set to the chosen file number
 * and its ->current_file advanced.  Returns NULL when every index has been
 * fully read.
 */
static IndexFILE *get_next_file_in_order( SWISH *sw_input )
{
    IndexFILE   *winner = NULL;
    IndexFILE   *cur_index;
    FileRec     fi;
    int         ret;
    propEntry   *wp, *cp;

    memset(&fi, 0, sizeof( FileRec ));

    for ( cur_index = sw_input->indexlist; cur_index; cur_index = cur_index->next )
    {
        /* don't use cached props, as they belong to a different index! */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        /* still some to read in this index? */
        if ( cur_index->current_file >= cur_index->header.totalfiles )
            continue;



        /* get file number from lookup table */
        fi.filenum = cur_index->path_order[cur_index->current_file];

        /* The path property is kept across calls until the file is consumed */
        if ( !cur_index->cur_prop )
            cur_index->cur_prop = ReadSingleDocPropertiesFromDisk(cur_index, &fi, cur_index->path_meta->metaID, 0 );


        if ( !winner )
        {
            winner = cur_index;
            continue;
        }

        ret = Compare_Properties( cur_index->path_meta, cur_index->cur_prop, winner->cur_prop );

        if ( ret != 0 )
        {
            if ( ret < 0 )  /* take cur_index if it's smaller */
                winner = cur_index;

            continue;
        }



        /* if they are the same name, then take the newest, and increment the older one */


        /* read the modified time for the current file */
        /* Use the same fi record, because it has the cached prop seek locations */
        cp = ReadSingleDocPropertiesFromDisk(cur_index, &fi, cur_index->modified_meta->metaID, 0 );


        /* read the modified time for the current winner */
        if ( fi.prop_index )
            efree( fi.prop_index );
        memset(&fi, 0, sizeof( FileRec ));

        fi.filenum = winner->path_order[winner->current_file];

        /* Must use the winner's own metaID here: metaIDs are per index, and
         * cur_index's ID may name a different property in the winner */
        wp = ReadSingleDocPropertiesFromDisk(winner, &fi, winner->modified_meta->metaID, 0 );

        ret = Compare_Properties( cur_index->modified_meta, cp, wp );



        /* If current is greater (newer) then throw away winner */
        if ( ret > 0 )
        {
            print_file_removed( winner, wp, cur_index, cp);
            winner->current_file++;
            if ( winner->cur_prop )
                efree( winner->cur_prop );
            winner->cur_prop = NULL;
            winner = cur_index;
        }
        /* else, keep winner, and throw away current */
        else
        {
            print_file_removed(cur_index, cp, winner, wp );
            cur_index->current_file++;
            if ( cur_index->cur_prop )
                efree( cur_index->cur_prop );

            cur_index->cur_prop = NULL;
        }

        freeProperty( cp );
        freeProperty( wp );

    }

    if ( fi.prop_index )
        efree( fi.prop_index );


    if ( !winner )
        return NULL;


    /* Consume the winning file */
    winner->filenum = winner->path_order[winner->current_file++];

#ifdef DEBUG_MERGE
printf("   Files in order: index %s file# %d winner\n", winner->line, winner->filenum );
#endif

    /* free prop, as it's not needed anymore */
    if ( winner->cur_prop )
        efree( winner->cur_prop );
    winner->cur_prop = NULL;


    return winner;
}


/****************************************************************************
*  add_file
*
*  Now, read in filename order (so can throw out duplicates)
*  - read properties and write out to new index
*  - write a temporary file of records to identify
*       - indexfile
*       - old filenum to new filenum mapping
*       - total words per file, if set
****************************************************************************/

/*
 * Adds the file cur_index->filenum of an input index to the output index.
 *
 *   - reads all of the file's properties, re-adds them under the output
 *     index's metaIDs (via cur_index->meta_map) and writes them out
 *   - bumps the output file counters
 *   - appends a record (input file number, input index pointer) to
 *     filenum_map; the position of the record is the new file number
 *   - copies the file's total word count, unless the output header has
 *     ignoreTotalWordCountWhenRanking set
 *
 *   filenum_map - temporary file, open for binary writing
 *   cur_index   - input index; ->filenum selects the file to add
 *   sw_output   - output handle
 */
static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_output )
{
    FileRec             fi;
    IndexFILE           *indexf = sw_output->indexlist;
    struct MOD_Index    *idx = sw_output->Index;
    docProperties       *d;
    docProperties       *mapped_props = NULL;   /* properties keyed by output metaIDs */
    int                 i;
    propEntry           *tmp;
    struct metaEntry    meta_entry;


    /* Only metaName and metaID are filled in below; zero the rest so that
     * addDocProperty() never sees indeterminate fields */
    memset( &meta_entry, 0, sizeof( meta_entry ) );
    meta_entry.metaName = "(default)";  /* for error message, I think */


    memset( &fi, 0, sizeof( FileRec ));


#ifdef DEBUG_MERGE
    printf("Reading Properties from input index '%s' file %d\n", cur_index->line, cur_index->filenum);
#endif

    /* read the properties and map them as needed */
    d = ReadAllDocPropertiesFromDisk( cur_index, cur_index->filenum );


#ifdef DEBUG_MERGE
    fi.docProperties = d;
    dump_file_properties( cur_index, &fi );
#endif



    /* all this off-by-one things are a mess */

    /* read through all the property slots, and map them, as needed */
    if ( d )
        for ( i = 0; i < d->n; i++ )
            if ( (tmp = d->propEntry[i]) )
            {
                meta_entry.metaID = cur_index->meta_map[ i ];
                addDocProperty(&mapped_props, &meta_entry, tmp->propValue, tmp->propLen, 1 );
            }

#ifdef DEBUG_MERGE
    printf(" after mapping file %s\n", indexf->line);
    fi.docProperties = mapped_props;
    dump_file_properties( cur_index, &fi );
    printf("\n");
#endif


    /* Now bump the file counter  */
    idx->filenum++;
    indexf->header.totalfiles++;

    /* Always record the new file number: it is used as an array index below,
     * even when the file turned out to have no properties */
    fi.filenum = idx->filenum;

    if ( mapped_props )
    {
        fi.docProperties = mapped_props;

        WritePropertiesToDisk( sw_output , &fi );
    }

    /* The input properties are no longer needed, whether or not any were mapped */
    if ( d )
        freeDocProperties( d );




    /* now write out the data to be used for mapping file for a given index. */
    //    compress1( cur_index->filenum, filenum_map, fputc );   // what file number this came from

    if ( fwrite( &cur_index->filenum, sizeof(int), 1, filenum_map) != 1 )
        progerrno("Failed to write mapping data: ");

    if ( fwrite( &cur_index, sizeof(IndexFILE *), 1, filenum_map) != 1 )        // what index
        progerrno("Failed to write mapping data: ");


    /* Save total words per file */
    if ( !indexf->header.ignoreTotalWordCountWhenRanking )
    {
        INDEXDATAHEADER *header = &indexf->header;
        int idx1 = fi.filenum - 1;

        /* Grow the per-file table in steps when the new slot doesn't fit */
        if ( !header->TotalWordsPerFile || idx1 >= header->TotalWordsPerFileMax )
        {
            header->TotalWordsPerFileMax += 20000;  /* random guess -- could be a config setting */
            header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
        }

        header->TotalWordsPerFile[idx1] = cur_index->header.TotalWordsPerFile[cur_index->filenum-1];
    }
}

/****************************************************************************
*  get_map
*
*  Builds an old_filenum -> new_filenum map for one input index, so that an
*  old file number can be looked up and mapped to its new file number.
*
*  The mapping file is rewound and read record by record.  Each record is an
*  int (file number in the input index) followed by an IndexFILE pointer
*  (the input index the file came from).  The ordinal position of a record,
*  counting from 1, is the new file number.
*
*  filenum_map - temporary mapping file
*  cur_index   - the input index to build the map for
*
*  Returns an emalloc'ed array of header.totalfiles + 1 ints, indexed by old
*  file number.  An entry of 0 means no record was found for that file.
*  The caller owns the array.
****************************************************************************/

static int *get_map( FILE *filenum_map, IndexFILE *cur_index )
{
    size_t      bytes = (cur_index->header.totalfiles + 1) * sizeof( int );
    int         *array = emalloc( bytes );
    IndexFILE   *idf;
    int         filenum;
    int         new_filenum = 0;


    memset( array, 0, bytes );


    clearerr( filenum_map );
    fseek( filenum_map, 0, SEEK_SET );  /* start at beginning */

    while ( 1 )
    {
        new_filenum++;

        if ( fread( &filenum, sizeof(int), 1, filenum_map ) != 1 )
            break;

        if ( fread( &idf, sizeof(IndexFILE *), 1, filenum_map ) != 1 )
            break;

        if ( idf != cur_index )
            continue;

        /* Never write outside the array, even if the recorded file number */
        /* does not agree with the index header's totalfiles.              */
        if ( filenum >= 0 && filenum <= cur_index->header.totalfiles )
            array[filenum] = new_filenum;
    }

    return array;
}

/****************************************************************************
*  dump_index_words
*
*  Reads an input index to get all of its words and adds each one to the
*  output index with getentry().
*
*  The words are walked once for each of the 256 possible first-byte values:
*  a one-character search string is built, the first matching word is read,
*  and reading continues until a zero wordID is returned.  Each returned
*  word is freed here after it has been handed to getentry().
*
*  A progress line is printed to stdout every 10000 words, and a final
*  total when done.
*
*  sw        - handle used for the DB_* calls on the input index
*  indexf    - input index whose words are read
*  sw_output - output index receiving the words
****************************************************************************/

static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
{
    int         j;
    int         word_count = 0;
    char        word[2];
    char       *resultword;
    sw_off_t    wordID;

    DB_InitReadWords(sw, indexf->DB);


    printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
    fflush(stdout);

    for ( j = 0; j < 256; j++ )
    {
        word[0] = (unsigned char) j;
        word[1] = '\0';

        DB_ReadFirstWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);

        while ( wordID )
        {
            /* Add resultword to output */
            getentry(sw_output, resultword);
            efree(resultword);

            DB_ReadNextWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);
            word_count++;

            /* Was "!word_count % 10000", which parses as (!word_count) % 10000 */
            /* and is never true once word_count is non-zero.                   */
            if ( word_count % 10000 == 0 )
            {
                printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
                fflush(stdout);     /* the line ends in \r, so push it out explicitly */
            }
        }
    }
    printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);

    DB_EndReadWords(sw, indexf->DB);

}

/****************************************************************************
*  write_word_pos
*
*  Writes one word position out to the output index.
*
*  The old file number is translated through file_num_map and the old meta
*  ID through indexf->meta_map.  If either lookup yields 0 the position is
*  dropped; otherwise it is added with addentry(), using the structure and
*  position extracted from posdata.
*
*  indexf       - input index the position came from (supplies meta_map)
*  sw_output    - output index
*  file_num_map - old file number -> new file number (0 = no new number)
*  filenum      - file number in the input index
*  e            - word entry in the output index
*  metaID       - meta ID in the input index
*  posdata      - packed value read with GET_STRUCTURE / GET_POSITION
*
*  With DEBUG_MERGE defined, every step is traced to stdout.
****************************************************************************/

static void write_word_pos( IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, unsigned int posdata )
{
    int         mapped_file;
    int         mapped_meta;

#ifdef DEBUG_MERGE
    printf("\nindex %s '%s' Struct: %d Pos: %d",
    indexf->line, e->word, GET_STRUCTURE(posdata), GET_POSITION(posdata) );
#endif

    /* a zero entry means the file has no new number, so drop the position */
    mapped_file = file_num_map[ filenum ];
    if ( mapped_file == 0 )
    {
#ifdef DEBUG_MERGE
        printf("  file: %d **File deleted!**\n", filenum);
#endif
        return;
    }

    /* likewise, a zero entry means the meta ID has no mapping */
    mapped_meta = indexf->meta_map[ metaID ];
    if ( mapped_meta == 0 )
    {
#ifdef DEBUG_MERGE
        printf("  file: %d **Failed to map meta ID **\n", filenum);
#endif
        return;
    }

#ifdef DEBUG_MERGE
    printf("  File: %d -> %d  Meta: %d -> %d\n", filenum, mapped_file, metaID, mapped_meta );
#endif

    addentry( sw_output, e, mapped_file, GET_STRUCTURE(posdata), mapped_meta, GET_POSITION(posdata) );
}