File: TextCollector.h

package info (click to toggle)
htdig 1%3A3.2.0b6-21
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 21,292 kB
sloc: ansic: 49,632; cpp: 46,468; sh: 17,400; xml: 4,180; perl: 2,543; makefile: 888; php: 79; asm: 14
file content (133 lines) | stat: -rw-r--r-- 3,600 bytes
//--------------------------------------------------------------------
//
// TextCollector.h
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
//            General Purpose Text Document Indexer.
//            Calls appropriate parsers. 
//            The parser notifies the TextCollector object that it got something
//            (got_* functions) and the TextCollector object feeds the databases
//            and statistics accordingly.
//
// Part of the ht://Dig package   <https://htdig.sourceforge.net/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------


#ifndef _TextCollector_h_
#define _TextCollector_h_

#include "BasicDocument.h"
#include "DocumentRef.h"
#include "Dictionary.h"
#include "Queue.h"
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
#include "DocumentDB.h"

//
// Forward declarations.
//
// Document is not otherwise named in this header.
// HtWordList is stored by value as the `words` member of TextCollector,
// which requires the complete type at that point.
// NOTE(review): presumably its definition is provided by one of the
// headers included above or by the including source file -- confirm.
//
class Document;
class HtWordList;

//
// Logging mode accepted by the TextCollector constructor.
// The enumerators are numbered consecutively from zero; the values are
// spelled out so the numbering is visible at a glance.
//
enum TextCollectorLog
{
    TextCollector_noLog   = 0,   // default argument of the constructor
    TextCollector_logUrl  = 1,
    TextCollector_Restart = 2
};

//
// TextCollector
//
// General purpose text document indexer.  A parser reports what it finds
// in a document through the got_*() callbacks declared below.  Only the
// declaration lives in this header; the member functions are defined
// elsewhere, so the notes here describe the interface as declared and
// mark anything that depends on the implementation as an assumption.
//
// NOTE(review): no copy constructor or assignment operator is declared,
// while the class holds raw pointers (base, current_ref, doc, d_md5).
// Ownership of those pointers is not visible from this header, so treat
// copying a TextCollector as unsafe until confirmed.
//
class TextCollector
{
    public:
        //
        // Construction/Destruction
        //
        // `flags` selects the logging mode and defaults to
        // TextCollector_noLog.  The destructor is virtual, so the class
        // can be deleted through a base pointer when derived from.
        //
        			TextCollector(TextCollectorLog flags = TextCollector_noLog);
        virtual		~TextCollector();
    
        //
        // Entry points for indexing.  Both return an int.
        // NOTE(review): the meaning of the return values (status code,
        // count, ...) is not visible in this header -- confirm in the
        // implementation.
        //
        int        IndexDoc(BasicDocument & adoc);
        int        FlushWordDB();
    
        //
        // Report statistics about the parser
        //
        void		ReportStatistics(const String& name);
    	
        //
        // Callbacks through which a parser hands over what it found in
        // the document (see the file header comment).
        //
        // got_word: a word, with its location and heading level.
        // got_href: a link with its description; `hops` defaults to 1.
        // The remaining callbacks each take a single string, except
        // got_noindex(), which takes no argument.
        //
        void		got_word(const char *word, int location, int heading);
        void		got_href(URL &url, const char *description, int hops = 1);
        void		got_title(const char *title);
        void		got_time(const char *time);
        void		got_head(const char *head);
        void		got_meta_dsc(const char *md);
        void		got_anchor(const char *anchor);
        void		got_image(const char *src);
        void		got_meta_email(const char *);
        void		got_meta_notification(const char *);
        void		got_meta_subject(const char *);
        void                got_noindex();
    
    
    private:
        //
        // A hash to keep track of what we've seen
        //
        Dictionary		visited;
        
        //
        // State kept while a document is being processed.
        // NOTE(review): presumably the current_* members are filled in by
        // the matching got_*() callbacks -- confirm in the implementation.
        // The declaration order below also fixes the member
        // initialization order, so do not reorder without checking the
        // constructor.
        //
        URL			*base;
        String		current_title;
        String		current_head;
        String		current_meta_dsc;
        time_t		current_time;
        int			current_id;
        DocumentRef		*current_ref;
        int			current_anchor_number;
        int			trackWords;
        int			n_links;
        HtWordReference	word_context;
        // Held by value: needs the complete HtWordList type, not just the
        // forward declaration.
        HtWordList		words;
    	
        // NOTE(review): these look like switches for duplicate detection
        // by MD5 and by date -- confirm against the implementation.
        int			check_unique_md5;
        int			check_unique_date;
    
    
        // Logging mode.
        // NOTE(review): presumably set from the constructor's `flags`
        // argument -- confirm.
        TextCollectorLog log;
        //
        // These are weights for the words.  The index is the heading level.
        // The array has 11 entries, i.e. valid indices are 0..10.
        //
        long int		factor[11];
        int			currenthopcount;
    
        //
        // For efficiency reasons, we will only use one document object which
        // we reuse.
        //
        BasicDocument		*doc;
    
        // NOTE(review): presumably the database backing the MD5 uniqueness
        // check (see check_unique_md5) -- confirm.
        Database 		*d_md5;
    
        // Some useful constants
        int              minimumWordLength;
    
        //
        // Helper routines
        //
        void		RetrievedDocument(DocumentRef *ref);
    
        // NOTE(review): purpose not visible from this header; the name
        // suggests a running count of documents -- confirm.
        int      temp_doc_count;
};

#endif