//--------------------------------------------------------------------
//
// TextCollector.h
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
// General Purpose Text Document Indexer.
// Calls appropriate parsers.
// The parser notifies the TextCollector object that it got something
// (got_* functions) and the TextCollector object feeds the databases
// and statistics accordingly.
//
// Part of the ht://Dig package <https://htdig.sourceforge.net/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------
#ifndef _TextCollector_h_
#define _TextCollector_h_
#include "BasicDocument.h"
#include "DocumentRef.h"
#include "Dictionary.h"
#include "Queue.h"
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
#include "DocumentDB.h"
//
// Forward declarations.
//
// Document is not referenced by any declaration visible in this header.
// NOTE(review): possibly a leftover -- confirm before removing.
//
class Document;
//
// HtWordList is the type of the TextCollector::words data member.
// NOTE(review): that member is held by value, which requires the complete
// HtWordList type at the point of the class definition; presumably one of
// the headers included above supplies it -- confirm.
//
class HtWordList;
//
// Logging mode handed to the TextCollector constructor, which uses
// TextCollector_noLog as its default argument.
// The numeric values are written out explicitly; they are the same values
// the compiler assigns when no initializers are given.
//
enum TextCollectorLog {
    TextCollector_noLog   = 0,
    TextCollector_logUrl  = 1,
    TextCollector_Restart = 2
};
//
// TextCollector
//
// Text document indexer front end. Per the file header, a parser notifies
// this object through the got_* callbacks and the object feeds the
// databases and statistics accordingly. Only declarations are visible
// here; the member function definitions live elsewhere.
//
// NOTE(review): the class stores raw pointers (base, current_ref, doc,
// d_md5) and declares neither a copy constructor nor an assignment
// operator, so the compiler-generated copies would duplicate the pointer
// values. Ownership is not visible in this header -- confirm before
// copying instances.
//
class TextCollector
{
public:
//
// Construction/Destruction
//
// flags defaults to TextCollector_noLog. The constructor is not declared
// explicit, so a TextCollectorLog value converts implicitly to a
// TextCollector.
//
TextCollector(TextCollectorLog flags = TextCollector_noLog);
virtual ~TextCollector();
//
// Presumably indexes the given document and flushes the word database
// respectively; the meaning of the int results is defined by the
// implementation -- TODO confirm.
//
int IndexDoc(BasicDocument & adoc);
int FlushWordDB();
//
// Report statistics about the parser
//
void ReportStatistics(const String& name);
//
// These are the callbacks that we need to write code for
//
// Each got_* function is invoked by a parser to report one item it found
// (see the file header). got_href's hops argument defaults to 1.
// NOTE(review): got_word's heading argument presumably selects an entry
// of factor[] (weights indexed by heading level) -- confirm.
//
void got_word(const char *word, int location, int heading);
void got_href(URL &url, const char *description, int hops = 1);
void got_title(const char *title);
void got_time(const char *time);
void got_head(const char *head);
void got_meta_dsc(const char *md);
void got_anchor(const char *anchor);
void got_image(const char *src);
void got_meta_email(const char *);
void got_meta_notification(const char *);
void got_meta_subject(const char *);
void got_noindex();
private:
//
// A hash to keep track of what we've seen
//
Dictionary visited;
//
// State for the document currently being processed (as the current_*
// names suggest; how each field is filled is not visible in this header).
// NOTE(review): time_t is not declared by any header included directly
// here; presumably it arrives through one of the project headers --
// confirm.
//
URL *base;
String current_title;
String current_head;
String current_meta_dsc;
time_t current_time;
int current_id;
DocumentRef *current_ref;
int current_anchor_number;
int trackWords;
int n_links;
HtWordReference word_context;
//
// Held by value, so the complete HtWordList type must be known here.
//
HtWordList words;
int check_unique_md5;
int check_unique_date;
TextCollectorLog log;
//
// These are weights for the words. The index is the heading level.
// The array has 11 entries, so valid indices are 0 through 10.
//
long int factor[11];
int currenthopcount;
//
// For efficiency reasons, we will only use one document object which
// we reuse.
//
BasicDocument *doc;
Database *d_md5;
// Some useful constants
int minimumWordLength;
//
// Helper routines
//
void RetrievedDocument(DocumentRef *ref);
// NOTE(review): data member declared after the helper routine; its purpose
// is not visible in this header.
int temp_doc_count;
};
#endif