File: CrawlHistory.h

package info (click to toggle)
pinot 0.85-1
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 5,524 kB
  • ctags: 3,868
  • sloc: cpp: 33,107; sh: 8,801; ansic: 3,049; makefile: 557; xml: 366; python: 250
file content (101 lines) | stat: -rw-r--r-- 3,206 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
 *  Copyright 2005-2008 Fabrice Colin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef _CRAWL_HISTORY_H
#define _CRAWL_HISTORY_H

#include <time.h>
#include <string>
#include <map>
#include <set>

#include "SQLiteBase.h"

/// Manages crawl history.
/// Built on top of SQLiteBase. Instances cannot be copied by outside code:
/// the copy constructor and the assignment operator are declared private.
class CrawlHistory : public SQLiteBase
{
	public:
		/// Constructs a history object for the named database.
		CrawlHistory(const std::string &database);
		virtual ~CrawlHistory();

		/// Status a crawled item can be recorded with.
		enum CrawlStatus { UNKNOWN, CRAWLING, CRAWLED, ERROR };

		/// Creates the CrawlHistory table in the database.
		/// Static, so no CrawlHistory instance is needed to call it.
		/// NOTE(review): the bool presumably signals success - confirm in the definition.
		static bool create(const std::string &database);

		/// Inserts a source identified by its URL.
		/// NOTE(review): the returned value is presumably the new source's ID - confirm.
		unsigned int insertSource(const std::string &url);

		/// Checks if the source exists.
		/// sourceId is a non-const reference; presumably it receives the ID
		/// of the source when one is found - confirm in the definition.
		bool hasSource(const std::string &url, unsigned int &sourceId);

		/// Returns sources through the sources map.
		/// NOTE(review): the map presumably goes from source ID to URL and the
		/// return value is presumably the number of sources - confirm.
		unsigned int getSources(std::map<unsigned int, std::string> &sources);

		/// Deletes the source with the given ID.
		bool deleteSource(unsigned int sourceId);

		/// Inserts an URL, along with its status, source, date and error number.
		/// errNum may be omitted, in which case it is 0.
		bool insertItem(const std::string &url,
			CrawlStatus status,
			unsigned int sourceId,
			time_t date,
			int errNum = 0);

		/// Checks if an URL is in the history.
		/// status and date are non-const references; presumably they receive
		/// the values recorded for the URL - confirm in the definition.
		bool hasItem(const std::string &url, CrawlStatus &status, time_t &date);

		/// Updates an URL's status, date and error number.
		/// errNum may be omitted, in which case it is 0.
		bool updateItem(const std::string &url,
			CrawlStatus status,
			time_t date,
			int errNum = 0);

		/// Updates several URLs, all with the same status.
		/// NOTE(review): urls is taken by value, so the map is copied on every
		/// call; a const reference would avoid that but requires changing the
		/// definition as well.
		bool updateItems(const std::map<std::string, time_t> urls, CrawlStatus status);

		/// Updates the status of items en masse: works on a source's items,
		/// going from currentStatus to newStatus.
		bool updateItemsStatus(unsigned int sourceId,
			CrawlStatus currentStatus,
			CrawlStatus newStatus);

		/// Gets the error number and date for a URL.
		/// The date comes back through the date reference; the error number
		/// is presumably the return value - confirm in the definition.
		int getErrorDetails(const std::string &url, time_t &date);

		/// Returns items that belong to a source, through the urls set.
		/// minDate may be omitted, in which case it is 0.
		/// NOTE(review): the return value is presumably the number of items - confirm.
		unsigned int getSourceItems(unsigned int sourceId,
			CrawlStatus status,
			std::set<std::string> &urls,
			time_t minDate = 0);

		/// Returns the number of URLs, for the given status.
		unsigned int getItemsCount(CrawlStatus status);

		/// Deletes an URL.
		bool deleteItem(const std::string &url);

		/// Deletes all items under a given URL.
		bool deleteItems(const std::string &url);

		/// Deletes URLs belonging to a source.
		/// status may be omitted, in which case it is UNKNOWN.
		/// NOTE(review): UNKNOWN presumably means "whatever the status" - confirm.
		bool deleteItems(unsigned int sourceId, CrawlStatus status = UNKNOWN);

		/// Expires items older than the given date.
		bool expireItems(time_t expiryDate);

	protected:
		/// Converts a status to text.
		static std::string statusToText(CrawlStatus status);

		/// Converts text to a status.
		static CrawlStatus textToStatus(const std::string &text);

	private:
		// Private copy operations: the class is not copyable from outside.
		CrawlHistory(const CrawlHistory &other);
		CrawlHistory &operator=(const CrawlHistory &other);

};

#endif // _CRAWL_HISTORY_H