File: ebook_search.cpp

package info (click to toggle)
kchmviewer 8.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,900 kB
  • sloc: cpp: 8,099; sh: 145; makefile: 3
file content (225 lines) | stat: -rw-r--r-- 5,383 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
/*
 *  Kchmviewer - a CHM and EPUB file viewer with broad language support
 *  Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <QApplication>

#include "ebook.h"
#include "ebook_search.h"

// Helper class to simplify state management and data keeping while a search query is parsed.
// It collects the terms and quoted phrases and remembers whether parsing is currently inside a phrase.
class SearchDataKeeper
{
	public:
		SearchDataKeeper() : m_inPhrase( false ) {}

		// True between a beginPhrase() call and the following endPhrase() call
		bool isInPhrase() const { return m_inPhrase; }

		// Opens a new phrase, discarding the terms collected for the previous one
		void beginPhrase()
		{
			m_inPhrase = true;
			phrase_terms.clear();
		}

		// Closes the current phrase: stores it as a single space-joined string
		// and appends its individual terms to the list of phrase words
		void endPhrase()
		{
			phrases.push_back( phrase_terms.join(" ") );
			phrasewords += phrase_terms;
			m_inPhrase = false;
		}

		// Records a term; empty terms are ignored. While inside a phrase
		// the term is also remembered as a part of that phrase.
		void addTerm( const QString& term )
		{
			if ( term.isEmpty() )
				return;

			terms.push_back( term );

			if ( m_inPhrase )
				phrase_terms.push_back( term );
		}

		// Should contain all the search terms present in query, including those from phrases. One element - one term.
		QStringList terms;

		// Should contain phrases present in query without quotes. One element - one phrase.
		QStringList phrases;

		// Should contain all the terms present in all the phrases (but not outside).
		QStringList phrasewords;

	private:
		// Set while the parser is between an opening and a closing quote
		bool		m_inPhrase;

		// Terms of the phrase which is being collected at the moment
		QStringList phrase_terms;
};



// Constructs the search object with no index attached yet
EBookSearch::EBookSearch()
	: m_Index( 0 )
{
}


// Destroys the index, if one was created; deleting a null pointer is a no-op
EBookSearch::~EBookSearch()
{
	delete m_Index;
}


// Replaces the current index with a new one read from the stream.
// Returns true if QtAs::Index::readDict() succeeded. On failure the new index
// is discarded and m_Index is reset to 0, so hasIndex() does not report an index
// which could not be read; this mirrors what generateIndex() does on failure.
bool EBookSearch::loadIndex( QDataStream & stream )
{
	delete m_Index;
	m_Index = new QtAs::Index();

	if ( !m_Index->readDict( stream ) )
	{
		// Do not keep an index whose content could not be read
		delete m_Index;
		m_Index = 0;
		return false;
	}

	return true;
}


// Builds a new index for the HTML documents of the book and writes it into the stream.
// Returns false if the book files cannot be enumerated (the existing index is kept then)
// or if indexing fails (the index is destroyed then); returns true otherwise.
bool EBookSearch::generateIndex( EBook * ebookFile, QDataStream & stream )
{
	emit progressStep( 0, "Generating the list of documents" );
	processEvents();

	// Get the list of everything stored in the book
	QList< QUrl > allFiles;

	if ( !ebookFile->enumerateFiles( allFiles ) )
		return false;

	// Start with a fresh index, and forward its progress reports through updateProgress()
	delete m_Index;
	m_Index = new QtAs::Index();
	connect( m_Index, SIGNAL( indexingProgress( int, const QString& ) ), this, SLOT( updateProgress( int, const QString& ) ) );

	// Only the HTML documents are indexed; everything else is skipped
	QList< QUrl > htmlFiles;

	for ( QList< QUrl >::const_iterator it = allFiles.constBegin(); it != allFiles.constEnd(); ++it )
	{
		const QString path = it->path();

		const bool isHtml = path.endsWith( ".html", Qt::CaseInsensitive )
				|| path.endsWith( ".htm", Qt::CaseInsensitive )
				|| path.endsWith( ".xhtml", Qt::CaseInsensitive );

		if ( isHtml )
			htmlFiles.push_back( *it );
	}

	if ( m_Index->makeIndex( htmlFiles, ebookFile ) )
	{
		m_Index->writeDict( stream );
		m_keywordDocuments.clear();
		return true;
	}

	// Indexing failed, so there is no usable index anymore
	delete m_Index;
	m_Index = 0;
	return false;
}


// Passes the cancel request to the index by calling its setLastWinClosed().
// Does nothing when there is no index: m_Index is 0 after construction and
// after a failed generateIndex(), and must not be dereferenced then.
void EBookSearch::cancelIndexGeneration()
{
	if ( !m_Index )
		return;

	m_Index->setLastWinClosed();
}


// Slot which re-emits the received progress value and step name as the progressStep() signal
void EBookSearch::updateProgress( int value, const QString & stepName )
{
	emit progressStep( value, stepName );
}

// Lets the application handle pending events, except the user input ones
void EBookSearch::processEvents()
{
	// Handling an event may post further events, hence up to ten passes
	int passesLeft = 10;

	while ( passesLeft-- > 0 )
		qApp->processEvents( QEventLoop::ExcludeUserInputEvents );
}

// Splits the query into terms and quoted phrases, runs it against the index and
// appends at most 'limit' found documents to 'results'.
//
// The query is lowercased character by character. A double quote begins or ends
// a phrase; letters, digits and the index "part of word" characters build up a term;
// an index "split" character ends the current term and becomes a term of its own;
// any other character just ends the current term.
//
// Returns false if there is no index, if 'results' is null, or if the query has
// an unterminated quote; returns true otherwise, even when nothing was found.
bool EBookSearch::searchQuery(const QString & query, QList< QUrl > * results, EBook *ebookFile, unsigned int limit)
{
	// We should have index, and a place to store the results
	if ( !m_Index || !results )
		return false;
	
	// Characters which split the words. We need to make them separate tokens
	QString splitChars = m_Index->getCharsSplit();
	
	// Characters which are part of the word. We should keep them apart.
	QString partOfWordChars = m_Index->getCharsPartOfWord();
	
	// Variables to store current state
	SearchDataKeeper keeper;	
	QString term;

	for ( int i = 0; i < query.length(); i++ )
	{
		QChar ch = query[i].toLower();
		
		// a quote either begins or ends the phrase
		if ( ch == '"' )
		{
			// The quote also terminates the word before it. The term must be reset
			// once it is added; otherwise it would be added a second time later
			// (or glued to the word which follows the quote).
			keeper.addTerm( term );
			term.clear();
			
			if ( keeper.isInPhrase() )
				keeper.endPhrase();
			else
				keeper.beginPhrase();

			continue;
		}
		
		// If new char does not stop the word, add it and continue
		if ( ch.isLetterOrNumber() || partOfWordChars.indexOf( ch ) != -1 )
		{
			term.append( ch );
			continue;
		}
		
		// If it is a split char, add this term and split char as separate term
		if ( splitChars.indexOf( ch ) != -1 )
		{
			// Add existing term if present
			keeper.addTerm( term );
			
			// Change the term variable, so it will be added when we exit this block
			term = ch;
		}

		// Just add the word; it is most likely a space or terminated by tokenizer.
		keeper.addTerm( term );
		term.clear();
	}
	
	// Add the last word of the query, if any
	keeper.addTerm( term );
	
	// A phrase which is still open means the quotes in the query are unbalanced
	if ( keeper.isInPhrase() )
		return false;
	
	QList< QUrl > foundDocs = m_Index->query( keeper.terms, keeper.phrases, keeper.phrasewords, ebookFile );
	
	// Copy the found documents, but no more than the caller asked for
	for ( QList< QUrl >::iterator it = foundDocs.begin(); it != foundDocs.end() && limit > 0; ++it, limit-- )
		results->push_back( *it );

	return true;
}

// Tells whether an index object currently exists (m_Index is not null)
bool EBookSearch::hasIndex() const
{
	if ( m_Index )
		return true;

	return false;
}