File: Zend_Search_Lucene-IndexCreation.xml

package info (click to toggle)
zendframework 1.12.9%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie-kfreebsd
  • size: 133,584 kB
  • sloc: xml: 1,311,829; php: 570,173; sh: 170; makefile: 125; sql: 121
file content (346 lines) | stat: -rw-r--r-- 13,338 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
<?xml version="1.0" encoding="UTF-8"?>
<!-- Reviewed: no -->
<sect1 id="zend.search.lucene.index-creation">
    <title>Building Indexes</title>

    <sect2 id="zend.search.lucene.index-creation.creating">
        <title>Creating a New Index</title>

        <para>
            Index creation and updating capabilities are implemented within the
            <classname>Zend_Search_Lucene</classname> component, as well as the Java Lucene project.
            You can use either of these options to create indexes that
            <classname>Zend_Search_Lucene</classname> can search.
        </para>

        <para>
            The <acronym>PHP</acronym> code listing below provides an example of how to index a file
            using the <classname>Zend_Search_Lucene</classname> indexing <acronym>API</acronym>:
        </para>

        <programlisting language="php"><![CDATA[
// Create index
$index = Zend_Search_Lucene::create('/data/my-index');

$doc = new Zend_Search_Lucene_Document();

// Store document URL to identify it in the search results
$doc->addField(Zend_Search_Lucene_Field::Text('url', $docUrl));

// Index document contents
$doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $docContent));

// Add document to the index
$index->addDocument($doc);
]]></programlisting>

        <para>
            Newly added documents are immediately searchable in the index.
        </para>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.updating">
        <title>Updating Index</title>

        <para>
            The same procedure is used to update an existing index. The only difference
            is that the open() method is called instead of the create() method:
        </para>

        <programlisting language="php"><![CDATA[
// Open existing index
$index = Zend_Search_Lucene::open('/data/my-index');

$doc = new Zend_Search_Lucene_Document();
// Store document URL to identify it in search result.
$doc->addField(Zend_Search_Lucene_Field::Text('url', $docUrl));
// Index document content
$doc->addField(Zend_Search_Lucene_Field::UnStored('contents',
                                                  $docContent));

// Add document to the index.
$index->addDocument($doc);
]]></programlisting>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.document-updating">
        <title>Updating Documents</title>

        <para>
            The Lucene index file format doesn't support document updating.
            Documents should be removed and re-added to the index to effectively update them.
        </para>

        <para>
            The <methodname>Zend_Search_Lucene::delete()</methodname> method operates on an
            internal index document id, which can be retrieved from a query hit via its 'id' property:
        </para>

        <programlisting language="php"><![CDATA[
$removePath = ...;
$hits = $index->find('path:' . $removePath);
foreach ($hits as $hit) {
    $index->delete($hit->id);
}
]]></programlisting>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.counting">
        <title>Retrieving Index Size</title>

        <para>
            There are two methods to retrieve the size of an index in
            <classname>Zend_Search_Lucene</classname>.
        </para>

        <para>
             <methodname>Zend_Search_Lucene::maxDoc()</methodname> returns one greater than the
             largest possible document number. It's actually the overall number of the documents in
             the index including deleted documents, so it has a synonym:
             <methodname>Zend_Search_Lucene::count()</methodname>.
        </para>

        <para>
             <methodname>Zend_Search_Lucene::numDocs()</methodname> returns the total number of
             non-deleted documents.
        </para>

        <programlisting language="php"><![CDATA[
$indexSize = $index->count();
$documents = $index->numDocs();
]]></programlisting>

        <para>
            <methodname>Zend_Search_Lucene::isDeleted($id)</methodname> method may be used to check
            if a document is deleted.
        </para>

        <programlisting language="php"><![CDATA[
for ($count = 0; $count < $index->maxDoc(); $count++) {
    if ($index->isDeleted($count)) {
        echo "Document #$count is deleted.\n";
    }
}
]]></programlisting>

        <para>
            Index optimization removes deleted documents and squeezes documents' IDs into a smaller
            range. A document's internal id may therefore change during index optimization.
        </para>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.optimization">
        <title>Index optimization</title>

        <para>
            A Lucene index consists of many segments. Each segment is a completely independent set
            of data.
        </para>

        <para>
            Lucene index segment files can't be updated by design. A segment update needs full
            segment reorganization. See Lucene index file formats for details (<ulink
                url="http://lucene.apache.org/java/2_3_0/fileformats.html">http://lucene.apache.org/java/2_3_0/fileformats.html</ulink>)

            <footnote>
                <para>
                    The currently supported Lucene index file format is version 2.3 (starting from
                    Zend Framework 1.6).
                </para>
            </footnote>.

            New documents are added to the index by creating a new segment.
        </para>

        <para>
            An increasing number of segments reduces the quality of the index, but index
            optimization restores it. Optimization essentially merges several segments into a new
            one. This process also doesn't update segments. It generates one new large segment and
            updates the segment list ('segments' file).
        </para>

        <para>
            Full index optimization can be triggered by calling the
            <methodname>Zend_Search_Lucene::optimize()</methodname> method. It merges all index
            segments into one new segment:
        </para>

        <programlisting language="php"><![CDATA[
// Open existing index
$index = Zend_Search_Lucene::open('/data/my-index');

// Optimize index.
$index->optimize();
]]></programlisting>

        <para>
            Automatic index optimization is performed to keep indexes in a consistent state.
        </para>

        <para>
            Automatic optimization is an iterative process managed by several index options. It
            merges very small segments into larger ones, then merges these larger segments into even
            larger segments and so on.
        </para>

        <sect3 id="zend.search.lucene.index-creation.optimization.maxbuffereddocs">
            <title>MaxBufferedDocs auto-optimization option</title>

            <para>
                <emphasis>MaxBufferedDocs</emphasis> is the minimum number of documents required
                before the buffered in-memory documents are written into a new segment.
            </para>

            <para>
                <emphasis>MaxBufferedDocs</emphasis> can be retrieved or set by
                <code>$index->getMaxBufferedDocs()</code> or
                <code>$index->setMaxBufferedDocs($maxBufferedDocs)</code> calls.
            </para>

            <para>
                Default value is 10.
            </para>
        </sect3>

        <sect3 id="zend.search.lucene.index-creation.optimization.maxmergedocs">
            <title>MaxMergeDocs auto-optimization option</title>

            <para>
                <emphasis>MaxMergeDocs</emphasis> is the largest number of documents ever merged by
                addDocument(). Small values (e.g., less than 10,000) are best for interactive
                indexing, as this limits the length of pauses while indexing to a few seconds.
                Larger values are best for batched indexing and speedier searches.
            </para>

            <para>
                <emphasis>MaxMergeDocs</emphasis> can be retrieved or set by
                <code>$index->getMaxMergeDocs()</code> or
                <code>$index->setMaxMergeDocs($maxMergeDocs)</code> calls.
            </para>

            <para>
                Default value is PHP_INT_MAX.
            </para>
        </sect3>

        <sect3 id="zend.search.lucene.index-creation.optimization.mergefactor">
            <title>MergeFactor auto-optimization option</title>

            <para>
                <emphasis>MergeFactor</emphasis> determines how often segment indices are merged by
                addDocument(). With smaller values, less <acronym>RAM</acronym> is used while
                indexing, and searches on unoptimized indices are faster, but indexing speed is
                slower. With larger values, more <acronym>RAM</acronym> is used during indexing, and
                while searches on unoptimized indices are slower, indexing is faster. Thus larger
                values (&gt; 10) are best for batch index creation, and smaller values (&lt; 10) for
                indices that are interactively maintained.
            </para>

            <para>
                <emphasis>MergeFactor</emphasis> is a good estimate of the average number of
                segments merged by one auto-optimization pass. Values that are too large produce a
                large number of segments before they are merged into a new one. This may cause a
                "failed to open stream: Too many open files" error message. This limitation is
                system dependent.
            </para>

            <para>
                <emphasis>MergeFactor</emphasis> can be retrieved or set by
                <code>$index->getMergeFactor()</code> or
                <code>$index->setMergeFactor($mergeFactor)</code> calls.
            </para>

            <para>
                Default value is 10.
            </para>

            <para>
                Lucene Java and Luke (Lucene Index Toolbox - <ulink
                    url="http://www.getopt.org/luke/">http://www.getopt.org/luke/</ulink>) can also
                be used to optimize an index. The latest Luke release (v0.8) is based on Lucene v2.3
                and is compatible with the current implementation of the
                <classname>Zend_Search_Lucene</classname> component (Zend Framework 1.6). Earlier
                versions of <classname>Zend_Search_Lucene</classname> need other versions of the
                Java Lucene tools to be compatible:

                <itemizedlist>
                    <listitem>
                        <para>
                            Zend Framework 1.5 - Java Lucene 2.1 (Luke tool v0.7.1 - <ulink
                                url="http://www.getopt.org/luke/luke-0.7.1/"/>)
                        </para>
                    </listitem>

                    <listitem>
                        <para>
                            Zend Framework 1.0 - Java Lucene 1.4 - 2.1 (Luke tool v0.6 - <ulink
                                url="http://www.getopt.org/luke/luke-0.6/"/>)
                        </para>
                    </listitem>
                </itemizedlist>
            </para>
        </sect3>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.permissions">
        <title>Permissions</title>

        <para>
            By default, index files are available for reading and writing by everyone.
        </para>

        <para>
            It's possible to override this with the
            <methodname>Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions()</methodname>
            method:
        </para>

        <programlisting language="php"><![CDATA[
// Get current default file permissions
$currentPermissions =
    Zend_Search_Lucene_Storage_Directory_Filesystem::getDefaultFilePermissions();

// Give read-writing permissions only for current user and group
Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0660);
]]></programlisting>
    </sect2>

    <sect2 id="zend.search.lucene.index-creation.limitations">
        <title>Limitations</title>

        <sect3 id="zend.search.lucene.index-creation.limitations.index-size">
            <title>Index size</title>

            <para>
                Index size is limited to 2GB on 32-bit platforms.
            </para>

            <para>
                Use 64-bit platforms for larger indices.
            </para>
        </sect3>

        <sect3 id="zend.search.lucene.index-creation.limitations.filesystems">
            <title>Supported Filesystems</title>

            <para>
                <classname>Zend_Search_Lucene</classname> uses <methodname>flock()</methodname> to
                provide concurrent searching, index updating and optimization.
            </para>

            <para>
                According to the <acronym>PHP</acronym> <ulink
                    url="http://www.php.net/manual/en/function.flock.php">documentation</ulink>,
                "<methodname>flock()</methodname> will not work on NFS and many other networked file
                systems".
            </para>

            <para>
                Do not use networked file systems with <classname>Zend_Search_Lucene</classname>.
            </para>
        </sect3>
    </sect2>
</sect1>
<!--
vim:se ts=4 sw=4 et:
-->