1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
Author: <gregor+debian@comodo.priv.at>
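Description: Work around the missing Lucene demo classes (org.apache.lucene.demo.HTMLDocument) by using org.apache.lucene.ant.HtmlDocument and local uid()/uid2url() helpers.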
--- libpdfbox-java.orig/src/org/pdfbox/searchengine/lucene/IndexFiles.java
+++ libpdfbox-java/src/org/pdfbox/searchengine/lucene/IndexFiles.java
@@ -61,9 +61,9 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.demo.HTMLDocument;
-
+import org.apache.lucene.ant.HtmlDocument;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DateField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -207,7 +207,7 @@
while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) )
{
System.out.println("deleting " +
- HTMLDocument.uid2url(uidIter.term().text()));
+ uid2url(uidIter.term().text()));
reader.deleteDocuments(uidIter.term());
uidIter.next();
}
@@ -240,7 +240,7 @@
{
if (uidIter != null)
{
- String uid = HTMLDocument.uid(file); // construct uid for doc
+ String uid = uid(file); // construct uid for doc
while( uidIter.term() != null &&
uidIter.term().field().equals( "uid" ) &&
@@ -249,7 +249,7 @@
if (deleting)
{ // delete stale docs
System.out.println("deleting " +
- HTMLDocument.uid2url(uidIter.term().text()));
+ uid2url(uidIter.term().text()));
reader.deleteDocuments(uidIter.term());
}
uidIter.next();
@@ -287,7 +287,7 @@
path.endsWith(".TXT"))
{
System.out.println( "Indexing Text document: " + file );
- doc = HTMLDocument.Document(file);
+ doc = HtmlDocument.Document(file);
}
else if( path.endsWith( ".PDF" ) )
{
@@ -304,4 +304,47 @@
writer.addDocument(doc);
}
}
-}
\ No newline at end of file
+
+
+ /*
+ * The following 2 methods are adapted from the
+ * org.apache.lucene.demo.HTMLDocument class shipped with
+ * Lucene 1.4.3; uid2url() additionally tolerates a uid that
+ * contains no separator.
+ */
+
+ /** Platform directory separator, replaced by nulls when building a uid. */
+ private static final char dirSep = System.getProperty("file.separator").charAt(0);
+
+ /**
+ * Builds the index uid of a file from its path and last-modified time.
+ *
+ * @param f the file to identify
+ * @return the path with directory separators replaced by nulls, followed
+ * by a null and the last-modified time encoded by DateField
+ */
+ private static String uid(File f) {
+ // Append path and date into a string in such a way that lexicographic
+ // sorting gives the same results as a walk of the file hierarchy. Thus
+ // null (\u0000) is used both to separate directory components and to
+ // separate the path from the date.
+ return f.getPath().replace(dirSep, '\u0000') +
+ "\u0000" +
+ DateField.timeToString(f.lastModified());
+ }
+
+ /**
+ * Converts a uid back into a printable path.
+ *
+ * @param uid a uid as produced by uid(File)
+ * @return the uid with nulls replaced by slashes and the trailing date
+ * component removed; the whole converted string if it has no slash
+ */
+ private static String uid2url(String uid) {
+ String url = uid.replace('\u0000', '/'); // replace nulls with slashes
+ int dateSep = url.lastIndexOf('/'); // slash preceding the date
+ // A uid without any separator has no date part to strip; return it
+ // unchanged instead of failing in substring(0, -1).
+ return dateSep < 0 ? url : url.substring(0, dateSep);
+ }
+}
|