File: 05_lucenedemo.patch

package info (click to toggle)
libpdfbox-java 1%3A0.7.3%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 27,316 kB
  • ctags: 18,061
  • sloc: java: 38,032; xml: 2,650; sh: 48; jsp: 27; makefile: 12
file content (82 lines) | stat: -rw-r--r-- 3,195 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
Author: <gregor+debian@comodo.priv.at>
Description: Work around the missing Lucene demo classes by using org.apache.lucene.ant.HtmlDocument and local copies of uid()/uid2url().
--- libpdfbox-java.orig/src/org/pdfbox/searchengine/lucene/IndexFiles.java
+++ libpdfbox-java/src/org/pdfbox/searchengine/lucene/IndexFiles.java
@@ -61,9 +61,9 @@
 
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 
-import org.apache.lucene.demo.HTMLDocument;
-
+import org.apache.lucene.ant.HtmlDocument;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DateField;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -207,7 +207,7 @@
                 while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) )
                 {
                     System.out.println("deleting " +
-                    HTMLDocument.uid2url(uidIter.term().text()));
+                    uid2url(uidIter.term().text()));
                     reader.deleteDocuments(uidIter.term());
                     uidIter.next();
                 }
@@ -240,7 +240,7 @@
         {
             if (uidIter != null)
             {
-                String uid = HTMLDocument.uid(file);      // construct uid for doc
+                String uid = uid(file);      // construct uid for doc
 
                 while( uidIter.term() != null &&
                 uidIter.term().field().equals( "uid" ) &&
@@ -249,7 +249,7 @@
                     if (deleting)
                     {             // delete stale docs
                         System.out.println("deleting " +
-                        HTMLDocument.uid2url(uidIter.term().text()));
+                        uid2url(uidIter.term().text()));
                         reader.deleteDocuments(uidIter.term());
                     }
                     uidIter.next();
@@ -287,7 +287,7 @@
             path.endsWith(".TXT"))
         {
             System.out.println( "Indexing Text document: " + file );
-            doc = HTMLDocument.Document(file);
+            doc = HtmlDocument.Document(file);
         }
         else if( path.endsWith( ".PDF" ) )
         {
@@ -304,4 +304,27 @@
             writer.addDocument(doc);
         }
     }
-}
\ No newline at end of file
+    
+
+    /*
+     * The dirSep field and the uid()/uid2url() methods below are copied
+     * from the org.apache.lucene.demo.HTMLDocument class shipped with
+     * Lucene 1.4.3, because the Lucene demo classes are not available.
+     */
+    private static char dirSep = System.getProperty("file.separator").charAt(0); // first char of the file.separator system property
+
+    private static String uid(File f) {
+        // Builds the index uid for a file: its path followed by its
+        // last-modified time.  Null (\u0000) replaces each directory separator
+        // and also separates the path from the date, so that lexicographic
+        // sorting of uids gives the same results as a walk of the file hierarchy.
+        return f.getPath().replace(dirSep, '\u0000') +  // path with separators turned into nulls
+          "\u0000" +                                     // delimiter between path and date
+          DateField.timeToString(f.lastModified());      // last-modified time as a string
+      }
+
+      private static String uid2url(String uid) {
+        String url = uid.replace('\u0000', '/');	  // turn every null separator back into a slash
+        return url.substring(0, url.lastIndexOf('/')); // cut at the last slash to drop the date; NOTE: throws if there is no '/'
+      }
+}