File: rcldlkp.py

package info (click to toggle)
recoll 1.43.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,468 kB
  • sloc: cpp: 103,827; python: 9,498; xml: 7,218; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (120 lines) | stat: -rwxr-xr-x 2,750 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
__doc__ = """
An example indexer for an arbitrary multi-document file format.
Not supposed to run ''as-is'' or be really useful.

''Lookup'' notes file indexing

The file format has text notes separated by lines with a single '%' character

If the script is called with just the file name as an argument, it will 
(re)index the contents.

If the script is called with a second numeric argument, it will retrieve the
specified record and output it as HTML
"""

import os
import stat
import sys
import re

# Recoll configuration directory, passed as ``confdir`` to recoll.connect()
# when the script runs in indexing mode.
# NOTE(review): hard-coded to one user's home directory -- presumably meant to
# be edited per installation; confirm before running.
rclconf = "/Users/dockes/.recoll-dlkp"


def udi(docfile, numrec):
    """Return the index key for record *numrec* of *docfile*.

    The key is the file name and the record number joined by ``#``; it is
    the identifier string handed to ``db.addOrUpdate`` and ``db.needUpdate``.
    """
    return "#".join((docfile, str(numrec)))


###############################################################
def index_rec(db, numrec, rec):
    """Build an index document for one record and store it in *db*.

    Reads the module-level globals ``docfile`` (path of the notes file),
    ``fbytes`` (its size) and ``fmtime`` (its modification time).

    Args:
        db: index handle providing ``doc()`` and ``addOrUpdate(udi, doc)``.
        numrec: record number, stored as the document ``ipath``.  Record 0
            is the pseudo-record for the file itself and is the only one
            that carries the ``sig`` (file mtime) used for up-to-date checks.
        rec: record contents, either ``str`` or ISO-8859-1 encoded ``bytes``.
    """
    # Python 3 has no unicode() builtin: decode only when handed raw bytes,
    # a str is already text.
    text = rec.decode("iso-8859-1") if isinstance(rec, bytes) else rec

    doc = db.doc()
    # url
    doc.url = "file://" + docfile
    # ipath
    doc.ipath = str(numrec)
    # mimetype
    doc.mimetype = "text/plain"
    # title: second line of the record, or the third one when the second is
    # blank.  Computed locally so that a doc with no title yet is never read.
    lines = text.split("\n")
    if len(lines) >= 2:
        title = lines[1]
        if not title.strip() and len(lines) >= 3:
            title = lines[2]
        doc.title = title
    # fbytes
    doc.fbytes = str(fbytes)
    # text
    doc.text = text
    # dbytes
    doc.dbytes = str(len(rec))
    # sig
    if numrec == 0:
        doc.sig = str(fmtime)
    db.addOrUpdate(udi(docfile, numrec), doc)


def output_rec(rec):
    """Print one record on standard output as a minimal HTML page.

    The record is HTML-escaped and wrapped in a ``<pre>`` element; the page
    declares its charset as UTF-8.

    Args:
        rec: record contents, either ``str`` or ISO-8859-1 encoded ``bytes``.
    """
    # Local import keeps this function self-contained.
    import html

    # Python 3 has no unicode() builtin: decode only when handed raw bytes.
    text = rec.decode("iso-8859-1") if isinstance(rec, bytes) else rec
    # html.escape() handles '&' first, so '<' is not double-escaped into
    # '&amp;lt;', and it emits the valid '&quot;' entity for double quotes.
    escaped = html.escape(text, quote=True)
    print("<html><head>")
    print('<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">')
    print("</head><body><pre>")
    print(escaped)
    print("</pre></body></html>")


################################################################


def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    sys.stderr.write("Usage: rcldlkp.py <filename> [<recnum>]\n")
    # sys.exit() rather than the site-provided exit(), which is intended for
    # the interactive interpreter and is missing when site is not loaded.
    sys.exit(1)


# ---- Script body -------------------------------------------------------
# rcldlkp.py <filename>            -> (re)index every record of the file
# rcldlkp.py <filename> <recnum>   -> print record <recnum> as HTML
if len(sys.argv) < 2:
    usage()

docfile = sys.argv[1]

# No record number means "index the file"; a number means "retrieve it".
targetnum = None
if len(sys.argv) > 2:
    try:
        targetnum = int(sys.argv[2])
    except ValueError:
        # A non-numeric record number is a usage error, not a traceback.
        usage()

# File size and mtime are read as globals by index_rec().
stdata = os.stat(docfile)
fmtime = stdata[stat.ST_MTIME]
fbytes = stdata[stat.ST_SIZE]

db = None
if targetnum is None:
    # Imported here so that retrieval mode does not need the recoll module.
    import recoll

    db = recoll.connect(confdir=rclconf, writable=1)
    # Record 0 carries the file mtime as its signature: nothing to do when
    # the index is already up to date.
    if not db.needUpdate(udi(docfile, 0), str(fmtime)):
        sys.exit(0)

# A separator is a line holding a single '%', optionally followed by blanks.
# Anchored with '$' so that ordinary text lines starting with '%' do not
# split a record.  Compiled once, outside the loop.
separator = re.compile(r"^%[ \t]*$")

numrec = 1
pending = []
# The records are treated as ISO-8859-1 text, so decode them as such instead
# of relying on the locale default.  'with' guarantees the file is closed,
# including on the early sys.exit() below.
with open(docfile, "r", encoding="iso-8859-1") as f:
    for line in f:
        if separator.match(line):
            rec = "".join(pending)
            if targetnum is None:
                index_rec(db, numrec, rec)
            elif targetnum == numrec:
                output_rec(rec)
                sys.exit(0)
            numrec += 1
            pending = []
        else:
            pending.append(line)

# Records are *separated* by '%' lines, so text after the last separator is
# a record too; skip it only when it is empty or blank.
rec = "".join(pending)
if rec.strip():
    if targetnum is None:
        index_rec(db, numrec, rec)
    elif targetnum == numrec:
        output_rec(rec)
        sys.exit(0)

if targetnum is None:
    # Pseudo-record 0 stores the file signature checked by needUpdate().
    index_rec(db, 0, "")
else:
    # Retrieval mode reaching this point means the record does not exist.
    sys.stderr.write(
        "rcldlkp.py: record %d not found in %s\n" % (targetnum, docfile)
    )
    sys.exit(1)