File: docdups.py

package info (click to toggle)
recoll 1.43.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,468 kB
  • sloc: cpp: 103,827; python: 9,498; xml: 7,218; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (133 lines) | stat: -rwxr-xr-x 3,304 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3

import sys
import xapian

# True while the index is assumed to hold case/diacritics-stripped terms.
# init_stripchars() clears it when it finds a ":"-wrapped term in the db.
o_index_stripchars = True
# Prefix of the MD5 terms. Kept as bytes because it is compared against
# bytes terms with startswith(); init_stripchars() re-derives it through
# wrap_prefix() once the index flavour is known.
md5wpref = b"XM"


# Handle caps/diac-stripping option. If the db is raw the prefixes are
# wrapped with ":"
def wrap_prefix(prefix):
    """Return *prefix* in the form used by the terms of the current index.

    While the module-level ``o_index_stripchars`` flag is true the prefix
    is handed back untouched. Otherwise it is enclosed in ``b":"`` on both
    sides, which requires *prefix* to be a bytes object.
    """
    return prefix if o_index_stripchars else b":" + prefix + b":"


def init_stripchars(xdb):
    """Detect the index flavour of *xdb* and set the module globals.

    Looks at the first term of the database sorting at or after ``b":"``.
    If that term starts with ``b":"``, ``o_index_stripchars`` is set to
    False (the flag is never set back to True here). ``md5wpref`` is then
    recomputed with ``wrap_prefix(b"XM")``.

    xdb: an opened database object providing ``allterms()``.
    """
    global o_index_stripchars
    global md5wpref
    try:
        # skip_to() returns the item it lands on; iterating afterwards would
        # resume at the *following* term, so use the returned item directly.
        first = xdb.allterms().skip_to(b":")
    except StopIteration:
        # No term at or after ":" (e.g. empty index): keep the current flag.
        first = None
    if first is not None and first.term.startswith(b":"):
        o_index_stripchars = False
    md5wpref = wrap_prefix(b"XM")


# Retrieve named value from document data record.
# The record format is a sequence of nm=value lines
def get_attributes(xdb, docid, flds, decode=True):
    """Return the values of the fields *flds* from a document data record.

    The record returned by ``get_data()`` is read as newline-separated
    ``name=value`` lines. A field only matches a line whose name is exactly
    that field, so ``b"type"`` does not match an ``mtype=`` line and a
    ``name=`` sequence inside another field's value is ignored. When a name
    occurs on several lines the first one wins.

    xdb: database object providing ``get_document(docid)``.
    docid: document id passed to ``get_document``.
    flds: iterable of field names as bytes.
    decode: when true, values are decoded from UTF-8 to ``str``; otherwise
        they are returned as bytes.

    Returns a list parallel to *flds*, holding ``None`` for every field
    that is absent from the record.
    """
    doc = xdb.get_document(docid)
    data = doc.get_data()

    # Index the record once. Splitting on newlines also keeps the last line
    # whole when the record does not end with a newline.
    values = {}
    for line in data.split(b"\n"):
        name, sep, value = line.partition(b"=")
        if sep:
            values.setdefault(name, value)

    res = []
    for fld in flds:
        value = values.get(fld)
        if value is not None and decode:
            value = value.decode("UTF-8")
        res.append(value)
    return res


# Convenience: retrieve postings as Python list
def get_postlist(xdb, term):
    """Return the docids of every posting of *term* in *xdb* as a list.

    The list follows the order in which ``xdb.postlist(term)`` yields its
    postings and is empty when it yields none.
    """
    return [entry.docid for entry in xdb.postlist(term)]


# Return list of docids having same md5 including self
def get_dups(xdb, docid):
    """Return the docids sharing the MD5 term of document *docid*.

    xdb: database object providing ``get_document()`` and ``postlist()``.
    docid: document id; anything accepted by ``int()``.

    Returns the posting list of the document's MD5 term (which includes
    *docid* itself), or ``None`` when the document carries no term
    starting with ``md5wpref``.
    """
    doc = xdb.get_document(int(docid))

    # It would be more efficient to retrieve the value, but it's
    # binary so we'd have to decode it
    try:
        md5term = doc.termlist().skip_to(md5wpref).term
    except StopIteration:
        # skip_to() raises StopIteration when no term sorts at or after the
        # prefix; treat that like any other document without an MD5 term
        # instead of letting it abort the caller's loop.
        return None
    if not md5term.startswith(md5wpref):
        return None

    return get_postlist(xdb, md5term)


# Retrieve all sets of duplicates:
#   walk the list of all MD5 terms, look up their posting lists, and
#   store the docids where the list is longer than one.
def find_all_dups(xdb):
    """Return every set of duplicate documents found in *xdb*.

    Walks all terms starting with ``md5wpref`` and collects the posting
    list of each term that is attached to more than one document.

    xdb: database object providing ``allterms()`` and ``postlist()``.

    Returns a list of docid lists, one per MD5 term shared by several
    documents; empty when there is none.
    """
    alldups = []

    # Ask for the prefixed term range directly. Calling skip_to() and then
    # iterating would resume *after* the term skipped to, silently dropping
    # the first MD5 term, and skip_to() raises StopIteration when no term
    # sorts at or after the prefix.
    for term in xdb.allterms(md5wpref):
        # More than one posting for the same MD5 term means duplicates
        dups = get_postlist(xdb, term.term)
        if len(dups) > 1:
            alldups.append(dups)
    return alldups


# Print docid url ipath for list of docids
def print_urlipath(xdb, doclist):
    """Print a "docid url ipath" line on stdout for each docid of *doclist*.

    The url and ipath values come from ``get_attributes()``; a value it
    reports as ``None`` is printed as the text ``None``.
    """
    wanted = [b"url", b"ipath"]
    for docid in doclist:
        url, ipath = get_attributes(xdb, docid, wanted)
        line = "%s %s %s" % (docid, url, ipath)
        print(line)


def msg(s):
    """Print *s*, formatted through ``"%s"``, as one line on standard error."""
    text = "%s" % s
    print(text, file=sys.stderr)


########## Main program

def main(argv):
    """Run the command line tool and return the process exit status.

    argv: full argument vector; ``argv[1]`` is the database path and any
        further items are docids.

    Without docids every set of duplicates is printed, each followed by an
    empty line. With docids, only the duplicates of those documents are
    printed (documents without duplicates print nothing).

    Returns 0 on success, 1 on a usage error or when any exception is
    raised while opening or reading the database.
    """
    if len(argv) < 2:
        msg("Usage: %s /path/to/db [docid [docid ...]]" % argv[0])
        msg(" will print all sets of dups if no docid is given")
        msg(" else only the duplicates for the given docids")
        return 1

    try:
        # Opening the database is inside the try block so that a bad path
        # is reported through the same "Error:" message as later failures.
        xdb = xapian.Database(argv[1])
        init_stripchars(xdb)

        if len(argv) == 2:
            # No docid args,
            for dups in find_all_dups(xdb):
                print_urlipath(xdb, dups)
                print("")
        else:
            for docid in argv[2:]:
                dups = get_dups(xdb, docid)
                if dups is not None and len(dups) > 1:
                    print_urlipath(xdb, dups)

    except Exception as e:
        msg("Error: %s" % str(e))
        return 1

    return 0


# Only run when executed as a script, so importing the module for its
# helper functions neither parses sys.argv nor exits.
if __name__ == "__main__":
    sys.exit(main(sys.argv))