1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
#!/usr/bin/env python3
import sys
import xapian
# True when index terms use plain prefixes (case/diacritics-stripped index).
# init_stripchars() switches it to False when it finds ":"-wrapped prefixes.
o_index_stripchars = True
# Prefix of the MD5 terms. Kept as bytes because it is compared with
# bytes.startswith() against the terms read from the index; a str default
# would raise TypeError if used before init_stripchars() has run.
md5wpref = b"XM"
# Handle the caps/diacritics-stripping option: in a raw (unstripped) index
# the term prefixes are wrapped with ":".
def wrap_prefix(prefix):
    """Return *prefix* in the form used by the current index.

    The prefix is returned unchanged while o_index_stripchars is true,
    otherwise it is enclosed in b":" on both sides (so it must be bytes).
    """
    return prefix if o_index_stripchars else b":" + prefix + b":"
def init_stripchars(xdb):
    """Detect the prefix style of *xdb* and set the module globals.

    Sets o_index_stripchars to False when the index holds at least one term
    starting with ":" (raw index, wrapped prefixes) and to True otherwise,
    then recomputes md5wpref accordingly.

    Args:
        xdb: an opened Xapian database.
    """
    global o_index_stripchars
    global md5wpref
    # Ask the database for the terms starting with ":" instead of calling
    # skip_to() on the full term list: skip_to() raises StopIteration when
    # nothing sorts at or after the target (e.g. an empty index), and this
    # function is called outside of any exception handler.
    has_wrapped_prefix = False
    for _term in xdb.allterms(b":"):
        has_wrapped_prefix = True
        break
    # Assign unconditionally so that a second call on another database does
    # not inherit the result of the first one.
    o_index_stripchars = not has_wrapped_prefix
    md5wpref = wrap_prefix(b"XM")
# Retrieve named values from the document data record.
# The record format is a sequence of nm=value lines
def get_attributes(xdb, docid, flds, decode=True):
    """Return the values of the fields *flds* of a document data record.

    Args:
        xdb: database object providing get_document(docid).
        docid: identifier of the document to read.
        flds: iterable of field names, as bytes (e.g. b"url").
        decode: when true the values are decoded from UTF-8 to str,
            otherwise they are returned as bytes.

    Returns:
        A list with one entry per requested field, in the same order;
        None for a field which is absent from the record.
    """
    data = xdb.get_document(docid).get_data()
    # Parse the record line by line so that a name only matches at the start
    # of a line (searching for b"url=" anywhere would also hit b"rclurl=" or
    # text inside a value) and so that a last line without a trailing
    # newline keeps its final character.
    values = {}
    for line in data.split(b"\n"):
        name, sep, value = line.partition(b"=")
        if sep:
            # Keep the first occurrence of a repeated name.
            values.setdefault(name, value)
    res = []
    for fld in flds:
        value = values.get(fld)
        if value is not None and decode:
            value = value.decode("UTF-8")
        res.append(value)
    return res
# Convenience: retrieve postings as Python list
def get_postlist(xdb, term):
    """Return the docids found in the posting list of *term*, as a list."""
    return [entry.docid for entry in xdb.postlist(term)]
# Return list of docids having same md5 including self
def get_dups(xdb, docid):
    """Return the docids of the documents sharing the MD5 term of *docid*.

    Args:
        xdb: an opened Xapian database.
        docid: document id, as an int or a numeric string.

    Returns:
        The list of docids (including *docid* itself) indexed under the
        same MD5 term, or None when the document has no MD5 term.
    """
    doc = xdb.get_document(int(docid))
    # It would be more efficient to retrieve the value, but it's
    # binary so we'd have to decode it
    try:
        md5term = doc.termlist().skip_to(md5wpref).term
    except StopIteration:
        # skip_to() signals "no term at or after the prefix" by raising
        # StopIteration: the document has no MD5 term.
        return None
    if not md5term.startswith(md5wpref):
        return None
    return get_postlist(xdb, md5term)
# Retrieve all sets of duplicates:
# walk the list of all MD5 terms, look up their posting lists, and
# store the docids where the list is longer than one.
def find_all_dups(xdb):
    """Return every set of duplicate documents found in *xdb*.

    Args:
        xdb: an opened Xapian database.

    Returns:
        A list of docid lists, one per MD5 term indexing more than one
        document. Empty when there are no duplicates or no MD5 terms.
    """
    alldups = []
    # Walk the MD5 terms. Restricting allterms() to the prefix replaces the
    # skip_to() + startswith() scan: iterating after skip_to() resumes with
    # the term following the one skipped to, which dropped the first MD5
    # term, and skip_to() raises StopIteration when no term matches.
    for term in xdb.allterms(md5wpref):
        # More than one posting for an MD5 term means duplicates
        dups = get_postlist(xdb, term.term)
        if len(dups) > 1:
            alldups.append(dups)
    return alldups
# Print docid url ipath for list of docids
def print_urlipath(xdb, doclist):
    """Print one "docid url ipath" line per document of *doclist*.

    The url and ipath values come from get_attributes(); a missing value
    is printed as "None".
    """
    wanted = [b"url", b"ipath"]
    for docid in doclist:
        url, ipath = get_attributes(xdb, docid, wanted)
        print(f"{docid} {url} {ipath}")
def msg(s):
    """Write *s*, followed by a newline, to the standard error stream."""
    sys.stderr.write("%s\n" % s)
########## Main program
def main(argv):
    """Run the duplicate finder and return the process exit status.

    Args:
        argv: command line; argv[1] is the database path and the optional
            following items are docids.

    Returns:
        0 on success, 1 on usage error or when an operation failed.
    """
    if len(argv) < 2:
        msg("Usage: %s /path/to/db [docid [docid ...]]" % argv[0])
        msg(" will print all sets of dups if no docid is given")
        msg(" else only the duplicates for the given docids")
        return 1

    try:
        # Opening and probing the database are inside the handler so that a
        # bad path or an unreadable index is reported as an error message
        # rather than as a traceback.
        xdb = xapian.Database(argv[1])
        init_stripchars(xdb)
        if len(argv) == 2:
            # No docid args: print every set of duplicates, separated by
            # an empty line
            for dups in find_all_dups(xdb):
                print_urlipath(xdb, dups)
                print("")
        else:
            for docid in argv[2:]:
                dups = get_dups(xdb, docid)
                if dups is not None and len(dups) > 1:
                    print_urlipath(xdb, dups)
    except Exception as e:
        msg("Error: %s" % str(e))
        return 1
    return 0


# Only run as a program: importing the module must not touch sys.argv or
# open a database.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
|