File: fileudi.cpp

package info (click to toggle)
recoll 1.43.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,512 kB
  • sloc: cpp: 104,170; python: 9,500; xml: 7,248; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (98 lines) | stat: -rw-r--r-- 3,141 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* Copyright (C) 2005-2020 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include "autoconfig.h"

#include <cstdlib>
#include <iostream>

#include "fileudi.h"
#include "md5.h"
#include "base64.h"

using std::string;

namespace fileUdi {

// Size of the hashed result (base64 of 16 bytes of md5, minus 2 pad chars)
#define HASHLEN 22

// Convert longish paths by truncating and appending hash of path
// The full length of the base64-encoded (minus pad) of the md5 is 22 chars
// We append this to the truncated path
void pathHash(const std::string &path, std::string &phash, unsigned int maxlen)
{
    if (maxlen < HASHLEN) {
        std::cerr << "pathHash: internal error: requested len too small\n";
        abort();
    }

    if (path.length() <= maxlen) {
        phash = path;
        return;
    }

    // Compute the md5
    unsigned char chash[16];
    MD5_CTX ctx;
    MD5Init(&ctx);
    MD5Update(&ctx, (const unsigned char *)(path.c_str()+maxlen-HASHLEN), 
              path.length() - (maxlen - HASHLEN));
    MD5Final(chash, &ctx);

    // Encode it to ascii. This shouldn't be strictly necessary as
    // xapian terms can be binary
    string hash;
    base64_encode(string((char *)chash, 16), hash);
    // We happen to know there will be 2 pad chars in there, that we
    // don't need as this won't ever be decoded. Resulting length is 22
    hash.resize(hash.length() - 2);

    // Truncate path and append hash
    phash = path.substr(0, maxlen - HASHLEN) + hash;
}


// Upper bound for the length of the path/unique terms stored for each
// document. Longer values are truncated and made unique again by appending
// a hash. The purpose is to stay under the xapian maximum term length (the
// xapian max key length is 245), not to save space: very little is gained
// even with very short limits like 30.
// PATHHASHLEN is the total length, hash part included.
#define PATHHASHLEN 150

// Return the maximum total length of a hashed udi (path part plus hash part).
int hashed_udi_size()
{
    const int total_udi_len = PATHHASHLEN;
    return total_udi_len;
}
// Return the length of the path part of a hashed udi: the total length
// minus the length of the appended hash.
int hashed_udi_path_size()
{
    const int path_part_len = PATHHASHLEN - HASHLEN;
    return path_part_len;
}

// Compute the unique term used to link documents to their file-system source:
// Hashed path + possible internal path
void make_udi(const string& fn, const string& ipath, string &udi)
{
    string s(fn);
    // Note that we append a "|" in all cases. Historical, could be removed
    s.append("|");
    s.append(ipath);
    pathHash(s, udi, PATHHASHLEN);
    return;
}

} // namespace