File: finderxattr.cpp

package info (click to toggle)
recoll 1.43.12-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,952 kB
  • sloc: cpp: 104,785; python: 9,933; xml: 7,314; ansic: 6,447; sh: 1,252; perl: 166; makefile: 72
file content (265 lines) | stat: -rw-r--r-- 9,731 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* Copyright (C) 2024 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

// Decode a 2-byte big-endian (most significant byte first) unsigned integer.
// @param f pointer to at least 2 readable bytes.
// @return the decoded value, widened to 32 bits.
inline uint32_t char2toint(const unsigned char *f)
{
    const uint32_t msb = f[0];
    const uint32_t lsb = f[1];
    return (msb << 8) | lsb;
}
// Decode a 4-byte big-endian (most significant byte first) unsigned integer.
// @param f pointer to at least 4 readable bytes.
// @return the decoded 32-bit value.
static inline uint32_t char4toint(const unsigned char *f)
{
    uint32_t value = 0;
    for (int idx = 0; idx < 4; idx++) {
        // Make room for the next, less significant, byte.
        value = (value << 8) | static_cast<uint32_t>(f[idx]);
    }
    return value;
}
// Decode an 8-byte big-endian (most significant byte first) unsigned integer.
// @param f pointer to at least 8 readable bytes.
// @return the decoded 64-bit value.
inline uint64_t char8toint(const unsigned char *f)
{
    uint64_t value = 0;
    for (int idx = 0; idx < 8; idx++) {
        // Make room for the next, less significant, byte.
        value = (value << 8) | static_cast<uint64_t>(f[idx]);
    }
    return value;
}

// Decoded fields of the 32-byte trailer found at the end of a binary plist.
// All members default to 0 so that a default-constructed value (e.g. the one returned when the
// trailer can't be decoded) can be safely copied and printed.
struct plist_trailer {
    // Size in bytes of each entry of the offsets table. The entries point to a marker in the
    // object table, as an offset from the beginning of the plist.
    unsigned int offset_table_offset_size{0};
    // Size in bytes of the object references, which are indexes into the offsets table.
    unsigned int object_ref_size{0};
    // Total count of objects
    uint64_t num_objects{0};
    // Offset from the start of the offset table where we find the offset of the first object table
    // marker. Usually 0. NOTE(review): presumably the offsets table index of the top-level
    // object - confirm against the format description.
    uint64_t top_object_offset{0};
    // Start of offset table, from start of file
    uint64_t offset_table_start{0};

    // Write a one-line, human readable description of all the fields to s (no trailing newline).
    void dump(std::ostream& s) const {
        s << "offset_table_offset_size " << offset_table_offset_size <<
            " object_ref_size " << object_ref_size <<
            " num_objects " << num_objects <<
            " top_object_offset " << top_object_offset <<
            " offset_table_start " << offset_table_start;
    }
};

// Decode the 32-byte trailer located at the very end of the plist data.
//
// @param plist start of the plist data.
// @param sz size of the plist data in bytes.
// @return the decoded trailer and true, or an all-zero trailer and false if the data is missing
//   or too small to hold a trailer. The field values are not validated against the data size:
//   callers must bounds-check them before use.
std::pair<plist_trailer, bool> decode_trailer(const unsigned char *plist, size_t sz)
{
    // Value-initialize: the failure path returns (copies) the struct, which must not hold
    // indeterminate values.
    plist_trailer t{};
    if (plist == nullptr || sz < 32) {
        //std::cerr << "Size " << sz << " too small\n";
        return {t, false};
    }
    const unsigned char *cp = plist + (sz - 32);
    // 0 to 4 are unused. Byte 5: sort version ??
    cp += 6;
    t.offset_table_offset_size = *cp++;
    t.object_ref_size = *cp++;
    // Then three 8-byte big-endian integers.
    t.num_objects = char8toint(cp);
    cp += 8;
    t.top_object_offset = char8toint(cp);
    cp += 8;
    t.offset_table_start = char8toint(cp);
    return {t, true};
}

/**
 * Decode a marker.
 *
 * The marker byte holds the object format in its high nibble and an item count in its low
 * nibble. A count nibble of 15 means that the actual count follows the marker: one descriptor
 * byte (high nibble 1, low nibble n), then 2^n bytes holding the count as a big-endian integer.
 * Only 1, 2 and 4 byte counts are accepted.
 *
 * No byte outside [cp0, cp0 + sz) is ever read.
 *
 * @param cp0 the plist data base. Used for size overflow computations only.
 * @param [inout] cp the current pointer into the plist data. Updated to point after the marker
 *   (and the extended count if present) on success. Its position after an error is unspecified.
 * @param sz the plist data size.
 * @return A tuple with the marker type, the item count, and a std::string message which
 *   indicates an error if not empty.
 */
std::tuple<unsigned int, unsigned int, std::string> decode_marker(
    const unsigned char* cp0, const unsigned char*& cp, int sz)
{
    // The marker byte itself must be inside the data.
    if (sz <= 0 || cp < cp0 || cp - cp0 >= sz) {
        return {0u, 0u, "input size too small"};
    }
    const unsigned int format = (*cp & 0xf0u) >> 4;
    const unsigned int nibble = *cp & 0x0fu;
    cp++;
    if (nibble != 15) {
        // The count is stored directly in the marker. cp may now be equal to cp0 + sz, which is
        // fine as long as nothing is read from it.
        return {format, nibble, ""};
    }

    // Extended count: the descriptor byte must be readable.
    if (cp - cp0 >= sz) {
        return {format, nibble, "Input size too small"};
    }
    if (((*cp) & 0xf0) >> 4 != 1) {
        return {format, nibble, "Int len high nibble not 1"};
    }
    const int ilen = 1 << ((*cp) & 0x0f);
    // TBD: what ilen values are possible ? For now assume any value <= 4 (ilen being a power of
    // two, this means 1, 2 or 4).
    if (ilen > 4) {
        return {format, nibble, "Bad integer size " + std::to_string(ilen)};
    }
    cp++;
    // Compare against the remaining byte count instead of computing cp + ilen, which could
    // point outside of the data.
    if (ilen > sz - (cp - cp0)) {
        return {format, nibble, "input size too small"};
    }
    // The next ilen bytes are a big endian integer.
    unsigned int count = 0;
    for (int i = 0; i < ilen; i++) {
        count = (count << 8) | cp[i];
    }
    cp += ilen;
    return {format, count, ""};
}

// Read an expected string object, pointed by its marker. Returns an error if the marker is not for
// a string.
//
// @param cp0 the plist data base.
// @param cp pointer to the string object marker inside the plist data.
// @param sz the plist data size.
// @return a pair with the string value and an error message, which is empty on success.
//   Format 5 strings use one byte per character, format 6 ("unicode") ones use two. In both
//   cases the raw bytes are returned without any transcoding.
//   NOTE(review): the two-byte form is presumably UTF-16 big-endian - confirm before display.
std::pair<std::string, std::string> readstring(
    const unsigned char *cp0, const unsigned char *cp, int sz)
{
    auto [format, numchars, error] = decode_marker(cp0, cp, sz);
    if (!error.empty()) {
        return {"", error};
    }
    if (format != 5 && format != 6) {
        return {"", "format " + std::to_string(format) + " neither bytes nor unicode."};
    }
    // 64 bits arithmetic: doubling a 32 bits count taken from the data could wrap around and
    // defeat the size check.
    uint64_t bytelen = numchars;
    if (format == 6) {
        bytelen *= 2;
    }
    // Compare against what is left after the marker instead of computing cp + bytelen, which
    // could point far outside of the data.
    const auto consumed = cp - cp0;
    if (consumed < 0 || consumed > sz || bytelen > static_cast<uint64_t>(sz - consumed)) {
        return {"", "input size too small"};
    }

    // std::cerr << "numchars " << numchars << " numbytes " << bytelen << "\n";
    return {std::string(reinterpret_cast<const char *>(cp), static_cast<size_t>(bytelen)), ""};
}


// Read an expected vector of strings, pointed to by its marker. (e.g. Finder tags xattr).
//
// The array object (format 10) is made of its marker followed by one object reference per
// element. Each reference is an index into the offsets table, whose entry gives the offset of the
// element object from the start of the plist. All these values come from the data and are
// bounds-checked before use.
//
// @param cp0 the plist data base.
// @param cp pointer to the array object marker inside the plist data.
// @param sz the plist data size.
// @param trailer the decoded plist trailer (reference/offset sizes, offsets table location).
// @return a pair with the strings and an error message, which is empty on success. On error,
//   the vector holds the elements which could be decoded before the problem was met.
std::pair<std::vector<std::string>, std::string> read_string_array(
    const unsigned char *cp0, const unsigned char *cp, int sz, const plist_trailer& trailer)
{
    std::vector<std::string> out;
    auto [format, numitems, error] = decode_marker(cp0, cp, sz);
    if (!error.empty()) {
        // format and numitems can't be trusted.
        return {out, error};
    }
    if (format != 10) {
        return {out, "format " + std::to_string(format) + " is not 10"};
    }
    //std::cerr << "Got " << numitems << " entries\n";
    if (numitems == 0) {
        return {out, ""};
    }

    const unsigned int refsize = trailer.object_ref_size;
    if (refsize != 1 && refsize != 2 && refsize != 4 && refsize != 8) {
        return {out, std::string("bad object ref size ") + std::to_string(refsize)};
    }
    const unsigned int offsize = trailer.offset_table_offset_size;
    if (offsize != 1 && offsize != 2 && offsize != 4 && offsize != 8) {
        return {out, std::string("bad offset table offset size ") + std::to_string(offsize)};
    }

    const auto consumed = cp - cp0;
    if (sz <= 0 || consumed < 0 || consumed > sz) {
        return {out, "input size too small"};
    }
    const uint64_t datasz = static_cast<uint64_t>(sz);
    // All the object references must be inside the data. This also bounds numitems, which
    // comes from the data, before it is used to reserve memory.
    if (static_cast<uint64_t>(numitems) * refsize > datasz - static_cast<uint64_t>(consumed)) {
        return {out, "input size too small"};
    }
    if (trailer.offset_table_start >= datasz) {
        return {out, "offset table start out of range"};
    }
    // Count of complete offsets table entries which fit between the table start and the end of
    // the data. Comparing indexes to this avoids overflows in the entry address computation.
    const uint64_t maxentries = (datasz - trailer.offset_table_start) / offsize;

    out.reserve(numitems);
    for (unsigned int i = 0; i < numitems; i++) {
        // Object reference: index into the offsets table.
        uint64_t offsidx = 0;
        switch (refsize) {
        case 1: offsidx = *cp;            break;
        case 2: offsidx = char2toint(cp); break;
        case 4: offsidx = char4toint(cp); break;
        default: offsidx = char8toint(cp); break;
        }
        cp += refsize;
        //std::cerr << "Offset table index: " << offsidx << "\n";
        if (offsidx >= maxentries) {
            return {out, "object ref out of range"};
        }

        // Offsets table entry: offset of the object from the start of the plist.
        const unsigned char *ref = cp0 + trailer.offset_table_start + offsidx * offsize;
        uint64_t offset = 0;
        switch (offsize) {
        case 1: offset = *ref;            break;
        case 2: offset = char2toint(ref); break;
        case 4: offset = char4toint(ref); break;
        default: offset = char8toint(ref); break;
        }
        //std::cerr << "Offset: " << offset << " ";
        if (offset >= datasz) {
            return {out, "object offset out of range"};
        }

        auto item = readstring(cp0, cp0 + offset, sz);
        if (!item.second.empty()) {
            return {out, item.second};
        }
        //std::cerr << "string: [" << item.first << "]\n";
        out.push_back(std::move(item.first));
    }
    return {out, ""};
}

// Decode a MacOS plist in the very specific case of the data found in a
// com.apple.metadata:_kMDItemUserTags extended attribute value.
// This contains an array of bytes or unicode strings.
// Apparently the colors format is color\nNumvalue??
//
// @param cp0 the plist data.
// @param sz the plist data size.
// @return a pair with the tag strings and an error message, which is empty on success.
std::pair<std::vector<std::string>, std::string> decode_tags_plist(
    const unsigned char *cp0, int sz)
{
    // Check the size before comparing: the signature test reads 8 bytes.
    if (cp0 == nullptr || sz < 8 || memcmp(cp0, "bplist00", 8) != 0) {
        return {{}, "not bplist00"};
    }
    // Check the plist trailer.
    auto [trailer,status] = decode_trailer(cp0, sz);
    if (!status) {
        return {{}, "Trailer decode failed (data too small ?)"};
    }

    // trailer.dump(std::cerr); std::cerr <<"\n";
    // The array object is expected right after the 8 bytes signature.
    const unsigned char *cp = cp0+8;
    return read_string_array(cp0, cp, sz, trailer);
}

// Decode a MacOS plist in the very specific case of the data found in a
// com.apple.metadata:kMDItemFinderComment extended attribute value. This contains a single string,
// either bytes or unicode (not sure about the encoding, probably ucs-2?).
//
// @param cp0 the plist data.
// @param sz the plist data size.
// @return a pair with the comment string (raw bytes, not transcoded) and an error message, which
//   is empty on success.
std::pair<std::string, std::string> decode_comment_plist(const unsigned char *cp0, int sz)
{
    // Check the size before comparing: the signature test reads 8 bytes.
    if (cp0 == nullptr || sz < 8 || memcmp(cp0, "bplist00", 8) != 0) {
        return {"", "not bplist00"};
    }
    // Check the plist trailer. Its contents are not needed to read a single string, only the
    // fact that it is present.
    if (!decode_trailer(cp0, sz).second) {
        return {"", "Trailer decode failed (data too small ?)"};
    }

    // The string object is expected right after the 8 bytes signature.
    const unsigned char *cp = cp0+8;
    return readstring(cp0, cp, sz);
}