File: syncthingignorepattern.cpp

package info (click to toggle)
syncthingtray 1.7.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 6,804 kB
  • sloc: cpp: 31,085; xml: 1,694; java: 570; sh: 81; javascript: 53; makefile: 25
file content (502 lines) | stat: -rw-r--r-- 20,844 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
#include "./syncthingignorepattern.h"

namespace Data {

/// \cond
namespace SyncthingIgnorePatternState {
enum State {
    Escaping, // passed the esacping character "\"
    AppendRangeLower, // passed a "[" marking the start of a character range
    AppendRangeUpper, // passed a "-" within a range marking the start of an upper-bound range character
    AppendAlternative, // passed a "{" marking the start of an alternative set
    MatchVerbatimly, // initial/default state
    MatchRange, // passed a "]" marking the end of a character range
    MatchAlternatives, // passed a "}" marking the end of an alternative set
    MatchAny, // passed the "?" character that allows matching any character but the path separator
    MatchManyAny, // passed the "*" character that allows matching many arbitrary characters except the path separator
    MatchManyAnyIncludingDirSep, // passed a sequence of two "*" characters allowing to match also the path separator
};

struct Asterisk {
    QString::const_iterator pos;
    SyncthingIgnorePatternState::State state;
    bool visited = false;
};

struct CharacterRange {
    QChar lowerBound, upperBound;
};

struct AlternativeRange {
    explicit AlternativeRange(QString::const_iterator beg)
        : beg(beg)
        , end(beg)
    {
    }
    QString::const_iterator beg, end;
};
} // namespace SyncthingIgnorePatternState
/// \endcond

/*!
 * \struct SyncthingIgnorePattern
 * \brief The SyncthingIgnorePattern struct allows matching a Syncthing ignore pattern against a path.
 * \remarks
 * - The `#include`-syntax is not supported.
 * - A "/" is always treated as path separator within the pattern. Additionally, the character specified
 *   when calling matches() is treated as path separator as well. This means that under Windows where
 *   patterns can contain a "\" one *must* specify paths with a "\" as separator when invoking the matches()
 *   function to allow patterns using "/" and "\" to match correctly; otherwiise ignore patterns containing
 *   "\" do not work.
 * \sa
 * - https://docs.syncthing.net/users/ignoring.html
 * - https://docs.syncthing.net/rest/db-ignores-get.html
 */

/*!
 * \brief Parses the specified \a pattern populating the struct's fields.
 */
SyncthingIgnorePattern::SyncthingIgnorePattern(QString &&pattern)
    : pattern(std::move(pattern))
{
    if (this->pattern.startsWith(QLatin1String("//"))) {
        comment = true;
        ignore = false;
        return;
    }
    glob = this->pattern;
    for (;;) {
        if (glob.startsWith(QLatin1String("!"))) {
            ignore = !ignore;
            glob.remove(0, 1);
        } else if (glob.startsWith(QLatin1String("(?i)"))) {
            caseInsensitive = true;
            glob.remove(0, 4);
        } else if (glob.startsWith(QLatin1String("(?d)"))) {
            allowRemovalOnParentDirRemoval = true;
            glob.remove(0, 4);
        } else {
            break;
        }
    }
}

/*!
 * \brief Moves the ignore pattern.
 */
SyncthingIgnorePattern::SyncthingIgnorePattern(SyncthingIgnorePattern &&) = default;

/*!
 * \brief Destroys the ignore pattern.
 */
SyncthingIgnorePattern::~SyncthingIgnorePattern()
{
}

/*!
 * \brief Matches the assigned glob against the specified \a path.
 * \remarks
 * - Returns always false if the pattern is flagged as comment or the glob is empty.
 * - This function tries to follow rules outlined on https://docs.syncthing.net/users/ignoring.html.
 * - The specified \a path is *not* supposed to start with the \a pathSeparator (or a "/"). It must always be a path
 *   relative to the root of the Syncthing folder it is contained by. A pattern that is only supposed to match from the
 *   root of the Syncthing folder is supposed to start with \a pathSeparator (or a "/"), though.
 * - This function probably doesn't work if the pattern or \a path contain a surrogate pair.
 * - By default, the path separator is "/". If \a path uses a different separator you must specify it as second argument.
 *   Note that "/" is still be treated as path separator in this case and the specified \a pathSeparator is only used in
 *   addition. This is intended because this way one can specify "\" as \a pathSeparator under Windows but patterns using
 *   "/" still work (as they should because "/" and "\" can both be used as path separator under Windows in ignore
 *   patterns).
 */
bool SyncthingIgnorePattern::matches(const QString &path, QChar pathSeparator) const
{
    if (comment || glob.isEmpty()) {
        return false;
    }

    // get iterators
    auto globIter = glob.begin(), globEnd = glob.end();
    auto pathIter = path.begin(), pathEnd = path.end();

    // handle pattners starting with "/" indicating the pattern must match from the root (see last remark in docstring)
    static
#if (QT_VERSION >= QT_VERSION_CHECK(6, 0, 0))
        constexpr
#else
        const
#endif
        auto genericPathSeparator
        = QChar('/');
    const auto matchFromRoot = *globIter == pathSeparator || *globIter == genericPathSeparator;
    if (matchFromRoot) {
        ++globIter;
    }

    // define variables to track the state when processing the glob pattern
    using namespace SyncthingIgnorePatternState;
    auto state = MatchVerbatimly;
    auto escapedState = MatchVerbatimly;
    auto inAsterisk = false;
    auto wasInAsterisk = false;
    asterisks.clear();

    // define behavior to handle the current character in the glob pattern not matching the current pattern in the path
    const auto handleMismatch = [&, this] {
        // fail the match immediately if the glob pattern started with a "/" indicating it is supposed to match only from the root
        if (matchFromRoot) {
            return false;
        }
        // deal with the mismatch by trying to match previous asterisks more greedily
        while (!asterisks.empty() && asterisks.back().visited) {
            // do not consider asterisks we have already visited, though (as it would lead to an endless loop)
            asterisks.pop_back();
        }
        if (!asterisks.empty()) {
            // rewind back to when we have passed the last non-visited asterisk
            auto &asterisk = asterisks.back();
            globIter = asterisk.pos;
            inAsterisk = asterisk.visited = true;
            state = asterisk.state;
            return true;
        }
        // deal with the mismatch by checking the path as of the next path element
        for (; pathIter != pathEnd; ++pathIter) {
            // forward to the next path separator
            if (*pathIter != pathSeparator && *pathIter != genericPathSeparator) {
                continue;
            }
            // skip the path separator itself and give up when the end of the path is reached
            if (++pathIter == pathEnd) {
                return false;
            }
            break;
        }
        // give up when the end of the path is reached
        if (pathIter == pathEnd) {
            return false;
        }
        // start matching the glob pattern from the beginning
        globIter = glob.begin();
        asterisks.clear();
        inAsterisk = wasInAsterisk = false;
        return true;
    };

    // define function to handle single match
    const auto handleSingleMatch = [&, this] {
        // proceed with the next characters on a match
        const auto matchedChar = *(globIter++);
        // consider the asterisk dealt with after a single match
        if (inAsterisk || !(matchedChar == pathSeparator || matchedChar == genericPathSeparator)) {
            // take note that we were at an asterisk when it was a double asterisk (that allows matching the dir sep)
            wasInAsterisk = !asterisks.empty() && asterisks.back().state == MatchManyAnyIncludingDirSep && inAsterisk;
            inAsterisk = false;
            if (!asterisks.empty()) {
                asterisks.back().visited = false;
            }
        }
    };

    // define function to match single character against the current character in the path
    static constexpr auto compareSingleCharBasic = [](QChar expectedChar, QChar presentChar, QChar pathSep, bool caseInsensitive) {
        Q_UNUSED(pathSep)
        return caseInsensitive ? presentChar.toCaseFolded() == expectedChar.toCaseFolded() : presentChar == expectedChar;
    };
    const auto compareSingleChar = pathSeparator == genericPathSeparator
        ? compareSingleCharBasic
        : [](QChar expectedChar, QChar presentChar, QChar pathSep, bool caseInsensitive) {
              return compareSingleCharBasic(expectedChar, presentChar, pathSep, caseInsensitive)
                  || (expectedChar == genericPathSeparator && presentChar == pathSep);
          };
    const auto matchSingleChar = [&, this](QChar expectedChar) { return compareSingleChar(expectedChar, *pathIter, pathSeparator, caseInsensitive); };

    // define function to transition to verbatim matching (which makes only sense when not in any of the "Append…"-states)
    const auto transitionToVerbatimMatching = [&] {
        switch (state) {
        case AppendRangeLower:
        case AppendRangeUpper:
        case AppendAlternative:
            break;
        default:
            state = MatchVerbatimly;
        }
    };

    // try to match each character of the glob against a character in the path
match:
    while (globIter != globEnd) {
        // decide what to do next depending on the current glob pattern character and state transitioning the state accordingly
#if (QT_VERSION >= QT_VERSION_CHECK(6, 0, 0))
        static constexpr auto escapeCharacter = QChar('\\');
        static constexpr auto escapeCharacterUnicode = escapeCharacter.unicode();
#else
#define escapeCharacterUnicode '\\'
#define escapeCharacter QChar(escapeCharacterUnicode)
#endif
        switch (state) {
        case Escaping:
            // treat every character as-is in "escaping" state
            state = escapedState;
            break;
        default:
            // transition state according to special meaning of the current glob pattern character
            switch (globIter->unicode()) {
            case escapeCharacterUnicode:
                if (pathSeparator != escapeCharacter) {
                    // transition into "escaping" state
                    escapedState = state;
                    state = Escaping;
                } else {
                    // treat the escape character as normal character if it is the path separator
                    // quote from Syncthing documentation: Escaped characters are not supported on Windows, where "\" is the path
                    //                                     separator.
                    transitionToVerbatimMatching();
                }
                break;
            case '[':
                state = AppendRangeLower;
                characterRange.clear();
                ++globIter;
                continue;
            case ']':
                switch (state) {
                case AppendRangeLower:
                case AppendRangeUpper:
                    state = MatchRange;
                    break;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '-':
                switch (state) {
                case AppendRangeLower:
                    state = AppendRangeUpper;
                    ++globIter;
                    continue;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '{':
                switch (state) {
                case AppendAlternative:
                    continue;
                default:
                    state = AppendAlternative;
                    alternatives.clear();
                    alternatives.emplace_back(++globIter);
                }
                continue;
            case '}':
                switch (state) {
                case AppendAlternative:
                    alternatives.back().end = globIter;
                    state = MatchAlternatives;
                    break;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case ',':
                switch (state) {
                case AppendAlternative:
                    alternatives.back().end = globIter;
                    alternatives.emplace_back(++globIter);
                    continue;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '?':
                // transition into "match any" state
                state = MatchAny;
                break;
            case '*':
                // transition into one of the "match many any" state (depending on current state)
                switch (state) {
                case MatchManyAny:
                    state = MatchManyAnyIncludingDirSep;
                    break;
                case MatchManyAnyIncludingDirSep:
                    break;
                default:
                    state = MatchManyAny;
                }
                break;
            default:
                // try to match/append all other non-special characters as-is
                transitionToVerbatimMatching();
            }
        }

        // proceed according to state
        switch (state) {
        case Escaping:
            // proceed with the next character in the glob pattern which will be matched as-is (even if it is special)
            [[fallthrough]];
        case AppendAlternative:
            // just move on to the next character (alternatives are populated completely in the previous switch-case)
            ++globIter;
            break;
        case AppendRangeLower:
            // add the current character in the glob pattern as start of a new range
            characterRange.emplace_back().lowerBound = *globIter++;
            break;
        case AppendRangeUpper:
            // add the current character in the glob pattern as end of a new or the current range
            (characterRange.empty() ? characterRange.emplace_back() : characterRange.back()).upperBound = *globIter++;
            state = AppendRangeLower;
            break;
        case MatchVerbatimly:
            // match the current character in the glob pattern verbatimly against the current character in the path
            if (pathIter != pathEnd && matchSingleChar(*globIter)) {
                ++pathIter;
                handleSingleMatch();
            } else if ((wasInAsterisk || inAsterisk)
                && (asterisks.back().state == MatchManyAnyIncludingDirSep
                    || (pathIter == pathEnd || (*pathIter != pathSeparator && *pathIter != genericPathSeparator)))) {
                if (wasInAsterisk) {
                    // match further characters via the asterisk we thought we have dealt with after all
                    --globIter; // get back to where we were in the glob before considering the asterisk dealt with
                    inAsterisk = true;
                    wasInAsterisk = false;
                }
                // consider the path character dealt with despite no match if we have just passed an asterisk in the glob pattern
                if (pathIter != pathEnd) {
                    ++pathIter;
                } else {
                    inAsterisk = false;
                }
            } else if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchRange:
            // match the concluded character range in the glob pattern against the current character in the path
            if (pathIter != pathEnd) {
                auto inRange = false;
                for (const auto &bounds : characterRange) {
                    if ((!bounds.upperBound.isNull() && *pathIter >= bounds.lowerBound && *pathIter <= bounds.upperBound)
                        || (bounds.upperBound.isNull() && matchSingleChar(bounds.lowerBound))) {
                        inRange = true;
                        break;
                    }
                }
                if (inRange) {
                    characterRange.clear();
                    state = MatchVerbatimly;
                    ++pathIter;
                    handleSingleMatch();
                    break;
                }
            }
            if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchAlternatives:
            // match the current alternatives as of the current character in the path
            if (pathIter != pathEnd) {
                const auto pathStart = pathIter;
                for (auto &alternative : alternatives) {
                    // match characters in the alternative against the path
                    // note: Special characters like "*" are matched verbatimly. Is that the correct behavior?
                    pathIter = pathStart;
                    for (; alternative.beg != alternative.end && pathIter != pathEnd; ++alternative.beg) {
                        if (*alternative.beg == escapeCharacter) {
                            continue;
                        }
                        if (!matchSingleChar(*alternative.beg)) {
                            break;
                        }
                        ++pathIter;
                    }
                    // go with the first alternative that fully matched
                    // note: What is the correct behavior? Should this be most/least greedy (matching the longest/shortest possible alternative) instead?
                    if (alternative.beg == alternative.end) {
                        alternatives.clear();
                        break;
                    }
                }
                if (alternatives.empty()) {
                    state = MatchVerbatimly;
                    handleSingleMatch();
                    break;
                }
            }
            if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchAny:
            // allow the current character in the path to be anything but a path separator; otherwise consider it as mismatch as in the case for an exact match
            if (pathIter == pathEnd || (*pathIter != pathSeparator && *pathIter != genericPathSeparator)) {
                ++globIter, ++pathIter;
            } else if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchManyAny: {
            // take record of the asterisks
            auto &glob = asterisks.emplace_back();
            glob.pos = ++globIter;
            glob.state = MatchManyAny;
            inAsterisk = true;
            break;
        }
        case MatchManyAnyIncludingDirSep: {
            // take record of the second asterisks
            auto &glob = asterisks.back();
            glob.pos = ++globIter;
            glob.state = MatchManyAnyIncludingDirSep;
            break;
        }
        }
    }

    // check whether all characters of the glob have been matched against all characters of the path
    if (globIter == globEnd) {
        // consider the match a success if all characters of the path were matched or the glob ended with a "**"
        if (pathIter == pathEnd || state == MatchManyAnyIncludingDirSep) {
            return true;
        }
        if (const auto remainingPath = QStringView(pathIter, pathEnd);
            state == MatchManyAny && !(remainingPath.contains(pathSeparator) || remainingPath.contains(genericPathSeparator))) {
            return true;
        }

        // try again as of the next path segment if the glob fully matched but there are still characters in the path to be matched
        // note: This allows "foo" to match against "foo/foo" even tough the glob characters have already consumed after matching the first path segment.
        if (!matchFromRoot && (*pathIter == pathSeparator || *pathIter == genericPathSeparator)) {
            state = MatchVerbatimly;
            ++pathIter;
            globIter = glob.begin();
            asterisks.clear();
            inAsterisk = false;
            goto match;
        }
    }
    return false;
}

/*!
 * \brief Makes an ignore pattern for \a path with the specified settings.
 */
QString SyncthingIgnorePattern::forPath(const QString &path, bool ignore, bool caseInsensitive, bool allowRemovalOnParentDirRemoval)
{
    auto res = QString();
    res.reserve(10 + path.size());
    if (!ignore) {
        res += QChar('!');
    }
    if (caseInsensitive) {
        res += QStringLiteral("(?i)");
    }
    if (allowRemovalOnParentDirRemoval) {
        res += QStringLiteral("(?d)");
    }
    return res += path;
}

} // namespace Data