File: tokenize.h

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (110 lines) | stat: -rw-r--r-- 3,398 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.

#ifndef THIRD_PARTY_LIBURLPATTERN_LEXER_H_
#define THIRD_PARTY_LIBURLPATTERN_LEXER_H_

#include <string_view>
#include <vector>

#include "base/component_export.h"
#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/status/status.h"

namespace liburlpattern {

// The kind of a lexical token produced by Tokenize().  The enumerator order
// is part of the interface; do not reorder.
enum class TokenType {
  // Open a scope with a '{'.
  kOpen,

  // Close a scope with a '}'.
  kClose,

  // A regular expression group like '(...)'.
  kRegex,

  // A named group like ':foo'.
  kName,

  // A single character.
  kChar,

  // The '\' escape character.
  kEscapedChar,

  // A '+' or '?' modifier.
  kOtherModifier,

  // A '*' character which can be a wildcard or modifier.
  kAsterisk,

  // The end of the token stream.
  kEnd,

  // A character that is not valid in a properly formed pattern; e.g. the colon
  // in `https://`.  This is only generated when TokenizerPolicy::kLenient is
  // used.
  kInvalidChar,
};

// Returns a human-readable string representation of |type|.
const char* TokenTypeToString(TokenType type);

// Simple structure representing a single lexical token.
// Simple structure representing a single lexical token.
struct COMPONENT_EXPORT(LIBURLPATTERN) Token {
  // Indicate the token type.
  TokenType type = TokenType::kEnd;

  // Index of the start of this token in the original pattern string.
  size_t index = 0;

  // The value of the token.  May be one or many characters depending on type.
  // May be zero characters for the kEnd type.  Note: this is a non-owning
  // view into the pattern string passed to Tokenize(); see the lifetime
  // requirement documented on Tokenize() below.
  std::string_view value;

  // Construct a token of type |t| starting at pattern offset |i| with
  // value |v|.
  Token(TokenType t, size_t i, std::string_view v)
      : type(t), index(i), value(v) {}
  // Default construction yields a kEnd token at index 0 with an empty value.
  Token() = default;
};

// Controls how Tokenize() reacts to malformed input.
enum class TokenizePolicy {
  // The strict policy causes any problems found during tokenization to be
  // thrown as errors.
  kStrict,

  // The lenient policy converts problems detected during tokenization into
  // kInvalidChar tokens in the returned token list.  For something like a
  // `\` at the end of the string, this simply returns the immediate `\`
  // character.  For validation errors that cause a group to be invalid, the
  // first character of the group is instead returned.  For example, `https://`
  // returns the `:` as a kInvalidChar.  For `(foo(bar))` where capture groups
  // are illegal it causes the first `(` to be returned as a kInvalidChar.
  // Tokenization then continues with the next character after the kInvalidChar.
  kLenient,
};

// Two tokens are equal when every field matches: type, position in the
// pattern, and the (character-wise) value.
COMPONENT_EXPORT(LIBURLPATTERN)
inline bool operator==(const Token& lh, const Token& rh) {
  if (lh.type != rh.type)
    return false;
  if (lh.index != rh.index)
    return false;
  return lh.value == rh.value;
}

// Tokens differ when any field differs; mirrors operator== above.
inline bool operator!=(const Token& lh, const Token& rh) {
  return lh.type != rh.type || lh.index != rh.index || lh.value != rh.value;
}

// Writes a representation of |token| to |o|; defined in the corresponding
// .cc file.  Note |token| is taken by value — Token is cheap to copy (enum,
// size_t, and a non-owning string_view).
COMPONENT_EXPORT(LIBURLPATTERN)
std::ostream& operator<<(std::ostream& o, Token token);

// Split the given input pattern string into a list of lexical tokens.
// Tokenizing will fail if |pattern| is not valid UTF-8; on failure an
// absl::Status describing the problem is returned via base::expected.
// Under kLenient, problems are instead surfaced as kInvalidChar tokens
// (see TokenizePolicy above).  Note, the generated Token objects simply
// reference positions within the input |pattern|.  The |pattern| must be
// kept alive as long as the Token objects.
COMPONENT_EXPORT(LIBURLPATTERN)
base::expected<std::vector<Token>, absl::Status> Tokenize(
    std::string_view pattern,
    TokenizePolicy policy = TokenizePolicy::kStrict);

}  // namespace liburlpattern

#endif  // THIRD_PARTY_LIBURLPATTERN_LEXER_H_