1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "net/base/lookup_string_in_fixed_set.h"
#include "base/check.h"
namespace net {
namespace {
// Read next offset from |pos|, increment |offset| by that amount, and increment
// |pos| either to point to the start of the next encoded offset in its node, or
// nullptr, if there are no remaining offsets.
//
// Returns true if an offset could be read; false otherwise.
inline bool GetNextOffset(const unsigned char** pos,
const unsigned char** offset) {
if (*pos == nullptr)
return false;
size_t bytes_consumed;
switch (**pos & 0x60) {
case 0x60: // Read three byte offset
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
bytes_consumed = 3;
break;
case 0x40: // Read two byte offset
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
bytes_consumed = 2;
break;
default:
*offset += (*pos)[0] & 0x3F;
bytes_consumed = 1;
}
if ((**pos & 0x80) != 0) {
*pos = nullptr;
} else {
*pos += bytes_consumed;
}
return true;
}
// Check if byte at |offset| is last in label.
bool IsEOL(const unsigned char* offset) {
return (*offset & 0x80) != 0;
}
// Check if byte at |offset| matches key. This version matches both end-of-label
// chars and not-end-of-label chars.
bool IsMatch(const unsigned char* offset, char key) {
return (*offset & 0x7F) == key;
}
// Read return value at |offset|, if it is a return value. Returns true if a
// return value could be read, false otherwise.
bool GetReturnValue(const unsigned char* offset, int* return_value) {
// Return values are always encoded as end-of-label chars (so the high bit is
// set). So byte values in the inclusive range [0x80, 0x9F] encode the return
// values 0 through 31 (though make_dafsa.py doesn't currently encode values
// higher than 7). The following code does that translation.
if ((*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x1F;
return true;
}
return false;
}
} // namespace
FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph,
size_t length)
: pos_(graph), end_(graph + length) {}
FixedSetIncrementalLookup::FixedSetIncrementalLookup(
const FixedSetIncrementalLookup& other) = default;
FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
const FixedSetIncrementalLookup& other) = default;
FixedSetIncrementalLookup::~FixedSetIncrementalLookup() = default;
bool FixedSetIncrementalLookup::Advance(char input) {
if (!pos_) {
// A previous input exhausted the graph, so there are no possible matches.
return false;
}
// Only ASCII printable chars are supported by the current DAFSA format -- the
// high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
// low values (values 0x00-0x1F) are reserved to encode the return values. So
// values outside this range will never be in the dictionary.
if (input >= 0x20) {
if (pos_is_label_character_) {
// Currently processing a label, so it is only necessary to check the byte
// at |pos_| to see if it encodes a character matching |input|.
bool is_last_char_in_label = IsEOL(pos_);
bool is_match = IsMatch(pos_, input);
if (is_match) {
// If this is not the last character in the label, the next byte should
// be interpreted as a character or return value. Otherwise, the next
// byte should be interpreted as a list of child node offsets.
++pos_;
DCHECK(pos_ < end_);
pos_is_label_character_ = !is_last_char_in_label;
return true;
}
} else {
const unsigned char* offset = pos_;
// Read offsets from |pos_| until the label of the child node at |offset|
// matches |input|, or until there are no more offsets.
while (GetNextOffset(&pos_, &offset)) {
DCHECK(offset < end_);
DCHECK((pos_ == nullptr) || (pos_ < end_));
// |offset| points to a DAFSA node that is a child of the original node.
//
// The low 7 bits of a node encodes a character value; the high bit
// indicates whether it's the last character in the label.
//
// Note that |*offset| could also be a result code value, but these are
// really just out-of-range ASCII values, encoded the same way as
// characters. Since |input| was already validated as a printable ASCII
// value ASCII value, IsMatch will never return true if |offset| is a
// result code.
bool is_last_char_in_label = IsEOL(offset);
bool is_match = IsMatch(offset, input);
if (is_match) {
// If this is not the last character in the label, the next byte
// should be interpreted as a character or return value. Otherwise,
// the next byte should be interpreted as a list of child node
// offsets.
pos_ = offset + 1;
DCHECK(pos_ < end_);
pos_is_label_character_ = !is_last_char_in_label;
return true;
}
}
}
}
// If no match was found, then end of the DAFSA has been reached.
pos_ = nullptr;
pos_is_label_character_ = false;
return false;
}
int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
int value = kDafsaNotFound;
// Look to see if there is a next character that's a return value.
if (pos_is_label_character_) {
// Currently processing a label, so it is only necessary to check the byte
// at |pos_| to see if encodes a return value.
GetReturnValue(pos_, &value);
} else {
// Otherwise, |pos_| is an offset list (or nullptr). Explore the list of
// child nodes (given by their offsets) to find one whose label is a result
// code.
//
// This search uses a temporary copy of |pos_|, since mutating |pos_| could
// skip over a node that would be important to a subsequent Advance() call.
const unsigned char* temp_pos = pos_;
// Read offsets from |temp_pos| until either |temp_pos| is nullptr or until
// the byte at |offset| contains a result code (encoded as an ASCII
// character below 0x20).
const unsigned char* offset = pos_;
while (GetNextOffset(&temp_pos, &offset)) {
DCHECK(offset < end_);
DCHECK((temp_pos == nullptr) || temp_pos < end_);
if (GetReturnValue(offset, &value))
break;
}
}
return value;
}
int LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length) {
// Do an incremental lookup until either the end of the graph is reached, or
// until every character in |key| is consumed.
FixedSetIncrementalLookup lookup(graph, length);
const char* key_end = key + key_length;
while (key != key_end) {
if (!lookup.Advance(*key))
return kDafsaNotFound;
key++;
}
// The entire input was consumed without reaching the end of the graph. Return
// the result code (if present) for the current position, or kDafsaNotFound.
return lookup.GetResultForCurrentSequence();
}
// This function is only used by GetRegistryLengthInStrippedHost(), but is
// implemented here to allow inlining of
// LookupStringInFixedSet::GetResultForCurrentSequence() and
// LookupStringInFixedSet::Advance() at compile time. Tests on x86_64 linux
// indicated about 10% increased runtime cost for GetRegistryLength() in average
// if the implementation of this function was separated from the lookup methods.
int LookupSuffixInReversedSet(const unsigned char* graph,
size_t length,
bool include_private,
base::StringPiece host,
size_t* suffix_length) {
FixedSetIncrementalLookup lookup(graph, length);
*suffix_length = 0;
int result = kDafsaNotFound;
base::StringPiece::const_iterator pos = host.end();
// Look up host from right to left.
while (pos != host.begin() && lookup.Advance(*--pos)) {
// Only host itself or a part that follows a dot can match.
if (pos == host.begin() || *(pos - 1) == '.') {
int value = lookup.GetResultForCurrentSequence();
if (value != kDafsaNotFound) {
// Break if private and private rules should be excluded.
if ((value & kDafsaPrivateRule) && !include_private)
break;
// Save length and return value. Since hosts are looked up from right to
// left, the last saved values will be from the longest match.
*suffix_length = host.end() - pos;
result = value;
}
}
}
return result;
}
} // namespace net
|