1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
#include <zxcvbn/frequency_lists.hpp>
#include <algorithm>
#include <memory>
#include <utility>
#include "base/check.h"
#include "base/check_op.h"
#include "base/files/memory_mapped_file.h"
#include "base/logging.h"
#include "base/no_destructor.h"
#include "base/notreached.h"
#include "base/task/thread_pool.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
#include "third_party/abseil-cpp/absl/types/variant.h"
namespace zxcvbn {
namespace {
// A big-endian 16-bit value, consisting of a 15-bit number and a marker bit in
// the most significant position (in the first byte).
// No alignment requirements.
// This is used to store a "rank", which is the position at which a word
// occurred in a wordlist.
class MarkedBigEndianU15 {
public:
static constexpr size_t MAX_VALUE = (1 << 15) - 1;
static constexpr uint8_t MARKER_BIT = 0x80;
uint16_t get() const {
return (encoded_value[0] & ~MARKER_BIT) * 256 + encoded_value[1];
}
static void AppendToVector(uint16_t value, std::vector<char>& vec) {
CHECK(value <= MAX_VALUE);
vec.push_back((value >> 8) | MARKER_BIT);
vec.push_back(value & 0xff);
}
// Check whether the given byte has the high bit set.
// This always returns true for the first byte of a MarkedBigEndianU15, but
// may also be false-positive for the second byte.
// To reliably determine whether a given byte really is the start of a
// MarkedBigEndianU15, you need to also check the preceding byte if this
// returns true.
static bool IsPossibleMarkerByte(uint8_t c) { return (c & MARKER_BIT) != 0; }
private:
uint8_t encoded_value[2];
};
static_assert(
sizeof(MarkedBigEndianU15) == 2,
"object layout must fit with assumptions in the rest of this file");
struct MergedEntry {
size_t rank;
std::string_view value;
};
// A reference to an entry inside a dictionary.
// The entry consists of a MarkedBigEndianU15 representing the word's rank
// (the position at which the word appears in the original wordlist) and an
// inline string (ASCII, terminated with a byte that has the MARKER_BIT set)
// that stores the actual word.
class RankedDictEntryRef {
public:
explicit RankedDictEntryRef(const RankedDicts::Datawrapper& wrapper,
size_t offset) {
size_t size = wrapper.size();
const char* data = wrapper.data();
CHECK_LT(offset + sizeof(MarkedBigEndianU15), size);
const char* raw_rank = data + offset;
rank_ = reinterpret_cast<const MarkedBigEndianU15*>(raw_rank)->get();
size_t value_start = offset + sizeof(MarkedBigEndianU15);
size_t value_end = value_start;
while (true) {
CHECK_LT(value_end, size);
if (MarkedBigEndianU15::IsPossibleMarkerByte(data[value_end])) {
break;
}
value_end++;
}
value_ = std::string_view(data + value_start, value_end - value_start);
}
RankedDictEntryRef(RankedDictEntryRef&) = delete;
RankedDictEntryRef& operator=(const RankedDictEntryRef&) = delete;
uint16_t rank() const { return rank_; }
std::string_view value() const { return value_; }
static void AppendToVector(MergedEntry entry, std::vector<char>& vec) {
if (entry.rank > MarkedBigEndianU15::MAX_VALUE) {
LOG(ERROR) << "MarkedBigEndianU15 clamping";
entry.rank = MarkedBigEndianU15::MAX_VALUE;
}
MarkedBigEndianU15::AppendToVector(entry.rank, vec);
vec.insert(vec.end(), entry.value.begin(), entry.value.end());
}
private:
size_t rank_;
std::string_view value_;
};
// Helper function that does nothing with the RankedDicts apart from letting
// it destruct as it goes out of scope. This is called on the ThreadPool to
// allow for potentially blocking behavior of `RankedDicts` destructor.
void DoNothing(RankedDicts dicts) {}
} // namespace
RankedDicts::Datawrapper::Datawrapper(std::vector<char> data)
: size_(data.size()), data_(data.data()), content_(std::move(data)) {}
RankedDicts::Datawrapper::Datawrapper(
std::unique_ptr<base::MemoryMappedFile> map)
: size_((map && map->IsValid()) ? map->length() : 0u),
data_(map && map->IsValid() ? reinterpret_cast<const char*>(map->data())
: nullptr),
content_(std::move(map)) {}
RankedDicts::RankedDicts(
const std::vector<std::vector<std::string_view>>& ordered_dicts) {
std::vector<MergedEntry> merged_dicts;
for (const std::vector<std::string_view>& strings : ordered_dicts) {
size_t rank = 1;
for (const std::string_view& s : strings) {
for (char c : s) {
if (MarkedBigEndianU15::IsPossibleMarkerByte(c)) {
NOTREACHED() << "RankedDicts bad character "
<< static_cast<unsigned char>(c);
}
}
merged_dicts.push_back({rank++, s});
}
}
std::sort(merged_dicts.begin(), merged_dicts.end(),
[](MergedEntry& a, MergedEntry& b) { return a.value < b.value; });
if (merged_dicts.size() == 0)
return;
// first pass: calculate required total size
size_t dict_size = sizeof(MarkedBigEndianU15) * merged_dicts.size();
for (MergedEntry& entry : merged_dicts)
dict_size += entry.value.size();
// 1 byte at the end for trailing marker byte (for finding last string size)
std::vector<char> vec;
vec.reserve(dict_size + 1);
// second pass: place elements in allocated array
for (MergedEntry& entry : merged_dicts)
RankedDictEntryRef::AppendToVector(entry, vec);
CHECK_EQ(vec.size(), dict_size);
vec.push_back(MarkedBigEndianU15::MARKER_BIT);
data_ = Datawrapper(std::move(vec));
}
RankedDicts::RankedDicts(std::unique_ptr<base::MemoryMappedFile> map)
: data_(std::move(map)) {}
// Performs a binary search over an array of variable-size elements.
// To find an element in the middle between two others, we first locate the
// *byte* in the middle, then seek forward until we hit a marker byte that
// will only appear at the start of an allocation.
absl::optional<rank_t> RankedDicts::Find(std::string_view needle) const {
// Special case for empty dictionary.
size_t size = data_.size();
if (size == 0) {
return absl::nullopt;
}
CHECK_GE(size, 3u); // 2 bytes header, 1 byte trailing marker
// Create a range whose start and end point to marker bytes.
size_t range_start = 0;
size_t range_last = size - 2u;
CHECK(IsRealMarker(0));
while (!IsRealMarker(range_last))
range_last--;
while (true) {
size_t midpoint = range_start + (range_last - range_start) / 2;
// Find a marker byte from the midpoint onwards. (There must be one, since
// there is one at range_last.)
size_t adjusted_midpoint = midpoint;
while (!IsRealMarker(adjusted_midpoint))
adjusted_midpoint++;
// Perform the actual comparison.
RankedDictEntryRef mid_entry(data_, adjusted_midpoint);
std::string_view mid_value = mid_entry.value();
int cmp_result = mid_value.compare(needle);
if (cmp_result == 0)
return mid_entry.rank();
if (cmp_result < 0) {
if (adjusted_midpoint == range_last)
return absl::nullopt;
range_start = adjusted_midpoint + 1;
while (!IsRealMarker(range_start))
range_start++;
} else {
if (adjusted_midpoint == range_start)
return absl::nullopt;
range_last = adjusted_midpoint - 1;
while (!IsRealMarker(range_last))
range_last--;
}
}
}
// Determine whether an entry starts at the given offset; in other words,
// determine whether a MarkedBigEndianU15 starts there.
bool RankedDicts::IsRealMarker(size_t offset) const {
CHECK_LT(offset, data_.size());
const char* data = data_.data();
if (MarkedBigEndianU15::IsPossibleMarkerByte(data[offset])) {
if (offset == 0)
return true;
if (!MarkedBigEndianU15::IsPossibleMarkerByte(data[offset - 1])) {
return true;
}
}
return false;
}
void SetRankedDictsImplementation(RankedDicts dicts) {
default_ranked_dicts() = std::move(dicts);
}
void SetRankedDicts(RankedDicts dicts) {
// Destroying a `RankedDict` may block if it is based on a `MemoryMappedFile`.
// Therefore this helper moves the task of doing it to a thread pool.
base::ThreadPool::PostTask(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT},
base::BindOnce(&DoNothing, std::move(default_ranked_dicts())));
default_ranked_dicts() = std::move(dicts);
}
RankedDicts& default_ranked_dicts() {
static base::NoDestructor<RankedDicts> default_dicts;
return *default_dicts;
}
} // namespace zxcvbn
|