File: frequency_lists.hpp

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (85 lines) | stat: -rw-r--r-- 2,835 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#ifndef __ZXCVBN__FREQUENCY_LISTS_HPP
#define __ZXCVBN__FREQUENCY_LISTS_HPP

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string_view>
#include <utility>
#include <vector>

#include "base/files/memory_mapped_file.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
#include "third_party/abseil-cpp/absl/types/variant.h"

namespace zxcvbn {

// Rank of a word, i.e. the position it had in its original
// (frequency-ordered) dictionary; this is the value RankedDicts::Find()
// yields on a successful lookup.
using rank_t = std::size_t;

// Stores words from a set of dictionaries (originally ordered by word
// frequency) in a sorted flat array.
// Lookups run in roughly logarithmic time and, when a match is found, return
// the position of the word in the original dictionary.
// This data structure is optimized for memory efficiency over lookup speed.
// It does not contain any pointers and its format is target-independent, so it
// could theoretically directly be mapped from disk.
//
// Since this data structure sorts words alphabetically, the lookup code could
// be extended to also answer the question "are there any entries that start
// with the given prefix", which should permit speeding up dictionary_match().
// That isn't implemented yet though.
class RankedDicts {
 public:
  // Abstraction layer for the binary blob of data that contains the contents
  // of the `RankedDicts`. The data can either be held directly in memory or
  // be obtained from a memory mapped file.
  // See `RankedDictEntryRef` and the rest of frequency_lists.cpp for
  // documentation of the data structure.
  class Datawrapper {
   public:
    explicit Datawrapper(std::vector<char> data);
    explicit Datawrapper(std::unique_ptr<base::MemoryMappedFile> map);
    Datawrapper() = default;
    Datawrapper(Datawrapper&&) = default;

    Datawrapper& operator=(Datawrapper&&) = default;

    size_t size() const { return size_; }
    // Returns a pointer to the data chunk belonging to the buffer. Returns a
    // non-null value only if `size()` is non-zero.
    const char* data() const { return data_; }

   private:
    size_t size_ = 0u;
    const char* data_ = nullptr;
    absl::variant<std::vector<char>, std::unique_ptr<base::MemoryMappedFile>>
        content_;
  };

  explicit RankedDicts(
      const std::vector<std::vector<std::string_view>>& ordered_dicts);
  explicit RankedDicts(std::unique_ptr<base::MemoryMappedFile>);
  RankedDicts() = default;
  RankedDicts(RankedDicts&&) = default;
  RankedDicts(const RankedDicts&) = delete;

  RankedDicts& operator=(RankedDicts&&) = default;
  RankedDicts& operator=(const RankedDicts&) = delete;

  absl::optional<rank_t> Find(std::string_view needle) const;

  std::string_view DataForTesting() const {
    return std::string_view(data_.data(), data_.size());
  }

 private:
  bool IsRealMarker(size_t offset) const;

  Datawrapper data_;
};

// Takes a `RankedDicts` by value; since the type is move-only, callers hand
// over ownership. Defined outside this header -- presumably installs `dicts`
// as the instance returned by default_ranked_dicts(); TODO confirm against
// the implementation.
void SetRankedDicts(RankedDicts dicts);

// Returns a mutable reference to a `RankedDicts` instance. Defined outside
// this header -- presumably the shared default instance that SetRankedDicts()
// replaces; TODO confirm its lifetime and thread-safety in the
// implementation.
RankedDicts& default_ranked_dicts();

} // namespace zxcvbn

#endif