File: on_device_tail_tokenizer.h (chromium 139.0.7258.127-1)

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_ON_DEVICE_TAIL_TOKENIZER_H_
#define COMPONENTS_OMNIBOX_BROWSER_ON_DEVICE_TAIL_TOKENIZER_H_

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/files/file_path.h"

// The tokenizer performs tokenization for the on-device tail machine learning
// model. It maps raw strings to/from tokens, and tokens to/from the IDs
// accepted by the ML model, based on the given vocabulary file.
// This tokenizer does not support CJK yet.
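//
// A minimal usage sketch (the vocabulary file name here is hypothetical):
//
//   OnDeviceTailTokenizer tokenizer;
//   if (tokenizer.Init(base::FilePath(FILE_PATH_LITERAL("vocab.txt"))) &&
//       tokenizer.IsReady()) {
//     OnDeviceTailTokenizer::Tokenization tokenization;
//     tokenizer.CreatePrefixTokenization("pa", &tokenization);
//     OnDeviceTailTokenizer::TokenIds prev_ids;
//     tokenizer.TokenizePrevQuery("previous query", &prev_ids);
//   }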
class OnDeviceTailTokenizer {
 public:
  using TokenId = int32_t;
  using TokenIds = std::vector<TokenId>;

  // Data structure to store tokenization information.
  struct Tokenization {
    // Unambiguous token IDs. This should at least include the begin query
    // token.
    TokenIds unambiguous_ids;

    // Human-readable unambiguous part of the prefix.
    std::string unambiguous_prefix;

    // The constraint prefix for the next forward RNN step, if the last typed
    // token was ambiguous. For example, given the prefix [(pa)(t)], if the
    // trailing (t) could match multiple tokens, the constraint prefix will be
    // set to "t" and only outputs from the next step that match this prefix
    // will be kept.
    std::string constraint_prefix;

    Tokenization();
    ~Tokenization();
  };
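
  // Illustrative example (the vocabulary here is assumed, not taken from an
  // actual model): for a prefix tokenized as [(pa)(t)] where the trailing "t"
  // could match multiple tokens, a Tokenization might hold:
  //   unambiguous_ids    = { <begin-query token ID>, TokenToId("pa") }
  //   unambiguous_prefix = "pa"
  //   constraint_prefix  = "t"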

  OnDeviceTailTokenizer();
  ~OnDeviceTailTokenizer();

  // Loads the vocabulary file and initializes the tokenizer.
  bool Init(const base::FilePath& vocabulary_filepath);

  // Determines whether the instance is successfully initialized.
  bool IsReady() const;

  // Fills the Tokenization struct for the given prefix.
  void CreatePrefixTokenization(const std::string& prefix,
                                Tokenization* tokenization) const;

  // Tokenizes the previous query greedily.
  void TokenizePrevQuery(const std::string& prev_query,
                         TokenIds* prev_query_token_ids) const;

  // Resets the tokens <-> IDs maps.
  void Reset();

  // Maps a token to its ID and vice versa.
  std::string IdToToken(const TokenId token_id) const;
  TokenId TokenToId(const std::string& token) const;

  // Returns the size of the vocabulary.
  size_t vocab_size() const { return token_to_id_.size(); }

  // Special query token related helpers.
  bool IsEndQueryTokenId(TokenId token_id) const;
  bool IsBeginQueryTokenId(TokenId token_id) const;
  TokenId GetEndQueryTokenId() const;

  // Determines if the token corresponding to the given ID can be properly
  // printed.
  bool IsTokenPrintable(TokenId token_id) const;

 private:
  // Determines if the given token is ambiguous.
  bool IsAmbiguousToken(const std::string& token) const;

  // Initializes the ambiguous tokens map.
  void InitAmbiguousMap();

  // Encodes the given raw string into its corresponding token and ID pairs.
  // Note we always try the longest tokens in the vocabulary first and
  // gradually fall back to shorter tokens until a match for a prefix of the
  // remaining string is found. We then jump to the start of the unmatched
  // part of the string and repeat until all characters have been matched.
  // For example, given vocabulary:
  //    [1:a], [2:b], [3:c], [4:abc], [5:ab]
  // Encoding:
  //    string:abcabc -> tokens:[abc][abc] -> IDs:[4][4]
  //    string:abcab  -> tokens:[abc][ab]  -> IDs:[4][5]
  //    string:abcaba -> tokens:[abc][ab][a] -> IDs:[4][5][1]
  //    string:cbacba -> tokens:[c][b][a][c][b][a] -> IDs:[3][2][1][3][2][1]
  void EncodeRawString(
      const std::string& raw_string,
      std::vector<std::pair<std::string, TokenId>>* token_and_ids) const;
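
  // A minimal sketch of this greedy longest-match loop (illustrative only;
  // the actual implementation lives in the corresponding .cc file):
  //
  //   size_t start = 0;
  //   while (start < raw_string.size()) {
  //     size_t len = std::min(max_token_length_, raw_string.size() - start);
  //     for (; len > 0; --len) {
  //       auto it = token_to_id_.find(raw_string.substr(start, len));
  //       if (it != token_to_id_.end()) {
  //         token_and_ids->emplace_back(it->first, it->second);
  //         break;
  //       }
  //     }
  //     // Advance past the match, or skip one character if nothing matched.
  //     start += std::max<size_t>(len, 1);
  //   }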

  // Inserts a token and its ID into the tokens <-> IDs maps.
  void InsertTokenToMaps(const std::string& token);

  // Maps for tokens <-> IDs.
  base::flat_map<std::string, TokenId> token_to_id_;
  std::vector<std::string> id_to_token_;

  // The max length of tokens in the vocabulary.
  size_t max_token_length_ = 0;

  // The list of ambiguous tokens.
  base::flat_set<std::string> ambiguous_tokens_;
};

#endif  // COMPONENTS_OMNIBOX_BROWSER_ON_DEVICE_TAIL_TOKENIZER_H_