// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/language_detection/core/ngram_hash_ops_utils.h"
#include <cstring>
#include "base/compiler_specific.h"
#include "third_party/utf/src/include/utf.h"
namespace language_detection {
constexpr char kPrefix[] = "^";
constexpr char kSuffix[] = "$";
constexpr char kReplacementToken[] = " ";
// Out-of-line defaulted special members for TokenizedOutput (declared in
// the companion header, not visible in this file). Defining them here in
// the .cc — presumably because the header only declares them — keeps the
// header free of the definitions; TODO confirm against the header.
TokenizedOutput::TokenizedOutput() = default;
TokenizedOutput::~TokenizedOutput() = default;
TokenizedOutput::TokenizedOutput(const TokenizedOutput& rhs) = default;
// Splits `input_str` into single-rune tokens, bracketed by the sentinel
// prefix ("^") and suffix ("$") tokens. At most `max_tokens` tokens are
// produced in total (sentinels included). When
// `exclude_nonalphaspace_tokens` is true, non-alphabetic runes are
// replaced by the replacement token (" "). When `lower_case_input` is
// true, the input is Unicode-lowercased before tokenization. The returned
// TokenizedOutput holds the concatenated token string plus each token's
// (byte offset, byte length) within it.
TokenizedOutput Tokenize(std::string_view input_str,
                         size_t max_tokens,
                         bool exclude_nonalphaspace_tokens,
                         bool lower_case_input) {
  // Lowercase first if requested; the local string keeps the storage
  // alive while `input_str` views it.
  std::string lowered;
  if (lower_case_input && !input_str.empty()) {
    lowered = LowercaseUnicodeStr(input_str);
    input_str = lowered;
  }

  TokenizedOutput output;
  // +2 accounts for the prefix and suffix sentinels.
  output.str.reserve(input_str.length() + 2);
  output.tokens.reserve(input_str.length() + 2);

  // Emit the sentinel prefix token first.
  size_t next_offset = 0;
  output.str.append(kPrefix);
  output.tokens.emplace_back(next_offset, strlen(kPrefix));
  next_offset += strlen(kPrefix);

  size_t pos = 0;
  // Leave room for one more token: the suffix appended below.
  while (pos < input_str.length() && output.tokens.size() + 1 < max_tokens) {
    Rune rune;
    // Decode the next UTF-8 rune via the standard UTF-8 library.
    const size_t rune_len =
        charntorune(&rune, &input_str.at(pos), input_str.length() - pos);
    if (rune_len == 0) {
      // No complete rune could be decoded; stop emitting content tokens.
      break;
    }
    if (exclude_nonalphaspace_tokens && !isalpharune(rune)) {
      // Non-alphabetic rune: substitute the replacement token.
      output.str.append(kReplacementToken);
      output.tokens.emplace_back(next_offset, strlen(kReplacementToken));
      next_offset += strlen(kReplacementToken);
    } else {
      // Copy the rune's bytes verbatim and record its span.
      output.str.append(input_str, pos, rune_len);
      output.tokens.emplace_back(next_offset, rune_len);
      next_offset += rune_len;
    }
    pos += rune_len;
  }

  // Close with the sentinel suffix token.
  output.str.append(kSuffix);
  output.tokens.emplace_back(next_offset, strlen(kSuffix));
  next_offset += strlen(kSuffix);
  return output;
}
// Returns a copy of `input_str` with every alphabetic Unicode rune
// lowercased; all other runes are copied through unchanged. The output is
// re-encoded as UTF-8, so its byte length may differ from the input's.
std::string LowercaseUnicodeStr(std::string_view input_str) {
  std::string output_str;
  output_str.reserve(input_str.length());
  for (size_t i = 0; i < input_str.length();) {
    Rune token;
    // Decode the next UTF-8 rune.
    size_t bytes_read =
        charntorune(&token, &input_str.at(i), input_str.length() - i);
    // Stop if no complete rune could be decoded (e.g. the input ends in a
    // truncated multi-byte sequence). Without this check `i` never
    // advances and the loop spins forever; Tokenize() already guards the
    // same condition.
    if (bytes_read == 0) {
      break;
    }
    // Lowercase only alphabetic runes.
    token = isalpharune(token) ? tolowerrune(token) : token;
    // Re-encode the (possibly lowercased) rune into the output string.
    char token_buf[UTFmax];
    size_t bytes_to_write = runetochar(token_buf, &token);
    output_str.append(token_buf, bytes_to_write);
    i += bytes_read;
  }
  return output_str;
}
} // namespace language_detection