// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/language_detection/core/ngram_hash_ops_utils.h"
#include <cstring>
#include "base/compiler_specific.h"
#include "third_party/utf/src/include/utf.h"
namespace language_detection {
constexpr char kPrefix[] = "^";
constexpr char kSuffix[] = "$";
constexpr char kReplacementToken[] = " ";
// Out-of-line defaulted special members for TokenizedOutput (declared in
// the companion header, not visible in this file). Defining them here in
// the .cc — presumably because the header only declares them — keeps the
// header free of the definitions; TODO confirm against the header.
TokenizedOutput::TokenizedOutput() = default;
TokenizedOutput::~TokenizedOutput() = default;
TokenizedOutput::TokenizedOutput(const TokenizedOutput& rhs) = default;
// Splits `input_str` into single-rune tokens, bracketed by the sentinel
// prefix ("^") and suffix ("$") tokens. At most `max_tokens` tokens are
// produced in total (sentinels included). When
// `exclude_nonalphaspace_tokens` is true, non-alphabetic runes are
// replaced by the replacement token (" "). When `lower_case_input` is
// true, the input is Unicode-lowercased before tokenization. The returned
// TokenizedOutput holds the concatenated token string plus each token's
// (byte offset, byte length) within it.
TokenizedOutput Tokenize(std::string_view input_str,
                         size_t max_tokens,
                         bool exclude_nonalphaspace_tokens,
                         bool lower_case_input) {
  // Lowercase first if requested; the local string keeps the storage
  // alive while `input_str` views it.
  std::string lowered;
  if (lower_case_input && !input_str.empty()) {
    lowered = LowercaseUnicodeStr(input_str);
    input_str = lowered;
  }

  TokenizedOutput output;
  // +2 accounts for the prefix and suffix sentinels.
  output.str.reserve(input_str.length() + 2);
  output.tokens.reserve(input_str.length() + 2);

  // Emit the sentinel prefix token first.
  size_t next_offset = 0;
  output.str.append(kPrefix);
  output.tokens.emplace_back(next_offset, strlen(kPrefix));
  next_offset += strlen(kPrefix);

  size_t pos = 0;
  // Leave room for one more token: the suffix appended below.
  while (pos < input_str.length() && output.tokens.size() + 1 < max_tokens) {
    Rune rune;
    // Decode the next UTF-8 rune via the standard UTF-8 library.
    const size_t rune_len =
        charntorune(&rune, &input_str.at(pos), input_str.length() - pos);
    if (rune_len == 0) {
      // No complete rune could be decoded; stop emitting content tokens.
      break;
    }
    if (exclude_nonalphaspace_tokens && !isalpharune(rune)) {
      // Non-alphabetic rune: substitute the replacement token.
      output.str.append(kReplacementToken);
      output.tokens.emplace_back(next_offset, strlen(kReplacementToken));
      next_offset += strlen(kReplacementToken);
    } else {
      // Copy the rune's bytes verbatim and record its span.
      output.str.append(input_str, pos, rune_len);
      output.tokens.emplace_back(next_offset, rune_len);
      next_offset += rune_len;
    }
    pos += rune_len;
  }

  // Close with the sentinel suffix token.
  output.str.append(kSuffix);
  output.tokens.emplace_back(next_offset, strlen(kSuffix));
  next_offset += strlen(kSuffix);
  return output;
}
// Returns a copy of `input_str` with every alphabetic Unicode rune
// lowercased; all other runes are copied through unchanged. The output is
// re-encoded as UTF-8, so its byte length may differ from the input's.
std::string LowercaseUnicodeStr(std::string_view input_str) {
  std::string output_str;
  output_str.reserve(input_str.length());
  for (size_t i = 0; i < input_str.length();) {
    Rune token;
    // Decode the next UTF-8 rune.
    size_t bytes_read =
        charntorune(&token, &input_str.at(i), input_str.length() - i);
    // Stop if no complete rune could be decoded (e.g. the input ends in a
    // truncated multi-byte sequence). Without this check `i` never
    // advances and the loop spins forever; Tokenize() already guards the
    // same condition.
    if (bytes_read == 0) {
      break;
    }
    // Lowercase only alphabetic runes.
    token = isalpharune(token) ? tolowerrune(token) : token;
    // Re-encode the (possibly lowercased) rune into the output string.
    char token_buf[UTFmax];
    size_t bytes_to_write = runetochar(token_buf, &token);
    output_str.append(token_buf, bytes_to_write);
    i += bytes_read;
  }
  return output_str;
}
} // namespace language_detection