1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
|
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// File utilities that use the ICU library go in this file.
#include "base/i18n/file_util_icu.h"

#include <stdint.h>

#include <limits>
#include <memory>
#include <string>

#include "base/check.h"
#include "base/files/file_path.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/i18n/string_compare.h"
#include "base/memory/singleton.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/sys_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "build/chromeos_buildflags.h"
#include "third_party/icu/source/common/unicode/uniset.h"
#include "third_party/icu/source/i18n/unicode/coll.h"
namespace base {
namespace i18n {
namespace {
class IllegalCharacters {
public:
IllegalCharacters(const IllegalCharacters&) = delete;
IllegalCharacters& operator=(const IllegalCharacters&) = delete;
static IllegalCharacters* GetInstance() {
return Singleton<IllegalCharacters>::get();
}
bool IsDisallowedEverywhere(UChar32 ucs4) const {
return !!illegal_anywhere_.contains(ucs4);
}
bool IsDisallowedLeadingOrTrailing(UChar32 ucs4) const {
return !!illegal_at_ends_.contains(ucs4);
}
#if BUILDFLAG(IS_WIN)
bool IsDisallowedShortNameCharacter(UChar32 ucs4) const {
return !!illegal_in_short_filenames_.contains(ucs4);
}
bool IsDisallowedIfMayBeShortName(UChar32 ucs4) const {
return !!required_to_be_a_short_filename_.contains(ucs4);
}
template <typename StringT>
bool HasValidDotPositionForShortName(const StringT& s) const {
auto first_dot = s.find_first_of('.');
// Short names are not required to have a "." period character...
if (first_dot == std::string::npos) {
return s.size() <= 8;
}
// ...but they must not contain more than one "." period character...
if (first_dot != s.find_last_of('.')) {
return false;
}
// ... and must contain a basename of 1-8 characters, optionally with one
// "." period character followed by an extension no more than 3 characters
// in length.
return first_dot > 0 && first_dot <= 8 && first_dot + 4 >= s.size();
}
// Returns whether `s` could possibly be in the 8.3 name format AND contains a
// '~' character, which may interact poorly with short filenames on VFAT. See
// https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-cifs/09c2ccc8-4aaf-439f-9b4e-13b3fe85a4cf.
bool CouldBeInvalidShortName(const std::u16string& s) const {
if (s.size() > 12 ||
!required_to_be_a_short_filename_.containsSome(icu::UnicodeString(
/*isTerminated=*/false, s.c_str(), s.size())) ||
!illegal_in_short_filenames_.containsNone(
icu::UnicodeString(/*isTerminated=*/false, s.c_str(), s.size()))) {
return false;
}
return HasValidDotPositionForShortName<std::u16string>(s);
}
#endif
bool IsAllowedName(const std::u16string& s) const {
return s.empty() || (!!illegal_anywhere_.containsNone(icu::UnicodeString(
/*isTerminated=*/false, s.c_str(), s.size())) &&
!illegal_at_ends_.contains(*s.begin()) &&
!illegal_at_ends_.contains(*s.rbegin())
#if BUILDFLAG(IS_WIN)
&& !CouldBeInvalidShortName(s)
#endif
);
}
private:
friend struct DefaultSingletonTraits<IllegalCharacters>;
IllegalCharacters();
~IllegalCharacters() = default;
// Set of characters considered invalid anywhere inside a filename.
icu::UnicodeSet illegal_anywhere_;
// Set of characters considered invalid at either end of a filename.
icu::UnicodeSet illegal_at_ends_;
// #if BUILDFLAG(IS_WIN)
// Set of characters which are guaranteed to exist if the filename is to be of
// the problematic VFAT 8.3 short filename format.
icu::UnicodeSet required_to_be_a_short_filename_;
// Set of characters which are not allowed in VFAT 8.3 short filenames. If
// any of these characters are present, the file cannot be a short filename.
icu::UnicodeSet illegal_in_short_filenames_;
// #endif
};
// Builds and freezes the character sets consulted by the query methods of
// IllegalCharacters. Each pattern is expected to parse; this is DCHECKed.
IllegalCharacters::IllegalCharacters() {
  UErrorCode status = U_ZERO_ERROR;

  // Control characters, formatting characters, non-characters, path separators,
  // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
  // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
  // Note that code points in the "Other, Format" (Cf) category are ignored on
  // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
  // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is
  // also excluded in some circumstances due to the possibility of interacting
  // poorly with short filenames on VFAT. (Related to CVE-2014-9390)
  illegal_anywhere_ = icu::UnicodeSet(
      UNICODE_STRING_SIMPLE("[[\"*/:<>?\\\\|][:Cc:][:Cf:]]"), status);
  DCHECK(U_SUCCESS(status));

  // Add non-characters. If this becomes a performance bottleneck by
  // any chance, do not add these to |set| and change IsFilenameLegal()
  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addition to calling
  // IsAllowedName().
  illegal_anywhere_.add(0xFDD0, 0xFDEF);
  // The last two code points of each of the 17 planes (U+xxFFFE, U+xxFFFF).
  for (int i = 0; i <= 0x10; ++i) {
    const int plane_base = 0x10000 * i;
    illegal_anywhere_.add(plane_base + 0xFFFE, plane_base + 0xFFFF);
  }
  illegal_anywhere_.freeze();

  // Whitespace, '.' and '~' may not start or end a file name.
  illegal_at_ends_ =
      icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:WSpace:][.~]]"), status);
  DCHECK(U_SUCCESS(status));
  illegal_at_ends_.freeze();

#if BUILDFLAG(IS_WIN)
  required_to_be_a_short_filename_ =
      icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[~]]"), status);
  DCHECK(U_SUCCESS(status));
  required_to_be_a_short_filename_.freeze();

  // Whitespace plus the characters  " \ / [ ] : + | < > = ; ? , *
  // Backslash and the square brackets are syntax characters inside an ICU set
  // pattern, so each one must reach ICU preceded by a backslash (and every
  // such backslash is doubled again for the C++ literal). Left unescaped,
  // "\/" is parsed as an escaped '/' and "[]" as an empty nested set, which
  // would leave '\\', '[' and ']' out of the set.
  illegal_in_short_filenames_ = icu::UnicodeSet(
      UNICODE_STRING_SIMPLE("[[:WSpace:][\"\\\\/\\[\\]:+|<>=;?,*]]"), status);
  DCHECK(U_SUCCESS(status));
  illegal_in_short_filenames_.freeze();
#endif
}
// Decodes the code point that starts at index |cursor| of |file_name| and
// advances |cursor| past it, so that it indexes the following code point.
UChar32 GetNextCodePoint(const FilePath::StringType* const file_name,
                         int& cursor) {
  const auto* const units = file_name->data();
  const int unit_count = static_cast<int>(file_name->length());
  UChar32 decoded;
#if BUILDFLAG(IS_WIN)
  // File names are UTF-16 on Windows.
  U16_NEXT(units, cursor, unit_count, decoded);
#elif BUILDFLAG(IS_POSIX) || BUILDFLAG(IS_FUCHSIA)
  // Mac and Chrome OS store file names as UTF-8. Linux does not define a file
  // system encoding at all, so the name is parsed as UTF-8 there as well.
  U8_NEXT(units, cursor, unit_count, decoded);
#else
#error Unsupported platform
#endif
  return decoded;
}
} // namespace
bool IsFilenameLegal(const std::u16string& file_name) {
return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
}
// Rewrites |file_name| in place:
//  * every code point that is disallowed everywhere becomes |replace_char|;
//  * on Windows, code points that are only a problem in 8.3 short names are
//    replaced as well when the name may be a short name;
//  * a first or last code point that is illegal at the ends is replaced when
//    |replace_char| itself is legal at the ends; otherwise the ends are trimmed
//    back to the nearest characters that are legal there (or, if there are
//    none, the name is wrapped in '_').
// |replace_char| must not itself be disallowed everywhere (DCHECKed).
void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
                                    char replace_char) {
  IllegalCharacters* const rules = IllegalCharacters::GetInstance();
  DCHECK(!(rules->IsDisallowedEverywhere(replace_char)));
  const bool replacement_is_bad_at_ends =
      rules->IsDisallowedLeadingOrTrailing(replace_char);

#if BUILDFLAG(IS_WIN)
  // Decide up front whether the name could be an 8.3 short name: it must be
  // short enough, have a valid dot layout, and contain no character that is
  // impossible in a short name. Only then does '~' need replacing.
  // NOTE(review): this scan looks at the name before any replacement is made,
  // so a character that rules out a short name here (e.g. '*') may itself be
  // replaced in the main loop, and the result could then still be rejected by
  // IsFilenameLegal() - confirm whether that is intended.
  bool maybe_short_name =
      file_name->size() <= 12 &&
      rules->HasValidDotPositionForShortName<FilePath::StringType>(*file_name);
  for (int scan = 0;  // The ICU macros expect an int.
       maybe_short_name && scan < static_cast<int>(file_name->size());) {
    const UChar32 scanned = GetNextCodePoint(file_name, scan);
    maybe_short_name = !rules->IsDisallowedShortNameCharacter(scanned);
  }
#endif

  static const UChar32 kExtensionSeparator =
      checked_cast<UChar32>(FilePath::kExtensionSeparator);

  // [keep_begin, keep_end) spans from the first to just past the last
  // unreplaced character that is legal at the ends; |last_separator| is the
  // index of the last extension separator seen. All are -1 until found.
  int keep_begin = -1;
  int keep_end = -1;
  int last_separator = -1;

  // The size is re-read on every pass because a replacement can shorten the
  // string.
  for (int cursor = 0; cursor < static_cast<int>(file_name->size());) {
    const int char_begin = cursor;
    const UChar32 code_point = GetNextCodePoint(file_name, cursor);
    const bool bad_at_ends = rules->IsDisallowedLeadingOrTrailing(code_point);
    const bool at_either_end =
        char_begin == 0 || cursor == static_cast<int>(file_name->length());

    bool must_replace = rules->IsDisallowedEverywhere(code_point);
#if BUILDFLAG(IS_WIN)
    must_replace =
        must_replace ||
        (maybe_short_name && rules->IsDisallowedIfMayBeShortName(code_point));
#endif
    must_replace = must_replace || (at_either_end && bad_at_ends &&
                                    !replacement_is_bad_at_ends);

    if (must_replace) {
      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
      // The code point may have spanned several bytes/words but its
      // replacement spans exactly one, so re-aim the cursor at the next
      // character.
      cursor = char_begin + 1;
    } else if (!bad_at_ends) {
      if (keep_begin == -1) {
        keep_begin = char_begin;
      }
      keep_end = cursor;
    }

    if (code_point == kExtensionSeparator) {
      last_separator = char_begin;
    }
  }

  // Nothing more to do when |replace_char| may legally start or end a name.
  if (!replacement_is_bad_at_ends) {
    return;
  }

  // Otherwise make sure |replace_char| is neither the first nor the last
  // character of |file_name|.
  if (keep_begin == -1) {
    // |file_name| has no characters that are legal at ends; enclose in '_'s.
    file_name->insert(file_name->begin(), FILE_PATH_LITERAL('_'));
    file_name->append(FILE_PATH_LITERAL("_"));
  } else {
    // Drop trailing instances of |replace_char| and of other characters that
    // are illegal at ends.
    file_name->erase(keep_end, FilePath::StringType::npos);

    // Do the same at the front, but keep the file-extension separator if it
    // sits directly before the kept range. The file-extension separator is
    // considered the last '.' in |file_name| followed by a legal character.
    const bool separator_precedes_kept_range =
        last_separator != -1 && last_separator == keep_begin - 1;
    if (separator_precedes_kept_range) {
      // The separator would end up first, so put '_' in front of it rather
      // than trimming it, e.g., "***.txt" -> "_.txt".
      file_name->erase(0, last_separator);
      file_name->insert(file_name->begin(), FILE_PATH_LITERAL('_'));
    } else {
      file_name->erase(0, keep_begin);
    }
  }
  DCHECK(!file_name->empty());
}
// Returns true when |a| orders strictly before |b| according to ICU's default
// collator configured at tertiary strength.
bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
  // The default collator follows the default locale, which should already have
  // been set up by the time this runs.
  UErrorCode error_code = U_ZERO_ERROR;
  std::unique_ptr<icu::Collator> collator(
      icu::Collator::createInstance(error_code));
  DCHECK(U_SUCCESS(error_code));

  // Tertiary strength makes the comparison case-sensitive.
  collator->setStrength(icu::Collator::TERTIARY);

#if BUILDFLAG(IS_WIN)
  const auto order = CompareString16WithCollator(
      *collator, AsStringPiece16(a.value()), AsStringPiece16(b.value()));
  return order == UCOL_LESS;
#elif BUILDFLAG(IS_POSIX) || BUILDFLAG(IS_FUCHSIA)
  // The file system encoding is not defined on Linux; SysNativeMBToWide is
  // assumed to deal with that.
  const std::u16string lhs = WideToUTF16(SysNativeMBToWide(a.value()));
  const std::u16string rhs = WideToUTF16(SysNativeMBToWide(b.value()));
  return CompareString16WithCollator(*collator, lhs, rhs) == UCOL_LESS;
#endif
}
// When built with BUILDFLAG(IS_CHROMEOS_ASH), replaces the base name of
// |file_name| with the result of ConvertToUtf8AndNormalize(), provided the
// conversion succeeds and yields a non-empty string. Does nothing in other
// builds.
void NormalizeFileNameEncoding(FilePath* file_name) {
#if BUILDFLAG(IS_CHROMEOS_ASH)
  std::string normalized;
  const bool converted = ConvertToUtf8AndNormalize(
      file_name->BaseName().value(), kCodepageUTF8, &normalized);
  if (!converted || normalized.empty()) {
    // Leave |file_name| untouched when there is nothing usable to substitute.
    return;
  }
  *file_name = file_name->DirName().Append(FilePath(normalized));
#endif
}
} // namespace i18n
} // namespace base
|