1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
***************************************************************************
* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
* uspoof_impl.h
*
* Implementation header for spoof detection
*
*/
#ifndef USPOOFIM_H
#define USPOOFIM_H
#include "uassert.h"
#include <_foundation_unicode/utypes.h>
#include <_foundation_unicode/uspoof.h>
#include <_foundation_unicode/uscript.h>
#include <_foundation_unicode/udata.h>
#include "udataswp.h"
#include "utrie2.h"
#if !UCONFIG_NO_NORMALIZATION
#ifdef __cplusplus
#include "capi_helper.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
// The maximum length (in UTF-16 UChars) of the skeleton replacement string resulting from
// a single input code point. This is function of the unicode.org data.
#define USPOOF_MAX_SKELETON_EXPANSION 20
// The default stack buffer size for copies or conversions or normalizations
// of input strings being checked. (Used in multiple places.)
#define USPOOF_STACK_BUFFER_SIZE 100
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
// Magic number for sanity checking spoof checkers.
#define USPOOF_CHECK_MAGIC 0x2734ecde
class ScriptSet;
class SpoofData;
struct SpoofDataHeader;
class ConfusableDataUtils;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
* USpoofChecker. One can be cast to the other.
*/
class SpoofImpl : public UObject,
public IcuCApiHelper<USpoofChecker, SpoofImpl, USPOOF_MAGIC> {
public:
SpoofImpl(SpoofData *data, UErrorCode& status);
SpoofImpl(UErrorCode& status);
SpoofImpl();
void construct(UErrorCode& status);
virtual ~SpoofImpl();
/** Copy constructor, used by the user level uspoof_clone() function.
*/
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
USpoofChecker *asUSpoofChecker();
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
/** Set and Get AllowedLocales, implementations of the corresponding API */
void setAllowedLocales(const char *localesList, UErrorCode &status);
const char * getAllowedLocales(UErrorCode &status);
// Add (union) to the UnicodeSet all of the characters for the scripts used for
// the specified locale. Part of the implementation of setAllowedLocales.
void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status);
// Functions implementing the features of UTS 39 section 5.
static void getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status);
void getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const;
void getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const;
void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const;
URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const;
int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const;
bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const;
/** parse a hex number. Untility used by the builders. */
static UChar32 ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status);
static UClassID U_EXPORT2 getStaticClassID();
virtual UClassID getDynamicClassID() const override;
//
// Data Members
//
int32_t fChecks; // Bit vector of checks to perform.
SpoofData *fSpoofData;
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
// for this Spoof Checker. Defaults to all chars.
const char *fAllowedLocales; // The list of allowed locales.
URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
};
/**
* Class CheckResult corresponds directly to the plain C API opaque type
* USpoofCheckResult. One can be cast to the other.
*/
class CheckResult : public UObject,
public IcuCApiHelper<USpoofCheckResult, CheckResult, USPOOF_CHECK_MAGIC> {
public:
CheckResult();
virtual ~CheckResult();
USpoofCheckResult *asUSpoofCheckResult();
static CheckResult *validateThis(USpoofCheckResult *ptr, UErrorCode &status);
static const CheckResult *validateThis(const USpoofCheckResult *ptr, UErrorCode &status);
void clear();
// Used to convert this CheckResult to the older int32_t return value API
int32_t toCombinedBitmask(int32_t expectedChecks);
// Data Members
int32_t fChecks; // Bit vector of checks that were failed.
UnicodeSet fNumerics; // Set of numerics found in the string.
URestrictionLevel fRestrictionLevel; // The restriction level of the string.
};
//
// Confusable Mappings Data Structures, version 2.0
//
// For the confusable data, we are essentially implementing a map,
// key: a code point
// value: a string. Most commonly one char in length, but can be more.
//
// The keys are stored as a sorted array of 32 bit ints.
// bits 0-23 a code point value
// bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
// The key table is sorted in ascending code point order. (not on the
// 32 bit int value, the flag bits do not participate in the sorting.)
//
// Lookup is done by means of a binary search in the key table.
//
// The corresponding values are kept in a parallel array of 16 bit ints.
// If the value string is of length 1, it is literally in the value array.
// For longer strings, the value array contains an index into the strings table.
//
// String Table:
// The strings table contains all of the value strings (those of length two or greater)
// concatenated together into one long char16_t (UTF-16) array.
//
// There is no nul character or other mark between adjacent strings.
//
//----------------------------------------------------------------------------
//
// Changes from format version 1 to format version 2:
// 1) Removal of the whole-script confusable data tables.
// 2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
// 3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
// 4) Removal of the string lengths table since 8 bits is sufficient for the
// lengths of all entries in confusables.txt.
// Internal functions for manipulating confusable data table keys
#define USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 2 // version for ICU 58
class ConfusableDataUtils {
public:
inline static UChar32 keyToCodePoint(int32_t key) {
return key & 0x00ffffff;
}
inline static int32_t keyToLength(int32_t key) {
return ((key & 0xff000000) >> 24) + 1;
}
inline static int32_t codePointAndLengthToKey(UChar32 codePoint, int32_t length) {
U_ASSERT((codePoint & 0x00ffffff) == codePoint);
U_ASSERT(length <= 256);
return codePoint | ((length - 1) << 24);
}
};
//-------------------------------------------------------------------------------------
//
// SpoofData
//
// A small class that wraps the raw (usually memory mapped) spoof data.
// Serves two primary functions:
// 1. Convenience. Contains real pointers to the data, to avoid dealing with
// the offsets in the raw data.
// 2. Reference counting. When a spoof checker is cloned, the raw data is shared
// and must be retained until all checkers using the data are closed.
// Nothing in this struct includes state that is specific to any particular
// USpoofDetector object.
//
//---------------------------------------------------------------------------------------
class SpoofData: public UMemory {
public:
static SpoofData* getDefault(UErrorCode &status); // Get standard ICU spoof data.
static void releaseDefault(); // Cleanup reference to default spoof data.
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
// Only used when building new data from rules.
// Constructor for use when creating from prebuilt default data.
// A UDataMemory is what the ICU internal data loading functions provide.
// The udm is adopted by the SpoofData.
SpoofData(UDataMemory *udm, UErrorCode &status);
// Constructor for use when creating from serialized data.
//
SpoofData(const void *serializedData, int32_t length, UErrorCode &status);
// Check raw Spoof Data Version compatibility.
// Return true it looks good.
UBool validateDataVersion(UErrorCode &status) const;
~SpoofData(); // Destructor not normally used.
// Use removeReference() instead.
// Reference Counting functions.
// Clone of a user-level spoof detector increments the ref count on the data.
// Close of a user-level spoof detector decrements the ref count.
// If the data is owned by us, it will be deleted when count goes to zero.
SpoofData *addReference();
void removeReference();
// Reset all fields to an initial state.
// Called from the top of all constructors.
void reset();
// Copy this instance's raw data buffer to the specified address.
int32_t serialize(void *buf, int32_t capacity, UErrorCode &status) const;
// Get the total number of bytes of data backed by this SpoofData.
// Not to be confused with length, which returns the number of confusable entries.
int32_t size() const;
// Get the confusable skeleton transform for a single code point.
// The result is a string with a length between 1 and 18 as of Unicode 9.
// This is the main public endpoint for this class.
// @return The length in UTF-16 code units of the substitution string.
int32_t confusableLookup(UChar32 inChar, UnicodeString &dest) const;
// Get the number of confusable entries in this SpoofData.
int32_t length() const;
// Get the code point (key) at the specified index.
UChar32 codePointAt(int32_t index) const;
// Get the confusable skeleton (value) at the specified index.
// Append it to the specified UnicodeString&.
// @return The length in UTF-16 code units of the skeleton string.
int32_t appendValueTo(int32_t index, UnicodeString& dest) const;
private:
// Reserve space in the raw data. For use by builder when putting together a
// new set of data. Init the new storage to zero, to prevent inconsistent
// results if it is not all otherwise set by the requester.
// Return:
// pointer to the new space that was added by this function.
void *reserveSpace(int32_t numBytes, UErrorCode &status);
// initialize the pointers from this object to the raw data.
void initPtrs(UErrorCode &status);
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
UBool fDataOwned; // True if the raw data is owned, and needs
// to be deleted when refcount goes to zero.
UDataMemory *fUDM; // If not nullptr, our data came from a
// UDataMemory, which we must close when
// we are done.
uint32_t fMemLimit; // Limit of available raw data space
u_atomic_int32_t fRefCount;
// Confusable data
int32_t *fCFUKeys;
uint16_t *fCFUValues;
char16_t *fCFUStrings;
friend class ConfusabledataBuilder;
};
//---------------------------------------------------------------------------------------
//
// Raw Binary Data Formats, as loaded from the ICU data file,
// or as built by the builder.
//
//---------------------------------------------------------------------------------------
struct SpoofDataHeader {
int32_t fMagic; // (0x3845fdef)
uint8_t fFormatVersion[4]; // Data Format. Same as the value in struct UDataInfo
// if there is one associated with this data.
int32_t fLength; // Total length in bytes of this spoof data,
// including all sections, not just the header.
// The following four sections refer to data representing the confusable data
// from the Unicode.org data from "confusables.txt"
int32_t fCFUKeys; // byte offset to Keys table (from SpoofDataHeader *)
int32_t fCFUKeysSize; // number of entries in keys table (32 bits each)
// TODO: change name to fCFUValues, for consistency.
int32_t fCFUStringIndex; // byte offset to String Indexes table
int32_t fCFUStringIndexSize; // number of entries in String Indexes table (16 bits each)
// (number of entries must be same as in Keys table
int32_t fCFUStringTable; // byte offset of String table
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
// The following sections are for data from xidmodifications.txt
int32_t unused[15]; // Padding, Room for Expansion
};
U_NAMESPACE_END
#endif /* __cplusplus */
/**
* Endianness swap function for binary spoof data.
* @internal
*/
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
UErrorCode *status);
#endif
#endif /* USPOOFIM_H */
|