1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_CONVERSION
#include "inputext.h"
#include "cmemory.h"
#include "cstring.h"
#include <string.h>
U_NAMESPACE_BEGIN
// Capacity, in bytes, of the fInputBytes working buffer allocated by the
// constructor; MungeInput never stores more than this many bytes into it.
#define BUFFER_SIZE 8192
// Allocate raw storage for `count` objects of `type` through uprv_malloc.
// No constructors are run; the result is cast to `type *`. Callers in this
// file compare the result against nullptr to detect allocation failure.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Release storage obtained from NEW_ARRAY by handing it to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
/**
 * Create an InputText with freshly allocated working storage and no text
 * attached yet.
 *
 * @param status Set to U_MEMORY_ALLOCATION_ERROR when either working buffer
 *               could not be allocated; otherwise left untouched.
 */
InputText::InputText(UErrorCode &status)
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)),  // Working copy of the text (BUFFER_SIZE bytes);
                                                     //   MungeInput fills it, optionally minus markup.
      fByteStats(NEW_ARRAY(int16_t, 256)),           // One slot per byte value; MungeInput stores
                                                     //   the occurrence count of each byte here.
      fDeclaredEncoding(nullptr),                    // No declared encoding until setDeclaredEncoding.
      fRawInput(nullptr),                            // No raw text until setText.
      fRawLength(0)
{
    const bool buffersReady = (fInputBytes != nullptr) && (fByteStats != nullptr);
    if (!buffersReady) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
/**
 * Release every heap block this object owns: the declared-encoding copy
 * (allocated in setDeclaredEncoding, or still null if none was ever set) and
 * the two working buffers allocated by the constructor.
 * fRawInput is not released here; setText only stores the caller's pointer.
 */
InputText::~InputText()
{
// NOTE(review): fDeclaredEncoding may still be null here, and a buffer may be
// null if its allocation failed - this relies on uprv_free accepting null; confirm.
DELETE_ARRAY(fDeclaredEncoding);
DELETE_ARRAY(fByteStats);
DELETE_ARRAY(fInputBytes);
}
/**
 * Attach a new block of raw input bytes and reset the state derived from any
 * previous text (processed length and byte-class flags).
 *
 * The bytes are not copied: only the pointer is stored, and MungeInput later
 * reads through it.
 *
 * @param in   The raw input bytes. May be null, in which case the object
 *             records an empty input and isSet() reports false.
 * @param len  Number of bytes at `in`, or -1 when `in` is NUL-terminated and
 *             its length should be measured here.
 */
void InputText::setText(const char *in, int32_t len)
{
    fInputLen = 0;
    fC1Bytes  = false;
#if APPLE_ICU_CHANGES
// rdar://56373519
    fOnlyTypicalASCII = false;
#endif // APPLE_ICU_CHANGES
    fRawInput = (const uint8_t *) in;
    if (in == nullptr) {
        // Never measure or index a null pointer: a null text has no bytes,
        // whatever length the caller passed (including the -1 sentinel).
        fRawLength = 0;
    } else {
        fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
    }
}
/**
 * Store a private, NUL-terminated copy of the encoding name declared for the
 * input, replacing any previously stored name.
 *
 * @param encoding  The declared encoding name. When null the call is a no-op
 *                  and any previously stored name is kept.
 * @param len       Number of chars of `encoding` to use, or -1 when `encoding`
 *                  is NUL-terminated. Any other negative value is invalid and
 *                  the call is ignored.
 *
 * If the copy cannot be allocated, the stored name is left null.
 */
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
{
    if (encoding == nullptr) {
        return;
    }
    if (len == -1) {
        len = (int32_t)uprv_strlen(encoding);
    }
    if (len < 0) {
        // A negative length would turn into a nonsensical allocation size.
        return;
    }

    uprv_free(fDeclaredEncoding);
    // One extra char for the terminator; widen first so len + 1 cannot overflow.
    fDeclaredEncoding = NEW_ARRAY(char, (size_t)len + 1);
    if (fDeclaredEncoding == nullptr) {
        return;
    }

    // Copy exactly `len` chars - never touching encoding[len], which may lie
    // outside the caller's buffer when an explicit length was given - and
    // terminate explicitly, since strncpy does not add a NUL when the source
    // is at least `len` chars long.
    uprv_strncpy(fDeclaredEncoding, encoding, len);
    fDeclaredEncoding[len] = 0;
}
/**
 * Report whether a non-null raw input pointer is currently attached
 * (see setText).
 *
 * @return true when fRawInput is non-null, false otherwise.
 */
UBool InputText::isSet() const
{
    if (fRawInput == nullptr) {
        return false;
    }
    return true;
}
/**
 * MungeInput - preprocess the raw input so the detectors can analyze it.
 *
 * Three steps, in order:
 *   1. Optionally copy the raw bytes into fInputBytes while dropping anything
 *      that looks like html / xml markup (and, with APPLE_ICU_CHANGES, brace
 *      delimited CSS-style declarations).
 *   2. If stripping was not requested, or its result looks untrustworthy,
 *      fall back to a plain copy of the first BUFFER_SIZE raw bytes.
 *   3. Count the occurrences of every byte value into fByteStats and derive
 *      the byte-class flags from those counts.
 *
 * Apple comment: Currently only used by CharsetDetector::detectAll.
 *
 * @param fStripTags  When true, attempt the markup stripping of step 1.
 *
 * @internal
 */
void InputText::MungeInput(UBool fStripTags) {
    int32_t openerCount = 0;   // every '<' (or '{') that started a construct
    int32_t nestedCount = 0;   // openers met while the same construct was still open

    //
    //  html / xml markup stripping.
    //     Quick and dirty: not 100% accurate, but hopefully good enough
    //     statistically. Everything from an opener through its closer is
    //     discarded, and openers are counted so that we can later guess
    //     whether the input was really marked up at all.
    //     Byte values are spelled in hex on purpose, so that they always mean
    //     the ASCII characters whatever the compiler's own charset is.
    //  TODO: Think about how this interacts with EBCDIC charsets that are detected.
    //
    if (fStripTags) {
        bool withinTag = false;
#if APPLE_ICU_CHANGES
        bool withinBraces = false;
#endif // APPLE_ICU_CHANGES
        int kept = 0;

        for (int pos = 0; pos < fRawLength && kept < BUFFER_SIZE; ++pos) {
            const uint8_t ch = fRawInput[pos];

#if APPLE_ICU_CHANGES
            // Openers. A tag cannot start inside braces, nor braces inside a tag.
            if (ch == (uint8_t)0x3C) {                 /* ASCII '<' */
                if (!withinBraces) {
                    if (withinTag) {
                        ++nestedCount;
                    }
                    withinTag = true;
                    ++openerCount;
                }
            } else if (ch == (uint8_t)0x7B) {          /* ASCII '{' */
                if (!withinTag) {
                    if (withinBraces) {
                        ++nestedCount;
                    }
                    withinBraces = true;
                    ++openerCount;
                }
            }

            // Keep the byte only when it lies outside both constructs.
            if (!(withinTag || withinBraces)) {
                fInputBytes[kept++] = ch;
            }

            // Closers take effect after the keep decision, so the closing
            // byte of a construct is itself discarded.
            if (ch == (uint8_t)0x3E) {                 /* ASCII '>' */
                withinTag = false;
            } else if (ch == (uint8_t)0x7D) {          /* ASCII '}' */
                withinBraces = false;
            }
#else
            if (ch == (uint8_t)0x3C) {                 /* ASCII '<' */
                if (withinTag) {
                    ++nestedCount;
                }
                withinTag = true;
                ++openerCount;
            }

            if (!withinTag) {
                fInputBytes[kept++] = ch;
            }

            // The closer is handled after the keep decision, so the '>' that
            // ends a tag is itself discarded.
            if (ch == (uint8_t)0x3E) {                 /* ASCII '>' */
                withinTag = false;
            }
#endif // APPLE_ICU_CHANGES
        }

        fInputLen = kept;
    }

    //
    //  Abandon the stripped result when the input does not look marked up
    //  (fewer than 5 openers), when too many openers were nested, or when
    //  stripping left almost nothing of a sizeable input. Detection then
    //  works on the unstripped bytes, truncated to the buffer size.
    //
    if (openerCount < 5 || openerCount / 5 < nestedCount ||
            (fInputLen < 100 && fRawLength > 600)) {
        const int32_t copyLimit = (fRawLength > BUFFER_SIZE) ? BUFFER_SIZE : fRawLength;
        int32_t copied = 0;
        while (copied < copyLimit) {
            fInputBytes[copied] = fRawInput[copied];
            ++copied;
        }
        fInputLen = copied;
    }

    //
    //  Tally up the byte occurrence statistics.
    //  These are available for use by the various detectors.
    //
    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
    for (int32_t pos = 0; pos < fInputLen; ++pos) {
        ++fByteStats[fInputBytes[pos]];
    }

#if APPLE_ICU_CHANGES
    // rdar://56373519
    // Assume "typical ASCII only" until a byte proves otherwise. Typical means
    // tab, LF, CR or 0x20..0x7E. The same pass flags any byte in 0x80..0x9F.
    fOnlyTypicalASCII = true;
    for (int32_t value = 0x01; value <= 0xFF; ++value) {
        if (fByteStats[value] == 0) {
            continue;
        }
        const bool allowedControl = (value == 0x09 || value == 0x0A || value == 0x0D);
        const bool atypical = (value < 0x20 && !allowedControl) || value > 0x7E;
        if (!atypical) {
            continue;
        }
        fOnlyTypicalASCII = false;
        if (value >= 0x80 && value <= 0x9F) {
            fC1Bytes = true;
        }
    }
    // A single zero byte is tolerated; more than one is not typical ASCII.
    if (fByteStats[0] > 1) {
        fOnlyTypicalASCII = false;
    }
#else
    // Flag the input as soon as any byte in 0x80..0x9F is found to occur.
    int32_t value = 0x80;
    while (value <= 0x9F && fByteStats[value] == 0) {
        ++value;
    }
    if (value <= 0x9F) {
        fC1Bytes = true;
    }
#endif // APPLE_ICU_CHANGES
}
U_NAMESPACE_END
#endif
|