// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2001-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bocsu.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* Author: Markus W. Scherer
*
* Modification history:
* 05/18/2001 weiv Made into separate module
*/
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include <_foundation_unicode/bytestream.h>
#include <_foundation_unicode/utf16.h>
#include "bocsu.h"
/*
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 *
 * The number of bytes is chosen by comparing diff against the
 * SLOPE_REACH_{NEG,POS}_{1,2,3} thresholds from bocsu.h:
 * inside the "1" reach a single byte is written, inside "2" two bytes,
 * inside "3" three bytes, and everything beyond gets four bytes with a
 * fixed lead byte (SLOPE_MAX for positive, SLOPE_MIN for negative values).
 * Each trail byte is SLOPE_MIN plus one base-SLOPE_TAIL_COUNT digit,
 * stored with the most significant digit first.
 *
 * @param diff the difference to encode
 * @param p    output position; up to 4 bytes starting here may be written,
 *             so the caller has to provide that much room
 * @return     pointer just past the last byte written
 */
static uint8_t *
u_writeDiff(int32_t diff, uint8_t *p) {
    if(diff<SLOPE_REACH_NEG_1) {
        /*
         * Negative side, too small for a single byte.
         * NEGDIVMOD (bocsu.h) updates diff and rem in place; presumably a
         * floor-style division that leaves rem non-negative - confirm
         * against the macro definition.
         */
        int32_t rem;

        if(diff>=SLOPE_REACH_NEG_2) {
            /* two bytes: lead + one trail digit */
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, rem);
            p[0]=(uint8_t)(SLOPE_START_NEG_2+diff);
            p[1]=(uint8_t)(SLOPE_MIN+rem);
            return p+2;
        }

        if(diff>=SLOPE_REACH_NEG_3) {
            /* three bytes: trail digits are produced least significant first */
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, rem);
            p[2]=(uint8_t)(SLOPE_MIN+rem);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, rem);
            p[1]=(uint8_t)(SLOPE_MIN+rem);
            p[0]=(uint8_t)(SLOPE_START_NEG_3+diff);
            return p+3;
        }

        /* four bytes: three trail digits behind the fixed lead byte SLOPE_MIN */
        for(int32_t k=3; k>0; --k) {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, rem);
            p[k]=(uint8_t)(SLOPE_MIN+rem);
        }
        p[0]=SLOPE_MIN;
        return p+4;
    }

    if(diff<=SLOPE_REACH_POS_1) {
        /* single byte around SLOPE_MIDDLE (covers small negative and positive values) */
        p[0]=(uint8_t)(SLOPE_MIDDLE+diff);
        return p+1;
    }

    if(diff<=SLOPE_REACH_POS_2) {
        /* two bytes: lead + one trail digit */
        p[0]=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT));
        p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
        return p+2;
    }

    if(diff<=SLOPE_REACH_POS_3) {
        /* three bytes: split off the lowest digit, then the next one */
        const int32_t lowDigit=diff%SLOPE_TAIL_COUNT;
        const int32_t upper=diff/SLOPE_TAIL_COUNT;
        p[0]=(uint8_t)(SLOPE_START_POS_3+(upper/SLOPE_TAIL_COUNT));
        p[1]=(uint8_t)(SLOPE_MIN+upper%SLOPE_TAIL_COUNT);
        p[2]=(uint8_t)(SLOPE_MIN+lowDigit);
        return p+3;
    }

    /* four bytes: three trail digits behind the fixed lead byte SLOPE_MAX */
    for(int32_t k=3; k>0; --k) {
        p[k]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
        diff/=SLOPE_TAIL_COUNT;
    }
    p[0]=SLOPE_MAX;
    return p+4;
}
/*
 * Encode the code points of a string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if "prev" is always moved
 * into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan
 * area into the middle of that area.
 * Note that the identical-level run in a sort key is generated from
 * NFD text - there are never Hangul characters included.
 *
 * @param prev   the previous code point, i.e. the state to continue from
 * @param s      UTF-16 text to encode
 * @param length number of char16_t units in s; nothing is written and
 *               prev is returned unchanged when length<=0
 * @param sink   receives the encoded bytes via GetAppendBuffer()/Append()
 * @return       the last code point that was encoded, or 0 if the last
 *               thing written was a merge separator (U+FFFE)
 */
U_CFUNC UChar32
u_writeIdenticalLevelRun(UChar32 prev, const char16_t *s, int32_t length, icu::ByteSink &sink) {
    char scratch[64];
    int32_t capacity;
    int32_t pos=0;

    while(pos<length) {
        // Ask for only one byte as the minimum: forcing the sink to allocate
        // for a large min_capacity would be wasteful because we might
        // actually write just a single byte.
        char *out=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity);

        // u_writeDiff() may write up to SLOPE_MAX_BYTES at once, so a tiny
        // buffer from the sink is replaced by our own scratch space.
        if(capacity<16) {
            out=scratch;
            capacity=(int32_t)sizeof(scratch);
        }

        uint8_t *const start=reinterpret_cast<uint8_t *>(out);
        // Highest write position that still leaves room for a maximum-length diff.
        uint8_t *const lastSafe=start+capacity-SLOPE_MAX_BYTES;
        uint8_t *p=start;

        while(pos<length && p<=lastSafe) {
            if(prev>=0x4e00 && prev<0xa000) {
                /*
                 * Unihan (the range tested here is U+4e00..U+9fff):
                 * double-bytes down from the upper end
                 */
                prev=0x9fff-SLOPE_REACH_POS_2;
            } else {
                /* re-base prev relative to the start of its aligned 128-block */
                prev=(prev&~0x7f)-SLOPE_REACH_NEG_1;
            }

            UChar32 c;
            U16_NEXT(s, pos, length, c);

            if(c==0xfffe) {
                *p++=2;  // merge separator
                prev=0;
                continue;
            }

            p=u_writeDiff(c-prev, p);
            prev=c;
        }

        // Hand the bytes produced in this round to the sink.
        sink.Append(out, (int32_t)(p-start));
    }

    return prev;
}
#endif /* #if !UCONFIG_NO_COLLATION */