1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
|
/*
* String cache.
*
* Provides a cache to optimize indexed string lookups. The cache keeps
* track of (byte offset, char offset) states for a fixed number of strings.
* Otherwise we'd need to scan from either end of the string, as we store
* strings in (extended) UTF-8.
*/
#include "duk_internal.h"
/*
* Delete references to given hstring from the heap string cache.
*
* String cache references are 'weak': they are not counted towards
* reference counts, nor serve as roots for mark-and-sweep. When an
* object is about to be freed, such references need to be removed.
*/
DUK_INTERNAL void duk_heap_strcache_string_remove(duk_heap *heap, duk_hstring *h) {
duk_uint_t i;
for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
duk_strcache_entry *c = heap->strcache + i;
if (c->h == h) {
DUK_DD(
DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p", (void *) h, (void *) heap));
c->h = NULL;
/* XXX: the string shouldn't appear twice, but we now loop to the
* end anyway; if fixed, add a looping assertion to ensure there
* is no duplicate.
*/
}
}
}
/*
* String scanning helpers
*
* All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are
* considered to contribute a character. This must match how string
* character length is computed.
*/
DUK_LOCAL const duk_uint8_t *duk__scan_forwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
while (n > 0) {
for (;;) {
p++;
if (p >= q) {
return NULL;
}
if ((*p & 0xc0) != 0x80) {
break;
}
}
n--;
}
return p;
}
DUK_LOCAL const duk_uint8_t *duk__scan_backwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
while (n > 0) {
for (;;) {
p--;
if (p < q) {
return NULL;
}
if ((*p & 0xc0) != 0x80) {
break;
}
}
n--;
}
return p;
}
/*
* Convert char offset to byte offset
*
* Avoid using the string cache if possible: for ASCII strings byte and
* char offsets are equal and for short strings direct scanning may be
* better than using the string cache (which may evict a more important
* entry).
*
* Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t).
* Better typing might be to use duk_size_t.
*
* Caller should ensure 'char_offset' is within the string bounds [0,charlen]
* (endpoint is inclusive). If this is not the case, no memory unsafe
* behavior will happen but an error will be thrown.
*/
DUK_INTERNAL duk_uint_fast32_t duk_heap_strcache_offset_char2byte(duk_hthread *thr, duk_hstring *h, duk_uint_fast32_t char_offset) {
duk_heap *heap;
duk_strcache_entry *sce;
duk_uint_fast32_t byte_offset;
duk_uint_t i;
duk_bool_t use_cache;
duk_uint_fast32_t dist_start, dist_end, dist_sce;
duk_uint_fast32_t char_length;
const duk_uint8_t *p_start;
const duk_uint8_t *p_end;
const duk_uint8_t *p_found;
/*
* For ASCII strings, the answer is simple.
*/
if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
return char_offset;
}
char_length = (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h);
DUK_ASSERT(char_offset <= char_length);
if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
/* Must recheck because the 'is ascii' flag may be set
* lazily. Alternatively, we could just compare charlen
* to bytelen.
*/
return char_offset;
}
/*
* For non-ASCII strings, we need to scan forwards or backwards
* from some starting point. The starting point may be the start
* or end of the string, or some cached midpoint in the string
* cache.
*
* For "short" strings we simply scan without checking or updating
* the cache. For longer strings we check and update the cache as
* necessary, inserting a new cache entry if none exists.
*/
DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld",
(void *) h,
(long) char_offset,
(long) DUK_HSTRING_GET_CHARLEN(h),
(long) DUK_HSTRING_GET_BYTELEN(h)));
heap = thr->heap;
sce = NULL;
use_cache = (char_length > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT);
if (use_cache) {
#if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):"));
for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
duk_strcache_entry *c = heap->strcache + i;
DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
(long) i,
(void *) c->h,
(long) c->cidx,
(long) c->bidx));
}
#endif
for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
duk_strcache_entry *c = heap->strcache + i;
if (c->h == h) {
sce = c;
break;
}
}
}
/*
* Scan from shortest distance:
* - start of string
* - end of string
* - cache entry (if exists)
*/
DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h) >= char_offset);
dist_start = char_offset;
dist_end = char_length - char_offset;
dist_sce = 0;
DUK_UNREF(dist_sce); /* initialize for debug prints, needed if sce==NULL */
p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
p_end = (const duk_uint8_t *) (p_start + DUK_HSTRING_GET_BYTELEN(h));
p_found = NULL;
if (sce) {
if (char_offset >= sce->cidx) {
dist_sce = char_offset - sce->cidx;
if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
"dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
"scan forwards from sce",
(long) use_cache,
(void *) (sce ? sce->h : NULL),
(sce ? (long) sce->cidx : (long) -1),
(sce ? (long) sce->bidx : (long) -1),
(long) dist_start,
(long) dist_end,
(long) dist_sce));
p_found = duk__scan_forwards(p_start + sce->bidx, p_end, dist_sce);
goto scan_done;
}
} else {
dist_sce = sce->cidx - char_offset;
if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
"dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
"scan backwards from sce",
(long) use_cache,
(void *) (sce ? sce->h : NULL),
(sce ? (long) sce->cidx : (long) -1),
(sce ? (long) sce->bidx : (long) -1),
(long) dist_start,
(long) dist_end,
(long) dist_sce));
p_found = duk__scan_backwards(p_start + sce->bidx, p_start, dist_sce);
goto scan_done;
}
}
}
/* no sce, or sce scan not best */
if (dist_start <= dist_end) {
DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
"dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
"scan forwards from string start",
(long) use_cache,
(void *) (sce ? sce->h : NULL),
(sce ? (long) sce->cidx : (long) -1),
(sce ? (long) sce->bidx : (long) -1),
(long) dist_start,
(long) dist_end,
(long) dist_sce));
p_found = duk__scan_forwards(p_start, p_end, dist_start);
} else {
DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
"dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
"scan backwards from string end",
(long) use_cache,
(void *) (sce ? sce->h : NULL),
(sce ? (long) sce->cidx : (long) -1),
(sce ? (long) sce->bidx : (long) -1),
(long) dist_start,
(long) dist_end,
(long) dist_sce));
p_found = duk__scan_backwards(p_end, p_start, dist_end);
}
scan_done:
if (DUK_UNLIKELY(p_found == NULL)) {
/* Scan error: this shouldn't normally happen; it could happen if
* string is not valid UTF-8 data, and clen/blen are not consistent
* with the scanning algorithm.
*/
goto scan_error;
}
DUK_ASSERT(p_found >= p_start);
DUK_ASSERT(p_found <= p_end); /* may be equal */
byte_offset = (duk_uint32_t) (p_found - p_start);
DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld", (void *) h, (long) char_offset, (long) byte_offset));
/*
* Update cache entry (allocating if necessary), and move the
* cache entry to the first place (in an "LRU" policy).
*/
if (use_cache) {
/* update entry, allocating if necessary */
if (!sce) {
sce = heap->strcache + DUK_HEAP_STRCACHE_SIZE - 1; /* take last entry */
sce->h = h;
}
DUK_ASSERT(sce != NULL);
sce->bidx = (duk_uint32_t) (p_found - p_start);
sce->cidx = (duk_uint32_t) char_offset;
/* LRU: move our entry to first */
if (sce > &heap->strcache[0]) {
/*
* A C
* B A
* C <- sce ==> B
* D D
*/
duk_strcache_entry tmp;
tmp = *sce;
duk_memmove((void *) (&heap->strcache[1]),
(const void *) (&heap->strcache[0]),
(size_t) (((char *) sce) - ((char *) &heap->strcache[0])));
heap->strcache[0] = tmp;
/* 'sce' points to the wrong entry here, but is no longer used */
}
#if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):"));
for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
duk_strcache_entry *c = heap->strcache + i;
DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
(long) i,
(void *) c->h,
(long) c->cidx,
(long) c->bidx));
}
#endif
}
return byte_offset;
scan_error:
DUK_ERROR_INTERNAL(thr);
DUK_WO_NORETURN(return 0;);
}
|