1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
|
/* Maxmemory directive handling (LRU eviction and other policies).
*
* ----------------------------------------------------------------------------
*
* Copyright (c) 2009-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of (a) the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#include "server.h"
#include "bio.h"
#include "atomicvar.h"
#include "script.h"
#include <math.h>
/* ----------------------------------------------------------------------------
* Data structures
* --------------------------------------------------------------------------*/
/* To improve the quality of the LRU approximation we take a set of keys
* that are good candidate for eviction across performEvictions() calls.
*
* Entries inside the eviction pool are taken ordered by idle time, putting
* greater idle times to the right (ascending order).
*
* When an LFU policy is used instead, a reverse frequency indication is used
* instead of the idle time, so that we still evict by larger value (larger
* inverse frequency means to evict keys with the least frequent accesses).
*
* Empty entries have the key pointer set to NULL. */
#define EVPOOL_SIZE 16
#define EVPOOL_CACHED_SDS_SIZE 255
struct evictionPoolEntry {
unsigned long long idle; /* Object idle time (inverse frequency for LFU) */
sds key; /* Key name. */
sds cached; /* Cached SDS object for key name. */
int dbid; /* Key DB number. */
int slot; /* Slot. */
};
static struct evictionPoolEntry *EvictionPoolLRU;
/* ----------------------------------------------------------------------------
* Implementation of eviction, aging and LRU
* --------------------------------------------------------------------------*/
/* Return the LRU clock, based on the clock resolution. This is a time
* in a reduced-bits format that can be used to set and check the
* object->lru field of redisObject structures. */
unsigned int getLRUClock(void) {
return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
}
/* This function is used to obtain the current LRU clock.
* If the current resolution is lower than the frequency we refresh the
* LRU clock (as it should be in production servers) we return the
* precomputed value, otherwise we need to resort to a system call. */
unsigned int LRU_CLOCK(void) {
unsigned int lruclock;
if (1000/server.hz <= LRU_CLOCK_RESOLUTION) {
lruclock = server.lruclock;
} else {
lruclock = getLRUClock();
}
return lruclock;
}
/* Given an object returns the min number of milliseconds the object was never
* requested, using an approximated LRU algorithm. */
unsigned long long estimateObjectIdleTime(robj *o) {
unsigned long long lruclock = LRU_CLOCK();
if (lruclock >= o->lru) {
return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
} else {
return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
LRU_CLOCK_RESOLUTION;
}
}
/* LRU approximation algorithm
*
* Redis uses an approximation of the LRU algorithm that runs in constant
* memory. Every time there is a key to expire, we sample N keys (with
* N very small, usually in around 5) to populate a pool of best keys to
* evict of M keys (the pool size is defined by EVPOOL_SIZE).
*
* The N keys sampled are added in the pool of good keys to expire (the one
* with an old access time) if they are better than one of the current keys
* in the pool.
*
* After the pool is populated, the best key we have in the pool is expired.
* However note that we don't remove keys from the pool when they are deleted
* so the pool may contain keys that no longer exist.
*
* When we try to evict a key, and all the entries in the pool don't exist
* we populate it again. This time we'll be sure that the pool has at least
* one key that can be evicted, if there is at least one key that can be
* evicted in the whole database. */
/* Create a new eviction pool. */
void evictionPoolAlloc(void) {
struct evictionPoolEntry *ep;
int j;
ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
for (j = 0; j < EVPOOL_SIZE; j++) {
ep[j].idle = 0;
ep[j].key = NULL;
ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
ep[j].dbid = 0;
}
EvictionPoolLRU = ep;
}
/* This is a helper function for performEvictions(), it is used in order
* to populate the evictionPool with a few entries every time we want to
* expire a key. Keys with idle time bigger than one of the current
* keys are added. Keys are always added if there are free entries.
*
* We insert keys on place in ascending order, so keys with the smaller
* idle time are on the left, and keys with the higher idle time on the
* right. */
int evictionPoolPopulate(redisDb *db, kvstore *samplekvs, struct evictionPoolEntry *pool) {
int j, k, count;
dictEntry *samples[server.maxmemory_samples];
int slot = kvstoreGetFairRandomDictIndex(samplekvs);
count = kvstoreDictGetSomeKeys(samplekvs,slot,samples,server.maxmemory_samples);
for (j = 0; j < count; j++) {
unsigned long long idle;
sds key;
robj *o;
dictEntry *de;
de = samples[j];
key = dictGetKey(de);
/* If the dictionary we are sampling from is not the main
* dictionary (but the expires one) we need to lookup the key
* again in the key dictionary to obtain the value object. */
if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) {
if (samplekvs != db->keys)
de = kvstoreDictFind(db->keys, slot, key);
o = dictGetVal(de);
}
/* Calculate the idle time according to the policy. This is called
* idle just because the code initially handled LRU, but is in fact
* just a score where a higher score means better candidate. */
if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
idle = estimateObjectIdleTime(o);
} else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
/* When we use an LRU policy, we sort the keys by idle time
* so that we expire keys starting from greater idle time.
* However when the policy is an LFU one, we have a frequency
* estimation, and we want to evict keys with lower frequency
* first. So inside the pool we put objects using the inverted
* frequency subtracting the actual frequency to the maximum
* frequency of 255. */
idle = 255-LFUDecrAndReturn(o);
} else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
/* In this case the sooner the expire the better. */
idle = ULLONG_MAX - dictGetSignedIntegerVal(de);
} else {
serverPanic("Unknown eviction policy in evictionPoolPopulate()");
}
/* Insert the element inside the pool.
* First, find the first empty bucket or the first populated
* bucket that has an idle time smaller than our idle time. */
k = 0;
while (k < EVPOOL_SIZE &&
pool[k].key &&
pool[k].idle < idle) k++;
if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
/* Can't insert if the element is < the worst element we have
* and there are no empty buckets. */
continue;
} else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
/* Inserting into empty position. No setup needed before insert. */
} else {
/* Inserting in the middle. Now k points to the first element
* greater than the element to insert. */
if (pool[EVPOOL_SIZE-1].key == NULL) {
/* Free space on the right? Insert at k shifting
* all the elements from k to end to the right. */
/* Save SDS before overwriting. */
sds cached = pool[EVPOOL_SIZE-1].cached;
memmove(pool+k+1,pool+k,
sizeof(pool[0])*(EVPOOL_SIZE-k-1));
pool[k].cached = cached;
} else {
/* No free space on right? Insert at k-1 */
k--;
/* Shift all elements on the left of k (included) to the
* left, so we discard the element with smaller idle time. */
sds cached = pool[0].cached; /* Save SDS before overwriting. */
if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
memmove(pool,pool+1,sizeof(pool[0])*k);
pool[k].cached = cached;
}
}
/* Try to reuse the cached SDS string allocated in the pool entry,
* because allocating and deallocating this object is costly
* (according to the profiler, not my fantasy. Remember:
* premature optimization bla bla bla. */
int klen = sdslen(key);
if (klen > EVPOOL_CACHED_SDS_SIZE) {
pool[k].key = sdsdup(key);
} else {
memcpy(pool[k].cached,key,klen+1);
sdssetlen(pool[k].cached,klen);
pool[k].key = pool[k].cached;
}
pool[k].idle = idle;
pool[k].dbid = db->id;
pool[k].slot = slot;
}
return count;
}
/* ----------------------------------------------------------------------------
* LFU (Least Frequently Used) implementation.
* We have 24 total bits of space in each object in order to implement
* an LFU (Least Frequently Used) eviction policy, since we re-use the
* LRU field for this purpose.
*
* We split the 24 bits into two fields:
*
* 16 bits 8 bits
* +------------------+--------+
* + Last access time | LOG_C |
* +------------------+--------+
*
* LOG_C is a logarithmic counter that provides an indication of the access
* frequency. However this field must also be decremented otherwise what used
* to be a frequently accessed key in the past, will remain ranked like that
* forever, while we want the algorithm to adapt to access pattern changes.
*
* So the remaining 16 bits are used in order to store the "access time",
* a reduced-precision Unix time (we take 16 bits of the time converted
* in minutes since we don't care about wrapping around) where the LOG_C
* counter decays every minute by default (depends on lfu-decay-time).
*
* New keys don't start at zero, in order to have the ability to collect
* some accesses before being trashed away, so they start at LFU_INIT_VAL.
* The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL
* when incrementing the key, so that keys starting at LFU_INIT_VAL
* (or having a smaller value) have a very high chance of being incremented
* on access. (The chance depends on counter and lfu-log-factor.)
*
* During decrement, the value of the logarithmic counter is decremented by
* one when lfu-decay-time minutes elapsed.
* --------------------------------------------------------------------------*/
/* Return the current time in minutes, just taking the least significant
* 16 bits. The returned time is suitable to be stored as LDT (last access
* time) for the LFU implementation. */
unsigned long LFUGetTimeInMinutes(void) {
return (server.unixtime/60) & 65535;
}
/* Given an object ldt (last access time), compute the minimum number of minutes
* that elapsed since the last access. Handle overflow (ldt greater than
* the current 16 bits minutes time) considering the time as wrapping
* exactly once. */
unsigned long LFUTimeElapsed(unsigned long ldt) {
unsigned long now = LFUGetTimeInMinutes();
if (now >= ldt) return now-ldt;
return 65535-ldt+now;
}
/* Logarithmically increment a counter. The greater is the current counter value
* the less likely is that it gets really incremented. Saturate it at 255. */
uint8_t LFULogIncr(uint8_t counter) {
if (counter == 255) return 255;
double r = (double)rand()/RAND_MAX;
double baseval = counter - LFU_INIT_VAL;
if (baseval < 0) baseval = 0;
double p = 1.0/(baseval*server.lfu_log_factor+1);
if (r < p) counter++;
return counter;
}
/* If the object's ldt (last access time) is reached, decrement the LFU counter but
* do not update LFU fields of the object, we update the access time
* and counter in an explicit way when the object is really accessed.
* And we will decrement the counter according to the times of
* elapsed time than server.lfu_decay_time.
* Return the object frequency counter.
*
* This function is used in order to scan the dataset for the best object
* to fit: as we check for the candidate, we incrementally decrement the
* counter of the scanned objects if needed. */
unsigned long LFUDecrAndReturn(robj *o) {
unsigned long ldt = o->lru >> 8;
unsigned long counter = o->lru & 255;
unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0;
if (num_periods)
counter = (num_periods > counter) ? 0 : counter - num_periods;
return counter;
}
/* We don't want to count AOF buffers and slaves output buffers as
* used memory: the eviction should use mostly data size, because
* it can cause feedback-loop when we push DELs into them, putting
* more and more DELs will make them bigger, if we count them, we
* need to evict more keys, and then generate more DELs, maybe cause
* massive eviction loop, even all keys are evicted.
*
* This function returns the sum of AOF and replication buffer. */
size_t freeMemoryGetNotCountedMemory(void) {
size_t overhead = 0;
/* Since all replicas and replication backlog share global replication
* buffer, we think only the part of exceeding backlog size is the extra
* separate consumption of replicas.
*
* Note that although the backlog is also initially incrementally grown
* (pushing DELs consumes memory), it'll eventually stop growing and
* remain constant in size, so even if its creation will cause some
* eviction, it's capped, and also here to stay (no resonance effect)
*
* Note that, because we trim backlog incrementally in the background,
* backlog size may exceeds our setting if slow replicas that reference
* vast replication buffer blocks disconnect. To avoid massive eviction
* loop, we don't count the delayed freed replication backlog into used
* memory even if there are no replicas, i.e. we still regard this memory
* as replicas'. */
if ((long long)server.repl_buffer_mem > server.repl_backlog_size) {
/* We use list structure to manage replication buffer blocks, so backlog
* also occupies some extra memory, we can't know exact blocks numbers,
* we only get approximate size according to per block size. */
size_t extra_approx_size =
(server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) *
(sizeof(replBufBlock)+sizeof(listNode));
size_t counted_mem = server.repl_backlog_size + extra_approx_size;
if (server.repl_buffer_mem > counted_mem) {
overhead += (server.repl_buffer_mem - counted_mem);
}
}
if (server.aof_state != AOF_OFF) {
overhead += sdsAllocSize(server.aof_buf);
}
return overhead;
}
/* Get the memory status from the point of view of the maxmemory directive:
* if the memory used is under the maxmemory setting then C_OK is returned.
* Otherwise, if we are over the memory limit, the function returns
* C_ERR.
*
* The function may return additional info via reference, only if the
* pointers to the respective arguments is not NULL. Certain fields are
* populated only when C_ERR is returned:
*
* 'total' total amount of bytes used.
* (Populated both for C_ERR and C_OK)
*
* 'logical' the amount of memory used minus the slaves/AOF buffers.
* (Populated when C_ERR is returned)
*
* 'tofree' the amount of memory that should be released
* in order to return back into the memory limits.
* (Populated when C_ERR is returned)
*
* 'level' this usually ranges from 0 to 1, and reports the amount of
* memory currently used. May be > 1 if we are over the memory
* limit.
* (Populated both for C_ERR and C_OK)
*/
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) {
size_t mem_reported, mem_used, mem_tofree;
/* Check if we are over the memory usage limit. If we are not, no need
* to subtract the slaves output buffers. We can just return ASAP. */
mem_reported = zmalloc_used_memory();
if (total) *total = mem_reported;
/* We may return ASAP if there is no need to compute the level. */
if (!server.maxmemory) {
if (level) *level = 0;
return C_OK;
}
if (mem_reported <= server.maxmemory && !level) return C_OK;
/* Remove the size of slaves output buffers and AOF buffer from the
* count of used memory. */
mem_used = mem_reported;
size_t overhead = freeMemoryGetNotCountedMemory();
mem_used = (mem_used > overhead) ? mem_used-overhead : 0;
/* Compute the ratio of memory usage. */
if (level) *level = (float)mem_used / (float)server.maxmemory;
if (mem_reported <= server.maxmemory) return C_OK;
/* Check if we are still over the memory limit. */
if (mem_used <= server.maxmemory) return C_OK;
/* Compute how much memory we need to free. */
mem_tofree = mem_used - server.maxmemory;
if (logical) *logical = mem_used;
if (tofree) *tofree = mem_tofree;
return C_ERR;
}
/* Return 1 if used memory is more than maxmemory after allocating more memory,
* return 0 if not. Redis may reject user's requests or evict some keys if used
* memory exceeds maxmemory, especially, when we allocate huge memory at once. */
int overMaxmemoryAfterAlloc(size_t moremem) {
if (!server.maxmemory) return 0; /* No limit. */
/* Check quickly. */
size_t mem_used = zmalloc_used_memory();
if (mem_used + moremem <= server.maxmemory) return 0;
size_t overhead = freeMemoryGetNotCountedMemory();
mem_used = (mem_used > overhead) ? mem_used - overhead : 0;
return mem_used + moremem > server.maxmemory;
}
/* The evictionTimeProc is started when "maxmemory" has been breached and
* could not immediately be resolved. This will spin the event loop with short
* eviction cycles until the "maxmemory" condition has resolved or there are no
* more evictable items. */
static int isEvictionProcRunning = 0;
static int evictionTimeProc(
struct aeEventLoop *eventLoop, long long id, void *clientData) {
UNUSED(eventLoop);
UNUSED(id);
UNUSED(clientData);
if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */
/* For EVICT_OK - things are good, no need to keep evicting.
* For EVICT_FAIL - there is nothing left to evict. */
isEvictionProcRunning = 0;
return AE_NOMORE;
}
void startEvictionTimeProc(void) {
if (!isEvictionProcRunning) {
isEvictionProcRunning = 1;
aeCreateTimeEvent(server.el, 0,
evictionTimeProc, NULL, NULL);
}
}
/* Check if it's safe to perform evictions.
* Returns 1 if evictions can be performed
* Returns 0 if eviction processing should be skipped
*/
static int isSafeToPerformEvictions(void) {
/* - There must be no script in timeout condition.
* - Nor we are loading data right now. */
if (isInsideYieldingLongCommand() || server.loading) return 0;
/* By default replicas should ignore maxmemory
* and just be masters exact copies. */
if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0;
/* If 'evict' action is paused, for whatever reason, then return false */
if (isPausedActionsWithUpdate(PAUSE_ACTION_EVICT)) return 0;
return 1;
}
/* Algorithm for converting tenacity (0-100) to a time limit. */
static unsigned long evictionTimeLimitUs(void) {
serverAssert(server.maxmemory_eviction_tenacity >= 0);
serverAssert(server.maxmemory_eviction_tenacity <= 100);
if (server.maxmemory_eviction_tenacity <= 10) {
/* A linear progression from 0..500us */
return 50uL * server.maxmemory_eviction_tenacity;
}
if (server.maxmemory_eviction_tenacity < 100) {
/* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */
return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0));
}
return ULONG_MAX; /* No limit to eviction time */
}
/* Check that memory usage is within the current "maxmemory" limit. If over
* "maxmemory", attempt to free memory by evicting data (if it's safe to do so).
*
* It's possible for Redis to suddenly be significantly over the "maxmemory"
* setting. This can happen if there is a large allocation (like a hash table
* resize) or even if the "maxmemory" setting is manually adjusted. Because of
* this, it's important to evict for a managed period of time - otherwise Redis
* would become unresponsive while evicting.
*
* The goal of this function is to improve the memory situation - not to
* immediately resolve it. In the case that some items have been evicted but
* the "maxmemory" limit has not been achieved, an aeTimeProc will be started
* which will continue to evict items until memory limits are achieved or
* nothing more is evictable.
*
* This should be called before execution of commands. If EVICT_FAIL is
* returned, commands which will result in increased memory usage should be
* rejected.
*
* Returns:
* EVICT_OK - memory is OK or it's not possible to perform evictions now
* EVICT_RUNNING - memory is over the limit, but eviction is still processing
* EVICT_FAIL - memory is over the limit, and there's nothing to evict
* */
int performEvictions(void) {
/* Note, we don't goto update_metrics here because this check skips eviction
* as if it wasn't triggered. it's a fake EVICT_OK. */
if (!isSafeToPerformEvictions()) return EVICT_OK;
int keys_freed = 0;
size_t mem_reported, mem_tofree;
long long mem_freed; /* May be negative */
mstime_t latency;
int slaves = listLength(server.slaves);
int result = EVICT_FAIL;
if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) {
result = EVICT_OK;
goto update_metrics;
}
if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) {
result = EVICT_FAIL; /* We need to free memory, but policy forbids. */
goto update_metrics;
}
unsigned long eviction_time_limit_us = evictionTimeLimitUs();
mem_freed = 0;
latencyStartMonitor(latency);
monotime evictionTimer;
elapsedStart(&evictionTimer);
/* Try to smoke-out bugs (server.also_propagate should be empty here) */
serverAssert(server.also_propagate.numops == 0);
/* Evictions are performed on random keys that have nothing to do with the current command slot. */
while (mem_freed < (long long)mem_tofree) {
int j, k, i;
static unsigned int next_db = 0;
sds bestkey = NULL;
int bestdbid;
redisDb *db;
dictEntry *de;
if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) ||
server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL)
{
struct evictionPoolEntry *pool = EvictionPoolLRU;
while (bestkey == NULL) {
unsigned long total_keys = 0;
/* We don't want to make local-db choices when expiring keys,
* so to start populate the eviction pool sampling keys from
* every DB. */
for (i = 0; i < server.dbnum; i++) {
db = server.db+i;
kvstore *kvs;
if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
kvs = db->keys;
} else {
kvs = db->expires;
}
unsigned long sampled_keys = 0;
unsigned long current_db_keys = kvstoreSize(kvs);
if (current_db_keys == 0) continue;
total_keys += current_db_keys;
int l = kvstoreNumNonEmptyDicts(kvs);
/* Do not exceed the number of non-empty slots when looping. */
while (l--) {
sampled_keys += evictionPoolPopulate(db, kvs, pool);
/* We have sampled enough keys in the current db, exit the loop. */
if (sampled_keys >= (unsigned long) server.maxmemory_samples)
break;
/* If there are not a lot of keys in the current db, dict/s may be very
* sparsely populated, exit the loop without meeting the sampling
* requirement. */
if (current_db_keys < (unsigned long) server.maxmemory_samples*10)
break;
}
}
if (!total_keys) break; /* No keys to evict. */
/* Go backward from best to worst element to evict. */
for (k = EVPOOL_SIZE-1; k >= 0; k--) {
if (pool[k].key == NULL) continue;
bestdbid = pool[k].dbid;
kvstore *kvs;
if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
kvs = server.db[bestdbid].keys;
} else {
kvs = server.db[bestdbid].expires;
}
de = kvstoreDictFind(kvs, pool[k].slot, pool[k].key);
/* Remove the entry from the pool. */
if (pool[k].key != pool[k].cached)
sdsfree(pool[k].key);
pool[k].key = NULL;
pool[k].idle = 0;
/* If the key exists, is our pick. Otherwise it is
* a ghost and we need to try the next element. */
if (de) {
bestkey = dictGetKey(de);
break;
} else {
/* Ghost... Iterate again. */
}
}
}
}
/* volatile-random and allkeys-random policy */
else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
{
/* When evicting a random key, we try to evict a key for
* each DB, so we use the static 'next_db' variable to
* incrementally visit all DBs. */
for (i = 0; i < server.dbnum; i++) {
j = (++next_db) % server.dbnum;
db = server.db+j;
kvstore *kvs;
if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) {
kvs = db->keys;
} else {
kvs = db->expires;
}
int slot = kvstoreGetFairRandomDictIndex(kvs);
de = kvstoreDictGetRandomKey(kvs, slot);
if (de) {
bestkey = dictGetKey(de);
bestdbid = j;
break;
}
}
}
/* Finally remove the selected key. */
if (bestkey) {
long long key_mem_freed;
db = server.db+bestdbid;
enterExecutionUnit(1, 0);
robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
deleteEvictedKeyAndPropagate(db, keyobj, &key_mem_freed);
decrRefCount(keyobj);
exitExecutionUnit();
/* Propagate the DEL command */
postExecutionUnitOperations();
mem_freed += key_mem_freed;
keys_freed++;
if (keys_freed % 16 == 0) {
/* When the memory to free starts to be big enough, we may
* start spending so much time here that is impossible to
* deliver data to the replicas fast enough, so we force the
* transmission here inside the loop. */
if (slaves) flushSlavesOutputBuffers();
/* Normally our stop condition is the ability to release
* a fixed, pre-computed amount of memory. However when we
* are deleting objects in another thread, it's better to
* check, from time to time, if we already reached our target
* memory, since the "mem_freed" amount is computed only
* across the dbAsyncDelete() call, while the thread can
* release the memory all the time. */
if (server.lazyfree_lazy_eviction) {
if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
break;
}
}
/* After some time, exit the loop early - even if memory limit
* hasn't been reached. If we suddenly need to free a lot of
* memory, don't want to spend too much time here. */
if (elapsedUs(evictionTimer) > eviction_time_limit_us) {
// We still need to free memory - start eviction timer proc
startEvictionTimeProc();
break;
}
}
} else {
goto cant_free; /* nothing to free... */
}
}
/* at this point, the memory is OK, or we have reached the time limit */
result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK;
cant_free:
if (result == EVICT_FAIL) {
/* At this point, we have run out of evictable items. It's possible
* that some items are being freed in the lazyfree thread. Perform a
* short wait here if such jobs exist, but don't wait long. */
mstime_t lazyfree_latency;
latencyStartMonitor(lazyfree_latency);
while (bioPendingJobsOfType(BIO_LAZY_FREE) &&
elapsedUs(evictionTimer) < eviction_time_limit_us) {
if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
result = EVICT_OK;
break;
}
usleep(eviction_time_limit_us < 1000 ? eviction_time_limit_us : 1000);
}
latencyEndMonitor(lazyfree_latency);
latencyAddSampleIfNeeded("eviction-lazyfree",lazyfree_latency);
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("eviction-cycle",latency);
update_metrics:
if (result == EVICT_RUNNING || result == EVICT_FAIL) {
if (server.stat_last_eviction_exceeded_time == 0)
elapsedStart(&server.stat_last_eviction_exceeded_time);
} else if (result == EVICT_OK) {
if (server.stat_last_eviction_exceeded_time != 0) {
server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time);
server.stat_last_eviction_exceeded_time = 0;
}
}
return result;
}
|