xref: /6.0.3/moxi/src/items.c (revision d0366df5)
1/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2#include "memcached.h"
3#include <errno.h>
4#include <stdlib.h>
5#include <stdio.h>
6#include <string.h>
7#include <time.h>
8#include <platform/cbassert.h>
9#include "log.h"
10
11/* Forward Declarations */
12static void item_link_q(item *it);
13static void item_unlink_q(item *it);
14
15/*
16 * We only reposition items in the LRU queue if they haven't been repositioned
17 * in this many seconds. That saves us from churning on frequently-accessed
18 * items.
19 */
20#define ITEM_UPDATE_INTERVAL 60
21
22#define LARGEST_ID 255
23typedef struct {
24    unsigned int evicted;
25    rel_time_t evicted_time;
26    unsigned int outofmemory;
27    unsigned int tailrepairs;
28} itemstats_t;
29
30static item *heads[LARGEST_ID];
31static item *tails[LARGEST_ID];
32static itemstats_t itemstats[LARGEST_ID];
33static unsigned int sizes[LARGEST_ID];
34
35void item_init(void) {
36    int i;
37    memset(itemstats, 0, sizeof(itemstats_t) * LARGEST_ID);
38    for(i = 0; i < LARGEST_ID; i++) {
39        heads[i] = NULL;
40        tails[i] = NULL;
41        sizes[i] = 0;
42    }
43}
44
45void item_stats_reset(void) {
46    cb_mutex_enter(&cache_lock);
47    memset(itemstats, 0, sizeof(itemstats_t) * LARGEST_ID);
48    cb_mutex_exit(&cache_lock);
49}
50
51
52/* Get the next CAS id for a new item. */
53uint64_t get_cas_id(void) {
54    static uint64_t cas_id = 0;
55    return ++cas_id;
56}
57
58/* Enable this for reference-count debugging. */
59#if 0
60# define DEBUG_REFCNT(it,op) \
61                moxi_log_write("item %x refcnt(%c) %d %c%c%c\n", \
62                        it, op, it->refcount, \
63                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
64                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
65#else
66# define DEBUG_REFCNT(it,op) do {} while(0)
67#endif
68
69/**
70 * Generates the variable-sized part of the header for an object.
71 *
72 * key     - The key
73 * nkey    - The length of the key
74 * flags   - key flags
75 * nbytes  - Number of bytes to hold value and addition CRLF terminator
76 * suffix  - Buffer for the "VALUE" line suffix (flags, size).
77 * nsuffix - The length of the suffix is stored here.
78 *
79 * Returns the total size of the header.
80 */
81static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
82                     char *suffix, uint8_t *nsuffix) {
83    /* suffix is defined at 40 chars elsewhere.. */
84    *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
85    return sizeof(item) + nkey + *nsuffix + nbytes;
86}
87
88/*@null@*/
89item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
90    uint8_t nsuffix;
91    char suffix[40];
92    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
93    if (settings.use_cas) {
94        ntotal += sizeof(uint64_t);
95    }
96
97#ifdef MOXI_ITEM_MALLOC
98    item *itx = malloc(ntotal);
99    if (itx != NULL) {
100        itx->refcount = 1;
101        itx->slabs_clsid = 0;
102        itx->next = 0;
103        itx->prev = 0;
104        itx->h_next = 0;
105        itx->it_flags = settings.use_cas ? ITEM_CAS : 0;
106        itx->nkey = nkey;
107        itx->nbytes = nbytes;
108        memcpy(ITEM_key(itx), key, nkey);
109        itx->exptime = exptime;
110        memcpy(ITEM_suffix(itx), suffix, (size_t)nsuffix);
111        itx->nsuffix = nsuffix;
112    }
113
114    return itx;
115#else
116    unsigned int id = slabs_clsid(ntotal);
117    if (id == 0)
118        return 0;
119
120    /* do a quick check if we have any expired items in the tail.. */
121    int tries = 50;
122    item *it = NULL;
123    item *search;
124
125    for (search = tails[id];
126         tries > 0 && search != NULL;
127         tries--, search=search->prev) {
128        if (search->refcount == 0 &&
129            (search->exptime != 0 && search->exptime < current_time)) {
130            it = search;
131            /* I don't want to actually free the object, just steal
132             * the item to avoid to grab the slab mutex twice ;-)
133             */
134            it->refcount = 1;
135            do_item_unlink(it);
136            /* Initialize the item block: */
137            it->slabs_clsid = 0;
138            it->refcount = 0;
139            break;
140        }
141    }
142
143    if (it == NULL && (it = slabs_alloc(ntotal, id)) == NULL) {
144        /*
145        ** Could not find an expired item at the tail, and memory allocation
146        ** failed. Try to evict some items!
147        */
148        tries = 50;
149
150        /* If requested to not push old items out of cache when memory runs out,
151         * we're out of luck at this point...
152         */
153
154        if (settings.evict_to_free == 0) {
155            itemstats[id].outofmemory++;
156            return NULL;
157        }
158
159        /*
160         * try to get one off the right LRU
161         * don't necessariuly unlink the tail because it may be locked: refcount>0
162         * search up from tail an item with refcount==0 and unlink it; give up after 50
163         * tries
164         */
165
166        if (tails[id] == 0) {
167            itemstats[id].outofmemory++;
168            return NULL;
169        }
170
171        for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) {
172            if (search->refcount == 0) {
173                if (search->exptime == 0 || search->exptime > current_time) {
174                    itemstats[id].evicted++;
175                    itemstats[id].evicted_time = current_time - search->time;
176                    STATS_LOCK();
177                    stats.evictions++;
178                    STATS_UNLOCK();
179                }
180                do_item_unlink(search);
181                break;
182            }
183        }
184        it = slabs_alloc(ntotal, id);
185        if (it == 0) {
186            itemstats[id].outofmemory++;
187            /* Last ditch effort. There is a very rare bug which causes
188             * refcount leaks. We've fixed most of them, but it still happens,
189             * and it may happen in the future.
190             * We can reasonably assume no item can stay locked for more than
191             * three hours, so if we find one in the tail which is that old,
192             * free it anyway.
193             */
194            tries = 50;
195            for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) {
196                if (search->refcount != 0 && search->time + TAIL_REPAIR_TIME < current_time) {
197                    itemstats[id].tailrepairs++;
198                    search->refcount = 0;
199                    do_item_unlink(search);
200                    break;
201                }
202            }
203            it = slabs_alloc(ntotal, id);
204            if (it == 0) {
205                return NULL;
206            }
207        }
208    }
209
210    cb_assert(it->slabs_clsid == 0);
211
212    it->slabs_clsid = id;
213
214    cb_assert(it != heads[it->slabs_clsid]);
215
216    it->next = it->prev = it->h_next = 0;
217    it->refcount = 1;     /* the caller will have a reference */
218    DEBUG_REFCNT(it, '*');
219    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
220    it->nkey = nkey;
221    it->nbytes = nbytes;
222    memcpy(ITEM_key(it), key, nkey);
223    it->exptime = exptime;
224    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
225    it->nsuffix = nsuffix;
226    return it;
227#endif
228}
229
230void item_free(item *it) {
231#ifdef MOXI_ITEM_MALLOC
232    cb_assert(it->refcount > 0);
233    it->refcount--;
234    if (it->refcount == 0) {
235        free(it);
236    }
237#else
238    size_t ntotal = ITEM_ntotal(it);
239    unsigned int clsid;
240    cb_assert((it->it_flags & ITEM_LINKED) == 0);
241    cb_assert(it != heads[it->slabs_clsid]);
242    cb_assert(it != tails[it->slabs_clsid]);
243    cb_assert(it->refcount == 0);
244
245    /* so slab size changer can tell later if item is already free or not */
246    clsid = it->slabs_clsid;
247    it->slabs_clsid = 0;
248    it->it_flags |= ITEM_SLABBED;
249    DEBUG_REFCNT(it, 'F');
250    slabs_free(it, ntotal, clsid);
251#endif
252}
253
254/**
255 * Returns true if an item will fit in the cache (its size does not exceed
256 * the maximum for a cache entry.)
257 */
258bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
259    char prefix[40];
260    uint8_t nsuffix;
261
262    return slabs_clsid(item_make_header(nkey + 1, flags, nbytes,
263                                        prefix, &nsuffix)) != 0;
264}
265
266static void item_link_q(item *it) { /* item is the new head */
267    (void)it;
268#ifndef MOXI_ITEM_MALLOC
269    item **head, **tail;
270    /* always true, warns: cb_assert(it->slabs_clsid <= LARGEST_ID); */
271    cb_assert((it->it_flags & ITEM_SLABBED) == 0);
272
273    head = &heads[it->slabs_clsid];
274    tail = &tails[it->slabs_clsid];
275    cb_assert(it != *head);
276    cb_assert((*head && *tail) || (*head == 0 && *tail == 0));
277    it->prev = 0;
278    it->next = *head;
279    if (it->next) it->next->prev = it;
280    *head = it;
281    if (*tail == 0) *tail = it;
282    sizes[it->slabs_clsid]++;
283    return;
284#endif
285}
286
287static void item_unlink_q(item *it) {
288    (void)it;
289#ifndef MOXI_ITEM_MALLOC
290    item **head, **tail;
291    /* always true, warns: cb_assert(it->slabs_clsid <= LARGEST_ID); */
292    head = &heads[it->slabs_clsid];
293    tail = &tails[it->slabs_clsid];
294
295    if (*head == it) {
296        cb_assert(it->prev == 0);
297        *head = it->next;
298    }
299    if (*tail == it) {
300        cb_assert(it->next == 0);
301        *tail = it->prev;
302    }
303    cb_assert(it->next != it);
304    cb_assert(it->prev != it);
305
306    if (it->next) it->next->prev = it->prev;
307    if (it->prev) it->prev->next = it->next;
308    sizes[it->slabs_clsid]--;
309    return;
310#endif
311}
312
313int do_item_link(item *it) {
314    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
315    cb_assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
316    cb_assert(it->nbytes < (1024 * 1024));  /* 1MB max size */
317    it->it_flags |= ITEM_LINKED;
318    it->time = current_time;
319    assoc_insert(it);
320
321#ifdef MOXI_ITEM_MALLOC
322    it->refcount++;
323#endif
324
325    STATS_LOCK();
326    stats.curr_bytes += ITEM_ntotal(it);
327    stats.curr_items += 1;
328    stats.total_items += 1;
329    STATS_UNLOCK();
330
331    /* Allocate a new CAS ID on link. */
332    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
333
334    item_link_q(it);
335
336    return 1;
337}
338
339void do_item_unlink(item *it) {
340    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
341    if ((it->it_flags & ITEM_LINKED) != 0) {
342        it->it_flags &= ~ITEM_LINKED;
343        STATS_LOCK();
344        stats.curr_bytes -= ITEM_ntotal(it);
345        stats.curr_items -= 1;
346        STATS_UNLOCK();
347        assoc_delete(ITEM_key(it), it->nkey);
348        item_unlink_q(it);
349
350#ifndef MOXI_ITEM_MALLOC
351        if (it->refcount == 0) item_free(it);
352#endif
353    }
354}
355
356void do_item_remove(item *it) {
357#ifdef MOXI_ITEM_MALLOC
358    item_free(it);
359#else
360    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
361    cb_assert((it->it_flags & ITEM_SLABBED) == 0);
362    if (it->refcount != 0) {
363        it->refcount--;
364        DEBUG_REFCNT(it, '-');
365    }
366    if (it->refcount == 0 && (it->it_flags & ITEM_LINKED) == 0) {
367        item_free(it);
368    }
369#endif
370}
371
372void do_item_update(item *it) {
373    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
374    if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
375        cb_assert((it->it_flags & ITEM_SLABBED) == 0);
376
377        if ((it->it_flags & ITEM_LINKED) != 0) {
378            item_unlink_q(it);
379            it->time = current_time;
380            item_link_q(it);
381        }
382    }
383}
384
385int do_item_replace(item *it, item *new_it) {
386    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
387                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
388    cb_assert((it->it_flags & ITEM_SLABBED) == 0);
389
390    do_item_unlink(it);
391    return do_item_link(new_it);
392}
393
394/*@null@*/
395char *do_item_cachedump(const unsigned int clsid, const unsigned int limit, unsigned int *bytes) {
396    unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
397    char *buffer;
398    unsigned int bufcurr;
399    item *it;
400    unsigned int len;
401    unsigned int shown = 0;
402    char key_temp[KEY_MAX_LENGTH + 1];
403    char temp[512];
404
405    if (clsid > LARGEST_ID) return NULL;
406    it = heads[clsid];
407
408    buffer = malloc((size_t)memlimit);
409    if (buffer == 0) return NULL;
410    bufcurr = 0;
411
412    while (it != NULL && (limit == 0 || shown < limit)) {
413        cb_assert(it->nkey <= KEY_MAX_LENGTH);
414        /* Copy the key since it may not be null-terminated in the struct */
415        strncpy(key_temp, ITEM_key(it), it->nkey);
416        key_temp[it->nkey] = 0x00; /* terminate */
417        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %lu s]\r\n",
418                       key_temp, it->nbytes - 2,
419                       (unsigned long)it->exptime + process_started);
420        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
421            break;
422        memcpy(buffer + bufcurr, temp, len);
423        bufcurr += len;
424        shown++;
425        it = it->next;
426    }
427
428    memcpy(buffer + bufcurr, "END\r\n", 6);
429    bufcurr += 5;
430
431    *bytes = bufcurr;
432    return buffer;
433}
434
435void do_item_stats(ADD_STAT add_stats, void *c) {
436    int i;
437    for (i = 0; i < LARGEST_ID; i++) {
438        if (tails[i] != NULL) {
439            const char *fmt = "items:%d:%s";
440            char key_str[STAT_KEY_LEN];
441            char val_str[STAT_VAL_LEN];
442            int klen = 0, vlen = 0;
443
444            APPEND_NUM_FMT_STAT(fmt, i, "number", "%u", sizes[i]);
445            APPEND_NUM_FMT_STAT(fmt, i, "age", "%u", tails[i]->time);
446            APPEND_NUM_FMT_STAT(fmt, i, "evicted",
447                                "%u", itemstats[i].evicted);
448            APPEND_NUM_FMT_STAT(fmt, i, "evicted_time",
449                                "%u", itemstats[i].evicted_time);
450            APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
451                                "%u", itemstats[i].outofmemory);
452            APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
453                                "%u", itemstats[i].tailrepairs);;
454        }
455    }
456
457    /* getting here means both ascii and binary terminators fit */
458    add_stats(NULL, 0, NULL, 0, c);
459}
460
461/** dumps out a list of objects of each size, with granularity of 32 bytes */
462/*@null@*/
463void do_item_stats_sizes(ADD_STAT add_stats, void *c) {
464
465    /* max 1MB object, divided into 32 bytes size buckets */
466    const int num_buckets = 32768;
467    unsigned int *histogram = calloc(num_buckets, sizeof(int));
468
469    if (histogram != NULL) {
470        int i;
471
472        /* build the histogram */
473        for (i = 0; i < LARGEST_ID; i++) {
474            item *iter = heads[i];
475            while (iter) {
476                int ntotal = ITEM_ntotal(iter);
477                int bucket = ntotal / 32;
478                if ((ntotal % 32) != 0) bucket++;
479                if (bucket < num_buckets) histogram[bucket]++;
480                iter = iter->next;
481            }
482        }
483
484        /* write the buffer */
485        for (i = 0; i < num_buckets; i++) {
486            if (histogram[i] != 0) {
487                char key[8];
488                int klen = 0;
489                klen = snprintf(key, sizeof(key), "%d", i * 32);
490                cb_assert(klen < (int) sizeof(key));
491                APPEND_STAT(key, "%u", histogram[i]);
492            }
493        }
494        free(histogram);
495    }
496    add_stats(NULL, 0, NULL, 0, c);
497}
498
499/** wrapper around assoc_find which does the lazy expiration logic */
500item *do_item_get(const char *key, const size_t nkey) {
501    item *it = assoc_find(key, nkey);
502    int was_found = 0;
503
504    if (settings.verbose > 2) {
505        if (it == NULL) {
506            moxi_log_write("> NOT FOUND %s", key);
507        } else {
508            moxi_log_write("> FOUND KEY %s", ITEM_key(it));
509            was_found++;
510        }
511    }
512
513    if (it != NULL && settings.oldest_live != 0 && settings.oldest_live <= current_time &&
514        it->time <= settings.oldest_live) {
515        do_item_unlink(it);           /* MTSAFE - cache_lock held */
516        it = NULL;
517    }
518
519    if (it == NULL && was_found) {
520        moxi_log_write(" -nuked by flush");
521        was_found--;
522    }
523
524    if (it != NULL && it->exptime != 0 && it->exptime <= current_time) {
525        do_item_unlink(it);           /* MTSAFE - cache_lock held */
526        it = NULL;
527    }
528
529    if (it == NULL && was_found) {
530        moxi_log_write(" -nuked by expire");
531        was_found--;
532    }
533
534    if (it != NULL) {
535        it->refcount++;
536        DEBUG_REFCNT(it, '+');
537    }
538
539    if (settings.verbose > 2)
540        moxi_log_write("\n");
541
542    return it;
543}
544
545/** returns an item whether or not it's expired. */
546item *do_item_get_nocheck(const char *key, const size_t nkey) {
547    item *it = assoc_find(key, nkey);
548    if (it) {
549        it->refcount++;
550        DEBUG_REFCNT(it, '+');
551    }
552    return it;
553}
554
555/* expires items that are more recent than the oldest_live setting. */
556void do_item_flush_expired(void) {
557    int i;
558    item *iter, *next;
559    if (settings.oldest_live == 0)
560        return;
561    for (i = 0; i < LARGEST_ID; i++) {
562        /* The LRU is sorted in decreasing time order, and an item's timestamp
563         * is never newer than its last access time, so we only need to walk
564         * back until we hit an item older than the oldest_live time.
565         * The oldest_live checking will auto-expire the remaining items.
566         */
567        for (iter = heads[i]; iter != NULL; iter = next) {
568            if (iter->time >= settings.oldest_live) {
569                next = iter->next;
570                if ((iter->it_flags & ITEM_SLABBED) == 0) {
571                    do_item_unlink(iter);
572                }
573            } else {
574                /* We've hit the first old item. Continue to the next queue. */
575                break;
576            }
577        }
578    }
579}
580