1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 #include "memcached.h"
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <stdio.h>
6 #include <string.h>
7 #include <time.h>
8 #include <assert.h>
9 #include "log.h"
10 
11 /* Forward Declarations */
12 static void item_link_q(item *it);
13 static void item_unlink_q(item *it);
14 
15 /*
16  * We only reposition items in the LRU queue if they haven't been repositioned
17  * in this many seconds. That saves us from churning on frequently-accessed
18  * items.
19  */
20 #define ITEM_UPDATE_INTERVAL 60
21 
22 #define LARGEST_ID 255
23 typedef struct {
24     unsigned int evicted;
25     rel_time_t evicted_time;
26     unsigned int outofmemory;
27     unsigned int tailrepairs;
28 } itemstats_t;
29 
30 static item *heads[LARGEST_ID];
31 static item *tails[LARGEST_ID];
32 static itemstats_t itemstats[LARGEST_ID];
33 static unsigned int sizes[LARGEST_ID];
34 
item_init(void)35 void item_init(void) {
36     int i;
37     memset(itemstats, 0, sizeof(itemstats_t) * LARGEST_ID);
38     for(i = 0; i < LARGEST_ID; i++) {
39         heads[i] = NULL;
40         tails[i] = NULL;
41         sizes[i] = 0;
42     }
43 }
44 
item_stats_reset(void)45 void item_stats_reset(void) {
46     cb_mutex_enter(&cache_lock);
47     memset(itemstats, 0, sizeof(itemstats_t) * LARGEST_ID);
48     cb_mutex_exit(&cache_lock);
49 }
50 
51 
52 /* Get the next CAS id for a new item. */
get_cas_id(void)53 uint64_t get_cas_id(void) {
54     static uint64_t cas_id = 0;
55     return ++cas_id;
56 }
57 
58 /* Enable this for reference-count debugging. */
59 #if 0
60 # define DEBUG_REFCNT(it,op) \
61                 moxi_log_write("item %x refcnt(%c) %d %c%c%c\n", \
62                         it, op, it->refcount, \
63                         (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
64                         (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
65 #else
66 # define DEBUG_REFCNT(it,op) do {} while(0)
67 #endif
68 
69 /**
70  * Generates the variable-sized part of the header for an object.
71  *
72  * key     - The key
73  * nkey    - The length of the key
74  * flags   - key flags
75  * nbytes  - Number of bytes to hold value and addition CRLF terminator
76  * suffix  - Buffer for the "VALUE" line suffix (flags, size).
77  * nsuffix - The length of the suffix is stored here.
78  *
79  * Returns the total size of the header.
80  */
item_make_header(const uint8_t nkey, const int flags, const int nbytes, char *suffix, uint8_t *nsuffix)81 static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
82                      char *suffix, uint8_t *nsuffix) {
83     /* suffix is defined at 40 chars elsewhere.. */
84     *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
85     return sizeof(item) + nkey + *nsuffix + nbytes;
86 }
87 
88 /*@null@*/
do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes)89 item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
90     uint8_t nsuffix;
91     char suffix[40];
92     size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
93     if (settings.use_cas) {
94         ntotal += sizeof(uint64_t);
95     }
96 
97 #ifdef MOXI_ITEM_MALLOC
98     item *itx = malloc(ntotal);
99     if (itx != NULL) {
100         itx->refcount = 1;
101         itx->slabs_clsid = 0;
102         itx->next = 0;
103         itx->prev = 0;
104         itx->h_next = 0;
105         itx->it_flags = settings.use_cas ? ITEM_CAS : 0;
106         itx->nkey = nkey;
107         itx->nbytes = nbytes;
108         memcpy(ITEM_key(itx), key, nkey);
109         itx->exptime = exptime;
110         memcpy(ITEM_suffix(itx), suffix, (size_t)nsuffix);
111         itx->nsuffix = nsuffix;
112     }
113 
114     return itx;
115 #else
116     unsigned int id = slabs_clsid(ntotal);
117     if (id == 0)
118         return 0;
119 
120     /* do a quick check if we have any expired items in the tail.. */
121     int tries = 50;
122     item *it = NULL;
123     item *search;
124 
125     for (search = tails[id];
126          tries > 0 && search != NULL;
127          tries--, search=search->prev) {
128         if (search->refcount == 0 &&
129             (search->exptime != 0 && search->exptime < current_time)) {
130             it = search;
131             /* I don't want to actually free the object, just steal
132              * the item to avoid to grab the slab mutex twice ;-)
133              */
134             it->refcount = 1;
135             do_item_unlink(it);
136             /* Initialize the item block: */
137             it->slabs_clsid = 0;
138             it->refcount = 0;
139             break;
140         }
141     }
142 
143     if (it == NULL && (it = slabs_alloc(ntotal, id)) == NULL) {
144         /*
145         ** Could not find an expired item at the tail, and memory allocation
146         ** failed. Try to evict some items!
147         */
148         tries = 50;
149 
150         /* If requested to not push old items out of cache when memory runs out,
151          * we're out of luck at this point...
152          */
153 
154         if (settings.evict_to_free == 0) {
155             itemstats[id].outofmemory++;
156             return NULL;
157         }
158 
159         /*
160          * try to get one off the right LRU
161          * don't necessariuly unlink the tail because it may be locked: refcount>0
162          * search up from tail an item with refcount==0 and unlink it; give up after 50
163          * tries
164          */
165 
166         if (tails[id] == 0) {
167             itemstats[id].outofmemory++;
168             return NULL;
169         }
170 
171         for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) {
172             if (search->refcount == 0) {
173                 if (search->exptime == 0 || search->exptime > current_time) {
174                     itemstats[id].evicted++;
175                     itemstats[id].evicted_time = current_time - search->time;
176                     STATS_LOCK();
177                     stats.evictions++;
178                     STATS_UNLOCK();
179                 }
180                 do_item_unlink(search);
181                 break;
182             }
183         }
184         it = slabs_alloc(ntotal, id);
185         if (it == 0) {
186             itemstats[id].outofmemory++;
187             /* Last ditch effort. There is a very rare bug which causes
188              * refcount leaks. We've fixed most of them, but it still happens,
189              * and it may happen in the future.
190              * We can reasonably assume no item can stay locked for more than
191              * three hours, so if we find one in the tail which is that old,
192              * free it anyway.
193              */
194             tries = 50;
195             for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) {
196                 if (search->refcount != 0 && search->time + TAIL_REPAIR_TIME < current_time) {
197                     itemstats[id].tailrepairs++;
198                     search->refcount = 0;
199                     do_item_unlink(search);
200                     break;
201                 }
202             }
203             it = slabs_alloc(ntotal, id);
204             if (it == 0) {
205                 return NULL;
206             }
207         }
208     }
209 
210     assert(it->slabs_clsid == 0);
211 
212     it->slabs_clsid = id;
213 
214     assert(it != heads[it->slabs_clsid]);
215 
216     it->next = it->prev = it->h_next = 0;
217     it->refcount = 1;     /* the caller will have a reference */
218     DEBUG_REFCNT(it, '*');
219     it->it_flags = settings.use_cas ? ITEM_CAS : 0;
220     it->nkey = nkey;
221     it->nbytes = nbytes;
222     memcpy(ITEM_key(it), key, nkey);
223     it->exptime = exptime;
224     memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
225     it->nsuffix = nsuffix;
226     return it;
227 #endif
228 }
229 
item_free(item *it)230 void item_free(item *it) {
231 #ifdef MOXI_ITEM_MALLOC
232     assert(it->refcount > 0);
233     it->refcount--;
234     if (it->refcount == 0) {
235         free(it);
236     }
237 #else
238     size_t ntotal = ITEM_ntotal(it);
239     unsigned int clsid;
240     assert((it->it_flags & ITEM_LINKED) == 0);
241     assert(it != heads[it->slabs_clsid]);
242     assert(it != tails[it->slabs_clsid]);
243     assert(it->refcount == 0);
244 
245     /* so slab size changer can tell later if item is already free or not */
246     clsid = it->slabs_clsid;
247     it->slabs_clsid = 0;
248     it->it_flags |= ITEM_SLABBED;
249     DEBUG_REFCNT(it, 'F');
250     slabs_free(it, ntotal, clsid);
251 #endif
252 }
253 
254 /**
255  * Returns true if an item will fit in the cache (its size does not exceed
256  * the maximum for a cache entry.)
257  */
item_size_ok(const size_t nkey, const int flags, const int nbytes)258 bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
259     char prefix[40];
260     uint8_t nsuffix;
261 
262     return slabs_clsid(item_make_header(nkey + 1, flags, nbytes,
263                                         prefix, &nsuffix)) != 0;
264 }
265 
item_link_q(item *it)266 static void item_link_q(item *it) { /* item is the new head */
267     (void)it;
268 #ifndef MOXI_ITEM_MALLOC
269     item **head, **tail;
270     /* always true, warns: assert(it->slabs_clsid <= LARGEST_ID); */
271     assert((it->it_flags & ITEM_SLABBED) == 0);
272 
273     head = &heads[it->slabs_clsid];
274     tail = &tails[it->slabs_clsid];
275     assert(it != *head);
276     assert((*head && *tail) || (*head == 0 && *tail == 0));
277     it->prev = 0;
278     it->next = *head;
279     if (it->next) it->next->prev = it;
280     *head = it;
281     if (*tail == 0) *tail = it;
282     sizes[it->slabs_clsid]++;
283     return;
284 #endif
285 }
286 
item_unlink_q(item *it)287 static void item_unlink_q(item *it) {
288     (void)it;
289 #ifndef MOXI_ITEM_MALLOC
290     item **head, **tail;
291     /* always true, warns: assert(it->slabs_clsid <= LARGEST_ID); */
292     head = &heads[it->slabs_clsid];
293     tail = &tails[it->slabs_clsid];
294 
295     if (*head == it) {
296         assert(it->prev == 0);
297         *head = it->next;
298     }
299     if (*tail == it) {
300         assert(it->next == 0);
301         *tail = it->prev;
302     }
303     assert(it->next != it);
304     assert(it->prev != it);
305 
306     if (it->next) it->next->prev = it->prev;
307     if (it->prev) it->prev->next = it->next;
308     sizes[it->slabs_clsid]--;
309     return;
310 #endif
311 }
312 
do_item_link(item *it)313 int do_item_link(item *it) {
314     MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
315     assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
316     assert(it->nbytes < (1024 * 1024));  /* 1MB max size */
317     it->it_flags |= ITEM_LINKED;
318     it->time = current_time;
319     assoc_insert(it);
320 
321 #ifdef MOXI_ITEM_MALLOC
322     it->refcount++;
323 #endif
324 
325     STATS_LOCK();
326     stats.curr_bytes += ITEM_ntotal(it);
327     stats.curr_items += 1;
328     stats.total_items += 1;
329     STATS_UNLOCK();
330 
331     /* Allocate a new CAS ID on link. */
332     ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
333 
334     item_link_q(it);
335 
336     return 1;
337 }
338 
do_item_unlink(item *it)339 void do_item_unlink(item *it) {
340     MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
341     if ((it->it_flags & ITEM_LINKED) != 0) {
342         it->it_flags &= ~ITEM_LINKED;
343         STATS_LOCK();
344         stats.curr_bytes -= ITEM_ntotal(it);
345         stats.curr_items -= 1;
346         STATS_UNLOCK();
347         assoc_delete(ITEM_key(it), it->nkey);
348         item_unlink_q(it);
349 
350 #ifndef MOXI_ITEM_MALLOC
351         if (it->refcount == 0) item_free(it);
352 #endif
353     }
354 }
355 
do_item_remove(item *it)356 void do_item_remove(item *it) {
357 #ifdef MOXI_ITEM_MALLOC
358     item_free(it);
359 #else
360     MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
361     assert((it->it_flags & ITEM_SLABBED) == 0);
362     if (it->refcount != 0) {
363         it->refcount--;
364         DEBUG_REFCNT(it, '-');
365     }
366     if (it->refcount == 0 && (it->it_flags & ITEM_LINKED) == 0) {
367         item_free(it);
368     }
369 #endif
370 }
371 
do_item_update(item *it)372 void do_item_update(item *it) {
373     MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
374     if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
375         assert((it->it_flags & ITEM_SLABBED) == 0);
376 
377         if ((it->it_flags & ITEM_LINKED) != 0) {
378             item_unlink_q(it);
379             it->time = current_time;
380             item_link_q(it);
381         }
382     }
383 }
384 
do_item_replace(item *it, item *new_it)385 int do_item_replace(item *it, item *new_it) {
386     MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
387                            ITEM_key(new_it), new_it->nkey, new_it->nbytes);
388     assert((it->it_flags & ITEM_SLABBED) == 0);
389 
390     do_item_unlink(it);
391     return do_item_link(new_it);
392 }
393 
394 /*@null@*/
do_item_cachedump(const unsigned int clsid, const unsigned int limit, unsigned int *bytes)395 char *do_item_cachedump(const unsigned int clsid, const unsigned int limit, unsigned int *bytes) {
396     unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
397     char *buffer;
398     unsigned int bufcurr;
399     item *it;
400     unsigned int len;
401     unsigned int shown = 0;
402     char key_temp[KEY_MAX_LENGTH + 1];
403     char temp[512];
404 
405     if (clsid > LARGEST_ID) return NULL;
406     it = heads[clsid];
407 
408     buffer = malloc((size_t)memlimit);
409     if (buffer == 0) return NULL;
410     bufcurr = 0;
411 
412     while (it != NULL && (limit == 0 || shown < limit)) {
413         assert(it->nkey <= KEY_MAX_LENGTH);
414         /* Copy the key since it may not be null-terminated in the struct */
415         strncpy(key_temp, ITEM_key(it), it->nkey);
416         key_temp[it->nkey] = 0x00; /* terminate */
417         len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %lu s]\r\n",
418                        key_temp, it->nbytes - 2,
419                        (unsigned long)it->exptime + process_started);
420         if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
421             break;
422         memcpy(buffer + bufcurr, temp, len);
423         bufcurr += len;
424         shown++;
425         it = it->next;
426     }
427 
428     memcpy(buffer + bufcurr, "END\r\n", 6);
429     bufcurr += 5;
430 
431     *bytes = bufcurr;
432     return buffer;
433 }
434 
do_item_stats(ADD_STAT add_stats, void *c)435 void do_item_stats(ADD_STAT add_stats, void *c) {
436     int i;
437     for (i = 0; i < LARGEST_ID; i++) {
438         if (tails[i] != NULL) {
439             const char *fmt = "items:%d:%s";
440             char key_str[STAT_KEY_LEN];
441             char val_str[STAT_VAL_LEN];
442             int klen = 0, vlen = 0;
443 
444             APPEND_NUM_FMT_STAT(fmt, i, "number", "%u", sizes[i]);
445             APPEND_NUM_FMT_STAT(fmt, i, "age", "%u", tails[i]->time);
446             APPEND_NUM_FMT_STAT(fmt, i, "evicted",
447                                 "%u", itemstats[i].evicted);
448             APPEND_NUM_FMT_STAT(fmt, i, "evicted_time",
449                                 "%u", itemstats[i].evicted_time);
450             APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
451                                 "%u", itemstats[i].outofmemory);
452             APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
453                                 "%u", itemstats[i].tailrepairs);;
454         }
455     }
456 
457     /* getting here means both ascii and binary terminators fit */
458     add_stats(NULL, 0, NULL, 0, c);
459 }
460 
461 /** dumps out a list of objects of each size, with granularity of 32 bytes */
462 /*@null@*/
do_item_stats_sizes(ADD_STAT add_stats, void *c)463 void do_item_stats_sizes(ADD_STAT add_stats, void *c) {
464 
465     /* max 1MB object, divided into 32 bytes size buckets */
466     const int num_buckets = 32768;
467     unsigned int *histogram = calloc(num_buckets, sizeof(int));
468 
469     if (histogram != NULL) {
470         int i;
471 
472         /* build the histogram */
473         for (i = 0; i < LARGEST_ID; i++) {
474             item *iter = heads[i];
475             while (iter) {
476                 int ntotal = ITEM_ntotal(iter);
477                 int bucket = ntotal / 32;
478                 if ((ntotal % 32) != 0) bucket++;
479                 if (bucket < num_buckets) histogram[bucket]++;
480                 iter = iter->next;
481             }
482         }
483 
484         /* write the buffer */
485         for (i = 0; i < num_buckets; i++) {
486             if (histogram[i] != 0) {
487                 char key[8];
488                 int klen = 0;
489                 klen = snprintf(key, sizeof(key), "%d", i * 32);
490                 assert(klen < (int) sizeof(key));
491                 APPEND_STAT(key, "%u", histogram[i]);
492             }
493         }
494         free(histogram);
495     }
496     add_stats(NULL, 0, NULL, 0, c);
497 }
498 
499 /** wrapper around assoc_find which does the lazy expiration logic */
do_item_get(const char *key, const size_t nkey)500 item *do_item_get(const char *key, const size_t nkey) {
501     item *it = assoc_find(key, nkey);
502     int was_found = 0;
503 
504     if (settings.verbose > 2) {
505         if (it == NULL) {
506             moxi_log_write("> NOT FOUND %s", key);
507         } else {
508             moxi_log_write("> FOUND KEY %s", ITEM_key(it));
509             was_found++;
510         }
511     }
512 
513     if (it != NULL && settings.oldest_live != 0 && settings.oldest_live <= current_time &&
514         it->time <= settings.oldest_live) {
515         do_item_unlink(it);           /* MTSAFE - cache_lock held */
516         it = NULL;
517     }
518 
519     if (it == NULL && was_found) {
520         moxi_log_write(" -nuked by flush");
521         was_found--;
522     }
523 
524     if (it != NULL && it->exptime != 0 && it->exptime <= current_time) {
525         do_item_unlink(it);           /* MTSAFE - cache_lock held */
526         it = NULL;
527     }
528 
529     if (it == NULL && was_found) {
530         moxi_log_write(" -nuked by expire");
531         was_found--;
532     }
533 
534     if (it != NULL) {
535         it->refcount++;
536         DEBUG_REFCNT(it, '+');
537     }
538 
539     if (settings.verbose > 2)
540         moxi_log_write("\n");
541 
542     return it;
543 }
544 
545 /** returns an item whether or not it's expired. */
do_item_get_nocheck(const char *key, const size_t nkey)546 item *do_item_get_nocheck(const char *key, const size_t nkey) {
547     item *it = assoc_find(key, nkey);
548     if (it) {
549         it->refcount++;
550         DEBUG_REFCNT(it, '+');
551     }
552     return it;
553 }
554 
555 /* expires items that are more recent than the oldest_live setting. */
do_item_flush_expired(void)556 void do_item_flush_expired(void) {
557     int i;
558     item *iter, *next;
559     if (settings.oldest_live == 0)
560         return;
561     for (i = 0; i < LARGEST_ID; i++) {
562         /* The LRU is sorted in decreasing time order, and an item's timestamp
563          * is never newer than its last access time, so we only need to walk
564          * back until we hit an item older than the oldest_live time.
565          * The oldest_live checking will auto-expire the remaining items.
566          */
567         for (iter = heads[i]; iter != NULL; iter = next) {
568             if (iter->time >= settings.oldest_live) {
569                 next = iter->next;
570                 if ((iter->it_flags & ITEM_SLABBED) == 0) {
571                     do_item_unlink(iter);
572                 }
573             } else {
574                 /* We've hit the first old item. Continue to the next queue. */
575                 break;
576             }
577         }
578     }
579 }
580