1 /* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3  *     Copyright 2010 Couchbase, Inc
4  *
5  *   Licensed under the Apache License, Version 2.0 (the "License");
6  *   you may not use this file except in compliance with the License.
7  *   You may obtain a copy of the License at
8  *
9  *       http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *   Unless required by applicable law or agreed to in writing, software
12  *   distributed under the License is distributed on an "AS IS" BASIS,
13  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *   See the License for the specific language governing permissions and
15  *   limitations under the License.
16  */
17 
18 #ifndef SRC_STORED_VALUE_H_
19 #define SRC_STORED_VALUE_H_ 1
20 
21 #include "config.h"
22 
23 #include <algorithm>
24 #include <climits>
25 #include <cstring>
26 #include <string>
27 
28 #include "common.h"
29 #include "ep_time.h"
30 #include "histo.h"
31 #include "item.h"
32 #include "item_pager.h"
33 #include "locks.h"
34 #include "stats.h"
35 
36 // Forward declaration for StoredValue
37 class HashTable;
38 class StoredValueFactory;
39 
40 /**
41  * In-memory storage for an item.
42  */
43 class StoredValue {
44 public:
45 
operator delete(void* p)46     void operator delete(void* p) {
47         ::operator delete(p);
48      }
49 
50     uint8_t getNRUValue();
51 
52     void setNRUValue(uint8_t nru_val);
53 
54     uint8_t incrNRUValue();
55 
56     void referenced();
57 
58     /**
59      * Mark this item as needing to be persisted.
60      */
markDirty()61     void markDirty() {
62         _isDirty = 1;
63     }
64 
65     /**
66      * Mark this item as dirty as of a certain time.
67      *
68      * This method is primarily used to mark an item as dirty again
69      * after a storage failure.
70      *
71      * @param dataAge the previous dataAge of this record
72      */
reDirty()73     void reDirty() {
74         _isDirty = 1;
75     }
76 
77     // returns time this object was dirtied.
78     /**
79      * Mark this item as clean.
80      *
81      * @param dataAge an output parameter that captures the time this
82      *                item was marked dirty
83      */
markClean()84     void markClean() {
85         _isDirty = 0;
86     }
87 
88     /**
89      * True if this object is dirty.
90      */
isDirty() const91     bool isDirty() const {
92         return _isDirty;
93     }
94 
95     /**
96      * True if this object is not dirty.
97      */
isClean() const98     bool isClean() const {
99         return !isDirty();
100     }
101 
eligibleForEviction(item_eviction_policy_t policy)102     bool eligibleForEviction(item_eviction_policy_t policy) {
103         if (policy == VALUE_ONLY) {
104             return isResident() && isClean() && !isDeleted();
105         } else {
106             return isClean() && !isDeleted();
107         }
108     }
109 
110     /**
111      * Check if this item is expired or not.
112      *
113      * @param asOf the time to be compared with this item's expiry time
114      * @return true if this item's expiry time < asOf
115      */
isExpired(time_t asOf) const116     bool isExpired(time_t asOf) const {
117         if (getExptime() != 0 && getExptime() < asOf) {
118             return true;
119         }
120         return false;
121     }
122 
123     /**
124      * Get the pointer to the beginning of the key.
125      */
getKeyBytes() const126     const char* getKeyBytes() const {
127         return keybytes;
128     }
129 
130     /**
131      * Get the length of the key.
132      */
getKeyLen() const133     uint8_t getKeyLen() const {
134         return keylen;
135     }
136 
137     /**
138      * True of this item is for the given key.
139      *
140      * @param k the key we're checking
141      * @return true if this item's key is equal to k
142      */
hasKey(const std::string &k) const143     bool hasKey(const std::string &k) const {
144         return k.length() == getKeyLen()
145             && (std::memcmp(k.data(), getKeyBytes(), getKeyLen()) == 0);
146     }
147 
148     /**
149      * Get this item's key.
150      */
getKey() const151     const std::string getKey() const {
152         return std::string(getKeyBytes(), getKeyLen());
153     }
154 
155     /**
156      * Get this item's value.
157      */
getValue() const158     const value_t &getValue() const {
159         return value;
160     }
161 
162     /**
163      * Get the expiration time of this item.
164      *
165      * @return the expiration time for feature items, 0 for small items
166      */
getExptime() const167     time_t getExptime() const {
168         return exptime;
169     }
170 
setExptime(time_t tim)171     void setExptime(time_t tim) {
172         exptime = tim;
173         markDirty();
174     }
175 
176     /**
177      * Get the client-defined flags of this item.
178      *
179      * @return the flags for feature items, 0 for small items
180      */
getFlags() const181     uint32_t getFlags() const {
182         return flags;
183     }
184 
185     /**
186      * Set the client-defined flags for this item.
187      */
setFlags(uint32_t fl)188     void setFlags(uint32_t fl) {
189         flags = fl;
190     }
191 
192     /**
193      * Set a new value for this item.
194      *
195      * @param itm the item with a new value
196      * @param ht the hashtable that contains this StoredValue instance
197      * @param preserveSeqno Preserve the revision sequence number from the item.
198      */
setValue(Item &itm, HashTable &ht, bool preserveSeqno)199     void setValue(Item &itm, HashTable &ht, bool preserveSeqno) {
200         size_t currSize = size();
201         reduceCacheSize(ht, currSize);
202         value = itm.getValue();
203         deleted = false;
204         flags = itm.getFlags();
205         bySeqno = itm.getBySeqno();
206 
207         cas = itm.getCas();
208         exptime = itm.getExptime();
209         if (preserveSeqno) {
210             revSeqno = itm.getRevSeqno();
211         } else {
212             ++revSeqno;
213             itm.setRevSeqno(revSeqno);
214         }
215 
216         markDirty();
217         size_t newSize = size();
218         increaseCacheSize(ht, newSize);
219     }
220 
221     /**
222      * Reset the value of this item.
223      */
resetValue()224     void resetValue() {
225         cb_assert(!isDeleted());
226         markNotResident();
227         // item no longer resident once reset the value
228         deleted = true;
229     }
230 
231     /**
232      * Eject an item value from memory.
233      * @param ht the hashtable that contains this StoredValue instance
234      */
235     bool ejectValue(HashTable &ht, item_eviction_policy_t policy);
236 
237     /**
238      * Restore the value for this item.
239      * @param itm the item to be restored
240      * @param ht the hashtable that contains this StoredValue instance
241      */
242     bool unlocked_restoreValue(Item *itm, HashTable &ht);
243 
244     /**
245      * Restore the metadata of of a temporary item upon completion of a
246      * background fetch assuming the hashtable bucket is locked.
247      *
248      * @param itm the Item whose metadata is being restored
249      * @param status the engine code describing the result of the background
250      *               fetch
251      */
252     bool unlocked_restoreMeta(Item *itm, ENGINE_ERROR_CODE status,
253                               HashTable &ht);
254 
255     /**
256      * Get this item's CAS identifier.
257      *
258      * @return the cas ID for feature items, 0 for small items
259      */
getCas() const260     uint64_t getCas() const {
261         return cas;
262     }
263 
264     /**
265      * Set a new CAS ID.
266      *
267      * This is a NOOP for small item types.
268      */
setCas(uint64_t c)269     void setCas(uint64_t c) {
270         cas = c;
271     }
272 
273     /**
274      * Lock this item until the given time.
275      *
276      * This is a NOOP for small item types.
277      */
lock(rel_time_t expiry)278     void lock(rel_time_t expiry) {
279         lock_expiry = expiry;
280     }
281 
282     /**
283      * Unlock this item.
284      */
unlock()285     void unlock() {
286         lock_expiry = 0;
287     }
288 
289     /**
290      * True if this item has an ID.
291      *
292      * An item always has an ID after it's been persisted.
293      */
hasBySeqno()294     bool hasBySeqno() {
295         return bySeqno > 0;
296     }
297 
298     /**
299      * Get this item's ID.
300      *
301      * @return the ID for the item; 0 if the item has no ID
302      */
getBySeqno()303     int64_t getBySeqno() {
304         return bySeqno;
305     }
306 
307     /**
308      * Set the ID for this item.
309      *
310      * This is used by the persistene layer.
311      *
312      * It is an error to set an ID on an item that already has one.
313      */
setBySeqno(int64_t to)314     void setBySeqno(int64_t to) {
315         bySeqno = to;
316         cb_assert(hasBySeqno());
317     }
318 
319     /**
320      * Set the stored value state to the specified value
321      */
setStoredValueState(const int64_t to)322     void setStoredValueState(const int64_t to) {
323         cb_assert(to == state_deleted_key || to == state_non_existent_key);
324         bySeqno = to;
325     }
326 
327     /**
328      * Is this a temporary item created for processing a get-meta request?
329      */
isTempItem()330      bool isTempItem() {
331          return(isTempNonExistentItem() || isTempDeletedItem() || isTempInitialItem());
332 
333      }
334 
335     /**
336      * Is this an initial temporary item?
337      */
isTempInitialItem()338     bool isTempInitialItem() {
339         return bySeqno == state_temp_init;
340     }
341 
342     /**
343      * Is this a temporary item created for a non-existent key?
344      */
isTempNonExistentItem()345      bool isTempNonExistentItem() {
346          return bySeqno == state_non_existent_key;
347 
348      }
349 
350     /**
351      * Is this a temporary item created for a deleted key?
352      */
isTempDeletedItem()353      bool isTempDeletedItem() {
354          return bySeqno == state_deleted_key;
355 
356      }
357 
valuelen()358     size_t valuelen() {
359         if (isDeleted() || !isResident()) {
360             return 0;
361         }
362         return value->length();
363     }
364 
365     /**
366      * Get the total size of this item.
367      *
368      * @return the amount of memory used by this item.
369      */
size()370     size_t size() {
371         return sizeof(StoredValue) + getKeyLen() + valuelen();
372     }
373 
metaDataSize()374     size_t metaDataSize() {
375         return sizeof(StoredValue) + getKeyLen();
376     }
377 
378     /**
379      * Return true if this item is locked as of the given timestamp.
380      *
381      * @param curtime lock expiration marker (usually the current time)
382      * @return true if the item is locked
383      */
isLocked(rel_time_t curtime)384     bool isLocked(rel_time_t curtime) {
385         if (lock_expiry == 0 || (curtime > lock_expiry)) {
386             lock_expiry = 0;
387             return false;
388         }
389         return true;
390     }
391 
392     /**
393      * True if this value is resident in memory currently.
394      */
isResident() const395     bool isResident() const {
396         return value.get() != NULL;
397     }
398 
markNotResident()399     void markNotResident() {
400         value.reset();
401     }
402 
403     /**
404      * True if this object is logically deleted.
405      */
isDeleted() const406     bool isDeleted() const {
407         return deleted;
408     }
409 
410     /**
411      * Logically delete this object.
412      */
del(HashTable &ht, bool isMetaDelete=false)413     void del(HashTable &ht, bool isMetaDelete=false) {
414         if (isDeleted()) {
415             return;
416         }
417 
418         reduceCacheSize(ht, valuelen());
419         resetValue();
420         markDirty();
421         if (!isMetaDelete) {
422             setCas(getCas() + 1);
423         }
424     }
425 
426 
getRevSeqno() const427     uint64_t getRevSeqno() const {
428         return revSeqno;
429     }
430 
431     /**
432      * Set a new revision sequence number.
433      */
setRevSeqno(uint64_t s)434     void setRevSeqno(uint64_t s) {
435         revSeqno = s;
436     }
437 
438     /**
439      * Return true if this is a new cache item.
440      */
isNewCacheItem(void)441     bool isNewCacheItem(void) {
442         return newCacheItem;
443     }
444 
445     /**
446      * Set / reset a new cache item flag.
447      */
setNewCacheItem(bool newitem)448     void setNewCacheItem(bool newitem) {
449         newCacheItem = newitem;
450     }
451 
452     /**
453      * Generate a new Item out of this object.
454      *
455      * @param lck if true, the new item will return a locked CAS ID.
456      * @param vbucket the vbucket containing this item.
457      */
458     Item *toItem(bool lck, uint16_t vbucket) const;
459 
460     /**
461      * Set the memory threshold on the current bucket quota for accepting a new mutation
462      */
463     static void setMutationMemoryThreshold(double memThreshold);
464 
465     /*
466      * Values of the bySeqno attribute used by temporarily created StoredValue
467      * objects.
468      * state_deleted_key: represents an item that's deleted from memory but
469      *                    present in the persistent store.
470      * state_non_existent_key: represents a non existent item
471      */
472     static const int64_t state_deleted_key;
473     static const int64_t state_non_existent_key;
474     static const int64_t state_temp_init;
475 
~StoredValue()476     ~StoredValue() {
477         ObjectRegistry::onDeleteStoredValue(this);
478     }
479 
getObjectSize() const480     size_t getObjectSize() const {
481         return sizeof(StoredValue) + keylen;
482     }
483 
484 private:
485 
StoredValue(const Item &itm, StoredValue *n, EPStats &stats, HashTable &ht, bool setDirty = true)486     StoredValue(const Item &itm, StoredValue *n, EPStats &stats, HashTable &ht,
487                 bool setDirty = true) :
488         value(itm.getValue()), next(n), bySeqno(itm.getBySeqno()),
489         flags(itm.getFlags()) {
490         cas = itm.getCas();
491         exptime = itm.getExptime();
492         deleted = false;
493         newCacheItem = true;
494         nru = INITIAL_NRU_VALUE;
495         lock_expiry = 0;
496         keylen = itm.getNKey();
497         revSeqno = itm.getRevSeqno();
498 
499         if (setDirty) {
500             markDirty();
501         } else {
502             markClean();
503         }
504 
505         increaseMetaDataSize(ht, stats, metaDataSize());
506         increaseCacheSize(ht, size());
507 
508         ObjectRegistry::onCreateStoredValue(this);
509     }
510 
511     friend class HashTable;
512     friend class StoredValueFactory;
513 
514     value_t            value;          // 8 bytes
515     StoredValue        *next;          // 8 bytes
516     uint64_t           cas;            //!< CAS identifier.
517     uint64_t           revSeqno;       //!< Revision id sequence number
518     int64_t            bySeqno;        //!< By sequence id number
519     rel_time_t         lock_expiry;    //!< getl lock expiration
520     uint32_t           exptime;        //!< Expiration time of this item.
521     uint32_t           flags;          // 4 bytes
522     bool               _isDirty  :  1; // 1 bit
523     bool               deleted   :  1;
524     bool               newCacheItem : 1;
525     uint8_t            nru       :  2; //!< True if referenced since last sweep
526     uint8_t            keylen;
527     char               keybytes[1];    //!< The key itself.
528 
529     static void increaseMetaDataSize(HashTable &ht, EPStats &st, size_t by);
530     static void reduceMetaDataSize(HashTable &ht, EPStats &st, size_t by);
531     static void increaseCacheSize(HashTable &ht, size_t by);
532     static void reduceCacheSize(HashTable &ht, size_t by);
533     static bool hasAvailableSpace(EPStats&, const Item &item,
534                                   bool isReplication=false);
535     static double mutation_mem_threshold;
536 
537     DISALLOW_COPY_AND_ASSIGN(StoredValue);
538 };
539 
540 /**
541  * Mutation types as returned by store commands.
542  */
543 typedef enum {
544     /**
545      * Storage was attempted on a vbucket not managed by this node.
546      */
547     INVALID_VBUCKET,
548     NOT_FOUND,                  //!< The item was not found for update
549     INVALID_CAS,                //!< The wrong CAS identifier was sent for a CAS update
550     WAS_CLEAN,                  //!< The item was clean before this mutation
551     WAS_DIRTY,                  //!< This item was already dirty before this mutation
552     IS_LOCKED,                  //!< The item is locked and can't be updated.
553     NOMEM,                      //!< Insufficient memory to store this item.
554     NEED_BG_FETCH               //!< Require a bg fetch to process SET op
555 } mutation_type_t;
556 
557 /**
558  * Result from add operation.
559  */
560 typedef enum {
561     ADD_SUCCESS,                //!< Add was successful.
562     ADD_NOMEM,                  //!< No memory for operation
563     ADD_EXISTS,                 //!< Did not update -- item exists with this key
564     ADD_UNDEL,                  //!< Undeletes an existing dirty item
565     ADD_TMP_AND_BG_FETCH,       //!< Create a tmp item and schedule a bg metadata fetch
566     ADD_BG_FETCH                //!< Schedule a bg metadata fetch to process ADD op
567 } add_type_t;
568 
569 /**
570  * Base class for visiting a hash table.
571  */
572 class HashTableVisitor {
573 public:
~HashTableVisitor()574     virtual ~HashTableVisitor() {}
575 
576     /**
577      * Visit an individual item within a hash table.
578      *
579      * @param v a pointer to a value in the hash table
580      */
581     virtual void visit(StoredValue *v) = 0;
582     /**
583      * True if the visiting should continue.
584      *
585      * This is called periodically to ensure the visitor still wants
586      * to visit items.
587      */
shouldContinue()588     virtual bool shouldContinue() { return true; }
589 };
590 
591 /**
592  * Hash table visitor that reports the depth of each hashtable bucket.
593  */
594 class HashTableDepthVisitor {
595 public:
~HashTableDepthVisitor()596     virtual ~HashTableDepthVisitor() {}
597 
598     /**
599      * Called once for each hashtable bucket with its depth.
600      *
601      * @param bucket the index of the hashtable bucket
602      * @param depth the number of entries in this hashtable bucket
603      * @param mem counted memory used by this hash table
604      */
605     virtual void visit(int bucket, int depth, size_t mem) = 0;
606 };
607 
608 /**
609  * Hash table visitor that finds the min and max bucket depths.
610  */
611 class HashTableDepthStatVisitor : public HashTableDepthVisitor {
612 public:
613 
HashTableDepthStatVisitor()614     HashTableDepthStatVisitor() : depthHisto(GrowingWidthGenerator<unsigned int>(1, 1, 1.3),
615                                              10),
616                                   size(0), memUsed(0), min(-1), max(0) {}
617 
visit(int bucket, int depth, size_t mem)618     void visit(int bucket, int depth, size_t mem) {
619         (void)bucket;
620         // -1 is a special case for min.  If there's a value other than
621         // -1, we prefer that.
622         min = std::min(min == -1 ? depth : min, depth);
623         max = std::max(max, depth);
624         depthHisto.add(depth);
625         size += depth;
626         memUsed += mem;
627     }
628 
629     Histogram<unsigned int> depthHisto;
630     size_t                  size;
631     size_t                  memUsed;
632     int                     min;
633     int                     max;
634 };
635 
636 /**
637  * Hash table visitor that collects stats of what's inside.
638  */
639 class HashTableStatVisitor : public HashTableVisitor {
640 public:
641 
HashTableStatVisitor()642     HashTableStatVisitor() : numNonResident(0), numTotal(0),
643                              memSize(0), valSize(0), cacheSize(0) {}
644 
visit(StoredValue *v)645     void visit(StoredValue *v) {
646         ++numTotal;
647         memSize += v->size();
648         valSize += v->valuelen();
649 
650         if (v->isResident()) {
651             cacheSize += v->size();
652         } else {
653             ++numNonResident;
654         }
655     }
656 
657     size_t numNonResident;
658     size_t numTotal;
659     size_t memSize;
660     size_t valSize;
661     size_t cacheSize;
662 };
663 
664 /**
665  * Track the current number of hashtable visitors.
666  *
667  * This class is a pretty generic counter holder that increments on
668  * entry and decrements on return providing RAII guarantees around an
669  * atomic counter.
670  */
671 class VisitorTracker {
672 public:
673 
674     /**
675      * Mark a visitor as visiting.
676      *
677      * @param c the counter that should be incremented (and later
678      * decremented).
679      */
VisitorTracker(AtomicValue<size_t> *c)680     explicit VisitorTracker(AtomicValue<size_t> *c) : counter(c) {
681         counter->fetch_add(1);
682     }
~VisitorTracker()683     ~VisitorTracker() {
684         counter->fetch_sub(1);
685     }
686 private:
687     AtomicValue<size_t> *counter;
688 };
689 
690 /**
691  * Creator of StoredValue instances.
692  */
693 class StoredValueFactory {
694 public:
695 
696     /**
697      * Create a new StoredValueFactory of the given type.
698      */
StoredValueFactory(EPStats &s)699     StoredValueFactory(EPStats &s) : stats(&s) { }
700 
701     /**
702      * Create a new StoredValue with the given item.
703      *
704      * @param itm the item the StoredValue should contain
705      * @param n the the top of the hash bucket into which this will be inserted
706      * @param ht the hashtable that will contain the StoredValue instance created
707      * @param setDirty if true, mark this item as dirty after creating it
708      */
operator ()(const Item &itm, StoredValue *n, HashTable &ht, bool setDirty = true)709     StoredValue *operator ()(const Item &itm, StoredValue *n, HashTable &ht,
710                              bool setDirty = true) {
711         return newStoredValue(itm, n, ht, setDirty);
712     }
713 
714 private:
715 
newStoredValue(const Item &itm, StoredValue *n, HashTable &ht, bool setDirty)716     StoredValue* newStoredValue(const Item &itm, StoredValue *n, HashTable &ht,
717                                 bool setDirty) {
718         size_t base = sizeof(StoredValue);
719 
720         const std::string &key = itm.getKey();
721         cb_assert(key.length() < 256);
722         size_t len = key.length() + base;
723 
724         StoredValue *t = new (::operator new(len))
725                          StoredValue(itm, n, *stats, ht, setDirty);
726         std::memcpy(t->keybytes, key.data(), key.length());
727         return t;
728     }
729 
730     EPStats                *stats;
731 };
732 
733 /**
734  * A container of StoredValue instances.
735  */
736 class HashTable {
737 public:
738 
739     /**
740      * Create a HashTable.
741      *
742      * @param st the global stats reference
743      * @param s the number of hash table buckets
744      * @param l the number of locks in the hash table
745      */
HashTable(EPStats &st, size_t s = 0, size_t l = 0)746     HashTable(EPStats &st, size_t s = 0, size_t l = 0) :
747         maxDeletedRevSeqno(0), numTotalItems(0),
748         numNonResidentItems(0), numEjects(0),
749         memSize(0), cacheSize(0), metaDataMemory(0), stats(st),
750         valFact(st), visitors(0), numItems(0), numResizes(0),
751         numTempItems(0)
752     {
753         size = HashTable::getNumBuckets(s);
754         n_locks = HashTable::getNumLocks(l);
755         cb_assert(size > 0);
756         cb_assert(n_locks > 0);
757         cb_assert(visitors == 0);
758         values = static_cast<StoredValue**>(calloc(size, sizeof(StoredValue*)));
759         mutexes = new Mutex[n_locks];
760         activeState = true;
761     }
762 
~HashTable()763     ~HashTable() {
764         clear(true);
765         // Wait for any outstanding visitors to finish.
766         while (visitors > 0) {
767 #ifdef _MSC_VER
768             Sleep(1);
769 #else
770             usleep(100);
771 #endif
772         }
773         delete []mutexes;
774         free(values);
775         values = NULL;
776     }
777 
memorySize()778     size_t memorySize() {
779         return sizeof(HashTable)
780             + (size * sizeof(StoredValue*))
781             + (n_locks * sizeof(Mutex));
782     }
783 
784     /**
785      * Get the number of hash table buckets this hash table has.
786      */
getSize(void)787     size_t getSize(void) { return size; }
788 
789     /**
790      * Get the number of locks in this hash table.
791      */
getNumLocks(void)792     size_t getNumLocks(void) { return n_locks; }
793 
794     /**
795      * Get the number of in-memory non-resident and resident items within
796      * this hash table.
797      */
getNumInMemoryItems(void)798     size_t getNumInMemoryItems(void) { return numItems; }
799 
800     /**
801      * Get the number of in-memory non-resident items within this hash table.
802      */
getNumInMemoryNonResItems(void)803     size_t getNumInMemoryNonResItems(void) { return numNonResidentItems; }
804 
805     /**
806      * Get the number of non-resident and resident items managed by
807      * this hash table. Note that this will be equal to getNumItems() if
808      * VALUE_ONLY_EVICTION is chosen as a cache management.
809      */
getNumItems(void)810     size_t getNumItems(void) {
811         return numTotalItems;
812     }
813 
814     /**
815      * Get the number of items whose values are ejected from this hash table.
816      */
getNumEjects(void)817     size_t getNumEjects(void) { return numEjects; }
818 
819     /**
820      * Get the total item memory size in this hash table.
821      */
getItemMemory(void)822     size_t getItemMemory(void) { return memSize; }
823 
824     /**
825      * Clear the hash table.
826      *
827      * @param deactivate true when this hash table is being destroyed completely
828      *
829      * @return a stat visitor reporting how much stuff was removed
830      */
831     HashTableStatVisitor clear(bool deactivate = false);
832 
833     /**
834      * Get the number of times this hash table has been resized.
835      */
getNumResizes()836     size_t getNumResizes() { return numResizes; }
837 
838     /**
839      * Get the number of temp. items within this hash table.
840      */
getNumTempItems(void)841     size_t getNumTempItems(void) { return numTempItems; }
842 
843     /**
844      * Automatically resize to fit the current data.
845      */
846     void resize();
847 
848     /**
849      * Resize to the specified size.
850      */
851     void resize(size_t to);
852 
853     /**
854      * Find the item with the given key.
855      *
856      * @param key the key to find
857      * @return a pointer to a StoredValue -- NULL if not found
858      */
find(std::string &key, bool trackReference=true)859     StoredValue *find(std::string &key, bool trackReference=true) {
860         cb_assert(isActive());
861         int bucket_num(0);
862         LockHolder lh = getLockedBucket(key, &bucket_num);
863         return unlocked_find(key, bucket_num, false, trackReference);
864     }
865 
866     /**
867      * Find a resident item
868      *
869      * @param rnd a randomization input
870      * @return an item -- NULL if not fount
871      */
872     Item *getRandomKey(long rnd);
873 
874     /**
875      * Set a new Item into this hashtable. Use this function when your item
876      * doesn't contain meta data.
877      *
878      * @param val the Item to store
879      * @param policy item eviction policy
880      * @param nru the nru bit for the item
881      * @return a result indicating the status of the store
882      */
set(const Item &val, item_eviction_policy_t policy = VALUE_ONLY, uint8_t nru=0xff)883     mutation_type_t set(const Item &val,
884                         item_eviction_policy_t policy = VALUE_ONLY,
885                         uint8_t nru=0xff)
886     {
887         return set(val, val.getCas(), true, false, policy, nru);
888     }
889 
890     /**
891      * Set an Item into the this hashtable. Use this function to do a set
892      * when your item includes meta data.
893      *
894      * @param val the Item to store
895      * @param cas This is the cas value for the item <b>in</b> the cache
896      * @param allowExisting should we allow existing items or not
897      * @param hasMetaData should we keep the same revision seqno or increment it
898      * @param policy item eviction policy
899      * @param nru the nru bit for the item
900      * @param isReplication true if issued by consumer (for replication)
901      * @return a result indicating the status of the store
902      */
set(const Item &val, uint64_t cas, bool allowExisting, bool hasMetaData = true, item_eviction_policy_t policy = VALUE_ONLY, uint8_t nru=0xff)903     mutation_type_t set(const Item &val, uint64_t cas,
904                         bool allowExisting, bool hasMetaData = true,
905                         item_eviction_policy_t policy = VALUE_ONLY,
906                         uint8_t nru=0xff) {
907         int bucket_num(0);
908         LockHolder lh = getLockedBucket(val.getKey(), &bucket_num);
909         StoredValue *v = unlocked_find(val.getKey(), bucket_num, true, false);
910         return unlocked_set(v, val, cas, allowExisting, hasMetaData, policy, nru);
911     }
912 
unlocked_set(StoredValue*& v, const Item &val, uint64_t cas, bool allowExisting, bool hasMetaData = true, item_eviction_policy_t policy = VALUE_ONLY, uint8_t nru=0xff, bool isReplication = false)913     mutation_type_t unlocked_set(StoredValue*& v, const Item &val, uint64_t cas,
914                                  bool allowExisting, bool hasMetaData = true,
915                                  item_eviction_policy_t policy = VALUE_ONLY,
916                                  uint8_t nru=0xff,
917                                  bool isReplication = false) {
918         cb_assert(isActive());
919         Item &itm = const_cast<Item&>(val);
920         if (!StoredValue::hasAvailableSpace(stats, itm, isReplication)) {
921             return NOMEM;
922         }
923 
924         mutation_type_t rv = NOT_FOUND;
925 
926         if (cas && policy == FULL_EVICTION) {
927             if (!v || v->isTempInitialItem()) {
928                 return NEED_BG_FETCH;
929             }
930         }
931 
932         /*
933          * prior to checking for the lock, we should check if this object
934          * has expired. If so, then check if CAS value has been provided
935          * for this set op. In this case the operation should be denied since
936          * a cas operation for a key that doesn't exist is not a very cool
937          * thing to do. See MB 3252
938          */
939         if (v && v->isExpired(ep_real_time()) && !hasMetaData) {
940             if (v->isLocked(ep_current_time())) {
941                 v->unlock();
942             }
943             if (cas) {
944                 /* item has expired and cas value provided. Deny ! */
945                 return NOT_FOUND;
946             }
947         }
948 
949         if (v) {
950             if (!allowExisting && !v->isTempItem()) {
951                 return INVALID_CAS;
952             }
953             if (v->isLocked(ep_current_time())) {
954                 /*
955                  * item is locked, deny if there is cas value mismatch
956                  * or no cas value is provided by the user
957                  */
958                 if (cas != v->getCas()) {
959                     return IS_LOCKED;
960                 }
961                 /* allow operation*/
962                 v->unlock();
963             } else if (cas && cas != v->getCas()) {
964                 if (v->isTempDeletedItem() || v->isTempNonExistentItem()) {
965                     return NOT_FOUND;
966                 }
967                 return INVALID_CAS;
968             }
969 
970             if (!hasMetaData) {
971                 itm.setCas();
972             }
973             rv = v->isClean() ? WAS_CLEAN : WAS_DIRTY;
974             if (!v->isResident() && !v->isDeleted() && !v->isTempItem()) {
975                 --numNonResidentItems;
976             }
977 
978             if (v->isTempItem()) {
979                 --numTempItems;
980                 ++numItems;
981                 ++numTotalItems;
982             }
983 
984             v->setValue(itm, *this, hasMetaData /*Preserve revSeqno*/);
985             if (nru <= MAX_NRU_VALUE) {
986                 v->setNRUValue(nru);
987             }
988         } else if (cas != 0) {
989             rv = NOT_FOUND;
990         } else {
991             if (!hasMetaData) {
992                 itm.setCas();
993             }
994             int bucket_num = getBucketForHash(hash(itm.getKey()));
995             v = valFact(itm, values[bucket_num], *this);
996             values[bucket_num] = v;
997             ++numItems;
998             ++numTotalItems;
999             if (nru <= MAX_NRU_VALUE && !v->isTempItem()) {
1000                 v->setNRUValue(nru);
1001             }
1002 
1003             if (!hasMetaData) {
1004                 /**
1005                  * Possibly, this item is being recreated. Conservatively assign it
1006                  * a seqno that is greater than the greatest seqno of all deleted
1007                  * items seen so far.
1008                  */
1009                 uint64_t seqno = getMaxDeletedRevSeqno() + 1;
1010                 v->setRevSeqno(seqno);
1011                 itm.setRevSeqno(seqno);
1012             }
1013             rv = WAS_CLEAN;
1014         }
1015         return rv;
1016     }
1017 
1018     /**
1019      * Insert an item to this hashtable. This is called from the backfill
1020      * so we need a bit more logic here. If we're trying to insert a partial
1021      * item we don't allow the object to be stored there (and if you try to
1022      * insert a full item we're only allowing an item without the value
1023      * in memory...)
1024      *
1025      * @param val the Item to insert
1026      * @param policy item eviction policy
1027      * @param eject true if we should eject the value immediately
1028      * @param partial is this a complete item, or just the key and meta-data
1029      * @return a result indicating the status of the store
1030      */
1031     mutation_type_t insert(Item &itm, item_eviction_policy_t policy,
1032                            bool eject, bool partial);
1033 
1034     /**
1035      * Add an item to the hash table iff it doesn't already exist.
1036      *
1037      * @param val the item to store
1038      * @param policy item eviction policy
1039      * @param isDirty true if the item should be marked dirty on store
1040      * @param storeVal true if the value should be stored (paged-in)
1041      * @return an indication of what happened
1042      */
add(const Item &val, item_eviction_policy_t policy, bool isDirty = true, bool storeVal = true)1043     add_type_t add(const Item &val, item_eviction_policy_t policy,
1044                    bool isDirty = true, bool storeVal = true) {
1045         cb_assert(isActive());
1046         int bucket_num(0);
1047         LockHolder lh = getLockedBucket(val.getKey(), &bucket_num);
1048         StoredValue *v = unlocked_find(val.getKey(), bucket_num, true, false);
1049         return unlocked_add(bucket_num, v, val, policy, isDirty, storeVal);
1050     }
1051 
1052     /**
1053      * Unlocked version of the add() method.
1054      *
1055      * @param bucket_num the locked partition where the key belongs
1056      * @param v the stored value to do this operaiton on
1057      * @param val the item to store
1058      * @param policy item eviction policy
1059      * @param isDirty true if the item should be marked dirty on store
1060      * @param storeVal true if the value should be stored (paged-in)
1061      * @param isReplication true if issued by consumer (for replication)
1062      * @return an indication of what happened
1063      */
1064     add_type_t unlocked_add(int &bucket_num,
1065                             StoredValue*& v,
1066                             const Item &val,
1067                             item_eviction_policy_t policy,
1068                             bool isDirty = true,
1069                             bool storeVal = true,
1070                             bool isReplication = false);
1071 
1072     /**
1073      * Add a temporary item to the hash table iff it doesn't already exist.
1074      *
1075      * NOTE: This method should be called after acquiring the correct
1076      *       bucket/partition lock.
1077      *
1078      * @param bucket_num the locked partition where the key belongs
1079      * @param key the key for which a temporary item needs to be added
1080      * @param policy item eviction policy
1081      * @param isReplication true if issued by consumer (for replication)
1082      * @return an indication of what happened
1083      */
1084     add_type_t unlocked_addTempItem(int &bucket_num,
1085                                     const std::string &key,
1086                                     item_eviction_policy_t policy,
1087                                     bool isReplication = false);
1088 
1089     /**
1090      * Mark the given record logically deleted.
1091      *
1092      * @param key the key of the item to delete
1093      * @param cas the expected CAS of the item (or 0 to override)
1094      * @param policy item eviction policy
1095      * @return an indicator of what the deletion did
1096      */
softDelete(const std::string &key, uint64_t cas, item_eviction_policy_t policy = VALUE_ONLY)1097     mutation_type_t softDelete(const std::string &key, uint64_t cas,
1098                                item_eviction_policy_t policy = VALUE_ONLY) {
1099         cb_assert(isActive());
1100         int bucket_num(0);
1101         LockHolder lh = getLockedBucket(key, &bucket_num);
1102         StoredValue *v = unlocked_find(key, bucket_num, false, false);
1103         return unlocked_softDelete(v, cas, policy);
1104     }
1105 
unlocked_softDelete(StoredValue *v, uint64_t cas, item_eviction_policy_t policy = VALUE_ONLY)1106     mutation_type_t unlocked_softDelete(StoredValue *v,
1107                                         uint64_t cas,
1108                                         item_eviction_policy_t policy = VALUE_ONLY) {
1109         ItemMetaData metadata;
1110         if (v) {
1111             metadata.revSeqno = v->getRevSeqno() + 1;
1112         }
1113         return unlocked_softDelete(v, cas, metadata, policy);
1114     }
1115 
1116     /**
1117      * Unlocked implementation of softDelete.
1118      */
unlocked_softDelete(StoredValue *v, uint64_t cas, ItemMetaData &metadata, item_eviction_policy_t policy, bool use_meta=false)1119     mutation_type_t unlocked_softDelete(StoredValue *v,
1120                                         uint64_t cas,
1121                                         ItemMetaData &metadata,
1122                                         item_eviction_policy_t policy,
1123                                         bool use_meta=false) {
1124         mutation_type_t rv = NOT_FOUND;
1125 
1126         if ((!v || v->isTempInitialItem()) && policy == FULL_EVICTION) {
1127             return NEED_BG_FETCH;
1128         }
1129 
1130         if (v) {
1131             if (v->isExpired(ep_real_time()) && !use_meta) {
1132                 if (!v->isResident() && !v->isDeleted() && !v->isTempItem()) {
1133                     --numNonResidentItems;
1134                 }
1135                 v->setRevSeqno(metadata.revSeqno);
1136                 v->del(*this, use_meta);
1137                 updateMaxDeletedRevSeqno(v->getRevSeqno());
1138                 return rv;
1139             }
1140 
1141             if (v->isLocked(ep_current_time())) {
1142                 if (cas != v->getCas()) {
1143                     return IS_LOCKED;
1144                 }
1145                 v->unlock();
1146             }
1147 
1148             if (cas != 0 && cas != v->getCas()) {
1149                 return INVALID_CAS;
1150             }
1151 
1152             if (!v->isResident() && !v->isDeleted() && !v->isTempItem()) {
1153                 --numNonResidentItems;
1154             }
1155 
1156             if (v->isTempItem()) {
1157                 --numTempItems;
1158                 ++numItems;
1159                 ++numTotalItems;
1160             }
1161 
1162             /* allow operation*/
1163             v->unlock();
1164 
1165             rv = v->isClean() ? WAS_CLEAN : WAS_DIRTY;
1166             v->setRevSeqno(metadata.revSeqno);
1167             if (use_meta) {
1168                 v->setCas(metadata.cas);
1169                 v->setFlags(metadata.flags);
1170                 v->setExptime(metadata.exptime);
1171             }
1172             v->del(*this, use_meta);
1173             updateMaxDeletedRevSeqno(v->getRevSeqno());
1174         }
1175         return rv;
1176     }
1177 
1178     /**
1179      * Find an item within a specific bucket assuming you already
1180      * locked the bucket.
1181      *
1182      * @param key the key of the item to find
1183      * @param bucket_num the bucket number
1184      * @param wantsDeleted true if soft deleted items should be returned
1185      *
1186      * @return a pointer to a StoredValue -- NULL if not found
1187      */
unlocked_find(const std::string &key, int bucket_num, bool wantsDeleted=false, bool trackReference=true)1188     StoredValue *unlocked_find(const std::string &key, int bucket_num,
1189                                bool wantsDeleted=false, bool trackReference=true) {
1190         StoredValue *v = values[bucket_num];
1191         while (v) {
1192             if (v->hasKey(key)) {
1193                 if (trackReference && !v->isDeleted()) {
1194                     v->referenced();
1195                 }
1196                 if (wantsDeleted || !v->isDeleted()) {
1197                     return v;
1198                 } else {
1199                     return NULL;
1200                 }
1201             }
1202             v = v->next;
1203         }
1204         return NULL;
1205     }
1206 
1207     /**
1208      * Compute a hash for the given string.
1209      *
1210      * @param str the beginning of the string
1211      * @param len the number of bytes in the string
1212      *
1213      * @return the hash value
1214      */
hash(const char *str, const size_t len)1215     inline int hash(const char *str, const size_t len) {
1216         cb_assert(isActive());
1217         int h=5381;
1218 
1219         for(size_t i=0; i < len; i++) {
1220             h = ((h << 5) + h) ^ str[i];
1221         }
1222 
1223         return h;
1224     }
1225 
1226     /**
1227      * Compute a hash for the given string.
1228      *
1229      * @param s the string
1230      * @return the hash value
1231      */
hash(const std::string &s)1232     inline int hash(const std::string &s) {
1233         return hash(s.data(), s.length());
1234     }
1235 
1236     /**
1237      * Get a lock holder holding a lock for the given bucket
1238      *
1239      * @param bucket the bucket to lock
1240      * @return a locked LockHolder
1241      */
getLockedBucket(int bucket)1242     inline LockHolder getLockedBucket(int bucket) {
1243         LockHolder rv(mutexes[mutexForBucket(bucket)]);
1244         return rv;
1245     }
1246 
1247     /**
1248      * Get a lock holder holding a lock for the bucket for the given
1249      * hash.
1250      *
1251      * @param h the input hash
1252      * @param bucket output parameter to receive a bucket
1253      * @return a locked LockHolder
1254      */
getLockedBucket(int h, int *bucket)1255     inline LockHolder getLockedBucket(int h, int *bucket) {
1256         while (true) {
1257             cb_assert(isActive());
1258             *bucket = getBucketForHash(h);
1259             LockHolder rv(mutexes[mutexForBucket(*bucket)]);
1260             if (*bucket == getBucketForHash(h)) {
1261                 return rv;
1262             }
1263         }
1264     }
1265 
1266     /**
1267      * Get a lock holder holding a lock for the bucket for the hash of
1268      * the given key.
1269      *
1270      * @param s the start of the key
1271      * @param n the size of the key
1272      * @param bucket output parameter to receive a bucket
1273      * @return a locked LockHolder
1274      */
getLockedBucket(const char *s, size_t n, int *bucket)1275     inline LockHolder getLockedBucket(const char *s, size_t n, int *bucket) {
1276         return getLockedBucket(hash(s, n), bucket);
1277     }
1278 
1279     /**
1280      * Get a lock holder holding a lock for the bucket for the hash of
1281      * the given key.
1282      *
1283      * @param s the key
1284      * @param bucket output parameter to receive a bucket
1285      * @return a locked LockHolder
1286      */
getLockedBucket(const std::string &s, int *bucket)1287     inline LockHolder getLockedBucket(const std::string &s, int *bucket) {
1288         return getLockedBucket(hash(s.data(), s.size()), bucket);
1289     }
1290 
1291     /**
1292      * Delete a key from the cache without trying to lock the cache first
1293      * (Please note that you <b>MUST</b> acquire the mutex before calling
1294      * this function!!!
1295      *
1296      * @param key the key to delete
1297      * @param bucket_num the bucket to look in (must already be locked)
1298      * @return true if an object was deleted, false otherwise
1299      */
unlocked_del(const std::string &key, int bucket_num)1300     bool unlocked_del(const std::string &key, int bucket_num) {
1301         cb_assert(isActive());
1302         StoredValue *v = values[bucket_num];
1303 
1304         // Special case empty bucket.
1305         if (!v) {
1306             return false;
1307         }
1308 
1309         // Special case the first one
1310         if (v->hasKey(key)) {
1311             if (!v->isDeleted() && v->isLocked(ep_current_time())) {
1312                 return false;
1313             }
1314 
1315             values[bucket_num] = v->next;
1316             StoredValue::reduceCacheSize(*this, v->size());
1317             StoredValue::reduceMetaDataSize(*this, stats, v->metaDataSize());
1318             if (v->isTempItem()) {
1319                 --numTempItems;
1320             } else {
1321                 --numItems;
1322                 --numTotalItems;
1323             }
1324             delete v;
1325             return true;
1326         }
1327 
1328         while (v->next) {
1329             if (v->next->hasKey(key)) {
1330                 StoredValue *tmp = v->next;
1331                 if (!tmp->isDeleted() && tmp->isLocked(ep_current_time())) {
1332                     return false;
1333                 }
1334 
1335                 v->next = v->next->next;
1336                 StoredValue::reduceCacheSize(*this, tmp->size());
1337                 StoredValue::reduceMetaDataSize(*this, stats, tmp->metaDataSize());
1338                 if (tmp->isTempItem()) {
1339                     --numTempItems;
1340                 } else {
1341                     --numItems;
1342                     --numTotalItems;
1343                 }
1344                 delete tmp;
1345                 return true;
1346             } else {
1347                 v = v->next;
1348             }
1349         }
1350 
1351         return false;
1352     }
1353 
1354     /**
1355      * Delete the item with the given key.
1356      *
1357      * @param key the key to delete
1358      * @return true if the item existed before this call
1359      */
del(const std::string &key)1360     bool del(const std::string &key) {
1361         cb_assert(isActive());
1362         int bucket_num(0);
1363         LockHolder lh = getLockedBucket(key, &bucket_num);
1364         return unlocked_del(key, bucket_num);
1365     }
1366 
1367     /**
1368      * Visit all items within this hashtable.
1369      */
1370     void visit(HashTableVisitor &visitor);
1371 
1372     /**
1373      * Visit all items within this call with a depth visitor.
1374      */
1375     void visitDepth(HashTableDepthVisitor &visitor);
1376 
1377     /**
1378      * Get the number of buckets that should be used for initialization.
1379      *
1380      * @param s if 0, return the default number of buckets, else return s
1381      */
1382     static size_t getNumBuckets(size_t s = 0);
1383 
1384     /**
1385      * Get the number of locks that should be used for initialization.
1386      *
1387      * @param s if 0, return the default number of locks, else return s
1388      */
1389     static size_t getNumLocks(size_t s);
1390 
1391     /**
1392      * Set the default number of buckets.
1393      */
1394     static void setDefaultNumBuckets(size_t);
1395 
1396     /**
1397      * Set the default number of locks.
1398      */
1399     static void setDefaultNumLocks(size_t);
1400 
1401     /**
1402      * Get the max deleted revision seqno seen so far.
1403      */
getMaxDeletedRevSeqno() const1404     uint64_t getMaxDeletedRevSeqno() const {
1405         return maxDeletedRevSeqno.load();
1406     }
1407 
1408     /**
1409      * Set the max deleted seqno (required during warmup).
1410      */
setMaxDeletedRevSeqno(const uint64_t seqno)1411     void setMaxDeletedRevSeqno(const uint64_t seqno) {
1412         maxDeletedRevSeqno.store(seqno);
1413     }
1414 
1415     /**
1416      * Update maxDeletedRevSeqno to a (possibly) new value.
1417      */
updateMaxDeletedRevSeqno(const uint64_t seqno)1418     void updateMaxDeletedRevSeqno(const uint64_t seqno) {
1419         atomic_setIfBigger(maxDeletedRevSeqno, seqno);
1420     }
1421 
1422     /**
1423      * Eject an item meta data and value from memory.
1424      * @param vptr the reference to the pointer to the StoredValue instance
1425      * @param policy item eviction policy
1426      * @return true if an item is ejected.
1427      */
1428     bool unlocked_ejectItem(StoredValue*& vptr, item_eviction_policy_t policy);
1429 
1430     AtomicValue<uint64_t>     maxDeletedRevSeqno;
1431     AtomicValue<size_t>       numTotalItems;
1432     AtomicValue<size_t>       numNonResidentItems;
1433     AtomicValue<size_t>       numEjects;
1434     //! Memory consumed by items in this hashtable.
1435     AtomicValue<size_t>       memSize;
1436     //! Cache size.
1437     AtomicValue<size_t>       cacheSize;
1438     //! Meta-data size.
1439     AtomicValue<size_t>       metaDataMemory;
1440 
1441 private:
1442     friend class StoredValue;
1443 
isActive() const1444     inline bool isActive() const { return activeState; }
setActiveState(bool newv)1445     inline void setActiveState(bool newv) { activeState = newv; }
1446 
1447     size_t               size;
1448     size_t               n_locks;
1449     StoredValue        **values;
1450     Mutex               *mutexes;
1451     EPStats&             stats;
1452     StoredValueFactory   valFact;
1453     AtomicValue<size_t>       visitors;
1454     AtomicValue<size_t>       numItems;
1455     AtomicValue<size_t>       numResizes;
1456     AtomicValue<size_t>       numTempItems;
1457     bool                 activeState;
1458 
1459     static size_t                 defaultNumBuckets;
1460     static size_t                 defaultNumLocks;
1461 
getBucketForHash(int h)1462     int getBucketForHash(int h) {
1463         return abs(h % static_cast<int>(size));
1464     }
1465 
mutexForBucket(int bucket_num)1466     inline int mutexForBucket(int bucket_num) {
1467         cb_assert(isActive());
1468         cb_assert(bucket_num >= 0);
1469         int lock_num = bucket_num % static_cast<int>(n_locks);
1470         cb_assert(lock_num < static_cast<int>(n_locks));
1471         cb_assert(lock_num >= 0);
1472         return lock_num;
1473     }
1474 
1475     Item *getRandomKeyFromSlot(int slot);
1476 
1477     DISALLOW_COPY_AND_ASSIGN(HashTable);
1478 };
1479 
1480 #endif  // SRC_STORED_VALUE_H_
1481