xref: /6.6.0/kv_engine/engines/ep/src/hash_table.cc (revision 3d701d2d)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2016 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
#include "hash_table.h"

#include "ep_time.h"
#include "item.h"
#include "stats.h"
#include "stored_value_factories.h"

#include <folly/lang/Assume.h>
#include <phosphor/phosphor.h>
#include <platform/compress.h>

#include <logtags.h>
#include <nlohmann/json.hpp>

#include <cstring>
#include <utility>
32
/**
 * Table of prime hash-table sizes (each entry roughly double the previous),
 * used by HashTable::resize() to select the next table size. The list is
 * terminated by a -1 sentinel.
 */
static const ssize_t prime_size_table[] = {
    3, 7, 13, 23, 47, 97, 193, 383, 769, 1531, 3079, 6143, 12289, 24571, 49157,
    98299, 196613, 393209, 786433, 1572869, 3145721, 6291449, 12582917,
    25165813, 50331653, 100663291, 201326611, 402653189, 805306357,
    1610612741, -1
};
39
/**
 * Define the increment factor for the ProbabilisticCounter being used for
 * the frequency counter.  The value is set such that it allows an 8-bit
 * ProbabilisticCounter to mimic a uint16 counter.
 *
 * The value was reached by running the following code using a variety of
 * incFactor values.
 *
 * ProbabilisticCounter<uint8_t> probabilisticCounter(incFactor);
 * uint64_t iterationCount{0};
 * uint8_t counter{0};
 *     while (counter != std::numeric_limits<uint8_t>::max()) {
 *         counter = probabilisticCounter.generateValue(counter);
 *         iterationCount++;
 *     }
 * std::cerr << "iterationCount=" <<  iterationCount << std::endl;
 *
 * To mimic a uint16 counter, iterationCount needs to be ~65000 (the maximum
 * value of a uint16_t is 65,536).  Through experimentation this was found to be
 * achieved with an incFactor of 0.012.
 */
// constexpr (rather than const) guarantees compile-time initialization and
// makes the constant usable in constant expressions.
static constexpr double freqCounterIncFactor = 0.012;
62
63std::string to_string(MutationStatus status) {
64    switch (status) {
65    case MutationStatus::NotFound:
66        return "NotFound";
67    case MutationStatus::InvalidCas:
68        return "InvalidCas";
69    case MutationStatus::WasClean:
70        return "WasClean";
71    case MutationStatus::WasDirty:
72        return "WasDirty";
73    case MutationStatus::IsLocked:
74        return "IsLocked";
75    case MutationStatus::NoMem:
76        return "NoMem";
77    case MutationStatus::NeedBgFetch:
78        return "NeedBgFetch";
79    case MutationStatus::IsPendingSyncWrite:
80        return "IsPendingSyncWrite";
81    }
82    return "<invalid>(" + std::to_string(int(status)) + ")";
83}
84
85std::ostream& operator<<(std::ostream& os, const HashTable::Position& pos) {
86    os << "{lock:" << pos.lock << " bucket:" << pos.hash_bucket << "/" << pos.ht_size << "}";
87    return os;
88}
89
// Construct a proxy which takes ownership of the bucket lock and snapshots
// the pre-modification statistics of sv (Statistics::prologue) so that the
// destructor can apply the matching epilogue once the caller has mutated it.
// NOTE(review): 'pre' is initialized from 'valueStats' - assumes 'pre' is
// declared after 'valueStats' in the class (init order follows declaration
// order) — confirm against the header.
HashTable::StoredValueProxy::StoredValueProxy(HashBucketLock&& hbl,
                                              StoredValue* sv,
                                              Statistics& stats)
    : lock(std::move(hbl)),
      value(sv),
      valueStats(stats),
      pre(valueStats.get().prologue(sv)) {
}
98
99HashTable::StoredValueProxy::~StoredValueProxy() {
100    if (value) {
101        valueStats.get().epilogue(pre, value);
102    }
103}
104
// Change the CommittedState of the owned StoredValue, mark it dirty so the
// change is persisted, and stamp the completed/deleted time (used when
// deciding whether the SV may later be purged).
void HashTable::StoredValueProxy::setCommitted(CommittedState state) {
    value->setCommitted(state);
    value->markDirty();
    value->setCompletedOrDeletedTime(ep_real_time());
}
110
111StoredValue* HashTable::StoredValueProxy::release() {
112    auto* tmp = value;
113    value = nullptr;
114    return tmp;
115}
116
117StoredValue* HashTable::FindUpdateResult::selectSVToModify(bool durability) {
118    if (durability) {
119        if (pending) {
120            return pending.getSV();
121        } else {
122            return committed;
123        }
124    } else {
125        if (pending && !pending->isCompleted()) {
126            return pending.getSV();
127        } else {
128            return committed;
129        }
130    }
131}
132
// Convenience overload: choose the SV to modify based on whether the given
// item is itself a pending (durable) mutation.
StoredValue* HashTable::FindUpdateResult::selectSVToModify(const Item& itm) {
    return selectSVToModify(itm.isPending());
}
136
// Construct a HashTable with `initialSize` buckets guarded by `locks`
// mutexes. The bucket-chain vector is sized up-front and the table starts
// in the active state.
HashTable::HashTable(EPStats& st,
                     std::unique_ptr<AbstractStoredValueFactory> svFactory,
                     size_t initialSize,
                     size_t locks)
    : initialSize(initialSize),
      size(initialSize),
      mutexes(locks),
      stats(st),
      valFact(std::move(svFactory)),
      visitors(0),
      valueStats(stats),
      numEjects(0),
      numResizes(0),
      maxDeletedRevSeqno(0),
      probabilisticCounter(freqCounterIncFactor) {
    // Allocate the (initially empty) hash-bucket chains.
    values.resize(size);
    activeState = true;
}
155
HashTable::~HashTable() {
    // Use unlocked clear for the destructor, avoids lock inversions on VBucket
    // delete
    clear_UNLOCKED(true);
    // Wait for any outstanding visitors to finish; spin with a short sleep
    // rather than blocking on a lock (visitors is a counter, not a mutex).
    while (visitors > 0) {
        std::this_thread::sleep_for(std::chrono::microseconds(100));
    }
}
165
166void HashTable::cleanupIfTemporaryItem(const HashBucketLock& hbl,
167                                       StoredValue& v) {
168    if (v.isTempDeletedItem() || v.isTempNonExistentItem()) {
169        unlocked_del(hbl, &v);
170    }
171}
172
173void HashTable::clear(bool deactivate) {
174    if (!deactivate) {
175        // If not deactivating, assert we're already active.
176        if (!isActive()) {
177            throw std::logic_error("HashTable::clear: Cannot call on a "
178                    "non-active object");
179        }
180    }
181    MultiLockHolder mlh(mutexes);
182    clear_UNLOCKED(deactivate);
183}
184
185void HashTable::clear_UNLOCKED(bool deactivate) {
186    if (deactivate) {
187        setActiveState(false);
188    }
189    size_t clearedMemSize = 0;
190    size_t clearedValSize = 0;
191    for (int i = 0; i < (int)size; i++) {
192        while (values[i]) {
193            // Take ownership of the StoredValue from the vector, update
194            // statistics and release it.
195            auto v = std::move(values[i]);
196            clearedMemSize += v->size();
197            clearedValSize += v->valuelen();
198            values[i] = std::move(v->getNext());
199        }
200    }
201
202    stats.coreLocal.get()->currentSize.fetch_sub(clearedMemSize -
203                                                 clearedValSize);
204
205    valueStats.reset();
206}
207
// Absolute difference of two unsigned values, without going through
// signed arithmetic (which could overflow).
static size_t distance(size_t a, size_t b) {
    return (a > b) ? (a - b) : (b - a);
}
211
212static size_t nearest(size_t n, size_t a, size_t b) {
213    return (distance(n, a) < distance(b, n)) ? a : b;
214}
215
// True if `size` equals either candidate value a or b.
static bool isCurrently(size_t size, ssize_t a, ssize_t b) {
    const auto current = static_cast<ssize_t>(size);
    return (current == a) || (current == b);
}
220
221void HashTable::resize() {
222    size_t ni = getNumInMemoryItems();
223    int i(0);
224    size_t new_size(0);
225
226    // Figure out where in the prime table we are.
227    ssize_t target(static_cast<ssize_t>(ni));
228    for (i = 0; prime_size_table[i] > 0 && prime_size_table[i] < target; ++i) {
229        // Just looking...
230    }
231
232    if (prime_size_table[i] == -1) {
233        // We're at the end, take the biggest
234        new_size = prime_size_table[i-1];
235    } else if (prime_size_table[i] < static_cast<ssize_t>(initialSize)) {
236        // Was going to be smaller than the initial size.
237        new_size = initialSize;
238    } else if (0 == i) {
239        new_size = prime_size_table[i];
240    }else if (isCurrently(size, prime_size_table[i-1], prime_size_table[i])) {
241        // If one of the candidate sizes is the current size, maintain
242        // the current size in order to remain stable.
243        new_size = size;
244    } else {
245        // Somewhere in the middle, use the one we're closer to.
246        new_size = nearest(ni, prime_size_table[i-1], prime_size_table[i]);
247    }
248
249    resize(new_size);
250}
251
// Resize the table to exactly newSize buckets, rehashing every StoredValue
// chain into the new bucket layout. Holds all bucket mutexes for the
// duration; bails out early (without resizing) if any visitor is active or
// newSize is the current size / too large to hash.
void HashTable::resize(size_t newSize) {
    if (!isActive()) {
        throw std::logic_error("HashTable::resize: Cannot call on a "
                "non-active object");
    }

    // Due to the way hashing works, we can't fit anything larger than
    // an int.
    if (newSize > static_cast<size_t>(std::numeric_limits<int>::max())) {
        return;
    }

    // Don't resize to the same size, either.
    if (newSize == size) {
        return;
    }

    TRACE_EVENT2(
            "HashTable", "resize", "size", size.load(), "newSize", newSize);

    MultiLockHolder mlh(mutexes);
    if (visitors.load() > 0) {
        // Do not allow a resize while any visitors are actually
        // processing.  The next attempt will have to pick it up.  New
        // visitors cannot start doing meaningful work (we own all
        // locks at this point).
        return;
    }

    // Get a place for the new items.
    table_type newValues(newSize);

    // Account the overhead change: subtract the old table's footprint now,
    // add the new footprint once `size`/`values` reflect the new layout.
    stats.coreLocal.get()->memOverhead.fetch_sub(memorySize());
    ++numResizes;

    // Set the new size so all the hashy stuff works.
    size_t oldSize = size;
    size.store(newSize);

    // Move existing records into the new space.
    for (size_t i = 0; i < oldSize; i++) {
        while (values[i]) {
            // unlink the front element from the hash chain at values[i].
            auto v = std::move(values[i]);
            values[i] = std::move(v->getNext());

            // And re-link it into the correct place in newValues.
            int newBucket = getBucketForHash(v->getKey().hash());
            v->setNext(std::move(newValues[newBucket]));
            newValues[newBucket] = std::move(v);
        }
    }

    // Finally assign the new table to values.
    values = std::move(newValues);

    stats.coreLocal.get()->memOverhead.fetch_add(memorySize());
}
310
// Core lookup: lock the bucket for `key` and walk its chain, returning
// (at most) one Committed and one Pending/Completed StoredValue for the
// key, together with the held lock. The Expects() calls document the
// invariant that a key has at most one SV in each namespace.
HashTable::FindInnerResult HashTable::findInner(const DocKey& key) {
    if (!isActive()) {
        throw std::logic_error(
                "HashTable::find: Cannot call on a "
                "non-active object");
    }
    HashBucketLock hbl = getLockedBucket(key);
    // Scan through all elements in the hash bucket chain looking for Committed
    // and Pending items with the same key.
    StoredValue* foundCmt = nullptr;
    StoredValue* foundPend = nullptr;
    // Note: double .get() unwraps the UniquePtr wrapper to the raw pointer.
    for (StoredValue* v = values[hbl.getBucketNum()].get().get(); v;
         v = v->getNext().get().get()) {
        if (v->hasKey(key)) {
            if (v->isPending() || v->isCompleted()) {
                Expects(!foundPend);
                foundPend = v;
            } else {
                Expects(!foundCmt);
                foundCmt = v;
            }
        }
    }

    return {std::move(hbl), foundCmt, foundPend};
}
337
338std::unique_ptr<Item> HashTable::getRandomKey(long rnd) {
339    /* Try to locate a partition */
340    size_t start = rnd % size;
341    size_t curr = start;
342    std::unique_ptr<Item> ret;
343
344    do {
345        ret = getRandomKeyFromSlot(curr++);
346        if (curr == size) {
347            curr = 0;
348        }
349    } while (ret == NULL && curr != start);
350
351    return ret;
352}
353
354MutationStatus HashTable::set(const Item& val) {
355    auto htRes = findForWrite(val.getKey());
356    if (htRes.storedValue) {
357        return unlocked_updateStoredValue(htRes.lock, *htRes.storedValue, val)
358                .status;
359    } else {
360        unlocked_addNewStoredValue(htRes.lock, val);
361        return MutationStatus::WasClean;
362    }
363}
364
365void HashTable::rollbackItem(const Item& item) {
366    auto htRes = findItem(item);
367    if (htRes.storedValue) {
368        unlocked_updateStoredValue(htRes.lock, *htRes.storedValue, item);
369    } else {
370        unlocked_addNewStoredValue(htRes.lock, item);
371    }
372}
373
// Replace the contents of existing StoredValue v with itm, in-place.
// Requires the bucket lock (hbl) to be held and the table active.
// Returns WasClean/WasDirty according to v's prior dirty state, plus &v.
// Statistics are maintained via the prologue/epilogue snapshot pair.
HashTable::UpdateResult HashTable::unlocked_updateStoredValue(
        const HashBucketLock& hbl, StoredValue& v, const Item& itm) {
    if (!hbl.getHTLock()) {
        throw std::invalid_argument(
                "HashTable::unlocked_updateStoredValue: htLock "
                "not held");
    }

    if (!isActive()) {
        throw std::logic_error(
                "HashTable::unlocked_updateStoredValue: Cannot "
                "call on a non-active HT object");
    }

    // Can directly replace the existing SV.
    MutationStatus status =
            v.isDirty() ? MutationStatus::WasDirty : MutationStatus::WasClean;

    const auto preProps = valueStats.prologue(&v);

    /* setValue() will mark v as undeleted if required */
    v.setValue(itm);
    updateFreqCounter(v);

    valueStats.epilogue(preProps, &v);

    return {status, &v};
}
402
// Create a new StoredValue for itm and insert it at the head of the bucket
// chain for hbl's bucket. Requires the bucket lock to be held and the table
// active. Returns a pointer to the newly-inserted SV (owned by the table).
StoredValue* HashTable::unlocked_addNewStoredValue(const HashBucketLock& hbl,
                                                   const Item& itm) {
    if (!hbl.getHTLock()) {
        throw std::invalid_argument(
                "HashTable::unlocked_addNewStoredValue: htLock "
                "not held");
    }

    if (!isActive()) {
        throw std::invalid_argument(
                "HashTable::unlocked_addNewStoredValue: Cannot "
                "call on a non-active HT object");
    }

    // Stats prologue with nullptr == "no previous value" baseline.
    const auto emptyProperties = valueStats.prologue(nullptr);

    // Create a new StoredValue and link it into the head of the bucket chain.
    auto v = (*valFact)(itm, std::move(values[hbl.getBucketNum()]));

    valueStats.epilogue(emptyProperties, v.get().get());

    values[hbl.getBucketNum()] = std::move(v);
    return values[hbl.getBucketNum()].get().get();
}
427
// Snapshot of the statistics-relevant properties of a StoredValue, taken
// before and after a mutation so epilogue() can apply only the deltas.
// A nullptr sv yields a default (isValid == false) snapshot.
HashTable::Statistics::StoredValueProperties::StoredValueProperties(
        const StoredValue* sv) {
    // If no previous StoredValue exists; return default constructed object.
    if (sv == nullptr) {
        return;
    }

    // Record all properties of the stored value which statistics require.
    isValid = true;
    size = sv->size();
    metaDataSize = sv->metaDataSize();
    datatype = sv->getDatatype();
    uncompressedSize = sv->uncompressedSize();
    isResident = sv->isResident();
    isDeleted = sv->isDeleted();
    isTempItem = sv->isTempItem();
    isSystemItem = sv->getKey().getCollectionID().isSystem();
    isPreparedSyncWrite = sv->isPending() || sv->isCompleted();
}
447
// Capture the pre-modification property snapshot for v (nullptr == no
// previous value); pass the result to epilogue() after the mutation.
HashTable::Statistics::StoredValueProperties HashTable::Statistics::prologue(
        const StoredValue* v) const {
    return StoredValueProperties(v);
}
452
// Apply the statistic deltas implied by a StoredValue changing from the
// `pre` snapshot to the current state of v (nullptr v == value removed).
// Each counter is only touched when the relevant property actually changed,
// so prologue/epilogue pairs are cheap for no-op mutations.
void HashTable::Statistics::epilogue(StoredValueProperties pre,
                                     const StoredValue* v) {
    // After performing updates to sv; compare with the previous properties and
    // update all statistics for all properties which have changed.

    const auto post = StoredValueProperties(v);

    // Update size, metadataSize & uncompressed size if pre/post differ.
    // (Differences are applied via unsigned wrap-around arithmetic, which
    // yields the correct net effect for fetch_add.)
    if (pre.size != post.size) {
        cacheSize.fetch_add(post.size - pre.size);
        memSize.fetch_add(post.size - pre.size);
    }
    if (pre.metaDataSize != post.metaDataSize) {
        metaDataMemory.fetch_add(post.metaDataSize - pre.metaDataSize);
        epStats.coreLocal.get()->currentSize.fetch_add(post.metaDataSize -
                                                       pre.metaDataSize);
    }
    if (pre.uncompressedSize != post.uncompressedSize) {
        uncompressedMemSize.fetch_add(post.uncompressedSize -
                                      pre.uncompressedSize);
    }

    // Determine if valid, non resident; and update numNonResidentItems if
    // differ.
    bool preNonResident = pre.isValid && (!pre.isResident && !pre.isDeleted &&
                                          !pre.isTempItem);
    bool postNonResident =
            post.isValid &&
            (!post.isResident && !post.isDeleted && !post.isTempItem);
    if (preNonResident != postNonResident) {
        numNonResidentItems.fetch_add(postNonResident - preNonResident);
    }

    if (pre.isTempItem != post.isTempItem) {
        numTempItems.fetch_add(post.isTempItem - pre.isTempItem);
    }

    // nonItems only considers valid; non-temporary items:
    bool preNonTemp = pre.isValid && !pre.isTempItem;
    bool postNonTemp = post.isValid && !post.isTempItem;
    if (preNonTemp != postNonTemp) {
        numItems.fetch_add(postNonTemp - preNonTemp);
    }

    if (pre.isSystemItem != post.isSystemItem) {
        numSystemItems.fetch_add(post.isSystemItem - pre.isSystemItem);
    }

    // numPreparedItems counts valid, prepared (not yet committed) items.
    const bool prePrepared = pre.isValid && pre.isPreparedSyncWrite;
    const bool postPrepared = post.isValid && post.isPreparedSyncWrite;
    if (prePrepared != postPrepared) {
        numPreparedSyncWrites.fetch_add(postPrepared - prePrepared);
    }

    // Don't include system items in the deleted count, numSystemItems will
    // count both types (a marked deleted system event still has purpose)
    // Don't include prepared items in the deleted count - they haven't (yet)
    // been deleted.
    const bool preDeleted =
            pre.isDeleted && !pre.isSystemItem && !pre.isPreparedSyncWrite;
    const bool postDeleted =
            post.isDeleted && !post.isSystemItem && !post.isPreparedSyncWrite;
    if (preDeleted != postDeleted) {
        numDeletedItems.fetch_add(postDeleted - preDeleted);
    }

    // Update datatypes. These are only tracked for non-temp, non-deleted,
    // committed items.
    if (preNonTemp && !pre.isDeleted && !pre.isPreparedSyncWrite) {
        --datatypeCounts[pre.datatype];
    }
    if (postNonTemp && !post.isDeleted && !post.isPreparedSyncWrite) {
        ++datatypeCounts[post.datatype];
    }
}
529
530void HashTable::Statistics::reset() {
531    datatypeCounts.fill(0);
532    numItems.store(0);
533    numTempItems.store(0);
534    numNonResidentItems.store(0);
535    memSize.store(0);
536    cacheSize.store(0);
537    uncompressedMemSize.store(0);
538}
539
// Replace vToCopy in the table with a fresh deep copy of itself, returning
// {pointer to the new in-table copy, ownership of the original SV}.
// Requires the bucket lock (hbl) to be held and the table active.
std::pair<StoredValue*, StoredValue::UniquePtr>
HashTable::unlocked_replaceByCopy(const HashBucketLock& hbl,
                                  StoredValue& vToCopy) {
    if (!hbl.getHTLock()) {
        throw std::invalid_argument(
                "HashTable::unlocked_replaceByCopy: htLock "
                "not held");
    }

    if (!isActive()) {
        throw std::invalid_argument(
                "HashTable::unlocked_replaceByCopy: Cannot "
                "call on a non-active HT object");
    }

    /* Release (remove) the StoredValue from the hash table */
    auto releasedSv = unlocked_release(hbl, &vToCopy);

    /* Copy the StoredValue and link it into the head of the bucket chain. */
    auto newSv = valFact->copyStoredValue(
            vToCopy, std::move(values[hbl.getBucketNum()]));

    // Adding a new item into the HashTable; update stats.
    const auto emptyProperties = valueStats.prologue(nullptr);
    valueStats.epilogue(emptyProperties, newSv.get().get());

    values[hbl.getBucketNum()] = std::move(newSv);
    return {values[hbl.getBucketNum()].get().get(), std::move(releasedSv)};
}
569
// Soft-delete StoredValue v in-place (the SV stays in the table as a
// tombstone). onlyMarkDeleted skips the value-dropping del() path.
// Throws if v is a completed prepare (those must not be deleted here).
HashTable::DeleteResult HashTable::unlocked_softDelete(
        const HashBucketLock& hbl,
        StoredValue& v,
        bool onlyMarkDeleted,
        DeleteSource delSource) {
    switch (v.getCommitted()) {
    case CommittedState::PrepareAborted:
    case CommittedState::PrepareCommitted:
        // We shouldn't be trying to use PrepareCompleted states yet
        throw std::logic_error(
                "HashTable::unlocked_softDelete attempting"
                " to delete a completed prepare");
    case CommittedState::Pending:
    case CommittedState::PreparedMaybeVisible:
    case CommittedState::CommittedViaMutation:
    case CommittedState::CommittedViaPrepare:
        const auto preProps = valueStats.prologue(&v);

        if (onlyMarkDeleted) {
            v.markDeleted(delSource);
        } else {
            v.del(delSource);
            // As part of deleting, set committedState to CommittedViaMutation -
            // this is necessary so when we later queue this SV into
            // CheckpointManager, if it was previously CommittedViaPrepare it
            // isn't mis-interpreted for a SyncDelete.
            v.setCommitted(CommittedState::CommittedViaMutation);
        }

        valueStats.epilogue(preProps, &v);
        return {DeletionStatus::Success, &v};
    }
    folly::assume_unreachable();
}
604
// Transition prepared StoredValue v to the PrepareAborted state (the SV is
// kept in the table). Requires the bucket lock to be held.
HashTable::DeleteResult HashTable::unlocked_abortPrepare(
        const HashTable::HashBucketLock& hbl, StoredValue& v) {
    const auto preProps = valueStats.prologue(&v);
    // We consider a prepare that is non-resident to be a completed abort.
    v.setCommitted(CommittedState::PrepareAborted);

    // Set the completed time so we don't prematurely purge the SV
    v.setCompletedOrDeletedTime(ep_real_time());
    valueStats.epilogue(preProps, &v);
    return {DeletionStatus::Success, &v};
}
616
// Build (but do not insert) a Pending SyncDelete prepare which is a deleted
// copy of v. Caller owns the returned SV and is responsible for inserting it.
// NOTE(review): hbl is not used in the body - presumably it documents that
// the bucket lock must be held while copying v; confirm against callers.
StoredValue::UniquePtr HashTable::unlocked_createSyncDeletePrepare(
        const HashTable::HashBucketLock& hbl,
        const StoredValue& v,
        DeleteSource delSource) {
    auto pendingDel = valFact->copyStoredValue(v, nullptr /*next chain ptr*/);
    pendingDel->setCommitted(CommittedState::Pending);
    pendingDel->del(delSource);
    return pendingDel;
}
626
627HashTable::FindROResult HashTable::findForRead(
628        const DocKey& key,
629        TrackReference trackReference,
630        WantsDeleted wantsDeleted,
631        const ForGetReplicaOp fetchRequestedForReplicaItem) {
632    auto result = findInner(key);
633
634    /// Reading normally uses the Committed StoredValue - however if a
635    /// pendingSV is found we must check if it's marked as MaybeVisible -
636    /// which will block reading.
637    /// However if this request is for a GET_REPLICA then we should only
638    /// return committed items
639    if (fetchRequestedForReplicaItem == ForGetReplicaOp::No &&
640        result.pendingSV && result.pendingSV->isPreparedMaybeVisible()) {
641        // Return the pending one as an indication the caller cannot read it.
642        return {result.pendingSV, std::move(result.lock)};
643    }
644    auto* sv = result.committedSV;
645
646    if (!sv) {
647        // No item found - return null.
648        return {nullptr, std::move(result.lock)};
649    }
650
651    if (result.committedSV->isDeleted()) {
652        // Deleted items should only be returned if caller asked for them,
653        // and we don't update ref-counts for them.
654        return {(wantsDeleted == WantsDeleted::Yes) ? sv : nullptr,
655                std::move(result.lock)};
656    }
657
658    // Found a non-deleted item. Now check if we should update ref-count.
659    if (trackReference == TrackReference::Yes) {
660        updateFreqCounter(*sv);
661
662        // @todo remove the referenced call when eviction algorithm is
663        // updated to use the frequency counter value.
664        sv->referenced();
665    }
666
667    return {sv, std::move(result.lock)};
668}
669
670HashTable::FindResult HashTable::findForWrite(const DocKey& key,
671                                              WantsDeleted wantsDeleted) {
672    auto result = findInner(key);
673
674    // We found a prepare. It may have been completed (Ephemeral) though. If it
675    // has been completed we will return the committed StoredValue.
676    if (result.pendingSV && !result.pendingSV->isCompleted()) {
677        // Early return if we found a prepare. We should always return prepares
678        // regardless of whether or not they are deleted or the caller has asked
679        // for deleted SVs. For example, consider searching for a SyncDelete, we
680        // should always return the deleted prepare.
681        return {result.pendingSV, std::move(result.lock)};
682    }
683
684    /// Writing using the Pending StoredValue (if found), else committed.
685    if (!result.committedSV) {
686        // No item found - return null.
687        return {nullptr, std::move(result.lock)};
688    }
689
690    if (result.committedSV->isDeleted() && wantsDeleted == WantsDeleted::No) {
691        // Deleted items should only be returned if caller asked for them -
692        // otherwise return null.
693        return {nullptr, std::move(result.lock)};
694    }
695    return {result.committedSV, std::move(result.lock)};
696}
697
698HashTable::FindResult HashTable::findForSyncWrite(const DocKey& key) {
699    auto result = findInner(key);
700
701    if (result.pendingSV) {
702        // Early return if we found a prepare. We should always return
703        // prepares regardless of whether or not they are deleted or the caller
704        // has asked for deleted SVs. For example, consider searching for a
705        // SyncDelete, we should always return the deleted prepare. Also,
706        // we always return completed prepares.
707        return {result.pendingSV, std::move(result.lock)};
708    }
709
710    if (!result.committedSV) {
711        // No item found - return null.
712        return {nullptr, std::move(result.lock)};
713    }
714
715    if (result.committedSV->isDeleted()) {
716        // Deleted items should only be returned if caller asked for them -
717        // otherwise return null.
718        return {nullptr, std::move(result.lock)};
719    }
720    return {result.committedSV, std::move(result.lock)};
721}
722
723HashTable::FindResult HashTable::findForSyncReplace(const DocKey& key) {
724    auto result = findInner(key);
725
726    if (result.pendingSV) {
727        // For the replace case, we should return ENGINE_KEY_ENOENT if no
728        // document exists for the given key. For the case where we abort a
729        // SyncWrite then attempt to do another we would find the AbortedPrepare
730        // (in the Ephemeral case) which we would then use to do another
731        // SyncWrite if we called the findForSyncWrite function. So, if we find
732        // a completed SyncWrite but the committed StoredValue does not exist,
733        // then return nullptr as logically a replace is not possible.
734        if (result.pendingSV->isCompleted() && !result.committedSV) {
735            return {nullptr, std::move(result.lock)};
736        }
737        // Otherwise, return the prepare so that we can re-use it.
738        return {result.pendingSV, std::move(result.lock)};
739    }
740
741    if (!result.committedSV) {
742        // No item found - return null.
743        return {nullptr, std::move(result.lock)};
744    }
745
746    if (result.committedSV->isDeleted()) {
747        // Deleted items should only be returned if caller asked for them -
748        // otherwise return null.
749        return {nullptr, std::move(result.lock)};
750    }
751    return {result.committedSV, std::move(result.lock)};
752}
753
// As findForWrite(key, wantsDeleted), but wraps the result in a
// StoredValueProxy so statistics are automatically maintained across the
// caller's modifications (prologue now, epilogue on proxy destruction).
HashTable::StoredValueProxy HashTable::findForWrite(StoredValueProxy::RetSVPTag,
                                                    const DocKey& key,
                                                    WantsDeleted wantsDeleted) {
    auto result = findForWrite(key, wantsDeleted);
    return StoredValueProxy(
            std::move(result.lock), result.storedValue, valueStats);
}
761
// Look up both the pending and committed StoredValues for key. The pending
// SV is returned via a stats-maintaining StoredValueProxy (which also owns
// the bucket lock); the committed SV as a raw pointer.
HashTable::FindUpdateResult HashTable::findForUpdate(const DocKey& key) {
    auto result = findInner(key);

    StoredValueProxy prepare{
            std::move(result.lock), result.pendingSV, valueStats};
    return {std::move(prepare), result.committedSV};
}
769
// Find only the Committed StoredValue for key (if any), regardless of its
// deleted state; any pending SV is ignored.
HashTable::FindResult HashTable::findOnlyCommitted(const DocKey& key) {
    auto result = findInner(key);
    return {result.committedSV, std::move(result.lock)};
}
774
// Find only the Pending/Completed-prepare StoredValue for key (if any);
// any committed SV is ignored.
HashTable::FindResult HashTable::findOnlyPrepared(const DocKey& key) {
    auto result = findInner(key);
    return {result.pendingSV, std::move(result.lock)};
}
779
780HashTable::FindResult HashTable::findItem(const Item& item) {
781    auto result = findInner(item.getKey());
782    auto preparedNamespace = item.isPending() || item.isAbort();
783    return {preparedNamespace ? result.pendingSV : result.committedSV,
784            std::move(result.lock)};
785}
786
// Remove `value` from the table and destroy it immediately (unlocked_release
// transfers ownership to the temporary, whose reset() frees the SV).
void HashTable::unlocked_del(const HashBucketLock& hbl, StoredValue* value) {
    unlocked_release(hbl, value).reset();
}
790
791StoredValue::UniquePtr HashTable::unlocked_release(
792        const HashBucketLock& hbl,
793        StoredValue* valueToRelease) {
794    if (!hbl.getHTLock()) {
795        throw std::invalid_argument(
796                "HashTable::unlocked_release_base: htLock not held");
797    }
798
799    if (!isActive()) {
800        throw std::logic_error(
801                "HashTable::unlocked_release_base: Cannot call on a "
802                "non-active object");
803    }
804    // Remove the first (should only be one) StoredValue matching the given
805    // pointer
806    auto released = hashChainRemoveFirst(
807            values[hbl.getBucketNum()], [valueToRelease](const StoredValue* v) {
808                return v == valueToRelease;
809            });
810
811    if (!released) {
812        /* We shouldn't reach here, we must delete the StoredValue in the
813           HashTable */
814        throw std::logic_error(
815                "HashTable::unlocked_release_base: StoredValue to be released "
816                "not found in HashTable; possibly HashTable leak");
817    }
818
819    // Update statistics for the item which is now gone.
820    const auto preProps = valueStats.prologue(released.get().get());
821    valueStats.epilogue(preProps, nullptr);
822
823    return released;
824}
825
826MutationStatus HashTable::insertFromWarmup(const Item& itm,
827                                           bool eject,
828                                           bool keyMetaDataOnly,
829                                           EvictionPolicy evictionPolicy) {
830    auto htRes = findInner(itm.getKey());
831    auto* v = (itm.isCommitted() ? htRes.committedSV : htRes.pendingSV);
832    auto& hbl = htRes.lock;
833
834    if (v == NULL) {
835        v = unlocked_addNewStoredValue(hbl, itm);
836
837        // TODO: Would be faster if we just skipped creating the value in the
838        // first place instead of adding it to the Item and then discarding it
839        // in markNotResident.
840        if (keyMetaDataOnly) {
841            const auto preProps = valueStats.prologue(v);
842            v->markNotResident();
843            valueStats.epilogue(preProps, v);
844        }
845    } else {
846        if (keyMetaDataOnly) {
847            // We don't have a better error code ;)
848            return MutationStatus::InvalidCas;
849        }
850
851        // Existing item found. This should only occur if:
852        // a) The existing item is temporary (i.e. result of a front-end
853        //    thread attempting to read and triggered a bgFetch); or
854        // b) The existing item is non-temporary and was loaded as the result of
855        //    a previous BGfetch (and has the same CAS).
856        //
857        // Verify that the CAS isn't changed
858        if (v->getCas() != itm.getCas()) {
859            if (v->getCas() == 0) {
860                v->setCas(itm.getCas());
861                v->setFlags(itm.getFlags());
862                v->setExptime(itm.getExptime());
863                v->setRevSeqno(itm.getRevSeqno());
864            } else {
865                return MutationStatus::InvalidCas;
866            }
867        }
868
869        // CAS is equal - exact same item. Update the SV if it's not already
870        // resident.
871        if (!v->isResident()) {
872            Expects(unlocked_restoreValue(hbl.getHTLock(), itm, *v));
873        }
874    }
875
876    v->markClean();
877
878    if (eject && !keyMetaDataOnly) {
879        unlocked_ejectItem(hbl, v, evictionPolicy);
880    }
881
882    return MutationStatus::NotFound;
883}
884
885bool HashTable::reallocateStoredValue(StoredValue&& sv) {
886    // Search the chain and reallocate
887    for (StoredValue::UniquePtr* curr =
888                 &values[getBucketForHash(sv.getKey().hash())];
889         curr->get().get();
890         curr = &curr->get()->getNext()) {
891        if (&sv == curr->get().get()) {
892            auto newSv = valFact->copyStoredValue(sv, std::move(sv.getNext()));
893            curr->swap(newSv);
894            return true;
895        }
896    }
897    return false;
898}
899
900void HashTable::dump() const {
901    std::cerr << *this << std::endl;
902}
903
904nlohmann::json HashTable::dumpStoredValuesAsJson() const {
905    MultiLockHolder mlh(mutexes);
906    auto obj = nlohmann::json::array();
907    for (const auto& chain : values) {
908        if (chain) {
909            for (StoredValue* sv = chain.get().get(); sv != nullptr;
910                 sv = sv->getNext().get().get()) {
911                std::stringstream ss;
912                ss << sv->getKey();
913                obj.push_back(*sv);
914            }
915        }
916    }
917
918    return obj;
919}
920
921void HashTable::storeCompressedBuffer(cb::const_char_buffer buf,
922                                      StoredValue& v) {
923    const auto preProps = valueStats.prologue(&v);
924
925    v.storeCompressedBuffer(buf);
926
927    valueStats.epilogue(preProps, &v);
928}
929
930void HashTable::visit(HashTableVisitor& visitor) {
931    HashTable::Position ht_pos;
932    while (ht_pos != endPosition()) {
933        ht_pos = pauseResumeVisit(visitor, ht_pos);
934    }
935}
936
937void HashTable::visitDepth(HashTableDepthVisitor &visitor) {
938    if (valueStats.getNumItems() == 0 || !isActive()) {
939        return;
940    }
941    size_t visited = 0;
942    // Acquire one (any) of the mutexes before incrementing {visitors}, this
943    // prevents any race between this visitor and the HashTable resizer.
944    // See comments in pauseResumeVisit() for further details.
945    std::unique_lock<std::mutex> lh(mutexes[0]);
946    VisitorTracker vt(&visitors);
947    lh.unlock();
948
949    for (int l = 0; l < static_cast<int>(mutexes.size()); l++) {
950        for (int i = l; i < static_cast<int>(size); i+= mutexes.size()) {
951            // (re)acquire mutex on each HashBucket, to minimise any impact
952            // on front-end threads.
953            LockHolder lh(mutexes[l]);
954
955            size_t depth = 0;
956            StoredValue* p = values[i].get().get();
957            if (p) {
958                // TODO: Perf: This check seems costly - do we think it's still
959                // worth keeping?
960                auto hashbucket = getBucketForHash(p->getKey().hash());
961                if (i != hashbucket) {
962                    throw std::logic_error("HashTable::visit: inconsistency "
963                            "between StoredValue's calculated hashbucket "
964                            "(which is " + std::to_string(hashbucket) +
965                            ") and bucket it is located in (which is " +
966                            std::to_string(i) + ")");
967                }
968            }
969            size_t mem(0);
970            while (p) {
971                depth++;
972                mem += p->size();
973                p = p->getNext().get().get();
974            }
975            visitor.visit(i, depth, mem);
976            ++visited;
977        }
978    }
979}
980
// Visit as many hash buckets as the visitor permits, starting from
// start_pos; returns the position the next call should resume from (or
// endPosition() when the whole table has been visited).
HashTable::Position HashTable::pauseResumeVisit(HashTableVisitor& visitor,
                                                Position& start_pos) {
    if ((valueStats.getNumItems() + valueStats.getNumTempItems()) == 0 ||
        !isActive()) {
        // Nothing to visit
        return endPosition();
    }

    bool paused = false;

    // To attempt to minimize the impact the visitor has on normal frontend
    // operations, we deliberately acquire (and release) the mutex between
    // each hash_bucket - see `lh` in the inner for() loop below. This means we
    // hold a given mutex for a large number of short durations, instead of just
    // one single, long duration.
    // *However*, there is a potential race with this approach - the {size} of
    // the HashTable may be changed (by the Resizer task) between us first
    // reading it to calculate the starting hash_bucket, and then reading it
    // inside the inner for() loop. To prevent this race, we explicitly acquire
    // (any) mutex, increment {visitors} and then release the mutex. This
    // avoids the race as if visitors > 0 then Resizer will not attempt to
    // resize.
    std::unique_lock<std::mutex> lh(mutexes[0]);
    VisitorTracker vt(&visitors);
    lh.unlock();

    // Start from the requested lock number if in range.
    size_t lock = (start_pos.lock < mutexes.size()) ? start_pos.lock : 0;
    size_t hash_bucket = 0;

    for (; isActive() && !paused && lock < mutexes.size(); lock++) {

        // If the bucket position is *this* lock, then start from the
        // recorded bucket (as long as we haven't resized).
        hash_bucket = lock;
        if (start_pos.lock == lock &&
            start_pos.ht_size == size &&
            start_pos.hash_bucket < size) {
            hash_bucket = start_pos.hash_bucket;
        }

        // Iterate across all values in the hash buckets owned by this lock.
        // Note: we don't record how far into the bucket linked-list we
        // pause at; so any restart will begin from the next bucket.
        for (; !paused && hash_bucket < size; hash_bucket += mutexes.size()) {
            visitor.setUpHashBucketVisit();

            // HashBucketLock scope. If a visitor needs additional locking
            // around the HashBucket visit then we need to release it before
            // tearDownHashBucketVisit() is called.
            {
                HashBucketLock lh(hash_bucket, mutexes[lock]);

                StoredValue* v = values[hash_bucket].get().get();
                while (!paused && v) {
                    // Cache the successor before visiting, in case the
                    // visitor modifies/removes the current StoredValue.
                    StoredValue* tmp = v->getNext().get().get();
                    paused = !visitor.visit(lh, *v);
                    v = tmp;
                }
            }

            visitor.tearDownHashBucketVisit();
        }

        // If the visitor paused us before we visited all hash buckets owned
        // by this lock, we don't want to skip the remaining hash buckets, so
        // stop the outer for loop from advancing to the next lock.
        if (paused && hash_bucket < size) {
            break;
        }

        // Finished all buckets owned by this lock. Set hash_bucket to 'size'
        // to give a consistent marker for "end of lock".
        hash_bucket = size;
    }

    // Return the *next* location that should be visited.
    return HashTable::Position(size, lock, hash_bucket);
}
1059
1060HashTable::Position HashTable::endPosition() const  {
1061    return HashTable::Position(size, mutexes.size(), size);
1062}
1063
// Eject the given StoredValue according to the eviction policy; the caller
// must hold the HashBucketLock for the value's bucket. Returns true if the
// item was ejected, false if it was not eligible.
// NOTE(review): under EvictionPolicy::Full the StoredValue is removed from
// the table and destroyed when this function returns, leaving the caller's
// vptr dangling - callers presumably must not dereference it afterwards;
// confirm against call sites.
bool HashTable::unlocked_ejectItem(const HashTable::HashBucketLock&,
                                   StoredValue*& vptr,
                                   EvictionPolicy policy) {
    if (vptr == nullptr) {
        throw std::invalid_argument("HashTable::unlocked_ejectItem: "
                "Unable to delete NULL StoredValue");
    }

    if (!vptr->eligibleForEviction(policy)) {
        ++stats.numFailedEjects;
        return false;
    }

    // Snapshot stats before mutating, so epilogue() can account the delta.
    const auto preProps = valueStats.prologue(vptr);

    switch (policy) {
    case EvictionPolicy::Value: {
        // Value eviction: discard just the value, keep the key + metadata.
        vptr->ejectValue();
        ++stats.numValueEjects;
        valueStats.epilogue(preProps, vptr);
        break;
    }
    case EvictionPolicy::Full: {
        // Remove the item from the hash table.
        int bucket_num = getBucketForHash(vptr->getKey().hash());
        auto removed = hashChainRemoveFirst(
                values[bucket_num],
                [vptr](const StoredValue* v) { return v == vptr; });

        // `removed` now owns the StoredValue, so vptr stays valid until the
        // end of this scope.
        if (removed->isResident()) {
            ++stats.numValueEjects;
        }
        // Pass nullptr - the value is entirely gone from the table.
        valueStats.epilogue(preProps, nullptr);

        updateMaxDeletedRevSeqno(vptr->getRevSeqno());
        break;
    }
    }

    ++numEjects;
    return true;
}
1106
1107std::unique_ptr<Item> HashTable::getRandomKeyFromSlot(int slot) {
1108    auto lh = getLockedBucket(slot);
1109    for (StoredValue* v = values[slot].get().get(); v;
1110            v = v->getNext().get().get()) {
1111        if (!v->isTempItem() && !v->isDeleted() && v->isResident() &&
1112            v->isCommitted()) {
1113            return v->toItem(Vbid(0));
1114        }
1115    }
1116
1117    return nullptr;
1118}
1119
1120bool HashTable::unlocked_restoreValue(
1121        const std::unique_lock<std::mutex>& htLock,
1122        const Item& itm,
1123        StoredValue& v) {
1124    if (!htLock || !isActive() || v.isResident()) {
1125        return false;
1126    }
1127
1128    const auto preProps = valueStats.prologue(&v);
1129
1130    v.restoreValue(itm);
1131
1132    valueStats.epilogue(preProps, &v);
1133
1134    return true;
1135}
1136
1137void HashTable::unlocked_restoreMeta(const std::unique_lock<std::mutex>& htLock,
1138                                     const Item& itm,
1139                                     StoredValue& v) {
1140    if (!htLock) {
1141        throw std::invalid_argument(
1142                "HashTable::unlocked_restoreMeta: htLock "
1143                "not held");
1144    }
1145
1146    if (!isActive()) {
1147        throw std::logic_error(
1148                "HashTable::unlocked_restoreMeta: Cannot "
1149                "call on a non-active HT object");
1150    }
1151
1152    const auto preProps = valueStats.prologue(&v);
1153
1154    v.restoreMeta(itm);
1155
1156    valueStats.epilogue(preProps, &v);
1157}
1158
1159uint8_t HashTable::generateFreqValue(uint8_t counter) {
1160    return probabilisticCounter.generateValue(counter);
1161}
1162
1163void HashTable::updateFreqCounter(StoredValue& v) {
1164    // Attempt to increment the storedValue frequency counter
1165    // value.  Because a probabilistic counter is used the new
1166    // value will either be the same or an increment of the
1167    // current value.
1168    auto updatedFreqCounterValue = generateFreqValue(v.getFreqCounterValue());
1169    v.setFreqCounterValue(updatedFreqCounterValue);
1170
1171    if (updatedFreqCounterValue == std::numeric_limits<uint8_t>::max()) {
1172        // Invoke the registered callback function which
1173        // wakeups the ItemFreqDecayer task.
1174        frequencyCounterSaturated();
1175    }
1176}
1177
1178std::ostream& operator<<(std::ostream& os, const HashTable& ht) {
1179    os << "HashTable[" << &ht << "] with"
1180       << " numItems:" << ht.getNumItems()
1181       << " numInMemory:" << ht.getNumInMemoryItems()
1182       << " numDeleted:" << ht.getNumDeletedItems()
1183       << " numNonResident:" << ht.getNumInMemoryNonResItems()
1184       << " numTemp:" << ht.getNumTempItems()
1185       << " numSystemItems:" << ht.getNumSystemItems()
1186       << " numPreparedSW:" << ht.getNumPreparedSyncWrites()
1187       << " values: " << std::endl;
1188    for (const auto& chain : ht.values) {
1189        if (chain) {
1190            for (StoredValue* sv = chain.get().get(); sv != nullptr;
1191                 sv = sv->getNext().get().get()) {
1192                os << "    " << *sv << std::endl;
1193            }
1194        }
1195    }
1196    return os;
1197}
1198