/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 *     Copyright 2010 Couchbase, Inc
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
177c0433f5SJung-Sang Ahn
187c0433f5SJung-Sang Ahn#include <stdio.h>
197c0433f5SJung-Sang Ahn#include <stdlib.h>
207c0433f5SJung-Sang Ahn#include <string.h>
212534ac38SJung-Sang Ahn#include <fcntl.h>
221907e9beSJung-Sang Ahn#include <time.h>
231907e9beSJung-Sang Ahn#if !defined(WIN32) && !defined(_WIN32)
241907e9beSJung-Sang Ahn#include <sys/time.h>
251907e9beSJung-Sang Ahn#endif
267c0433f5SJung-Sang Ahn
2708a30bf6SChiyoung Seo#include "libforestdb/forestdb.h"
28eb364281SChiyoung Seo#include "fdb_internal.h"
297c0433f5SJung-Sang Ahn#include "filemgr.h"
307c0433f5SJung-Sang Ahn#include "hbtrie.h"
3159c1c4f5SJung-Sang Ahn#include "list.h"
32b95738edSabhinavdangeti#include "breakpad.h"
33762e6a5dSJung-Sang Ahn#include "btree.h"
347be5b070SJung-Sang Ahn#include "btree_kv.h"
35c9a3f3c3SJung-Sang Ahn#include "btree_var_kv_ops.h"
367c0433f5SJung-Sang Ahn#include "docio.h"
377c0433f5SJung-Sang Ahn#include "btreeblock.h"
387c0433f5SJung-Sang Ahn#include "common.h"
397c0433f5SJung-Sang Ahn#include "wal.h"
40f693a021SSundar Sridharan#include "filemgr_ops.h"
4188a8c486SChiyoung Seo#include "configuration.h"
4265b92d59SChiyoung Seo#include "internal_types.h"
434cdca91eSSundar Sridharan#include "bgflusher.h"
444fcd9f6eSJung-Sang Ahn#include "compactor.h"
453d812dfcSJung-Sang Ahn#include "memleak.h"
462ff77207SChiyoung Seo#include "time_utils.h"
47ac637abaSSundar Sridharan#include "timing.h"
48c5675c59SChiyoung Seo#include "system_resource_stats.h"
493c6e2608SJung-Sang Ahn#include "version.h"
504862310eSJung-Sang Ahn#include "staleblock.h"
513d812dfcSJung-Sang Ahn
// When __DEBUG is defined but __DEBUG_FDB is not, redefine the debug macros
// DBG / DBGCMD / DBGSW as empty so they expand to nothing in this file.
#ifdef __DEBUG
#ifndef __DEBUG_FDB
    #undef DBG
    #undef DBGCMD
    #undef DBGSW
    #define DBG(...)
    #define DBGCMD(...)
    #define DBGSW(n, ...)
#endif
#endif
627c0433f5SJung-Sang Ahn
63b6be7a1dSSundar Sridharan
64b86dfa16Sabhinavdangetistatic atomic_uint8_t fdb_initialized(0);
65a348112cSChiyoung Seostatic volatile uint32_t fdb_open_inprog = 0;
6659c1c4f5SJung-Sang Ahn#ifdef SPIN_INITIALIZER
6759c1c4f5SJung-Sang Ahnstatic spin_t initial_lock = SPIN_INITIALIZER;
6859c1c4f5SJung-Sang Ahn#else
6959c1c4f5SJung-Sang Ahnstatic volatile unsigned int initial_lock_status = 0;
7059c1c4f5SJung-Sang Ahnstatic spin_t initial_lock;
7159c1c4f5SJung-Sang Ahn#endif
7259c1c4f5SJung-Sang Ahn
73c9a3f3c3SJung-Sang AhnINLINE int _cmp_uint64_t_endian_safe(void *key1, void *key2, void *aux)
746d79432aSJung-Sang Ahn{
75c9a3f3c3SJung-Sang Ahn    (void) aux;
766d79432aSJung-Sang Ahn    uint64_t a,b;
776d79432aSJung-Sang Ahn    a = *(uint64_t*)key1;
786d79432aSJung-Sang Ahn    b = *(uint64_t*)key2;
796d79432aSJung-Sang Ahn    a = _endian_decode(a);
806d79432aSJung-Sang Ahn    b = _endian_decode(b);
816d79432aSJung-Sang Ahn    return _CMP_U64(a, b);
826d79432aSJung-Sang Ahn}
836d79432aSJung-Sang Ahn
84d32401daSJung-Sang Ahnsize_t _fdb_readkey_wrap(void *handle, uint64_t offset, void *buf)
857c0433f5SJung-Sang Ahn{
86b84960c2SJung-Sang Ahn    fdb_status fs;
872889254eSJung-Sang Ahn    keylen_t keylen;
88b84960c2SJung-Sang Ahn    struct docio_handle *dhandle = (struct docio_handle*)handle;
89b84960c2SJung-Sang Ahn
906d79432aSJung-Sang Ahn    offset = _endian_decode(offset);
91b84960c2SJung-Sang Ahn    fs = docio_read_doc_key(dhandle, offset, &keylen, buf);
92b84960c2SJung-Sang Ahn    if (fs == FDB_RESULT_SUCCESS) {
93ce30d60aSChiyoung Seo        return keylen;
94ce30d60aSChiyoung Seo    } else {
95b84960c2SJung-Sang Ahn        const char *msg = "docio_read_doc_key error: read failure on "
96b84960c2SJung-Sang Ahn            "offset %" _F64 " in a database file '%s' "
97b84960c2SJung-Sang Ahn            ": FDB status %d, lastbid 0x%" _X64 ", "
98b84960c2SJung-Sang Ahn            "curblock 0x%" _X64 ", curpos 0x%x\n";
99b84960c2SJung-Sang Ahn        fdb_log(NULL, FDB_RESULT_READ_FAIL, msg, offset,
100b84960c2SJung-Sang Ahn                dhandle->file->filename, fs, dhandle->lastbid,
101b84960c2SJung-Sang Ahn                dhandle->curblock, dhandle->curpos);
102b84960c2SJung-Sang Ahn        dbg_print_buf(dhandle->readbuffer, dhandle->file->blocksize, true, 16);
103ce30d60aSChiyoung Seo        return 0;
104ce30d60aSChiyoung Seo    }
1057c0433f5SJung-Sang Ahn}
1067c0433f5SJung-Sang Ahn
10756236603Ssduvuruvoid _fdb_invalidate_dbheader(fdb_kvs_handle *handle ){
10856236603Ssduvuru    bid_t hdr_bid;
10956236603Ssduvuru    hdr_bid = handle->last_hdr_bid;
11056236603Ssduvuru    if (hdr_bid != BLK_NOT_FOUND){
11156236603Ssduvuru        // invalidate the last dbheader
11256236603Ssduvuru        filemgr_invalidate_dbheader(handle->file, hdr_bid, &handle->log_callback);
11356236603Ssduvuru    }
115d32401daSJung-Sang Ahnsize_t _fdb_readseq_wrap(void *handle, uint64_t offset, void *buf)
116c9a3f3c3SJung-Sang Ahn{
11747025c39SJung-Sang Ahn    int size_id, size_seq, size_chunk;
118d32401daSJung-Sang Ahn    fdb_seqnum_t _seqnum;
119d32401daSJung-Sang Ahn    struct docio_object doc;
12047025c39SJung-Sang Ahn    struct docio_handle *dhandle = (struct docio_handle *)handle;
121d32401daSJung-Sang Ahn
122d32401daSJung-Sang Ahn    size_id = sizeof(fdb_kvs_id_t);
123d32401daSJung-Sang Ahn    size_seq = sizeof(fdb_seqnum_t);
12447025c39SJung-Sang Ahn    size_chunk = dhandle->file->config->chunksize;
125d32401daSJung-Sang Ahn    memset(&doc, 0, sizeof(struct docio_object));
126d32401daSJung-Sang Ahn
127d32401daSJung-Sang Ahn    offset = _endian_decode(offset);
128ce30d60aSChiyoung Seo    if (docio_read_doc_key_meta((struct docio_handle *)handle, offset,
129ce30d60aSChiyoung Seo                                &doc, true) <= 0) {
130ce30d60aSChiyoung Seo        return 0;
131ce30d60aSChiyoung Seo    }
13247025c39SJung-Sang Ahn    buf2buf(size_chunk, doc.key, size_id, buf);
133d32401daSJung-Sang Ahn    _seqnum = _endian_encode(doc.seqnum);
134d32401daSJung-Sang Ahn    memcpy((uint8_t*)buf + size_id, &_seqnum, size_seq);
135d32401daSJung-Sang Ahn
136d32401daSJung-Sang Ahn    free(doc.key);
137d32401daSJung-Sang Ahn    free(doc.meta);
138d32401daSJung-Sang Ahn
139d32401daSJung-Sang Ahn    return size_id + size_seq;
140c9a3f3c3SJung-Sang Ahn}
141c9a3f3c3SJung-Sang Ahn
142d32401daSJung-Sang Ahnint _fdb_custom_cmp_wrap(void *key1, void *key2, void *aux)
143c9a3f3c3SJung-Sang Ahn{
144d32401daSJung-Sang Ahn    int is_key1_inf, is_key2_inf;
145d32401daSJung-Sang Ahn    uint8_t *keystr1 = alca(uint8_t, FDB_MAX_KEYLEN_INTERNAL);
146d32401daSJung-Sang Ahn    uint8_t *keystr2 = alca(uint8_t, FDB_MAX_KEYLEN_INTERNAL);
147c9a3f3c3SJung-Sang Ahn    size_t keylen1, keylen2;
14847025c39SJung-Sang Ahn    btree_cmp_args *args = (btree_cmp_args *)aux;
14947025c39SJung-Sang Ahn    fdb_custom_cmp_variable cmp = (fdb_custom_cmp_variable)args->aux;
150d32401daSJung-Sang Ahn
151d32401daSJung-Sang Ahn    is_key1_inf = _is_inf_key(key1);
152d32401daSJung-Sang Ahn    is_key2_inf = _is_inf_key(key2);
153d32401daSJung-Sang Ahn    if (is_key1_inf && is_key2_inf) { // both are infinite
154d32401daSJung-Sang Ahn        return 0;
155d32401daSJung-Sang Ahn    } else if (!is_key1_inf && is_key2_inf) { // key2 is infinite
156d32401daSJung-Sang Ahn        return -1;
157d32401daSJung-Sang Ahn    } else if (is_key1_inf && !is_key2_inf) { // key1 is infinite
158d32401daSJung-Sang Ahn        return 1;
159d32401daSJung-Sang Ahn    }
160c9a3f3c3SJung-Sang Ahn
161c9a3f3c3SJung-Sang Ahn    _get_var_key(key1, (void*)keystr1, &keylen1);
162c9a3f3c3SJung-Sang Ahn    _get_var_key(key2, (void*)keystr2, &keylen2);
163c9a3f3c3SJung-Sang Ahn
164bc3885dcSJung-Sang Ahn    if (keylen1 == 0 && keylen2 == 0) {
165bc3885dcSJung-Sang Ahn        return 0;
166bc3885dcSJung-Sang Ahn    } else if (keylen1 ==0 && keylen2 > 0) {
167bc3885dcSJung-Sang Ahn        return -1;
168bc3885dcSJung-Sang Ahn    } else if (keylen1 > 0 && keylen2 == 0) {
169bc3885dcSJung-Sang Ahn        return 1;
170bc3885dcSJung-Sang Ahn    }
171bc3885dcSJung-Sang Ahn
172d32401daSJung-Sang Ahn    return cmp(keystr1, keylen1, keystr2, keylen2);
173c9a3f3c3SJung-Sang Ahn}
174c9a3f3c3SJung-Sang Ahn
175b609b73eSSundar Sridharanvoid fdb_fetch_header(uint64_t version,
176b609b73eSSundar Sridharan                      void *header_buf,
177eb364281SChiyoung Seo                      bid_t *trie_root_bid,
178eb364281SChiyoung Seo                      bid_t *seq_root_bid,
17911c968c1SJung-Sang Ahn                      bid_t *stale_root_bid,
180eb364281SChiyoung Seo                      uint64_t *ndocs,
181b609b73eSSundar Sridharan                      uint64_t *ndeletes,
182eb364281SChiyoung Seo                      uint64_t *nlivenodes,
183eb364281SChiyoung Seo                      uint64_t *datasize,
18440a23059SJung-Sang Ahn                      uint64_t *last_wal_flush_hdr_bid,
185d32401daSJung-Sang Ahn                      uint64_t *kv_info_offset,
186d32401daSJung-Sang Ahn                      uint64_t *header_flags,
187eb364281SChiyoung Seo                      char **new_filename,
188eb364281SChiyoung Seo                      char **old_filename)
189e6449f52SJung-Sang Ahn{
190e6449f52SJung-Sang Ahn    size_t offset = 0;
1914a0379f5SJung-Sang Ahn    uint16_t new_filename_len;
1924a0379f5SJung-Sang Ahn    uint16_t old_filename_len;
1936d79432aSJung-Sang Ahn
1946d79432aSJung-Sang Ahn    seq_memcpy(trie_root_bid, (uint8_t *)header_buf + offset,
1956d79432aSJung-Sang Ahn               sizeof(bid_t), offset);
1966d79432aSJung-Sang Ahn    *trie_root_bid = _endian_decode(*trie_root_bid);
1976d79432aSJung-Sang Ahn
1986d79432aSJung-Sang Ahn    seq_memcpy(seq_root_bid, (uint8_t *)header_buf + offset,
1996d79432aSJung-Sang Ahn               sizeof(bid_t), offset);
2006d79432aSJung-Sang Ahn    *seq_root_bid = _endian_decode(*seq_root_bid);
2016d79432aSJung-Sang Ahn
20211c968c1SJung-Sang Ahn    if (ver_staletree_support(version)) {
20311c968c1SJung-Sang Ahn        seq_memcpy(stale_root_bid, (uint8_t *)header_buf + offset,
20411c968c1SJung-Sang Ahn                   sizeof(bid_t), offset);
20511c968c1SJung-Sang Ahn        *stale_root_bid = _endian_decode(*stale_root_bid);
20611c968c1SJung-Sang Ahn    } else {
20711c968c1SJung-Sang Ahn        *stale_root_bid = BLK_NOT_FOUND;
20811c968c1SJung-Sang Ahn    }
20911c968c1SJung-Sang Ahn
2106d79432aSJung-Sang Ahn    seq_memcpy(ndocs, (uint8_t *)header_buf + offset,
2116d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
2126d79432aSJung-Sang Ahn    *ndocs = _endian_decode(*ndocs);
213e4935a20SJung-Sang Ahn    if (ver_is_atleast_magic_001(version)) {
214b609b73eSSundar Sridharan        seq_memcpy(ndeletes, (uint8_t *)header_buf + offset,
215b609b73eSSundar Sridharan                   sizeof(uint64_t), offset);
216b609b73eSSundar Sridharan        *ndeletes = _endian_decode(*ndeletes);
217b609b73eSSundar Sridharan    } else {
218b609b73eSSundar Sridharan        *ndeletes = 0;
219b609b73eSSundar Sridharan    }
2206d79432aSJung-Sang Ahn
221e8dd5304SJung-Sang Ahn    seq_memcpy(nlivenodes, (uint8_t *)header_buf + offset,
222e8dd5304SJung-Sang Ahn               sizeof(uint64_t), offset);
223e8dd5304SJung-Sang Ahn    *nlivenodes = _endian_decode(*nlivenodes);
224e8dd5304SJung-Sang Ahn
2256d79432aSJung-Sang Ahn    seq_memcpy(datasize, (uint8_t *)header_buf + offset,
2266d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
2276d79432aSJung-Sang Ahn    *datasize = _endian_decode(*datasize);
2286d79432aSJung-Sang Ahn
22940a23059SJung-Sang Ahn    seq_memcpy(last_wal_flush_hdr_bid, (uint8_t *)header_buf + offset,
2306d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
23140a23059SJung-Sang Ahn    *last_wal_flush_hdr_bid = _endian_decode(*last_wal_flush_hdr_bid);
2326d79432aSJung-Sang Ahn
233d32401daSJung-Sang Ahn    seq_memcpy(kv_info_offset, (uint8_t *)header_buf + offset,
234d32401daSJung-Sang Ahn               sizeof(uint64_t), offset);
235d32401daSJung-Sang Ahn    *kv_info_offset = _endian_decode(*kv_info_offset);
236d32401daSJung-Sang Ahn
237d32401daSJung-Sang Ahn    seq_memcpy(header_flags, (uint8_t *)header_buf + offset,
238d32401daSJung-Sang Ahn               sizeof(uint64_t), offset);
239d32401daSJung-Sang Ahn    *header_flags = _endian_decode(*header_flags);
240d32401daSJung-Sang Ahn
2416d79432aSJung-Sang Ahn    seq_memcpy(&new_filename_len, (uint8_t *)header_buf + offset,
2424a0379f5SJung-Sang Ahn               sizeof(new_filename_len), offset);
2434a0379f5SJung-Sang Ahn    new_filename_len = _endian_decode(new_filename_len);
2446d79432aSJung-Sang Ahn    seq_memcpy(&old_filename_len, (uint8_t *)header_buf + offset,
2454a0379f5SJung-Sang Ahn               sizeof(old_filename_len), offset);
2464a0379f5SJung-Sang Ahn    old_filename_len = _endian_decode(old_filename_len);
2477b78f9b0SChiyoung Seo    if (new_filename_len) {
2488e1b9ec2SJung-Sang Ahn        *new_filename = (char*)((uint8_t *)header_buf + offset);
2492ff77207SChiyoung Seo    } else {
2502ff77207SChiyoung Seo        *new_filename = NULL;
25193d8acb2SSundar Sridharan    }
2524a0379f5SJung-Sang Ahn    offset += new_filename_len;
2533ad384e0SSundar Sridharan    if (old_filename && old_filename_len) {
254894f9fa6SSundar Sridharan        *old_filename = (char *) malloc(old_filename_len);
2554a0379f5SJung-Sang Ahn        seq_memcpy(*old_filename,
2564a0379f5SJung-Sang Ahn                   (uint8_t *)header_buf + offset,
2574a0379f5SJung-Sang Ahn                   old_filename_len, offset);
25893d8acb2SSundar Sridharan    }
259e6449f52SJung-Sang Ahn}
260e6449f52SJung-Sang Ahn
261f0b1bf77SJung-Sang Ahn// read the revnum of the given header of BID
262f0b1bf77SJung-Sang AhnINLINE filemgr_header_revnum_t _fdb_get_header_revnum(fdb_kvs_handle *handle, bid_t bid)
263f0b1bf77SJung-Sang Ahn{
264f0b1bf77SJung-Sang Ahn    uint8_t *buf = alca(uint8_t, handle->file->blocksize);
265f0b1bf77SJung-Sang Ahn    uint64_t version;
266f0b1bf77SJung-Sang Ahn    size_t header_len;
267f0b1bf77SJung-Sang Ahn    fdb_seqnum_t seqnum;
268f0b1bf77SJung-Sang Ahn    filemgr_header_revnum_t revnum = 0;
269f0b1bf77SJung-Sang Ahn    fdb_status fs;
270f0b1bf77SJung-Sang Ahn
271f0b1bf77SJung-Sang Ahn    fs = filemgr_fetch_header(handle->file, bid, buf, &header_len,
272f0b1bf77SJung-Sang Ahn                              &seqnum, &revnum, NULL, &version, NULL,
273f0b1bf77SJung-Sang Ahn                              &handle->log_callback);
274f0b1bf77SJung-Sang Ahn    if (fs != FDB_RESULT_SUCCESS) {
275f0b1bf77SJung-Sang Ahn        return 0;
276f0b1bf77SJung-Sang Ahn    }
277f0b1bf77SJung-Sang Ahn    return revnum;
278f0b1bf77SJung-Sang Ahn}
279f0b1bf77SJung-Sang Ahn
280f0b1bf77SJung-Sang AhnINLINE filemgr_header_revnum_t _fdb_get_bmp_revnum(fdb_kvs_handle *handle, bid_t bid)
281f0b1bf77SJung-Sang Ahn{
282f0b1bf77SJung-Sang Ahn    uint8_t *buf = alca(uint8_t, handle->file->blocksize);
283f0b1bf77SJung-Sang Ahn    uint64_t version, bmp_revnum = 0;
284f0b1bf77SJung-Sang Ahn    size_t header_len;
285f0b1bf77SJung-Sang Ahn    fdb_seqnum_t seqnum;
286f0b1bf77SJung-Sang Ahn    filemgr_header_revnum_t revnum;
287f0b1bf77SJung-Sang Ahn    fdb_status fs;
288f0b1bf77SJung-Sang Ahn
289f0b1bf77SJung-Sang Ahn    fs = filemgr_fetch_header(handle->file, bid, buf, &header_len,
290f0b1bf77SJung-Sang Ahn                              &seqnum, &revnum, NULL, &version, &bmp_revnum,
291f0b1bf77SJung-Sang Ahn                              &handle->log_callback);
292f0b1bf77SJung-Sang Ahn    if (fs != FDB_RESULT_SUCCESS) {
293f0b1bf77SJung-Sang Ahn        return 0;
294f0b1bf77SJung-Sang Ahn    }
295f0b1bf77SJung-Sang Ahn    return bmp_revnum;
296f0b1bf77SJung-Sang Ahn}
297f0b1bf77SJung-Sang Ahn
// Log callback that discards everything it is given.
//
// err_code, err_msg, ctx_data: ignored; nothing is read, written or printed.
void fdb_dummy_log_callback(int err_code, const char *err_msg, void *ctx_data)
{
    (void)err_code;
    (void)err_msg;
    (void)ctx_data;
}
305b84960c2SJung-Sang Ahn
306e6e251adSSundar SridharanINLINE void _fdb_restore_wal(fdb_kvs_handle *handle,
307d32401daSJung-Sang Ahn                             fdb_restore_mode_t mode,
308d32401daSJung-Sang Ahn                             bid_t hdr_bid,
309d32401daSJung-Sang Ahn                             fdb_kvs_id_t kv_id_req)
310abe1d5a9SSundar Sridharan{
311abe1d5a9SSundar Sridharan    struct filemgr *file = handle->file;
312abe1d5a9SSundar Sridharan    uint32_t blocksize = handle->file->blocksize;
31340a23059SJung-Sang Ahn    uint64_t last_wal_flush_hdr_bid = handle->last_wal_flush_hdr_bid;
3143ad384e0SSundar Sridharan    uint64_t hdr_off = hdr_bid * FDB_BLOCKSIZE;
315abe1d5a9SSundar Sridharan    uint64_t offset = 0; //assume everything from first block needs restoration
316f0b1bf77SJung-Sang Ahn    uint64_t filesize = filemgr_get_pos(handle->file);
317f0b1bf77SJung-Sang Ahn    uint64_t doc_scan_limit;
318f0b1bf77SJung-Sang Ahn    uint64_t start_bmp_revnum, stop_bmp_revnum;
319f0b1bf77SJung-Sang Ahn    uint64_t cur_bmp_revnum = (uint64_t)-1;
32060b03adbSJung-Sang Ahn    bid_t next_doc_block = BLK_NOT_FOUND;
32134e10f85SSundar Sridharan    struct _fdb_key_cmp_info cmp_info;
322dad7792aSSundar Sridharan    err_log_callback *log_callback;
323abe1d5a9SSundar Sridharan
3247e4d9806SSundar Sridharan    if (mode == FDB_RESTORE_NORMAL && !handle->shandle &&
3257e4d9806SSundar Sridharan        !wal_try_restore(handle->file)) { // Atomically try to restore WAL
3267e4d9806SSundar Sridharan        // Some other thread or previous open had successfully initialized WAL
3277e4d9806SSundar Sridharan        // We can simply return here
3287e4d9806SSundar Sridharan        return;
3297e4d9806SSundar Sridharan    }
3307e4d9806SSundar Sridharan
331ae797c0bSSundar Sridharan    if (!hdr_off) { // Nothing to do if we don't have a header block offset
332ae797c0bSSundar Sridharan        return;
333ae797c0bSSundar Sridharan    }
334ae797c0bSSundar Sridharan
33540a23059SJung-Sang Ahn    if (last_wal_flush_hdr_bid != BLK_NOT_FOUND) {
33640a23059SJung-Sang Ahn        offset = (last_wal_flush_hdr_bid + 1) * blocksize;
337abe1d5a9SSundar Sridharan    }
338abe1d5a9SSundar Sridharan
339abe1d5a9SSundar Sridharan    // If a valid last header was retrieved and it matches the current header
3407e4d9806SSundar Sridharan    if (hdr_off == offset || hdr_bid == last_wal_flush_hdr_bid) {
3417e4d9806SSundar Sridharan        return; // No WAL section in the file
342abe1d5a9SSundar Sridharan    }
343abe1d5a9SSundar Sridharan
344da9194a1SJung-Sang Ahn    if (mode == FDB_RESTORE_NORMAL && !handle->shandle) {
345da9194a1SJung-Sang Ahn        // for normal WAL restore, set status to dirty
346671823e3SChiyoung Seo        // (only when the previous status is clean or dirty)
347671823e3SChiyoung Seo        wal_set_dirty_status(handle->file, FDB_WAL_DIRTY, true);
348da9194a1SJung-Sang Ahn    }
349da9194a1SJung-Sang Ahn
350d106c48cSChiyoung Seo    // Temporarily disable the error logging callback as there are false positive
351d106c48cSChiyoung Seo    // checksum errors in docio_read_doc.
352d106c48cSChiyoung Seo    // TODO: Need to adapt docio_read_doc to separate false checksum errors.
353b84960c2SJung-Sang Ahn    err_log_callback dummy_cb;
354dad7792aSSundar Sridharan    log_callback = handle->dhandle->log_callback;
355b84960c2SJung-Sang Ahn    dummy_cb.callback = fdb_dummy_log_callback;
356b84960c2SJung-Sang Ahn    dummy_cb.ctx_data = NULL;
357b84960c2SJung-Sang Ahn    handle->dhandle->log_callback = &dummy_cb;
358d106c48cSChiyoung Seo
359e62631c5SChiyoung Seo    if (!handle->shandle) {
360e62631c5SChiyoung Seo        filemgr_mutex_lock(file);
361e62631c5SChiyoung Seo    }
36234e10f85SSundar Sridharan    cmp_info.kvs_config = handle->kvs_config;
36334e10f85SSundar Sridharan    cmp_info.kvs = handle->kvs;
364f0b1bf77SJung-Sang Ahn
365f0b1bf77SJung-Sang Ahn    start_bmp_revnum = _fdb_get_bmp_revnum(handle, last_wal_flush_hdr_bid);
366f0b1bf77SJung-Sang Ahn    stop_bmp_revnum= _fdb_get_bmp_revnum(handle, hdr_bid);
367f0b1bf77SJung-Sang Ahn    cur_bmp_revnum = start_bmp_revnum;
368f0b1bf77SJung-Sang Ahn
369f0b1bf77SJung-Sang Ahn    // A: reused blocks during the 1st block reclaim (bmp_revnum: 1)
370f0b1bf77SJung-Sang Ahn    // B: reused blocks during the 2nd block reclaim (bmp_revnum: 2)
371f0b1bf77SJung-Sang Ahn    // otherwise: live block (bmp_revnum: 0)
372f0b1bf77SJung-Sang Ahn    //  1 2   3    4    5 6  7  8   9  10
373f0b1bf77SJung-Sang Ahn    // +-------------------------------------------+
374f0b1bf77SJung-Sang Ahn    // |  BBBBAAAAABBBBB  AAABBB    AAA            |
375f0b1bf77SJung-Sang Ahn    // +-------------------------------------------+
376f0b1bf77SJung-Sang Ahn    //              ^                     ^
377f0b1bf77SJung-Sang Ahn    //              hdr_bid               last_wal_flush
378f0b1bf77SJung-Sang Ahn    //
379f0b1bf77SJung-Sang Ahn    // scan order: 1 -> 5 -> 8 -> 10 -> 3 -> 6 -> 9 -> 2 -> 4 -> 7
380f0b1bf77SJung-Sang Ahn    // iteration #1: scan docs with bmp_revnum==0 in [last_wal_flush ~ filesize]
381f0b1bf77SJung-Sang Ahn    // iteration #2: scan docs with bmp_revnum==1 in [0 ~ filesize]
382f0b1bf77SJung-Sang Ahn    // iteration #3: scan docs with bmp_revnum==2 in [0 ~ hdr_bid]
383f0b1bf77SJung-Sang Ahn
384f0b1bf77SJung-Sang Ahn    do {
385f0b1bf77SJung-Sang Ahn        if (cur_bmp_revnum > stop_bmp_revnum) {
386f0b1bf77SJung-Sang Ahn            break;
387f0b1bf77SJung-Sang Ahn        } else if (cur_bmp_revnum == stop_bmp_revnum) {
3881a6d57c6SJung-Sang Ahn
389b5540fc3SJung-Sang Ahn            bid_t sb_last_hdr_bid = BLK_NOT_FOUND;
390b5540fc3SJung-Sang Ahn            if (handle->file->sb) {
391b5540fc3SJung-Sang Ahn                sb_last_hdr_bid = atomic_get_uint64_t(&handle->file->sb->last_hdr_bid);
392b5540fc3SJung-Sang Ahn            }
393c7f1c121SJung-Sang Ahn            if (!handle->shandle && handle->file->sb &&
394b5540fc3SJung-Sang Ahn                sb_last_hdr_bid != BLK_NOT_FOUND) {
395b5540fc3SJung-Sang Ahn                hdr_off = (sb_last_hdr_bid+1) * blocksize;
3961a6d57c6SJung-Sang Ahn            }
3971a6d57c6SJung-Sang Ahn
398f0b1bf77SJung-Sang Ahn            doc_scan_limit = hdr_off;
399f0b1bf77SJung-Sang Ahn            if (offset >= hdr_off) {
400f0b1bf77SJung-Sang Ahn                break;
401f0b1bf77SJung-Sang Ahn            }
402f0b1bf77SJung-Sang Ahn        } else {
403f0b1bf77SJung-Sang Ahn            doc_scan_limit = filesize;
404f0b1bf77SJung-Sang Ahn        }
405f0b1bf77SJung-Sang Ahn
406f0b1bf77SJung-Sang Ahn        if (!docio_check_buffer(handle->dhandle, offset / blocksize,
407f0b1bf77SJung-Sang Ahn                                cur_bmp_revnum)) {
408f0b1bf77SJung-Sang Ahn            // not a document block .. move to next block
409c337f3f7SJung-Sang Ahn        } else {
410c337f3f7SJung-Sang Ahn            do {
411c337f3f7SJung-Sang Ahn                struct docio_object doc;
412ce30d60aSChiyoung Seo                int64_t _offset;
41359c1c4f5SJung-Sang Ahn                uint64_t doc_offset;
414c337f3f7SJung-Sang Ahn                memset(&doc, 0, sizeof(doc));
415c6c3d274SChiyoung Seo                _offset = docio_read_doc(handle->dhandle, offset, &doc, true);
416ce30d60aSChiyoung Seo                if (_offset <= 0) { // reached unreadable doc, skip block
417ce30d60aSChiyoung Seo                    // TODO: Need to have this function return fdb_status, so that
418ce30d60aSChiyoung Seo                    // WAL restore operation should fail if offset < 0
419b787ba1bSSundar Sridharan                    break;
420ce30d60aSChiyoung Seo                } else if ((uint64_t)_offset < offset) {
42160b03adbSJung-Sang Ahn                    // If more than one writer is appending docs concurrently,
42260b03adbSJung-Sang Ahn                    // they have their own doc block linked list and doc blocks
42360b03adbSJung-Sang Ahn                    // may not be consecutive. For example,
42460b03adbSJung-Sang Ahn                    //
42560b03adbSJung-Sang Ahn                    // Writer 1): 100 -> 102 -> 2 -> 4     | commit
42660b03adbSJung-Sang Ahn                    // Writer 2):    101 - > 103 -> 3 -> 5 |
42760b03adbSJung-Sang Ahn                    //
42860b03adbSJung-Sang Ahn                    // In this case, if we read doc BID 102, then 'offset' will jump
42960b03adbSJung-Sang Ahn                    // to doc BID 2, without reading BID 103.
43060b03adbSJung-Sang Ahn                    //
43160b03adbSJung-Sang Ahn                    // To address this issue, in case that 'offset' decreases,
43260b03adbSJung-Sang Ahn                    // remember the next doc block, and follow the doc linked list
43360b03adbSJung-Sang Ahn                    // first. After the linked list ends, 'offset' cursor will be
43460b03adbSJung-Sang Ahn                    // reset to 'next_doc_block'.
43560b03adbSJung-Sang Ahn                    next_doc_block = (offset / blocksize) + 1;
436b787ba1bSSundar Sridharan                }
43759c1c4f5SJung-Sang Ahn                if (doc.key || (doc.length.flag & DOCIO_TXN_COMMITTED)) {
438d32401daSJung-Sang Ahn                    // check if the doc is transactional or not, and
439d32401daSJung-Sang Ahn                    // also check if the doc contains system info
440d32401daSJung-Sang Ahn                    if (!(doc.length.flag & DOCIO_TXN_DIRTY) &&
441d32401daSJung-Sang Ahn                        !(doc.length.flag & DOCIO_SYSTEM)) {
44259c1c4f5SJung-Sang Ahn                        if (doc.length.flag & DOCIO_TXN_COMMITTED) {
44359c1c4f5SJung-Sang Ahn                            // commit mark .. read doc offset
44459c1c4f5SJung-Sang Ahn                            doc_offset = doc.doc_offset;
44559c1c4f5SJung-Sang Ahn                            // read the previously skipped doc
446ce30d60aSChiyoung Seo                            if (docio_read_doc(handle->dhandle, doc_offset, &doc, true) <= 0) {
447ce30d60aSChiyoung Seo                                // doc read error
448ce30d60aSChiyoung Seo                                free(doc.key);
44959c1c4f5SJung-Sang Ahn                                free(doc.meta);
45059c1c4f5SJung-Sang Ahn                                free(doc.body);
45159c1c4f5SJung-Sang Ahn                                offset = _offset;
45259c1c4f5SJung-Sang Ahn                                continue;
45359c1c4f5SJung-Sang Ahn                            }
45459c1c4f5SJung-Sang Ahn                        } else {
45559c1c4f5SJung-Sang Ahn                            doc_offset = offset;
45659c1c4f5SJung-Sang Ahn                        }
45759c1c4f5SJung-Sang Ahn
4582fe34a0cSSundar Sridharan                        // If say a snapshot is taken on a db handle after
4592fe34a0cSSundar Sridharan                        // rollback, then skip WAL items after rollback point
46027fba53dSChiyoung Seo                        if ((mode == FDB_RESTORE_KV_INS || !handle->kvs) &&
461d32401daSJung-Sang Ahn                            doc.seqnum > handle->seqnum) {
4622fe34a0cSSundar Sridharan                            free(doc.key);
4632fe34a0cSSundar Sridharan                            free(doc.meta);
4642fe34a0cSSundar Sridharan                            free(doc.body);
4652fe34a0cSSundar Sridharan                            offset = _offset;
4662fe34a0cSSundar Sridharan                            continue;
4672fe34a0cSSundar Sridharan                        }
4682fe34a0cSSundar Sridharan
46959c1c4f5SJung-Sang Ahn                        // restore document
47059c1c4f5SJung-Sang Ahn                        fdb_doc wal_doc;
47159c1c4f5SJung-Sang Ahn                        wal_doc.keylen = doc.length.keylen;
47259c1c4f5SJung-Sang Ahn                        wal_doc.bodylen = doc.length.bodylen;
47359c1c4f5SJung-Sang Ahn                        wal_doc.key = doc.key;
47459c1c4f5SJung-Sang Ahn                        wal_doc.seqnum = doc.seqnum;
47559c1c4f5SJung-Sang Ahn                        wal_doc.deleted = doc.length.flag & DOCIO_DELETED;
47659c1c4f5SJung-Sang Ahn
47759c1c4f5SJung-Sang Ahn                        if (!handle->shandle) {
47859c1c4f5SJung-Sang Ahn                            wal_doc.metalen = doc.length.metalen;
47959c1c4f5SJung-Sang Ahn                            wal_doc.meta = doc.meta;
48059c1c4f5SJung-Sang Ahn                            wal_doc.size_ondisk = _fdb_get_docsize(doc.length);
48159c1c4f5SJung-Sang Ahn
482d32401daSJung-Sang Ahn                            if (handle->kvs) {
483d32401daSJung-Sang Ahn                                // check seqnum before insert
48447025c39SJung-Sang Ahn                                fdb_kvs_id_t kv_id;
485d32401daSJung-Sang Ahn                                fdb_seqnum_t kv_seqnum;
48647025c39SJung-Sang Ahn                                buf2kvid(handle->config.chunksize,
48747025c39SJung-Sang Ahn                                         wal_doc.key, &kv_id);
488d32401daSJung-Sang Ahn
48927fba53dSChiyoung Seo                                kv_seqnum = fdb_kvs_get_seqnum(handle->file, kv_id);
490d32401daSJung-Sang Ahn                                if (doc.seqnum <= kv_seqnum &&
491d32401daSJung-Sang Ahn                                        ((mode == FDB_RESTORE_KV_INS &&
492d32401daSJung-Sang Ahn                                            kv_id == kv_id_req) ||
493d32401daSJung-Sang Ahn                                         (mode == FDB_RESTORE_NORMAL)) ) {
494d32401daSJung-Sang Ahn                                    // if mode is NORMAL, restore all items
495d32401daSJung-Sang Ahn                                    // if mode is KV_INS, restore items matching ID
49634e10f85SSundar Sridharan                                    wal_insert(&file->global_txn, file, &cmp_info,
4978f753a2dSSundar Sridharan                                               &wal_doc, doc_offset,
4988f753a2dSSundar Sridharan                                               WAL_INS_WRITER);
499d32401daSJung-Sang Ahn                                }
500d32401daSJung-Sang Ahn                            } else {
50134e10f85SSundar Sridharan                                wal_insert(&file->global_txn, file, &cmp_info,
5028f753a2dSSundar Sridharan                                           &wal_doc, doc_offset,
5038f753a2dSSundar Sridharan                                           WAL_INS_WRITER);
504d32401daSJung-Sang Ahn                            }
50559c1c4f5SJung-Sang Ahn                            if (doc.key) free(doc.key);
50659c1c4f5SJung-Sang Ahn                        } else {
507105f8b13SJung-Sang Ahn                            // snapshot
508d32401daSJung-Sang Ahn                            if (handle->kvs) {
50947025c39SJung-Sang Ahn                                fdb_kvs_id_t kv_id;
51047025c39SJung-Sang Ahn                                buf2kvid(handle->config.chunksize,
51147025c39SJung-Sang Ahn                                         wal_doc.key, &kv_id);
512d32401daSJung-Sang Ahn                                if (kv_id == handle->kvs->id) {
513d32401daSJung-Sang Ahn                                    // snapshot: insert ID matched documents only
51434e10f85SSundar Sridharan                                    wal_snap_insert(handle->shandle,
51534e10f85SSundar Sridharan                                                    &wal_doc, doc_offset);
516d32401daSJung-Sang Ahn                                } else {
517d32401daSJung-Sang Ahn                                    free(doc.key);
518d32401daSJung-Sang Ahn                                }
519d32401daSJung-Sang Ahn                            } else {
52034e10f85SSundar Sridharan                                wal_snap_insert(handle->shandle, &wal_doc,
52134e10f85SSundar Sridharan                                                doc_offset);
522d32401daSJung-Sang Ahn                            }
52359c1c4f5SJung-Sang Ahn                        }
52459c1c4f5SJung-Sang Ahn                        free(doc.meta);
52559c1c4f5SJung-Sang Ahn                        free(doc.body);
52659c1c4f5SJung-Sang Ahn                        offset = _offset;
5273ad384e0SSundar Sridharan                    } else {
528d32401daSJung-Sang Ahn                        // skip transactional document or system document
52959c1c4f5SJung-Sang Ahn                        free(doc.key);
53059c1c4f5SJung-Sang Ahn                        free(doc.meta);
53159c1c4f5SJung-Sang Ahn                        free(doc.body);
53259c1c4f5SJung-Sang Ahn                        offset = _offset;
53359c1c4f5SJung-Sang Ahn                        // do not break.. read next doc
5343ad384e0SSundar Sridharan                    }
535c337f3f7SJung-Sang Ahn                } else {
53659c1c4f5SJung-Sang Ahn                    free(doc.key);
53759c1c4f5SJung-Sang Ahn                    free(doc.meta);
53859c1c4f5SJung-Sang Ahn                    free(doc.body);
539c337f3f7SJung-Sang Ahn                    offset = _offset;
540c337f3f7SJung-Sang Ahn                    break;
541c337f3f7SJung-Sang Ahn                }
542f0b1bf77SJung-Sang Ahn            } while (offset + sizeof(struct docio_length) < doc_scan_limit);
543c337f3f7SJung-Sang Ahn        }
544f0b1bf77SJung-Sang Ahn
54560b03adbSJung-Sang Ahn        if (next_doc_block != BLK_NOT_FOUND) {
54660b03adbSJung-Sang Ahn            offset = next_doc_block * blocksize;
54760b03adbSJung-Sang Ahn            next_doc_block = BLK_NOT_FOUND;
54860b03adbSJung-Sang Ahn        } else {
54960b03adbSJung-Sang Ahn            offset = ((offset / blocksize) + 1) * blocksize;
55060b03adbSJung-Sang Ahn        }
551f0b1bf77SJung-Sang Ahn        if (ver_superblock_support(handle->file->version) &&
552f0b1bf77SJung-Sang Ahn            offset >= filesize) {
553f0b1bf77SJung-Sang Ahn            // circular scan
5543f8ea205Sabhinavdangeti            struct superblock *sb = handle->file->sb;
5553f8ea205Sabhinavdangeti            if (sb && sb->config) {
5563f8ea205Sabhinavdangeti                offset = blocksize * sb->config->num_sb;
5573f8ea205Sabhinavdangeti                cur_bmp_revnum++;
5583f8ea205Sabhinavdangeti            }
559f0b1bf77SJung-Sang Ahn        }
560f0b1bf77SJung-Sang Ahn    } while(true);
561f0b1bf77SJung-Sang Ahn
56237555280SJung-Sang Ahn    // wal commit
56337555280SJung-Sang Ahn    if (!handle->shandle) {
564272a199dSChiyoung Seo        wal_commit(&file->global_txn, file, NULL, &handle->log_callback);
565e62631c5SChiyoung Seo        filemgr_mutex_unlock(file);
56637555280SJung-Sang Ahn    }
567d106c48cSChiyoung Seo    handle->dhandle->log_callback = log_callback;
568abe1d5a9SSundar Sridharan}
569abe1d5a9SSundar Sridharan
570e6e251adSSundar SridharanINLINE fdb_status _fdb_recover_compaction(fdb_kvs_handle *handle,
571041beb8aSChiyoung Seo                                          const char *new_filename)
572505f35b8SJung-Sang Ahn{
573e6e251adSSundar Sridharan    fdb_kvs_handle new_db;
574d5c97dcbSSundar Sridharan    fdb_config config = handle->config;
575505f35b8SJung-Sang Ahn    struct filemgr *new_file;
576894f9fa6SSundar Sridharan
577709b3fd6SJung-Sang Ahn    // As partially compacted file may contain various errors,
578709b3fd6SJung-Sang Ahn    // we temporarily disable log callback for compaction recovery.
5793ad384e0SSundar Sridharan    memset(&new_db, 0, sizeof(new_db));
580709b3fd6SJung-Sang Ahn    new_db.log_callback.callback = NULL;
581709b3fd6SJung-Sang Ahn    new_db.log_callback.ctx_data = NULL;
58288a8c486SChiyoung Seo    config.flags |= FDB_OPEN_FLAG_RDONLY;
583d32401daSJung-Sang Ahn    new_db.fhandle = handle->fhandle;