1bc68bb02SChiyoung Seo/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
27c0433f5SJung-Sang Ahn/*
3bc68bb02SChiyoung Seo *     Copyright 2010 Couchbase, Inc
4bc68bb02SChiyoung Seo *
5bc68bb02SChiyoung Seo *   Licensed under the Apache License, Version 2.0 (the "License");
6bc68bb02SChiyoung Seo *   you may not use this file except in compliance with the License.
7bc68bb02SChiyoung Seo *   You may obtain a copy of the License at
8bc68bb02SChiyoung Seo *
9bc68bb02SChiyoung Seo *       http://www.apache.org/licenses/LICENSE-2.0
10bc68bb02SChiyoung Seo *
11bc68bb02SChiyoung Seo *   Unless required by applicable law or agreed to in writing, software
12bc68bb02SChiyoung Seo *   distributed under the License is distributed on an "AS IS" BASIS,
13bc68bb02SChiyoung Seo *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14bc68bb02SChiyoung Seo *   See the License for the specific language governing permissions and
15bc68bb02SChiyoung Seo *   limitations under the License.
167c0433f5SJung-Sang Ahn */
177c0433f5SJung-Sang Ahn
187c0433f5SJung-Sang Ahn#include <stdio.h>
197c0433f5SJung-Sang Ahn#include <stdlib.h>
207c0433f5SJung-Sang Ahn#include <string.h>
212534ac38SJung-Sang Ahn#include <fcntl.h>
221907e9beSJung-Sang Ahn#include <time.h>
231907e9beSJung-Sang Ahn#if !defined(WIN32) && !defined(_WIN32)
241907e9beSJung-Sang Ahn#include <sys/time.h>
251907e9beSJung-Sang Ahn#endif
267c0433f5SJung-Sang Ahn
2708a30bf6SChiyoung Seo#include "libforestdb/forestdb.h"
28eb364281SChiyoung Seo#include "fdb_internal.h"
297c0433f5SJung-Sang Ahn#include "filemgr.h"
307c0433f5SJung-Sang Ahn#include "hbtrie.h"
3159c1c4f5SJung-Sang Ahn#include "list.h"
32762e6a5dSJung-Sang Ahn#include "btree.h"
337be5b070SJung-Sang Ahn#include "btree_kv.h"
34c9a3f3c3SJung-Sang Ahn#include "btree_var_kv_ops.h"
357c0433f5SJung-Sang Ahn#include "docio.h"
367c0433f5SJung-Sang Ahn#include "btreeblock.h"
377c0433f5SJung-Sang Ahn#include "common.h"
387c0433f5SJung-Sang Ahn#include "wal.h"
393ad384e0SSundar Sridharan#include "snapshot.h"
40f693a021SSundar Sridharan#include "filemgr_ops.h"
4188a8c486SChiyoung Seo#include "configuration.h"
4265b92d59SChiyoung Seo#include "internal_types.h"
434fcd9f6eSJung-Sang Ahn#include "compactor.h"
443d812dfcSJung-Sang Ahn#include "memleak.h"
452ff77207SChiyoung Seo#include "time_utils.h"
46c5675c59SChiyoung Seo#include "system_resource_stats.h"
473d812dfcSJung-Sang Ahn
48bb05885fSJung-Sang Ahn#ifdef __DEBUG
49bb05885fSJung-Sang Ahn#ifndef __DEBUG_FDB
502889254eSJung-Sang Ahn    #undef DBG
512889254eSJung-Sang Ahn    #undef DBGCMD
52eea9c5e9SJung-Sang Ahn    #undef DBGSW
53ceca3b9fSJung-Sang Ahn    #define DBG(...)
54ceca3b9fSJung-Sang Ahn    #define DBGCMD(...)
55ceca3b9fSJung-Sang Ahn    #define DBGSW(n, ...)
56bb05885fSJung-Sang Ahn#endif
57bb05885fSJung-Sang Ahn#endif
587c0433f5SJung-Sang Ahn
#ifdef _TRACE_HANDLES
// Debug-only (_TRACE_HANDLES) registry of KV store handles; both objects are
// initialized by fdb_init().
struct avl_tree open_handles;
// NOTE(review): presumably guards 'open_handles' -- confirm against its users.
static spin_t open_handle_lock;

/*
 * AVL comparator for 'open_handles': orders entries by the address of the
 * enclosing _fdb_kvs_handle.
 *
 * Returns 1 when a's handle address is greater than b's, otherwise -1.
 * Note that it never returns 0, not even when both nodes belong to the
 * same handle. 'aux' is unused.
 */
static int _fdb_handle_cmp(struct avl_node *a, struct avl_node *b, void *aux)
{
    struct _fdb_kvs_handle *lhs =
        _get_entry(a, struct _fdb_kvs_handle, avl_trace);
    struct _fdb_kvs_handle *rhs =
        _get_entry(b, struct _fdb_kvs_handle, avl_trace);

    if (lhs > rhs) {
        return 1;
    }
    return -1;
}
#endif
70b6be7a1dSSundar Sridharan
7159c1c4f5SJung-Sang Ahnstatic volatile uint8_t fdb_initialized = 0;
72f30bfecdSSundar Sridharanstatic volatile uint8_t fdb_open_inprog = 0;
7359c1c4f5SJung-Sang Ahn#ifdef SPIN_INITIALIZER
7459c1c4f5SJung-Sang Ahnstatic spin_t initial_lock = SPIN_INITIALIZER;
7559c1c4f5SJung-Sang Ahn#else
7659c1c4f5SJung-Sang Ahnstatic volatile unsigned int initial_lock_status = 0;
7759c1c4f5SJung-Sang Ahnstatic spin_t initial_lock;
7859c1c4f5SJung-Sang Ahn#endif
7959c1c4f5SJung-Sang Ahn
80c99459d2SSundar Sridharanstatic fdb_status _fdb_wal_snapshot_func(void *handle, fdb_doc *doc,
81ae797c0bSSundar Sridharan                                         uint64_t offset);
82ae797c0bSSundar Sridharan
83c9a3f3c3SJung-Sang AhnINLINE int _cmp_uint64_t_endian_safe(void *key1, void *key2, void *aux)
846d79432aSJung-Sang Ahn{
85c9a3f3c3SJung-Sang Ahn    (void) aux;
866d79432aSJung-Sang Ahn    uint64_t a,b;
876d79432aSJung-Sang Ahn    a = *(uint64_t*)key1;
886d79432aSJung-Sang Ahn    b = *(uint64_t*)key2;
896d79432aSJung-Sang Ahn    a = _endian_decode(a);
906d79432aSJung-Sang Ahn    b = _endian_decode(b);
916d79432aSJung-Sang Ahn    return _CMP_U64(a, b);
926d79432aSJung-Sang Ahn}
936d79432aSJung-Sang Ahn
94d32401daSJung-Sang Ahnsize_t _fdb_readkey_wrap(void *handle, uint64_t offset, void *buf)
957c0433f5SJung-Sang Ahn{
962889254eSJung-Sang Ahn    keylen_t keylen;
976d79432aSJung-Sang Ahn    offset = _endian_decode(offset);
982889254eSJung-Sang Ahn    docio_read_doc_key((struct docio_handle *)handle, offset, &keylen, buf);
992889254eSJung-Sang Ahn    return keylen;
1007c0433f5SJung-Sang Ahn}
1017c0433f5SJung-Sang Ahn
102d32401daSJung-Sang Ahnsize_t _fdb_readseq_wrap(void *handle, uint64_t offset, void *buf)
103c9a3f3c3SJung-Sang Ahn{
10447025c39SJung-Sang Ahn    int size_id, size_seq, size_chunk;
105d32401daSJung-Sang Ahn    fdb_seqnum_t _seqnum;
106d32401daSJung-Sang Ahn    struct docio_object doc;
10747025c39SJung-Sang Ahn    struct docio_handle *dhandle = (struct docio_handle *)handle;
108d32401daSJung-Sang Ahn
109d32401daSJung-Sang Ahn    size_id = sizeof(fdb_kvs_id_t);
110d32401daSJung-Sang Ahn    size_seq = sizeof(fdb_seqnum_t);
11147025c39SJung-Sang Ahn    size_chunk = dhandle->file->config->chunksize;
112d32401daSJung-Sang Ahn    memset(&doc, 0, sizeof(struct docio_object));
113d32401daSJung-Sang Ahn
114d32401daSJung-Sang Ahn    offset = _endian_decode(offset);
1157171f367SSundar Sridharan    docio_read_doc_key_meta((struct docio_handle *)handle, offset, &doc,
1167171f367SSundar Sridharan                            true);
11747025c39SJung-Sang Ahn    buf2buf(size_chunk, doc.key, size_id, buf);
118d32401daSJung-Sang Ahn    _seqnum = _endian_encode(doc.seqnum);
119d32401daSJung-Sang Ahn    memcpy((uint8_t*)buf + size_id, &_seqnum, size_seq);
120d32401daSJung-Sang Ahn
121d32401daSJung-Sang Ahn    free(doc.key);
122d32401daSJung-Sang Ahn    free(doc.meta);
123d32401daSJung-Sang Ahn
124d32401daSJung-Sang Ahn    return size_id + size_seq;
125c9a3f3c3SJung-Sang Ahn}
126c9a3f3c3SJung-Sang Ahn
127d32401daSJung-Sang Ahnint _fdb_custom_cmp_wrap(void *key1, void *key2, void *aux)
128c9a3f3c3SJung-Sang Ahn{
129d32401daSJung-Sang Ahn    int is_key1_inf, is_key2_inf;
130d32401daSJung-Sang Ahn    uint8_t *keystr1 = alca(uint8_t, FDB_MAX_KEYLEN_INTERNAL);
131d32401daSJung-Sang Ahn    uint8_t *keystr2 = alca(uint8_t, FDB_MAX_KEYLEN_INTERNAL);
132c9a3f3c3SJung-Sang Ahn    size_t keylen1, keylen2;
13347025c39SJung-Sang Ahn    btree_cmp_args *args = (btree_cmp_args *)aux;
13447025c39SJung-Sang Ahn    fdb_custom_cmp_variable cmp = (fdb_custom_cmp_variable)args->aux;
135d32401daSJung-Sang Ahn
136d32401daSJung-Sang Ahn    is_key1_inf = _is_inf_key(key1);
137d32401daSJung-Sang Ahn    is_key2_inf = _is_inf_key(key2);
138d32401daSJung-Sang Ahn    if (is_key1_inf && is_key2_inf) { // both are infinite
139d32401daSJung-Sang Ahn        return 0;
140d32401daSJung-Sang Ahn    } else if (!is_key1_inf && is_key2_inf) { // key2 is infinite
141d32401daSJung-Sang Ahn        return -1;
142d32401daSJung-Sang Ahn    } else if (is_key1_inf && !is_key2_inf) { // key1 is infinite
143d32401daSJung-Sang Ahn        return 1;
144d32401daSJung-Sang Ahn    }
145c9a3f3c3SJung-Sang Ahn
146c9a3f3c3SJung-Sang Ahn    _get_var_key(key1, (void*)keystr1, &keylen1);
147c9a3f3c3SJung-Sang Ahn    _get_var_key(key2, (void*)keystr2, &keylen2);
148c9a3f3c3SJung-Sang Ahn
149bc3885dcSJung-Sang Ahn    if (keylen1 == 0 && keylen2 == 0) {
150bc3885dcSJung-Sang Ahn        return 0;
151bc3885dcSJung-Sang Ahn    } else if (keylen1 ==0 && keylen2 > 0) {
152bc3885dcSJung-Sang Ahn        return -1;
153bc3885dcSJung-Sang Ahn    } else if (keylen1 > 0 && keylen2 == 0) {
154bc3885dcSJung-Sang Ahn        return 1;
155bc3885dcSJung-Sang Ahn    }
156bc3885dcSJung-Sang Ahn
157d32401daSJung-Sang Ahn    return cmp(keystr1, keylen1, keystr2, keylen2);
158c9a3f3c3SJung-Sang Ahn}
159c9a3f3c3SJung-Sang Ahn
160eb364281SChiyoung Seovoid fdb_fetch_header(void *header_buf,
161eb364281SChiyoung Seo                      bid_t *trie_root_bid,
162eb364281SChiyoung Seo                      bid_t *seq_root_bid,
163eb364281SChiyoung Seo                      uint64_t *ndocs,
164eb364281SChiyoung Seo                      uint64_t *nlivenodes,
165eb364281SChiyoung Seo                      uint64_t *datasize,
16640a23059SJung-Sang Ahn                      uint64_t *last_wal_flush_hdr_bid,
167d32401daSJung-Sang Ahn                      uint64_t *kv_info_offset,
168d32401daSJung-Sang Ahn                      uint64_t *header_flags,
169eb364281SChiyoung Seo                      char **new_filename,
170eb364281SChiyoung Seo                      char **old_filename)
171e6449f52SJung-Sang Ahn{
172e6449f52SJung-Sang Ahn    size_t offset = 0;
1734a0379f5SJung-Sang Ahn    uint16_t new_filename_len;
1744a0379f5SJung-Sang Ahn    uint16_t old_filename_len;
1756d79432aSJung-Sang Ahn
1766d79432aSJung-Sang Ahn    seq_memcpy(trie_root_bid, (uint8_t *)header_buf + offset,
1776d79432aSJung-Sang Ahn               sizeof(bid_t), offset);
1786d79432aSJung-Sang Ahn    *trie_root_bid = _endian_decode(*trie_root_bid);
1796d79432aSJung-Sang Ahn
1806d79432aSJung-Sang Ahn    seq_memcpy(seq_root_bid, (uint8_t *)header_buf + offset,
1816d79432aSJung-Sang Ahn               sizeof(bid_t), offset);
1826d79432aSJung-Sang Ahn    *seq_root_bid = _endian_decode(*seq_root_bid);
1836d79432aSJung-Sang Ahn
1846d79432aSJung-Sang Ahn    seq_memcpy(ndocs, (uint8_t *)header_buf + offset,
1856d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
1866d79432aSJung-Sang Ahn    *ndocs = _endian_decode(*ndocs);
1876d79432aSJung-Sang Ahn
188e8dd5304SJung-Sang Ahn    seq_memcpy(nlivenodes, (uint8_t *)header_buf + offset,
189e8dd5304SJung-Sang Ahn               sizeof(uint64_t), offset);
190e8dd5304SJung-Sang Ahn    *nlivenodes = _endian_decode(*nlivenodes);
191e8dd5304SJung-Sang Ahn
1926d79432aSJung-Sang Ahn    seq_memcpy(datasize, (uint8_t *)header_buf + offset,
1936d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
1946d79432aSJung-Sang Ahn    *datasize = _endian_decode(*datasize);
1956d79432aSJung-Sang Ahn
19640a23059SJung-Sang Ahn    seq_memcpy(last_wal_flush_hdr_bid, (uint8_t *)header_buf + offset,
1976d79432aSJung-Sang Ahn               sizeof(uint64_t), offset);
19840a23059SJung-Sang Ahn    *last_wal_flush_hdr_bid = _endian_decode(*last_wal_flush_hdr_bid);
1996d79432aSJung-Sang Ahn
200d32401daSJung-Sang Ahn    seq_memcpy(kv_info_offset, (uint8_t *)header_buf + offset,
201d32401daSJung-Sang Ahn               sizeof(uint64_t), offset);
202d32401daSJung-Sang Ahn    *kv_info_offset = _endian_decode(*kv_info_offset);
203d32401daSJung-Sang Ahn
204d32401daSJung-Sang Ahn    seq_memcpy(header_flags, (uint8_t *)header_buf + offset,
205d32401daSJung-Sang Ahn               sizeof(uint64_t), offset);
206d32401daSJung-Sang Ahn    *header_flags = _endian_decode(*header_flags);
207d32401daSJung-Sang Ahn
2086d79432aSJung-Sang Ahn    seq_memcpy(&new_filename_len, (uint8_t *)header_buf + offset,
2094a0379f5SJung-Sang Ahn               sizeof(new_filename_len), offset);
2104a0379f5SJung-Sang Ahn    new_filename_len = _endian_decode(new_filename_len);
2116d79432aSJung-Sang Ahn    seq_memcpy(&old_filename_len, (uint8_t *)header_buf + offset,
2124a0379f5SJung-Sang Ahn               sizeof(old_filename_len), offset);
2134a0379f5SJung-Sang Ahn    old_filename_len = _endian_decode(old_filename_len);
2147b78f9b0SChiyoung Seo    if (new_filename_len) {
2158e1b9ec2SJung-Sang Ahn        *new_filename = (char*)((uint8_t *)header_buf + offset);
2162ff77207SChiyoung Seo    } else {
2172ff77207SChiyoung Seo        *new_filename = NULL;
21893d8acb2SSundar Sridharan    }
2194a0379f5SJung-Sang Ahn    offset += new_filename_len;
2203ad384e0SSundar Sridharan    if (old_filename && old_filename_len) {
221894f9fa6SSundar Sridharan        *old_filename = (char *) malloc(old_filename_len);
2224a0379f5SJung-Sang Ahn        seq_memcpy(*old_filename,
2234a0379f5SJung-Sang Ahn                   (uint8_t *)header_buf + offset,
2244a0379f5SJung-Sang Ahn                   old_filename_len, offset);
22593d8acb2SSundar Sridharan    }
226e6449f52SJung-Sang Ahn}
227e6449f52SJung-Sang Ahn
// Selects which documents _fdb_restore_wal() puts back into the WAL.
typedef enum {
    // restore all eligible items
    FDB_RESTORE_NORMAL = 0,
    // restore only the items whose KV store ID matches the requested one
    FDB_RESTORE_KV_INS = 1
} fdb_restore_mode_t;
232d32401daSJung-Sang Ahn
233e6e251adSSundar SridharanINLINE void _fdb_restore_wal(fdb_kvs_handle *handle,
234d32401daSJung-Sang Ahn                             fdb_restore_mode_t mode,
235d32401daSJung-Sang Ahn                             bid_t hdr_bid,
236d32401daSJung-Sang Ahn                             fdb_kvs_id_t kv_id_req)
237abe1d5a9SSundar Sridharan{
238abe1d5a9SSundar Sridharan    struct filemgr *file = handle->file;
239abe1d5a9SSundar Sridharan    uint32_t blocksize = handle->file->blocksize;
24040a23059SJung-Sang Ahn    uint64_t last_wal_flush_hdr_bid = handle->last_wal_flush_hdr_bid;
2413ad384e0SSundar Sridharan    uint64_t hdr_off = hdr_bid * FDB_BLOCKSIZE;
242abe1d5a9SSundar Sridharan    uint64_t offset = 0; //assume everything from first block needs restoration
243dad7792aSSundar Sridharan    err_log_callback *log_callback;
244abe1d5a9SSundar Sridharan
245ae797c0bSSundar Sridharan    if (!hdr_off) { // Nothing to do if we don't have a header block offset
246ae797c0bSSundar Sridharan        return;
247ae797c0bSSundar Sridharan    }
248ae797c0bSSundar Sridharan
24940a23059SJung-Sang Ahn    if (last_wal_flush_hdr_bid != BLK_NOT_FOUND) {
25040a23059SJung-Sang Ahn        offset = (last_wal_flush_hdr_bid + 1) * blocksize;
251abe1d5a9SSundar Sridharan    }
252abe1d5a9SSundar Sridharan
253abe1d5a9SSundar Sridharan    // If a valid last header was retrieved and it matches the current header
254abe1d5a9SSundar Sridharan    // OR if WAL already had entries populated, then no crash recovery needed
255d32401daSJung-Sang Ahn    if (hdr_off <= offset ||
256d32401daSJung-Sang Ahn        (!handle->shandle && wal_get_size(file) &&
257d32401daSJung-Sang Ahn            mode != FDB_RESTORE_KV_INS)) {
258abe1d5a9SSundar Sridharan        return;
259abe1d5a9SSundar Sridharan    }
260abe1d5a9SSundar Sridharan
261d106c48cSChiyoung Seo    // Temporarily disable the error logging callback as there are false positive
262d106c48cSChiyoung Seo    // checksum errors in docio_read_doc.
263d106c48cSChiyoung Seo    // TODO: Need to adapt docio_read_doc to separate false checksum errors.
264dad7792aSSundar Sridharan    log_callback = handle->dhandle->log_callback;
265d106c48cSChiyoung Seo    handle->dhandle->log_callback = NULL;
266d106c48cSChiyoung Seo
267e62631c5SChiyoung Seo    if (!handle->shandle) {
268e62631c5SChiyoung Seo        filemgr_mutex_lock(file);
269e62631c5SChiyoung Seo    }
2703ad384e0SSundar Sridharan    for (; offset < hdr_off;
2719ceccb2cSJung-Sang Ahn        offset = ((offset / blocksize) + 1) * blocksize) { // next block's off
2729ceccb2cSJung-Sang Ahn        if (!docio_check_buffer(handle->dhandle, offset / blocksize)) {
273abe1d5a9SSundar Sridharan            continue;
274c337f3f7SJung-Sang Ahn        } else {
275c337f3f7SJung-Sang Ahn            do {
276c337f3f7SJung-Sang Ahn                struct docio_object doc;
277c337f3f7SJung-Sang Ahn                uint64_t _offset;
27859c1c4f5SJung-Sang Ahn                uint64_t doc_offset;
279c337f3f7SJung-Sang Ahn                memset(&doc, 0, sizeof(doc));
280c6c3d274SChiyoung Seo                _offset = docio_read_doc(handle->dhandle, offset, &doc, true);
281b787ba1bSSundar Sridharan                if (_offset == offset) { // reached unreadable doc, skip block
282b787ba1bSSundar Sridharan                    break;
283b787ba1bSSundar Sridharan                }
28459c1c4f5SJung-Sang Ahn                if (doc.key || (doc.length.flag & DOCIO_TXN_COMMITTED)) {
285d32401daSJung-Sang Ahn                    // check if the doc is transactional or not, and
286d32401daSJung-Sang Ahn                    // also check if the doc contains system info
287d32401daSJung-Sang Ahn                    if (!(doc.length.flag & DOCIO_TXN_DIRTY) &&
288d32401daSJung-Sang Ahn                        !(doc.length.flag & DOCIO_SYSTEM)) {
28959c1c4f5SJung-Sang Ahn                        if (doc.length.flag & DOCIO_TXN_COMMITTED) {
29059c1c4f5SJung-Sang Ahn                            // commit mark .. read doc offset
29159c1c4f5SJung-Sang Ahn                            doc_offset = doc.doc_offset;
29259c1c4f5SJung-Sang Ahn                            // read the previously skipped doc
293c6c3d274SChiyoung Seo                            docio_read_doc(handle->dhandle, doc_offset, &doc, true);
2942fe34a0cSSundar Sridharan                            if (doc.key == NULL) { // doc read error
29559c1c4f5SJung-Sang Ahn                                free(doc.meta);
29659c1c4f5SJung-Sang Ahn                                free(doc.body);
29759c1c4f5SJung-Sang Ahn                                offset = _offset;
29859c1c4f5SJung-Sang Ahn                                continue;
29959c1c4f5SJung-Sang Ahn                            }
30059c1c4f5SJung-Sang Ahn                        } else {
30159c1c4f5SJung-Sang Ahn                            doc_offset = offset;
30259c1c4f5SJung-Sang Ahn                        }
30359c1c4f5SJung-Sang Ahn
3042fe34a0cSSundar Sridharan                        // If say a snapshot is taken on a db handle after
3052fe34a0cSSundar Sridharan                        // rollback, then skip WAL items after rollback point
306a0eddcc0SJung-Sang Ahn                        if (handle->config.seqtree_opt == FDB_SEQTREE_USE &&
307a0eddcc0SJung-Sang Ahn                            (mode == FDB_RESTORE_KV_INS || !handle->kvs) &&
308d32401daSJung-Sang Ahn                            doc.seqnum > handle->seqnum) {
3092fe34a0cSSundar Sridharan                            free(doc.key);
3102fe34a0cSSundar Sridharan                            free(doc.meta);
3112fe34a0cSSundar Sridharan                            free(doc.body);
3122fe34a0cSSundar Sridharan                            offset = _offset;
3132fe34a0cSSundar Sridharan                            continue;
3142fe34a0cSSundar Sridharan                        }
3152fe34a0cSSundar Sridharan
31659c1c4f5SJung-Sang Ahn                        // restore document
31759c1c4f5SJung-Sang Ahn                        fdb_doc wal_doc;
31859c1c4f5SJung-Sang Ahn                        wal_doc.keylen = doc.length.keylen;
31959c1c4f5SJung-Sang Ahn                        wal_doc.bodylen = doc.length.bodylen;
32059c1c4f5SJung-Sang Ahn                        wal_doc.key = doc.key;
32159c1c4f5SJung-Sang Ahn                        wal_doc.seqnum = doc.seqnum;
32259c1c4f5SJung-Sang Ahn                        wal_doc.deleted = doc.length.flag & DOCIO_DELETED;
32359c1c4f5SJung-Sang Ahn
32459c1c4f5SJung-Sang Ahn                        if (!handle->shandle) {
32559c1c4f5SJung-Sang Ahn                            wal_doc.metalen = doc.length.metalen;
32659c1c4f5SJung-Sang Ahn                            wal_doc.meta = doc.meta;
32759c1c4f5SJung-Sang Ahn                            wal_doc.size_ondisk = _fdb_get_docsize(doc.length);
32859c1c4f5SJung-Sang Ahn
329d32401daSJung-Sang Ahn                            if (handle->kvs) {
330d32401daSJung-Sang Ahn                                // check seqnum before insert
33147025c39SJung-Sang Ahn                                fdb_kvs_id_t kv_id;
332d32401daSJung-Sang Ahn                                fdb_seqnum_t kv_seqnum;
33347025c39SJung-Sang Ahn                                buf2kvid(handle->config.chunksize,
33447025c39SJung-Sang Ahn                                         wal_doc.key, &kv_id);
335d32401daSJung-Sang Ahn
336f54fbdb6SJung-Sang Ahn                                if (handle->config.seqtree_opt == FDB_SEQTREE_USE) {
337f54fbdb6SJung-Sang Ahn                                    kv_seqnum = fdb_kvs_get_seqnum(handle->file, kv_id);
338f54fbdb6SJung-Sang Ahn                                } else {
339f54fbdb6SJung-Sang Ahn                                    kv_seqnum = SEQNUM_NOT_USED;
340f54fbdb6SJung-Sang Ahn                                }
341d32401daSJung-Sang Ahn                                if (doc.seqnum <= kv_seqnum &&
342d32401daSJung-Sang Ahn                                        ((mode == FDB_RESTORE_KV_INS &&
343d32401daSJung-Sang Ahn                                            kv_id == kv_id_req) ||
344d32401daSJung-Sang Ahn                                         (mode == FDB_RESTORE_NORMAL)) ) {
345d32401daSJung-Sang Ahn                                    // if mode is NORMAL, restore all items
346d32401daSJung-Sang Ahn                                    // if mode is KV_INS, restore items matching ID
347d32401daSJung-Sang Ahn                                    wal_insert(&file->global_txn, file,
348899a7f61SChiyoung Seo                                               &wal_doc, doc_offset, 0);
349d32401daSJung-Sang Ahn                                }
350d32401daSJung-Sang Ahn                            } else {
351d32401daSJung-Sang Ahn                                wal_insert(&file->global_txn, file,
352899a7f61SChiyoung Seo                                           &wal_doc, doc_offset, 0);
353d32401daSJung-Sang Ahn                            }
35459c1c4f5SJung-Sang Ahn                            if (doc.key) free(doc.key);
35559c1c4f5SJung-Sang Ahn                        } else {
356105f8b13SJung-Sang Ahn                            // snapshot
357d32401daSJung-Sang Ahn                            if (handle->kvs) {
35847025c39SJung-Sang Ahn                                fdb_kvs_id_t kv_id;
35947025c39SJung-Sang Ahn                                buf2kvid(handle->config.chunksize,
36047025c39SJung-Sang Ahn                                         wal_doc.key, &kv_id);
361d32401daSJung-Sang Ahn                                if (kv_id == handle->kvs->id) {
362d32401daSJung-Sang Ahn                                    // snapshot: insert ID matched documents only
363d32401daSJung-Sang Ahn                                    snap_insert(handle->shandle,
364d32401daSJung-Sang Ahn                                                &wal_doc, doc_offset);
365d32401daSJung-Sang Ahn                                } else {
366d32401daSJung-Sang Ahn                                    free(doc.key);
367d32401daSJung-Sang Ahn                                }
368d32401daSJung-Sang Ahn                            } else {
369d32401daSJung-Sang Ahn                                snap_insert(handle->shandle, &wal_doc, doc_offset);
370d32401daSJung-Sang Ahn                            }
37159c1c4f5SJung-Sang Ahn                        }
37259c1c4f5SJung-Sang Ahn                        free(doc.meta);
37359c1c4f5SJung-Sang Ahn                        free(doc.body);
37459c1c4f5SJung-Sang Ahn                        offset = _offset;
3753ad384e0SSundar Sridharan                    } else {
376d32401daSJung-Sang Ahn                        // skip transactional document or system document
37759c1c4f5SJung-Sang Ahn                        free(doc.key);
37859c1c4f5SJung-Sang Ahn                        free(doc.meta);
37959c1c4f5SJung-Sang Ahn                        free(doc.body);
38059c1c4f5SJung-Sang Ahn                        offset = _offset;
38159c1c4f5SJung-Sang Ahn                        // do not break.. read next doc
3823ad384e0SSundar Sridharan                    }
383c337f3f7SJung-Sang Ahn                } else {
38459c1c4f5SJung-Sang Ahn                    free(doc.key);
38559c1c4f5SJung-Sang Ahn                    free(doc.meta);
38659c1c4f5SJung-Sang Ahn                    free(doc.body);
387c337f3f7SJung-Sang Ahn                    offset = _offset;
388c337f3f7SJung-Sang Ahn                    break;
389c337f3f7SJung-Sang Ahn                }
3903ad384e0SSundar Sridharan            } while (offset + sizeof(struct docio_length) < hdr_off);
391c337f3f7SJung-Sang Ahn        }
392abe1d5a9SSundar Sridharan    }
39337555280SJung-Sang Ahn    // wal commit
39437555280SJung-Sang Ahn    if (!handle->shandle) {
395272a199dSChiyoung Seo        wal_commit(&file->global_txn, file, NULL, &handle->log_callback);
396e62631c5SChiyoung Seo        filemgr_mutex_unlock(file);
39737555280SJung-Sang Ahn    }
398d106c48cSChiyoung Seo    handle->dhandle->log_callback = log_callback;
399abe1d5a9SSundar Sridharan}
400abe1d5a9SSundar Sridharan
401e6e251adSSundar SridharanINLINE fdb_status _fdb_recover_compaction(fdb_kvs_handle *handle,
402041beb8aSChiyoung Seo                                          const char *new_filename)
403505f35b8SJung-Sang Ahn{
404e6e251adSSundar Sridharan    fdb_kvs_handle new_db;
405d5c97dcbSSundar Sridharan    fdb_config config = handle->config;
406505f35b8SJung-Sang Ahn    struct filemgr *new_file;
407894f9fa6SSundar Sridharan
4083ad384e0SSundar Sridharan    memset(&new_db, 0, sizeof(new_db));
40901e33605SChiyoung Seo    new_db.log_callback.callback = handle->log_callback.callback;
41001e33605SChiyoung Seo    new_db.log_callback.ctx_data = handle->log_callback.ctx_data;
41188a8c486SChiyoung Seo    config.flags |= FDB_OPEN_FLAG_RDONLY;
412d32401daSJung-Sang Ahn    new_db.fhandle = handle->fhandle;
413d32401daSJung-Sang Ahn    new_db.kvs_config = handle->kvs_config;
41480683324SJung-Sang Ahn    fdb_status status = _fdb_open(&new_db, new_filename,
41580683324SJung-Sang Ahn                                  FDB_AFILENAME, &config);
416d5c97dcbSSundar Sridharan    if (status != FDB_RESULT_SUCCESS) {
417f9a2bf6bSJens Alfke        return fdb_log(&handle->log_callback, status,
418f9a2bf6bSJens Alfke                       "Error in opening a partially compacted file '%s' for recovery.",
419f9a2bf6bSJens Alfke                       new_filename);
420d5c97dcbSSundar Sridharan    }
421c9a3f3c3SJung-Sang Ahn
42293d8acb2SSundar Sridharan    new_file = new_db.file;
4233070be81SJung-Sang Ahn
42493d8acb2SSundar Sridharan    if (new_file->old_filename &&
42593d8acb2SSundar Sridharan        !strncmp(new_file->old_filename, handle->file->filename,
42693d8acb2SSundar Sridharan                 FDB_MAX_FILENAME_LEN)) {
42793d8acb2SSundar Sridharan        struct filemgr *old_file = handle->file;
42893d8acb2SSundar Sridharan        // If new file has a recorded old_filename then it means that
42993d8acb2SSundar Sridharan        // compaction has completed successfully. Mark self for deletion
43093d8acb2SSundar Sridharan        filemgr_mutex_lock(new_file);
431505f35b8SJung-Sang Ahn
432a94e4677SSundar Sridharan        status = btreeblk_end(handle->bhandle);
433a94e4677SSundar Sridharan        if (status != FDB_RESULT_SUCCESS) {
434a94e4677SSundar Sridharan            filemgr_mutex_unlock(new_file);
435a94e4677SSundar Sridharan            _fdb_close(&new_db);
436a94e4677SSundar Sridharan            return status;
437a94e4677SSundar Sridharan        }
43893d8acb2SSundar Sridharan        btreeblk_free(handle->bhandle);
43993d8acb2SSundar Sridharan        free(handle->bhandle);
44093d8acb2SSundar Sridharan        handle->bhandle = new_db.bhandle;
44193d8acb2SSundar Sridharan
44293d8acb2SSundar Sridharan        docio_free(handle->dhandle);
44393d8acb2SSundar Sridharan        free(handle->dhandle);
44493d8acb2SSundar Sridharan        handle->dhandle = new_db.dhandle;
44593d8acb2SSundar Sridharan
44693d8acb2SSundar Sridharan        hbtrie_free(handle->trie);
44793d8acb2SSundar Sridharan        free(handle->trie);
44893d8acb2SSundar Sridharan        handle->trie = new_db.trie;
44993d8acb2SSundar Sridharan
45093d8acb2SSundar Sridharan        wal_shutdown(handle->file);
45193d8acb2SSundar Sridharan        handle->file = new_file;
4525379abc9SJung-Sang Ahn
45393d8acb2SSundar Sridharan        if (handle->config.seqtree_opt == FDB_SEQTREE_USE) {
454d32401daSJung-Sang Ahn            if (handle->kvs) {
455d32401daSJung-Sang Ahn                // multi KV instance mode
456d32401daSJung-Sang Ahn                hbtrie_free(handle->seqtrie);
457d32401daSJung-Sang Ahn                free(handle->seqtrie);
458d32401daSJung-Sang Ahn                if (new_db.config.seqtree_opt == FDB_SEQTREE_USE) {
459d32401daSJung-Sang Ahn                    handle->seqtrie = new_db.seqtrie;
460d32401daSJung-Sang Ahn                }
461d32401daSJung-Sang Ahn            } else {
462d32401daSJung-Sang Ahn                free(handle->seqtree->kv_ops);
463d32401daSJung-Sang Ahn                free(handle->seqtree);
464d32401daSJung-Sang Ahn                if (new_db.config.seqtree_opt == FDB_SEQTREE_USE) {
465d32401daSJung-Sang Ahn                    handle->seqtree = new_db.seqtree;
466d32401daSJung-Sang Ahn                }
46793d8acb2SSundar Sridharan            }
46893d8acb2SSundar Sridharan        }
4695379abc9SJung-Sang Ahn
47009c5bf46SChiyoung Seo        filemgr_mutex_unlock(new_file);
471d32401daSJung-Sang Ahn        if (new_db.kvs) {
472d32401daSJung-Sang Ahn            fdb_kvs_info_free(&new_db);
473d32401daSJung-Sang Ahn        }
47493d8acb2SSundar Sridharan        // remove self: WARNING must not close this handle if snapshots
47593d8acb2SSundar Sridharan        // are yet to open this file
47693d8acb2SSundar Sridharan        filemgr_remove_pending(old_file, new_db.file);
477fef67a48SChiyoung Seo        filemgr_close(old_file, 0, handle->filename, &handle->log_callback);
478fef67a48SChiyoung Seo        free(new_db.filename);
4796bd10060SSundar Sridharan        return FDB_RESULT_FAIL_BY_COMPACTION;
48093d8acb2SSundar Sridharan    }
48159c1c4f5SJung-Sang Ahn
482ccf0294fSChiyoung Seo    // As the new file is partially compacted, it should be removed upon close.
4830eb598e6SSundar Sridharan    // Just in-case the new file gets opened before removal, point it to the old
4840eb598e6SSundar Sridharan    // file to ensure availability of data.
4850eb598e6SSundar Sridharan    filemgr_remove_pending(new_db.file, handle->file);
486b277f819SChiyoung Seo    _fdb_close(&new_db);
487505f35b8SJung-Sang Ahn
488505f35b8SJung-Sang Ahn    return FDB_RESULT_SUCCESS;
489505f35b8SJung-Sang Ahn}
490505f35b8SJung-Sang Ahn
49159c1c4f5SJung-Sang AhnLIBFDB_API
49259c1c4f5SJung-Sang Ahnfdb_status fdb_init(fdb_config *config)
49359c1c4f5SJung-Sang Ahn{
49459c1c4f5SJung-Sang Ahn    fdb_config _config;
49559c1c4f5SJung-Sang Ahn    compactor_config c_config;
49659c1c4f5SJung-Sang Ahn    struct filemgr_config f_config;
49759c1c4f5SJung-Sang Ahn
49859c1c4f5SJung-Sang Ahn    if (config) {
49959c1c4f5SJung-Sang Ahn        if (validate_fdb_config(config)) {
50059c1c4f5SJung-Sang Ahn            _config = *config;
50159c1c4f5SJung-Sang Ahn        } else {
50259c1c4f5SJung-Sang Ahn            return FDB_RESULT_INVALID_CONFIG;
50359c1c4f5SJung-Sang Ahn        }
50459c1c4f5SJung-Sang Ahn    } else {
50559c1c4f5SJung-Sang Ahn        _config = get_default_config();
50659c1c4f5SJung-Sang Ahn    }
50759c1c4f5SJung-Sang Ahn
50859c1c4f5SJung-Sang Ahn    // global initialization
50959c1c4f5SJung-Sang Ahn    // initialized only once at first time
51059c1c4f5SJung-Sang Ahn    if (!fdb_initialized) {
511b6be7a1dSSundar Sridharan#ifdef _TRACE_HANDLES
512b6be7a1dSSundar Sridharan        spin_init(&open_handle_lock);
513b6be7a1dSSundar Sridharan        avl_init(&open_handles, NULL);
514b6be7a1dSSundar Sridharan#endif
515b6be7a1dSSundar Sridharan
51659c1c4f5SJung-Sang Ahn#ifndef SPIN_INITIALIZER
51759c1c4f5SJung-Sang Ahn        // Note that only Windows passes through this routine
51859c1c4f5SJung-Sang Ahn        if (InterlockedCompareExchange(&initial_lock_status, 1, 0) == 0) {
51959c1c4f5SJung-Sang Ahn            // atomically initialize spin lock only once
52059c1c4f5SJung-Sang Ahn            spin_init(&initial_lock);
52159c1c4f5SJung-Sang Ahn            initial_lock_status = 2;
52259c1c4f5SJung-Sang Ahn        } else {
52359c1c4f5SJung-Sang Ahn            // the others .. wait until initializing 'initial_lock' is done
52459c1c4f5SJung-Sang Ahn            while (initial_lock_status != 2) {
52559c1c4f5SJung-Sang Ahn                Sleep(1);
52659c1c4f5SJung-Sang Ahn            }
52759c1c4f5SJung-Sang Ahn        }
52859c1c4f5SJung-Sang Ahn#endif
52959c1c4f5SJung-Sang Ahn
530f30bfecdSSundar Sridharan    }
531f30bfecdSSundar Sridharan    spin_lock(&initial_lock);
532f30bfecdSSundar Sridharan    if (!fdb_initialized) {
5335257ddacSChiyoung Seo        double ram_size = (double) get_memory_size();
5345257ddacSChiyoung Seo        if (ram_size * BCACHE_MEMORY_THRESHOLD < (double) _config.buffercache_size) {
5355257ddacSChiyoung Seo            spin_unlock(&initial_lock);
5365257ddacSChiyoung Seo            return FDB_RESULT_TOO_BIG_BUFFER_CACHE;
5375257ddacSChiyoung Seo        }
538f30bfecdSSundar Sridharan        // initialize file manager and block cache
539f30bfecdSSundar Sridharan        f_config.blocksize = _config.blocksize;
540f30bfecdSSundar Sridharan        f_config.ncacheblock = _config.buffercache_size / _config.blocksize;
541f30bfecdSSundar Sridharan        filemgr_init(&f_config);
542d6eb040eSJung-Sang Ahn        filemgr_set_lazy_file_deletion(true,
543d6eb040eSJung-Sang Ahn                                       compactor_register_file_removing,
544d6eb040eSJung-Sang Ahn                                       compactor_is_file_removed);
54559c1c4f5SJung-Sang Ahn
546f30bfecdSSundar Sridharan        // initialize compaction daemon
547f30bfecdSSundar Sridharan        c_config.sleep_duration = _config.compactor_sleep_duration;
548c40231deSChiyoung Seo        c_config.num_threads = _config.num_compactor_threads;
549f30bfecdSSundar Sridharan        compactor_init(&c_config);
55059c1c4f5SJung-Sang Ahn
551f30bfecdSSundar Sridharan        fdb_initialized = 1;
55259c1c4f5SJung-Sang Ahn    }
553f30bfecdSSundar Sridharan    fdb_open_inprog++;
554f30bfecdSSundar Sridharan    spin_unlock(&initial_lock);
555f30bfecdSSundar Sridharan
55659c1c4f5SJung-Sang Ahn    return FDB_RESULT_SUCCESS;
55759c1c4f5SJung-Sang Ahn}
55859c1c4f5SJung-Sang Ahn
559ddae8414SChiyoung SeoLIBFDB_API
560ddae8414SChiyoung Seofdb_config fdb_get_default_config(void) {
561ddae8414SChiyoung Seo    return get_default_config();
562ddae8414SChiyoung Seo}
563ddae8414SChiyoung Seo
5648e1b9ec2SJung-Sang AhnLIBFDB_API
565d32401daSJung-Sang Ahnfdb_kvs_config fdb_get_default_kvs_config(void) {
566d32401daSJung-Sang Ahn    return get_default_kvs_config();
567d32401daSJung-Sang Ahn}
568d32401daSJung-Sang Ahn
569d32401daSJung-Sang AhnLIBFDB_API
570d32401daSJung-Sang Ahnfdb_status fdb_open(fdb_file_handle **ptr_fhandle,
57188a8c486SChiyoung Seo                    const char *filename,
572f145a9b7SChiyoung Seo                    fdb_config *fconfig)
57388a8c486SChiyoung Seo{
57488a8c486SChiyoung Seo#ifdef _MEMPOOL
57588a8c486SChiyoung Seo    mempool_init();
57688a8c486SChiyoung Seo#endif
5774fcd9f6eSJung-Sang Ahn
57888a8c486SChiyoung Seo    fdb_config config;
579d32401daSJung-Sang Ahn    fdb_file_handle *fhandle;
580e6e251adSSundar Sridharan    fdb_kvs_handle *handle;
5814fcd9f6eSJung-Sang Ahn
582f145a9b7SChiyoung Seo    if (fconfig) {
583f145a9b7SChiyoung Seo        if (validate_fdb_config(fconfig)) {
584f145a9b7SChiyoung Seo            config = *fconfig;
585f145a9b7SChiyoung Seo        } else {
586f145a9b7SChiyoung Seo            return FDB_RESULT_INVALID_CONFIG;
587f145a9b7SChiyoung Seo        }
588f145a9b7SChiyoung Seo    } else {
589ddae8414SChiyoung Seo        config = get_default_config();
590f145a9b7SChiyoung Seo    }
59188a8c486SChiyoung Seo
592d32401daSJung-Sang Ahn    fhandle = (fdb_file_handle*)calloc(1, sizeof(fdb_file_handle));
593e4da0646SChiyoung Seo    if (!fhandle) { // LCOV_EXCL_START
594d32401daSJung-Sang Ahn        return FDB_RESULT_ALLOC_FAIL;
595e4da0646SChiyoung Seo    } // LCOV_EXCL_STOP
596d32401daSJung-Sang Ahn
597e6e251adSSundar Sridharan    handle = (fdb_kvs_handle *) calloc(1, sizeof(fdb_kvs_handle));
598e4da0646SChiyoung Seo    if (!handle) { // LCOV_EXCL_START
599d32401daSJung-Sang Ahn        free(fhandle);
600b277f819SChiyoung Seo        return FDB_RESULT_ALLOC_FAIL;
601e4da0646SChiyoung Seo    } // LCOV_EXCL_STOP
602