1bc68bb02SChiyoung Seo/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
27c0433f5SJung-Sang Ahn/*
3bc68bb02SChiyoung Seo *     Copyright 2010 Couchbase, Inc
4bc68bb02SChiyoung Seo *
5bc68bb02SChiyoung Seo *   Licensed under the Apache License, Version 2.0 (the "License");
6bc68bb02SChiyoung Seo *   you may not use this file except in compliance with the License.
7bc68bb02SChiyoung Seo *   You may obtain a copy of the License at
8bc68bb02SChiyoung Seo *
9bc68bb02SChiyoung Seo *       http://www.apache.org/licenses/LICENSE-2.0
10bc68bb02SChiyoung Seo *
11bc68bb02SChiyoung Seo *   Unless required by applicable law or agreed to in writing, software
12bc68bb02SChiyoung Seo *   distributed under the License is distributed on an "AS IS" BASIS,
13bc68bb02SChiyoung Seo *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14bc68bb02SChiyoung Seo *   See the License for the specific language governing permissions and
15bc68bb02SChiyoung Seo *   limitations under the License.
167c0433f5SJung-Sang Ahn */
177c0433f5SJung-Sang Ahn
187c0433f5SJung-Sang Ahn#include <stdio.h>
197c0433f5SJung-Sang Ahn#include <stdlib.h>
207c0433f5SJung-Sang Ahn#include <string.h>
217c0433f5SJung-Sang Ahn#include <fcntl.h>
222534ac38SJung-Sang Ahn#include <sys/stat.h>
23f9a2bf6bSJens Alfke#include <stdarg.h>
246a3a5c2dSJung-Sang Ahn#if !defined(WIN32) && !defined(_WIN32)
256a3a5c2dSJung-Sang Ahn#include <sys/time.h>
266a3a5c2dSJung-Sang Ahn#endif
277c0433f5SJung-Sang Ahn
287c0433f5SJung-Sang Ahn#include "filemgr.h"
2947ec8bfcSSundar Sridharan#include "filemgr_ops.h"
307c0433f5SJung-Sang Ahn#include "hash_functions.h"
317c0433f5SJung-Sang Ahn#include "blockcache.h"
327c0433f5SJung-Sang Ahn#include "wal.h"
33d25f8d6dSJung-Sang Ahn#include "list.h"
34f9a2bf6bSJens Alfke#include "fdb_internal.h"
356a3a5c2dSJung-Sang Ahn#include "time_utils.h"
367c0433f5SJung-Sang Ahn
373d812dfcSJung-Sang Ahn#include "memleak.h"
383d812dfcSJung-Sang Ahn
397c0433f5SJung-Sang Ahn#ifdef __DEBUG
407c0433f5SJung-Sang Ahn#ifndef __DEBUG_FILEMGR
412889254eSJung-Sang Ahn    #undef DBG
422889254eSJung-Sang Ahn    #undef DBGCMD
43eea9c5e9SJung-Sang Ahn    #undef DBGSW
44ceca3b9fSJung-Sang Ahn    #define DBG(...)
45ceca3b9fSJung-Sang Ahn    #define DBGCMD(...)
46ceca3b9fSJung-Sang Ahn    #define DBGSW(n, ...)
477c0433f5SJung-Sang Ahn#endif
487c0433f5SJung-Sang Ahn#endif
497c0433f5SJung-Sang Ahn
507c0433f5SJung-Sang Ahn// NBUCKET must be power of 2
517c0433f5SJung-Sang Ahn#define NBUCKET (1024)
52393e6cadSGihwan Oh#define FILEMGR_MAGIC (UINT64_C(0xdeadcafebeefbeef))
537c0433f5SJung-Sang Ahn
547c0433f5SJung-Sang Ahn// global static variables
55f693a021SSundar Sridharan#ifdef SPIN_INITIALIZER
567c0433f5SJung-Sang Ahnstatic spin_t initial_lock = SPIN_INITIALIZER;
57f693a021SSundar Sridharan#else
584fcd9f6eSJung-Sang Ahnstatic volatile unsigned int initial_lock_status = 0;
59f693a021SSundar Sridharanstatic spin_t initial_lock;
60f693a021SSundar Sridharan#endif
61f693a021SSundar Sridharan
62f693a021SSundar Sridharan
6359c1c4f5SJung-Sang Ahnstatic volatile uint8_t filemgr_initialized = 0;
647c0433f5SJung-Sang Ahnstatic struct filemgr_config global_config;
657c0433f5SJung-Sang Ahnstatic struct hash hash;
663d812dfcSJung-Sang Ahnstatic spin_t filemgr_openlock;
677c0433f5SJung-Sang Ahn
68d25f8d6dSJung-Sang Ahnstruct temp_buf_item{
69d25f8d6dSJung-Sang Ahn    void *addr;
70d25f8d6dSJung-Sang Ahn    struct list_elem le;
71d25f8d6dSJung-Sang Ahn};
72d25f8d6dSJung-Sang Ahnstatic struct list temp_buf;
739e14d68eSJung-Sang Ahnstatic spin_t temp_buf_lock;
747c0433f5SJung-Sang Ahn
75d6eb040eSJung-Sang Ahnstatic bool lazy_file_deletion_enabled = false;
76d6eb040eSJung-Sang Ahnstatic register_file_removal_func register_file_removal = NULL;
77d6eb040eSJung-Sang Ahnstatic check_file_removal_func is_file_removed = NULL;
78d5bf4155SChiyoung Seo
7930a02785SJung-Sang Ahnstatic void spin_init_wrap(void *lock) {
8030a02785SJung-Sang Ahn    spin_init((spin_t*)lock);
8130a02785SJung-Sang Ahn}
8230a02785SJung-Sang Ahn
8330a02785SJung-Sang Ahnstatic void spin_destroy_wrap(void *lock) {
8430a02785SJung-Sang Ahn    spin_destroy((spin_t*)lock);
8530a02785SJung-Sang Ahn}
8630a02785SJung-Sang Ahn
8730a02785SJung-Sang Ahnstatic void spin_lock_wrap(void *lock) {
8830a02785SJung-Sang Ahn    spin_lock((spin_t*)lock);
8930a02785SJung-Sang Ahn}
9030a02785SJung-Sang Ahn
9130a02785SJung-Sang Ahnstatic void spin_unlock_wrap(void *lock) {
9230a02785SJung-Sang Ahn    spin_unlock((spin_t*)lock);
9330a02785SJung-Sang Ahn}
9430a02785SJung-Sang Ahn
9530a02785SJung-Sang Ahnstatic void mutex_init_wrap(void *lock) {
9630a02785SJung-Sang Ahn    mutex_init((mutex_t*)lock);
9730a02785SJung-Sang Ahn}
9830a02785SJung-Sang Ahn
9930a02785SJung-Sang Ahnstatic void mutex_destroy_wrap(void *lock) {
10030a02785SJung-Sang Ahn    mutex_destroy((mutex_t*)lock);
10130a02785SJung-Sang Ahn}
10230a02785SJung-Sang Ahn
10330a02785SJung-Sang Ahnstatic void mutex_lock_wrap(void *lock) {
10430a02785SJung-Sang Ahn    mutex_lock((mutex_t*)lock);
10530a02785SJung-Sang Ahn}
10630a02785SJung-Sang Ahn
10730a02785SJung-Sang Ahnstatic void mutex_unlock_wrap(void *lock) {
10830a02785SJung-Sang Ahn    mutex_unlock((mutex_t*)lock);
10930a02785SJung-Sang Ahn}
11030a02785SJung-Sang Ahn
111272e741fSSundar Sridharanstatic int _kvs_stat_cmp(struct avl_node *a, struct avl_node *b, void *aux)
112272e741fSSundar Sridharan{
113272e741fSSundar Sridharan    struct kvs_node *aa, *bb;
114272e741fSSundar Sridharan    aa = _get_entry(a, struct kvs_node, avl_id);
115272e741fSSundar Sridharan    bb = _get_entry(b, struct kvs_node, avl_id);
116272e741fSSundar Sridharan
117272e741fSSundar Sridharan    if (aa->id < bb->id) {
118272e741fSSundar Sridharan        return -1;
119272e741fSSundar Sridharan    } else if (aa->id > bb->id) {
120272e741fSSundar Sridharan        return 1;
121272e741fSSundar Sridharan    } else {
122272e741fSSundar Sridharan        return 0;
123272e741fSSundar Sridharan    }
124272e741fSSundar Sridharan}
125272e741fSSundar Sridharan
12630a02785SJung-Sang Ahnstatic int _block_is_overlapped(void *pbid1, void *pis_writer1,
12730a02785SJung-Sang Ahn                                void *pbid2, void *pis_writer2,
12830a02785SJung-Sang Ahn                                void *aux)
12930a02785SJung-Sang Ahn{
13030a02785SJung-Sang Ahn    (void)aux;
13130a02785SJung-Sang Ahn    bid_t bid1, is_writer1, bid2, is_writer2;
13230a02785SJung-Sang Ahn    bid1 = *(bid_t*)pbid1;
13330a02785SJung-Sang Ahn    is_writer1 = *(bid_t*)pis_writer1;
13430a02785SJung-Sang Ahn    bid2 = *(bid_t*)pbid2;
13530a02785SJung-Sang Ahn    is_writer2 = *(bid_t*)pis_writer2;
13630a02785SJung-Sang Ahn
13730a02785SJung-Sang Ahn    if (bid1 != bid2) {
13830a02785SJung-Sang Ahn        // not overlapped
13930a02785SJung-Sang Ahn        return 0;
14030a02785SJung-Sang Ahn    } else {
14130a02785SJung-Sang Ahn        // overlapped
14230a02785SJung-Sang Ahn        if (!is_writer1 && !is_writer2) {
14330a02785SJung-Sang Ahn            // both are readers
14430a02785SJung-Sang Ahn            return 0;
14530a02785SJung-Sang Ahn        } else {
14630a02785SJung-Sang Ahn            return 1;
14730a02785SJung-Sang Ahn        }
14830a02785SJung-Sang Ahn    }
14930a02785SJung-Sang Ahn}
15030a02785SJung-Sang Ahn
151f9a2bf6bSJens Alfkefdb_status fdb_log(err_log_callback *log_callback,
152f9a2bf6bSJens Alfke                   fdb_status status,
153f9a2bf6bSJens Alfke                   const char *format, ...)
154f9a2bf6bSJens Alfke{
155f9a2bf6bSJens Alfke    if (log_callback && log_callback->callback) {
156f9a2bf6bSJens Alfke        char msg[1024];
157f9a2bf6bSJens Alfke        va_list args;
158f9a2bf6bSJens Alfke        va_start(args, format);
159f9a2bf6bSJens Alfke        vsprintf(msg, format, args);
160f9a2bf6bSJens Alfke        va_end(args);
161f9a2bf6bSJens Alfke        log_callback->callback(status, msg, log_callback->ctx_data);
162f9a2bf6bSJens Alfke    }
163f9a2bf6bSJens Alfke    return status;
164f9a2bf6bSJens Alfke}
165f9a2bf6bSJens Alfke
16601e33605SChiyoung Seostatic void _log_errno_str(struct filemgr_ops *ops,
16701e33605SChiyoung Seo                           err_log_callback *log_callback,
168f9a2bf6bSJens Alfke                           fdb_status io_error,
169f9a2bf6bSJens Alfke                           const char *what,
170f9a2bf6bSJens Alfke                           const char *filename)
171f9a2bf6bSJens Alfke{
172f9a2bf6bSJens Alfke    if (io_error < 0) {
17301e33605SChiyoung Seo        char errno_msg[512];
17401e33605SChiyoung Seo        ops->get_errno_str(errno_msg, 512);
175f9a2bf6bSJens Alfke        fdb_log(log_callback, io_error,
176f9a2bf6bSJens Alfke                "Error in %s on a database file '%s', %s", what, filename, errno_msg);
17701e33605SChiyoung Seo    }
17801e33605SChiyoung Seo}
17901e33605SChiyoung Seo
180f80ddf4dSChiyoung Seostatic uint32_t _file_hash(struct hash *hash, struct hash_elem *e)
1817c0433f5SJung-Sang Ahn{
1822889254eSJung-Sang Ahn    struct filemgr *file = _get_entry(e, struct filemgr, e);
1832889254eSJung-Sang Ahn    int len = strlen(file->filename);
1844f2ebc0aSJung-Sang Ahn    return chksum(file->filename, len) & ((unsigned)(NBUCKET-1));
1857c0433f5SJung-Sang Ahn}
1867c0433f5SJung-Sang Ahn
187f80ddf4dSChiyoung Seostatic int _file_cmp(struct hash_elem *a, struct hash_elem *b)
1887c0433f5SJung-Sang Ahn{
1892889254eSJung-Sang Ahn    struct filemgr *aa, *bb;
1902889254eSJung-Sang Ahn    aa = _get_entry(a, struct filemgr, e);
1912889254eSJung-Sang Ahn    bb = _get_entry(b, struct filemgr, e);
1922889254eSJung-Sang Ahn    return strcmp(aa->filename, bb->filename);
1937c0433f5SJung-Sang Ahn}
1947c0433f5SJung-Sang Ahn
195eccdfcd9SJung-Sang Ahnvoid filemgr_init(struct filemgr_config *config)
1967c0433f5SJung-Sang Ahn{
19759c1c4f5SJung-Sang Ahn    // global initialization
19859c1c4f5SJung-Sang Ahn    // initialized only once at first time
1992889254eSJung-Sang Ahn    if (!filemgr_initialized) {
20059c1c4f5SJung-Sang Ahn#ifndef SPIN_INITIALIZER
20159c1c4f5SJung-Sang Ahn        // Note that only Windows passes through this routine
20259c1c4f5SJung-Sang Ahn        if (InterlockedCompareExchange(&initial_lock_status, 1, 0) == 0) {
20359c1c4f5SJung-Sang Ahn            // atomically initialize spin lock only once
20459c1c4f5SJung-Sang Ahn            spin_init(&initial_lock);
20559c1c4f5SJung-Sang Ahn            initial_lock_status = 2;
20659c1c4f5SJung-Sang Ahn        } else {
2075aa14abcSSundar Sridharan            // the others ... wait until initializing 'initial_lock' is done
20859c1c4f5SJung-Sang Ahn            while (initial_lock_status != 2) {
20959c1c4f5SJung-Sang Ahn                Sleep(1);
21059c1c4f5SJung-Sang Ahn            }
21159c1c4f5SJung-Sang Ahn        }
21259c1c4f5SJung-Sang Ahn#endif
2132889254eSJung-Sang Ahn
21459c1c4f5SJung-Sang Ahn        spin_lock(&initial_lock);
21559c1c4f5SJung-Sang Ahn        if (!filemgr_initialized) {
21659c1c4f5SJung-Sang Ahn            global_config = *config;
21759c1c4f5SJung-Sang Ahn
21859c1c4f5SJung-Sang Ahn            if (global_config.ncacheblock > 0)
21959c1c4f5SJung-Sang Ahn                bcache_init(global_config.ncacheblock, global_config.blocksize);
220abe1d5a9SSundar Sridharan
22159c1c4f5SJung-Sang Ahn            hash_init(&hash, NBUCKET, _file_hash, _file_cmp);
2222889254eSJung-Sang Ahn
22359c1c4f5SJung-Sang Ahn            // initialize temp buffer
22459c1c4f5SJung-Sang Ahn            list_init(&temp_buf);
22559c1c4f5SJung-Sang Ahn            spin_init(&temp_buf_lock);
2263d812dfcSJung-Sang Ahn
22759c1c4f5SJung-Sang Ahn            // initialize global lock
22859c1c4f5SJung-Sang Ahn            spin_init(&filemgr_openlock);
229d25f8d6dSJung-Sang Ahn
23059c1c4f5SJung-Sang Ahn            // set the initialize flag
23159c1c4f5SJung-Sang Ahn            filemgr_initialized = 1;
23259c1c4f5SJung-Sang Ahn        }
23359c1c4f5SJung-Sang Ahn        spin_unlock(&initial_lock);
2342889254eSJung-Sang Ahn    }
2357c0433f5SJung-Sang Ahn}
2367c0433f5SJung-Sang Ahn
237d6eb040eSJung-Sang Ahnvoid filemgr_set_lazy_file_deletion(bool enable,
238d6eb040eSJung-Sang Ahn                                    register_file_removal_func regis_func,
239d6eb040eSJung-Sang Ahn                                    check_file_removal_func check_func)
240d6eb040eSJung-Sang Ahn{
241d6eb040eSJung-Sang Ahn    lazy_file_deletion_enabled = enable;
242d6eb040eSJung-Sang Ahn    register_file_removal = regis_func;
243d6eb040eSJung-Sang Ahn    is_file_removed = check_func;
244d6eb040eSJung-Sang Ahn}
245d6eb040eSJung-Sang Ahn
246f80ddf4dSChiyoung Seostatic void * _filemgr_get_temp_buf()
2479e14d68eSJung-Sang Ahn{
248d25f8d6dSJung-Sang Ahn    struct list_elem *e;
249d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
250d25f8d6dSJung-Sang Ahn
2519e14d68eSJung-Sang Ahn    spin_lock(&temp_buf_lock);
252d25f8d6dSJung-Sang Ahn    e = list_pop_front(&temp_buf);
253d25f8d6dSJung-Sang Ahn    if (e) {
254d25f8d6dSJung-Sang Ahn        item = _get_entry(e, struct temp_buf_item, le);
25518054ff6SChiyoung Seo    } else {
256d25f8d6dSJung-Sang Ahn        void *addr;
257abe1d5a9SSundar Sridharan
25818054ff6SChiyoung Seo        malloc_align(addr, FDB_SECTOR_SIZE,
25918054ff6SChiyoung Seo                     global_config.blocksize + sizeof(struct temp_buf_item));
260d25f8d6dSJung-Sang Ahn
2618e1b9ec2SJung-Sang Ahn        item = (struct temp_buf_item *)((uint8_t *) addr + global_config.blocksize);
262d25f8d6dSJung-Sang Ahn        item->addr = addr;
263d25f8d6dSJung-Sang Ahn    }
264d25f8d6dSJung-Sang Ahn    spin_unlock(&temp_buf_lock);
265abe1d5a9SSundar Sridharan
266d25f8d6dSJung-Sang Ahn    return item->addr;
267d25f8d6dSJung-Sang Ahn}
268d25f8d6dSJung-Sang Ahn
269f80ddf4dSChiyoung Seostatic void _filemgr_release_temp_buf(void *buf)
270d25f8d6dSJung-Sang Ahn{
271d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
272d25f8d6dSJung-Sang Ahn
273d25f8d6dSJung-Sang Ahn    spin_lock(&temp_buf_lock);
2748e1b9ec2SJung-Sang Ahn    item = (struct temp_buf_item*)((uint8_t *)buf + global_config.blocksize);
275d25f8d6dSJung-Sang Ahn    list_push_front(&temp_buf, &item->le);
276abe1d5a9SSundar Sridharan    spin_unlock(&temp_buf_lock);
277d25f8d6dSJung-Sang Ahn}
278d25f8d6dSJung-Sang Ahn
279f80ddf4dSChiyoung Seostatic void _filemgr_shutdown_temp_buf()
280d25f8d6dSJung-Sang Ahn{
281d25f8d6dSJung-Sang Ahn    struct list_elem *e;
282d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
283d25f8d6dSJung-Sang Ahn    size_t count=0;
284d25f8d6dSJung-Sang Ahn
285d25f8d6dSJung-Sang Ahn    spin_lock(&temp_buf_lock);
286d25f8d6dSJung-Sang Ahn    e = list_begin(&temp_buf);
287d25f8d6dSJung-Sang Ahn    while(e){
288d25f8d6dSJung-Sang Ahn        item = _get_entry(e, struct temp_buf_item, le);
289d25f8d6dSJung-Sang Ahn        e = list_remove(&temp_buf, e);
290f693a021SSundar Sridharan        free_align(item->addr);
291d25f8d6dSJung-Sang Ahn        count++;
292d25f8d6dSJung-Sang Ahn    }
2939e14d68eSJung-Sang Ahn    spin_unlock(&temp_buf_lock);
2949e14d68eSJung-Sang Ahn}
2959e14d68eSJung-Sang Ahn
29618054ff6SChiyoung Seostatic fdb_status _filemgr_read_header(struct filemgr *file,
29718054ff6SChiyoung Seo                                       err_log_callback *log_callback)
2987c0433f5SJung-Sang Ahn{
299818cce61SJung-Sang Ahn    uint8_t marker[BLK_MARKER_SIZE];
30080e69aceSJung-Sang Ahn    filemgr_magic_t magic;
30180e69aceSJung-Sang Ahn    filemgr_header_len_t len;
302abe1d5a9SSundar Sridharan    uint8_t *buf;
3034a0379f5SJung-Sang Ahn    uint32_t crc, crc_file;
304cb0f0747SSundar Sridharan    fdb_status status = FDB_RESULT_SUCCESS;
305d25f8d6dSJung-Sang Ahn
306d25f8d6dSJung-Sang Ahn    // get temp buffer
3078e1b9ec2SJung-Sang Ahn    buf = (uint8_t *) _filemgr_get_temp_buf();
3089e14d68eSJung-Sang Ahn
309ddbd458dSChiyoung Seo    if (atomic_get_uint64_t(&file->pos) > 0) {
310abe1d5a9SSundar Sridharan        // Crash Recovery Test 1: unaligned last block write
311ddbd458dSChiyoung Seo        uint64_t remain = atomic_get_uint64_t(&file->pos) % file->blocksize;
312abe1d5a9SSundar Sridharan        if (remain) {
3130745bc38SJung-Sang Ahn            atomic_sub_uint64_t(&file->pos, remain);
314ddbd458dSChiyoung Seo            atomic_store_uint64_t(&file->last_commit, atomic_get_uint64_t(&file->pos));
315c279f70eSChiyoung Seo            const char *msg = "Crash Detected: %" _F64 " non-block aligned bytes discarded "
31618054ff6SChiyoung Seo                "from a database file '%s'\n";
31718054ff6SChiyoung Seo            DBG(msg, remain, file->filename);
31818054ff6SChiyoung Seo            fdb_log(log_callback, FDB_RESULT_READ_FAIL /* Need to add a better error code*/,
31918054ff6SChiyoung Seo                    msg, remain, file->filename);
320abe1d5a9SSundar Sridharan        }
321d9045a26SChiyoung Seo
3226ad4926cSChiyoung Seo        size_t block_counter = 0;
323abe1d5a9SSundar Sridharan        do {
324cb0f0747SSundar Sridharan            ssize_t rv = file->ops->pread(file->fd, buf, file->blocksize,
325ddbd458dSChiyoung Seo                atomic_get_uint64_t(&file->pos) - file->blocksize);
326cb0f0747SSundar Sridharan            if (rv != file->blocksize) {
327cb0f0747SSundar Sridharan                status = FDB_RESULT_READ_FAIL;
32818054ff6SChiyoung Seo                const char *msg = "Unable to read a database file '%s' with "
329c279f70eSChiyoung Seo                    "blocksize %" _F64 "\n";
33018054ff6SChiyoung Seo                DBG(msg, file->filename, file->blocksize);
33118054ff6SChiyoung Seo                fdb_log(log_callback, status, msg, file->filename, file->blocksize);
332cb0f0747SSundar Sridharan                break;
333cb0f0747SSundar Sridharan            }
3346ad4926cSChiyoung Seo            ++block_counter;
335abe1d5a9SSundar Sridharan            memcpy(marker, buf + file->blocksize - BLK_MARKER_SIZE,
336abe1d5a9SSundar Sridharan                   BLK_MARKER_SIZE);
337abe1d5a9SSundar Sridharan
338abe1d5a9SSundar Sridharan            if (marker[0] == BLK_MARKER_DBHEADER) {
339abe1d5a9SSundar Sridharan                // possible need for byte conversions here
3404a0379f5SJung-Sang Ahn                memcpy(&magic,
3414a0379f5SJung-Sang Ahn                       buf + file->blocksize - BLK_MARKER_SIZE - sizeof(magic),
3424a0379f5SJung-Sang Ahn                       sizeof(magic));
3436d79432aSJung-Sang Ahn                magic = _endian_decode(magic);
3446d79432aSJung-Sang Ahn
345abe1d5a9SSundar Sridharan                if (magic == FILEMGR_MAGIC) {
3464a0379f5SJung-Sang Ahn                    memcpy(&len,
3474a0379f5SJung-Sang Ahn                           buf + file->blocksize - BLK_MARKER_SIZE -
3484a0379f5SJung-Sang Ahn                           sizeof(magic) - sizeof(len),
3494a0379f5SJung-Sang Ahn                           sizeof(len));
3506d79432aSJung-Sang Ahn                    len = _endian_decode(len);
3516d79432aSJung-Sang Ahn
3524a0379f5SJung-Sang Ahn                    crc = chksum(buf, len - sizeof(crc));
3534a0379f5SJung-Sang Ahn                    memcpy(&crc_file, buf + len - sizeof(crc), sizeof(crc));
3544a0379f5SJung-Sang Ahn                    crc_file = _endian_decode(crc_file);
3554a0379f5SJung-Sang Ahn                    if (crc == crc_file) {
3564a0379f5SJung-Sang Ahn                        file->header.data = (void *)malloc(len);
3574a0379f5SJung-Sang Ahn
3584a0379f5SJung-Sang Ahn                        memcpy(file->header.data, buf, len);
3594a0379f5SJung-Sang Ahn                        memcpy(&file->header.revnum, buf + len,
3604a0379f5SJung-Sang Ahn                               sizeof(filemgr_header_revnum_t));
3614a0379f5SJung-Sang Ahn                        memcpy((void *) &file->header.seqnum,
3624a0379f5SJung-Sang Ahn                                buf + len + sizeof(filemgr_header_revnum_t),
3634a0379f5SJung-Sang Ahn                                sizeof(fdb_seqnum_t));
3644a0379f5SJung-Sang Ahn                        file->header.revnum =
3654a0379f5SJung-Sang Ahn                            _endian_decode(file->header.revnum);
3664a0379f5SJung-Sang Ahn                        file->header.seqnum =
3674a0379f5SJung-Sang Ahn                            _endian_decode(file->header.seqnum);
3684a0379f5SJung-Sang Ahn                        file->header.size = len;
3690745bc38SJung-Sang Ahn                        atomic_store_uint64_t(&file->header.bid,
370ddbd458dSChiyoung Seo                            (atomic_get_uint64_t(&file->pos) / file->blocksize) - 1);
3710745bc38SJung-Sang Ahn                        atomic_store_uint64_t(&file->header.dirty_idtree_root,
3720745bc38SJung-Sang Ahn                                              BLK_NOT_FOUND);
3730745bc38SJung-Sang Ahn                        atomic_store_uint64_t(&file->header.dirty_seqtree_root,
3740745bc38SJung-Sang Ahn                                              BLK_NOT_FOUND);
375f54fbdb6SJung-Sang Ahn                        memset(&file->header.stat, 0x0, sizeof(file->header.stat));
376590b6bb7SJung-Sang Ahn
3774a0379f5SJung-Sang Ahn                        // release temp buffer
3784a0379f5SJung-Sang Ahn                        _filemgr_release_temp_buf(buf);
3794a0379f5SJung-Sang Ahn
3803c6d74b4SSundar Sridharan                        return FDB_RESULT_SUCCESS;
381abe1d5a9SSundar Sridharan                    } else {
3823c6d74b4SSundar Sridharan                        status = FDB_RESULT_CHECKSUM_ERROR;
38318054ff6SChiyoung Seo                        const char *msg = "Crash Detected: CRC on disk %u != %u "
38418054ff6SChiyoung Seo                            "in a database file '%s'\n";
38518054ff6SChiyoung Seo                        DBG(msg, crc_file, crc, file->filename);
38618054ff6SChiyoung Seo                        fdb_log(log_callback, status, msg, crc_file, crc,
38718054ff6SChiyoung Seo                                file->filename);
388abe1d5a9SSundar Sridharan                    }
389abe1d5a9SSundar Sridharan                } else {
3903c6d74b4SSundar Sridharan                    status = FDB_RESULT_FILE_CORRUPTION;
391c279f70eSChiyoung Seo                    const char *msg = "Crash Detected: Wrong Magic %" _F64 " != %" _F64
392c279f70eSChiyoung Seo                        " in a database file '%s'\n";
39318054ff6SChiyoung Seo                    DBG(msg, magic, FILEMGR_MAGIC, file->filename);
39418054ff6SChiyoung Seo                    fdb_log(log_callback, status, msg, magic, FILEMGR_MAGIC,
39518054ff6SChiyoung Seo                            file->filename);
396abe1d5a9SSundar Sridharan                }
397abe1d5a9SSundar Sridharan            } else {
3986ad4926cSChiyoung Seo                status = FDB_RESULT_NO_DB_HEADERS;
3996ad4926cSChiyoung Seo                if (block_counter == 1) {
4006ad4926cSChiyoung Seo                    const char *msg = "Crash Detected: Last Block not DBHEADER %0.01x "
4016ad4926cSChiyoung Seo                                      "in a database file '%s'\n";
4026ad4926cSChiyoung Seo                    DBG(msg, marker[0], file->filename);
4036ad4926cSChiyoung Seo                    fdb_log(log_callback, status, msg, marker[0], file->filename);
4046ad4926cSChiyoung Seo                }
405e6449f52SJung-Sang Ahn            }
406abe1d5a9SSundar Sridharan
4070745bc38SJung-Sang Ahn            atomic_sub_uint64_t(&file->pos, file->blocksize);
408ddbd458dSChiyoung Seo            atomic_store_uint64_t(&file->last_commit, atomic_get_uint64_t(&file->pos));
409ddbd458dSChiyoung Seo        } while (atomic_get_uint64_t(&file->pos));
4109e14d68eSJung-Sang Ahn    }
411d25f8d6dSJung-Sang Ahn
412d25f8d6dSJung-Sang Ahn    // release temp buffer
413d25f8d6dSJung-Sang Ahn    _filemgr_release_temp_buf(buf);
414abe1d5a9SSundar Sridharan
415818cce61SJung-Sang Ahn    file->header.size = 0;
416e6449f52SJung-Sang Ahn    file->header.revnum = 0;
4173b8a7d04SJung-Sang Ahn    file->header.seqnum = 0;
418818cce61SJung-Sang Ahn    file->header.data = NULL;
4190745bc38SJung-Sang Ahn    atomic_store_uint64_t(&file->header.bid, 0);
4200745bc38SJung-Sang Ahn    atomic_store_uint64_t(&file->header.dirty_idtree_root, BLK_NOT_FOUND);
4210745bc38SJung-Sang Ahn    atomic_store_uint64_t(&file->header.dirty_seqtree_root, BLK_NOT_FOUND);
422f54fbdb6SJung-Sang Ahn    memset(&file->header.stat, 0x0, sizeof(file->header.stat));
423cb0f0747SSundar Sridharan    return status;
4247c0433f5SJung-Sang Ahn}
4257c0433f5SJung-Sang Ahn
42647fac250SJung-Sang Ahnsize_t filemgr_get_ref_count(struct filemgr *file)
42747fac250SJung-Sang Ahn{
42847fac250SJung-Sang Ahn    size_t ret = 0;
42947fac250SJung-Sang Ahn    spin_lock(&file->lock);
43047fac250SJung-Sang Ahn    ret = file->ref_count;
43147fac250SJung-Sang Ahn    spin_unlock(&file->lock);
43247fac250SJung-Sang Ahn    return ret;
43347fac250SJung-Sang Ahn}
43447fac250SJung-Sang Ahn
4355030839bSSundar Sridharanuint64_t filemgr_get_bcache_used_space(void)
4365030839bSSundar Sridharan{
4375030839bSSundar Sridharan    uint64_t bcache_free_space = 0;
4385030839bSSundar Sridharan    if (global_config.ncacheblock) { // If buffer cache is indeed configured
4395030839bSSundar Sridharan        bcache_free_space = bcache_get_num_free_blocks();
4405030839bSSundar Sridharan        bcache_free_space = (global_config.ncacheblock - bcache_free_space)
4415030839bSSundar Sridharan                          * global_config.blocksize;
4425030839bSSundar Sridharan    }
4435030839bSSundar Sridharan    return bcache_free_space;
4445030839bSSundar Sridharan}
4455030839bSSundar Sridharan
4466a3a5c2dSJung-Sang Ahnstruct filemgr_prefetch_args {
4476a3a5c2dSJung-Sang Ahn    struct filemgr *file;
4486a3a5c2dSJung-Sang Ahn    uint64_t duration;
44918054ff6SChiyoung Seo    err_log_callback *log_callback;
4506a3a5c2dSJung-Sang Ahn    void *aux;
4516a3a5c2dSJung-Sang Ahn};
4526a3a5c2dSJung-Sang Ahn
453f80ddf4dSChiyoung Seostatic void *_filemgr_prefetch_thread(void *voidargs)
4546a3a5c2dSJung-Sang Ahn{
4556a3a5c2dSJung-Sang Ahn    struct filemgr_prefetch_args *args = (struct filemgr_prefetch_args*)voidargs;
4566a3a5c2dSJung-Sang Ahn    uint8_t *buf = alca(uint8_t, args->file->blocksize);
4576a3a5c2dSJung-Sang Ahn    uint64_t cur_pos = 0, i;
4586a3a5c2dSJung-Sang Ahn    uint64_t bcache_free_space;
4596a3a5c2dSJung-Sang Ahn    bid_t bid;
4606a3a5c2dSJung-Sang Ahn    bool terminate = false;
4616a3a5c2dSJung-Sang Ahn    struct timeval begin, cur, gap;
4626a3a5c2dSJung-Sang Ahn
4636a3a5c2dSJung-Sang Ahn    spin_lock(&args->file->lock);
464ddbd458dSChiyoung Seo    cur_pos = atomic_get_uint64_t(&args->file->last_commit);
4656a3a5c2dSJung-Sang Ahn    spin_unlock(&args->file->lock);
4666a3a5c2dSJung-Sang Ahn    if (cur_pos < FILEMGR_PREFETCH_UNIT) {
4676a3a5c2dSJung-Sang Ahn        terminate = true;
4686a3a5c2dSJung-Sang Ahn    } else {
4696a3a5c2dSJung-Sang Ahn        cur_pos -= FILEMGR_PREFETCH_UNIT;
4706a3a5c2dSJung-Sang Ahn    }
4716a3a5c2dSJung-Sang Ahn    // read backwards from the end of the file, in the unit of FILEMGR_PREFETCH_UNIT
4726a3a5c2dSJung-Sang Ahn    gettimeofday(&begin, NULL);
4736a3a5c2dSJung-Sang Ahn    while (!terminate) {
4746a3a5c2dSJung-Sang Ahn        for (i = cur_pos;
4756a3a5c2dSJung-Sang Ahn             i < cur_pos + FILEMGR_PREFETCH_UNIT;
4766a3a5c2dSJung-Sang Ahn             i += args->file->blocksize) {
4776a3a5c2dSJung-Sang Ahn
4786a3a5c2dSJung-Sang Ahn            gettimeofday(&cur, NULL);
4796a3a5c2dSJung-Sang Ahn            gap = _utime_gap(begin, cur);
4806a3a5c2dSJung-Sang Ahn            bcache_free_space = bcache_get_num_free_blocks();
4816a3a5c2dSJung-Sang Ahn            bcache_free_space *= args->file->blocksize;
4826a3a5c2dSJung-Sang Ahn
4836a3a5c2dSJung-Sang Ahn            if (args->file->prefetch_status == FILEMGR_PREFETCH_ABORT ||
4845aa14abcSSundar Sridharan                gap.tv_sec >= (int64_t)args->duration ||
4856a3a5c2dSJung-Sang Ahn                bcache_free_space < FILEMGR_PREFETCH_UNIT) {
4866a3a5c2dSJung-Sang Ahn                // terminate thread when
4876a3a5c2dSJung-Sang Ahn                // 1. got abort signal
4886a3a5c2dSJung-Sang Ahn                // 2. time out
4896a3a5c2dSJung-Sang Ahn                // 3. not enough free space in block cache
4906a3a5c2dSJung-Sang Ahn                terminate = true;
4916a3a5c2dSJung-Sang Ahn                break;
4926a3a5c2dSJung-Sang Ahn            } else {
4936a3a5c2dSJung-Sang Ahn                bid = i / args->file->blocksize;
494c6c3d274SChiyoung Seo                if (filemgr_read(args->file, bid, buf, NULL, true)
495cb0f0747SSundar Sridharan                        != FDB_RESULT_SUCCESS) {
496cb0f0747SSundar Sridharan                    // 4. read failure
49718054ff6SChiyoung Seo                    fdb_log(args->log_callback, FDB_RESULT_READ_FAIL,
498c279f70eSChiyoung Seo                            "Prefetch thread failed to read a block with block id %" _F64
499c279f70eSChiyoung Seo                            " from a database file '%s'", bid, args->file->filename);
500cb0f0747SSundar Sridharan                    terminate = true;
501cb0f0747SSundar Sridharan                    break;
502cb0f0747SSundar Sridharan                }
5036a3a5c2dSJung-Sang Ahn            }
5046a3a5c2dSJung-Sang Ahn        }
5056a3a5c2dSJung-Sang Ahn
5066a3a5c2dSJung-Sang Ahn        if (cur_pos >= FILEMGR_PREFETCH_UNIT) {
5076a3a5c2dSJung-Sang Ahn            cur_pos -= FILEMGR_PREFETCH_UNIT;
5086a3a5c2dSJung-Sang Ahn        } else {
5096a3a5c2dSJung-Sang Ahn            // remaining space is less than FILEMGR_PREFETCH_UNIT
5106a3a5c2dSJung-Sang Ahn            terminate = true;
5116a3a5c2dSJung-Sang Ahn        }
5126a3a5c2dSJung-Sang Ahn    }
5136a3a5c2dSJung-Sang Ahn
5146a3a5c2dSJung-Sang Ahn    args->file->prefetch_status = FILEMGR_PREFETCH_IDLE;
5156a3a5c2dSJung-Sang Ahn    free(args);
5166a3a5c2dSJung-Sang Ahn    return NULL;
5176a3a5c2dSJung-Sang Ahn}
5186a3a5c2dSJung-Sang Ahn
5196a3a5c2dSJung-Sang Ahn// prefetch the given DB file
5206a3a5c2dSJung-Sang Ahnvoid filemgr_prefetch(struct filemgr *file,
52118054ff6SChiyoung Seo                      struct filemgr_config *config,
52218054ff6SChiyoung Seo                      err_log_callback *log_callback)
5236a3a5c2dSJung-Sang Ahn{
5246a3a5c2dSJung-Sang Ahn    uint64_t bcache_free_space;
5256a3a5c2dSJung-Sang Ahn
5266a3a5c2dSJung-Sang Ahn    bcache_free_space = bcache_get_num_free_blocks();
5276a3a5c2dSJung-Sang Ahn    bcache_free_space *= file->blocksize;
5286a3a5c2dSJung-Sang Ahn
5296a3a5c2dSJung-Sang Ahn    // block cache should have free space larger than FILEMGR_PREFETCH_UNIT
5306a3a5c2dSJung-Sang Ahn    spin_lock(&file->lock);
531ddbd458dSChiyoung Seo    if (atomic_get_uint64_t(&file->last_commit) > 0 &&
5326a3a5c2dSJung-Sang Ahn        bcache_free_space >= FILEMGR_PREFETCH_UNIT) {
5336a3a5c2dSJung-Sang Ahn        // invoke prefetch thread
5346a3a5c2dSJung-Sang Ahn        struct filemgr_prefetch_args *args;
5356a3a5c2dSJung-Sang Ahn        args = (struct filemgr_prefetch_args *)
5366a3a5c2dSJung-Sang Ahn               calloc(1, sizeof(struct filemgr_prefetch_args));
5376a3a5c2dSJung-Sang Ahn        args->file = file;
5386a3a5c2dSJung-Sang Ahn        args->duration = config->prefetch_duration;
53918054ff6SChiyoung Seo        args->log_callback = log_callback;
5406a3a5c2dSJung-Sang Ahn
5416a3a5c2dSJung-Sang Ahn        file->prefetch_status = FILEMGR_PREFETCH_RUNNING;
5426a3a5c2dSJung-Sang Ahn        thread_create(&file->prefetch_tid, _filemgr_prefetch_thread, args);
5436a3a5c2dSJung-Sang Ahn    }
5446a3a5c2dSJung-Sang Ahn    spin_unlock(&file->lock);
5456a3a5c2dSJung-Sang Ahn}
5466a3a5c2dSJung-Sang Ahn
547d1c2e4d7SSundar Sridharanfdb_status filemgr_does_file_exist(char *filename) {
548d1c2e4d7SSundar Sridharan    struct filemgr_ops *ops = get_filemgr_ops();
549d1c2e4d7SSundar Sridharan    int fd = ops->open(filename, O_RDONLY, 0444);
550d1c2e4d7SSundar Sridharan    if (fd < 0) {
551d1c2e4d7SSundar Sridharan        return (fdb_status) fd;
552d1c2e4d7SSundar Sridharan    }
553d1c2e4d7SSundar Sridharan    ops->close(fd);
554d1c2e4d7SSundar Sridharan    return FDB_RESULT_SUCCESS;
555d1c2e4d7SSundar Sridharan}
556d1c2e4d7SSundar Sridharan
557a765ad59SChiyoung Seofilemgr_open_result filemgr_open(char *filename, struct filemgr_ops *ops,
558a765ad59SChiyoung Seo                                 struct filemgr_config *config,
559a765ad59SChiyoung Seo                                 err_log_callback *log_callback)
5607c0433f5SJung-Sang Ahn{
561d9045a26SChiyoung Seo    struct filemgr *file = NULL;
562d9045a26SChiyoung Seo    struct filemgr query;
563d9045a26SChiyoung Seo    struct hash_elem *e = NULL;
564a765ad59SChiyoung Seo    bool create = config->options & FILEMGR_CREATE;
5652534ac38SJung-Sang Ahn    int file_flag = 0x0;
566d5c97dcbSSundar Sridharan    int fd = -1;
5673c6d74b4SSundar Sridharan    fdb_status status;
568a765ad59SChiyoung Seo    filemgr_open_result result = {NULL, FDB_RESULT_OPEN_FAIL};
569d9045a26SChiyoung Seo
57059c1c4f5SJung-Sang Ahn    filemgr_init(config);
571d9045a26SChiyoung Seo
572d9045a26SChiyoung Seo    // check whether file is already opened or not
573d9045a26SChiyoung Seo    query.filename = filename;
5743d812dfcSJung-Sang Ahn    spin_lock(&filemgr_openlock);
575d9045a26SChiyoung Seo    e = hash_find(&hash, &query.e);
576d9045a26SChiyoung Seo
577d9045a26SChiyoung Seo    if (e) {
5787be5b070SJung-Sang Ahn        // already opened (return existing structure)
579d9045a26SChiyoung Seo        file = _get_entry(e, struct filemgr, e);
580e6449f52SJung-Sang Ahn
5813d812dfcSJung-Sang Ahn        spin_lock(&file->lock);
582d9045a26SChiyoung Seo        file->ref_count++;
583d5bf4155SChiyoung Seo
584ddbd458dSChiyoung Seo        if (atomic_get_uint8_t(&file->status) == FILE_CLOSED) { // if file was closed before
585352faaaaSChiyoung Seo            file_flag = O_RDWR;
586a765ad59SChiyoung Seo            if (create) {
587a765ad59SChiyoung Seo                file_flag |= O_CREAT;
588a765ad59SChiyoung Seo            }
589add47b41SJung-Sang Ahn            *file->config = *config;
590add47b41SJung-Sang Ahn            file->config->blocksize = global_config.blocksize;
591add47b41SJung-Sang Ahn            file->config->ncacheblock = global_config.ncacheblock;
592d0a4f834SChiyoung Seo            file_flag |= config->flag;
5932534ac38SJung-Sang Ahn            file->fd = file->ops->open(file->filename, file_flag, 0666);
594d5c97dcbSSundar Sridharan            if (file->fd < 0) {
595d5bf4155SChiyoung Seo                if (file->fd == FDB_RESULT_NO_SUCH_FILE) {
596d5bf4155SChiyoung Seo                    // A database file was manually deleted by the user.
597d5bf4155SChiyoung Seo                    // Clean up global hash table, WAL index, and buffer cache.
598a58fe5bbSSundar Sridharan                    // Then, retry it with a create option below IFF it is not
599a58fe5bbSSundar Sridharan                    // a read-only open attempt
600beee2b43SJung-Sang Ahn                    struct hash_elem *ret;
601d5bf4155SChiyoung Seo                    spin_unlock(&file->lock);
602beee2b43SJung-Sang Ahn                    ret = hash_remove(&hash, &file->e);
603b6be7a1dSSundar Sridharan                    fdb_assert(ret, 0, 0);
6045f7821baSChiyoung Seo                    filemgr_free_func(&file->e);
605a58fe5bbSSundar Sridharan                    if (!create) {
606a58fe5bbSSundar Sridharan                        _log_errno_str(ops, log_callback,
607a58fe5bbSSundar Sridharan                                FDB_RESULT_NO_SUCH_FILE, "OPEN", filename);
608a58fe5bbSSundar Sridharan                        spin_unlock(&filemgr_openlock);
609a58fe5bbSSundar Sridharan                        result.rv = FDB_RESULT_NO_SUCH_FILE;
610a58fe5bbSSundar Sridharan                        return result;
611a58fe5bbSSundar Sridharan                    }
612d5bf4155SChiyoung Seo                } else {
613a58fe5bbSSundar Sridharan                    _log_errno_str(file->ops, log_callback,
614a58fe5bbSSundar Sridharan                                  (fdb_status)file->fd, "OPEN", filename);
615d5c97dcbSSundar Sridharan                    file->ref_count--;
616d5c97dcbSSundar Sridharan                    spin_unlock(&file->