1bc68bb02SChiyoung Seo/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
27c0433f5SJung-Sang Ahn/*
3bc68bb02SChiyoung Seo *     Copyright 2010 Couchbase, Inc
4bc68bb02SChiyoung Seo *
5bc68bb02SChiyoung Seo *   Licensed under the Apache License, Version 2.0 (the "License");
6bc68bb02SChiyoung Seo *   you may not use this file except in compliance with the License.
7bc68bb02SChiyoung Seo *   You may obtain a copy of the License at
8bc68bb02SChiyoung Seo *
9bc68bb02SChiyoung Seo *       http://www.apache.org/licenses/LICENSE-2.0
10bc68bb02SChiyoung Seo *
11bc68bb02SChiyoung Seo *   Unless required by applicable law or agreed to in writing, software
12bc68bb02SChiyoung Seo *   distributed under the License is distributed on an "AS IS" BASIS,
13bc68bb02SChiyoung Seo *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14bc68bb02SChiyoung Seo *   See the License for the specific language governing permissions and
15bc68bb02SChiyoung Seo *   limitations under the License.
167c0433f5SJung-Sang Ahn */
177c0433f5SJung-Sang Ahn
187c0433f5SJung-Sang Ahn#include <stdio.h>
197c0433f5SJung-Sang Ahn#include <stdlib.h>
207c0433f5SJung-Sang Ahn#include <string.h>
217c0433f5SJung-Sang Ahn#include <fcntl.h>
222534ac38SJung-Sang Ahn#include <sys/stat.h>
23f9a2bf6bSJens Alfke#include <stdarg.h>
246a3a5c2dSJung-Sang Ahn#if !defined(WIN32) && !defined(_WIN32)
256a3a5c2dSJung-Sang Ahn#include <sys/time.h>
26e94d4c0aSsduvuru#include <libgen.h>
276a3a5c2dSJung-Sang Ahn#endif
289c4bf803SJung-Sang Ahn#include <time.h>
297c0433f5SJung-Sang Ahn
307c0433f5SJung-Sang Ahn#include "filemgr.h"
3147ec8bfcSSundar Sridharan#include "filemgr_ops.h"
327c0433f5SJung-Sang Ahn#include "hash_functions.h"
337c0433f5SJung-Sang Ahn#include "blockcache.h"
347c0433f5SJung-Sang Ahn#include "wal.h"
35d25f8d6dSJung-Sang Ahn#include "list.h"
36f9a2bf6bSJens Alfke#include "fdb_internal.h"
376a3a5c2dSJung-Sang Ahn#include "time_utils.h"
384690a73fSJens Alfke#include "encryption.h"
393c6e2608SJung-Sang Ahn#include "version.h"
407c0433f5SJung-Sang Ahn
413d812dfcSJung-Sang Ahn#include "memleak.h"
423d812dfcSJung-Sang Ahn
437c0433f5SJung-Sang Ahn#ifdef __DEBUG
447c0433f5SJung-Sang Ahn#ifndef __DEBUG_FILEMGR
452889254eSJung-Sang Ahn    #undef DBG
462889254eSJung-Sang Ahn    #undef DBGCMD
47eea9c5e9SJung-Sang Ahn    #undef DBGSW
48ceca3b9fSJung-Sang Ahn    #define DBG(...)
49ceca3b9fSJung-Sang Ahn    #define DBGCMD(...)
50ceca3b9fSJung-Sang Ahn    #define DBGSW(n, ...)
517c0433f5SJung-Sang Ahn#endif
527c0433f5SJung-Sang Ahn#endif
537c0433f5SJung-Sang Ahn
// NBUCKET must be a power of 2: _file_hash() reduces the checksum to a bucket
// index by masking it with (NBUCKET-1).
#define NBUCKET (1024)

// global static variables

// 'initial_lock' serializes the one-time global setup done in filemgr_init().
#ifdef SPIN_INITIALIZER
static spin_t initial_lock = SPIN_INITIALIZER;
#else
// Without a static initializer the lock itself is created at runtime;
// 'initial_lock_status' tracks that: 0 = not created, 1 = being created,
// 2 = ready (see filemgr_init()).
static volatile unsigned int initial_lock_status = 0;
static spin_t initial_lock;
#endif


// Set to 1 by filemgr_init() once the global state below has been set up.
static volatile uint8_t filemgr_initialized = 0;
// Defined in another translation unit.
extern volatile uint8_t bgflusher_initialized;
// Copy of the configuration passed to the first filemgr_init() call.
static struct filemgr_config global_config;
// Table of 'struct filemgr' instances, hashed and compared by file name
// (see _file_hash() / _file_cmp()).
static struct hash hash;
// Global lock, initialized in filemgr_init().
static spin_t filemgr_openlock;

static const int MAX_STAT_UPDATE_RETRIES = 5;

// Bookkeeping record for one temporary buffer. It is stored inside the same
// allocation as the buffer, right after the first 'blocksize' bytes
// (see _filemgr_get_temp_buf()).
struct temp_buf_item{
    void *addr;           // start of the allocation (the buffer itself)
    struct list_elem le;  // link in the 'temp_buf' free list
};
// Free list of temporary buffers, protected by 'temp_buf_lock'.
static struct list temp_buf;
static spin_t temp_buf_lock;

// Lazy file deletion settings; all three are assigned together by
// filemgr_set_lazy_file_deletion().
static bool lazy_file_deletion_enabled = false;
static register_file_removal_func register_file_removal = NULL;
static check_file_removal_func is_file_removed = NULL;

// Superblock operation table; zeroed in filemgr_init() and replaced by
// filemgr_set_sb_operation().
static struct sb_ops sb_ops;
86f0b1bf77SJung-Sang Ahn
8730a02785SJung-Sang Ahnstatic void spin_init_wrap(void *lock) {
8830a02785SJung-Sang Ahn    spin_init((spin_t*)lock);
8930a02785SJung-Sang Ahn}
9030a02785SJung-Sang Ahn
9130a02785SJung-Sang Ahnstatic void spin_destroy_wrap(void *lock) {
9230a02785SJung-Sang Ahn    spin_destroy((spin_t*)lock);
9330a02785SJung-Sang Ahn}
9430a02785SJung-Sang Ahn
9530a02785SJung-Sang Ahnstatic void spin_lock_wrap(void *lock) {
9630a02785SJung-Sang Ahn    spin_lock((spin_t*)lock);
9730a02785SJung-Sang Ahn}
9830a02785SJung-Sang Ahn
9930a02785SJung-Sang Ahnstatic void spin_unlock_wrap(void *lock) {
10030a02785SJung-Sang Ahn    spin_unlock((spin_t*)lock);
10130a02785SJung-Sang Ahn}
10230a02785SJung-Sang Ahn
10330a02785SJung-Sang Ahnstatic void mutex_init_wrap(void *lock) {
10430a02785SJung-Sang Ahn    mutex_init((mutex_t*)lock);
10530a02785SJung-Sang Ahn}
10630a02785SJung-Sang Ahn
10730a02785SJung-Sang Ahnstatic void mutex_destroy_wrap(void *lock) {
10830a02785SJung-Sang Ahn    mutex_destroy((mutex_t*)lock);
10930a02785SJung-Sang Ahn}
11030a02785SJung-Sang Ahn
11130a02785SJung-Sang Ahnstatic void mutex_lock_wrap(void *lock) {
11230a02785SJung-Sang Ahn    mutex_lock((mutex_t*)lock);
11330a02785SJung-Sang Ahn}
11430a02785SJung-Sang Ahn
11530a02785SJung-Sang Ahnstatic void mutex_unlock_wrap(void *lock) {
11630a02785SJung-Sang Ahn    mutex_unlock((mutex_t*)lock);
11730a02785SJung-Sang Ahn}
11830a02785SJung-Sang Ahn
119272e741fSSundar Sridharanstatic int _kvs_stat_cmp(struct avl_node *a, struct avl_node *b, void *aux)
120272e741fSSundar Sridharan{
121272e741fSSundar Sridharan    struct kvs_node *aa, *bb;
122272e741fSSundar Sridharan    aa = _get_entry(a, struct kvs_node, avl_id);
123272e741fSSundar Sridharan    bb = _get_entry(b, struct kvs_node, avl_id);
124272e741fSSundar Sridharan
125272e741fSSundar Sridharan    if (aa->id < bb->id) {
126272e741fSSundar Sridharan        return -1;
127272e741fSSundar Sridharan    } else if (aa->id > bb->id) {
128272e741fSSundar Sridharan        return 1;
129272e741fSSundar Sridharan    } else {
130272e741fSSundar Sridharan        return 0;
131272e741fSSundar Sridharan    }
132272e741fSSundar Sridharan}
133272e741fSSundar Sridharan
13430a02785SJung-Sang Ahnstatic int _block_is_overlapped(void *pbid1, void *pis_writer1,
13530a02785SJung-Sang Ahn                                void *pbid2, void *pis_writer2,
13630a02785SJung-Sang Ahn                                void *aux)
13730a02785SJung-Sang Ahn{
13830a02785SJung-Sang Ahn    (void)aux;
13930a02785SJung-Sang Ahn    bid_t bid1, is_writer1, bid2, is_writer2;
14030a02785SJung-Sang Ahn    bid1 = *(bid_t*)pbid1;
14130a02785SJung-Sang Ahn    is_writer1 = *(bid_t*)pis_writer1;
14230a02785SJung-Sang Ahn    bid2 = *(bid_t*)pbid2;
14330a02785SJung-Sang Ahn    is_writer2 = *(bid_t*)pis_writer2;
14430a02785SJung-Sang Ahn
14530a02785SJung-Sang Ahn    if (bid1 != bid2) {
14630a02785SJung-Sang Ahn        // not overlapped
14730a02785SJung-Sang Ahn        return 0;
14830a02785SJung-Sang Ahn    } else {
14930a02785SJung-Sang Ahn        // overlapped
15030a02785SJung-Sang Ahn        if (!is_writer1 && !is_writer2) {
15130a02785SJung-Sang Ahn            // both are readers
15230a02785SJung-Sang Ahn            return 0;
15330a02785SJung-Sang Ahn        } else {
15430a02785SJung-Sang Ahn            return 1;
15530a02785SJung-Sang Ahn        }
15630a02785SJung-Sang Ahn    }
15730a02785SJung-Sang Ahn}
15830a02785SJung-Sang Ahn
// Writes the current local time into 'buffer' as an ISO-8601 style string:
//
//     2017-06-22T10:00:00.123-05:00
//
// 'buffer_len' is the full capacity of 'buffer' in bytes, including the
// terminating NUL. The function never writes more than 'buffer_len' bytes
// and always leaves 'buffer' NUL-terminated (when buffer_len > 0).
//
// The string is built from three components; a component that does not fit
// entirely in the remaining space is dropped together with everything after
// it, so the result is one of:
//   ""                                (not even the date/time part fits)
//   "2017-06-22T10:00:00"
//   "2017-06-22T10:00:00.123"
//   "2017-06-22T10:00:00.123-05:00"
// The timezone suffix is also omitted when strftime("%z") does not yield a
// "+hhmm" style offset.
void printISOTime(char* buffer, size_t buffer_len) {
    if (buffer == NULL || buffer_len == 0) {
        return;
    }
    buffer[0] = '\0';

    time_t rawtime;
    time(&rawtime);
    struct tm *tm_now = localtime(&rawtime);
    if (tm_now == NULL) {
        return;
    }

    // 2017-06-22T10:00:00
    size_t time_len = strftime(buffer, buffer_len,
                               "%Y-%m-%dT%H:%M:%S", tm_now);
    if (time_len == 0) {
        // Did not fit: strftime leaves the contents unspecified in this case,
        // so reset to an empty string instead of appending to garbage.
        buffer[0] = '\0';
        return;
    }

    // Add milliseconds: ".123" needs 4 characters plus the NUL.
    if (buffer_len - time_len < 5) {
        return;
    }
    struct timeval cur_time;
    gettimeofday(&cur_time, NULL);
    int milli = (int)(cur_time.tv_usec / 1000);
    // 2017-06-22T10:00:00.123
    snprintf(buffer + time_len, buffer_len - time_len, ".%03d", milli);
    time_len += 4;

    // timezone offset format: -0500
    char tz_offset_str[6];
    size_t offset_len = strftime(tz_offset_str, sizeof(tz_offset_str),
                                 "%z", tm_now);
    if (offset_len < 5) {
        // Time zone info is not supported, skip it.
        return;
    }

    // "-05:00" needs 6 characters plus the NUL.
    if (buffer_len - time_len < 7) {
        return;
    }
    // sign + hour, ':', minutes
    snprintf(buffer + time_len, buffer_len - time_len,
             "%.3s:%.2s", tz_offset_str, tz_offset_str + 3);
    // final format: 2017-06-22T10:00:00.123-05:00
}
1949c4bf803SJung-Sang Ahn
195f9a2bf6bSJens Alfkefdb_status fdb_log(err_log_callback *log_callback,
196f9a2bf6bSJens Alfke                   fdb_status status,
197f9a2bf6bSJens Alfke                   const char *format, ...)
198f9a2bf6bSJens Alfke{
199b84960c2SJung-Sang Ahn    char msg[4096];
200b84960c2SJung-Sang Ahn    va_list args;
201b84960c2SJung-Sang Ahn    va_start(args, format);
202b84960c2SJung-Sang Ahn    vsprintf(msg, format, args);
203b84960c2SJung-Sang Ahn    va_end(args);
204b84960c2SJung-Sang Ahn
205f9a2bf6bSJens Alfke    if (log_callback && log_callback->callback) {
206f9a2bf6bSJens Alfke        log_callback->callback(status, msg, log_callback->ctx_data);
207b84960c2SJung-Sang Ahn    } else {
2089c4bf803SJung-Sang Ahn        char ISO_time_buffer[64];
2099c4bf803SJung-Sang Ahn        printISOTime(ISO_time_buffer, 64);
2101ccfc6d3SSundar Sridharan        if (status != FDB_RESULT_SUCCESS) {
2119c4bf803SJung-Sang Ahn            fprintf(stderr, "%s [ERRO][FDB] %s\n", ISO_time_buffer, msg);
2121ccfc6d3SSundar Sridharan        } else {
2139c4bf803SJung-Sang Ahn            fprintf(stderr, "%s [INFO][FDB] %s\n", ISO_time_buffer, msg);
2141ccfc6d3SSundar Sridharan        }
215f9a2bf6bSJens Alfke    }
216f9a2bf6bSJens Alfke    return status;
217f9a2bf6bSJens Alfke}
218f9a2bf6bSJens Alfke
21901e33605SChiyoung Seostatic void _log_errno_str(struct filemgr_ops *ops,
22001e33605SChiyoung Seo                           err_log_callback *log_callback,
221f9a2bf6bSJens Alfke                           fdb_status io_error,
222f9a2bf6bSJens Alfke                           const char *what,
223f9a2bf6bSJens Alfke                           const char *filename)
224f9a2bf6bSJens Alfke{
225f9a2bf6bSJens Alfke    if (io_error < 0) {
22601e33605SChiyoung Seo        char errno_msg[512];
22701e33605SChiyoung Seo        ops->get_errno_str(errno_msg, 512);
228f9a2bf6bSJens Alfke        fdb_log(log_callback, io_error,
229f9a2bf6bSJens Alfke                "Error in %s on a database file '%s', %s", what, filename, errno_msg);
23001e33605SChiyoung Seo    }
23101e33605SChiyoung Seo}
23201e33605SChiyoung Seo
233f80ddf4dSChiyoung Seostatic uint32_t _file_hash(struct hash *hash, struct hash_elem *e)
2347c0433f5SJung-Sang Ahn{
2352889254eSJung-Sang Ahn    struct filemgr *file = _get_entry(e, struct filemgr, e);
2362889254eSJung-Sang Ahn    int len = strlen(file->filename);
23776e16138SJim Walker
23876e16138SJim Walker    return get_checksum(reinterpret_cast<const uint8_t*>(file->filename), len) &
23976e16138SJim Walker                        ((unsigned)(NBUCKET-1));
2407c0433f5SJung-Sang Ahn}
2417c0433f5SJung-Sang Ahn
242f80ddf4dSChiyoung Seostatic int _file_cmp(struct hash_elem *a, struct hash_elem *b)
2437c0433f5SJung-Sang Ahn{
2442889254eSJung-Sang Ahn    struct filemgr *aa, *bb;
2452889254eSJung-Sang Ahn    aa = _get_entry(a, struct filemgr, e);
2462889254eSJung-Sang Ahn    bb = _get_entry(b, struct filemgr, e);
2472889254eSJung-Sang Ahn    return strcmp(aa->filename, bb->filename);
2487c0433f5SJung-Sang Ahn}
2497c0433f5SJung-Sang Ahn
250eccdfcd9SJung-Sang Ahnvoid filemgr_init(struct filemgr_config *config)
2517c0433f5SJung-Sang Ahn{
25259c1c4f5SJung-Sang Ahn    // global initialization
25359c1c4f5SJung-Sang Ahn    // initialized only once at first time
2542889254eSJung-Sang Ahn    if (!filemgr_initialized) {
25559c1c4f5SJung-Sang Ahn#ifndef SPIN_INITIALIZER
25659c1c4f5SJung-Sang Ahn        // Note that only Windows passes through this routine
25759c1c4f5SJung-Sang Ahn        if (InterlockedCompareExchange(&initial_lock_status, 1, 0) == 0) {
25859c1c4f5SJung-Sang Ahn            // atomically initialize spin lock only once
25959c1c4f5SJung-Sang Ahn            spin_init(&initial_lock);
26059c1c4f5SJung-Sang Ahn            initial_lock_status = 2;
26159c1c4f5SJung-Sang Ahn        } else {
2625aa14abcSSundar Sridharan            // the others ... wait until initializing 'initial_lock' is done
26359c1c4f5SJung-Sang Ahn            while (initial_lock_status != 2) {
26459c1c4f5SJung-Sang Ahn                Sleep(1);
26559c1c4f5SJung-Sang Ahn            }
26659c1c4f5SJung-Sang Ahn        }
26759c1c4f5SJung-Sang Ahn#endif
2682889254eSJung-Sang Ahn
26959c1c4f5SJung-Sang Ahn        spin_lock(&initial_lock);
27059c1c4f5SJung-Sang Ahn        if (!filemgr_initialized) {
271f0b1bf77SJung-Sang Ahn            memset(&sb_ops, 0x0, sizeof(sb_ops));
27259c1c4f5SJung-Sang Ahn            global_config = *config;
27359c1c4f5SJung-Sang Ahn
27459c1c4f5SJung-Sang Ahn            if (global_config.ncacheblock > 0)
27559c1c4f5SJung-Sang Ahn                bcache_init(global_config.ncacheblock, global_config.blocksize);
276abe1d5a9SSundar Sridharan
27759c1c4f5SJung-Sang Ahn            hash_init(&hash, NBUCKET, _file_hash, _file_cmp);
2782889254eSJung-Sang Ahn
27959c1c4f5SJung-Sang Ahn            // initialize temp buffer
28059c1c4f5SJung-Sang Ahn            list_init(&temp_buf);
28159c1c4f5SJung-Sang Ahn            spin_init(&temp_buf_lock);
2823d812dfcSJung-Sang Ahn
28359c1c4f5SJung-Sang Ahn            // initialize global lock
28459c1c4f5SJung-Sang Ahn            spin_init(&filemgr_openlock);
285d25f8d6dSJung-Sang Ahn
28659c1c4f5SJung-Sang Ahn            // set the initialize flag
28759c1c4f5SJung-Sang Ahn            filemgr_initialized = 1;
28859c1c4f5SJung-Sang Ahn        }
28959c1c4f5SJung-Sang Ahn        spin_unlock(&initial_lock);
2902889254eSJung-Sang Ahn    }
2917c0433f5SJung-Sang Ahn}
2927c0433f5SJung-Sang Ahn
293d6eb040eSJung-Sang Ahnvoid filemgr_set_lazy_file_deletion(bool enable,
294d6eb040eSJung-Sang Ahn                                    register_file_removal_func regis_func,
295d6eb040eSJung-Sang Ahn                                    check_file_removal_func check_func)
296d6eb040eSJung-Sang Ahn{
297d6eb040eSJung-Sang Ahn    lazy_file_deletion_enabled = enable;
298d6eb040eSJung-Sang Ahn    register_file_removal = regis_func;
299d6eb040eSJung-Sang Ahn    is_file_removed = check_func;
300d6eb040eSJung-Sang Ahn}
301d6eb040eSJung-Sang Ahn
302f0b1bf77SJung-Sang Ahnvoid filemgr_set_sb_operation(struct sb_ops ops)
303f0b1bf77SJung-Sang Ahn{
304f0b1bf77SJung-Sang Ahn    sb_ops = ops;
305f0b1bf77SJung-Sang Ahn}
306f0b1bf77SJung-Sang Ahn
307f80ddf4dSChiyoung Seostatic void * _filemgr_get_temp_buf()
3089e14d68eSJung-Sang Ahn{
309d25f8d6dSJung-Sang Ahn    struct list_elem *e;
310d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
311d25f8d6dSJung-Sang Ahn
3129e14d68eSJung-Sang Ahn    spin_lock(&temp_buf_lock);
313d25f8d6dSJung-Sang Ahn    e = list_pop_front(&temp_buf);
314d25f8d6dSJung-Sang Ahn    if (e) {
315d25f8d6dSJung-Sang Ahn        item = _get_entry(e, struct temp_buf_item, le);
31618054ff6SChiyoung Seo    } else {
3175d00796bSabhinavdangeti        void *addr = NULL;
318abe1d5a9SSundar Sridharan
31918054ff6SChiyoung Seo        malloc_align(addr, FDB_SECTOR_SIZE,
32018054ff6SChiyoung Seo                     global_config.blocksize + sizeof(struct temp_buf_item));
321d25f8d6dSJung-Sang Ahn
3228e1b9ec2SJung-Sang Ahn        item = (struct temp_buf_item *)((uint8_t *) addr + global_config.blocksize);
323d25f8d6dSJung-Sang Ahn        item->addr = addr;
324d25f8d6dSJung-Sang Ahn    }
325d25f8d6dSJung-Sang Ahn    spin_unlock(&temp_buf_lock);
326abe1d5a9SSundar Sridharan
327d25f8d6dSJung-Sang Ahn    return item->addr;
328d25f8d6dSJung-Sang Ahn}
329d25f8d6dSJung-Sang Ahn
330f80ddf4dSChiyoung Seostatic void _filemgr_release_temp_buf(void *buf)
331d25f8d6dSJung-Sang Ahn{
332d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
333d25f8d6dSJung-Sang Ahn
334d25f8d6dSJung-Sang Ahn    spin_lock(&temp_buf_lock);
3358e1b9ec2SJung-Sang Ahn    item = (struct temp_buf_item*)((uint8_t *)buf + global_config.blocksize);
336d25f8d6dSJung-Sang Ahn    list_push_front(&temp_buf, &item->le);
337abe1d5a9SSundar Sridharan    spin_unlock(&temp_buf_lock);
338d25f8d6dSJung-Sang Ahn}
339d25f8d6dSJung-Sang Ahn
340f80ddf4dSChiyoung Seostatic void _filemgr_shutdown_temp_buf()
341d25f8d6dSJung-Sang Ahn{
342d25f8d6dSJung-Sang Ahn    struct list_elem *e;
343d25f8d6dSJung-Sang Ahn    struct temp_buf_item *item;
344d25f8d6dSJung-Sang Ahn    size_t count=0;
345d25f8d6dSJung-Sang Ahn
346d25f8d6dSJung-Sang Ahn    spin_lock(&temp_buf_lock);
347d25f8d6dSJung-Sang Ahn    e = list_begin(&temp_buf);
348d25f8d6dSJung-Sang Ahn    while(e){
349d25f8d6dSJung-Sang Ahn        item = _get_entry(e, struct temp_buf_item, le);
350d25f8d6dSJung-Sang Ahn        e = list_remove(&temp_buf, e);
351f693a021SSundar Sridharan        free_align(item->addr);
352d25f8d6dSJung-Sang Ahn        count++;
353d25f8d6dSJung-Sang Ahn    }
3549e14d68eSJung-Sang Ahn    spin_unlock(&temp_buf_lock);
3559e14d68eSJung-Sang Ahn}
3569e14d68eSJung-Sang Ahn
3574690a73fSJens Alfke// Read a block from the file, decrypting if necessary.
3584690a73fSJens Alfkestatic ssize_t filemgr_read_block(struct filemgr *file, void *buf, bid_t bid) {
3591de419c8SSundar Sridharan    ssize_t result = file->ops->pread(file->fd, buf, file->blocksize,
3601de419c8SSundar Sridharan                                      file->blocksize*bid);
3614690a73fSJens Alfke    if (file->encryption.ops && result > 0) {
3621de419c8SSundar Sridharan        if (result != (ssize_t)file->blocksize)
3634690a73fSJens Alfke            return FDB_RESULT_READ_FAIL;
3644690a73fSJens Alfke        fdb_status status = fdb_decrypt_block(&file->encryption, buf, result, bid);
3654690a73fSJens Alfke        if (status != FDB_RESULT_SUCCESS)
3664690a73fSJens Alfke            return status;
3674690a73fSJens Alfke    }
3684690a73fSJens Alfke    return result;
3694690a73fSJens Alfke}
3704690a73fSJens Alfke
3714690a73fSJens Alfke// Write consecutive block(s) to the file, encrypting if necessary.
3724690a73fSJens Alfkessize_t filemgr_write_blocks(struct filemgr *file, void *buf, unsigned num_blocks, bid_t start_bid) {
3734690a73fSJens Alfke    size_t blocksize = file->blocksize;
3744690a73fSJens Alfke    cs_off_t offset = start_bid * blocksize;
3754690a73fSJens Alfke    size_t nbytes = num_blocks * blocksize;
3764690a73fSJens Alfke    if (file->encryption.ops == NULL) {
3774690a73fSJens Alfke        return file->ops->pwrite(file->fd, buf, nbytes, offset);
3784690a73fSJens Alfke    } else {
3794690a73fSJens Alfke        uint8_t *encrypted_buf;
3804690a73fSJens Alfke        if (nbytes > 4096)
3814690a73fSJens Alfke            encrypted_buf = (uint8_t*)malloc(nbytes);
3824690a73fSJens Alfke        else
3834690a73fSJens Alfke            encrypted_buf = alca(uint8_t, nbytes); // most common case (writing single block)
3844690a73fSJens Alfke        if (!encrypted_buf)
3854690a73fSJens Alfke            return FDB_RESULT_ALLOC_FAIL;
3864690a73fSJens Alfke        fdb_status status = fdb_encrypt_blocks(&file->encryption,
3874690a73fSJens Alfke                                               encrypted_buf,
3884690a73fSJens Alfke                                               buf,
3894690a73fSJens Alfke                                               blocksize,
3904690a73fSJens Alfke                                               num_blocks,
3914690a73fSJens Alfke                                               start_bid);
3924690a73fSJens Alfke        if (nbytes > 4096)
3934690a73fSJens Alfke            free(encrypted_buf);
3944690a73fSJens Alfke        if (status != FDB_RESULT_SUCCESS)
3954690a73fSJens Alfke            return status;
3964690a73fSJens Alfke        return file->ops->pwrite(file->fd, encrypted_buf, nbytes, offset);
3974690a73fSJens Alfke    }
3984690a73fSJens Alfke}
3994690a73fSJens Alfke
400f0b1bf77SJung-Sang Ahnint filemgr_is_writable(struct filemgr *file, bid_t bid)
401f0b1bf77SJung-Sang Ahn{
4021f644b8dSJung-Sang Ahn    if (sb_bmp_exists(file->sb) && sb_ops.is_writable) {
403f0b1bf77SJung-Sang Ahn        // block reusing is enabled
404f0b1bf77SJung-Sang Ahn        return sb_ops.is_writable(file, bid);
405f0b1bf77SJung-Sang Ahn    } else {
406f0b1bf77SJung-Sang Ahn        uint64_t pos = bid * file->blocksize;
407f0b1bf77SJung-Sang Ahn        // Note that we don't need to grab file->lock here because
408f0b1bf77SJung-Sang Ahn        // 1) both file->pos and file->last_commit are only incremented.
409f0b1bf77SJung-Sang Ahn        // 2) file->last_commit is updated using the value of file->pos,
410f0b1bf77SJung-Sang Ahn        //    and always equal to or smaller than file->pos.
411f0b1bf77SJung-Sang Ahn        return (pos <  atomic_get_uint64_t(&file->pos) &&
412f0b1bf77SJung-Sang Ahn                pos >= atomic_get_uint64_t(&file->last_commit));
413f0b1bf77SJung-Sang Ahn    }
414f0b1bf77SJung-Sang Ahn}
415f0b1bf77SJung-Sang Ahn
41697bd3815SJung-Sang Ahnuint64_t filemgr_get_sb_bmp_revnum(struct filemgr *file)
41797bd3815SJung-Sang Ahn{
41897bd3815SJung-Sang Ahn    if (file->sb && sb_ops.get_bmp_revnum) {
41997bd3815SJung-Sang Ahn        return sb_ops.get_bmp_revnum(file);
42097bd3815SJung-Sang Ahn    } else {
42197bd3815SJung-Sang Ahn        return 0;
42297bd3815SJung-Sang Ahn    }
42397bd3815SJung-Sang Ahn}
42497bd3815SJung-Sang Ahn
42518054ff6SChiyoung Seostatic fdb_status _filemgr_read_header(struct filemgr *file,
42618054ff6SChiyoung Seo                                       err_log_callback *log_callback)
4277c0433f5SJung-Sang Ahn{
428818cce61SJung-Sang Ahn    uint8_t marker[BLK_MARKER_SIZE];
4293c6e2608SJung-Sang Ahn    filemgr_magic_t magic = ver_get_latest_magic();
43080e69aceSJung-Sang Ahn    filemgr_header_len_t len;
431abe1d5a9SSundar Sridharan    uint8_t *buf;
4324a0379f5SJung-Sang Ahn    uint32_t crc, crc_file;
43376e16138SJim Walker    bool check_crc32_open_rule = false;
434cb0f0747SSundar Sridharan    fdb_status status = FDB_RESULT_SUCCESS;
435f0b1bf77SJung-Sang Ahn    bid_t hdr_bid, hdr_bid_local;
436f0b1bf77SJung-Sang Ahn    size_t min_filesize = 0;
43756236603Ssduvuru    size_t bad_header_count = 0;
438d25f8d6dSJung-Sang Ahn
439d25f8d6dSJung-Sang Ahn    // get temp buffer
4408e1b9ec2SJung-Sang Ahn    buf = (uint8_t *) _filemgr_get_temp_buf();
4419e14d68eSJung-Sang Ahn
44276e16138SJim Walker    // If a header is found crc_mode can change to reflect the file
443f0b1bf77SJung-Sang Ahn    if (file->crc_mode == CRC32) {
44476e16138SJim Walker        check_crc32_open_rule = true;
44576e16138SJim Walker    }
44676e16138SJim Walker
447f0b1bf77SJung-Sang Ahn    hdr_bid = atomic_get_uint64_t(&file->pos) / file->blocksize - 1;
448f0b1bf77SJung-Sang Ahn    hdr_bid_local = hdr_bid;
449f0b1bf77SJung-Sang Ahn
450f0b1bf77SJung-Sang Ahn    if (file->sb) {
451f0b1bf77SJung-Sang Ahn        // superblock exists .. file size does not start from zero.
452f0b1bf77SJung-Sang Ahn        min_filesize = file->sb->config->num_sb * file->blocksize;
453b5540fc3SJung-Sang Ahn        bid_t sb_last_hdr_bid = atomic_get_uint64_t(&file->sb->last_hdr_bid);
454b5540fc3SJung-Sang Ahn        if (sb_last_hdr_bid != BLK_NOT_FOUND) {
455b5540fc3SJung-Sang Ahn            hdr_bid = hdr_bid_local = sb_last_hdr_bid;
456f0b1bf77SJung-Sang Ahn        }
457c50e978bSJung-Sang Ahn        // if header info does not exist in superblock,
458c50e978bSJung-Sang Ahn        // get DB header at the end of the file.
459f0b1bf77SJung-Sang Ahn    }
460f0b1bf77SJung-Sang Ahn
461c50e978bSJung-Sang Ahn    if (atomic_get_uint64_t(&file->pos) > min_filesize) {
462abe1d5a9SSundar Sridharan        // Crash Recovery Test 1: unaligned last block write
463ddbd458dSChiyoung Seo        uint64_t remain = atomic_get_uint64_t(&file->pos) % file->blocksize;
464abe1d5a9SSundar Sridharan        if (remain) {
4650745bc38SJung-Sang Ahn            atomic_sub_uint64_t(&file->pos, remain);
466ddbd458dSChiyoung Seo            atomic_store_uint64_t(&file->last_commit, atomic_get_uint64_t(&file->pos));
467c279f70eSChiyoung Seo            const char *msg = "Crash Detected: %" _F64 " non-block aligned bytes discarded "
46818054ff6SChiyoung Seo                "from a database file '%s'\n";
46918054ff6SChiyoung Seo            DBG(msg, remain, file->filename);
47018054ff6SChiyoung Seo            fdb_log(log_callback, FDB_RESULT_READ_FAIL /* Need to add a better error code*/,
47118054ff6SChiyoung Seo                    msg, remain, file->filename);
472abe1d5a9SSundar Sridharan        }
473d9045a26SChiyoung Seo
4746ad4926cSChiyoung Seo        size_t block_counter = 0;
475abe1d5a9SSundar Sridharan        do {
47610265520Sabhinavdangeti            if (hdr_bid_local * file->blocksize >= file->pos) {
47710265520Sabhinavdangeti                // Handling EOF scenario
47810265520Sabhinavdangeti                status = FDB_RESULT_NO_DB_HEADERS;
47910265520Sabhinavdangeti                const char *msg = "Unable to read block from file '%s' as EOF "
48010265520Sabhinavdangeti                                  "reached\n";
48110265520Sabhinavdangeti                fdb_log(log_callback, status, msg, file->filename);
48210265520Sabhinavdangeti                break;
48310265520Sabhinavdangeti            }
484f0b1bf77SJung-Sang Ahn            ssize_t rv = filemgr_read_block(file, buf, hdr_bid_local);
4851de419c8SSundar Sridharan            if (rv != (ssize_t)file->blocksize) {
486671d9fbaSChiyoung Seo                status = (fdb_status) rv;
48718054ff6SChiyoung Seo                const char *msg = "Unable to read a database file '%s' with "
48810265520Sabhinavdangeti                                  "blocksize %u\n";
48918054ff6SChiyoung Seo                DBG(msg, file->filename, file->blocksize);
49018054ff6SChiyoung Seo                fdb_log(log_callback, status, msg, file->filename, file->blocksize);
491cb0f0747SSundar Sridharan                break;
492cb0f0747SSundar Sridharan            }
4936ad4926cSChiyoung Seo            ++block_counter;
494abe1d5a9SSundar Sridharan            memcpy(marker, buf + file->blocksize - BLK_MARKER_SIZE,
495abe1d5a9SSundar Sridharan                   BLK_MARKER_SIZE);
496abe1d5a9SSundar Sridharan
497abe1d5a9SSundar Sridharan            if (marker[0] == BLK_MARKER_DBHEADER) {
498abe1d5a9SSundar Sridharan                // possible need for byte conversions here
4994a0379f5SJung-Sang Ahn                memcpy(&magic,
5004a0379f5SJung-Sang Ahn                       buf + file->blocksize - BLK_MARKER_SIZE - sizeof(magic),
5014a0379f5SJung-Sang Ahn                       sizeof(magic));
5026d79432aSJung-Sang Ahn                magic = _endian_decode(magic);
5036d79432aSJung-Sang Ahn
5043c6e2608SJung-Sang Ahn                if (ver_is_valid_magic(magic)) {
505e4935a20SJung-Sang Ahn
5064a0379f5SJung-Sang Ahn                    memcpy(&len,
5074a0379f5SJung-Sang Ahn                           buf + file->blocksize - BLK_MARKER_SIZE -
5084a0379f5SJung-Sang Ahn                           sizeof(magic) - sizeof(len),
5094a0379f5SJung-Sang Ahn                           sizeof(len));
5106d79432aSJung-Sang Ahn                    len = _endian_decode(len);
5116d79432aSJung-Sang Ahn
5124a0379f5SJung-Sang Ahn                    memcpy(&crc_file, buf + len - sizeof(crc), sizeof(crc));
5134a0379f5SJung-Sang Ahn                    crc_file = _endian_decode(crc_file);
51476e16138SJim Walker
51576e16138SJim Walker                    // crc check and detect the crc_mode
51676e16138SJim Walker                    if (detect_and_check_crc(reinterpret_cast<const uint8_t*>(buf),
51776e16138SJim Walker                                             len - sizeof(crc),
51876e16138SJim Walker                                             crc_file,
51976e16138SJim Walker                                             &file->crc_mode)) {
52076e16138SJim Walker                        // crc mode is detected and known.
52176e16138SJim Walker                        // check the rules of opening legacy CRC
52276e16138SJim Walker                        if (check_crc32_open_rule && file->crc_mode != CRC32) {
52376e16138SJim Walker                            const char *msg = "Open of CRC32C file"
52476e16138SJim Walker                                              " with forced CRC32\n";
52576e16138SJim Walker                            status = FDB_RESULT_INVALID_ARGS;
52676e16138SJim Walker                            DBG(msg);
52776e16138SJim Walker                            fdb_log(log_callback, status, msg);
52876e16138SJim Walker                            break;
52976e16138SJim Walker                        } else {
53076e16138SJim Walker                            status = FDB_RESULT_SUCCESS;
53176e16138SJim Walker
532a3086834SJung-Sang Ahn                            file->header.data = (void *)malloc(file->blocksize);
53376e16138SJim Walker
53476e16138SJim Walker                            memcpy(file->header.data, buf, len);
53576e16138SJim Walker                            memcpy(&file->header.revnum,