1bc68bb02SChiyoung Seo/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
27c0433f5SJung-Sang Ahn/*
3bc68bb02SChiyoung Seo *     Copyright 2010 Couchbase, Inc
4bc68bb02SChiyoung Seo *
5bc68bb02SChiyoung Seo *   Licensed under the Apache License, Version 2.0 (the "License");
6bc68bb02SChiyoung Seo *   you may not use this file except in compliance with the License.
7bc68bb02SChiyoung Seo *   You may obtain a copy of the License at
8bc68bb02SChiyoung Seo *
9bc68bb02SChiyoung Seo *       http://www.apache.org/licenses/LICENSE-2.0
10bc68bb02SChiyoung Seo *
11bc68bb02SChiyoung Seo *   Unless required by applicable law or agreed to in writing, software
12bc68bb02SChiyoung Seo *   distributed under the License is distributed on an "AS IS" BASIS,
13bc68bb02SChiyoung Seo *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14bc68bb02SChiyoung Seo *   See the License for the specific language governing permissions and
15bc68bb02SChiyoung Seo *   limitations under the License.
167c0433f5SJung-Sang Ahn */
177c0433f5SJung-Sang Ahn
187c0433f5SJung-Sang Ahn#include <stdlib.h>
197c0433f5SJung-Sang Ahn#include <string.h>
207c0433f5SJung-Sang Ahn#include <stdint.h>
217c0433f5SJung-Sang Ahn
22b31f83a4SChiyoung Seo#if !defined(WIN32) && !defined(_WIN32)
23b31f83a4SChiyoung Seo#include <sys/time.h>
24b31f83a4SChiyoung Seo#endif
25b31f83a4SChiyoung Seo
267c0433f5SJung-Sang Ahn#include "hash_functions.h"
277c0433f5SJung-Sang Ahn#include "common.h"
28b6be7a1dSSundar Sridharan#include "libforestdb/fdb_errors.h"
297c0433f5SJung-Sang Ahn#include "hash.h"
307c0433f5SJung-Sang Ahn#include "list.h"
317c0433f5SJung-Sang Ahn#include "blockcache.h"
3255f0984dSJung-Sang Ahn#include "avltree.h"
3369467020SChiyoung Seo#include "atomic.h"
341ccfc6d3SSundar Sridharan#include "fdb_internal.h"
357c0433f5SJung-Sang Ahn
363d812dfcSJung-Sang Ahn#include "memleak.h"
373d812dfcSJung-Sang Ahn
387c0433f5SJung-Sang Ahn#ifdef __DEBUG
397c0433f5SJung-Sang Ahn#ifndef __DEBUG_BCACHE
402889254eSJung-Sang Ahn    #undef DBG
412889254eSJung-Sang Ahn    #undef DBGCMD
42eea9c5e9SJung-Sang Ahn    #undef DBGSW
43ceca3b9fSJung-Sang Ahn    #define DBG(...)
44ceca3b9fSJung-Sang Ahn    #define DBGCMD(...)
45ceca3b9fSJung-Sang Ahn    #define DBGSW(n, ...)
467c0433f5SJung-Sang Ahn#endif
477c0433f5SJung-Sang Ahn#endif
487c0433f5SJung-Sang Ahn
49d25f8d6dSJung-Sang Ahn// global lock
503d812dfcSJung-Sang Ahnstatic spin_t bcache_lock;
513d812dfcSJung-Sang Ahn
5258e43058SJung-Sang Ahn// hash table for filename
537c0433f5SJung-Sang Ahnstatic struct hash fnamedic;
5458e43058SJung-Sang Ahn
55d25f8d6dSJung-Sang Ahn// free block list
567e4455b1Sabhinavdangetistatic atomic_uint64_t freelist_count(0);
57d25f8d6dSJung-Sang Ahnstatic struct list freelist;
58d25f8d6dSJung-Sang Ahnstatic spin_t freelist_lock;
59d25f8d6dSJung-Sang Ahn
60b31f83a4SChiyoung Seostatic const size_t MAX_VICTIM_SELECTIONS = 5;
61b31f83a4SChiyoung Seo
62d25f8d6dSJung-Sang Ahn// file structure list
63b31f83a4SChiyoung Seostatic size_t num_files;
64b31f83a4SChiyoung Seostatic size_t file_array_capacity;
65b31f83a4SChiyoung Seostatic fnamedic_item ** file_list;
66b31f83a4SChiyoung Seostatic struct list file_zombies;
675a6f3f27Sabhinavdangetistatic fdb_rw_lock filelist_lock; // Reader-Writer lock for the file list.
68d25f8d6dSJung-Sang Ahn
69d25f8d6dSJung-Sang Ahn//static struct list cleanlist, dirtylist;
70d25f8d6dSJung-Sang Ahn//static uint64_t nfree, nclean, ndirty;
71d25f8d6dSJung-Sang Ahnstatic uint64_t bcache_nblock;
72d25f8d6dSJung-Sang Ahn
737c0433f5SJung-Sang Ahnstatic int bcache_blocksize;
7458e43058SJung-Sang Ahnstatic size_t bcache_flush_unit;
757c0433f5SJung-Sang Ahn
7669467020SChiyoung Seostruct bcache_shard {
7769467020SChiyoung Seo    spin_t lock;
7869467020SChiyoung Seo    // list for clean blocks
7969467020SChiyoung Seo    struct list cleanlist;
8069467020SChiyoung Seo    // tree for normal dirty blocks
8169467020SChiyoung Seo    struct avl_tree tree;
8269467020SChiyoung Seo    // tree for index nodes
8369467020SChiyoung Seo    struct avl_tree tree_idx;
8469467020SChiyoung Seo    // hash table for block lookup
8569467020SChiyoung Seo    struct hash hashtable;
8669467020SChiyoung Seo    // list elem for shard LRU
8769467020SChiyoung Seo    struct list_elem le;
8869467020SChiyoung Seo};
8969467020SChiyoung Seo
907c0433f5SJung-Sang Ahnstruct fnamedic_item {
912889254eSJung-Sang Ahn    char *filename;
922889254eSJung-Sang Ahn    uint16_t filename_len;
93818cce61SJung-Sang Ahn    uint32_t hash;
942889254eSJung-Sang Ahn
9565a4da43SJung-Sang Ahn    // current opened filemgr instance
9665a4da43SJung-Sang Ahn    // (can be changed on-the-fly when file is closed and re-opened)
972889254eSJung-Sang Ahn    struct filemgr *curfile;
98d25f8d6dSJung-Sang Ahn
9969467020SChiyoung Seo    // Shards of the block cache for a file.
10069467020SChiyoung Seo    struct bcache_shard *shards;
101d25f8d6dSJung-Sang Ahn
102b31f83a4SChiyoung Seo    // list elem for FILE_ZOMBIE
10365a4da43SJung-Sang Ahn    struct list_elem le;
104d25f8d6dSJung-Sang Ahn    // hash elem for FNAMEDIC
105d25f8d6dSJung-Sang Ahn    struct hash_elem hash_elem;
106d25f8d6dSJung-Sang Ahn
1077e3e5567SSundar Sridharan    atomic_uint32_t ref_count;
10869467020SChiyoung Seo    atomic_uint64_t nvictim;
10969467020SChiyoung Seo    atomic_uint64_t nitems;
11067d223e3SSundar Sridharan    atomic_uint64_t nimmutable;
111b31f83a4SChiyoung Seo    atomic_uint64_t access_timestamp;
11269467020SChiyoung Seo    size_t num_shards;
1137c0433f5SJung-Sang Ahn};
1147c0433f5SJung-Sang Ahn
1159ceccb2cSJung-Sang Ahn#define BCACHE_DIRTY (0x1)
11667d223e3SSundar Sridharan#define BCACHE_IMMUTABLE (0x2)
1179ceccb2cSJung-Sang Ahn#define BCACHE_FREE (0x4)
118d25f8d6dSJung-Sang Ahn
119b91ccc2fSSundar Sridharanstatic void *buffercache_addr = NULL;
120b91ccc2fSSundar Sridharan
1217c0433f5SJung-Sang Ahnstruct bcache_item {
1222889254eSJung-Sang Ahn    // BID
1232889254eSJung-Sang Ahn    bid_t bid;
1242889254eSJung-Sang Ahn    // contents address
1252889254eSJung-Sang Ahn    void *addr;
1262889254eSJung-Sang Ahn    // hash elem for lookup hash table
1272889254eSJung-Sang Ahn    struct hash_elem hash_elem;
1282889254eSJung-Sang Ahn    // list elem for {free, clean, dirty} lists
12965a4da43SJung-Sang Ahn    struct list_elem list_elem;
130d25f8d6dSJung-Sang Ahn    // flag
13190c28779Sabhinavdangeti    atomic_uint8_t flag;
13200a9594fSJung-Sang Ahn    // score
13300a9594fSJung-Sang Ahn    uint8_t score;
1347c0433f5SJung-Sang Ahn};
1357c0433f5SJung-Sang Ahn
13658e43058SJung-Sang Ahnstruct dirty_item {
1372889254eSJung-Sang Ahn    struct bcache_item *item;
13855f0984dSJung-Sang Ahn    struct avl_node avl;
13958e43058SJung-Sang Ahn};
14058e43058SJung-Sang Ahn
14155f0984dSJung-Sang AhnINLINE int _dirty_cmp(struct avl_node *a, struct avl_node *b, void *aux)
14258e43058SJung-Sang Ahn{
1432889254eSJung-Sang Ahn    struct dirty_item *aa, *bb;
14455f0984dSJung-Sang Ahn    aa = _get_entry(a, struct dirty_item, avl);
14555f0984dSJung-Sang Ahn    bb = _get_entry(b, struct dirty_item, avl);
1469ceccb2cSJung-Sang Ahn
1472889254eSJung-Sang Ahn    #ifdef __BIT_CMP
1482889254eSJung-Sang Ahn        return _CMP_U64(aa->item->bid , bb->item->bid);
1499ceccb2cSJung-Sang Ahn
1502889254eSJung-Sang Ahn    #else
1512889254eSJung-Sang Ahn        if (aa->item->bid < bb->item->bid) return -1;
1522889254eSJung-Sang Ahn        else if (aa->item->bid > bb->item->bid) return 1;
1532889254eSJung-Sang Ahn        else return 0;
1549ceccb2cSJung-Sang Ahn
1552889254eSJung-Sang Ahn    #endif
15658e43058SJung-Sang Ahn}
15758e43058SJung-Sang Ahn
1587c0433f5SJung-Sang AhnINLINE uint32_t _fname_hash(struct hash *hash, struct hash_elem *e)
1597c0433f5SJung-Sang Ahn{
16065a4da43SJung-Sang Ahn    struct fnamedic_item *item;
16165a4da43SJung-Sang Ahn    item = _get_entry(e, struct fnamedic_item, hash_elem);
16269467020SChiyoung Seo    return item->hash % ((unsigned)(BCACHE_NDICBUCKET));
1637c0433f5SJung-Sang Ahn}
1647c0433f5SJung-Sang Ahn
1659ceccb2cSJung-Sang AhnINLINE int _fname_cmp(struct hash_elem *a, struct hash_elem *b)
1667c0433f5SJung-Sang Ahn{
1672889254eSJung-Sang Ahn    size_t len;
1682889254eSJung-Sang Ahn    struct fnamedic_item *aa, *bb;
1692889254eSJung-Sang Ahn    aa = _get_entry(a, struct fnamedic_item, hash_elem);
1702889254eSJung-Sang Ahn    bb = _get_entry(b, struct fnamedic_item, hash_elem);
1712889254eSJung-Sang Ahn
172818cce61SJung-Sang Ahn    if (aa->filename_len == bb->filename_len) {
173818cce61SJung-Sang Ahn        return memcmp(aa->filename, bb->filename, aa->filename_len);
174818cce61SJung-Sang Ahn    }else {
1752889254eSJung-Sang Ahn        len = MIN(aa->filename_len , bb->filename_len);
1762889254eSJung-Sang Ahn        int cmp = memcmp(aa->filename, bb->filename, len);
1772889254eSJung-Sang Ahn        if (cmp != 0) return cmp;
1782889254eSJung-Sang Ahn        else {
179818cce61SJung-Sang Ahn            return (int)((int)aa->filename_len - (int)bb->filename_len);
1802889254eSJung-Sang Ahn        }
1812889254eSJung-Sang Ahn    }
1827c0433f5SJung-Sang Ahn}
1837c0433f5SJung-Sang Ahn
1847c0433f5SJung-Sang AhnINLINE uint32_t _bcache_hash(struct hash *hash, struct hash_elem *e)
1857c0433f5SJung-Sang Ahn{
1862889254eSJung-Sang Ahn    struct bcache_item *item = _get_entry(e, struct bcache_item, hash_elem);
18769467020SChiyoung Seo    return (item->bid) % ((uint32_t)BCACHE_NBUCKET);
1887c0433f5SJung-Sang Ahn}
1897c0433f5SJung-Sang Ahn
1907c0433f5SJung-Sang AhnINLINE int _bcache_cmp(struct hash_elem *a, struct hash_elem *b)
1917c0433f5SJung-Sang Ahn{
1922889254eSJung-Sang Ahn    struct bcache_item *aa, *bb;
1932889254eSJung-Sang Ahn    aa = _get_entry(a, struct bcache_item, hash_elem);
1942889254eSJung-Sang Ahn    bb = _get_entry(b, struct bcache_item, hash_elem);
1952889254eSJung-Sang Ahn
1962889254eSJung-Sang Ahn    #ifdef __BIT_CMP
197d25f8d6dSJung-Sang Ahn
198d25f8d6dSJung-Sang Ahn        return _CMP_U64(aa->bid, bb->bid);
1992889254eSJung-Sang Ahn
2002889254eSJung-Sang Ahn    #else
201634c63f1SJung-Sang Ahn
202d25f8d6dSJung-Sang Ahn        if (aa->bid == bb->bid) return 0;
203d25f8d6dSJung-Sang Ahn        else if (aa->bid < bb->bid) return -1;
204d25f8d6dSJung-Sang Ahn        else return 1;
2059ceccb2cSJung-Sang Ahn
2062889254eSJung-Sang Ahn    #endif
2077c0433f5SJung-Sang Ahn}
2087c0433f5SJung-Sang Ahn
20969467020SChiyoung Seo#define _list_empty(list) (list.head == NULL)
21069467020SChiyoung Seo#define _tree_empty(tree) (tree.root == NULL)
21169467020SChiyoung Seo
21269467020SChiyoung Seostatic void _acquire_all_shard_locks(struct fnamedic_item *fname) {
21369467020SChiyoung Seo    size_t i = 0;
21469467020SChiyoung Seo    for (; i < fname->num_shards; ++i) {
21569467020SChiyoung Seo        spin_lock(&fname->shards[i].lock);
21669467020SChiyoung Seo    }
21769467020SChiyoung Seo}
21869467020SChiyoung Seo
21969467020SChiyoung Seostatic void _release_all_shard_locks(struct fnamedic_item *fname) {
22069467020SChiyoung Seo    size_t i = 0;
22169467020SChiyoung Seo    for (; i < fname->num_shards; ++i) {
22269467020SChiyoung Seo        spin_unlock(&fname->shards[i].lock);
22369467020SChiyoung Seo    }
22469467020SChiyoung Seo}
22555f0984dSJung-Sang Ahn
22669467020SChiyoung Seostatic bool _file_empty(struct fnamedic_item *fname) {
22769467020SChiyoung Seo    bool empty = true;
22869467020SChiyoung Seo    size_t i = 0;
22969467020SChiyoung Seo    _acquire_all_shard_locks(fname);
23069467020SChiyoung Seo    for (; i < fname->num_shards; ++i) {
23169467020SChiyoung Seo        if (!(_list_empty(fname->shards[i].cleanlist) &&
23269467020SChiyoung Seo              _tree_empty(fname->shards[i].tree) &&
23369467020SChiyoung Seo              _tree_empty(fname->shards[i].tree_idx))) {
23469467020SChiyoung Seo            empty = false;
23569467020SChiyoung Seo            break;
23669467020SChiyoung Seo        }
23769467020SChiyoung Seo    }
23869467020SChiyoung Seo    _release_all_shard_locks(fname);
23969467020SChiyoung Seo    return empty;
24069467020SChiyoung Seo}
24169467020SChiyoung Seo
24269467020SChiyoung SeoINLINE bool _shard_empty(struct bcache_shard *bshard) {
24369467020SChiyoung Seo    // Caller should grab the shard lock before calling this function.
24469467020SChiyoung Seo    return _list_empty(bshard->cleanlist) &&
24569467020SChiyoung Seo           _tree_empty(bshard->tree) &&
24669467020SChiyoung Seo           _tree_empty(bshard->tree_idx);
24769467020SChiyoung Seo}
24865a4da43SJung-Sang Ahn
2496d480545SJung-Sang Ahnstruct fnamedic_item *_bcache_get_victim()
250d25f8d6dSJung-Sang Ahn{
2517e3e5567SSundar Sridharan    struct fnamedic_item *ret = NULL;
252b31f83a4SChiyoung Seo    uint64_t min_timestamp = (uint64_t) -1;
253b31f83a4SChiyoung Seo    uint64_t victim_timestamp;
254b31f83a4SChiyoung Seo    int victim_idx;
255b31f83a4SChiyoung Seo    size_t num_attempts;
2562889254eSJung-Sang Ahn
2575a6f3f27Sabhinavdangeti    if (reader_lock(&filelist_lock) == 0) {
2585a6f3f27Sabhinavdangeti        // Pick the victim that has the smallest access timestamp
2595a6f3f27Sabhinavdangeti        // among files randomly selected.
2605a6f3f27Sabhinavdangeti        num_attempts = num_files / 10 + 1;
2615a6f3f27Sabhinavdangeti        if (num_attempts > MAX_VICTIM_SELECTIONS) {
2625a6f3f27Sabhinavdangeti            num_attempts = MAX_VICTIM_SELECTIONS;
2635a6f3f27Sabhinavdangeti        } else {
2645a6f3f27Sabhinavdangeti            if(num_attempts == 1 && num_files > 1) {
2655a6f3f27Sabhinavdangeti                ++num_attempts;
2665a6f3f27Sabhinavdangeti            }
2675a6f3f27Sabhinavdangeti        }
2685a6f3f27Sabhinavdangeti        for (size_t i = 0; i < num_attempts && num_files; ++i) {
2695a6f3f27Sabhinavdangeti            victim_idx = rand() % num_files;
2705a6f3f27Sabhinavdangeti            victim_timestamp =
2715a6f3f27Sabhinavdangeti                atomic_get_uint64_t(&file_list[victim_idx]->access_timestamp,
2725a6f3f27Sabhinavdangeti                                    std::memory_order_relaxed);
2735a6f3f27Sabhinavdangeti            if (victim_timestamp < min_timestamp &&
2745a6f3f27Sabhinavdangeti                    atomic_get_uint64_t(&file_list[victim_idx]->nitems)) {
2755a6f3f27Sabhinavdangeti                min_timestamp = victim_timestamp;
2765a6f3f27Sabhinavdangeti                ret = file_list[victim_idx];
2775a6f3f27Sabhinavdangeti            }
2785a6f3f27Sabhinavdangeti        }
2805a6f3f27Sabhinavdangeti        if (ret) {
2815a6f3f27Sabhinavdangeti            atomic_incr_uint32_t(&ret->ref_count);
2825a6f3f27Sabhinavdangeti        }
2835a6f3f27Sabhinavdangeti        reader_unlock(&filelist_lock);
2845a6f3f27Sabhinavdangeti    } else {
2855a6f3f27Sabhinavdangeti        fprintf(stderr, "Error in _bcache_get_victim(): "
2865a6f3f27Sabhinavdangeti                        "Failed to acquire ReaderLock on filelist_lock!\n");
287d25f8d6dSJung-Sang Ahn    }
2887e3e5567SSundar Sridharan
2897e3e5567SSundar Sridharan    return ret;
290d25f8d6dSJung-Sang Ahn}
2912889254eSJung-Sang Ahn
292064f53e0SChiyoung Seostatic struct bcache_item *_bcache_alloc_freeblock()
293d25f8d6dSJung-Sang Ahn{
2949ceccb2cSJung-Sang Ahn    struct list_elem *e = NULL;
295d25f8d6dSJung-Sang Ahn    struct bcache_item *item;
2962889254eSJung-Sang Ahn
297d25f8d6dSJung-Sang Ahn    spin_lock(&freelist_lock);
298d25f8d6dSJung-Sang Ahn    e = list_pop_front(&freelist);
2994404c491SSundar Sridharan    if (e) {
3004404c491SSundar Sridharan        freelist_count--;
3014404c491SSundar Sridharan    }
302d25f8d6dSJung-Sang Ahn    spin_unlock(&freelist_lock);
3039ceccb2cSJung-Sang Ahn
304d25f8d6dSJung-Sang Ahn    if (e) {
3059ceccb2cSJung-Sang Ahn        item = _get_entry(e, struct bcache_item, list_elem);
3069ceccb2cSJung-Sang Ahn        return item;
3072889254eSJung-Sang Ahn    }
308d25f8d6dSJung-Sang Ahn    return NULL;
309d25f8d6dSJung-Sang Ahn}
3102889254eSJung-Sang Ahn
311064f53e0SChiyoung Seostatic void _bcache_release_freeblock(struct bcache_item *item)
312d25f8d6dSJung-Sang Ahn{
313d25f8d6dSJung-Sang Ahn    spin_lock(&freelist_lock);
3149ceccb2cSJung-Sang Ahn    item->flag = BCACHE_FREE;
31500a9594fSJung-Sang Ahn    item->score = 0;
316d25f8d6dSJung-Sang Ahn    list_push_front(&freelist, &item->list_elem);
317eccdfcd9SJung-Sang Ahn    freelist_count++;
318d25f8d6dSJung-Sang Ahn    spin_unlock(&freelist_lock);
3197c0433f5SJung-Sang Ahn}
3207c0433f5SJung-Sang Ahn
3217e3e5567SSundar Sridharanstatic struct fnamedic_item *_next_dead_fname_zombie(void) {
3227e3e5567SSundar Sridharan    struct list_elem *e;
32352ade7a4SSundar Sridharan    struct fnamedic_item *fname_item;
32452ade7a4SSundar Sridharan    bool found = false;
3255a6f3f27Sabhinavdangeti    if (writer_lock(&filelist_lock) == 0) {
3265a6f3f27Sabhinavdangeti        e = list_begin(&file_zombies);
3275a6f3f27Sabhinavdangeti        while (e) {
3285a6f3f27Sabhinavdangeti            fname_item = _get_entry(e, struct fnamedic_item, le);
3295a6f3f27Sabhinavdangeti            if (atomic_get_uint32_t(&fname_item->ref_count) == 0) {
3305a6f3f27Sabhinavdangeti                list_remove(&file_zombies, e);
3315a6f3f27Sabhinavdangeti                found = true;
3325a6f3f27Sabhinavdangeti                break;
3335a6f3f27Sabhinavdangeti            } else {
3345a6f3f27Sabhinavdangeti                e = list_next(e);
3355a6f3f27Sabhinavdangeti            }
3367e3e5567SSundar Sridharan        }
3375a6f3f27Sabhinavdangeti        writer_unlock(&filelist_lock);
3385a6f3f27Sabhinavdangeti    } else {
3395a6f3f27Sabhinavdangeti        fprintf(stderr, "Error in _next_dead_fname_zombie(): "
3405a6f3f27Sabhinavdangeti                        "Failed to acquire WriterLock on filelist_lock!\n");
3417e3e5567SSundar Sridharan    }
34252ade7a4SSundar Sridharan    return found ? fname_item : NULL;
3437e3e5567SSundar Sridharan}
3447e3e5567SSundar Sridharan
3457e3e5567SSundar Sridharanstatic void _fname_free(struct fnamedic_item *fname);
3467e3e5567SSundar Sridharan
3477e3e5567SSundar Sridharanstatic void _garbage_collect_zombie_fnames(void) {
3487e3e5567SSundar Sridharan    struct fnamedic_item *fname_item = _next_dead_fname_zombie();
3497e3e5567SSundar Sridharan    while (fname_item) {
3507e3e5567SSundar Sridharan        _fname_free(fname_item);
3517e3e5567SSundar Sridharan        fname_item = _next_dead_fname_zombie();
3527e3e5567SSundar Sridharan    }
3537e3e5567SSundar Sridharan}
3547e3e5567SSundar Sridharan
35569467020SChiyoung Seostruct dirty_bid {
35669467020SChiyoung Seo    bid_t bid;
35769467020SChiyoung Seo    struct avl_node avl;
35869467020SChiyoung Seo};
35969467020SChiyoung Seo
36069467020SChiyoung SeoINLINE int _dirty_bid_cmp(struct avl_node *a, struct avl_node *b, void *aux)
36169467020SChiyoung Seo{
36269467020SChiyoung Seo    struct dirty_bid *aa, *bb;
36369467020SChiyoung Seo    aa = _get_entry(a, struct dirty_bid, avl);
36469467020SChiyoung Seo    bb = _get_entry(b, struct dirty_bid, avl);
36569467020SChiyoung Seo
36669467020SChiyoung Seo    #ifdef __BIT_CMP
36769467020SChiyoung Seo        return _CMP_U64(aa->bid , bb->bid);
36869467020SChiyoung Seo
36969467020SChiyoung Seo    #else
37069467020SChiyoung Seo        if (aa->bid < bb->bid) return -1;
37169467020SChiyoung Seo        else if (aa->bid > bb->bid) return 1;
37269467020SChiyoung Seo        else return 0;
37369467020SChiyoung Seo
37469467020SChiyoung Seo    #endif
37569467020SChiyoung Seo}
37669467020SChiyoung Seo
37767d223e3SSundar Sridharanstatic void _free_dirty_bids(struct dirty_bid **dirty_bids, size_t n) {
37869467020SChiyoung Seo    size_t i = 0;
37969467020SChiyoung Seo    for (; i < n; ++i) {
38069467020SChiyoung Seo        if (dirty_bids[i]) {
38169467020SChiyoung Seo            mempool_free(dirty_bids[i]);
38269467020SChiyoung Seo        }
38369467020SChiyoung Seo    }
38469467020SChiyoung Seo}
38569467020SChiyoung Seo
38669467020SChiyoung Seo// Flush some consecutive or all dirty blocks for a given file and
38769467020SChiyoung Seo// move them to the clean list.
38869467020SChiyoung Seostatic fdb_status _flush_dirty_blocks(struct fnamedic_item *fname_item,
38967d223e3SSundar Sridharan                                      bool sync, bool flush_all,
39067d223e3SSundar Sridharan                                      bool immutables_only)
39158e43058SJung-Sang Ahn{
392eccdfcd9SJung-Sang Ahn    void *buf = NULL;
39369467020SChiyoung Seo    struct list_elem *prevhead = NULL;
39469467020SChiyoung Seo    struct avl_tree *cur_tree = NULL;
39569467020SChiyoung Seo    struct avl_node *node = NULL;
39669467020SChiyoung Seo    struct dirty_bid *dbid = NULL;
39769467020SChiyoung Seo    uint64_t count = 0;
39869467020SChiyoung Seo    ssize_t ret = 0;
39969467020SChiyoung Seo    bid_t start_bid = 0, prev_bid = 0;
4008e93bc7bSJung-Sang Ahn    void *ptr = NULL;
4018e93bc7bSJung-Sang Ahn    uint8_t marker = 0x0;
402cb0f0747SSundar Sridharan    fdb_status status = FDB_RESULT_SUCCESS;
40365a4da43SJung-Sang Ahn    bool o_direct = false;
40469467020SChiyoung Seo    bool data_block_completed = false;
40569467020SChiyoung Seo    struct avl_tree dirty_blocks; // Cross-shard dirty block list for sequential writes.
40665a4da43SJung-Sang Ahn
40765a4da43SJung-Sang Ahn    if (fname_item->curfile->config->flag & _ARCH_O_DIRECT) {
40865a4da43SJung-Sang Ahn        o_direct = true;
40965a4da43SJung-Sang Ahn    }
4102889254eSJung-Sang Ahn
41169467020SChiyoung Seo    // scan and write back dirty blocks sequentially for O_DIRECT option.
41265a4da43SJung-Sang Ahn    if (sync && o_direct) {
413f693a021SSundar Sridharan        malloc_align(buf, FDB_SECTOR_SIZE, bcache_flush_unit);
41469467020SChiyoung Seo        _acquire_all_shard_locks(fname_item);
4152889254eSJung-Sang Ahn    }
4163d812dfcSJung-Sang Ahn
4172889254eSJung-Sang Ahn    prev_bid = start_bid = BLK_NOT_FOUND;
4182889254eSJung-Sang Ahn    count = 0;
4193d812dfcSJung-Sang Ahn
42069467020SChiyoung Seo    avl_init(&dirty_blocks, NULL);
42169467020SChiyoung Seo
42269467020SChiyoung Seo    // Try to flush the dirty data blocks first and then index blocks.
42369467020SChiyoung Seo    size_t i = 0;
42469467020SChiyoung Seo    bool consecutive_blocks = true;
42569467020SChiyoung Seo    struct dirty_bid **dirty_bids = alca(struct dirty_bid *, fname_item->num_shards);
42669467020SChiyoung Seo    memset(dirty_bids, 0x0, sizeof(dirty_bid *) * fname_item->num_shards);
42769467020SChiyoung Seo    while (1) {
42869467020SChiyoung Seo        if (!(node = avl_first(&dirty_blocks))) {
42969467020SChiyoung Seo            for (i = 0; i < fname_item->num_shards; ++i) {
43069467020SChiyoung Seo                if (!(sync && o_direct)) {
43169467020SChiyoung Seo                    spin_lock(&fname_item->shards[i].lock);
43269467020SChiyoung Seo                }
43369467020SChiyoung Seo                if (!data_block_completed) {
43469467020SChiyoung Seo                    node = avl_first(&fname_item->shards[i].tree);
43569467020SChiyoung Seo                } else {
43669467020SChiyoung Seo                    node = avl_first(&fname_item->shards[i].tree_idx);
43769467020SChiyoung Seo                }
43869467020SChiyoung Seo                if (node) {
43967d223e3SSundar Sridharan                    struct dirty_item *ditem = _get_entry(node,
44067d223e3SSundar Sridharan                                                struct dirty_item, avl);
44167d223e3SSundar Sridharan                    if (!immutables_only || // don't load mutable items
44267d223e3SSundar Sridharan                        ditem->item->flag & BCACHE_IMMUTABLE) {
44367d223e3SSundar Sridharan                        if (!dirty_bids[i]) {
44467d223e3SSundar Sridharan                            dirty_bids[i] = (struct dirty_bid *)
44567d223e3SSundar Sridharan                                mempool_alloc(sizeof(struct dirty_bid));
44667d223e3SSundar Sridharan                        }
44767d223e3SSundar Sridharan                        dirty_bids[i]->bid = ditem->item->bid;
44867d223e3SSundar Sridharan                        avl_insert(&dirty_blocks, &dirty_bids[i]->avl,
44967d223e3SSundar Sridharan                                   _dirty_bid_cmp);
45069467020SChiyoung Seo                    }
45169467020SChiyoung Seo                }
45269467020SChiyoung Seo                if (!(sync && o_direct)) {
45369467020SChiyoung Seo                    spin_unlock(&fname_item->shards[i].lock);
45469467020SChiyoung Seo                }
45569467020SChiyoung Seo            }
45669467020SChiyoung Seo            if (!(node = avl_first(&dirty_blocks))) {
45769467020SChiyoung Seo                if (!data_block_completed) {
45869467020SChiyoung Seo                    data_block_completed = true;
45969467020SChiyoung Seo                    if (count > 0 && !flush_all) {
46069467020SChiyoung Seo                        // Finished flushing some dirty data blocks.
46169467020SChiyoung Seo                        // Not move over to the dirty index block list because
46269467020SChiyoung Seo                        // flush_all is not requestd.
46369467020SChiyoung Seo                        break;
46469467020SChiyoung Seo                    }
46569467020SChiyoung Seo                    continue;
46669467020SChiyoung Seo                } else {
46769467020SChiyoung Seo                    break;
46869467020SChiyoung Seo                }
46969467020SChiyoung Seo            }
47069467020SChiyoung Seo        }
47165a4da43SJung-Sang Ahn
47269467020SChiyoung Seo        dbid = _get_entry(node, struct dirty_bid, avl);
473d25f8d6dSJung-Sang Ahn
47469467020SChiyoung Seo        size_t shard_num = dbid->bid % fname_item->num_shards;
47569467020SChiyoung Seo        if (!(sync && o_direct)) {
47669467020SChiyoung Seo            spin_lock(&fname_item->shards[shard_num].lock);
47769467020SChiyoung Seo        }
47869467020SChiyoung Seo        if (!data_block_completed) {
47969467020SChiyoung Seo            cur_tree = &fname_item->shards[shard_num].tree;
48069467020SChiyoung Seo        } else {
48169467020SChiyoung Seo            cur_tree = &fname_item->shards[shard_num].tree_idx;
48269467020SChiyoung Seo        }
4832889254eSJung-Sang Ahn
48469467020SChiyoung Seo        struct dirty_item *dirty_block = NULL;
48569467020SChiyoung Seo        bool item_exist = false;
48669467020SChiyoung Seo        node = avl_first(cur_tree);
48769467020SChiyoung Seo        if (node) {
48869467020SChiyoung Seo            dirty_block = _get_entry(node, struct dirty_item, avl);
48969467020SChiyoung Seo            if (dbid->bid == dirty_block->item->bid) {
49069467020SChiyoung Seo                item_exist = true;
49169467020SChiyoung Seo            }
49269467020SChiyoung Seo        }
49369467020SChiyoung Seo        // remove from the cross-shard dirty block list.
49469467020SChiyoung Seo        avl_remove(&dirty_blocks, &dbid->avl);
49569467020SChiyoung Seo        if (!item_exist) {
49669467020SChiyoung Seo            // The original first item in the shard dirty block list was removed.
49769467020SChiyoung Seo            // Grab the next one from the cross-shard dirty block list.
49869467020SChiyoung Seo            if (!(sync && o_direct)) {
49969467020SChiyoung Seo                spin_unlock(&fname_item->shards[shard_num].lock);
50069467020SChiyoung Seo            }
50167d223e3SSundar Sridharan            if (immutables_only &&
50267d223e3SSundar Sridharan                !atomic_get_uint64_t(&fname_item->nimmutable)) {
50367d223e3SSundar Sridharan                break;
50467d223e3SSundar Sridharan            }
50569467020SChiyoung Seo            continue;
50669467020SChiyoung Seo        }
50769467020SChiyoung Seo
50869467020SChiyoung Seo        consecutive_blocks = true;
50969467020SChiyoung Seo        // if BID of next dirty block is not consecutive .. stop
51069467020SChiyoung Seo        if (dirty_block->item->bid != prev_bid + 1 && prev_bid != BLK_NOT_FOUND &&
51169467020SChiyoung Seo            sync) {
51269467020SChiyoung Seo            if (flush_all) {
51369467020SChiyoung Seo                consecutive_blocks = false;
51469467020SChiyoung Seo            } else {
51569467020SChiyoung Seo                if (!(sync && o_direct)) {
51669467020SChiyoung Seo                    spin_unlock(&fname_item->shards[shard_num].lock);
51769467020SChiyoung Seo                }
51869467020SChiyoung Seo                break;
51969467020SChiyoung Seo            }
52069467020SChiyoung Seo        }
52169467020SChiyoung Seo        // set START_BID if this is the start block for a single batch write.
52269467020SChiyoung Seo        if (start_bid == BLK_NOT_FOUND) {
52369467020SChiyoung Seo            start_bid = dirty_block->item->bid;
52469467020SChiyoung Seo        }
525d25f8d6dSJung-Sang Ahn        // set PREV_BID and go to next block
52669467020SChiyoung Seo        prev_bid = dirty_block->item->bid;
5272889254eSJung-Sang Ahn
528d25f8d6dSJung-Sang Ahn        // set PTR and get block MARKER
52969467020SChiyoung Seo        ptr = dirty_block->item->addr;
5308e93bc7bSJung-Sang Ahn        marker = *((uint8_t*)(ptr) + bcache_blocksize-1);
53167d223e3SSundar Sridharan
53267d223e3SSundar Sridharan        node = avl_next(node);
53367d223e3SSundar Sridharan        // Get the next dirty block from the victim shard and insert it into
53467d223e3SSundar Sridharan        // the cross-shard dirty block list.
53567d223e3SSundar Sridharan        if (node) {
53667d223e3SSundar Sridharan            struct dirty_item *ditem = _get_entry(node, struct dirty_item,
53767d223e3SSundar Sridharan                                                  avl);
53867d223e3SSundar Sridharan            if (!immutables_only || ditem->item->flag & BCACHE_IMMUTABLE) {
53967d223e3SSundar Sridharan                dbid->bid = ditem->item->bid;
54067d223e3SSundar Sridharan                avl_insert(&dirty_blocks, &dbid->avl, _dirty_bid_cmp);
54167d223e3SSundar Sridharan            }
54267d223e3SSundar Sridharan        }
54367d223e3SSundar Sridharan
54467d223e3SSundar Sridharan        // remove from the shard dirty block list.
54567d223e3SSundar Sridharan        avl_remove(cur_tree, &dirty_block->avl);
54667d223e3SSundar Sridharan        if (dirty_block->item->flag & BCACHE_IMMUTABLE) {
54767d223e3SSundar Sridharan            atomic_decr_uint64_t(&fname_item->nimmutable);
54867d223e3SSundar Sridharan            if (!(sync && o_direct)) {
54967d223e3SSundar Sridharan                spin_unlock(&fname_item->shards[shard_num].lock);
55067d223e3SSundar Sridharan            }
55167d223e3SSundar Sridharan        }
55267d223e3SSundar Sridharan
553fa5a43c0SJung-Sang Ahn        if (sync) {
554d25f8d6dSJung-Sang Ahn            // copy to buffer
55500a9594fSJung-Sang Ahn#ifdef __CRC32
55669467020SChiyoung Seo            if (marker == BLK_MARKER_BNODE) {
55700a9594fSJung-Sang Ahn                // b-tree node .. calculate crc32 and put it into the block
55865a4da43SJung-Sang Ahn                memset((uint8_t *)(ptr) + BTREE_CRC_OFFSET,
55965a4da43SJung-Sang Ahn                       0xff, BTREE_CRC_FIELD_LEN);
56076e16138SJim Walker                uint32_t crc = get_checksum(reinterpret_cast<const uint8_t*>(ptr),
56176e16138SJim Walker                                            bcache_blocksize,
56276e16138SJim Walker                                            fname_item->curfile->crc_mode);
5636d79432aSJung-Sang Ahn                crc = _endian_encode(crc);
5646d79432aSJung-Sang Ahn                memcpy((uint8_t *)(ptr) + BTREE_CRC_OFFSET, &crc, sizeof(crc));
56500a9594fSJung-Sang Ahn            }
56600a9594fSJung-Sang Ahn#endif
56765a4da43SJung-Sang Ahn            if (o_direct) {
56869467020SChiyoung Seo                if (count > 0 && !consecutive_blocks) {
569671d9fbaSChiyoung Seo                    int64_t bytes_written;
57069467020SChiyoung Seo                    // Note that this path can be only executed in flush_all case.
571671d9fbaSChiyoung Seo                    bytes_written = filemgr_write_blocks(fname_item->curfile,
572671d9fbaSChiyoung Seo                                                         buf, count, start_bid);
573671d9fbaSChiyoung Seo                    if ((uint64_t)bytes_written != count * bcache_blocksize) {
57469467020SChiyoung Seo                        count = 0;
575671d9fbaSChiyoung Seo                        status = bytes_written < 0 ?
576671d9fbaSChiyoung Seo                            (fdb_status) bytes_written : FDB_RESULT_WRITE_FAIL;
57769467020SChiyoung Seo                        break;
57869467020SChiyoung Seo                    }
57969467020SChiyoung Seo                    // Start a new batch again.
58069467020SChiyoung Seo                    count = 0;
58169467020SChiyoung Seo                    start_bid = dirty_block->item->bid;
58269467020SChiyoung Seo                }
58365a4da43SJung-Sang Ahn                memcpy((uint8_t *)(buf) + count*bcache_blocksize,
58469467020SChiyoung Seo                       dirty_block->item->addr, bcache_blocksize);
58565a4da43SJung-Sang Ahn            } else {
5864690a73fSJens Alfke                ret = filemgr_write_blocks(fname_item->curfile,
5874690a73fSJens Alfke                                           dirty_block->item->addr,
5884690a73fSJens Alfke                                           1,
5894690a73fSJens Alfke                                           dirty_block->item->bid);
59065a4da43SJung-Sang Ahn                if (ret != bcache_blocksize) {
59167d223e3SSundar Sridharan                    if (!(dirty_block->item->flag & BCACHE_IMMUTABLE) &&
59267d223e3SSundar Sridharan                        !(sync && o_direct)) {
59369467020SChiyoung Seo                        spin_unlock(&fname_item->shards[shard_num].lock);
59469467020SChiyoung Seo                    }
595671d9fbaSChiyoung Seo                    status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;
59665a4da43SJung-Sang Ahn                    break;
59765a4da43SJung-Sang Ahn                }
59865a4da43SJung-Sang Ahn            }
599fa5a43c0SJung-Sang Ahn        }
6002889254eSJung-Sang Ahn
60167d223e3SSundar Sridharan        if (!(sync && o_direct)) {
60267d223e3SSundar Sridharan            if (dirty_block->item->flag & BCACHE_IMMUTABLE) {
60367d223e3SSundar Sridharan                spin_lock(&fname_item->shards[shard_num].lock);
60467d223e3SSundar Sridharan            }
60567d223e3SSundar Sridharan        }
6069ceccb2cSJung-Sang Ahn
60767d223e3SSundar Sridharan        dirty_block->item->flag &= ~(BCACHE_DIRTY);
60867d223e3SSundar Sridharan        dirty_block->item->flag &= ~(BCACHE_IMMUTABLE);
60969467020SChiyoung Seo        // move to the shard clean block list.
61069467020SChiyoung Seo        prevhead = fname_item->shards[shard_num].cleanlist.head;
61169467020SChiyoung Seo        (void)prevhead;
61269467020SChiyoung Seo        list_push_front(&fname_item->shards[shard_num].cleanlist,
61369467020SChiyoung Seo                        &dirty_block->item->list_elem);
61469467020SChiyoung Seo
615b6be7a1dSSundar Sridharan        fdb_assert(!(dirty_block->item->flag & BCACHE_FREE),
616b6be7a1dSSundar Sridharan                   dirty_block->item->flag, BCACHE_FREE);
617b6be7a1dSSundar Sridharan        fdb_assert(dirty_block->item->list_elem.prev == NULL &&
618b6be7a1dSSundar Sridharan                   prevhead == dirty_block->item->list_elem.next,
619b6be7a1dSSundar Sridharan                   prevhead, dirty_block->item->list_elem.next);
62069467020SChiyoung Seo        mempool_free(dirty_block);
62169467020SChiyoung Seo
62269467020SChiyoung Seo        if (!(sync && o_direct)) {
62369467020SChiyoung Seo            spin_unlock(&fname_item->shards[shard_num].lock);
62469467020SChiyoung Seo        }
6259ceccb2cSJung-Sang Ahn
6262534ac38SJung-Sang Ahn        count++;
62765a4da43SJung-Sang Ahn        if (count*bcache_blocksize >= bcache_flush_unit && sync) {
62869467020SChiyoung Seo            if (flush_all) {
62969467020SChiyoung Seo                if (o_direct) {
6304690a73fSJens Alfke                    ret = filemgr_write_blocks(fname_item->curfile, buf, count, start_bid);
6312b79205fSChiyoung Seo                    if ((size_t)ret != count * bcache_blocksize) {
63269467020SChiyoung Seo                        count = 0;
633671d9fbaSChiyoung Seo                        status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;
63469467020SChiyoung Seo                        break;
63569467020SChiyoung Seo                    }
63669467020SChiyoung Seo                    count = 0;
63769467020SChiyoung Seo                    start_bid = BLK_NOT_FOUND;
63869467020SChiyoung Seo                    prev_bid = BLK_NOT_FOUND;
63969467020SChiyoung Seo                }
64069467020SChiyoung Seo            } else {
64169467020SChiyoung Seo                break;
64269467020SChiyoung Seo            }
64365a4da43SJung-Sang Ahn        }
6442889254eSJung-Sang Ahn    }
6452889254eSJung-Sang Ahn
6462889254eSJung-Sang Ahn    // synchronize
64769467020SChiyoung Seo    if (sync && o_direct) {
64869467020SChiyoung Seo        if (count > 0) {
6494690a73fSJens Alfke            ret = filemgr_write_blocks(fname_item->curfile, buf, count, start_bid);
6502b79205fSChiyoung Seo            if ((size_t)ret != count * bcache_blocksize) {
651671d9fbaSChiyoung Seo                status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;