xref: /6.0.3/forestdb/src/blockcache.cc (revision a8f01baa)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2010 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
18#include <stdlib.h>
19#include <string.h>
20#include <stdint.h>
21
22#if !defined(WIN32) && !defined(_WIN32)
23#include <sys/time.h>
24#endif
25
26#include "hash_functions.h"
27#include "common.h"
28#include "libforestdb/fdb_errors.h"
29#include "hash.h"
30#include "list.h"
31#include "blockcache.h"
32#include "avltree.h"
33#include "atomic.h"
34
35#include "memleak.h"
36
37#ifdef __DEBUG
38#ifndef __DEBUG_BCACHE
39    #undef DBG
40    #undef DBGCMD
41    #undef DBGSW
42    #define DBG(...)
43    #define DBGCMD(...)
44    #define DBGSW(n, ...)
45#endif
46#endif
47
48// global lock
49static spin_t bcache_lock;
50
51// hash table for filename
52static struct hash fnamedic;
53
54// free block list
55static atomic_uint64_t freelist_count(0);
56static struct list freelist;
57static spin_t freelist_lock;
58
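// Upper bound on how many randomly sampled files _bcache_get_victim() examines
// when choosing the file with the oldest access timestamp as the eviction victim.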
59static const size_t MAX_VICTIM_SELECTIONS = 5;
60
61// file structure list
62static size_t num_files;
63static size_t file_array_capacity;
64static fnamedic_item ** file_list;
65static struct list file_zombies;
66static fdb_rw_lock filelist_lock; // Reader-Writer lock for the file list.
67
68//static struct list cleanlist, dirtylist;
69//static uint64_t nfree, nclean, ndirty;
70static uint64_t bcache_nblock;
71
72static int bcache_blocksize;
73static size_t bcache_flush_unit;
74
75struct bcache_shard {
76    spin_t lock;
77    // list for clean blocks
78    struct list cleanlist;
79    // tree for normal dirty blocks
80    struct avl_tree tree;
81    // tree for index nodes
82    struct avl_tree tree_idx;
83    // hash table for block lookup
84    struct hash hashtable;
85    // list elem for shard LRU
86    struct list_elem le;
87};
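// A block with id 'bid' is always handled by shard number (bid % num_shards) of
// its file, so different shards can be locked and accessed concurrently.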
88
89struct fnamedic_item {
90    char *filename;
91    uint16_t filename_len;
92    uint32_t hash;
93
94    // current opened filemgr instance
95    // (can be changed on-the-fly when file is closed and re-opened)
96    struct filemgr *curfile;
97
98    // Shards of the block cache for a file.
99    struct bcache_shard *shards;
100
101    // list elem for FILE_ZOMBIE
102    struct list_elem le;
103    // hash elem for FNAMEDIC
104    struct hash_elem hash_elem;
105
106    atomic_uint32_t ref_count;
107    atomic_uint64_t nvictim;
108    atomic_uint64_t nitems;
109    atomic_uint64_t nimmutable;
110    atomic_uint64_t access_timestamp;
111    size_t num_shards;
112};
113
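// Per-block state bits kept in bcache_item::flag: a block is DIRTY until it is
// written back, IMMUTABLE once it will not be modified again (a fully written
// document block or a stale dirty block), and FREE while it sits on the free list.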
114#define BCACHE_DIRTY (0x1)
115#define BCACHE_IMMUTABLE (0x2)
116#define BCACHE_FREE (0x4)
117
118static void *buffercache_addr = NULL;
119
120struct bcache_item {
121    // BID
122    bid_t bid;
123    // contents address
124    void *addr;
125    // hash elem for lookup hash table
126    struct hash_elem hash_elem;
127    // list elem for {free, clean, dirty} lists
128    struct list_elem list_elem;
129    // flag
130    atomic_uint8_t flag;
131    // score
132    uint8_t score;
133};
134
135struct dirty_item {
136    struct bcache_item *item;
137    struct avl_node avl;
138};
139
140INLINE int _dirty_cmp(struct avl_node *a, struct avl_node *b, void *aux)
141{
142    struct dirty_item *aa, *bb;
143    aa = _get_entry(a, struct dirty_item, avl);
144    bb = _get_entry(b, struct dirty_item, avl);
145
146    #ifdef __BIT_CMP
147        return _CMP_U64(aa->item->bid , bb->item->bid);
148
149    #else
150        if (aa->item->bid < bb->item->bid) return -1;
151        else if (aa->item->bid > bb->item->bid) return 1;
152        else return 0;
153
154    #endif
155}
156
157INLINE uint32_t _fname_hash(struct hash *hash, struct hash_elem *e)
158{
159    struct fnamedic_item *item;
160    item = _get_entry(e, struct fnamedic_item, hash_elem);
161    return item->hash % ((unsigned)(BCACHE_NDICBUCKET));
162}
163
164INLINE int _fname_cmp(struct hash_elem *a, struct hash_elem *b)
165{
166    size_t len;
167    struct fnamedic_item *aa, *bb;
168    aa = _get_entry(a, struct fnamedic_item, hash_elem);
169    bb = _get_entry(b, struct fnamedic_item, hash_elem);
170
171    if (aa->filename_len == bb->filename_len) {
172        return memcmp(aa->filename, bb->filename, aa->filename_len);
173    }else {
174        len = MIN(aa->filename_len , bb->filename_len);
175        int cmp = memcmp(aa->filename, bb->filename, len);
176        if (cmp != 0) return cmp;
177        else {
178            return (int)((int)aa->filename_len - (int)bb->filename_len);
179        }
180    }
181}
182
183INLINE uint32_t _bcache_hash(struct hash *hash, struct hash_elem *e)
184{
185    struct bcache_item *item = _get_entry(e, struct bcache_item, hash_elem);
186    return (item->bid) % ((uint32_t)BCACHE_NBUCKET);
187}
188
189INLINE int _bcache_cmp(struct hash_elem *a, struct hash_elem *b)
190{
191    struct bcache_item *aa, *bb;
192    aa = _get_entry(a, struct bcache_item, hash_elem);
193    bb = _get_entry(b, struct bcache_item, hash_elem);
194
195    #ifdef __BIT_CMP
196
197        return _CMP_U64(aa->bid, bb->bid);
198
199    #else
200
201        if (aa->bid == bb->bid) return 0;
202        else if (aa->bid < bb->bid) return -1;
203        else return 1;
204
205    #endif
206}
207
208#define _list_empty(list) (list.head == NULL)
209#define _tree_empty(tree) (tree.root == NULL)
210
211static void _acquire_all_shard_locks(struct fnamedic_item *fname) {
212    size_t i = 0;
213    for (; i < fname->num_shards; ++i) {
214        spin_lock(&fname->shards[i].lock);
215    }
216}
217
218static void _release_all_shard_locks(struct fnamedic_item *fname) {
219    size_t i = 0;
220    for (; i < fname->num_shards; ++i) {
221        spin_unlock(&fname->shards[i].lock);
222    }
223}
224
225static bool _file_empty(struct fnamedic_item *fname) {
226    bool empty = true;
227    size_t i = 0;
228    _acquire_all_shard_locks(fname);
229    for (; i < fname->num_shards; ++i) {
230        if (!(_list_empty(fname->shards[i].cleanlist) &&
231              _tree_empty(fname->shards[i].tree) &&
232              _tree_empty(fname->shards[i].tree_idx))) {
233            empty = false;
234            break;
235        }
236    }
237    _release_all_shard_locks(fname);
238    return empty;
239}
240
241INLINE bool _shard_empty(struct bcache_shard *bshard) {
242    // Caller should grab the shard lock before calling this function.
243    return _list_empty(bshard->cleanlist) &&
244           _tree_empty(bshard->tree) &&
245           _tree_empty(bshard->tree_idx);
246}
247
248struct fnamedic_item *_bcache_get_victim()
249{
250    struct fnamedic_item *ret = NULL;
251    uint64_t min_timestamp = (uint64_t) -1;
252    uint64_t victim_timestamp;
253    int victim_idx;
254    size_t num_attempts;
255
256    if (reader_lock(&filelist_lock) == 0) {
257        // Pick the victim that has the smallest access timestamp
258        // among files randomly selected.
259        num_attempts = num_files / 10 + 1;
260        if (num_attempts > MAX_VICTIM_SELECTIONS) {
261            num_attempts = MAX_VICTIM_SELECTIONS;
262        } else {
263            if(num_attempts == 1 && num_files > 1) {
264                ++num_attempts;
265            }
266        }
267        for (size_t i = 0; i < num_attempts && num_files; ++i) {
268            victim_idx = rand() % num_files;
269            victim_timestamp =
270                atomic_get_uint64_t(&file_list[victim_idx]->access_timestamp,
271                                    std::memory_order_relaxed);
272            if (victim_timestamp < min_timestamp &&
273                    atomic_get_uint64_t(&file_list[victim_idx]->nitems)) {
274                min_timestamp = victim_timestamp;
275                ret = file_list[victim_idx];
276            }
277        }
278
279        if (ret) {
280            atomic_incr_uint32_t(&ret->ref_count);
281        }
282        reader_unlock(&filelist_lock);
283    } else {
284        fprintf(stderr, "Error in _bcache_get_victim(): "
285                        "Failed to acquire ReaderLock on filelist_lock!\n");
286    }
287
288    return ret;
289}
290
291static struct bcache_item *_bcache_alloc_freeblock()
292{
293    struct list_elem *e = NULL;
294    struct bcache_item *item;
295
296    spin_lock(&freelist_lock);
297    e = list_pop_front(&freelist);
298    if (e) {
299        freelist_count--;
300    }
301    spin_unlock(&freelist_lock);
302
303    if (e) {
304        item = _get_entry(e, struct bcache_item, list_elem);
305        return item;
306    }
307    return NULL;
308}
309
310static void _bcache_release_freeblock(struct bcache_item *item)
311{
312    spin_lock(&freelist_lock);
313    item->flag = BCACHE_FREE;
314    item->score = 0;
315    list_push_front(&freelist, &item->list_elem);
316    freelist_count++;
317    spin_unlock(&freelist_lock);
318}
319
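// Scan the zombie list for a file entry whose reference count has dropped to
// zero (i.e., no concurrent _bcache_evict() still uses it), detach it from the
// list, and return it so that the caller can free it.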
320static struct fnamedic_item *_next_dead_fname_zombie(void) {
321    struct list_elem *e;
322    struct fnamedic_item *fname_item;
323    bool found = false;
324    if (writer_lock(&filelist_lock) == 0) {
325        e = list_begin(&file_zombies);
326        while (e) {
327            fname_item = _get_entry(e, struct fnamedic_item, le);
328            if (atomic_get_uint32_t(&fname_item->ref_count) == 0) {
329                list_remove(&file_zombies, e);
330                found = true;
331                break;
332            } else {
333                e = list_next(e);
334            }
335        }
336        writer_unlock(&filelist_lock);
337    } else {
338        fprintf(stderr, "Error in _next_dead_fname_zombie(): "
339                        "Failed to acquire WriterLock on filelist_lock!\n");
340    }
341    return found ? fname_item : NULL;
342}
343
344static void _fname_free(struct fnamedic_item *fname);
345
346static void _garbage_collect_zombie_fnames(void) {
347    struct fnamedic_item *fname_item = _next_dead_fname_zombie();
348    while (fname_item) {
349        _fname_free(fname_item);
350        fname_item = _next_dead_fname_zombie();
351    }
352}
353
354struct dirty_bid {
355    bid_t bid;
356    struct avl_node avl;
357};
358
359INLINE int _dirty_bid_cmp(struct avl_node *a, struct avl_node *b, void *aux)
360{
361    struct dirty_bid *aa, *bb;
362    aa = _get_entry(a, struct dirty_bid, avl);
363    bb = _get_entry(b, struct dirty_bid, avl);
364
365    #ifdef __BIT_CMP
366        return _CMP_U64(aa->bid , bb->bid);
367
368    #else
369        if (aa->bid < bb->bid) return -1;
370        else if (aa->bid > bb->bid) return 1;
371        else return 0;
372
373    #endif
374}
375
376static void _free_dirty_bids(struct dirty_bid **dirty_bids, size_t n) {
377    size_t i = 0;
378    for (; i < n; ++i) {
379        if (dirty_bids[i]) {
380            mempool_free(dirty_bids[i]);
381        }
382    }
383}
384
385// Flush some consecutive or all dirty blocks for a given file and
386// move them to the clean list.
387static fdb_status _flush_dirty_blocks(struct fnamedic_item *fname_item,
388                                      bool sync, bool flush_all,
389                                      bool immutables_only)
390{
391    void *buf = NULL;
392    struct list_elem *prevhead = NULL;
393    struct avl_tree *cur_tree = NULL;
394    struct avl_node *node = NULL;
395    struct dirty_bid *dbid = NULL;
396    uint64_t count = 0;
397    ssize_t ret = 0;
398    bid_t start_bid = 0, prev_bid = 0;
399    void *ptr = NULL;
400    uint8_t marker = 0x0;
401    fdb_status status = FDB_RESULT_SUCCESS;
402    bool o_direct = false;
403    bool data_block_completed = false;
404    struct avl_tree dirty_blocks; // Cross-shard dirty block list for sequential writes.
405
406    if (fname_item->curfile->config->flag & _ARCH_O_DIRECT) {
407        o_direct = true;
408    }
409
410    // scan and write back dirty blocks sequentially for O_DIRECT option.
411    if (sync && o_direct) {
412        malloc_align(buf, FDB_SECTOR_SIZE, bcache_flush_unit);
413        _acquire_all_shard_locks(fname_item);
414    }
415
416    prev_bid = start_bid = BLK_NOT_FOUND;
417    count = 0;
418
419    avl_init(&dirty_blocks, NULL);
420
421    // Try to flush the dirty data blocks first and then index blocks.
422    size_t i = 0;
423    bool consecutive_blocks = true;
424    struct dirty_bid **dirty_bids = alca(struct dirty_bid *, fname_item->num_shards);
425    memset(dirty_bids, 0x0, sizeof(dirty_bid *) * fname_item->num_shards);
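    // The loop below performs a k-way merge across shards: 'dirty_blocks' keeps
    // at most one entry per shard (that shard's smallest dirty BID). The global
    // minimum is popped and flushed, and the entry is replaced with the shard's
    // next dirty BID, so blocks are written back in ascending BID order
    // (data blocks first, then index blocks).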
426    while (1) {
427        if (!(node = avl_first(&dirty_blocks))) {
428            for (i = 0; i < fname_item->num_shards; ++i) {
429                if (!(sync && o_direct)) {
430                    spin_lock(&fname_item->shards[i].lock);
431                }
432                if (!data_block_completed) {
433                    node = avl_first(&fname_item->shards[i].tree);
434                } else {
435                    node = avl_first(&fname_item->shards[i].tree_idx);
436                }
437                if (node) {
438                    struct dirty_item *ditem = _get_entry(node,
439                                                struct dirty_item, avl);
440                    if (!immutables_only || // don't load mutable items
441                        ditem->item->flag & BCACHE_IMMUTABLE) {
442                        if (!dirty_bids[i]) {
443                            dirty_bids[i] = (struct dirty_bid *)
444                                mempool_alloc(sizeof(struct dirty_bid));
445                        }
446                        dirty_bids[i]->bid = ditem->item->bid;
447                        avl_insert(&dirty_blocks, &dirty_bids[i]->avl,
448                                   _dirty_bid_cmp);
449                    }
450                }
451                if (!(sync && o_direct)) {
452                    spin_unlock(&fname_item->shards[i].lock);
453                }
454            }
455            if (!(node = avl_first(&dirty_blocks))) {
456                if (!data_block_completed) {
457                    data_block_completed = true;
458                    if (count > 0 && !flush_all) {
459                        // Finished flushing some dirty data blocks.
                        // Do not move over to the dirty index block list
                        // because flush_all was not requested.
462                        break;
463                    }
464                    continue;
465                } else {
466                    break;
467                }
468            }
469        }
470
471        dbid = _get_entry(node, struct dirty_bid, avl);
472
473        size_t shard_num = dbid->bid % fname_item->num_shards;
474        if (!(sync && o_direct)) {
475            spin_lock(&fname_item->shards[shard_num].lock);
476        }
477        if (!data_block_completed) {
478            cur_tree = &fname_item->shards[shard_num].tree;
479        } else {
480            cur_tree = &fname_item->shards[shard_num].tree_idx;
481        }
482
483        struct dirty_item *dirty_block = NULL;
484        bool item_exist = false;
485        node = avl_first(cur_tree);
486        if (node) {
487            dirty_block = _get_entry(node, struct dirty_item, avl);
488            if (dbid->bid == dirty_block->item->bid) {
489                item_exist = true;
490            }
491        }
492        // remove from the cross-shard dirty block list.
493        avl_remove(&dirty_blocks, &dbid->avl);
494        if (!item_exist) {
495            // The original first item in the shard dirty block list was removed.
496            // Grab the next one from the cross-shard dirty block list.
497            if (!(sync && o_direct)) {
498                spin_unlock(&fname_item->shards[shard_num].lock);
499            }
500            if (immutables_only &&
501                !atomic_get_uint64_t(&fname_item->nimmutable)) {
502                break;
503            }
504            continue;
505        }
506
507        consecutive_blocks = true;
508        // if BID of next dirty block is not consecutive .. stop
509        if (dirty_block->item->bid != prev_bid + 1 && prev_bid != BLK_NOT_FOUND &&
510            sync) {
511            if (flush_all) {
512                consecutive_blocks = false;
513            } else {
514                if (!(sync && o_direct)) {
515                    spin_unlock(&fname_item->shards[shard_num].lock);
516                }
517                break;
518            }
519        }
520        // set START_BID if this is the start block for a single batch write.
521        if (start_bid == BLK_NOT_FOUND) {
522            start_bid = dirty_block->item->bid;
523        }
524        // set PREV_BID and go to next block
525        prev_bid = dirty_block->item->bid;
526
527        // set PTR and get block MARKER
528        ptr = dirty_block->item->addr;
529        marker = *((uint8_t*)(ptr) + bcache_blocksize-1);
530
531        node = avl_next(node);
532        // Get the next dirty block from the victim shard and insert it into
533        // the cross-shard dirty block list.
534        if (node) {
535            struct dirty_item *ditem = _get_entry(node, struct dirty_item,
536                                                  avl);
537            if (!immutables_only || ditem->item->flag & BCACHE_IMMUTABLE) {
538                dbid->bid = ditem->item->bid;
539                avl_insert(&dirty_blocks, &dbid->avl, _dirty_bid_cmp);
540            }
541        }
542
543        // remove from the shard dirty block list.
544        avl_remove(cur_tree, &dirty_block->avl);
545        if (dirty_block->item->flag & BCACHE_IMMUTABLE) {
546            atomic_decr_uint64_t(&fname_item->nimmutable);
547            if (!(sync && o_direct)) {
548                spin_unlock(&fname_item->shards[shard_num].lock);
549            }
550        }
551
552        if (sync) {
553            // copy to buffer
554#ifdef __CRC32
555            if (marker == BLK_MARKER_BNODE) {
556                // b-tree node .. calculate crc32 and put it into the block
557                memset((uint8_t *)(ptr) + BTREE_CRC_OFFSET,
558                       0xff, BTREE_CRC_FIELD_LEN);
559                uint32_t crc = get_checksum(reinterpret_cast<const uint8_t*>(ptr),
560                                            bcache_blocksize,
561                                            fname_item->curfile->crc_mode);
562                crc = _endian_encode(crc);
563                memcpy((uint8_t *)(ptr) + BTREE_CRC_OFFSET, &crc, sizeof(crc));
564            }
565#endif
566            if (o_direct) {
567                if (count > 0 && !consecutive_blocks) {
568                    int64_t bytes_written;
569                    // Note that this path can be only executed in flush_all case.
570                    bytes_written = filemgr_write_blocks(fname_item->curfile,
571                                                         buf, count, start_bid);
572                    if ((uint64_t)bytes_written != count * bcache_blocksize) {
573                        count = 0;
574                        status = bytes_written < 0 ?
575                            (fdb_status) bytes_written : FDB_RESULT_WRITE_FAIL;
576                        break;
577                    }
578                    // Start a new batch again.
579                    count = 0;
580                    start_bid = dirty_block->item->bid;
581                }
582                memcpy((uint8_t *)(buf) + count*bcache_blocksize,
583                       dirty_block->item->addr, bcache_blocksize);
584            } else {
585                ret = filemgr_write_blocks(fname_item->curfile,
586                                           dirty_block->item->addr,
587                                           1,
588                                           dirty_block->item->bid);
589                if (ret != bcache_blocksize) {
590                    if (!(dirty_block->item->flag & BCACHE_IMMUTABLE) &&
591                        !(sync && o_direct)) {
592                        spin_unlock(&fname_item->shards[shard_num].lock);
593                    }
594                    status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;
595                    break;
596                }
597            }
598        }
599
600        if (!(sync && o_direct)) {
601            if (dirty_block->item->flag & BCACHE_IMMUTABLE) {
602                spin_lock(&fname_item->shards[shard_num].lock);
603            }
604        }
605
606        dirty_block->item->flag &= ~(BCACHE_DIRTY);
607        dirty_block->item->flag &= ~(BCACHE_IMMUTABLE);
608        // move to the shard clean block list.
609        prevhead = fname_item->shards[shard_num].cleanlist.head;
610        (void)prevhead;
611        list_push_front(&fname_item->shards[shard_num].cleanlist,
612                        &dirty_block->item->list_elem);
613
614        fdb_assert(!(dirty_block->item->flag & BCACHE_FREE),
615                   dirty_block->item->flag, BCACHE_FREE);
616        fdb_assert(dirty_block->item->list_elem.prev == NULL &&
617                   prevhead == dirty_block->item->list_elem.next,
618                   prevhead, dirty_block->item->list_elem.next);
619        mempool_free(dirty_block);
620
621        if (!(sync && o_direct)) {
622            spin_unlock(&fname_item->shards[shard_num].lock);
623        }
624
625        count++;
626        if (count*bcache_blocksize >= bcache_flush_unit && sync) {
627            if (flush_all) {
628                if (o_direct) {
629                    ret = filemgr_write_blocks(fname_item->curfile, buf, count, start_bid);
630                    if ((size_t)ret != count * bcache_blocksize) {
631                        count = 0;
632                        status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;
633                        break;
634                    }
635                    count = 0;
636                    start_bid = BLK_NOT_FOUND;
637                    prev_bid = BLK_NOT_FOUND;
638                }
639            } else {
640                break;
641            }
642        }
643    }
644
645    // synchronize
646    if (sync && o_direct) {
647        if (count > 0) {
648            ret = filemgr_write_blocks(fname_item->curfile, buf, count, start_bid);
649            if ((size_t)ret != count * bcache_blocksize) {
650                status = ret < 0 ? (fdb_status) ret : FDB_RESULT_WRITE_FAIL;
651            }
652        }
653        _release_all_shard_locks(fname_item);
654        free_align(buf);
655    }
656
657    _free_dirty_bids(dirty_bids, fname_item->num_shards);
658    return status;
659}
660
661// perform eviction
662static struct list_elem * _bcache_evict(struct fnamedic_item *curfile)
663{
664    size_t n_evict;
665    struct list_elem *e = NULL;
666    struct bcache_item *item;
667    struct fnamedic_item *victim = NULL;
668
669    // We don't need to grab the global buffer cache lock here because
670    // the file's buffer cache instance (fnamedic_item) can be freed only if
671    // there are no database handles opened for that file.
672
673    while (victim == NULL) {
674        // select a victim file
675        victim = _bcache_get_victim();
676        while(victim) {
            // check whether this file has at least one block to be evicted
678            if (atomic_get_uint64_t(&victim->nitems)) {
679                // select this file as victim
680                break;
681            } else {
682                atomic_decr_uint32_t(&victim->ref_count);
683                victim = NULL; // Try to select a victim again
684            }
685        }
686    }
687    fdb_assert(victim, victim, NULL);
688
689    atomic_incr_uint64_t(&victim->nvictim, std::memory_order_relaxed);
690
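    // Evict up to BCACHE_EVICT_UNIT clean blocks from the victim file, scanning
    // its shards round-robin from a random starting point. If a shard has no
    // clean block, dirty blocks are flushed first to replenish the clean list.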
691    // select the clean blocks from the victim file
692    n_evict = 0;
693    while(n_evict < BCACHE_EVICT_UNIT) {
694        size_t num_shards = victim->num_shards;
695        size_t i = random(num_shards);
696        bool found_victim_shard = false;
697        bcache_shard *bshard = NULL;
698
699        for (size_t to_visit = num_shards; to_visit; --to_visit) {
            i = (i + 1) % num_shards; // Round-robin over shards, skipping empty ones.
701            bshard = &victim->shards[i];
702            spin_lock(&bshard->lock);
703            if (_shard_empty(bshard)) {
704                spin_unlock(&bshard->lock);
705                continue;
706            }
707            e = list_pop_back(&bshard->cleanlist);
708            if(!e) {
709                spin_unlock(&bshard->lock);
                // When the victim shard has no clean block, flush some dirty
                // blocks so that they become clean and evictable.
712                if (_flush_dirty_blocks(victim, true, false, false)
713                    != FDB_RESULT_SUCCESS) {
714                    atomic_decr_uint32_t(&victim->ref_count);
715                    return NULL;
716                }
717                continue; // Select a victim shard again.
718            }
719
720            item = _get_entry(e, struct bcache_item, list_elem);
721#ifdef __BCACHE_SECOND_CHANCE
722            // repeat until zero-score item is found
723            if (item->score == 0) {
724                found_victim_shard = true;
725                break;
726            } else {
727                // give second chance to the item
728                item->score--;
729                list_push_front(&bshard->cleanlist, &item->list_elem);
730                spin_unlock(&bshard->lock);
731            }
732#else
733            found_victim_shard = true;
734            break;
735#endif
736        }
737        if (!found_victim_shard) {
738            // We couldn't find any non-empty shards even after 'num_shards'
739            // attempts.
740            // The file is *likely* empty. Note that it is OK to return NULL
741            // even if the file is not empty because the caller will retry again.
742            atomic_decr_uint32_t(&victim->ref_count);
743            return NULL;
744        }
745
746        atomic_decr_uint64_t(&victim->nitems);
747        // remove from hash and insert into freelist
748        hash_remove(&bshard->hashtable, &item->hash_elem);
749        // add to freelist
750        _bcache_release_freeblock(item);
751        n_evict++;
752
753        spin_unlock(&bshard->lock);
754
755        if (atomic_get_uint64_t(&victim->nitems) == 0) {
756            break;
757        }
758    }
759
760    atomic_decr_uint32_t(&victim->ref_count);
761    return &item->list_elem;
762}
763
764static struct fnamedic_item * _fname_create(struct filemgr *file) {
    // TODO: we MUST NOT directly read the file structure
766
767    struct fnamedic_item *fname_new;
768    // Before we create a new filename entry, garbage collect zombies
769    _garbage_collect_zombie_fnames();
770    fname_new = (struct fnamedic_item *)malloc(sizeof(struct fnamedic_item));
771
772    fname_new->filename_len = strlen(file->filename);
773    fname_new->filename = (char *)malloc(fname_new->filename_len + 1);
774    memcpy(fname_new->filename, file->filename, fname_new->filename_len);
775    fname_new->filename[fname_new->filename_len] = 0;
776
777    // calculate hash value
778    fname_new->hash = get_checksum(reinterpret_cast<const uint8_t*>(fname_new->filename),
779                                   fname_new->filename_len,
780                                   file->crc_mode);
781    fname_new->curfile = file;
782    atomic_init_uint64_t(&fname_new->nvictim, 0);
783    atomic_init_uint64_t(&fname_new->nitems, 0);
784    atomic_init_uint64_t(&fname_new->nimmutable, 0);
785    atomic_init_uint32_t(&fname_new->ref_count, 0);
786    atomic_init_uint64_t(&fname_new->access_timestamp, 0);
787    if (file->config->num_bcache_shards) {
788        fname_new->num_shards = file->config->num_bcache_shards;
789    } else {
790        fname_new->num_shards = DEFAULT_NUM_BCACHE_PARTITIONS;
791    }
792    // For random eviction among shards
793    randomize();
794
795    fname_new->shards = (bcache_shard *)
796        malloc(sizeof(struct bcache_shard) * fname_new->num_shards);
797    size_t i = 0;
798    for (; i < fname_new->num_shards; ++i) {
799        // initialize tree
800        avl_init(&fname_new->shards[i].tree, NULL);
801        avl_init(&fname_new->shards[i].tree_idx, NULL);
802        // initialize clean list
803        list_init(&fname_new->shards[i].cleanlist);
804        // initialize hash table
805        hash_init(&fname_new->shards[i].hashtable, BCACHE_NBUCKET,
806                  _bcache_hash, _bcache_cmp);
807        spin_init(&fname_new->shards[i].lock);
808    }
809
810    // insert into fname dictionary
811    hash_insert(&fnamedic, &fname_new->hash_elem);
812    file->bcache.store(fname_new, std::memory_order_relaxed);
813
814    if (writer_lock(&filelist_lock) == 0) {
815        if (num_files == file_array_capacity) {
816            file_array_capacity *= 2;
            file_list = (struct fnamedic_item **)
                realloc(file_list, file_array_capacity * sizeof(struct fnamedic_item *));
819        }
820        file_list[num_files++] = fname_new;
821        writer_unlock(&filelist_lock);
822    } else {
823        fprintf(stderr, "Error in _fname_create(): "
824                        "Failed to acquire WriterLock on filelist_lock!\n");
825    }
826
827    return fname_new;
828}
829
830static bool _fname_try_free(struct fnamedic_item *fname)
831{
832    bool ret = true;
833
834    if (writer_lock(&filelist_lock) == 0) {
835        // Remove from the file list array
836        bool found = false;
837        for (size_t i = 0; i < num_files; ++i) {
838            if (file_list[i] == fname) {
839                found = true;
840            }
841            if (found && (i+1 < num_files)) {
842                file_list[i] = file_list[i+1];
843            }
844        }
845        if (!found) {
846            writer_unlock(&filelist_lock);
847            DBG("Error: fnamedic_item instance for a file '%s' can't be "
848                    "found in the buffer cache's file list.\n", fname->filename);
849            return false;
850        }
851
852        file_list[num_files - 1] = NULL;
853        --num_files;
854        if (atomic_get_uint32_t(&fname->ref_count) != 0) {
855            // This item is a victim by another thread's _bcache_evict()
856            list_push_front(&file_zombies, &fname->le);
857            ret = false; // Delay deletion
858        }
859
860        writer_unlock(&filelist_lock);
861    } else {
862        fprintf(stderr, "Error in _fname_try_free(): "
863                        "Failed to acquire WriterLock on filelist_lock!\n");
864    }
865
866    return ret;
867}
868
869static void _fname_free(struct fnamedic_item *fname)
870{
871    // file must be empty
872    if (!_file_empty(fname)) {
873        DBG("Warning: failed to free fnamedic_item instance for a file '%s' "
874            "because the fnamedic_item instance is not empty!\n",
875            fname->filename);
876        return;
877    }
878    uint32_t ref_count = atomic_get_uint32_t(&fname->ref_count);
879    if (ref_count != 0) {
880        DBG("Warning: failed to free fnamedic_item instance for a file '%s' "
881            "because its ref counter is not zero!\n",
882            fname->filename);
883        return;
884    }
885
886    // free hash
887    size_t i = 0;
888    for (; i < fname->num_shards; ++i) {
889        hash_free(&fname->shards[i].hashtable);
890        spin_destroy(&fname->shards[i].lock);
891    }
892
893    free(fname->shards);
894    free(fname->filename);
895    free(fname);
896}
897
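// Give b-tree node blocks a score of 1 (document blocks get 0) so that the
// second-chance pass in _bcache_evict() keeps index nodes cached a bit longer.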
898INLINE void _bcache_set_score(struct bcache_item *item)
899{
900#ifdef __CRC32
901    uint8_t marker;
902
903    // set PTR and get block MARKER
904    marker = *((uint8_t*)(item->addr) + bcache_blocksize-1);
905    if (marker == BLK_MARKER_BNODE ) {
906        // b-tree node .. set item's score to 1
907        item->score = 1;
908    } else {
909        item->score = 0;
910    }
911#endif
912}
913
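// Look up block 'bid' of 'file' in the cache. On a hit the cached contents are
// copied into 'buf', the block is moved to the head of its shard's clean list
// (if clean), and the block size is returned; a miss returns 0.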
914int bcache_read(struct filemgr *file, bid_t bid, void *buf)
915{
916    struct hash_elem *h;
917    struct bcache_item *item;
918    struct bcache_item query;
919    struct fnamedic_item *fname;
920
921    // Note that we don't need to grab bcache_lock here as the block cache
    // is already created and bound when the file is created or opened for
923    // the first time.
924    fname = file->bcache.load(std::memory_order_relaxed);
925
926    if (fname) {
927        // file exists
928        // set query
929        query.bid = bid;
930        // Update the access timestamp.
931        struct timeval tp;
932        gettimeofday(&tp, NULL); // TODO: Need to implement a better way of
933                                 // getting the timestamp to avoid the overhead of
934                                 // gettimeofday()
935        atomic_store_uint64_t(&fname->access_timestamp,
936                              (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec),
937                              std::memory_order_relaxed);
938
939        size_t shard_num = bid % fname->num_shards;
940        spin_lock(&fname->shards[shard_num].lock);
941
942        // search shard hash table
943        h = hash_find(&fname->shards[shard_num].hashtable, &query.hash_elem);
944        if (h) {
945            // cache hit
946            item = _get_entry(h, struct bcache_item, hash_elem);
947            if (item->flag & BCACHE_FREE) {
948                spin_unlock(&fname->shards[shard_num].lock);
949                DBG("Warning: failed to read the buffer cache entry for a file '%s' "
950                    "because the entry belongs to the free list!\n",
951                    file->filename);
952                return 0;
953            }
954
955            // move the item to the head of list if the block is clean
956            // (don't care if the block is dirty)
957            if (!(item->flag & BCACHE_DIRTY)) {
958                // TODO: Scanning the list would cause some overhead. We need to devise
959                // the better data structure to provide a fast lookup for the clean list.
960                list_remove(&fname->shards[shard_num].cleanlist, &item->list_elem);
961                list_push_front(&fname->shards[shard_num].cleanlist, &item->list_elem);
962            }
963
964            memcpy(buf, item->addr, bcache_blocksize);
965            _bcache_set_score(item);
966
967            spin_unlock(&fname->shards[shard_num].lock);
968
969            return bcache_blocksize;
970        } else {
971            // cache miss
972            spin_unlock(&fname->shards[shard_num].lock);
973        }
974    }
975
976    // does not exist .. cache miss
977    return 0;
978}
979
980bool bcache_invalidate_block(struct filemgr *file, bid_t bid)
981{
982    struct hash_elem *h;
983    struct bcache_item *item;
984    struct bcache_item query;
985    struct fnamedic_item *fname;
986    bool ret = false;
987
988    // Note that we don't need to grab bcache_lock here as the block cache
    // is already created and bound when the file is created or opened for
990    // the first time.
991    fname = file->bcache;
992
993    if (fname) {
994        // file exists
995        // set query
996        query.bid = bid;
997        // Update the access timestamp.
998        struct timeval tp;
999        gettimeofday(&tp, NULL);
1000        atomic_store_uint64_t(&fname->access_timestamp,
1001                              (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec),
1002                              std::memory_order_relaxed);
1003
1004        size_t shard_num = bid % fname->num_shards;
1005        spin_lock(&fname->shards[shard_num].lock);
1006
1007        // search BHASH
1008        h = hash_find(&fname->shards[shard_num].hashtable, &query.hash_elem);
1009        if (h) {
1010            // cache hit
1011            item = _get_entry(h, struct bcache_item, hash_elem);
1012            if (item->flag & BCACHE_FREE) {
1013                spin_unlock(&fname->shards[shard_num].lock);
1014                DBG("Warning: failed to invalidate the buffer cache entry for a file '%s' "
1015                    "because the entry belongs to the free list!\n",
1016                    file->filename);
1017                return false;
1018            }
1019
1020            if (!(item->flag & BCACHE_DIRTY)) {
1021                atomic_decr_uint64_t(&fname->nitems);
1022                // only for clean blocks
1023                // remove from hash and insert into freelist
1024                hash_remove(&fname->shards[shard_num].hashtable, &item->hash_elem);
1025                // remove from clean list
1026                list_remove(&fname->shards[shard_num].cleanlist, &item->list_elem);
1027                spin_unlock(&fname->shards[shard_num].lock);
1028
1029                // add to freelist
1030                _bcache_release_freeblock(item);
1031                ret = true;
1032            } else {
1033                item->flag |= BCACHE_IMMUTABLE; // (stale index node block)
1034                atomic_incr_uint64_t(&fname->nimmutable);
1035                spin_unlock(&fname->shards[shard_num].lock);
1036            }
1037        } else {
1038            // cache miss
1039            spin_unlock(&fname->shards[shard_num].lock);
1040        }
1041    }
1042    return ret;
1043}
1044
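// Insert or update the cached copy of block 'bid' of 'file'. On a cache miss a
// free cache block is taken from the global free list (evicting from some file
// if none is available), the caller's buffer is copied into it, and the block is
// linked onto either the shard's clean list or its dirty tree, depending on 'dirty'.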
1045int bcache_write(struct filemgr *file,
1046                 bid_t bid,
1047                 void *buf,
1048                 bcache_dirty_t dirty,
1049                 bool final_write)
1050{
1051    struct hash_elem *h = NULL;
1052    struct bcache_item *item;
1053    struct bcache_item query;
1054    struct fnamedic_item *fname_new;
1055
1056    fname_new = file->bcache;
1057    if (fname_new == NULL) {
1058        spin_lock(&bcache_lock);
1059        fname_new = file->bcache;
1060        if (fname_new == NULL) {
1061            // filename doesn't exist in filename dictionary .. create
1062            fname_new = _fname_create(file);
1063        }
1064        spin_unlock(&bcache_lock);
1065    }
1066
1067    // Update the access timestamp.
1068    struct timeval tp;
1069    gettimeofday(&tp, NULL);
1070    atomic_store_uint64_t(&fname_new->access_timestamp,
1071                          (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec),
1072                          std::memory_order_relaxed);
1073
1074    size_t shard_num = bid % fname_new->num_shards;
1075    // set query
1076    query.bid = bid;
1077
1078    spin_lock(&fname_new->shards[shard_num].lock);
1079
1080    // search hash table
1081    h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
1082    if (h == NULL) {
1083        // cache miss
1084        // get a free block
1085        while ((item = _bcache_alloc_freeblock()) == NULL) {
1086            // no free block .. perform eviction
1087            spin_unlock(&fname_new->shards[shard_num].lock);
1088
1089            _bcache_evict(fname_new);
1090
1091            spin_lock(&fname_new->shards[shard_num].lock);
1092        }
1093
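        // While this shard's lock was released for eviction, another thread may
        // have inserted an entry for the same BID, so look the BID up again
        // before using the newly allocated free block.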
1094        // re-search hash table
1095        h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
1096        if (h == NULL) {
1097            // insert into hash table
1098            item->bid = bid;
1099            item->flag = BCACHE_FREE;
1100            hash_insert(&fname_new->shards[shard_num].hashtable, &item->hash_elem);
1101            h = &item->hash_elem;
1102        } else {
1103            // insert into freelist again
1104            _bcache_release_freeblock(item);
1105            item = _get_entry(h, struct bcache_item, hash_elem);
1106        }
1107    } else {
1108        item = _get_entry(h, struct bcache_item, hash_elem);
1109    }
1110
1111    fdb_assert(h, h, NULL);
1112
1113    if (item->flag & BCACHE_FREE) {
1114        atomic_incr_uint64_t(&fname_new->nitems);
1115    }
1116
1117    // remove from the list if the block is in clean list
1118    if (!(item->flag & BCACHE_DIRTY) && !(item->flag & BCACHE_FREE)) {
1119        list_remove(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1120    }
1121    item->flag &= ~BCACHE_FREE;
1122
1123    if (dirty == BCACHE_REQ_DIRTY) {
1124        // DIRTY request
1125        // to avoid re-insert already existing item into tree
1126        if (!(item->flag & BCACHE_DIRTY)) {
1127            // dirty block
1128            // insert into tree
1129            struct dirty_item *ditem;
1130            uint8_t marker;
1131
1132            ditem = (struct dirty_item *)
1133                    mempool_alloc(sizeof(struct dirty_item));
1134            ditem->item = item;
1135
1136            marker = *((uint8_t*)buf + bcache_blocksize-1);
1137            if (marker == BLK_MARKER_BNODE) {
1138                // b-tree node
1139                avl_insert(&fname_new->shards[shard_num].tree_idx, &ditem->avl, _dirty_cmp);
1140            } else {
1141                if (final_write) {
1142                    item->flag |= BCACHE_IMMUTABLE; // (fully written doc block)
1143                    atomic_incr_uint64_t(&fname_new->nimmutable);
1144                }
1145                avl_insert(&fname_new->shards[shard_num].tree, &ditem->avl,
1146                           _dirty_cmp);
1147            }
1148        }
1149        item->flag |= BCACHE_DIRTY;
1150    } else {
1151        // CLEAN request
1152        // insert into clean list only when it was originally clean
1153        if (!(item->flag & BCACHE_DIRTY)) {
1154            list_push_front(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1155            item->flag &= ~(BCACHE_DIRTY);
1156        }
1157    }
1158
1159    memcpy(item->addr, buf, bcache_blocksize);
1160    _bcache_set_score(item);
1161
1162    spin_unlock(&fname_new->shards[shard_num].lock);
1163
1164    return bcache_blocksize;
1165}
1166
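// Overwrite 'len' bytes at 'offset' inside an already-cached block. Returns 0 on
// a cache miss; otherwise the block is marked dirty and 'len' is returned.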
1167int bcache_write_partial(struct filemgr *file,
1168                         bid_t bid,
1169                         void *buf,
1170                         size_t offset,
1171                         size_t len,
1172                         bool final_write)
1173{
1174    struct hash_elem *h;
1175    struct bcache_item *item;
1176    struct bcache_item query;
1177    struct fnamedic_item *fname_new;
1178
1179    fname_new = file->bcache;
1180    if (fname_new == NULL) {
1181        spin_lock(&bcache_lock);
1182        fname_new = file->bcache;
1183        if (fname_new == NULL) {
1184            // filename doesn't exist in filename dictionary .. create
1185            fname_new = _fname_create(file);
1186        }
1187        spin_unlock(&bcache_lock);
1188    }
1189
1190    // Update the access timestamp.
1191    struct timeval tp;
1192    gettimeofday(&tp, NULL);
1193    atomic_store_uint64_t(&fname_new->access_timestamp,
1194                          (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec),
1195                          std::memory_order_relaxed);
1196
1197    size_t shard_num = bid % fname_new->num_shards;
1198    // set query
1199    query.bid = bid;
1200
1201    spin_lock(&fname_new->shards[shard_num].lock);
1202
1203    // search hash table
1204    h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
1205    if (h == NULL) {
1206        // cache miss .. partial write fail .. return 0
1207        spin_unlock(&fname_new->shards[shard_num].lock);
1208        return 0;
1209
1210    } else {
1211        // cache hit .. get the block
1212        item = _get_entry(h, struct bcache_item, hash_elem);
1213    }
1214
    if (item->flag & BCACHE_FREE) {
        DBG("Warning: failed to write on the buffer cache entry for a file '%s' "
            "because the entry belongs to the free list!\n",
            file->filename);
        // Release the shard lock acquired above before bailing out.
        spin_unlock(&fname_new->shards[shard_num].lock);
        return 0;
    }
1221
1222    // check whether this is dirty block
1223    // to avoid re-insert already existing item into tree
1224    if (!(item->flag & BCACHE_DIRTY)) {
1225        // this block was clean block
1226        uint8_t marker;
1227        struct dirty_item *ditem;
1228
1229        // remove from clean list
1230        list_remove(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1231
1232        ditem = (struct dirty_item *)mempool_alloc(sizeof(struct dirty_item));
1233        ditem->item = item;
1234
1235        // insert into tree
1236        marker = *((uint8_t*)item->addr + bcache_blocksize-1);
1237        if (marker == BLK_MARKER_BNODE ) {
1238            // b-tree node
1239            avl_insert(&fname_new->shards[shard_num].tree_idx, &ditem->avl, _dirty_cmp);
1240        } else {
1241            avl_insert(&fname_new->shards[shard_num].tree, &ditem->avl,
1242                       _dirty_cmp);
1243            if (final_write) {
1244                item->flag |= BCACHE_IMMUTABLE; // (fully written doc block)
1245                atomic_incr_uint64_t(&fname_new->nimmutable);
1246            }
1247        }
1248    } else if (!(item->flag & BCACHE_IMMUTABLE)) {
1249        if (final_write) {
1250            item->flag |= BCACHE_IMMUTABLE; // (fully written doc block)
1251            atomic_incr_uint64_t(&fname_new->nimmutable);
1252        }
1253    }
1254
1255    // always set this block as dirty
1256    item->flag |= BCACHE_DIRTY;
1257
1258    memcpy((uint8_t *)(item->addr) + offset, buf, len);
1259    _bcache_set_score(item);
1260
1261    spin_unlock(&fname_new->shards[shard_num].lock);
1262
1263    return len;
1264}
1265
1266// flush and synchronize a batch of contiguous dirty immutable blocks in file
1267// dirty blocks will be changed to clean blocks (not discarded)
1268// Note: This function can be invoked as part of background flushing
1269fdb_status bcache_flush_immutable(struct filemgr *file)
1270{
1271    struct fnamedic_item *fname_item;
1272    fdb_status status = FDB_RESULT_SUCCESS;
1273    fname_item = file->bcache;
1274    if (fname_item) {
1275        status = _flush_dirty_blocks(fname_item, true, true, true);
1276    }
1277    return status;
1278}
1279
1280// remove all dirty blocks of the FILE
1281// (they are only discarded and not written back)
1282void bcache_remove_dirty_blocks(struct filemgr *file)
1283{
1284    struct fnamedic_item *fname_item;
1285
1286    fname_item = file->bcache;
1287
1288    if (fname_item) {
1289        // Note that this function is only invoked as part of database file close or
1290        // removal when there are no database handles for a given file. Therefore,
1291        // we don't need to grab all the shard locks at once.
1292
1293        // remove all dirty blocks
1294        _flush_dirty_blocks(fname_item, false, true, false);
1295    }
1296}
1297
1298// remove all clean blocks of the FILE
1299void bcache_remove_clean_blocks(struct filemgr *file)
1300{
1301    struct list_elem *e;
1302    struct bcache_item *item;
1303    struct fnamedic_item *fname_item;
1304
1305    fname_item = file->bcache;
1306
1307    if (fname_item) {
1308        // Note that this function is only invoked as part of database file close or
1309        // removal when there are no database handles for a given file. Therefore,
1310        // we don't need to grab all the shard locks at once.
1311
1312        // remove all clean blocks from each shard in a file.
1313        size_t i = 0;
1314        for (; i < fname_item->num_shards; ++i) {
1315            spin_lock(&fname_item->shards[i].lock);
1316            e = list_begin(&fname_item->shards[i].cleanlist);
1317            while(e){
1318                item = _get_entry(e, struct bcache_item, list_elem);
1319                // remove from clean list
1320                e = list_remove(&fname_item->shards[i].cleanlist, e);
1321                // remove from hash table
1322                hash_remove(&fname_item->shards[i].hashtable, &item->hash_elem);
1323                // insert into free list
1324                _bcache_release_freeblock(item);
1325            }
1326            spin_unlock(&fname_item->shards[i].lock);
1327        }
1328    }
1329}
1330
1331// remove file from filename dictionary
// The caller MUST make sure that no dirty block belonging to this FILE remains
// (otherwise a memory leak occurs)
1334bool bcache_remove_file(struct filemgr *file)
1335{
1336    bool rv = false;
1337    struct fnamedic_item *fname_item;
1338
1339    // Before proceeding with deletion, garbage collect zombie files
1340    _garbage_collect_zombie_fnames();
1341    fname_item = file->bcache;
1342
1343    if (fname_item) {
1344        // acquire lock
1345        spin_lock(&bcache_lock);
1346        // file must be empty
1347        if (!_file_empty(fname_item)) {
1348            spin_unlock(&bcache_lock);
1349            DBG("Warning: failed to remove fnamedic_item instance for a file '%s' "
1350                "because the fnamedic_item instance is not empty!\n",
1351                file->filename);
1352            return rv;
1353        }
1354
1355        // remove from fname dictionary hash table
1356        hash_remove(&fnamedic, &fname_item->hash_elem);
1357        spin_unlock(&bcache_lock);
1358
1359        // We don't need to grab the file buffer cache's partition locks
1360        // at once because this function is only invoked when there are
1361        // no database handles that access the file.
1362        if (_fname_try_free(fname_item)) {
1363            _fname_free(fname_item); // no other callers accessing this file
1364            rv = true;
1365        } // else fnamedic_item is in use by _bcache_evict. Deletion delayed
1366    }
1367    return rv;
1368}
1369
1370// flush and synchronize all dirty blocks of the FILE
1371// dirty blocks will be changed to clean blocks (not discarded)
1372fdb_status bcache_flush(struct filemgr *file)
1373{
1374    struct fnamedic_item *fname_item;
1375    fdb_status status = FDB_RESULT_SUCCESS;
1376
1377    fname_item = file->bcache;
1378
1379    if (fname_item) {
1380        // Note that this function is invoked as part of a commit operation while
1381        // the filemgr's lock is already grabbed by a committer.
1382        // Therefore, we don't need to grab all the shard locks at once.
1383        status = _flush_dirty_blocks(fname_item, true, true, false);
1384    }
1385    return status;
1386}
1387
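// Initialize the global buffer cache: a single contiguous allocation of
// nblock * blocksize bytes is carved into bcache_item entries, all of which
// start out on the global free list.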
1388void bcache_init(int nblock, int blocksize)
1389{
1390    int i;
1391    struct bcache_item *item;
1392    uint8_t *block_ptr;
1393
1394    list_init(&freelist);
1395    list_init(&file_zombies);
1396
1397    hash_init(&fnamedic, BCACHE_NDICBUCKET, _fname_hash, _fname_cmp);
1398
1399    bcache_blocksize = blocksize;
1400    bcache_flush_unit = BCACHE_FLUSH_UNIT;
1401    bcache_nblock = nblock;
1402    spin_init(&bcache_lock);
1403    spin_init(&freelist_lock);
1404
1405    int rv = init_rw_lock(&filelist_lock);
1406    if (rv != 0) {
1407        fprintf(stderr, "Error in bcache_init(): "
1408                        "RW Lock initialization failed; ErrorCode: %d\n", rv);
1409    }
1410
1411    freelist_count = 0;
1412
1413    num_files = 0;
1414    file_array_capacity = 4096; // Initial capacity of file list array.
1415    file_list = (fnamedic_item **) calloc(file_array_capacity, sizeof(fnamedic_item *));
1416    // Allocate entire buffer cache memory
1417    block_ptr = (uint8_t *)malloc((uint64_t)bcache_blocksize * nblock);
1418    buffercache_addr = block_ptr;
1419
1420    for (i=0;i<nblock;++i){
1421        item = (struct bcache_item *)malloc(sizeof(struct bcache_item));
1422
1423        item->bid = BLK_NOT_FOUND;
1424        item->flag = 0x0 | BCACHE_FREE;
1425        item->score = 0;
1426        item->addr = block_ptr;
1427        block_ptr += bcache_blocksize;
1428
1429        list_push_front(&freelist, &item->list_elem);
1430        freelist_count++;
1431    }
1432}
1433
1434uint64_t bcache_get_num_free_blocks()
1435{
1436    return freelist_count;
1437}
1438
1439uint64_t bcache_get_num_blocks(struct filemgr *file)
1440{
1441    struct fnamedic_item *fname = file->bcache;
1442    if (fname) {
1443        return atomic_get_uint64_t(&fname->nitems);
1444    }
1445    return 0;
1446}
1447
1448uint64_t bcache_get_num_immutable(struct filemgr *file)
1449{
1450    struct fnamedic_item *fname = file->bcache;
1451    if (fname) {
1452        return atomic_get_uint64_t(&fname->nimmutable);
1453    }
1454    return 0;
1455
1456}
1457
1458// LCOV_EXCL_START
1459void bcache_print_items()
1460{
1461    size_t n=1;
1462    size_t nfiles, nitems, nfileitems, nclean, ndirty;
1463    size_t scores[100], i, scores_local[100];
1464    size_t docs, bnodes;
1465    size_t docs_local, bnodes_local;
1466    uint8_t *ptr;
1467
1468    nfiles = nitems = nfileitems = nclean = ndirty = 0;
1469    docs = bnodes = 0;
1470    memset(scores, 0, sizeof(size_t)*100);
1471
1472    struct fnamedic_item *fname;
1473    struct bcache_item *item;
1474    struct dirty_item *dirty;
1475    struct list_elem *ee;
1476    struct avl_node *a;
1477
1478    printf(" === Block cache statistics summary ===\n");
1479    printf("%3s %20s (%6s)(%6s)(c%6s d%6s)",
1480        "No", "Filename", "#Pages", "#Evict", "Clean", "Dirty");
1481#ifdef __CRC32
1482    printf("%6s%6s", "Doc", "Node");
1483#endif
1484    for (i=0;i<=n;++i) {
1485        printf("   [%d] ", (int)i);
1486    }
1487    printf("\n");
1488
1489    for (size_t idx = 0; idx < num_files; ++idx) {
1490        fname = file_list[idx];
1491        memset(scores_local, 0, sizeof(size_t)*100);
1492        nfileitems = nclean = ndirty = 0;
1493        docs_local = bnodes_local = 0;
1494
1495        size_t i = 0;
1496        for (; i < fname->num_shards; ++i) {
1497            ee = list_begin(&fname->shards[i].cleanlist);
1498            a = avl_first(&fname->shards[i].tree);
1499
1500            while(ee){
1501                item = _get_entry(ee, struct bcache_item, list_elem);
1502                scores[item->score]++;
1503                scores_local[item->score]++;
1504                nitems++;
1505                nfileitems++;
1506                nclean++;
1507#ifdef __CRC32
1508                ptr = (uint8_t*)item->addr + bcache_blocksize - 1;
1509                switch (*ptr) {
1510                case BLK_MARKER_BNODE:
1511                    bnodes_local++;
1512                    break;
1513                case BLK_MARKER_DOC:
1514                    docs_local++;
1515                    break;
1516                }
1517#endif
1518                ee = list_next(ee);
1519            }
1520            while(a){
1521                dirty = _get_entry(a, struct dirty_item, avl);
1522                item = dirty->item;
1523                scores[item->score]++;
1524                scores_local[item->score]++;
1525                nitems++;
1526                nfileitems++;
1527                ndirty++;
1528#ifdef __CRC32
1529                ptr = (uint8_t*)item->addr + bcache_blocksize - 1;
1530                switch (*ptr) {
1531                case BLK_MARKER_BNODE:
1532                    bnodes_local++;
1533                    break;
1534                case BLK_MARKER_DOC:
1535                    docs_local++;
1536                    break;
1537                }
1538#endif
1539                a = avl_next(a);
1540            }
1541        }
1542
1543        printf("%3d %20s (%6d)(%6d)(c%6d d%6d)",
1544               (int)nfiles+1, fname->filename,
1545               (int)atomic_get_uint64_t(&fname->nitems),
1546               (int)atomic_get_uint64_t(&fname->nvictim),
1547               (int)nclean, (int)ndirty);
1548        printf("%6d%6d", (int)docs_local, (int)bnodes_local);
1549        for (i=0;i<=n;++i){
1550            printf("%6d ", (int)scores_local[i]);
1551        }
1552        printf("\n");
1553
1554        docs += docs_local;
1555        bnodes += bnodes_local;
1556
1557        nfiles++;
1558    }
1559    printf(" ===\n");
1560
1561    printf("%d files %d items\n", (int)nfiles, (int)nitems);
1562    for (i=0;i<=n;++i){
1563        printf("[%d]: %d\n", (int)i, (int)scores[i]);
1564    }
1565    printf("Documents: %d blocks\n", (int)docs);
1566    printf("Index nodes: %d blocks\n", (int)bnodes);
1567}
1568// LCOV_EXCL_STOP
1569
1570// LCOV_EXCL_START
1571INLINE void _bcache_free_bcache_item(struct hash_elem *h)
1572{
1573    struct bcache_item *item = _get_entry(h, struct bcache_item, hash_elem);
1574    free(item);
1575}
1576// LCOV_EXCL_STOP
1577
1578// LCOV_EXCL_START
1579INLINE void _bcache_free_fnamedic(struct hash_elem *h)
1580{
1581    size_t i = 0;
1582    struct fnamedic_item *item;
1583    item = _get_entry(h, struct fnamedic_item, hash_elem);
1584
1585    for (; i < item->num_shards; ++i) {
1586        hash_free_active(&item->shards[i].hashtable, _bcache_free_bcache_item);
1587        spin_destroy(&item->shards[i].lock);
1588    }
1589
1590    free(item->shards);
1591    free(item->filename);
1592
1593    free(item);
1594}
1595// LCOV_EXCL_STOP
1596
1597void bcache_shutdown()
1598{
1599    struct bcache_item *item;
1600    struct list_elem *e;
1601
1602    e = list_begin(&freelist);
1603    while(e) {
1604        item = _get_entry(e, struct bcache_item, list_elem);
1605        e = list_remove(&freelist, e);
1606        freelist_count--;
1607        free(item);
1608    }
1609
1610    // Force clean zombies if any
1611    e = list_begin(&file_zombies);
1612    while (e) {
1613        struct fnamedic_item *fname = _get_entry(e, struct fnamedic_item, le);
1614        e = list_remove(&file_zombies, e);
1615        _fname_free(fname);
1616    }
1617    // Free the file list array
1618    free(file_list);
1619
1620    // Free entire buffercache memory
1621    free(buffercache_addr);
1622
1623    spin_lock(&bcache_lock);
1624    hash_free_active(&fnamedic, _bcache_free_fnamedic);
1625    spin_unlock(&bcache_lock);
1626
1627    spin_destroy(&bcache_lock);
1628    spin_destroy(&freelist_lock);
1629
1630    int rv = destroy_rw_lock(&filelist_lock);
1631    if (rv != 0) {
1632        fprintf(stderr, "Error in bcache_shutdown(): "
1633                        "RW Lock's destruction failed; ErrorCode: %d\n", rv);
1634    }
1635}
1636
1637