1 /* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3  *     Copyright 2010 Couchbase, Inc
4  *
5  *   Licensed under the Apache License, Version 2.0 (the "License");
6  *   you may not use this file except in compliance with the License.
7  *   You may obtain a copy of the License at
8  *
9  *       http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *   Unless required by applicable law or agreed to in writing, software
12  *   distributed under the License is distributed on an "AS IS" BASIS,
13  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *   See the License for the specific language governing permissions and
15  *   limitations under the License.
16  */
17 
18 #include <stdlib.h>
19 #include <string.h>
20 #include <stdint.h>
21 
22 #if !defined(WIN32) && !defined(_WIN32)
23 #include <sys/time.h>
24 #endif
25 
26 #include "hash_functions.h"
27 #include "common.h"
28 #include "libforestdb/fdb_errors.h"
29 #include "hash.h"
30 #include "list.h"
31 #include "blockcache.h"
32 #include "avltree.h"
33 #include "atomic.h"
34 
35 #include "memleak.h"
36 
37 #ifdef __DEBUG
38 #ifndef __DEBUG_BCACHE
39     #undef DBG
40     #undef DBGCMD
41     #undef DBGSW
42     #define DBG(...)
43     #define DBGCMD(...)
44     #define DBGSW(n, ...)
45 #endif
46 #endif
47 
48 // global lock
49 static spin_t bcache_lock;
50 
51 // hash table for filename
52 static struct hash fnamedic;
53 
54 // free block list
55 static volatile size_t freelist_count;
56 static struct list freelist;
57 static spin_t freelist_lock;
58 
59 static const size_t MAX_VICTIM_SELECTIONS = 5;
60 
61 // file structure list
62 static size_t num_files;
63 static size_t file_array_capacity;
64 static fnamedic_item ** file_list;
65 static struct list file_zombies;
66 static rw_spin_t filelist_lock; // Reader-Writer spinlock for the file list.
67 
68 //static struct list cleanlist, dirtylist;
69 //static uint64_t nfree, nclean, ndirty;
70 static uint64_t bcache_nblock;
71 
72 static int bcache_blocksize;
73 static size_t bcache_flush_unit;
74 
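// Each file's buffer cache is split into multiple shards to reduce lock
// contention: a block with id 'bid' always maps to shard (bid % num_shards),
// and every shard owns its own spinlock, clean-block list, dirty-block AVL
// trees (data vs. index blocks), and lookup hash table.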
75 struct bcache_shard {
76     spin_t lock;
77     // list for clean blocks
78     struct list cleanlist;
79     // tree for normal dirty blocks
80     struct avl_tree tree;
81     // tree for index nodes
82     struct avl_tree tree_idx;
83     // hash table for block lookup
84     struct hash hashtable;
85     // list elem for shard LRU
86     struct list_elem le;
87 };
88 
89 struct fnamedic_item {
90     char *filename;
91     uint16_t filename_len;
92     uint32_t hash;
93 
94     // current opened filemgr instance
95     // (can be changed on-the-fly when file is closed and re-opened)
96     struct filemgr *curfile;
97 
98     // Shards of the block cache for a file.
99     struct bcache_shard *shards;
100 
101     // list elem for FILE_ZOMBIE
102     struct list_elem le;
103     // hash elem for FNAMEDIC
104     struct hash_elem hash_elem;
105 
106     atomic_uint32_t ref_count;
107     atomic_uint64_t nvictim;
108     atomic_uint64_t nitems;
109     atomic_uint64_t access_timestamp;
110     size_t num_shards;
111 };
112 
113 #define BCACHE_DIRTY (0x1)
114 #define BCACHE_FREE (0x4)
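// Block state flags: a block sitting on the global free list carries
// BCACHE_FREE; once populated it is either clean (linked on its shard's
// cleanlist) or marked BCACHE_DIRTY and tracked in one of the shard's dirty
// AVL trees until it is flushed back to disk.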
115 
116 struct bcache_item {
117     // BID
118     bid_t bid;
119     // contents address
120     void *addr;
121     // hash elem for lookup hash table
122     struct hash_elem hash_elem;
123     // list elem for {free, clean, dirty} lists
124     struct list_elem list_elem;
125     // flag
126     uint8_t flag;
127     // score
128     uint8_t score;
129 };
130 
131 struct dirty_item {
132     struct bcache_item *item;
133     struct avl_node avl;
134 };
135 
INLINE int _dirty_cmp(struct avl_node *a, struct avl_node *b, void *aux)
137 {
138     struct dirty_item *aa, *bb;
139     aa = _get_entry(a, struct dirty_item, avl);
140     bb = _get_entry(b, struct dirty_item, avl);
141 
142     #ifdef __BIT_CMP
143         return _CMP_U64(aa->item->bid , bb->item->bid);
144 
145     #else
146         if (aa->item->bid < bb->item->bid) return -1;
147         else if (aa->item->bid > bb->item->bid) return 1;
148         else return 0;
149 
150     #endif
151 }
152 
INLINE uint32_t _fname_hash(struct hash *hash, struct hash_elem *e)
154 {
155     struct fnamedic_item *item;
156     item = _get_entry(e, struct fnamedic_item, hash_elem);
157     return item->hash % ((unsigned)(BCACHE_NDICBUCKET));
158 }
159 
INLINE int _fname_cmp(struct hash_elem *a, struct hash_elem *b)
161 {
162     size_t len;
163     struct fnamedic_item *aa, *bb;
164     aa = _get_entry(a, struct fnamedic_item, hash_elem);
165     bb = _get_entry(b, struct fnamedic_item, hash_elem);
166 
167     if (aa->filename_len == bb->filename_len) {
168         return memcmp(aa->filename, bb->filename, aa->filename_len);
169     }else {
170         len = MIN(aa->filename_len , bb->filename_len);
171         int cmp = memcmp(aa->filename, bb->filename, len);
172         if (cmp != 0) return cmp;
173         else {
174             return (int)((int)aa->filename_len - (int)bb->filename_len);
175         }
176     }
177 }
178 
INLINE uint32_t _bcache_hash(struct hash *hash, struct hash_elem *e)
180 {
181     struct bcache_item *item = _get_entry(e, struct bcache_item, hash_elem);
182     return (item->bid) % ((uint32_t)BCACHE_NBUCKET);
183 }
184 
INLINE int _bcache_cmp(struct hash_elem *a, struct hash_elem *b)
186 {
187     struct bcache_item *aa, *bb;
188     aa = _get_entry(a, struct bcache_item, hash_elem);
189     bb = _get_entry(b, struct bcache_item, hash_elem);
190 
191     #ifdef __BIT_CMP
192 
193         return _CMP_U64(aa->bid, bb->bid);
194 
195     #else
196 
197         if (aa->bid == bb->bid) return 0;
198         else if (aa->bid < bb->bid) return -1;
199         else return 1;
200 
201     #endif
202 }
203 
204 #define _list_empty(list) (list.head == NULL)
205 #define _tree_empty(tree) (tree.root == NULL)
206 
static void _acquire_all_shard_locks(struct fnamedic_item *fname) {
208     size_t i = 0;
209     for (; i < fname->num_shards; ++i) {
210         spin_lock(&fname->shards[i].lock);
211     }
212 }
213 
static void _release_all_shard_locks(struct fnamedic_item *fname) {
215     size_t i = 0;
216     for (; i < fname->num_shards; ++i) {
217         spin_unlock(&fname->shards[i].lock);
218     }
219 }
220 
static bool _file_empty(struct fnamedic_item *fname) {
222     bool empty = true;
223     size_t i = 0;
224     _acquire_all_shard_locks(fname);
225     for (; i < fname->num_shards; ++i) {
226         if (!(_list_empty(fname->shards[i].cleanlist) &&
227               _tree_empty(fname->shards[i].tree) &&
228               _tree_empty(fname->shards[i].tree_idx))) {
229             empty = false;
230             break;
231         }
232     }
233     _release_all_shard_locks(fname);
234     return empty;
235 }
236 
INLINE bool _shard_empty(struct bcache_shard *bshard) {
238     // Caller should grab the shard lock before calling this function.
239     return _list_empty(bshard->cleanlist) &&
240            _tree_empty(bshard->tree) &&
241            _tree_empty(bshard->tree_idx);
242 }
243 
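// Victim file selection: sample up to MAX_VICTIM_SELECTIONS randomly chosen
// files (num_files / 10 + 1, but at least two whenever more than one file is
// open) and pick the one with the oldest access timestamp that still caches
// at least one block. The victim's ref_count is incremented so that it cannot
// be freed while eviction is in progress.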
struct fnamedic_item *_bcache_get_victim()
245 {
246     struct fnamedic_item *ret = NULL;
247     uint64_t min_timestamp = (uint64_t) -1;
248     uint64_t victim_timestamp;
249     int victim_idx;
250     size_t num_attempts;
251 
252     rw_spin_read_lock(&filelist_lock);
253     // Pick the victim that has the smallest access timestamp among files randomly selected.
254     num_attempts = num_files / 10 + 1;
    if (num_attempts > MAX_VICTIM_SELECTIONS) {
        num_attempts = MAX_VICTIM_SELECTIONS;
    } else {
        if (num_attempts == 1 && num_files > 1) {
            ++num_attempts;
        }
    }
    for (size_t i = 0; i < num_attempts && num_files; ++i) {
        victim_idx = rand() % num_files;
        victim_timestamp = atomic_get_uint64_t(&file_list[victim_idx]->access_timestamp);
        if (victim_timestamp < min_timestamp &&
            atomic_get_uint64_t(&file_list[victim_idx]->nitems)) {
            min_timestamp = victim_timestamp;
            ret = file_list[victim_idx];
        }
    }
271 
272     if (ret) {
273         atomic_incr_uint32_t(&ret->ref_count);
274     }
275     rw_spin_read_unlock(&filelist_lock);
276 
277     return ret;
278 }
279 
static struct bcache_item *_bcache_alloc_freeblock()
281 {
282     struct list_elem *e = NULL;
283     struct bcache_item *item;
284 
285     spin_lock(&freelist_lock);
286     e = list_pop_front(&freelist);
287     if (e) {
288         freelist_count--;
289     }
290     spin_unlock(&freelist_lock);
291 
292     if (e) {
293         item = _get_entry(e, struct bcache_item, list_elem);
294         return item;
295     }
296     return NULL;
297 }
298 
static void _bcache_release_freeblock(struct bcache_item *item)
300 {
301     spin_lock(&freelist_lock);
302     item->flag = BCACHE_FREE;
303     item->score = 0;
304     list_push_front(&freelist, &item->list_elem);
305     freelist_count++;
306     spin_unlock(&freelist_lock);
307 }
308 
static struct fnamedic_item *_next_dead_fname_zombie(void) {
310     struct list_elem *e;
311     struct fnamedic_item *fname_item;
312     bool found = false;
313     rw_spin_write_lock(&filelist_lock);
314     e = list_begin(&file_zombies);
315     while (e) {
316         fname_item = _get_entry(e, struct fnamedic_item, le);
317         if (atomic_get_uint32_t(&fname_item->ref_count) == 0) {
318             list_remove(&file_zombies, e);
319             found = true;
320             break;
321         } else {
322             e = list_next(e);
323         }
324     }
325     rw_spin_write_unlock(&filelist_lock);
326     return found ? fname_item : NULL;
327 }
328 
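// A file entry removed from the cache while still referenced by an in-flight
// eviction is parked on 'file_zombies'; it is reclaimed here once its
// ref_count drops back to zero.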
329 static void _fname_free(struct fnamedic_item *fname);
330 
static void _garbage_collect_zombie_fnames(void) {
332     struct fnamedic_item *fname_item = _next_dead_fname_zombie();
333     while (fname_item) {
334         _fname_free(fname_item);
335         fname_item = _next_dead_fname_zombie();
336     }
337 }
338 
339 struct dirty_bid {
340     bid_t bid;
341     struct avl_node avl;
342 };
343 
INLINE int _dirty_bid_cmp(struct avl_node *a, struct avl_node *b, void *aux)
345 {
346     struct dirty_bid *aa, *bb;
347     aa = _get_entry(a, struct dirty_bid, avl);
348     bb = _get_entry(b, struct dirty_bid, avl);
349 
350     #ifdef __BIT_CMP
351         return _CMP_U64(aa->bid , bb->bid);
352 
353     #else
354         if (aa->bid < bb->bid) return -1;
355         else if (aa->bid > bb->bid) return 1;
356         else return 0;
357 
358     #endif
359 }
360 
static void _free_dirty_blocks(struct dirty_bid **dirty_bids, size_t n) {
362     size_t i = 0;
363     for (; i < n; ++i) {
364         if (dirty_bids[i]) {
365             mempool_free(dirty_bids[i]);
366         }
367     }
368 }
369 
370 // Flush some consecutive or all dirty blocks for a given file and
371 // move them to the clean list.
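// One candidate per shard (that shard's smallest dirty BID) is kept in the
// cross-shard 'dirty_blocks' tree, so avl_first() always yields the globally
// smallest dirty block and writes can be issued in ascending, mostly
// consecutive block order.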
static fdb_status _flush_dirty_blocks(struct fnamedic_item *fname_item,
373                                       bool sync, bool flush_all)
374 {
375     void *buf = NULL;
376     struct list_elem *prevhead = NULL;
377     struct avl_tree *cur_tree = NULL;
378     struct avl_node *node = NULL;
379     struct dirty_bid *dbid = NULL;
380     uint64_t count = 0;
381     ssize_t ret = 0;
382     bid_t start_bid = 0, prev_bid = 0;
383     void *ptr = NULL;
384     uint8_t marker = 0x0;
385     fdb_status status = FDB_RESULT_SUCCESS;
386     bool o_direct = false;
387     bool data_block_completed = false;
388     struct avl_tree dirty_blocks; // Cross-shard dirty block list for sequential writes.
389 
390     if (fname_item->curfile->config->flag & _ARCH_O_DIRECT) {
391         o_direct = true;
392     }
393 
394     // scan and write back dirty blocks sequentially for O_DIRECT option.
395     if (sync && o_direct) {
396         malloc_align(buf, FDB_SECTOR_SIZE, bcache_flush_unit);
397         _acquire_all_shard_locks(fname_item);
398     }
399 
400     prev_bid = start_bid = BLK_NOT_FOUND;
401     count = 0;
402 
403     avl_init(&dirty_blocks, NULL);
404 
405     // Try to flush the dirty data blocks first and then index blocks.
406     size_t i = 0;
407     bool consecutive_blocks = true;
408     struct dirty_bid **dirty_bids = alca(struct dirty_bid *, fname_item->num_shards);
409     memset(dirty_bids, 0x0, sizeof(dirty_bid *) * fname_item->num_shards);
410     while (1) {
411         if (!(node = avl_first(&dirty_blocks))) {
412             for (i = 0; i < fname_item->num_shards; ++i) {
413                 if (!(sync && o_direct)) {
414                     spin_lock(&fname_item->shards[i].lock);
415                 }
416                 if (!data_block_completed) {
417                     node = avl_first(&fname_item->shards[i].tree);
418                 } else {
419                     node = avl_first(&fname_item->shards[i].tree_idx);
420                 }
421                 if (node) {
422                     if (!dirty_bids[i]) {
423                         dirty_bids[i] = (struct dirty_bid *)
424                             mempool_alloc(sizeof(struct dirty_bid));
425                     }
426                     dirty_bids[i]->bid = _get_entry(node, struct dirty_item, avl)->item->bid;
427                     avl_insert(&dirty_blocks, &dirty_bids[i]->avl, _dirty_bid_cmp);
428                 }
429                 if (!(sync && o_direct)) {
430                     spin_unlock(&fname_item->shards[i].lock);
431                 }
432             }
433             if (!(node = avl_first(&dirty_blocks))) {
434                 if (!data_block_completed) {
435                     data_block_completed = true;
436                     if (count > 0 && !flush_all) {
437                         // Finished flushing some dirty data blocks.
                        // Do not move on to the dirty index block list
                        // because flush_all was not requested.
440                         break;
441                     }
442                     continue;
443                 } else {
444                     break;
445                 }
446             }
447         }
448 
449         dbid = _get_entry(node, struct dirty_bid, avl);
450 
451         size_t shard_num = dbid->bid % fname_item->num_shards;
452         if (!(sync && o_direct)) {
453             spin_lock(&fname_item->shards[shard_num].lock);
454         }
455         if (!data_block_completed) {
456             cur_tree = &fname_item->shards[shard_num].tree;
457         } else {
458             cur_tree = &fname_item->shards[shard_num].tree_idx;
459         }
460 
461         struct dirty_item *dirty_block = NULL;
462         bool item_exist = false;
463         node = avl_first(cur_tree);
464         if (node) {
465             dirty_block = _get_entry(node, struct dirty_item, avl);
466             if (dbid->bid == dirty_block->item->bid) {
467                 item_exist = true;
468             }
469         }
470         // remove from the cross-shard dirty block list.
471         avl_remove(&dirty_blocks, &dbid->avl);
472         if (!item_exist) {
473             // The original first item in the shard dirty block list was removed.
474             // Grab the next one from the cross-shard dirty block list.
475             if (!(sync && o_direct)) {
476                 spin_unlock(&fname_item->shards[shard_num].lock);
477             }
478             continue;
479         }
480 
481         consecutive_blocks = true;
482         // if BID of next dirty block is not consecutive .. stop
483         if (dirty_block->item->bid != prev_bid + 1 && prev_bid != BLK_NOT_FOUND &&
484             sync) {
485             if (flush_all) {
486                 consecutive_blocks = false;
487             } else {
488                 if (!(sync && o_direct)) {
489                     spin_unlock(&fname_item->shards[shard_num].lock);
490                 }
491                 break;
492             }
493         }
494         // set START_BID if this is the start block for a single batch write.
495         if (start_bid == BLK_NOT_FOUND) {
496             start_bid = dirty_block->item->bid;
497         }
498         // set PREV_BID and go to next block
499         prev_bid = dirty_block->item->bid;
500 
501         // set PTR and get block MARKER
502         ptr = dirty_block->item->addr;
503         marker = *((uint8_t*)(ptr) + bcache_blocksize-1);
504         dirty_block->item->flag &= ~(BCACHE_DIRTY);
505         if (sync) {
506             // copy to buffer
507 #ifdef __CRC32
508             if (marker == BLK_MARKER_BNODE) {
509                 // b-tree node .. calculate crc32 and put it into the block
510                 memset((uint8_t *)(ptr) + BTREE_CRC_OFFSET,
511                        0xff, BTREE_CRC_FIELD_LEN);
512                 uint32_t crc = chksum(ptr, bcache_blocksize);
513                 crc = _endian_encode(crc);
514                 memcpy((uint8_t *)(ptr) + BTREE_CRC_OFFSET, &crc, sizeof(crc));
515             }
516 #endif
517             if (o_direct) {
518                 if (count > 0 && !consecutive_blocks) {
519                     size_t bytes_written;
520                     // Note that this path can be only executed in flush_all case.
521                     bytes_written = (size_t)fname_item->curfile->ops->pwrite(
522                                               fname_item->curfile->fd,
523                                               buf, count * bcache_blocksize,
524                                               start_bid * bcache_blocksize);
525                     if (bytes_written != count * bcache_blocksize) {
526                         count = 0;
527                         status = FDB_RESULT_WRITE_FAIL;
528                         break;
529                     }
530                     // Start a new batch again.
531                     count = 0;
532                     start_bid = dirty_block->item->bid;
533                 }
534                 memcpy((uint8_t *)(buf) + count*bcache_blocksize,
535                        dirty_block->item->addr, bcache_blocksize);
536             } else {
537                 ret = fname_item->curfile->ops->pwrite(fname_item->curfile->fd,
538                                                        dirty_block->item->addr,
539                                                        bcache_blocksize,
540                                                        dirty_block->item->bid * bcache_blocksize);
541                 if (ret != bcache_blocksize) {
542                     if (!(sync && o_direct)) {
543                         spin_unlock(&fname_item->shards[shard_num].lock);
544                     }
545                     status = FDB_RESULT_WRITE_FAIL;
546                     break;
547                 }
548             }
549         }
550 
551         node = avl_next(node);
552         // remove from the shard dirty block list.
553         avl_remove(cur_tree, &dirty_block->avl);
554 
555         // move to the shard clean block list.
556         prevhead = fname_item->shards[shard_num].cleanlist.head;
557         (void)prevhead;
558         list_push_front(&fname_item->shards[shard_num].cleanlist,
559                         &dirty_block->item->list_elem);
560 
561         fdb_assert(!(dirty_block->item->flag & BCACHE_FREE),
562                    dirty_block->item->flag, BCACHE_FREE);
563         fdb_assert(dirty_block->item->list_elem.prev == NULL &&
564                    prevhead == dirty_block->item->list_elem.next,
565                    prevhead, dirty_block->item->list_elem.next);
566         mempool_free(dirty_block);
567 
568         // Get the next dirty block from the victim shard and insert it into
569         // the cross-shard dirty block list.
570         if (node) {
571             dbid->bid = _get_entry(node, struct dirty_item, avl)->item->bid;
572             avl_insert(&dirty_blocks, &dbid->avl, _dirty_bid_cmp);
573         }
574         if (!(sync && o_direct)) {
575             spin_unlock(&fname_item->shards[shard_num].lock);
576         }
577 
578         count++;
579         if (count*bcache_blocksize >= bcache_flush_unit && sync) {
580             if (flush_all) {
581                 if (o_direct) {
582                     ret = fname_item->curfile->ops->pwrite(fname_item->curfile->fd,
583                                                            buf, count * bcache_blocksize,
584                                                            start_bid * bcache_blocksize);
585                     if ((size_t)ret != count * bcache_blocksize) {
586                         count = 0;
587                         status = FDB_RESULT_WRITE_FAIL;
588                         break;
589                     }
590                     count = 0;
591                     start_bid = BLK_NOT_FOUND;
592                     prev_bid = BLK_NOT_FOUND;
593                 }
594             } else {
595                 break;
596             }
597         }
598     }
599 
600     // synchronize
601     if (sync && o_direct) {
602         if (count > 0) {
603             ret = fname_item->curfile->ops->pwrite(fname_item->curfile->fd, buf,
604                                                    count * bcache_blocksize,
605                                                    start_bid * bcache_blocksize);
606             if ((size_t)ret != count * bcache_blocksize) {
607                 status = FDB_RESULT_WRITE_FAIL;
608             }
609         }
610         _release_all_shard_locks(fname_item);
611         free_align(buf);
612     }
613 
614     _free_dirty_blocks(dirty_bids, fname_item->num_shards);
615     return status;
616 }
617 
618 // perform eviction
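// A random shard of the victim file is chosen and the shards are scanned
// round-robin for clean blocks; with __BCACHE_SECOND_CHANCE enabled, a clean
// block with a non-zero score gets its score decremented and is pushed back
// to the head of the clean list instead of being evicted immediately.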
static struct list_elem * _bcache_evict(struct fnamedic_item *curfile)
620 {
621     size_t n_evict;
622     struct list_elem *e = NULL;
623     struct bcache_item *item;
624     struct fnamedic_item *victim = NULL;
625 
626     // We don't need to grab the global buffer cache lock here because
627     // the file's buffer cache instance (fnamedic_item) can be freed only if
628     // there are no database handles opened for that file.
629 
630     while (victim == NULL) {
631         // select a victim file
632         victim = _bcache_get_victim();
633         while(victim) {
            // check whether this file has at least one block to be evicted
635             if (atomic_get_uint64_t(&victim->nitems)) {
636                 // select this file as victim
637                 break;
638             } else {
639                 atomic_decr_uint32_t(&victim->ref_count);
640                 victim = NULL; // Try to select a victim again
641             }
642         }
643     }
644     fdb_assert(victim, victim, NULL);
645 
646     atomic_incr_uint64_t(&victim->nvictim);
647 
648     // select the clean blocks from the victim file
649     n_evict = 0;
650     while(n_evict < BCACHE_EVICT_UNIT) {
651         size_t num_shards = victim->num_shards;
652         size_t i = random(num_shards);
653         bool found_victim_shard = false;
654         bcache_shard *bshard = NULL;
655 
656         for (size_t to_visit = num_shards; to_visit; --to_visit) {
            i = (i + 1) % num_shards; // Round-robin, skipping empty shards.
658             bshard = &victim->shards[i];
659             spin_lock(&bshard->lock);
660             if (_shard_empty(bshard)) {
661                 spin_unlock(&bshard->lock);
662                 continue;
663             }
664             e = list_pop_back(&bshard->cleanlist);
665             if(!e) {
666                 spin_unlock(&bshard->lock);
667                 // When the victim shard has no clean block, evict some dirty blocks
668                 // from shards.
669                 if (_flush_dirty_blocks(victim, true, false) != FDB_RESULT_SUCCESS) {
670                     atomic_decr_uint32_t(&victim->ref_count);
671                     return NULL;
672                 }
673                 continue; // Select a victim shard again.
674             }
675 
676             item = _get_entry(e, struct bcache_item, list_elem);
677 #ifdef __BCACHE_SECOND_CHANCE
678             // repeat until zero-score item is found
679             if (item->score == 0) {
680                 found_victim_shard = true;
681                 break;
682             } else {
683                 // give second chance to the item
684                 item->score--;
685                 list_push_front(&bshard->cleanlist, &item->list_elem);
686                 spin_unlock(&bshard->lock);
687             }
688 #else
689             found_victim_shard = true;
690             break;
691 #endif
692         }
693         if (!found_victim_shard) {
694             // We couldn't find any non-empty shards even after 'num_shards'
695             // attempts.
696             // The file is *likely* empty. Note that it is OK to return NULL
697             // even if the file is not empty because the caller will retry again.
698             atomic_decr_uint32_t(&victim->ref_count);
699             return NULL;
700         }
701 
702         atomic_decr_uint64_t(&victim->nitems);
703         // remove from hash and insert into freelist
704         hash_remove(&bshard->hashtable, &item->hash_elem);
705         // add to freelist
706         _bcache_release_freeblock(item);
707         n_evict++;
708 
709         spin_unlock(&bshard->lock);
710 
711         if (atomic_get_uint64_t(&victim->nitems) == 0) {
712             break;
713         }
714     }
715 
716     atomic_decr_uint32_t(&victim->ref_count);
717     return &item->list_elem;
718 }
719 
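// Creates the per-file cache entry: the shard count comes from the file's
// num_bcache_shards setting (falling back to DEFAULT_NUM_BCACHE_PARTITIONS),
// and the new entry is registered both in the filename hash table and in the
// flat file_list array used for random victim selection.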
static struct fnamedic_item * _fname_create(struct filemgr *file) {
    // TODO: we MUST NOT directly read the file structure
722 
723     struct fnamedic_item *fname_new;
724     // Before we create a new filename entry, garbage collect zombies
725     _garbage_collect_zombie_fnames();
726     fname_new = (struct fnamedic_item *)malloc(sizeof(struct fnamedic_item));
727 
728     fname_new->filename_len = strlen(file->filename);
729     fname_new->filename = (char *)malloc(fname_new->filename_len + 1);
730     memcpy(fname_new->filename, file->filename, fname_new->filename_len);
731     fname_new->filename[fname_new->filename_len] = 0;
732 
733     // calculate hash value
734     fname_new->hash = chksum((void *)fname_new->filename,
735                              fname_new->filename_len);
736     fname_new->curfile = file;
737     atomic_init_uint64_t(&fname_new->nvictim, 0);
738     atomic_init_uint64_t(&fname_new->nitems, 0);
739     atomic_init_uint32_t(&fname_new->ref_count, 0);
740     atomic_init_uint64_t(&fname_new->access_timestamp, 0);
741     if (file->config->num_bcache_shards) {
742         fname_new->num_shards = file->config->num_bcache_shards;
743     } else {
744         fname_new->num_shards = DEFAULT_NUM_BCACHE_PARTITIONS;
745     }
746     // For random eviction among shards
747     randomize();
748 
749     fname_new->shards = (bcache_shard *)
750         malloc(sizeof(struct bcache_shard) * fname_new->num_shards);
751     size_t i = 0;
752     for (; i < fname_new->num_shards; ++i) {
753         // initialize tree
754         avl_init(&fname_new->shards[i].tree, NULL);
755         avl_init(&fname_new->shards[i].tree_idx, NULL);
756         // initialize clean list
757         list_init(&fname_new->shards[i].cleanlist);
758         // initialize hash table
759         hash_init(&fname_new->shards[i].hashtable, BCACHE_NBUCKET,
760                   _bcache_hash, _bcache_cmp);
761         spin_init(&fname_new->shards[i].lock);
762     }
763 
764     // insert into fname dictionary
765     hash_insert(&fnamedic, &fname_new->hash_elem);
766     file->bcache = fname_new;
767 
768     rw_spin_write_lock(&filelist_lock);
769     if (num_files == file_array_capacity) {
770         file_array_capacity *= 2;
        file_list = (struct fnamedic_item **)
            realloc(file_list, file_array_capacity * sizeof(struct fnamedic_item *));
772     }
773     file_list[num_files++] = fname_new;
774     rw_spin_write_unlock(&filelist_lock);
775 
776     return fname_new;
777 }
778 
static bool _fname_try_free(struct fnamedic_item *fname)
780 {
781     bool ret = true;
782 
783     rw_spin_write_lock(&filelist_lock);
784     // Remove from the file list array
785     bool found = false;
786     for (size_t i = 0; i < num_files; ++i) {
787         if (file_list[i] == fname) {
788             found = true;
789         }
790         if (found && (i+1 < num_files)) {
791             file_list[i] = file_list[i+1];
792         }
793     }
794     fdb_assert(found, num_files, fname->ref_count.val);
795     file_list[num_files - 1] = NULL;
796     --num_files;
797     if (atomic_get_uint32_t(&fname->ref_count) != 0) {
798         // This item is a victim by another thread's _bcache_evict()
799         list_push_front(&file_zombies, &fname->le);
800         ret = false; // Delay deletion
801     }
802 
803     rw_spin_write_unlock(&filelist_lock);
804     return ret;
805 }
806 
static void _fname_free(struct fnamedic_item *fname)
808 {
809     // file must be empty
810     fdb_assert(_file_empty(fname), false, true);
811     uint32_t ref_count = atomic_get_uint32_t(&fname->ref_count);
812     fdb_assert(ref_count == 0, 0, ref_count);
813 
814     // free hash
815     size_t i = 0;
816     for (; i < fname->num_shards; ++i) {
817         hash_free(&fname->shards[i].hashtable);
818         spin_destroy(&fname->shards[i].lock);
819     }
820 
821     free(fname->shards);
822     free(fname->filename);
823     atomic_destroy_uint32_t(&fname->ref_count);
824     atomic_destroy_uint64_t(&fname->nvictim);
825     atomic_destroy_uint64_t(&fname->nitems);
826     atomic_destroy_uint64_t(&fname->access_timestamp);
827     free(fname);
828 }
829 
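// Eviction priority hint: b-tree (index) blocks get score 1 so that the
// second-chance policy in _bcache_evict() keeps them cached slightly longer
// than document blocks (score 0). The score is only assigned when __CRC32 is
// defined; otherwise the function body is compiled out.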
INLINE void _bcache_set_score(struct bcache_item *item)
831 {
832 #ifdef __CRC32
833     uint8_t marker;
834 
835     // set PTR and get block MARKER
836     marker = *((uint8_t*)(item->addr) + bcache_blocksize-1);
837     if (marker == BLK_MARKER_BNODE ) {
838         // b-tree node .. set item's score to 1
839         item->score = 1;
840     } else {
841         item->score = 0;
842     }
843 #endif
844 }
845 
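// Minimal usage sketch of the read path (an illustration only; it assumes
// BCACHE_REQ_CLEAN is the clean counterpart of the BCACHE_REQ_DIRTY request
// type used in bcache_write() below):
//
//   uint8_t blockbuf[4096];                      // bcache_blocksize bytes
//   if (bcache_read(file, bid, blockbuf) == 0) { // 0 => cache miss
//       // read the block from disk, then optionally populate the cache:
//       // bcache_write(file, bid, blockbuf, BCACHE_REQ_CLEAN);
//   }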
int bcache_read(struct filemgr *file, bid_t bid, void *buf)
847 {
848     struct hash_elem *h;
849     struct bcache_item *item;
850     struct bcache_item query;
851     struct fnamedic_item *fname;
852 
853     // Note that we don't need to grab bcache_lock here as the block cache
    // is already created and bound when the file is created or opened for
855     // the first time.
856     fname = file->bcache;
857 
858     if (fname) {
859         // file exists
860         // set query
861         query.bid = bid;
862         // Update the access timestamp.
863         struct timeval tp;
864         gettimeofday(&tp, NULL); // TODO: Need to implement a better way of
865                                  // getting the timestamp to avoid the overhead of
866                                  // gettimeofday()
867         atomic_store_uint64_t(&fname->access_timestamp,
868                               (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec));
869 
870         size_t shard_num = bid % fname->num_shards;
871         spin_lock(&fname->shards[shard_num].lock);
872 
873         // search shard hash table
874         h = hash_find(&fname->shards[shard_num].hashtable, &query.hash_elem);
875         if (h) {
876             // cache hit
877             item = _get_entry(h, struct bcache_item, hash_elem);
878             fdb_assert(!(item->flag & BCACHE_FREE), item->flag, file);
879 
880             // move the item to the head of list if the block is clean
881             // (don't care if the block is dirty)
882             if (!(item->flag & BCACHE_DIRTY)) {
                // TODO: Scanning the list would cause some overhead. We need to devise
                // a better data structure that provides a fast lookup for the clean list.
885                 list_remove(&fname->shards[shard_num].cleanlist, &item->list_elem);
886                 list_push_front(&fname->shards[shard_num].cleanlist, &item->list_elem);
887             }
888 
889             memcpy(buf, item->addr, bcache_blocksize);
890             _bcache_set_score(item);
891 
892             spin_unlock(&fname->shards[shard_num].lock);
893 
894             return bcache_blocksize;
895         } else {
896             // cache miss
897             spin_unlock(&fname->shards[shard_num].lock);
898         }
899     }
900 
901     // does not exist .. cache miss
902     return 0;
903 }
904 
void bcache_invalidate_block(struct filemgr *file, bid_t bid)
906 {
907     struct hash_elem *h;
908     struct bcache_item *item;
909     struct bcache_item query;
910     struct fnamedic_item *fname;
911 
912     // Note that we don't need to grab bcache_lock here as the block cache
    // is already created and bound when the file is created or opened for
914     // the first time.
915     fname = file->bcache;
916 
917     if (fname) {
918         // file exists
919         // set query
920         query.bid = bid;
921         // Update the access timestamp.
922         struct timeval tp;
923         gettimeofday(&tp, NULL);
924         atomic_store_uint64_t(&fname->access_timestamp,
925                               (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec));
926 
927         size_t shard_num = bid % fname->num_shards;
928         spin_lock(&fname->shards[shard_num].lock);
929 
930         // search BHASH
931         h = hash_find(&fname->shards[shard_num].hashtable, &query.hash_elem);
932         if (h) {
933             // cache hit
934             item = _get_entry(h, struct bcache_item, hash_elem);
935             fdb_assert(!(item->flag & BCACHE_FREE), item->flag, BCACHE_FREE);
936 
937             if (!(item->flag & BCACHE_DIRTY)) {
938                 atomic_decr_uint64_t(&fname->nitems);
939                 // only for clean blocks
940                 // remove from hash and insert into freelist
941                 hash_remove(&fname->shards[shard_num].hashtable, &item->hash_elem);
942                 // remove from clean list
943                 list_remove(&fname->shards[shard_num].cleanlist, &item->list_elem);
944                 spin_unlock(&fname->shards[shard_num].lock);
945 
946                 // add to freelist
947                 _bcache_release_freeblock(item);
948             } else {
949                 spin_unlock(&fname->shards[shard_num].lock);
950             }
951         } else {
952             // cache miss
953             spin_unlock(&fname->shards[shard_num].lock);
954         }
955     }
956 }
957 
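// Write path: on a cache miss a block is taken from the global free list; if
// the free list is exhausted, _bcache_evict() reclaims clean blocks from a
// victim file (flushing dirty blocks first when necessary) and the allocation
// is retried.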
int bcache_write(struct filemgr *file,
959                  bid_t bid,
960                  void *buf,
961                  bcache_dirty_t dirty)
962 {
963     struct hash_elem *h = NULL;
964     struct bcache_item *item;
965     struct bcache_item query;
966     struct fnamedic_item *fname_new;
967 
968     fname_new = file->bcache;
969     if (fname_new == NULL) {
970         spin_lock(&bcache_lock);
971         fname_new = file->bcache;
972         if (fname_new == NULL) {
973             // filename doesn't exist in filename dictionary .. create
974             fname_new = _fname_create(file);
975         }
976         spin_unlock(&bcache_lock);
977     }
978 
979     // Update the access timestamp.
980     struct timeval tp;
981     gettimeofday(&tp, NULL);
982     atomic_store_uint64_t(&fname_new->access_timestamp,
983                           (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec));
984 
985     size_t shard_num = bid % fname_new->num_shards;
986     // set query
987     query.bid = bid;
988 
989     spin_lock(&fname_new->shards[shard_num].lock);
990 
991     // search hash table
992     h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
993     if (h == NULL) {
994         // cache miss
995         // get a free block
996         while ((item = _bcache_alloc_freeblock()) == NULL) {
997             // no free block .. perform eviction
998             spin_unlock(&fname_new->shards[shard_num].lock);
999 
1000             _bcache_evict(fname_new);
1001 
1002             spin_lock(&fname_new->shards[shard_num].lock);
1003         }
1004 
1005         // re-search hash table
1006         h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
1007         if (h == NULL) {
1008             // insert into hash table
1009             item->bid = bid;
1010             item->flag = BCACHE_FREE;
1011             hash_insert(&fname_new->shards[shard_num].hashtable, &item->hash_elem);
1012             h = &item->hash_elem;
1013         } else {
1014             // insert into freelist again
1015             _bcache_release_freeblock(item);
1016             item = _get_entry(h, struct bcache_item, hash_elem);
1017         }
1018     } else {
1019         item = _get_entry(h, struct bcache_item, hash_elem);
1020     }
1021 
1022     fdb_assert(h, h, NULL);
1023 
1024     if (item->flag & BCACHE_FREE) {
1025         atomic_incr_uint64_t(&fname_new->nitems);
1026     }
1027 
1028     // remove from the list if the block is in clean list
1029     if (!(item->flag & BCACHE_DIRTY) && !(item->flag & BCACHE_FREE)) {
1030         list_remove(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1031     }
1032     item->flag &= ~BCACHE_FREE;
1033 
1034     if (dirty == BCACHE_REQ_DIRTY) {
1035         // DIRTY request
1036         // to avoid re-insert already existing item into tree
1037         if (!(item->flag & BCACHE_DIRTY)) {
1038             // dirty block
1039             // insert into tree
1040             struct dirty_item *ditem;
1041             uint8_t marker;
1042 
1043             ditem = (struct dirty_item *)
1044                     mempool_alloc(sizeof(struct dirty_item));
1045             ditem->item = item;
1046 
1047             marker = *((uint8_t*)buf + bcache_blocksize-1);
1048             if (marker == BLK_MARKER_BNODE ) {
1049                 // b-tree node
1050                 avl_insert(&fname_new->shards[shard_num].tree_idx, &ditem->avl, _dirty_cmp);
1051             } else {
1052                 avl_insert(&fname_new->shards[shard_num].tree, &ditem->avl, _dirty_cmp);
1053             }
1054         }
1055         item->flag |= BCACHE_DIRTY;
1056     } else {
1057         // CLEAN request
1058         // insert into clean list only when it was originally clean
1059         if (!(item->flag & BCACHE_DIRTY)) {
1060             list_push_front(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1061             item->flag &= ~(BCACHE_DIRTY);
1062         }
1063     }
1064 
1065     memcpy(item->addr, buf, bcache_blocksize);
1066     _bcache_set_score(item);
1067 
1068     spin_unlock(&fname_new->shards[shard_num].lock);
1069 
1070     return bcache_blocksize;
1071 }
1072 
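// Partial writes only patch blocks that are already cached: on a cache miss
// the function returns 0 and the caller must fall back to a full block write.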
int bcache_write_partial(struct filemgr *file,
1074                          bid_t bid,
1075                          void *buf,
1076                          size_t offset,
1077                          size_t len)
1078 {
1079     struct hash_elem *h;
1080     struct bcache_item *item;
1081     struct bcache_item query;
1082     struct fnamedic_item *fname_new;
1083 
1084     fname_new = file->bcache;
1085     if (fname_new == NULL) {
1086         spin_lock(&bcache_lock);
1087         fname_new = file->bcache;
1088         if (fname_new == NULL) {
1089             // filename doesn't exist in filename dictionary .. create
1090             fname_new = _fname_create(file);
1091         }
1092         spin_unlock(&bcache_lock);
1093     }
1094 
1095     // Update the access timestamp.
1096     struct timeval tp;
1097     gettimeofday(&tp, NULL);
1098     atomic_store_uint64_t(&fname_new->access_timestamp,
1099                           (uint64_t) (tp.tv_sec * 1000000 + tp.tv_usec));
1100 
1101     size_t shard_num = bid % fname_new->num_shards;
1102     // set query
1103     query.bid = bid;
1104 
1105     spin_lock(&fname_new->shards[shard_num].lock);
1106 
1107     // search hash table
1108     h = hash_find(&fname_new->shards[shard_num].hashtable, &query.hash_elem);
1109     if (h == NULL) {
1110         // cache miss .. partial write fail .. return 0
1111         spin_unlock(&fname_new->shards[shard_num].lock);
1112         return 0;
1113 
1114     } else {
1115         // cache hit .. get the block
1116         item = _get_entry(h, struct bcache_item, hash_elem);
1117     }
1118 
1119     fdb_assert(!(item->flag & BCACHE_FREE), item->flag, BCACHE_FREE);
1120 
1121     // check whether this is dirty block
1122     // to avoid re-insert already existing item into tree
1123     if (!(item->flag & BCACHE_DIRTY)) {
1124         // this block was clean block
1125         uint8_t marker;
1126         struct dirty_item *ditem;
1127 
1128         // remove from clean list
1129         list_remove(&fname_new->shards[shard_num].cleanlist, &item->list_elem);
1130 
1131         ditem = (struct dirty_item *)mempool_alloc(sizeof(struct dirty_item));
1132         ditem->item = item;
1133 
1134         // insert into tree
1135         marker = *((uint8_t*)item->addr + bcache_blocksize-1);
1136         if (marker == BLK_MARKER_BNODE ) {
1137             // b-tree node
1138             avl_insert(&fname_new->shards[shard_num].tree_idx, &ditem->avl, _dirty_cmp);
1139         } else {
1140             avl_insert(&fname_new->shards[shard_num].tree, &ditem->avl, _dirty_cmp);
1141         }
1142     }
1143 
1144     // always set this block as dirty
1145     item->flag |= BCACHE_DIRTY;
1146 
1147     memcpy((uint8_t *)(item->addr) + offset, buf, len);
1148     _bcache_set_score(item);
1149 
1150     spin_unlock(&fname_new->shards[shard_num].lock);
1151 
1152     return len;
1153 }
1154 
1155 // remove all dirty blocks of the FILE
1156 // (they are only discarded and not written back)
void bcache_remove_dirty_blocks(struct filemgr *file)
1158 {
1159     struct fnamedic_item *fname_item;
1160 
1161     fname_item = file->bcache;
1162 
1163     if (fname_item) {
1164         // Note that this function is only invoked as part of database file close or
1165         // removal when there are no database handles for a given file. Therefore,
1166         // we don't need to grab all the shard locks at once.
1167 
1168         // remove all dirty blocks
1169         _flush_dirty_blocks(fname_item, false, true);
1170     }
1171 }
1172 
1173 // remove all clean blocks of the FILE
void bcache_remove_clean_blocks(struct filemgr *file)
1175 {
1176     struct list_elem *e;
1177     struct bcache_item *item;
1178     struct fnamedic_item *fname_item;
1179 
1180     fname_item = file->bcache;
1181 
1182     if (fname_item) {
1183         // Note that this function is only invoked as part of database file close or
1184         // removal when there are no database handles for a given file. Therefore,
1185         // we don't need to grab all the shard locks at once.
1186 
1187         // remove all clean blocks from each shard in a file.
1188         size_t i = 0;
1189         for (; i < fname_item->num_shards; ++i) {
1190             spin_lock(&fname_item->shards[i].lock);
1191             e = list_begin(&fname_item->shards[i].cleanlist);
1192             while(e){
1193                 item = _get_entry(e, struct bcache_item, list_elem);
1194                 // remove from clean list
1195                 e = list_remove(&fname_item->shards[i].cleanlist, e);
1196                 // remove from hash table
1197                 hash_remove(&fname_item->shards[i].hashtable, &item->hash_elem);
1198                 // insert into free list
1199                 _bcache_release_freeblock(item);
1200             }
1201             spin_unlock(&fname_item->shards[i].lock);
1202         }
1203     }
1204 }
1205 
1206 // remove file from filename dictionary
// MUST make sure that no dirty blocks belong to this FILE
1208 // (or memory leak occurs)
bool bcache_remove_file(struct filemgr *file)
1210 {
1211     bool rv = false;
1212     struct fnamedic_item *fname_item;
1213 
1214     // Before proceeding with deletion, garbage collect zombie files
1215     _garbage_collect_zombie_fnames();
1216     fname_item = file->bcache;
1217 
1218     if (fname_item) {
1219         // acquire lock
1220         spin_lock(&bcache_lock);
1221         // file must be empty
1222         fdb_assert(_file_empty(fname_item), fname_item, NULL);
1223 
1224         // remove from fname dictionary hash table
1225         hash_remove(&fnamedic, &fname_item->hash_elem);
1226         spin_unlock(&bcache_lock);
1227 
1228         // We don't need to grab the file buffer cache's partition locks
1229         // at once because this function is only invoked when there are
1230         // no database handles that access the file.
1231         if (_fname_try_free(fname_item)) {
1232             _fname_free(fname_item); // no other callers accessing this file
1233             rv = true;
1234         } // else fnamedic_item is in use by _bcache_evict. Deletion delayed
1235     }
1236     return rv;
1237 }
1238 
1239 // flush and synchronize all dirty blocks of the FILE
1240 // dirty blocks will be changed to clean blocks (not discarded)
fdb_status bcache_flush(struct filemgr *file)
1242 {
1243     struct fnamedic_item *fname_item;
1244     fdb_status status = FDB_RESULT_SUCCESS;
1245 
1246     fname_item = file->bcache;
1247 
1248     if (fname_item) {
1249         // Note that this function is invoked as part of a commit operation while
1250         // the filemgr's lock is already grabbed by a committer.
1251         // Therefore, we don't need to grab all the shard locks at once.
1252         status = _flush_dirty_blocks(fname_item, true, true);
1253     }
1254     return status;
1255 }
1256 
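// Initialization: pre-allocates 'nblock' buffers of 'blocksize' bytes each and
// pushes them onto the global free list; per-file shard structures are created
// lazily on a file's first cached write.
//
// Illustrative setup/teardown sequence (the cache and block sizes below are
// example values only):
//
//   bcache_init(/*nblock*/ 16384, /*blocksize*/ 4096); // ~64 MB of buffers
//   ...                                                // bcache_read/_write
//   bcache_shutdown();                                 // frees all buffers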
void bcache_init(int nblock, int blocksize)
1258 {
1259     int i;
1260     struct bcache_item *item;
1261     struct list_elem *e;
1262 
1263     list_init(&freelist);
1264     list_init(&file_zombies);
1265 
1266     hash_init(&fnamedic, BCACHE_NDICBUCKET, _fname_hash, _fname_cmp);
1267 
1268     bcache_blocksize = blocksize;
1269     bcache_flush_unit = BCACHE_FLUSH_UNIT;
1270     bcache_nblock = nblock;
1271     spin_init(&bcache_lock);
1272     spin_init(&freelist_lock);
1273     rw_spin_init(&filelist_lock);
1274     freelist_count = 0;
1275 
1276     num_files = 0;
1277     file_array_capacity = 4096; // Initial capacity of file list array.
1278     file_list = (fnamedic_item **) calloc(file_array_capacity, sizeof(fnamedic_item *));
1279 
1280     for (i=0;i<nblock;++i){
1281         item = (struct bcache_item *)malloc(sizeof(struct bcache_item));
1282 
1283         item->bid = BLK_NOT_FOUND;
1284         item->flag = 0x0 | BCACHE_FREE;
1285         item->score = 0;
1286 
1287         list_push_front(&freelist, &item->list_elem);
1288         freelist_count++;
1289     }
1290     e = list_begin(&freelist);
1291     while(e){
1292         item = _get_entry(e, struct bcache_item, list_elem);
1293         item->addr = (void *)malloc(bcache_blocksize);
1294         e = list_next(e);
1295     }
1296 
1297 }
1298 
uint64_t bcache_get_num_free_blocks()
1300 {
1301     return freelist_count;
1302 }
1303 
1304 // LCOV_EXCL_START
void bcache_print_items()
1306 {
1307     size_t n=1;
1308     size_t nfiles, nitems, nfileitems, nclean, ndirty;
1309     size_t scores[100], i, scores_local[100];
1310     size_t docs, bnodes;
1311     size_t docs_local, bnodes_local;
1312     uint8_t *ptr;
1313 
1314     nfiles = nitems = nfileitems = nclean = ndirty = 0;
1315     docs = bnodes = 0;
1316     memset(scores, 0, sizeof(size_t)*100);
1317 
1318     struct fnamedic_item *fname;
1319     struct bcache_item *item;
1320     struct dirty_item *dirty;
1321     struct list_elem *ee;
1322     struct avl_node *a;
1323 
1324     printf(" === Block cache statistics summary ===\n");
1325     printf("%3s %20s (%6s)(%6s)(c%6s d%6s)",
1326         "No", "Filename", "#Pages", "#Evict", "Clean", "Dirty");
1327 #ifdef __CRC32
1328     printf("%6s%6s", "Doc", "Node");
1329 #endif
1330     for (i=0;i<=n;++i) {
1331         printf("   [%d] ", (int)i);
1332     }
1333     printf("\n");
1334 
1335     for (size_t idx = 0; idx < num_files; ++idx) {
1336         fname = file_list[idx];
1337         memset(scores_local, 0, sizeof(size_t)*100);
1338         nfileitems = nclean = ndirty = 0;
1339         docs_local = bnodes_local = 0;
1340 
1341         size_t i = 0;
1342         for (; i < fname->num_shards; ++i) {
1343             ee = list_begin(&fname->shards[i].cleanlist);
1344             a = avl_first(&fname->shards[i].tree);
1345 
1346             while(ee){
1347                 item = _get_entry(ee, struct bcache_item, list_elem);
1348                 scores[item->score]++;
1349                 scores_local[item->score]++;
1350                 nitems++;
1351                 nfileitems++;
1352                 nclean++;
1353 #ifdef __CRC32
1354                 ptr = (uint8_t*)item->addr + bcache_blocksize - 1;
1355                 switch (*ptr) {
1356                 case BLK_MARKER_BNODE:
1357                     bnodes_local++;
1358                     break;
1359                 case BLK_MARKER_DOC:
1360                     docs_local++;
1361                     break;
1362                 }
1363 #endif
1364                 ee = list_next(ee);
1365             }
1366             while(a){
1367                 dirty = _get_entry(a, struct dirty_item, avl);
1368                 item = dirty->item;
1369                 scores[item->score]++;
1370                 scores_local[item->score]++;
1371                 nitems++;
1372                 nfileitems++;
1373                 ndirty++;
1374 #ifdef __CRC32
1375                 ptr = (uint8_t*)item->addr + bcache_blocksize - 1;
1376                 switch (*ptr) {
1377                 case BLK_MARKER_BNODE:
1378                     bnodes_local++;
1379                     break;
1380                 case BLK_MARKER_DOC:
1381                     docs_local++;
1382                     break;
1383                 }
1384 #endif
1385                 a = avl_next(a);
1386             }
1387         }
1388 
1389         printf("%3d %20s (%6d)(%6d)(c%6d d%6d)",
1390                (int)nfiles+1, fname->filename,
1391                (int)atomic_get_uint64_t(&fname->nitems),
1392                (int)atomic_get_uint64_t(&fname->nvictim),
1393                (int)nclean, (int)ndirty);
1394         printf("%6d%6d", (int)docs_local, (int)bnodes_local);
1395         for (i=0;i<=n;++i){
1396             printf("%6d ", (int)scores_local[i]);
1397         }
1398         printf("\n");
1399 
1400         docs += docs_local;
1401         bnodes += bnodes_local;
1402 
1403         nfiles++;
1404     }
1405     printf(" ===\n");
1406 
1407     printf("%d files %d items\n", (int)nfiles, (int)nitems);
1408     for (i=0;i<=n;++i){
1409         printf("[%d]: %d\n", (int)i, (int)scores[i]);
1410     }
1411     printf("Documents: %d blocks\n", (int)docs);
1412     printf("Index nodes: %d blocks\n", (int)bnodes);
1413 }
1414 // LCOV_EXCL_STOP
1415 
1416 // LCOV_EXCL_START
INLINE void _bcache_free_bcache_item(struct hash_elem *h)
1418 {
1419     struct bcache_item *item = _get_entry(h, struct bcache_item, hash_elem);
1420     free(item->addr);
1421     free(item);
1422 }
1423 // LCOV_EXCL_STOP
1424 
1425 // LCOV_EXCL_START
INLINE void _bcache_free_fnamedic(struct hash_elem *h)
1427 {
1428     size_t i = 0;
1429     struct fnamedic_item *item;
1430     item = _get_entry(h, struct fnamedic_item, hash_elem);
1431 
1432     for (; i < item->num_shards; ++i) {
1433         hash_free_active(&item->shards[i].hashtable, _bcache_free_bcache_item);
1434         spin_destroy(&item->shards[i].lock);
1435     }
1436 
1437     free(item->shards);
1438     free(item->filename);
1439 
1440     atomic_destroy_uint32_t(&item->ref_count);
1441     atomic_destroy_uint64_t(&item->nvictim);
1442     atomic_destroy_uint64_t(&item->nitems);
1443     atomic_destroy_uint64_t(&item->access_timestamp);
1444     free(item);
1445 }
1446 // LCOV_EXCL_STOP
1447 
void bcache_shutdown()
1449 {
1450     struct bcache_item *item;
1451     struct list_elem *e;
1452 
1453     e = list_begin(&freelist);
1454     while(e) {
1455         item = _get_entry(e, struct bcache_item, list_elem);
1456         e = list_remove(&freelist, e);
1457         freelist_count--;
1458         free(item->addr);
1459         free(item);
1460     }
1461 
1462     // Force clean zombies if any
1463     e = list_begin(&file_zombies);
1464     while (e) {
1465         struct fnamedic_item *fname = _get_entry(e, struct fnamedic_item, le);
1466         e = list_remove(&file_zombies, e);
1467         _fname_free(fname);
1468     }
1469     // Free the file list array
1470     free(file_list);
1471 
1472     spin_lock(&bcache_lock);
1473     hash_free_active(&fnamedic, _bcache_free_fnamedic);
1474     spin_unlock(&bcache_lock);
1475 
1476     spin_destroy(&bcache_lock);
1477     spin_destroy(&freelist_lock);
1478     rw_spin_destroy(&filelist_lock);
1479 }
1480 
1481