xref: /6.0.3/forestdb/src/btree.cc (revision 56236603)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 * Generic B+Tree
4 * (C) 2013  Jung-Sang Ahn <jungsang.ahn@gmail.com>
5 */
6
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10
11#include "btree.h"
12#include "common.h"
13
14#include "list.h"
15
16#ifdef __DEBUG
17    #include "memleak.h"
18#ifndef __DEBUG_BTREE
19    #undef DBG
20    #undef DBGCMD
21    #undef DBGSW
22    #define DBG(...)
23    #define DBGCMD(...)
24    #define DBGSW(n, ...)
25#endif
26#endif
27
28#define METASIZE_ALIGN_UNIT (16)
29#ifdef METASIZE_ALIGN_UNIT
30    #define _metasize_align(size) \
31        (((( (size + sizeof(metasize_t)) + (METASIZE_ALIGN_UNIT-1)) \
32            / METASIZE_ALIGN_UNIT) * METASIZE_ALIGN_UNIT) - sizeof(metasize_t))
33#else
34    #define _metasize_align(size) (size)
35#endif
36
37INLINE int is_subblock(bid_t subbid)
38{
39    uint8_t flag;
40    flag = (subbid >> (8 * (sizeof(bid_t)-2))) & 0x00ff;
41    return flag;
42}
43
44INLINE struct bnode *_fetch_bnode(struct btree *btree, void *addr, uint16_t level)
45{
46    struct bnode *node = NULL;
47
48    node = (struct bnode *)addr;
49
50    if (!(node->flag & BNODE_MASK_METADATA)) {
51        // no metadata
52        node->data = (uint8_t *)addr + sizeof(struct bnode);
53    } else {
54        // metadata
55        metasize_t metasize;
56        memcpy(&metasize, (uint8_t *)addr + sizeof(struct bnode), sizeof(metasize_t));
57        metasize = _endian_decode(metasize);
58        node->data = (uint8_t *)addr + sizeof(struct bnode) + sizeof(metasize_t) +
59                     _metasize_align(metasize);
60    }
61    return node;
62}
63
64#ifdef _BTREE_HAS_MULTIPLE_BNODES
65struct bnode ** btree_get_bnode_array(void *addr, size_t *nnode_out)
66{
67    // original b+tree always has only a single node per block
68    struct bnode **ret;
69    *nnode_out = 1;
70    ret = (struct bnode **)malloc(sizeof(struct bnode*) * (*nnode_out));
71    ret[0] = _fetch_bnode(NULL, addr, 0);
72
73    return ret;
74}
75#else
76struct bnode * btree_get_bnode(void *addr)
77{
78    return _fetch_bnode(NULL, addr, 0);
79}
80#endif
81
82INLINE int _bnode_size(
83    struct btree *btree, struct bnode *node, void *new_minkey, void *key_arr, void *value_arr, size_t len)
84{
85    int nodesize = 0;
86
87    if (node->flag & BNODE_MASK_METADATA) {
88        metasize_t size;
89        memcpy(&size, (uint8_t *)node + sizeof(struct bnode), sizeof(metasize_t));
90        size = _endian_decode(size);
91        nodesize =
92            sizeof(struct bnode) +
93            btree->kv_ops->get_data_size(node, new_minkey, key_arr, value_arr, len) +
94            _metasize_align(size) + sizeof(metasize_t);
95    }else{
96        nodesize =
97            sizeof(struct bnode) +
98            btree->kv_ops->get_data_size(node, new_minkey, key_arr, value_arr, len);
99    }
100
101    return nodesize;
102}
103
104struct kv_ins_item {
105    void *key;
106    void *value;
107    struct list_elem le;
108};
109
110INLINE struct kv_ins_item * _kv_ins_item_create(
111    struct btree *btree, void *key, void *value)
112{
113    struct kv_ins_item *item;
114    item = (struct kv_ins_item*)malloc(sizeof(struct kv_ins_item));
115    item->key = (void *)malloc(btree->ksize);
116    item->value = (void *)malloc(btree->vsize);
117
118    if (btree->kv_ops->init_kv_var) {
119        btree->kv_ops->init_kv_var(btree, item->key, item->value);
120    }
121    if (key) {
122        btree->kv_ops->set_key(btree, item->key, key);
123    }
124    if (value) {
125        btree->kv_ops->set_value(btree, item->value, value);
126    }
127    return item;
128}
129
130INLINE void _kv_ins_item_free(struct kv_ins_item *item){
131    free(item->key);
132    free(item->value);
133    free(item);
134}
135
136INLINE int _bnode_size_check(
137    struct btree *btree, bid_t bid, struct bnode *node,
138    void *new_minkey, struct list *kv_ins_list, size_t *size_out)
139{
140    size_t nitem;
141    size_t cursize;
142    size_t nodesize;
143    struct list_elem *e;
144    struct kv_ins_item *item;
145
146    nodesize = btree->blk_ops->blk_get_size(btree->blk_handle, bid);
147#ifdef __CRC32
148    nodesize -= BLK_MARKER_SIZE;
149#endif
150
151    nitem = 0;
152    if (kv_ins_list) {
153        e = list_begin(kv_ins_list);
154        while(e){
155            nitem++;
156            e = list_next(e);
157        }
158    }
159
160    if (nitem > 1) {
161        int i;
162        void *key_arr, *value_arr;
163
164        key_arr = (void*)malloc(btree->ksize * nitem);
165        value_arr = (void*)malloc(btree->vsize * nitem);
166
167        i = 0;
168        e = list_begin(kv_ins_list);
169        while(e){
170            item = _get_entry(e, struct kv_ins_item, le);
171            memcpy((uint8_t *)key_arr + btree->ksize * i, item->key, btree->ksize);
172            memcpy((uint8_t *)value_arr + btree->ksize * i, item->value, btree->ksize);
173            i++;
174            e = list_next(e);
175        }
176        cursize = _bnode_size(btree, node, new_minkey, key_arr, value_arr, nitem);
177
178        free(key_arr);
179        free(value_arr);
180    }else if (nitem == 1) {
181        e = list_begin(kv_ins_list);
182        item = _get_entry(e, struct kv_ins_item, le);
183        cursize = _bnode_size(btree, node, new_minkey, item->key, item->value, 1);
184    } else {
185        /* nitem should never be negative due to size_t */
186        fdb_assert(nitem == 0, nitem, btree);
187        cursize = _bnode_size(btree, node, new_minkey, NULL, NULL, 0);
188    }
189
190    *size_out = cursize;
191    return ( cursize <= nodesize );
192}
193
194INLINE struct bnode * _btree_init_node(
195    struct btree *btree, bid_t bid, void *addr, bnode_flag_t flag, uint16_t level, struct btree_meta *meta)
196{
197    struct bnode *node;
198    void *node_addr;
199    metasize_t _size;
200
201    node_addr = addr;
202
203    node = (struct bnode *)node_addr;
204    node->kvsize = btree->ksize<<8 | btree->vsize;
205    node->nentry = 0;
206    node->level = level;
207    node->flag = flag;
208
209    if ((flag & BNODE_MASK_METADATA) && meta) {
210        _size = _endian_encode(meta->size);
211        memcpy((uint8_t *)node_addr + sizeof(struct bnode), &_size, sizeof(metasize_t));
212        memcpy((uint8_t *)node_addr + sizeof(struct bnode) + sizeof(metasize_t),
213               meta->data, meta->size);
214        node->data = (uint8_t *)node_addr + sizeof(struct bnode) + sizeof(metasize_t) +
215                     _metasize_align(meta->size);
216    }else{
217        node->data = (uint8_t *)node_addr + sizeof(struct bnode);
218    }
219
220    return node;
221}
222
223INLINE size_t _btree_get_nsplitnode(struct btree *btree, bid_t bid, struct bnode *node, size_t size)
224{
225    size_t headersize, dataspace;
226    size_t nodesize;
227    size_t nnode = 0;
228
229    nodesize = btree->blk_ops->blk_get_size(btree->blk_handle, bid);
230#ifdef __CRC32
231    nodesize -= BLK_MARKER_SIZE;
232#endif
233
234    if (node->flag & BNODE_MASK_METADATA) {
235        metasize_t size;
236        memcpy(&size, (uint8_t *)node + sizeof(struct bnode), sizeof(metasize_t));
237        size = _endian_decode(size);
238        headersize = sizeof(struct bnode) + _metasize_align(size) + sizeof(metasize_t);
239    }else{
240        headersize = sizeof(struct bnode);
241    }
242
243    dataspace = nodesize - headersize;
244    // round up
245    nnode = ((size - headersize) + (dataspace-1)) / dataspace;
246
247    return nnode;
248}
249
250metasize_t btree_read_meta(struct btree *btree, void *buf)
251{
252    void *addr;
253    void *ptr;
254    metasize_t size;
255    struct bnode *node;
256
257    addr = btree->blk_ops->blk_read(btree->blk_handle, btree->root_bid);
258    node = _fetch_bnode(btree, addr, btree->height);
259    if (node->flag & BNODE_MASK_METADATA) {
260        ptr = ((uint8_t *)node) + sizeof(struct bnode);
261        memcpy(&size, (uint8_t *)ptr, sizeof(metasize_t));
262        size = _endian_decode(size);
263        memcpy(buf, (uint8_t *)ptr + sizeof(metasize_t), size);
264    } else {
265        size = 0;
266    }
267
268    return size;
269}
270
271void btree_update_meta(struct btree *btree, struct btree_meta *meta)
272{
273    void *addr;
274    void *ptr;
275    metasize_t metasize, _metasize;
276    metasize_t old_metasize = (metasize_t)(-1);
277    struct bnode *node;
278
279    // read root node
280    addr = btree->blk_ops->blk_read(btree->blk_handle, btree->root_bid);
281    node = _fetch_bnode(btree, addr, btree->height);
282
283    ptr = ((uint8_t *)node) + sizeof(struct bnode);
284
285    if (node->flag & BNODE_MASK_METADATA) {
286        memcpy(&old_metasize, ptr, sizeof(metasize_t));
287        old_metasize = _endian_decode(old_metasize);
288    }
289
290    if (meta) {
291        metasize = meta->size;
292
293        // new meta size cannot be larger than old meta size
294        fdb_assert(metasize <= old_metasize, metasize, old_metasize);
295        (void)metasize;
296
297        // overwrite
298        if (meta->size > 0) {
299            _metasize = _endian_encode(metasize);
300            memcpy(ptr, &_metasize, sizeof(metasize_t));
301            memcpy((uint8_t *)ptr + sizeof(metasize_t), meta->data, metasize);
302            node->flag |= BNODE_MASK_METADATA;
303        }else{
304            // clear the flag
305            node->flag &= ~BNODE_MASK_METADATA;
306        }
307        // move kv-pairs (only if meta size is changed)
308        if (_metasize_align(metasize) < _metasize_align(old_metasize)){
309            memmove(
310                (uint8_t *)ptr + sizeof(metasize_t) + _metasize_align(metasize),
311                node->data,
312                btree->kv_ops->get_data_size(node, NULL, NULL, NULL, 0));
313            node->data = (uint8_t *)node->data - (_metasize_align(old_metasize) -
314                         _metasize_align(metasize));
315        }
316
317    }else {
318        if (node->flag & BNODE_MASK_METADATA) {
319            // existing metadata is removed
320            memmove(ptr, node->data, btree->kv_ops->get_data_size(node,
321                                                    NULL, NULL, NULL, 0));
322            node->data = (uint8_t *)node->data - (_metasize_align(old_metasize) +
323                         sizeof(metasize_t));
324            // clear the flag
325            node->flag &= ~BNODE_MASK_METADATA;
326        }
327    }
328
329    if (!btree->blk_ops->blk_is_writable(btree->blk_handle, btree->root_bid)) {
330        // already flushed block -> cannot overwrite, we have to move to new block
331        btree->blk_ops->blk_move(btree->blk_handle, btree->root_bid,
332                                &btree->root_bid);
333    }else{
334        btree->blk_ops->blk_set_dirty(btree->blk_handle, btree->root_bid);
335    }
336}
337
338btree_result btree_init_from_bid(struct btree *btree, void *blk_handle,
339                                 struct btree_blk_ops *blk_ops,
340                                 struct btree_kv_ops *kv_ops,
341                                 uint32_t nodesize, bid_t root_bid)
342{
343    void *addr;
344    struct bnode *root;
345
346    btree->blk_ops = blk_ops;
347    btree->blk_handle = blk_handle;
348    btree->kv_ops = kv_ops;
349    btree->blksize = nodesize;
350    btree->root_bid = root_bid;
351
352    addr = btree->blk_ops->blk_read(btree->blk_handle, btree->root_bid);
353    if (!addr)
354        return BTREE_RESULT_FAIL;
355    root = _fetch_bnode(btree, addr, 0);
356
357    btree->root_flag = root->flag;
358    btree->height = root->level;
359    _get_kvsize(root->kvsize, btree->ksize, btree->vsize);
360
361    return BTREE_RESULT_SUCCESS;
362}
363
364btree_result btree_init(
365        struct btree *btree, void *blk_handle,
366        struct btree_blk_ops *blk_ops,     struct btree_kv_ops *kv_ops,
367        uint32_t nodesize, uint8_t ksize, uint8_t vsize,
368        bnode_flag_t flag, struct btree_meta *meta)
369{
370    void *addr;
371    size_t min_nodesize = 0;
372
373    btree->root_flag = BNODE_MASK_ROOT | flag;
374    btree->blk_ops = blk_ops;
375    btree->blk_handle = blk_handle;
376    btree->kv_ops = kv_ops;
377    btree->height = 1;
378    btree->blksize = nodesize;
379    btree->ksize = ksize;
380    btree->vsize = vsize;
381    if (meta) {
382        btree->root_flag |= BNODE_MASK_METADATA;
383        min_nodesize = sizeof(struct bnode) + _metasize_align(meta->size) +
384                       sizeof(metasize_t) + BLK_MARKER_SIZE;
385    } else {
386        min_nodesize = sizeof(struct bnode) + BLK_MARKER_SIZE;
387    }
388
389    if (min_nodesize > btree->blksize) {
390        // too large metadata .. init fail
391        return BTREE_RESULT_FAIL;
392    }
393
394    // create the first root node
395    if (btree->blk_ops->blk_alloc_sub && btree->blk_ops->blk_enlarge_node) {
396        addr = btree->blk_ops->blk_alloc_sub(btree->blk_handle, &btree->root_bid);
397        if (meta) {
398            // check if the initial node size including metadata is
399            // larger than the subblock size
400            size_t subblock_size;
401            subblock_size = btree->blk_ops->blk_get_size(btree->blk_handle,
402                                                         btree->root_bid);
403            if (subblock_size < min_nodesize) {
404                addr = btree->blk_ops->blk_enlarge_node(btree->blk_handle,
405                                                        btree->root_bid,
406                                                        min_nodesize,
407                                                        &btree->root_bid);
408            }
409        }
410    } else {
411        addr = btree->blk_ops->blk_alloc (btree->blk_handle, &btree->root_bid);
412    }
413    _btree_init_node(btree, btree->root_bid, addr,
414                     btree->root_flag, BNODE_MASK_ROOT, meta);
415
416    return BTREE_RESULT_SUCCESS;
417}
418
419/*
420return index# of largest key equal or smaller than KEY
421example)
422node: [2 4 6 8]
423key: 5
424largest key equal or smaller than KEY: 4
425return: 1 (index# of the key '4')
426*/
427static idx_t _btree_find_entry(struct btree *btree, struct bnode *node, void *key)
428{
429    idx_t start, end, middle, temp;
430    uint8_t *k = alca(uint8_t, btree->ksize);
431    int cmp;
432    #ifdef __BIT_CMP
433        idx_t *_map1[3] = {&end, &start, &start};
434        idx_t *_map2[3] = {&temp, &end, &temp};
435    #endif
436
437    if (btree->kv_ops->init_kv_var) btree->kv_ops->init_kv_var(btree, k, NULL);
438
439    start = middle = 0;
440    end = node->nentry;
441
442    if (end > 0) {
443        // compare with smallest key
444        btree->kv_ops->get_kv(node, 0, k, NULL);
445        // smaller than smallest key
446        if (btree->kv_ops->cmp(key, k, btree->aux) < 0) {
447            if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
448            return BTREE_IDX_NOT_FOUND;
449        }
450
451        // compare with largest key
452        btree->kv_ops->get_kv(node, end-1, k, NULL);
453        // larger than largest key
454        if (btree->kv_ops->cmp(key, k, btree->aux) >= 0) {
455            if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
456            return end-1;
457        }
458
459        // binary search
460        while(start+1 < end) {
461            middle = (start + end) >> 1;
462
463            // get key at middle
464            btree->kv_ops->get_kv(node, middle, k, NULL);
465            cmp = btree->kv_ops->cmp(key, k, btree->aux);
466
467            #ifdef __BIT_CMP
468                cmp = _MAP(cmp) + 1;
469                *_map1[cmp] = middle;
470                *_map2[cmp] = 0;
471            #else
472                if (cmp < 0) end = middle;
473                else if (cmp > 0) start = middle;
474                else {
475                    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
476                    return middle;
477                }
478            #endif
479        }
480        if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
481        return start;
482    }
483
484    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
485    return BTREE_IDX_NOT_FOUND;
486}
487
488static idx_t _btree_add_entry(struct btree *btree, struct bnode *node, void *key, void *value)
489{
490    idx_t idx, idx_insert;
491    uint8_t *k = alca(uint8_t, btree->ksize);
492
493    if (btree->kv_ops->init_kv_var) btree->kv_ops->init_kv_var(btree, k, NULL);
494
495    if (node->nentry > 0) {
496        idx = _btree_find_entry(btree, node, key);
497
498        if (idx == BTREE_IDX_NOT_FOUND) idx_insert = 0;
499        else {
500            btree->kv_ops->get_kv(node, idx, k, NULL);
501            if (!btree->kv_ops->cmp(key, k, btree->aux)) {
502                // if same key already exists -> update its value
503                btree->kv_ops->set_kv(node, idx, key, value);
504                if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
505                return idx;
506            }else{
507                idx_insert = idx+1;
508            }
509        }
510
511        if (idx_insert < node->nentry) {
512
513            /*
514            shift [idx+1, nentry) key-value pairs to right
515            example)
516            idx = 1 (i.e. idx_insert = 2)
517            [2 4 6 8] -> [2 4 _ 6 8]
518            return 2
519            */
520            btree->kv_ops->ins_kv(node, idx_insert, key, value);
521        }else{
522            btree->kv_ops->set_kv(node, idx_insert, key, value);
523        }
524
525    }else{
526        idx_insert = 0;
527        // add at idx_insert
528        btree->kv_ops->set_kv(node, idx_insert, key, value);
529    }
530
531    // add at idx_insert
532    node->nentry++;
533
534    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, NULL);
535    return idx_insert;
536}
537
538static idx_t _btree_remove_entry(struct btree *btree, struct bnode *node, void *key)
539{
540    idx_t idx;
541
542    if (node->nentry > 0) {
543        idx = _btree_find_entry(btree, node, key);
544
545        if (idx == BTREE_IDX_NOT_FOUND) return idx;
546
547        /*
548        shift [idx+1, nentry) key-value pairs to left
549        example)
550        idx = 2
551        [2 4 6 8 10] -> [2 4 8 10]
552        return 2
553        */
554        btree->kv_ops->ins_kv(node, idx, NULL, NULL);
555
556        node->nentry--;
557
558        return idx;
559
560    }else{
561        return BTREE_IDX_NOT_FOUND;
562    }
563}
564
565static void _btree_print_node(struct btree *btree, int depth,
566                              bid_t bid, btree_print_func func)
567{
568    int i;
569    uint8_t *k = alca(uint8_t, btree->ksize);
570    uint8_t *v = alca(uint8_t, btree->vsize);
571    void *addr;
572    bid_t _bid;
573    struct bnode *node;
574
575    if (btree->kv_ops->init_kv_var) btree->kv_ops->init_kv_var(btree, k, v);
576
577    addr = btree->blk_ops->blk_read(btree->blk_handle, bid);
578    node = _fetch_bnode(btree, addr, depth);
579
580    fprintf(stderr, "[d:%d n:%d f:%x b:%" _F64 " ", node->level, node->nentry, node->flag, bid);
581
582    for (i=0;i<node->nentry;++i){
583        btree->kv_ops->get_kv(node, i, k, v);
584        _bid = btree->kv_ops->value2bid(v);
585        _bid = _endian_decode(_bid);
586        func(btree, k, (void*)&_bid);
587    }
588    fprintf(stderr, "]\n");
589    if (depth > 1) {
590        for (i=0;i<node->nentry;++i){
591            btree->kv_ops->get_kv(node, i, k, v);
592            _bid = btree->kv_ops->value2bid(v);
593            _bid = _endian_decode(_bid);
594            _btree_print_node(btree, depth-1, _bid, func);
595        }
596    }
597
598    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, v);
599}
600
601void btree_print_node(struct btree *btree, btree_print_func func)
602{
603    fprintf(stderr, "tree height: %d\n", btree->height);
604    _btree_print_node(btree, btree->height, btree->root_bid, func);
605}
606
607btree_result btree_get_key_range(
608    struct btree *btree, idx_t num, idx_t den, void *key_begin, void *key_end)
609{
610    void *addr;
611    uint8_t *k = alca(uint8_t, btree->ksize);
612    uint8_t *v = alca(uint8_t, btree->vsize);
613    idx_t idx_begin, idx_end, idx;
614    bid_t bid;
615    struct bnode *root, *node;
616    uint64_t _num, _den, _nentry, resolution, mask, _idx_begin, _idx_end;
617
618    if (num >= den) {
619        // TODO: Need to log the corresponding error message
620        return BTREE_RESULT_FAIL;
621    }
622    resolution = 1<<4; mask = resolution-1;
623
624    if (btree->kv_ops->init_kv_var) btree->kv_ops->init_kv_var(btree, k, v);
625    _num = (uint64_t)num * resolution;
626    _den = (uint64_t)den * resolution;
627
628    // get root node
629    addr = btree->blk_ops->blk_read(btree->blk_handle, btree->root_bid);
630    root = _fetch_bnode(btree, addr, btree->height);
631    _nentry = (uint64_t)root->nentry * resolution;
632
633    if (btree->height == 1) {
634        btree->kv_ops->get_kv(root, ((num+0) * root->nentry / den)-0, key_begin, NULL);
635        btree->kv_ops->get_kv(root, ((num+1) * root->nentry / den)-1, key_end, NULL);
636    }else{
637        _idx_begin = (_num * _nentry / _den);
638        _idx_end = ((_num+resolution) * _nentry / _den)-1;
639
640        idx_begin = _idx_begin / resolution;
641        idx_end = (_idx_end / resolution);
642        if (idx_end >= root->nentry) idx_end = root->nentry-1;
643
644        // get first child node (for KEY_BEGIN)
645        btree->kv_ops->get_kv(root, idx_begin, k, v);
646        bid = btree->kv_ops->value2bid(v);
647        bid = _endian_decode(bid);
648        addr = btree->blk_ops->blk_read(btree->blk_handle, bid);
649        node = _fetch_bnode(btree, addr, btree->height-1);
650
651        idx = ((_idx_begin & mask) * (node->nentry-1) / (resolution-1));
652        btree->kv_ops->get_kv(node, idx, key_begin, NULL);
653
654        // get second child node (for KEY_END)
655        if (idx_end != idx_begin) {
656            btree->kv_ops->get_kv(root, idx_end, k, v);
657            bid = btree->kv_ops->value2bid(v);
658            bid = _endian_decode(bid);
659            addr = btree->blk_ops->blk_read(btree->blk_handle, bid);
660            node = _fetch_bnode(btree, addr, btree->height-1);
661        }
662
663        idx = ((_idx_end & mask) * (node->nentry-1) / (resolution-1));
664        btree->kv_ops->get_kv(node, idx, key_end, NULL);
665    }
666
667    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, v);
668    return BTREE_RESULT_SUCCESS;
669}
670
671btree_result btree_find(struct btree *btree, void *key, void *value_buf)
672{
673    void *addr;
674    uint8_t *k = alca(uint8_t, btree->ksize);
675    uint8_t *v = alca(uint8_t, btree->vsize);
676    idx_t *idx = alca(idx_t, btree->height);
677    bid_t *bid = alca(bid_t, btree->height);
678    struct bnode **node = alca(struct bnode *, btree->height);
679    int i;
680
681    if (btree->kv_ops->init_kv_var) btree->kv_ops->init_kv_var(btree, k, v);
682
683    // set root
684    bid[btree->height-1] = btree->root_bid;
685
686    for (i=btree->height-1; i>=0; --i) {
687        // read block using bid
688        addr = btree->blk_ops->blk_read(btree->blk_handle, bid[i]);
689        // fetch node structure from block
690        node[i] = _fetch_bnode(btree, addr, i+1);
691
692        // lookup key in current node
693        idx[i] = _btree_find_entry(btree, node[i], key);
694
695        if (idx[i] == BTREE_IDX_NOT_FOUND) {
696            // not found .. return NULL
697            if (btree->blk_ops->blk_operation_end)
698                btree->blk_ops->blk_operation_end(btree->blk_handle);
699            if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, v);
700            return BTREE_RESULT_FAIL;
701        }
702
703        btree->kv_ops->get_kv(node[i], idx[i], k, v);
704
705        if (i>0) {
706            // index (non-leaf) node
707            // get bid of child node from value
708            bid[i-1] = btree->kv_ops->value2bid(v);
709            bid[i-1] = _endian_decode(bid[i-1]);
710        }else{
711            // leaf node
712            // return (address of) value if KEY == k
713            if (!btree->kv_ops->cmp(key, k, btree->aux)) {
714                btree->kv_ops->set_value(btree, value_buf, v);
715            }else{
716                if (btree->blk_ops->blk_operation_end)
717                    btree->blk_ops->blk_operation_end(btree->blk_handle);
718                if (btree->kv_ops->free_kv_var) {
719                    btree->kv_ops->free_kv_var(btree, k, v);
720                }
721                return BTREE_RESULT_FAIL;
722            }
723        }
724    }
725    if (btree->blk_ops->blk_operation_end) {
726        btree->blk_ops->blk_operation_end(btree->blk_handle);
727    }
728    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, v);
729    return BTREE_RESULT_SUCCESS;
730}
731
732static int _btree_split_node(
733    struct btree *btree, void *key, struct bnode **node, bid_t *bid, idx_t *idx, int i,
734    struct list *kv_ins_list, size_t nsplitnode, void *k, void *v,
735    int8_t *modified, int8_t *minkey_replace, int8_t *ins)
736{
737    void *addr;
738    size_t nnode = nsplitnode;
739    size_t j;
740    int *nentry = alca(int, nnode);
741    memset(nentry, 0, nnode * sizeof(int));
742    bid_t _bid;
743    bid_t *new_bid = alca(bid_t, nnode);
744    memset(new_bid, 0, nnode * sizeof(bid_t));
745    idx_t *split_idx = alca(idx_t, nnode+1);
746    memset(split_idx, 0, (nnode + 1) * sizeof(idx_t));
747    idx_t *idx_ins = alca(idx_t, btree->height);
748    memset(idx_ins, 0, btree->height * sizeof(idx_t));
749    struct list_elem *e;
750    struct bnode **new_node = alca(struct bnode *, nnode);
751    memset(new_node, 0, nnode * sizeof(struct bnode*));
752    struct kv_ins_item *kv_item = NULL;
753
754    // allocate new block(s)
755    new_node[0] = node[i];
756    for (j=1; j<nnode;++j){
757        addr = btree->blk_ops->blk_alloc(btree->blk_handle, &new_bid[j]);
758        new_node[j] = _btree_init_node(btree, new_bid[j], addr, 0x0,
759                                       node[i]->level, NULL);
760    }
761
762    // calculate # entry
763    for (j=0;j<nnode+1;++j){
764        btree->kv_ops->get_nth_idx(node[i], j, nnode, &split_idx[j]);
765        if (j>0) {
766            nentry[j-1] = split_idx[j] - split_idx[j-1];
767        }
768    }
769
770    // copy kv-pairs to new node(s)
771    for (j=1;j<nnode;++j){
772        btree->kv_ops->copy_kv(new_node[j], node[i], 0, split_idx[j],
773                               nentry[j]);
774    }
775    j = 0;
776    btree->kv_ops->copy_kv(new_node[j], node[i], 0, split_idx[j], nentry[j]);
777
778    // header
779    for (j=0;j<nnode;++j){
780        new_node[j]->nentry = nentry[j];
781    }
782    modified[i] = 1;
783
784    if (ins[i]) {
785        // insert kv-pair(s) to appropriate node
786        e = list_begin(&kv_ins_list[i]);
787        while(e) {
788            kv_item = _get_entry(e, struct kv_ins_item, le);
789
790            idx_ins[i] = BTREE_IDX_NOT_FOUND;
791            for (j=1;j<nnode;++j){
792                btree->kv_ops->get_kv(new_node[j], 0, k, v);
793                if (btree->kv_ops->cmp(kv_item->key, k, btree->aux) < 0) {
794                    idx_ins[i] =
795                        _btree_add_entry(btree, new_node[j-1], kv_item->key,
796                                         kv_item->value);
797                    break;
798                }
799            }
800            if (idx_ins[i] == BTREE_IDX_NOT_FOUND) {
801                // insert into the last split node
802                idx_ins[i] =
803                    _btree_add_entry(btree, new_node[nnode-1], kv_item->key,
804                                     kv_item->value);
805            }
806            e = list_next(e);
807        }
808    }
809    if (minkey_replace[i]){
810        // replace the minimum key in the (first split) node to KEY
811        btree->kv_ops->get_kv(new_node[0], idx[i], k, v);
812        btree->kv_ops->set_kv(new_node[0], idx[i], key, v);
813    }
814
815    if (i+1 < btree->height) {
816        // non-root node
817        // reserve kv-pair (i.e. splitters) to be inserted into parent node
818        for (j=1; j<nnode; ++j){
819            _bid = _endian_encode(new_bid[j]);
820            kv_item = _kv_ins_item_create(btree, NULL, (void *)&_bid);
821            btree->kv_ops->get_nth_splitter(new_node[j-1], new_node[j],
822                                            kv_item->key);
823            list_push_back(&kv_ins_list[i+1], &kv_item->le);
824        }
825        ins[i+1] = 1;
826
827    }else{
828        //2 root node -> height grow up
829        // allocate new block for new root node
830        bid_t new_root_bid;
831        struct bnode *new_root;
832        uint8_t *buf = alca(uint8_t, btree->blksize);
833        struct btree_meta meta;
834
835        meta.size = btree_read_meta(btree, buf);
836        meta.data = buf;
837        // remove metadata section of existing node
838        // (this node is not root anymore)
839        btree_update_meta(btree, NULL);
840
841        btree->height++;
842
843        addr = btree->blk_ops->blk_alloc(btree->blk_handle, &new_root_bid);
844        if (meta.size > 0)
845            new_root =
846                _btree_init_node(
847                    btree, new_root_bid, addr, btree->root_flag,
848                    node[i]->level + 1, &meta);
849        else
850            new_root =
851                _btree_init_node(
852                    btree, new_root_bid, addr, btree->root_flag,
853                    node[i]->level + 1, NULL);
854
855        // clear old root node flag
856        node[i]->flag &= ~BNODE_MASK_ROOT;
857        node[i]->flag &= ~BNODE_MASK_SEQTREE;
858        // change root bid
859        btree->root_bid = new_root_bid;
860
861        // move the former node if not dirty
862        if (!btree->blk_ops->blk_is_writable(btree->blk_handle, bid[i])) {
863            addr = btree->blk_ops->blk_move(btree->blk_handle, bid[i], &bid[i]);
864            node[i] = _fetch_bnode(btree, addr, i+1);
865        }else{
866            btree->blk_ops->blk_set_dirty(btree->blk_handle, bid[i]);
867        }
868
869        // insert kv-pairs pointing to their child nodes
870        // original (i.e. the first node)
871        btree->kv_ops->get_kv(node[i], 0, k, v);
872        _bid = _endian_encode(bid[i]);
873        _btree_add_entry(btree, new_root, k, btree->kv_ops->bid2value(&_bid));
874
875        // the others
876        for (j=1;j<nnode;++j){
877            //btree->kv_ops->get_kv(new_node[j], 0, k, v);
878            btree->kv_ops->get_nth_splitter(new_node[j-1], new_node[j], k);
879            _bid = _endian_encode(new_bid[j]);
880            _btree_add_entry(btree, new_root, k,
881                             btree->kv_ops->bid2value(&_bid));
882        }
883
884        return 1;
885    } // height growup
886
887    return 0;
888}
889
890static int _btree_move_modified_node(
891    struct btree *btree, void *key, struct bnode **node, bid_t *bid,
892    idx_t *idx, int i,
893    struct list *kv_ins_list, void *k, void *v,
894    int8_t *modified, int8_t *minkey_replace, int8_t *ins, int8_t *moved)
895{
896    void *addr;
897
898    // get new bid[i]
899    addr = btree->blk_ops->blk_move(btree->blk_handle, bid[i], &bid[i]);
900    (void)addr;
901    //node[i] = _fetch_bnode(btree, addr, i+1);
902    moved[i] = 1;
903
904    if (i+1 == btree->height)
905        // if moved node is root node
906        btree->root_bid = bid[i];
907
908    return 0;
909}
910
911btree_result btree_insert(struct btree *btree, void *key, void *value)
912{
913    void *addr;
914    size_t nsplitnode = 1;
915    uint8_t *k = alca(uint8_t, btree->ksize);
916    uint8_t *v = alca(uint8_t, btree->vsize);
917    // index# and block ID for each level
918    idx_t *idx = alca(idx_t, btree->height);
919    bid_t *bid = alca(bid_t, btree->height);
920    bid_t _bid;
921    // flags
922    int8_t *modified = alca(int8_t, btree->height);
923    int8_t *moved = alca(int8_t, btree->height);
924    int8_t *ins = alca(int8_t, btree->height);
925    int8_t *minkey_replace = alca(int8_t, btree->height);
926    int8_t height_growup;
927
928    // key, value to be inserted
929    struct list *kv_ins_list = alca(struct list, btree->height);
930    struct kv_ins_item *kv_item;
931    struct list_elem *e;
932
933    // index# where kv is inserted
934    idx_t *idx_ins = alca(idx_t, btree->height);
935    struct bnode **node = alca(struct bnode *, btree->height);
936    int i, j, _is_update = 0;
937
938    // initialize flags
939    for (i=0;i<btree->height;++i) {
940        moved[i] = modified[i] = ins[i] = minkey_replace[i] = 0;
941    }
942    height_growup = 0;
943
944    // initialize temporary variables
945    if (btree->kv_ops->init_kv_var) {
946        btree->kv_ops->init_kv_var(btree, k, v);
947        for (i=0;i<btree->height;++i){
948            list_init(&kv_ins_list[i]);
949        }
950    }
951
952    // copy key-value pair to be inserted into leaf node
953    kv_item = _kv_ins_item_create(btree, key, value);
954    list_push_back(&kv_ins_list[0], &kv_item->le);
955
956    ins[0] = 1;
957
958    // set root node
959    bid[btree->height-1] = btree->root_bid;
960
961    // find path from root to leaf
962    for (i=btree->height-1; i>=0; --i){
963        // read block using bid
964        addr = btree->blk_ops->blk_read(btree->blk_handle, bid[i]);
965        // fetch node structure from block
966        node[i] = _fetch_bnode(btree, addr, i+1);
967
968        // lookup key in current node
969        idx[i] = _btree_find_entry(btree, node[i], key);
970
971        if (i > 0) {
972            // index (non-leaf) node
973            if (idx[i] == BTREE_IDX_NOT_FOUND)
974                // KEY is smaller than the smallest key in this node ..
975                // just follow the smallest key
976                idx[i] = 0;
977
978            // get bid of child node from value
979            btree->kv_ops->get_kv(node[i], idx[i], k, v);
980            bid[i-1] = btree->kv_ops->value2bid(v);
981            bid[i-1] = _endian_decode(bid[i-1]);
982        }else{
983            // leaf node .. do nothing
984        }
985    }
986
987    // cascaded insert from leaf to root
988    for (i=0;i<btree->height;++i){
989
990        if (idx[i] != BTREE_IDX_NOT_FOUND)
991            btree->kv_ops->get_kv(node[i], idx[i], k, NULL);
992
993        if (i > 0) {
994            // in case of index node
995            // when KEY is smaller than smallest key in index node
996            if (idx[i] == 0 && btree->kv_ops->cmp(key, k, btree->aux) < 0) {
997                // change node's smallest key
998                minkey_replace[i] = 1;
999            }
1000
1001            // when child node is moved to new block
1002            if (moved[i-1]) {
1003                // replace the bid (value)
1004                _bid = _endian_encode(bid[i-1]);
1005                btree->kv_ops->set_kv(node[i], idx[i], k,
1006                                      btree->kv_ops->bid2value(&_bid));
1007                modified[i] = 1;
1008            }
1009        }
1010
1011        if (ins[i] || minkey_replace[i]) {
1012            // there is a key-value pair to be inserted into this (level of)
1013            // node check whether btree node space is enough to add new
1014            // key-value pair or not, OR action is not insertion but update
1015            // (key_ins exists in current node)
1016            _is_update = 0;
1017            size_t nodesize;
1018            void *new_minkey = (minkey_replace[i])?(key):(NULL);
1019
1020            if (i==0) {
1021                e = list_begin(&kv_ins_list[i]);
1022                kv_item = _get_entry(e, struct kv_ins_item, le);
1023                _is_update =
1024                    (idx[i] != BTREE_IDX_NOT_FOUND &&
1025                     !btree->kv_ops->cmp(kv_item->key, k, btree->aux));
1026            }
1027
1028        check_node:;
1029            int _size_check =
1030                _bnode_size_check(btree, bid[i], node[i], new_minkey,
1031                                  &kv_ins_list[i], &nodesize);
1032
1033            if (_size_check || _is_update ) {
1034                //2 enough space
1035                if (ins[i]) {
1036                    // insert key/value pair(s)
1037                    // insert all kv pairs on list
1038                    e = list_begin(&kv_ins_list[i]);
1039                    while(e) {
1040                        kv_item = _get_entry(e, struct kv_ins_item, le);
1041                        idx_ins[i] = _btree_add_entry(btree, node[i],
1042                                                 kv_item->key, kv_item->value);
1043                        e = list_next(e);
1044                    }
1045                }
1046                if (minkey_replace[i]) {
1047                    // replace the minimum key in the node to KEY
1048                    btree->kv_ops->get_kv(node[i], idx[i], k, v);
1049                    btree->kv_ops->set_kv(node[i], idx[i], key, v);
1050                }
1051                modified[i] = 1;
1052
1053            }else{
1054                //2 not enough
1055                // first check if the node can be enlarged
1056                if (btree->blk_ops->blk_enlarge_node &&
1057                    is_subblock(bid[i])) {
1058                    bid_t new_bid;
1059                    addr = btree->blk_ops->blk_enlarge_node(btree->blk_handle,
1060                        bid[i], nodesize, &new_bid);
1061                    if (addr) {
1062                        // the node can be enlarged .. fetch the enlarged node
1063                        moved[i] = 1;
1064                        bid[i] = new_bid;
1065                        node[i] = _fetch_bnode(btree, addr, i+1);
1066                        if (i+1 == btree->height) {
1067                            // if moved node is root node
1068                            btree->root_bid = bid[i];
1069                        }
1070                        // start over
1071                        goto check_node;
1072                    }
1073                }
1074
1075                //otherwise, split the node
1076                nsplitnode = _btree_get_nsplitnode(btree, BLK_NOT_FOUND,
1077                                                   node[i], nodesize);
1078                // force the node split when the node size of new layout is
1079                // larger than the current node size
1080                if (nsplitnode == 1) nsplitnode = 2;
1081
1082                height_growup = _btree_split_node(
1083                    btree, key, node, bid, idx, i,
1084                    kv_ins_list, nsplitnode, k, v,
1085                    modified, minkey_replace, ins);
1086            } // split
1087        } // insert
1088
1089        if (height_growup) break;
1090
1091        if (modified[i]) {
1092            //2 when the node is modified
1093            if (!btree->blk_ops->blk_is_writable(btree->blk_handle, bid[i])) {
1094                // not writable .. already flushed block -> cannot overwrite,
1095                // we have to move to new block
1096                height_growup = _btree_move_modified_node(
1097                    btree, key, node, bid, idx, i,
1098                    kv_ins_list, k, v,
1099                    modified, minkey_replace, ins, moved);
1100
1101                if (height_growup) break;
1102            }else{
1103                // writable .. just set dirty to write back into storage
1104                btree->blk_ops->blk_set_dirty(btree->blk_handle, bid[i]);
1105            } // is writable
1106        } // is modified
1107    } // for loop
1108
1109    // release temporary resources
1110    if (btree->blk_ops->blk_operation_end) {
1111        btree->blk_ops->blk_operation_end(btree->blk_handle);
1112    }
1113    if (btree->kv_ops->free_kv_var) btree->kv_ops->free_kv_var(btree, k, v);
1114
1115    for (j=0; j < ((height_growup)?(btree->height-1):(btree->height)); ++j){
1116        e = list_begin(&kv_ins_list[j]);
1117        while(e) {
1118            kv_item = _get_entry(e, struct kv_ins_item, le);
1119            e = list_remove(&kv_ins_list[j], e);
1120
1121            if (btree->kv_ops->free_kv_var) {
1122                btree->kv_ops->free_kv_var(btree, kv_item->key, kv_item->value);
1123            }
1124            _kv_ins_item_free(kv_item);
1125        }
1126    }
1127
1128    if (_is_update) {
1129        return BTREE_RESULT_UPDATE;
1130    }else{
1131        return BTREE_RESULT_SUCCESS;
1132    }
1133}
1134
1135btree_result btree_remove(struct btree *btree, void *key)
1136{
1137    void *addr;
1138    uint8_t *k = alca(uint8_t, btree->ksize);
1139    uint8_t *v= alca(uint8_t, btree->vsize);
1140    uint8_t *kk = alca(uint8_t, btree->ksize);
1141    uint8_t *vv = alca(uint8_t, btree->vsize);
1142    // index# and block ID for each level
1143    idx_t *idx = alca(idx_t, btree->height);
1144    bid_t *bid= alca(bid_t, btree->height);
1145    bid_t _bid;
1146    // flags
1147    int8_t *modified = alca(int8_t, btree->height);
1148    int8_t *moved = alca(int8_t, btree->height);
1149    int8_t *rmv = alca(int8_t, btree->height);
1150    // index# of removed key
1151    idx_t *idx_rmv = alca(idx_t, btree->height);
1152    struct bnode **node = alca(struct bnode *, btree->height);
1153    int i;
1154
1155    // initialize flags
1156    for (i=0;i<btree->height;++i) {
1157        moved[i] = modified[i] = rmv[i] = 0;
1158    }
1159    if (btree->kv_ops->init_kv_var) {
1160        btree->kv_ops->init_kv_var(btree, k, v);
1161        btree->kv_ops->init_kv_var(btree, kk, vv);
1162    }
1163
1164    rmv[0] = 1;
1165
1166    // set root
1167    bid[btree->height-1] = btree->root_bid;
1168
1169    // find path from root to leaf
1170    for (i=btree->height-1; i>=0; --i) {
1171        // read block using bid
1172        addr = btree->blk_ops->blk_read(btree->blk_handle, bid[i]);
1173        // fetch node structure from block
1174        node[i] = _fetch_bnode(btree, addr, i+1);
1175
1176        // lookup key in current node
1177        idx[i] = _btree_find_entry(btree, node[i], key);
1178
1179        if (idx[i] == BTREE_IDX_NOT_FOUND) {
1180            // not found
1181            if (btree->blk_ops->blk_operation_end)
1182                btree->blk_ops->blk_operation_end(btree->blk_handle);
1183            if (btree->kv_ops->free_kv_var) {
1184                btree->kv_ops->free_kv_var(btree, k, v);
1185                btree->kv_ops->free_kv_var(btree, kk, vv);
1186            }
1187            return BTREE_RESULT_FAIL;
1188        }
1189
1190        btree->kv_ops->get_kv(node[i], idx[i], k, v);
1191
1192        if (i>0) {
1193            // index (non-leaf) node
1194            // get bid of child node from value
1195            bid[i-1] = btree->kv_ops->value2bid(v);
1196            bid[i-1] = _endian_decode(bid[i-1]);
1197        }else{
1198            // leaf node
1199        }
1200    }
1201
1202    // cascaded remove from leaf to root
1203    for (i=0;i<btree->height;++i){
1204        // in case of index node
1205        if (i > 0) {
1206            btree->kv_ops->get_kv(node[i], idx[i], k, v);
1207
1208            // when child node's smallest key is changed due to remove
1209              if (node[i-1]->nentry > 0) {
1210                btree->kv_ops->get_kv(node[i-1], 0, kk, vv);
1211                if (btree->kv_ops->cmp(kk, k, btree->aux)) {
1212                    // change current node's corresponding key
1213                    btree->kv_ops->set_kv(node[i], idx[i], kk, v);
1214                    //memcpy(k, kk, btree->ksize);
1215                    btree->kv_ops->set_key(btree, k, kk);
1216                    modified[i] = 1;
1217                }
1218            }
1219
1220            // when child node is moved to new block
1221            if (moved[i-1]) {
1222                // replace the bid (value)
1223                _bid = _endian_encode(bid[i-1]);
1224                btree->kv_ops->set_kv(node[i], idx[i], k,
1225                                      btree->kv_ops->bid2value(&_bid));
1226                modified[i] = 1;
1227            }
1228        }
1229
1230        if (rmv[i]) {
1231            // there is a key-value pair to be removed
1232            btree->kv_ops->get_kv(node[i], idx[i], k, v);
1233            idx_rmv[i] = _btree_remove_entry(btree, node[i], k);
1234            modified[i] = 1;
1235
1236            /*
1237            remove the node when
1238            1. non-root node has no kv-pair or
1239            2. root node has less or equal than one kv-pair
1240            */
1241            if (((node[i]->nentry < 1 && i+1 < btree->height) ||
1242                (node[i]->nentry <= 1 && i+1 == btree->height)) &&
1243                btree->height > 1) {
1244                // remove the node
1245                btree->blk_ops->blk_remove(btree->blk_handle, bid[i]);
1246                if (i+1 < btree->height) {
1247                    // if non-root node
1248                    rmv[i+1] = 1;
1249                } else {
1250                    // if root node, shrink the height
1251
1252                    // allocate new block for new root node
1253                    uint8_t *buf = alca(uint8_t, btree->blksize);
1254                    uint32_t nodesize = 0, new_rootsize = 0;
1255                    bid_t child_bid, new_root_bid;
1256                    struct bnode *new_root, *child;
1257                    struct btree_meta meta;
1258
1259                    // read the child node
1260                    btree->kv_ops->get_kv(node[i], 0, k, v);
1261                    child_bid = btree->kv_ops->value2bid(v);
1262                    child_bid = _endian_decode(child_bid);
1263                    addr = btree->blk_ops->blk_read(btree->blk_handle, child_bid);
1264                    child = _fetch_bnode(btree, addr, btree->height);
1265
1266                    nodesize = btree->blk_ops->blk_get_size(btree->blk_handle, child_bid);
1267#ifdef __CRC32
1268                    nodesize -= BLK_MARKER_SIZE;
1269#endif
1270
1271                    // estimate the new root node size including metadata
1272                    meta.size = btree_read_meta(btree, buf);
1273                    meta.data = buf;
1274
1275                    if (meta.size) {
1276                        new_rootsize += _metasize_align(meta.size) + sizeof(metasize_t);
1277                    }
1278                    new_rootsize += btree->kv_ops->get_data_size(child, NULL, NULL, NULL, 0);
1279                    new_rootsize += sizeof(struct bnode);
1280
1281                    if (new_rootsize < nodesize) {
1282                        // new root node has enough space for metadata .. shrink height
1283                        btree->height--;
1284
1285                        // allocate a new node with the given meta
1286                        addr = btree->blk_ops->blk_alloc(btree->blk_handle,
1287                                                         &new_root_bid);
1288                        _btree_init_node(btree, new_root_bid, addr,
1289                                         btree->root_flag, btree->height, &meta);
1290                        new_root = _fetch_bnode(btree, addr, btree->height);
1291
1292                        // copy all entries
1293                        btree->kv_ops->copy_kv(new_root, child, 0, 0, child->nentry);
1294                        new_root->nentry = child->nentry;
1295                        // invalidate chlid node
1296                        btree->blk_ops->blk_remove(btree->blk_handle, child_bid);
1297
1298                        btree->root_bid = new_root_bid;
1299
1300                        // as the old node is invalidated,
1301                        // we don't need to move it.
1302                        modified[i] = 0;
1303                    }
1304                }
1305            }
1306        }
1307
1308        if (modified[i]) {
1309            // when node is modified
1310            if (!btree->blk_ops->blk_is_writable(btree->blk_handle, bid[i])) {
1311                // already flushed block -> cannot overwrite, so
1312                // we have to move to new block
1313                // get new bid[i]
1314                addr = btree->blk_ops->blk_move(btree->blk_handle, bid[i],
1315                                                &bid[i]);
1316                node[i] = _fetch_bnode(btree, addr, i+1);
1317                moved[i] = 1;
1318
1319                if (i+1 == btree->height)
1320                    // if moved node is root node
1321                    btree->root_bid = bid[i];
1322
1323            }else{
1324                btree->blk_ops->blk_set_dirty(btree->blk_handle, bid[i]);
1325            }
1326
1327        }
1328    }
1329
1330    if (btree->blk_ops->blk_operation_end) {
1331        btree->blk_ops->blk_operation_end(btree->blk_handle);
1332    }
1333    if (btree->kv_ops->free_kv_var) {
1334        btree->kv_ops->free_kv_var(btree, k, v);
1335        btree->kv_ops->free_kv_var(btree, kk, vv);
1336    }
1337    return BTREE_RESULT_SUCCESS;
1338}
1339
1340// LCOV_EXCL_START
1341btree_result btree_operation_end(struct btree *btree)
1342{
1343    if (btree->blk_ops->blk_operation_end) {
1344        btree->blk_ops->blk_operation_end(btree->blk_handle);
1345    }
1346    return BTREE_RESULT_SUCCESS;
1347}
1348// LCOV_EXCL_STOP
1349
1350btree_result btree_iterator_init(struct btree *btree,
1351                                 struct btree_iterator *it, void *initial_key)
1352{
1353    int i;
1354
1355    it->btree = *btree;
1356    it->curkey = (void *)mempool_alloc(btree->ksize);
1357    if (btree->kv_ops->init_kv_var) {
1358        btree->kv_ops->init_kv_var(btree, it->curkey, NULL);
1359    }
1360    if (initial_key) {
1361        // set initial key if exists
1362        //memcpy(it->curkey, initial_key, btree->ksize);
1363        btree->kv_ops->set_key(btree, it->curkey, initial_key);
1364    }else{
1365        // NULL initial key .. set minimum key (start from leftmost key)
1366        // replaced by kv_ops->init_kv_var
1367        //memset(it->curkey, 0, btree->ksize);
1368    }
1369    it->bid = (bid_t*)mempool_alloc(sizeof(bid_t) * btree->height);
1370    it->idx = (idx_t*)mempool_alloc(sizeof(idx_t) * btree->height);
1371    it->node = (struct bnode **)mempool_alloc(sizeof(struct bnode *)
1372                                                              * btree->height);
1373    it->addr = (void**)mempool_alloc(sizeof(void*) * btree->height);
1374
1375    for (i=0;i<btree->height;++i){
1376        it->bid[i] = BTREE_BLK_NOT_FOUND;
1377        it->idx[i] = BTREE_IDX_NOT_FOUND;
1378        it->node[i] = NULL;
1379        it->addr[i] = NULL;
1380    }
1381    it->bid[btree->height-1] = btree->root_bid;
1382    it->flags = 0;
1383
1384    return BTREE_RESULT_SUCCESS;
1385}
1386
1387btree_result btree_iterator_free(struct btree_iterator *it)
1388{
1389    int i;
1390    if (it->btree.kv_ops->free_kv_var) {
1391        it->btree.kv_ops->free_kv_var(&it->btree, it->curkey, NULL);
1392    }
1393    mempool_free(it->curkey);
1394    mempool_free(it->bid);
1395    mempool_free(it->idx);
1396    for (i=0;i<it->btree.height;++i){
1397        if (it->node[i]) mempool_free(it->addr[i]);
1398    }
1399    mempool_free(it->node);
1400    mempool_free(it->addr);
1401    return BTREE_RESULT_SUCCESS;
1402}
1403
1404static btree_result _btree_prev(struct btree_iterator *it, void *key_buf,
1405                                void *value_buf, int depth)
1406{
1407    struct btree *btree;
1408    btree = &it->btree;
1409    int i;
1410    uint8_t *k = alca(uint8_t, btree->ksize);
1411    uint8_t *v = alca(uint8_t, btree->vsize);
1412    void *addr;
1413    struct bnode *node;
1414    btree_result r;
1415
1416    if (it->btree.kv_ops->init_kv_var) {
1417        it->btree.kv_ops->init_kv_var(&it->btree, k, v);
1418    }
1419
1420    if (it->node[depth] == NULL){
1421        size_t blksize;
1422        addr = btree->blk_ops->blk_read(btree->blk_handle, it->bid[depth]);
1423        it->addr[depth] = (void *)mempool_alloc(btree->blksize);
1424        blksize = btree->blk_ops->blk_get_size(btree->blk_handle,
1425                                               it->bid[depth]);
1426        memcpy(it->addr[depth], addr, blksize);
1427        it->node[depth] = _fetch_bnode(&it->btree, it->addr[depth], depth+1);
1428    }
1429    node = _fetch_bnode(&it->btree, it->addr[depth], depth+1);
1430    //assert(node->level == depth+1);
1431
1432    if (node->nentry <= 0) {
1433        if (it->btree.kv_ops->free_kv_var) {
1434            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1435        }
1436        if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1437        it->node[depth] = NULL;
1438        it->addr[depth] = NULL;
1439        return BTREE_RESULT_FAIL;
1440    }
1441
1442    if (it->idx[depth] == BTREE_IDX_NOT_FOUND) {
1443        // curkey: lastly returned key
1444        it->idx[depth] = _btree_find_entry(btree, node, it->curkey);
1445        if (it->idx[depth] == BTREE_IDX_NOT_FOUND) {
1446            it->idx[depth] = 0;
1447           // it->idx[depth] = node->nentry - 1;
1448        }
1449        btree->kv_ops->get_kv(node, it->idx[depth], key_buf, value_buf);
1450        if (btree->kv_ops->cmp(it->curkey, key_buf, btree->aux) < 0 &&
1451            depth == 0) {
1452            // in leaf node, prev key must be smaller than current key
1453            it->idx[depth] = it->idx[depth] ? it->idx[depth] - 1
1454                                            : BTREE_IDX_NOT_FOUND;
1455        } // else return the value at idx[depth] at this turn
1456    }
1457
1458    if (BTREE_ITR_IS_FWD(it) && depth ==0) {
1459        if (it->idx[depth] >= 2) {
1460            it->idx[depth] -= 2;
1461        } else {
1462            // out-of-bounds
1463            it->idx[depth] = node->nentry; // ending condition
1464            // we have to reset flag because _btree_prev will recursively
1465            // visit the left leaf node.
1466            BTREE_ITR_SET_NONE(it);
1467        }
1468    }
1469
1470    if (it->idx[depth] >= node->nentry) { // reused nentry for ending iteration
1471        // out of bound .. go up to parent node
1472        it->idx[depth] = BTREE_IDX_NOT_FOUND; // reset for btree_next
1473        if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1474        it->node[depth] = NULL;
1475        it->addr[depth] = NULL;
1476
1477        if (it->btree.kv_ops->free_kv_var) {
1478            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1479        }
1480        return BTREE_RESULT_FAIL;
1481    }
1482
1483    if (depth > 0) {
1484        // index node
1485        if (it->node[depth-1] == NULL) {
1486            btree->kv_ops->get_kv(node, it->idx[depth], k, v);
1487            it->bid[depth-1] = btree->kv_ops->value2bid(v);
1488            it->bid[depth-1] = _endian_decode(it->bid[depth-1]);
1489        }
1490        r = _btree_prev(it, key_buf, value_buf, depth-1);
1491
1492        if (r == BTREE_RESULT_FAIL) {
1493            // move index to left
1494            it->idx[depth] = it->idx[depth] ? it->idx[depth] - 1
1495                                            : node->nentry; // ending condition
1496
1497            if (it->idx[depth] >= node->nentry){
1498                // out of bound .. go up to parent node
1499                it->idx[depth] = BTREE_IDX_NOT_FOUND;
1500                if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1501                it->node[depth] = NULL;
1502                it->addr[depth] = NULL;
1503
1504                if (it->btree.kv_ops->free_kv_var) {
1505                    it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1506                }
1507                return BTREE_RESULT_FAIL;
1508            }else{
1509                btree->kv_ops->get_kv(node, it->idx[depth], k, v);
1510                it->bid[depth-1] = btree->kv_ops->value2bid(v);
1511                it->bid[depth-1] = _endian_decode(it->bid[depth-1]);
1512                // reset child index
1513                for (i=depth-1; i>=0; --i) {
1514                    it->idx[i] = BTREE_IDX_NOT_FOUND;
1515                    if (it->node[i] != NULL) mempool_free(it->addr[i]);
1516                    it->node[i] = NULL;
1517                    it->addr[i] = NULL;
1518                }
1519                // retry
1520                r = _btree_prev(it, key_buf, value_buf, depth-1);
1521            }
1522        }
1523        if (it->btree.kv_ops->free_kv_var) {
1524            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1525        }
1526        return r;
1527    }else{
1528        // leaf node
1529        btree->kv_ops->get_kv(node, it->idx[depth], key_buf, value_buf);
1530        //memcpy(it->curkey, key_buf, btree->ksize);
1531        btree->kv_ops->set_key(btree, it->curkey, key_buf);
1532        it->idx[depth] = it->idx[depth] ? it->idx[depth] - 1
1533                                        : node->nentry; // condition for ending
1534        if (it->btree.kv_ops->free_kv_var) {
1535            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1536        }
1537        return BTREE_RESULT_SUCCESS;
1538    }
1539}
1540
1541btree_result btree_prev(struct btree_iterator *it, void *key_buf,
1542                        void *value_buf)
1543{
1544    btree_result br = _btree_prev(it, key_buf, value_buf, it->btree.height-1);
1545    if (br == BTREE_RESULT_SUCCESS) {
1546        BTREE_ITR_SET_REV(it);
1547    } else {
1548        BTREE_ITR_SET_NONE(it);
1549    }
1550    return br;
1551}
1552
1553static btree_result _btree_next(struct btree_iterator *it, void *key_buf,
1554                                void *value_buf, int depth)
1555{
1556    struct btree *btree;
1557    btree = &it->btree;
1558    int i;
1559    uint8_t *k = alca(uint8_t, btree->ksize);
1560    uint8_t *v = alca(uint8_t, btree->vsize);
1561    void *addr;
1562    struct bnode *node;
1563    btree_result r;
1564
1565    if (it->btree.kv_ops->init_kv_var) {
1566        it->btree.kv_ops->init_kv_var(&it->btree, k, v);
1567    }
1568
1569    if (it->node[depth] == NULL){
1570        size_t blksize;
1571        addr = btree->blk_ops->blk_read(btree->blk_handle, it->bid[depth]);
1572        it->addr[depth] = (void *)mempool_alloc(btree->blksize);
1573        blksize = btree->blk_ops->blk_get_size(btree->blk_handle,
1574                                               it->bid[depth]);
1575        memcpy(it->addr[depth], addr, blksize);
1576        it->node[depth] = _fetch_bnode(&it->btree, it->addr[depth], depth+1);
1577    }
1578    node = _fetch_bnode(&it->btree, it->addr[depth], depth+1);
1579    //assert(node->level == depth+1);
1580
1581    if (node->nentry <= 0) {
1582        if (it->btree.kv_ops->free_kv_var) it->btree.kv_ops->free_kv_var(
1583                                                             &it->btree, k, v);
1584        if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1585        it->node[depth] = NULL;
1586        it->addr[depth] = NULL;
1587        return BTREE_RESULT_FAIL;
1588    }
1589
1590    if (it->idx[depth] == BTREE_IDX_NOT_FOUND) {
1591        // curkey: lastly returned key
1592        it->idx[depth] = _btree_find_entry(btree, node, it->curkey);
1593        if (it->idx[depth] == BTREE_IDX_NOT_FOUND) {
1594            it->idx[depth] = 0;
1595        }
1596        btree->kv_ops->get_kv(node, it->idx[depth], key_buf, value_buf);
1597        if (btree->kv_ops->cmp(it->curkey, key_buf, btree->aux) > 0 &&
1598            depth == 0) {
1599            // in leaf node, next key must be larger than previous key
1600            // (i.e. it->curkey)
1601            it->idx[depth]++;
1602        }
1603    }
1604
1605    if (BTREE_ITR_IS_REV(it) && depth == 0) {
1606        if (it->idx[depth] >= node->nentry) {
1607            // 'idx' becomes out-of-bounds by previous btree_prev() call.
1608            // this means that the last returned entry was the smallest entry.
1609            // set idx to the second smallest entry.
1610            it->idx[depth] = 1;
1611        } else {
1612            it->idx[depth] += 2;
1613        }
1614    }
1615
1616    if (it->idx[depth] >= node->nentry) {
1617        // out of bound .. go up to parent node
1618        it->idx[depth] = BTREE_IDX_NOT_FOUND; // reset for btree_prev
1619        if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1620        it->node[depth] = NULL;
1621        it->addr[depth] = NULL;
1622
1623        if (it->btree.kv_ops->free_kv_var) {
1624            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1625        }
1626        return BTREE_RESULT_FAIL;
1627    }
1628
1629    if (depth > 0) {
1630        // index node
1631        if (it->node[depth-1] == NULL) {
1632            btree->kv_ops->get_kv(node, it->idx[depth], k, v);
1633            it->bid[depth-1] = btree->kv_ops->value2bid(v);
1634            it->bid[depth-1] = _endian_decode(it->bid[depth-1]);
1635        }
1636        r = _btree_next(it, key_buf, value_buf, depth-1);
1637
1638        if (r == BTREE_RESULT_FAIL) {
1639            // move index to right
1640            it->idx[depth]++;
1641
1642            if (it->idx[depth] >= node->nentry){
1643                // out of bound .. go up to parent node
1644                it->idx[depth] = BTREE_IDX_NOT_FOUND;
1645                if (it->node[depth] != NULL) mempool_free(it->addr[depth]);
1646                it->node[depth] = NULL;
1647                it->addr[depth] = NULL;
1648
1649                if (it->btree.kv_ops->free_kv_var) {
1650                    it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1651                }
1652                return BTREE_RESULT_FAIL;
1653            }else{
1654                btree->kv_ops->get_kv(node, it->idx[depth], k, v);
1655                it->bid[depth-1] = btree->kv_ops->value2bid(v);
1656                it->bid[depth-1] = _endian_decode(it->bid[depth-1]);
1657                // reset child index
1658                for (i=depth-1; i>=0; --i) {
1659                    it->idx[i] = 0;
1660                    if (it->node[i] != NULL) mempool_free(it->addr[i]);
1661                    it->node[i] = NULL;
1662                    it->addr[i] = NULL;
1663                }
1664                // retry
1665                r = _btree_next(it, key_buf, value_buf, depth-1);
1666            }
1667        }
1668        if (it->btree.kv_ops->free_kv_var) {
1669            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1670        }
1671        return r;
1672    }else{
1673        // leaf node
1674        btree->kv_ops->get_kv(node, it->idx[depth], key_buf, value_buf);
1675        //memcpy(it->curkey, key_buf, btree->ksize);
1676        btree->kv_ops->set_key(btree, it->curkey, key_buf);
1677        it->idx[depth]++;
1678        if (it->btree.kv_ops->free_kv_var) {
1679            it->btree.kv_ops->free_kv_var(&it->btree, k, v);
1680        }
1681        return BTREE_RESULT_SUCCESS;
1682    }
1683}
1684
1685btree_result btree_next(struct btree_iterator *it, void *key_buf,
1686                        void *value_buf)
1687{
1688    btree_result br = _btree_next(it, key_buf, value_buf, it->btree.height-1);
1689    if (br == BTREE_RESULT_SUCCESS) {
1690        BTREE_ITR_SET_FWD(it);
1691    } else {
1692        BTREE_ITR_SET_NONE(it);
1693    }
1694    return br;
1695}
1696
1697