xref: /6.6.0/forestdb/src/wal.h (revision 56236603)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2010 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
18#ifndef _JSAHN_WAL_H
19#define _JSAHN_WAL_H
20
21#include <stdint.h>
22#include "internal_types.h"
23#include "hash.h"
24#include "list.h"
25#include "avltree.h"
26#include "atomic.h"
27#include "libforestdb/fdb_errors.h"
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33typedef uint8_t wal_item_action;
34enum{
35    WAL_ACT_INSERT,
36    WAL_ACT_LOGICAL_REMOVE, // An item is marked as "DELETED" by removing its doc body only.
37    WAL_ACT_REMOVE // An item (key, metadata, body) is removed immediately.
38};
39typedef int wal_insert_by;
40enum {
41    WAL_INS_WRITER = 0, // normal writer inserting
42    WAL_INS_COMPACT_PHASE1, // compactor in first phase moving unique docs
43    WAL_INS_COMPACT_PHASE2 // compactor in delta phase (catchup, uncommitted)
44};
45
46struct wal_item_header{
47    struct avl_node avl_key;
48    void *key;
49    uint16_t keylen;
50    uint8_t chunksize;
51    struct list items;
52};
53
54typedef uint64_t wal_snapid_t;
55#define OPEN_SNAPSHOT_TAG ((wal_snapid_t)(-1)) // any latest snapshot item
56struct snap_handle {
57    /**
58     * Link to the tree of snapshots indexed by (kvs_id, snap_id) pair.
59     */
60    struct avl_node avl_id;
61    /**
62     * Unique KV Store ID (keep as second member).
63     */
64    fdb_kvs_id_t id;
65    /**
66     * Back pointer to index into the global WAL snapshot array
67     */
68    wal_snapid_t snap_tag_idx;
69    /**
70     * Snapshot stop index that denotes a WAL flush
71     */
72    wal_snapid_t snap_stop_idx;
73    /**
74     * Incremented on snapshot_open, decremented on snapshot_close(Write Barrier)
75     * Reference count to avoid copy if same KV store WAL snapshot is cloned.
76     */
77    atomic_uint16_t ref_cnt_kvs;
78    /**
79     * Did wal_flush make me inaccessible to later snapshots, (Read-Write Barrier)
80     */
81    bool is_flushed;
82    /**
83     * Is this a persistent snapshot completely separate from WAL.
84     */
85    bool is_persisted_snapshot;
86    /**
87     * Number of WAL items put into this snapshot before it became immutable.
88     */
89    atomic_uint64_t wal_ndocs;
90    /**
91     * Highest sequence number seen in this KV store snapshot.
92     */
93    fdb_seqnum_t seqnum;
94    /**
95     * Transaction that the handle was in at the time of snapshot creation.
96     */
97    fdb_txn *snap_txn;
98    /**
99     * Global transaction pointer to distinguish from local transactions
100     * for partially committed items.
101     */
102    fdb_txn *global_txn;
103    /**
104     * Active transaction list to hide partially committed items whose
105     * transaction is still being ended.
106     */
107    struct list active_txn_list;
108    /**
109     * Local DB stats for cloned snapshots
110     */
111    struct kvs_stat stat;
112    /**
113     * Custom compare function context and callback set by user.
114     * TODO: Store original pointer on which snapshot was taken & use to clone
115     * index nodes as well!
116     */
117    struct _fdb_key_cmp_info cmp_info;
118    /**
119     * AVL tree to store unflushed WAL entries of a snapshot by key range
120     */
121    struct avl_tree key_tree;
122    /**
123     * AVL tree to store unflushed WAL entries of a snapshot by sequence number
124     */
125    struct avl_tree seq_tree;
126};
127
128#define WAL_ITEM_COMMITTED (0x01)
129#define WAL_ITEM_FLUSH_READY (0x02)
130#define WAL_ITEM_MULTI_KV_INS_MODE (0x04)
131#define WAL_ITEM_FLUSHED_OUT (0x08)
132struct wal_item{
133    struct list_elem list_elem; // for wal_item_header's 'items'
134    struct avl_node avl_seq; // used for indexing by sequence number
135    struct wal_item_header *header;
136    fdb_txn *txn;
137    uint64_t txn_id; // used to track closed transactions
138    struct snap_handle *shandle; // Pointer into wal_snapshot_tree for KV Store
139    wal_item_action action;
140    atomic_uint8_t flag;
141    uint32_t doc_size;
142    uint64_t offset;
143    fdb_seqnum_t seqnum;
144    uint64_t old_offset;
145    union { // for offset-based sorting for WAL flush
146        struct list_elem list_elem_txn; // for transaction
147        struct avl_node avl_flush;
148        struct list_elem list_elem_flush;
149        struct avl_node avl_keysnap; // for durable snapshot unique key lookup
150    };
151};
152
153typedef fdb_status wal_flush_func(void *dbhandle, struct wal_item *item,
154                                  struct avl_tree *stale_seqnum_list,
155                                  struct avl_tree *kvs_delta_stats);
156
157/**
158 * Pointer of function that purges stale entries from the sequence tree
159 * as part of WAL flush.
160 */
161typedef fdb_status wal_flush_seq_purge_func(void *dbhandle,
162                                      struct avl_tree *stale_seqnum_list,
163                                      struct avl_tree *kvs_delta_stats);
164
165/**
166 * Pointer of function that updates a KV store stats for each WAL flush
167 */
168typedef void wal_flush_kvs_delta_stats_func(struct filemgr *file,
169                                            avl_tree *kvs_delta_stats);
170
171typedef fdb_status wal_snapshot_func(void *shandle, fdb_doc *doc,
172                                     uint64_t offset);
173typedef fdb_status wal_get_old_offset_func(void *dbhandle,
174                                           struct wal_item *item,
175                                           uint64_t *old_offset);
176typedef int64_t wal_doc_move_func(void *dbhandle,
177                                  void *new_dhandle,
178                                  struct wal_item *item,
179                                  fdb_doc *doc);
180typedef fdb_status wal_commit_mark_func(void *dbhandle,
181                                        uint64_t offset);
182
183#define WAL_FLAG_INITIALIZED 0x1
184
185
186typedef uint8_t wal_dirty_t;
187enum {
188    FDB_WAL_CLEAN = 0,
189    FDB_WAL_DIRTY = 1,
190    FDB_WAL_PENDING = 2
191};
192
193struct wal_shard {
194    struct avl_tree _map;
195    spin_t lock;
196};
197
198struct wal {
199    uint8_t flag;
200    atomic_uint8_t isPopulated; // Set when WAL is first populated OR restored
201    atomic_uint32_t size; // total # entries in WAL (uint32_t)
202    atomic_uint32_t num_flushable; // # flushable entries in WAL (uint32_t)
203    atomic_uint64_t datasize; // total data size in WAL (uint64_t)
204    atomic_uint64_t mem_overhead; // memory overhead of all WAL entries
205    struct list txn_list; // list of active transactions
206    wal_dirty_t wal_dirty;
207    // tree of all 'wal_item_header' (keys) in shard
208    struct wal_shard *key_shards;
209    // indexes 'wal_item's seq num in WAL shard
210    struct wal_shard *seq_shards;
211    size_t num_shards;
212    // Global shared WAL Snapshot Data
213    struct avl_tree wal_snapshot_tree;
214    spin_t lock;
215};
216
217struct wal_cursor {
218    struct avl_node avl_merge; // avl node for merge sort across all shards
219    struct wal_item *item; // pointer to the shared WAL snapshot item
220    struct wal_item *first_item; // first key/seqnum item in snapshot of shard
221    struct wal_item *last_item; // last key/seqnum item in snapshot of shard
222};
223
224struct wal_iterator {
225    struct wal *_wal; // Pointer to global WAL
226    struct snap_handle *shandle; // Pointer to KVS snapshot handle.
227    struct wal_shard *map_shards; // pointer to the shared WAL key/seq shards
228    size_t num_shards; // number of shards in the global shared WAL key/seq
229    bool by_key; // if not set means iteration is by sequence number range
230    bool multi_kvs; // single kv mode vs multi kv instance mode
231    uint8_t direction; // forward/backward/none to avoid grabbing all locks
232    struct avl_tree merge_tree; // AVL tree to perform merge-sort over cursors
233    struct avl_node *cursor_pos; // points to shard that returns current item
234    struct wal_item *item_prev; // points to previous iterator item returned
235    struct wal_cursor *cursors; // cursor to item from each shard's tree
236};
237
238struct wal_txn_wrapper {
239    struct list_elem le;
240    union {
241        fdb_txn *txn; // when used in wal's transaction list
242        uint64_t txn_id; // when used in snapshot's active_txn_list
243    };
244};
245
246union wal_flush_items {
247    struct avl_tree tree; // if WAL items are to be sorted by offset
248    struct list list; // if WAL items need not be sorted
249};
250
251
252fdb_status wal_init(struct filemgr *file, int nbucket);
253int wal_is_initialized(struct filemgr *file);
254bool wal_try_restore(struct filemgr *file);
255fdb_status wal_insert(fdb_txn *txn,
256                      struct filemgr *file,
257                      struct _fdb_key_cmp_info *cmp_info,
258                      fdb_doc *doc,
259                      uint64_t offset,
260                      wal_insert_by caller);
261fdb_status wal_immediate_remove(fdb_txn *txn,
262                                struct filemgr *file,
263                                struct _fdb_key_cmp_info *cmp_info,
264                                fdb_doc *doc,
265                                uint64_t offset,
266                                wal_insert_by caller);
267fdb_status wal_find(fdb_txn *txn, struct filemgr *file,
268                    struct _fdb_key_cmp_info *cmp_info,
269                    struct snap_handle *shandle,
270                    fdb_doc *doc, uint64_t *offset);
271fdb_status wal_find_kv_id(fdb_txn *txn,
272                          struct filemgr *file,
273                          fdb_kvs_id_t kv_id,
274                          struct _fdb_key_cmp_info *cmp_info,
275                          struct snap_handle *shandle,
276                          fdb_doc *doc,
277                          uint64_t *offset);
278
279fdb_status wal_txn_migration(void *dbhandle,
280                             void *new_dhandle,
281                             struct filemgr *old_file,
282                             struct filemgr *new_file,
283                             wal_doc_move_func *move_doc);
284fdb_status wal_commit(fdb_txn *txn, struct filemgr *file, wal_commit_mark_func *func,
285                      err_log_callback *log_callback);
286fdb_status wal_release_flushed_items(struct filemgr *file,
287                                     union wal_flush_items *flush_items);
288
289/**
290 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
291 *
292 * @param file Pointer to the file manager
293 * @param dbhandle Pointer to the KV store handle
294 * @param flush_func Pointer of function that flushes each WAL entry into the
295 *                   main indexes
296 * @param get_old_offset Pointer of function that retrieves an offset of the
297 *                       old KV item from the hbtrie
298 * @param seq_purge_func Pointer of function that purges an old entry with the
299 *                       same key from the sequence tree
300 * @param delta_stats_func Pointer of function that updates each KV store's stats
301 * @param flush_items Pointer to the list that contains the list of all WAL entries
302 *                    that are flushed into the main indexes
303 * @return FDB_RESULT upon successful WAL flush
304 */
305fdb_status wal_flush(struct filemgr *file,
306                     void *dbhandle,
307                     wal_flush_func *flush_func,
308                     wal_get_old_offset_func *get_old_offset,
309                     wal_flush_seq_purge_func *seq_purge_func,
310                     wal_flush_kvs_delta_stats_func *delta_stats_func,
311                     union wal_flush_items *flush_items);
312
313/**
314 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
315 * by the compactor
316 *
317 * @param file Pointer to the file manager
318 * @param dbhandle Pointer to the KV store handle
319 * @param flush_func Pointer of function that flushes each WAL entry into the
320 *                   main indexes
321 * @param get_old_offset Pointer of function that retrieves an offset of the
322 *                       old KV item from the hbtrie
323 * @param seq_purge_func Pointer of function that purges an old entry with the
324 *                       same key from the sequence tree
325 * @param delta_stats_func Pointer of function that updates each KV store's stats
326 * @param flush_items Pointer to the list that contains the list of all WAL entries
327 *                    that are flushed into the main indexes
328 * @return FDB_RESULT upon successful WAL flush
329 */
330fdb_status wal_flush_by_compactor(struct filemgr *file,
331                                  void *dbhandle,
332                                  wal_flush_func *flush_func,
333                                  wal_get_old_offset_func *get_old_offset,
334                                  wal_flush_seq_purge_func *seq_purge_func,
335                                  wal_flush_kvs_delta_stats_func *delta_stats_func,
336                                  union wal_flush_items *flush_items);
337/**
338 * Create a WAL snapshot for a specific KV Store
339 * @param file - the underlying file for the database
340 * @param txn - transaction that the snapshot is to be taken on
341 * @param kv_id - KV Store ID whose snapshot is to be taken
342 * @param seqnum - The sequence number at which the snapshot is to be taken
343 * @param key_cmp_info - custom comparison function.
344 * @param shandle - WAL snapshot handle result
345 */
346fdb_status wal_snapshot_open(struct filemgr *file,
347                             fdb_txn *txn,
348                             fdb_kvs_id_t kv_id,
349                             fdb_seqnum_t seqnum,
350                             _fdb_key_cmp_info *key_cmp_info,
351                             struct snap_handle **shandle);
352/**
353 * Clone from an existing WAL snapshot
354 * @param shandle_in - incoming snapshot handle
355 * @param shandle_out - cloned snapshot handle out
356 * @param seqnum - The sequence number at which the snapshot is to be taken
357 */
358fdb_status wal_snapshot_clone(struct snap_handle *shandle_in,
359                              struct snap_handle **shandle_out,
360                              fdb_seqnum_t seqnum);
361/**
362 * Create a persisted (durable) WAL snapshot for a specific KV Store
363 * @param seqnum - the highest sequence number for this persisted snapshot.
364 * @param key_cmp_info - custom comparison function.
365 * @param file - the underlying file
366 * @param txn - the current active transaction at time of snapshot creation
367 * @param shandle - WAL snapshot handle result
368 */
369fdb_status wal_dur_snapshot_open(fdb_seqnum_t seqnum,
370                                 _fdb_key_cmp_info *key_cmp_info,
371                                 struct filemgr *file, fdb_txn *txn,
372                                 struct snap_handle **shandle);
373/**
374 * Create an exclusive Snapshot of the WAL by copying all entries to
375 * immutable AVL trees
376 * @param file - the underlying file
377 * @param shandle - WAL snapshot handle created by wal_dur_snapshot_open()
378 * @param is_multi_kv - Does the WAL have multiple KV Stores
379 */
380
381fdb_status wal_copyto_snapshot(struct filemgr *file,
382                               struct snap_handle *shandle,
383                               bool is_multi_kv);
384
385/**
386 * Closes a WAL snapshot
387 * @param shandle - the snapshot handle to be closed
388 * @param file - the underlying file for the database
389 */
390fdb_status wal_snapshot_close(struct snap_handle *shandle,
391                              struct filemgr *file);
392/**
393 * Retrieve the KV Store stats of this KV Store from the WAL Snapshot
394 * @param shandle - the WAL snapshot handle
395 * @param stat - (OUT) returned stat
396 */
397fdb_status snap_get_stat(struct snap_handle *shandle, struct kvs_stat *stat);
398
399/**
400 * Persisted snapshots opened from disk needs to have its own immutable tree
401 * This routine is used by _fdb_restore_wal() to load immutable items into it
402 * @param shandle - the WAL snapshot handle
403 * @param doc - the immutable document to be inserted into the snapshot
404 * @param offset - offset of the immutable doc
405 */
406fdb_status wal_snap_insert(struct snap_handle *shandle, fdb_doc *doc,
407                           uint64_t offset);
408/**
409 * Initialize a WAL iterator snapshot by creating a barrier to future writes
410 * @param file - underlying ForestDB database file shared by all KV Stores
411 * @param shandle - pointer to snap_handle created by snapshot_open
412 * @param by_key - is the iteration done by key or by sequence number
413 * @param wal_iterator - Return pointer to initialized wal_iterator
414 */
415fdb_status wal_itr_init(struct filemgr *file,
416                        struct snap_handle *shandle,
417                        bool by_key,
418                        struct wal_iterator **wal_iterator);
419/**
420 * Initialize the boundaries of the iteration by setting first element
421 * @param - wal_itr - the WAL iterator whose bounds need to be set
422 * @param - first_elem - could be key or seqnum, NULL means first in snapshot
423 */
424fdb_status wal_itr_set_first(struct wal_iterator *wal_itr,
425                             struct wal_item *first_elem);
426/**
427 * Initialize the boundaries of the iteration by setting last element
428 * @param - wal_itr - the WAL iterator whose bounds need to be set
429 * @param - last_elem - could be key or seqnum, NULL means last in snapshot
430 */
431fdb_status wal_itr_set_last(struct wal_iterator *wal_itr,
432                            struct wal_item *last_elem);
433/**
434 * Position the sharded WAL iterator to a key/seqnum greater than the query
435 * if the queried key/seqnum does not exist.
436 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
437 * @param wal_item - the avl pointer of the query wal_item.
438 */
439struct wal_item *wal_itr_search_greater(struct wal_iterator *wal_itr,
440                                        struct wal_item *query_item);
441/**
442 * Position the sharded WAL iterator to a key/seqnum smaller than the query
443 * if the queried key/seqnum does not exist.
444 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
445 * @param wal_item - the pointer of the query wal_item.
446 */
447struct wal_item *wal_itr_search_smaller(struct wal_iterator *wal_itr,
448                                        struct wal_item *query_item);
449/**
450 * Position the sharded WAL iterator to the next key/seqnum than current pos
451 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
452 */
453struct wal_item *wal_itr_next(struct wal_iterator *wal_itr);
454
455/**
456 * Position the sharded WAL iterator to the previous key/seqnum from current pos
457 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
458 */
459struct wal_item *wal_itr_prev(struct wal_iterator *wal_itr);
460/**
461 * Position the sharded WAL iterator to the first key/seqnum in KV Store snapshot
462 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
463 */
464struct wal_item *wal_itr_first(struct wal_iterator *wal_itr);
465/**
466 * Position the sharded WAL iterator to the last key/seqnum in KV Store snapshot
467 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
468 */
469struct wal_item *wal_itr_last(struct wal_iterator *wal_itr);
470/**
471 * Free memory associated with the wal iteration
472 * @param wal_itr - the wal iterator whose memory needs to be freed.
473 */
474fdb_status wal_itr_close(struct wal_iterator *wal_itr);
475
476fdb_status wal_discard(struct filemgr *file, fdb_txn *txn);
477fdb_status wal_close(struct filemgr *file, err_log_callback *log_callback);
478fdb_status wal_shutdown(struct filemgr *file, err_log_callback *log_callback);
479
480/**
481 * Free memory associated with wal data structures.
482 */
483fdb_status wal_destroy(struct filemgr *file);
484
485fdb_status wal_close_kv_ins(struct filemgr *file,
486                            fdb_kvs_id_t kv_id, err_log_callback *log_callback);
487
488size_t wal_get_size(struct filemgr *file);
489size_t wal_get_num_shards(struct filemgr *file);
490size_t wal_get_num_flushable(struct filemgr *file);
491size_t wal_get_num_docs(struct filemgr *file);
492size_t wal_get_num_deletes(struct filemgr *file);
493size_t wal_get_datasize(struct filemgr *file);
494
495/**
496 * Set the dirty status of the WAL
497 *
498 * @param file Pointer to the file manager instance
499 * @param status New dirty status to be set
500 * @param set_on_non_pending Flag indicating the status can be only overriden
501 *        if the current status is not in FDB_WAL_PENDING
502 */
503void wal_set_dirty_status(struct filemgr *file,
504                          wal_dirty_t status,
505                          bool set_on_non_pending = false);
506
507wal_dirty_t wal_get_dirty_status(struct filemgr *file);
508
509void wal_add_transaction(struct filemgr *file, fdb_txn *txn);
510void wal_remove_transaction(struct filemgr *file, fdb_txn *txn);
511fdb_txn * wal_earliest_txn(struct filemgr *file, fdb_txn *cur_txn);
512bool wal_txn_exists(struct filemgr *file);
513
514/**
515 * Get the memory overhead of the WAL index for a given file manager
516 * @param file the instance of a file manager whose WAL index memory overhead
517 *        should be returned
518 * @return the memory overhead of a given file manager's WAL index
519 */
520size_t wal_get_mem_overhead(struct filemgr *file);
521
522#ifdef __cplusplus
523}
524#endif
525
526#endif
527