xref: /4.6.0/forestdb/src/wal.h (revision 7e4d9806)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2010 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
18#ifndef _JSAHN_WAL_H
19#define _JSAHN_WAL_H
20
21#include <stdint.h>
22#include "internal_types.h"
23#include "hash.h"
24#include "list.h"
25#include "avltree.h"
26#include "atomic.h"
27#include "libforestdb/fdb_errors.h"
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33typedef uint8_t wal_item_action;
34enum{
35    WAL_ACT_INSERT,
36    WAL_ACT_LOGICAL_REMOVE, // An item is marked as "DELETED" by removing its doc body only.
37    WAL_ACT_REMOVE // An item (key, metadata, body) is removed immediately.
38};
39typedef int wal_insert_by;
40enum {
41    WAL_INS_WRITER = 0, // normal writer inserting
42    WAL_INS_COMPACT_PHASE1, // compactor in first phase moving unique docs
43    WAL_INS_COMPACT_PHASE2 // compactor in delta phase (catchup, uncommitted)
44};
45
46struct wal_item_header{
47    struct avl_node avl_key;
48    void *key;
49    uint16_t keylen;
50    uint8_t chunksize;
51    struct list items;
52};
53
54typedef uint64_t wal_snapid_t;
55#define OPEN_SNAPSHOT_TAG ((wal_snapid_t)(-1)) // any latest snapshot item
56struct snap_handle {
57    /**
58     * Link to the tree of snapshots indexed by (kvs_id, snap_id) pair.
59     */
60    struct avl_node avl_id;
61    /**
62     * Unique KV Store ID (keep as second member).
63     */
64    fdb_kvs_id_t id;
65    /**
66     * Back pointer to index into the global WAL snapshot array
67     */
68    wal_snapid_t snap_tag_idx;
69    /**
70     * Snapshot stop index that denotes a WAL flush
71     */
72    wal_snapid_t snap_stop_idx;
73    /**
74     * Incremented on snapshot_open, decremented on snapshot_close(Write Barrier)
75     * Reference count to avoid copy if same KV store WAL snapshot is cloned.
76     */
77    atomic_uint16_t ref_cnt_kvs;
78    /**
79     * Did wal_flush make me inaccessible to later snapshots, (Read-Write Barrier)
80     */
81    bool is_flushed;
82    /**
83     * Is this a persistent snapshot completely separate from WAL.
84     */
85    bool is_persisted_snapshot;
86    /**
87     * Number of WAL items put into this snapshot before it became immutable.
88     */
89    atomic_uint64_t wal_ndocs;
90    /**
91     * Highest sequence number seen in this KV store snapshot.
92     */
93    fdb_seqnum_t seqnum;
94    /**
95     * Transaction that the handle was in at the time of snapshot creation.
96     */
97    fdb_txn *snap_txn;
98    /**
99     * Global transaction pointer to distinguish from local transactions
100     * for partially committed items.
101     */
102    fdb_txn *global_txn;
103    /**
104     * Active transaction list to hide partially committed items whose
105     * transaction is still being ended.
106     */
107    struct list active_txn_list;
108    /**
109     * Local DB stats for cloned snapshots
110     */
111    struct kvs_stat stat;
112    /**
113     * Custom compare function context and callback set by user.
114     * TODO: Store original pointer on which snapshot was taken & use to clone
115     * index nodes as well!
116     */
117    struct _fdb_key_cmp_info cmp_info;
118    /**
119     * AVL tree to store unflushed WAL entries of a snapshot by key range
120     */
121    struct avl_tree key_tree;
122    /**
123     * AVL tree to store unflushed WAL entries of a snapshot by sequence number
124     */
125    struct avl_tree seq_tree;
126};
127
128#define WAL_ITEM_COMMITTED (0x01)
129#define WAL_ITEM_FLUSH_READY (0x02)
130#define WAL_ITEM_MULTI_KV_INS_MODE (0x04)
131#define WAL_ITEM_FLUSHED_OUT (0x08)
132struct wal_item{
133    struct list_elem list_elem; // for wal_item_header's 'items'
134    struct avl_node avl_seq; // used for indexing by sequence number
135    struct wal_item_header *header;
136    fdb_txn *txn;
137    uint64_t txn_id; // used to track closed transactions
138    struct snap_handle *shandle; // Pointer into wal_snapshot_tree for KV Store
139    wal_item_action action;
140    atomic_uint8_t flag;
141    uint32_t doc_size;
142    uint64_t offset;
143    fdb_seqnum_t seqnum;
144    uint64_t old_offset;
145    union { // for offset-based sorting for WAL flush
146        struct list_elem list_elem_txn; // for transaction
147        struct avl_node avl_flush;
148        struct list_elem list_elem_flush;
149        struct avl_node avl_keysnap; // for durable snapshot unique key lookup
150    };
151};
152
153typedef fdb_status wal_flush_func(void *dbhandle, struct wal_item *item,
154                                  struct avl_tree *stale_seqnum_list,
155                                  struct avl_tree *kvs_delta_stats);
156
157/**
158 * Pointer of function that purges stale entries from the sequence tree
159 * as part of WAL flush.
160 */
161typedef void wal_flush_seq_purge_func(void *dbhandle,
162                                      struct avl_tree *stale_seqnum_list,
163                                      struct avl_tree *kvs_delta_stats);
164
165/**
166 * Pointer of function that updates a KV store stats for each WAL flush
167 */
168typedef void wal_flush_kvs_delta_stats_func(struct filemgr *file,
169                                            avl_tree *kvs_delta_stats);
170
171typedef fdb_status wal_snapshot_func(void *shandle, fdb_doc *doc,
172                                     uint64_t offset);
173typedef uint64_t wal_get_old_offset_func(void *dbhandle,
174                                         struct wal_item *item);
175typedef int64_t wal_doc_move_func(void *dbhandle,
176                                  void *new_dhandle,
177                                  struct wal_item *item,
178                                  fdb_doc *doc);
179typedef fdb_status wal_commit_mark_func(void *dbhandle,
180                                        uint64_t offset);
181
182#define WAL_FLAG_INITIALIZED 0x1
183
184
185typedef uint8_t wal_dirty_t;
186enum {
187    FDB_WAL_CLEAN = 0,
188    FDB_WAL_DIRTY = 1,
189    FDB_WAL_PENDING = 2
190};
191
192struct wal_shard {
193    struct avl_tree _map;
194    spin_t lock;
195};
196
197struct wal {
198    uint8_t flag;
199    atomic_uint8_t isPopulated; // Set when WAL is first populated OR restored
200    atomic_uint32_t size; // total # entries in WAL (uint32_t)
201    atomic_uint32_t num_flushable; // # flushable entries in WAL (uint32_t)
202    atomic_uint64_t datasize; // total data size in WAL (uint64_t)
203    atomic_uint64_t mem_overhead; // memory overhead of all WAL entries
204    struct list txn_list; // list of active transactions
205    wal_dirty_t wal_dirty;
206    // tree of all 'wal_item_header' (keys) in shard
207    struct wal_shard *key_shards;
208    // indexes 'wal_item's seq num in WAL shard
209    struct wal_shard *seq_shards;
210    size_t num_shards;
211    // Global shared WAL Snapshot Data
212    struct avl_tree wal_snapshot_tree;
213    spin_t lock;
214};
215
216struct wal_cursor {
217    struct avl_node avl_merge; // avl node for merge sort across all shards
218    struct wal_item *item; // pointer to the shared WAL snapshot item
219    struct wal_item *first_item; // first key/seqnum item in snapshot of shard
220    struct wal_item *last_item; // last key/seqnum item in snapshot of shard
221};
222
223struct wal_iterator {
224    struct wal *_wal; // Pointer to global WAL
225    struct snap_handle *shandle; // Pointer to KVS snapshot handle.
226    struct wal_shard *map_shards; // pointer to the shared WAL key/seq shards
227    size_t num_shards; // number of shards in the global shared WAL key/seq
228    bool by_key; // if not set means iteration is by sequence number range
229    bool multi_kvs; // single kv mode vs multi kv instance mode
230    uint8_t direction; // forward/backward/none to avoid grabbing all locks
231    struct avl_tree merge_tree; // AVL tree to perform merge-sort over cursors
232    struct avl_node *cursor_pos; // points to shard that returns current item
233    struct wal_item *item_prev; // points to previous iterator item returned
234    struct wal_cursor *cursors; // cursor to item from each shard's tree
235};
236
237struct wal_txn_wrapper {
238    struct list_elem le;
239    union {
240        fdb_txn *txn; // when used in wal's transaction list
241        uint64_t txn_id; // when used in snapshot's active_txn_list
242    };
243};
244
245union wal_flush_items {
246    struct avl_tree tree; // if WAL items are to be sorted by offset
247    struct list list; // if WAL items need not be sorted
248};
249
250
251fdb_status wal_init(struct filemgr *file, int nbucket);
252int wal_is_initialized(struct filemgr *file);
253bool wal_try_restore(struct filemgr *file);
254fdb_status wal_insert(fdb_txn *txn,
255                      struct filemgr *file,
256                      struct _fdb_key_cmp_info *cmp_info,
257                      fdb_doc *doc,
258                      uint64_t offset,
259                      wal_insert_by caller);
260fdb_status wal_immediate_remove(fdb_txn *txn,
261                                struct filemgr *file,
262                                struct _fdb_key_cmp_info *cmp_info,
263                                fdb_doc *doc,
264                                uint64_t offset,
265                                wal_insert_by caller);
266fdb_status wal_find(fdb_txn *txn, struct filemgr *file,
267                    struct _fdb_key_cmp_info *cmp_info,
268                    struct snap_handle *shandle,
269                    fdb_doc *doc, uint64_t *offset);
270fdb_status wal_find_kv_id(fdb_txn *txn,
271                          struct filemgr *file,
272                          fdb_kvs_id_t kv_id,
273                          struct _fdb_key_cmp_info *cmp_info,
274                          struct snap_handle *shandle,
275                          fdb_doc *doc,
276                          uint64_t *offset);
277
278fdb_status wal_txn_migration(void *dbhandle,
279                             void *new_dhandle,
280                             struct filemgr *old_file,
281                             struct filemgr *new_file,
282                             wal_doc_move_func *move_doc);
283fdb_status wal_commit(fdb_txn *txn, struct filemgr *file, wal_commit_mark_func *func,
284                      err_log_callback *log_callback);
285fdb_status wal_release_flushed_items(struct filemgr *file,
286                                     union wal_flush_items *flush_items);
287
288/**
289 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
290 *
291 * @param file Pointer to the file manager
292 * @param dbhandle Pointer to the KV store handle
293 * @param flush_func Pointer of function that flushes each WAL entry into the
294 *                   main indexes
295 * @param get_old_offset Pointer of function that retrieves an offset of the
296 *                       old KV item from the hbtrie
297 * @param seq_purge_func Pointer of function that purges an old entry with the
298 *                       same key from the sequence tree
299 * @param delta_stats_func Pointer of function that updates each KV store's stats
300 * @param flush_items Pointer to the list that contains the list of all WAL entries
301 *                    that are flushed into the main indexes
302 * @return FDB_RESULT upon successful WAL flush
303 */
304fdb_status wal_flush(struct filemgr *file,
305                     void *dbhandle,
306                     wal_flush_func *flush_func,
307                     wal_get_old_offset_func *get_old_offset,
308                     wal_flush_seq_purge_func *seq_purge_func,
309                     wal_flush_kvs_delta_stats_func *delta_stats_func,
310                     union wal_flush_items *flush_items);
311
312/**
313 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
314 * by the compactor
315 *
316 * @param file Pointer to the file manager
317 * @param dbhandle Pointer to the KV store handle
318 * @param flush_func Pointer of function that flushes each WAL entry into the
319 *                   main indexes
320 * @param get_old_offset Pointer of function that retrieves an offset of the
321 *                       old KV item from the hbtrie
322 * @param seq_purge_func Pointer of function that purges an old entry with the
323 *                       same key from the sequence tree
324 * @param delta_stats_func Pointer of function that updates each KV store's stats
325 * @param flush_items Pointer to the list that contains the list of all WAL entries
326 *                    that are flushed into the main indexes
327 * @return FDB_RESULT upon successful WAL flush
328 */
329fdb_status wal_flush_by_compactor(struct filemgr *file,
330                                  void *dbhandle,
331                                  wal_flush_func *flush_func,
332                                  wal_get_old_offset_func *get_old_offset,
333                                  wal_flush_seq_purge_func *seq_purge_func,
334                                  wal_flush_kvs_delta_stats_func *delta_stats_func,
335                                  union wal_flush_items *flush_items);
336/**
337 * Create a WAL snapshot for a specific KV Store
338 * @param file - the underlying file for the database
339 * @param txn - transaction that the snapshot is to be taken on
340 * @param kv_id - KV Store ID whose snapshot is to be taken
341 * @param seqnum - The sequence number at which the snapshot is to be taken
342 * @param key_cmp_info - custom comparison function.
343 * @param shandle - WAL snapshot handle result
344 */
345fdb_status wal_snapshot_open(struct filemgr *file,
346                             fdb_txn *txn,
347                             fdb_kvs_id_t kv_id,
348                             fdb_seqnum_t seqnum,
349                             _fdb_key_cmp_info *key_cmp_info,
350                             struct snap_handle **shandle);
351/**
352 * Clone from an existing WAL snapshot
353 * @param shandle_in - incoming snapshot handle
354 * @param shandle_out - cloned snapshot handle out
355 * @param seqnum - The sequence number at which the snapshot is to be taken
356 */
357fdb_status wal_snapshot_clone(struct snap_handle *shandle_in,
358                              struct snap_handle **shandle_out,
359                              fdb_seqnum_t seqnum);
360/**
361 * Create a persisted (durable) WAL snapshot for a specific KV Store
362 * @param seqnum - the highest sequence number for this persisted snapshot.
363 * @param key_cmp_info - custom comparison function.
364 * @param file - the underlying file
365 * @param txn - the current active transaction at time of snapshot creation
366 * @param shandle - WAL snapshot handle result
367 */
368fdb_status wal_dur_snapshot_open(fdb_seqnum_t seqnum,
369                                 _fdb_key_cmp_info *key_cmp_info,
370                                 struct filemgr *file, fdb_txn *txn,
371                                 struct snap_handle **shandle);
372/**
373 * Create an exclusive Snapshot of the WAL by copying all entries to
374 * immutable AVL trees
375 * @param file - the underlying file
376 * @param shandle - WAL snapshot handle created by wal_dur_snapshot_open()
377 * @param is_multi_kv - Does the WAL have multiple KV Stores
378 */
379
380fdb_status wal_copyto_snapshot(struct filemgr *file,
381                               struct snap_handle *shandle,
382                               bool is_multi_kv);
383
384/**
385 * Closes a WAL snapshot
386 * @param shandle - the snapshot handle to be closed
387 * @param file - the underlying file for the database
388 */
389fdb_status wal_snapshot_close(struct snap_handle *shandle,
390                              struct filemgr *file);
391/**
392 * Retrieve the KV Store stats of this KV Store from the WAL Snapshot
393 * @param shandle - the WAL snapshot handle
394 * @param stat - (OUT) returned stat
395 */
396fdb_status snap_get_stat(struct snap_handle *shandle, struct kvs_stat *stat);
397
398/**
399 * Persisted snapshots opened from disk needs to have its own immutable tree
400 * This routine is used by _fdb_restore_wal() to load immutable items into it
401 * @param shandle - the WAL snapshot handle
402 * @param doc - the immutable document to be inserted into the snapshot
403 * @param offset - offset of the immutable doc
404 */
405fdb_status wal_snap_insert(struct snap_handle *shandle, fdb_doc *doc,
406                           uint64_t offset);
407/**
408 * Initialize a WAL iterator snapshot by creating a barrier to future writes
409 * @param file - underlying ForestDB database file shared by all KV Stores
410 * @param shandle - pointer to snap_handle created by snapshot_open
411 * @param by_key - is the iteration done by key or by sequence number
412 * @param wal_iterator - Return pointer to initialized wal_iterator
413 */
414fdb_status wal_itr_init(struct filemgr *file,
415                        struct snap_handle *shandle,
416                        bool by_key,
417                        struct wal_iterator **wal_iterator);
418/**
419 * Initialize the boundaries of the iteration by setting first element
420 * @param - wal_itr - the WAL iterator whose bounds need to be set
421 * @param - first_elem - could be key or seqnum, NULL means first in snapshot
422 */
423fdb_status wal_itr_set_first(struct wal_iterator *wal_itr,
424                             struct wal_item *first_elem);
425/**
426 * Initialize the boundaries of the iteration by setting last element
427 * @param - wal_itr - the WAL iterator whose bounds need to be set
428 * @param - last_elem - could be key or seqnum, NULL means last in snapshot
429 */
430fdb_status wal_itr_set_last(struct wal_iterator *wal_itr,
431                            struct wal_item *last_elem);
432/**
433 * Position the sharded WAL iterator to a key/seqnum greater than the query
434 * if the queried key/seqnum does not exist.
435 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
436 * @param wal_item - the avl pointer of the query wal_item.
437 */
438struct wal_item *wal_itr_search_greater(struct wal_iterator *wal_itr,
439                                        struct wal_item *query_item);
440/**
441 * Position the sharded WAL iterator to a key/seqnum smaller than the query
442 * if the queried key/seqnum does not exist.
443 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
444 * @param wal_item - the pointer of the query wal_item.
445 */
446struct wal_item *wal_itr_search_smaller(struct wal_iterator *wal_itr,
447                                        struct wal_item *query_item);
448/**
449 * Position the sharded WAL iterator to the next key/seqnum than current pos
450 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
451 */
452struct wal_item *wal_itr_next(struct wal_iterator *wal_itr);
453
454/**
455 * Position the sharded WAL iterator to the previous key/seqnum from current pos
456 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
457 */
458struct wal_item *wal_itr_prev(struct wal_iterator *wal_itr);
459/**
460 * Position the sharded WAL iterator to the first key/seqnum in KV Store snapshot
461 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
462 */
463struct wal_item *wal_itr_first(struct wal_iterator *wal_itr);
464/**
465 * Position the sharded WAL iterator to the last key/seqnum in KV Store snapshot
466 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
467 */
468struct wal_item *wal_itr_last(struct wal_iterator *wal_itr);
469/**
470 * Free memory associated with the wal iteration
471 * @param wal_itr - the wal iterator whose memory needs to be freed.
472 */
473fdb_status wal_itr_close(struct wal_iterator *wal_itr);
474
475fdb_status wal_discard(struct filemgr *file, fdb_txn *txn);
476fdb_status wal_close(struct filemgr *file, err_log_callback *log_callback);
477fdb_status wal_shutdown(struct filemgr *file, err_log_callback *log_callback);
478
479/**
480 * Free memory associated with wal data structures.
481 */
482fdb_status wal_destroy(struct filemgr *file);
483
484fdb_status wal_close_kv_ins(struct filemgr *file,
485                            fdb_kvs_id_t kv_id, err_log_callback *log_callback);
486
487size_t wal_get_size(struct filemgr *file);
488size_t wal_get_num_shards(struct filemgr *file);
489size_t wal_get_num_flushable(struct filemgr *file);
490size_t wal_get_num_docs(struct filemgr *file);
491size_t wal_get_num_deletes(struct filemgr *file);
492size_t wal_get_datasize(struct filemgr *file);
493
494/**
495 * Set the dirty status of the WAL
496 *
497 * @param file Pointer to the file manager instance
498 * @param status New dirty status to be set
499 * @param set_on_non_pending Flag indicating the status can be only overriden
500 *        if the current status is not in FDB_WAL_PENDING
501 */
502void wal_set_dirty_status(struct filemgr *file,
503                          wal_dirty_t status,
504                          bool set_on_non_pending = false);
505
506wal_dirty_t wal_get_dirty_status(struct filemgr *file);
507
508void wal_add_transaction(struct filemgr *file, fdb_txn *txn);
509void wal_remove_transaction(struct filemgr *file, fdb_txn *txn);
510fdb_txn * wal_earliest_txn(struct filemgr *file, fdb_txn *cur_txn);
511bool wal_txn_exists(struct filemgr *file);
512
513/**
514 * Get the memory overhead of the WAL index for a given file manager
515 * @param file the instance of a file manager whose WAL index memory overhead
516 *        should be returned
517 * @return the memory overhead of a given file manager's WAL index
518 */
519size_t wal_get_mem_overhead(struct filemgr *file);
520
521#ifdef __cplusplus
522}
523#endif
524
525#endif
526