xref: /4.6.0/forestdb/src/wal.h (revision 5f874bbf)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2010 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
18#ifndef _JSAHN_WAL_H
19#define _JSAHN_WAL_H
20
21#include <stdint.h>
22#include "internal_types.h"
23#include "hash.h"
24#include "list.h"
25#include "avltree.h"
26#include "atomic.h"
27#include "libforestdb/fdb_errors.h"
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33typedef uint8_t wal_item_action;
34enum{
35    WAL_ACT_INSERT,
36    WAL_ACT_LOGICAL_REMOVE, // An item is marked as "DELETED" by removing its doc body only.
37    WAL_ACT_REMOVE // An item (key, metadata, body) is removed immediately.
38};
39typedef int wal_insert_by;
40enum {
41    WAL_INS_WRITER = 0, // normal writer inserting
42    WAL_INS_COMPACT_PHASE1, // compactor in first phase moving unique docs
43    WAL_INS_COMPACT_PHASE2 // compactor in delta phase (catchup, uncommitted)
44};
45
46struct wal_item_header{
47    struct avl_node avl_key;
48    void *key;
49    uint16_t keylen;
50    uint8_t chunksize;
51    struct list items;
52};
53
54typedef uint64_t wal_snapid_t;
55#define OPEN_SNAPSHOT_TAG ((wal_snapid_t)(-1)) // any latest snapshot item
56struct snap_handle {
57    /**
58     * Link to the tree of snapshots indexed by (kvs_id, snap_id) pair.
59     */
60    struct avl_node avl_id;
61    /**
62     * Unique KV Store ID (keep as second member).
63     */
64    fdb_kvs_id_t id;
65    /**
66     * Back pointer to index into the global WAL snapshot array
67     */
68    wal_snapid_t snap_tag_idx;
69    /**
70     * Snapshot stop index that denotes a WAL flush
71     */
72    wal_snapid_t snap_stop_idx;
73    /**
74     * Incremented on snapshot_open, decremented on snapshot_close(Write Barrier)
75     * Reference count to avoid copy if same KV store WAL snapshot is cloned.
76     */
77    atomic_uint16_t ref_cnt_kvs;
78    /**
79     * Did wal_flush make me inaccessible to later snapshots, (Read-Write Barrier)
80     */
81    bool is_flushed;
82    /**
83     * Is this a persistent snapshot completely separate from WAL.
84     */
85    bool is_persisted_snapshot;
86    /**
87     * Number of WAL items put into this snapshot before it became immutable.
88     */
89    atomic_uint64_t wal_ndocs;
90    /**
91     * Highest sequence number seen in this KV store snapshot.
92     */
93    fdb_seqnum_t seqnum;
94    /**
95     * Transaction that the handle was in at the time of snapshot creation.
96     */
97    fdb_txn *snap_txn;
98    /**
99     * Global transaction pointer to distinguish from local transactions
100     * for partially committed items.
101     */
102    fdb_txn *global_txn;
103    /**
104     * Active transaction list to hide partially committed items whose
105     * transaction is still being ended.
106     */
107    struct list active_txn_list;
108    /**
109     * Local DB stats for cloned snapshots
110     */
111    struct kvs_stat stat;
112    /**
113     * Custom compare function context and callback set by user.
114     * TODO: Store original pointer on which snapshot was taken & use to clone
115     * index nodes as well!
116     */
117    struct _fdb_key_cmp_info cmp_info;
118    /**
119     * AVL tree to store unflushed WAL entries of a snapshot by key range
120     */
121    struct avl_tree key_tree;
122    /**
123     * AVL tree to store unflushed WAL entries of a snapshot by sequence number
124     */
125    struct avl_tree seq_tree;
126};
127
128#define WAL_ITEM_COMMITTED (0x01)
129#define WAL_ITEM_FLUSH_READY (0x02)
130#define WAL_ITEM_MULTI_KV_INS_MODE (0x04)
131#define WAL_ITEM_FLUSHED_OUT (0x08)
132struct wal_item{
133    struct list_elem list_elem; // for wal_item_header's 'items'
134    struct avl_node avl_seq; // used for indexing by sequence number
135    struct wal_item_header *header;
136    fdb_txn *txn;
137    uint64_t txn_id; // used to track closed transactions
138    struct snap_handle *shandle; // Pointer into wal_snapshot_tree for KV Store
139    wal_item_action action;
140    atomic_uint8_t flag;
141    uint32_t doc_size;
142    uint64_t offset;
143    fdb_seqnum_t seqnum;
144    uint64_t old_offset;
145    union { // for offset-based sorting for WAL flush
146        struct list_elem list_elem_txn; // for transaction
147        struct avl_node avl_flush;
148        struct list_elem list_elem_flush;
149        struct avl_node avl_keysnap; // for durable snapshot unique key lookup
150    };
151};
152
153typedef fdb_status wal_flush_func(void *dbhandle, struct wal_item *item,
154                                  struct avl_tree *stale_seqnum_list,
155                                  struct avl_tree *kvs_delta_stats);
156
157/**
158 * Pointer of function that purges stale entries from the sequence tree
159 * as part of WAL flush.
160 */
161typedef void wal_flush_seq_purge_func(void *dbhandle,
162                                      struct avl_tree *stale_seqnum_list,
163                                      struct avl_tree *kvs_delta_stats);
164
165/**
166 * Pointer of function that updates a KV store stats for each WAL flush
167 */
168typedef void wal_flush_kvs_delta_stats_func(struct filemgr *file,
169                                            avl_tree *kvs_delta_stats);
170
171typedef fdb_status wal_snapshot_func(void *shandle, fdb_doc *doc,
172                                     uint64_t offset);
173typedef uint64_t wal_get_old_offset_func(void *dbhandle,
174                                         struct wal_item *item);
175typedef int64_t wal_doc_move_func(void *dbhandle,
176                                  void *new_dhandle,
177                                  struct wal_item *item,
178                                  fdb_doc *doc);
179typedef fdb_status wal_commit_mark_func(void *dbhandle,
180                                        uint64_t offset);
181
182#define WAL_FLAG_INITIALIZED 0x1
183
184
185typedef uint8_t wal_dirty_t;
186enum {
187    FDB_WAL_CLEAN = 0,
188    FDB_WAL_DIRTY = 1,
189    FDB_WAL_PENDING = 2
190};
191
192struct wal_shard {
193    struct avl_tree _map;
194    spin_t lock;
195};
196
197struct wal {
198    uint8_t flag;
199    atomic_uint32_t size; // total # entries in WAL (uint32_t)
200    atomic_uint32_t num_flushable; // # flushable entries in WAL (uint32_t)
201    atomic_uint64_t datasize; // total data size in WAL (uint64_t)
202    atomic_uint64_t mem_overhead; // memory overhead of all WAL entries
203    struct list txn_list; // list of active transactions
204    wal_dirty_t wal_dirty;
205    // tree of all 'wal_item_header' (keys) in shard
206    struct wal_shard *key_shards;
207    // indexes 'wal_item's seq num in WAL shard
208    struct wal_shard *seq_shards;
209    size_t num_shards;
210    // Global shared WAL Snapshot Data
211    struct avl_tree wal_snapshot_tree;
212    spin_t lock;
213};
214
215struct wal_cursor {
216    struct avl_node avl_merge; // avl node for merge sort across all shards
217    struct wal_item *item; // pointer to the shared WAL snapshot item
218    struct wal_item *first_item; // first key/seqnum item in snapshot of shard
219    struct wal_item *last_item; // last key/seqnum item in snapshot of shard
220};
221
222struct wal_iterator {
223    struct wal *_wal; // Pointer to global WAL
224    struct snap_handle *shandle; // Pointer to KVS snapshot handle.
225    struct wal_shard *map_shards; // pointer to the shared WAL key/seq shards
226    size_t num_shards; // number of shards in the global shared WAL key/seq
227    bool by_key; // if not set means iteration is by sequence number range
228    bool multi_kvs; // single kv mode vs multi kv instance mode
229    uint8_t direction; // forward/backward/none to avoid grabbing all locks
230    struct avl_tree merge_tree; // AVL tree to perform merge-sort over cursors
231    struct avl_node *cursor_pos; // points to shard that returns current item
232    struct wal_item *item_prev; // points to previous iterator item returned
233    struct wal_cursor *cursors; // cursor to item from each shard's tree
234};
235
236struct wal_txn_wrapper {
237    struct list_elem le;
238    union {
239        fdb_txn *txn; // when used in wal's transaction list
240        uint64_t txn_id; // when used in snapshot's active_txn_list
241    };
242};
243
244union wal_flush_items {
245    struct avl_tree tree; // if WAL items are to be sorted by offset
246    struct list list; // if WAL items need not be sorted
247};
248
249
250fdb_status wal_init(struct filemgr *file, int nbucket);
251int wal_is_initialized(struct filemgr *file);
252fdb_status wal_insert(fdb_txn *txn,
253                      struct filemgr *file,
254                      struct _fdb_key_cmp_info *cmp_info,
255                      fdb_doc *doc,
256                      uint64_t offset,
257                      wal_insert_by caller);
258fdb_status wal_immediate_remove(fdb_txn *txn,
259                                struct filemgr *file,
260                                struct _fdb_key_cmp_info *cmp_info,
261                                fdb_doc *doc,
262                                uint64_t offset,
263                                wal_insert_by caller);
264fdb_status wal_find(fdb_txn *txn, struct filemgr *file,
265                    struct _fdb_key_cmp_info *cmp_info,
266                    struct snap_handle *shandle,
267                    fdb_doc *doc, uint64_t *offset);
268fdb_status wal_find_kv_id(fdb_txn *txn,
269                          struct filemgr *file,
270                          fdb_kvs_id_t kv_id,
271                          struct _fdb_key_cmp_info *cmp_info,
272                          struct snap_handle *shandle,
273                          fdb_doc *doc,
274                          uint64_t *offset);
275
276fdb_status wal_txn_migration(void *dbhandle,
277                             void *new_dhandle,
278                             struct filemgr *old_file,
279                             struct filemgr *new_file,
280                             wal_doc_move_func *move_doc);
281fdb_status wal_commit(fdb_txn *txn, struct filemgr *file, wal_commit_mark_func *func,
282                      err_log_callback *log_callback);
283fdb_status wal_release_flushed_items(struct filemgr *file,
284                                     union wal_flush_items *flush_items);
285
286/**
287 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
288 *
289 * @param file Pointer to the file manager
290 * @param dbhandle Pointer to the KV store handle
291 * @param flush_func Pointer of function that flushes each WAL entry into the
292 *                   main indexes
293 * @param get_old_offset Pointer of function that retrieves an offset of the
294 *                       old KV item from the hbtrie
295 * @param seq_purge_func Pointer of function that purges an old entry with the
296 *                       same key from the sequence tree
297 * @param delta_stats_func Pointer of function that updates each KV store's stats
298 * @param flush_items Pointer to the list that contains the list of all WAL entries
299 *                    that are flushed into the main indexes
300 * @return FDB_RESULT upon successful WAL flush
301 */
302fdb_status wal_flush(struct filemgr *file,
303                     void *dbhandle,
304                     wal_flush_func *flush_func,
305                     wal_get_old_offset_func *get_old_offset,
306                     wal_flush_seq_purge_func *seq_purge_func,
307                     wal_flush_kvs_delta_stats_func *delta_stats_func,
308                     union wal_flush_items *flush_items);
309
310/**
311 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
312 * by the compactor
313 *
314 * @param file Pointer to the file manager
315 * @param dbhandle Pointer to the KV store handle
316 * @param flush_func Pointer of function that flushes each WAL entry into the
317 *                   main indexes
318 * @param get_old_offset Pointer of function that retrieves an offset of the
319 *                       old KV item from the hbtrie
320 * @param seq_purge_func Pointer of function that purges an old entry with the
321 *                       same key from the sequence tree
322 * @param delta_stats_func Pointer of function that updates each KV store's stats
323 * @param flush_items Pointer to the list that contains the list of all WAL entries
324 *                    that are flushed into the main indexes
325 * @return FDB_RESULT upon successful WAL flush
326 */
327fdb_status wal_flush_by_compactor(struct filemgr *file,
328                                  void *dbhandle,
329                                  wal_flush_func *flush_func,
330                                  wal_get_old_offset_func *get_old_offset,
331                                  wal_flush_seq_purge_func *seq_purge_func,
332                                  wal_flush_kvs_delta_stats_func *delta_stats_func,
333                                  union wal_flush_items *flush_items);
334/**
335 * Create a WAL snapshot for a specific KV Store
336 * @param file - the underlying file for the database
337 * @param txn - transaction that the snapshot is to be taken on
338 * @param kv_id - KV Store ID whose snapshot is to be taken
339 * @param seqnum - The sequence number at which the snapshot is to be taken
340 * @param key_cmp_info - custom comparison function.
341 * @param shandle - WAL snapshot handle result
342 */
343fdb_status wal_snapshot_open(struct filemgr *file,
344                             fdb_txn *txn,
345                             fdb_kvs_id_t kv_id,
346                             fdb_seqnum_t seqnum,
347                             _fdb_key_cmp_info *key_cmp_info,
348                             struct snap_handle **shandle);
349/**
350 * Clone from an existing WAL snapshot
351 * @param shandle_in - incoming snapshot handle
352 * @param shandle_out - cloned snapshot handle out
353 * @param seqnum - The sequence number at which the snapshot is to be taken
354 */
355fdb_status wal_snapshot_clone(struct snap_handle *shandle_in,
356                              struct snap_handle **shandle_out,
357                              fdb_seqnum_t seqnum);
358/**
359 * Create a persisted (durable) WAL snapshot for a specific KV Store
360 * @param seqnum - the highest sequence number for this persisted snapshot.
361 * @param key_cmp_info - custom comparison function.
362 * @param file - the underlying file
363 * @param txn - the current active transaction at time of snapshot creation
364 * @param shandle - WAL snapshot handle result
365 */
366fdb_status wal_dur_snapshot_open(fdb_seqnum_t seqnum,
367                                 _fdb_key_cmp_info *key_cmp_info,
368                                 struct filemgr *file, fdb_txn *txn,
369                                 struct snap_handle **shandle);
370/**
371 * Create an exclusive Snapshot of the WAL by copying all entries to
372 * immutable AVL trees
373 * @param file - the underlying file
374 * @param shandle - WAL snapshot handle created by wal_dur_snapshot_open()
375 * @param is_multi_kv - Does the WAL have multiple KV Stores
376 */
377
378fdb_status wal_copyto_snapshot(struct filemgr *file,
379                               struct snap_handle *shandle,
380                               bool is_multi_kv);
381
382/**
383 * Closes a WAL snapshot
384 * @param shandle - the snapshot handle to be closed
385 * @param file - the underlying file for the database
386 */
387fdb_status wal_snapshot_close(struct snap_handle *shandle,
388                              struct filemgr *file);
389/**
390 * Retrieve the KV Store stats of this KV Store from the WAL Snapshot
391 * @param shandle - the WAL snapshot handle
392 * @param stat - (OUT) returned stat
393 */
394fdb_status snap_get_stat(struct snap_handle *shandle, struct kvs_stat *stat);
395
396/**
397 * Persisted snapshots opened from disk needs to have its own immutable tree
398 * This routine is used by _fdb_restore_wal() to load immutable items into it
399 * @param shandle - the WAL snapshot handle
400 * @param doc - the immutable document to be inserted into the snapshot
401 * @param offset - offset of the immutable doc
402 */
403fdb_status wal_snap_insert(struct snap_handle *shandle, fdb_doc *doc,
404                           uint64_t offset);
405/**
406 * Initialize a WAL iterator snapshot by creating a barrier to future writes
407 * @param file - underlying ForestDB database file shared by all KV Stores
408 * @param shandle - pointer to snap_handle created by snapshot_open
409 * @param by_key - is the iteration done by key or by sequence number
410 * @param wal_iterator - Return pointer to initialized wal_iterator
411 */
412fdb_status wal_itr_init(struct filemgr *file,
413                        struct snap_handle *shandle,
414                        bool by_key,
415                        struct wal_iterator **wal_iterator);
416/**
417 * Initialize the boundaries of the iteration by setting first element
418 * @param - wal_itr - the WAL iterator whose bounds need to be set
419 * @param - first_elem - could be key or seqnum, NULL means first in snapshot
420 */
421fdb_status wal_itr_set_first(struct wal_iterator *wal_itr,
422                             struct wal_item *first_elem);
423/**
424 * Initialize the boundaries of the iteration by setting last element
425 * @param - wal_itr - the WAL iterator whose bounds need to be set
426 * @param - last_elem - could be key or seqnum, NULL means last in snapshot
427 */
428fdb_status wal_itr_set_last(struct wal_iterator *wal_itr,
429                            struct wal_item *last_elem);
430/**
431 * Position the sharded WAL iterator to a key/seqnum greater than the query
432 * if the queried key/seqnum does not exist.
433 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
434 * @param wal_item - the avl pointer of the query wal_item.
435 */
436struct wal_item *wal_itr_search_greater(struct wal_iterator *wal_itr,
437                                        struct wal_item *query_item);
438/**
439 * Position the sharded WAL iterator to a key/seqnum smaller than the query
440 * if the queried key/seqnum does not exist.
441 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
442 * @param wal_item - the pointer of the query wal_item.
443 */
444struct wal_item *wal_itr_search_smaller(struct wal_iterator *wal_itr,
445                                        struct wal_item *query_item);
446/**
447 * Position the sharded WAL iterator to the next key/seqnum than current pos
448 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
449 */
450struct wal_item *wal_itr_next(struct wal_iterator *wal_itr);
451
452/**
453 * Position the sharded WAL iterator to the previous key/seqnum from current pos
454 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
455 */
456struct wal_item *wal_itr_prev(struct wal_iterator *wal_itr);
457/**
458 * Position the sharded WAL iterator to the first key/seqnum in KV Store snapshot
459 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
460 */
461struct wal_item *wal_itr_first(struct wal_iterator *wal_itr);
462/**
463 * Position the sharded WAL iterator to the last key/seqnum in KV Store snapshot
464 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
465 */
466struct wal_item *wal_itr_last(struct wal_iterator *wal_itr);
467/**
468 * Free memory associated with the wal iteration
469 * @param wal_itr - the wal iterator whose memory needs to be freed.
470 */
471fdb_status wal_itr_close(struct wal_iterator *wal_itr);
472
473fdb_status wal_discard(struct filemgr *file, fdb_txn *txn);
474fdb_status wal_close(struct filemgr *file, err_log_callback *log_callback);
475fdb_status wal_shutdown(struct filemgr *file, err_log_callback *log_callback);
476
477/**
478 * Free memory associated with wal data structures.
479 */
480fdb_status wal_destroy(struct filemgr *file);
481
482fdb_status wal_close_kv_ins(struct filemgr *file,
483                            fdb_kvs_id_t kv_id, err_log_callback *log_callback);
484
485size_t wal_get_size(struct filemgr *file);
486size_t wal_get_num_shards(struct filemgr *file);
487size_t wal_get_num_flushable(struct filemgr *file);
488size_t wal_get_num_docs(struct filemgr *file);
489size_t wal_get_num_deletes(struct filemgr *file);
490size_t wal_get_datasize(struct filemgr *file);
491
492/**
493 * Set the dirty status of the WAL
494 *
495 * @param file Pointer to the file manager instance
496 * @param status New dirty status to be set
497 * @param set_on_non_pending Flag indicating the status can be only overriden
498 *        if the current status is not in FDB_WAL_PENDING
499 */
500void wal_set_dirty_status(struct filemgr *file,
501                          wal_dirty_t status,
502                          bool set_on_non_pending = false);
503
504wal_dirty_t wal_get_dirty_status(struct filemgr *file);
505
506void wal_add_transaction(struct filemgr *file, fdb_txn *txn);
507void wal_remove_transaction(struct filemgr *file, fdb_txn *txn);
508fdb_txn * wal_earliest_txn(struct filemgr *file, fdb_txn *cur_txn);
509bool wal_txn_exists(struct filemgr *file);
510
511/**
512 * Get the memory overhead of the WAL index for a given file manager
513 * @param file the instance of a file manager whose WAL index memory overhead
514 *        should be returned
515 * @return the memory overhead of a given file manager's WAL index
516 */
517size_t wal_get_mem_overhead(struct filemgr *file);
518
519#ifdef __cplusplus
520}
521#endif
522
523#endif
524