xref: /6.0.3/forestdb/src/wal.h (revision 5b0b2b2e)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 *     Copyright 2010 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17
18#ifndef _JSAHN_WAL_H
19#define _JSAHN_WAL_H
20
21#include <stdint.h>
22#include "internal_types.h"
23#include "hash.h"
24#include "list.h"
25#include "avltree.h"
26#include "atomic.h"
27#include "libforestdb/fdb_errors.h"
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33typedef uint8_t wal_item_action;
34enum{
35    WAL_ACT_INSERT,
36    WAL_ACT_LOGICAL_REMOVE, // An item is marked as "DELETED" by removing its doc body only.
37    WAL_ACT_REMOVE // An item (key, metadata, body) is removed immediately.
38};
39typedef int wal_insert_by;
40enum {
41    WAL_INS_WRITER = 0, // normal writer inserting
42    WAL_INS_COMPACT_PHASE1, // compactor in first phase moving unique docs
43    WAL_INS_COMPACT_PHASE2 // compactor in delta phase (catchup, uncommitted)
44};
45
46struct wal_item_header{
47    struct avl_node avl_key;
48    void *key;
49    uint16_t keylen;
50    uint8_t chunksize;
51    struct list items;
52};
53
54typedef uint64_t wal_snapid_t;
55#define OPEN_SNAPSHOT_TAG ((wal_snapid_t)(-1)) // any latest snapshot item
56struct snap_handle {
57    /**
58     * Link to the tree of snapshots indexed by (kvs_id, snap_id) pair.
59     */
60    struct avl_node avl_id;
61    /**
62     * Unique KV Store ID (keep as second member).
63     */
64    fdb_kvs_id_t id;
65    /**
66     * Back pointer to index into the global WAL snapshot array
67     */
68    wal_snapid_t snap_tag_idx;
69    /**
70     * Snapshot stop index that denotes a WAL flush
71     */
72    wal_snapid_t snap_stop_idx;
73    /**
74     * Incremented on snapshot_open, decremented on snapshot_close(Write Barrier)
75     * Reference count to avoid copy if same KV store WAL snapshot is cloned.
76     */
77    atomic_uint16_t ref_cnt_kvs;
78    /**
79     * Did wal_flush make me inaccessible to later snapshots, (Read-Write Barrier)
80     */
81    bool is_flushed;
82    /**
83     * Is this a persistent snapshot completely separate from WAL.
84     */
85    bool is_persisted_snapshot;
86    /**
87     * Number of WAL items put into this snapshot before it became immutable.
88     */
89    atomic_uint64_t wal_ndocs;
90    /**
91     * Highest sequence number seen in this KV store snapshot.
92     */
93    fdb_seqnum_t seqnum;
94    /**
95     * Transaction that the handle was in at the time of snapshot creation.
96     */
97    fdb_txn *snap_txn;
98    /**
99     * Global transaction pointer to distinguish from local transactions
100     * for partially committed items.
101     */
102    fdb_txn *global_txn;
103    /**
104     * Active transaction list to hide partially committed items whose
105     * transaction is still being ended.
106     */
107    struct list active_txn_list;
108    /**
109     * Local DB stats for cloned snapshots
110     */
111    struct kvs_stat stat;
112    /**
113     * Custom compare function context and callback set by user.
114     * TODO: Store original pointer on which snapshot was taken & use to clone
115     * index nodes as well!
116     */
117    struct _fdb_key_cmp_info cmp_info;
118    /**
119     * AVL tree to store unflushed WAL entries of a snapshot by key range
120     */
121    struct avl_tree key_tree;
122    /**
123     * AVL tree to store unflushed WAL entries of a snapshot by sequence number
124     */
125    struct avl_tree seq_tree;
126};
127
128#define WAL_ITEM_COMMITTED (0x01)
129#define WAL_ITEM_FLUSH_READY (0x02)
130#define WAL_ITEM_MULTI_KV_INS_MODE (0x04)
131#define WAL_ITEM_FLUSHED_OUT (0x08)
132struct wal_item{
133    struct list_elem list_elem; // for wal_item_header's 'items'
134    struct avl_node avl_seq; // used for indexing by sequence number
135    struct wal_item_header *header;
136    fdb_txn *txn;
137    struct snap_handle *shandle; // Pointer into wal_snapshot_tree for KV Store
138    wal_item_action action;
139    atomic_uint8_t flag;
140    uint32_t doc_size;
141    uint64_t offset;
142    fdb_seqnum_t seqnum;
143    uint64_t old_offset;
144    union { // for offset-based sorting for WAL flush
145        struct list_elem list_elem_txn; // for transaction
146        struct avl_node avl_flush;
147        struct list_elem list_elem_flush;
148        struct avl_node avl_keysnap; // for durable snapshot unique key lookup
149    };
150};
151
152typedef fdb_status wal_flush_func(void *dbhandle, struct wal_item *item,
153                                  struct avl_tree *stale_seqnum_list,
154                                  struct avl_tree *kvs_delta_stats);
155
156/**
157 * Pointer of function that purges stale entries from the sequence tree
158 * as part of WAL flush.
159 */
160typedef void wal_flush_seq_purge_func(void *dbhandle,
161                                      struct avl_tree *stale_seqnum_list,
162                                      struct avl_tree *kvs_delta_stats);
163
164/**
165 * Pointer of function that updates a KV store stats for each WAL flush
166 */
167typedef void wal_flush_kvs_delta_stats_func(struct filemgr *file,
168                                            avl_tree *kvs_delta_stats);
169
170typedef fdb_status wal_snapshot_func(void *shandle, fdb_doc *doc,
171                                     uint64_t offset);
172typedef uint64_t wal_get_old_offset_func(void *dbhandle,
173                                         struct wal_item *item);
174typedef int64_t wal_doc_move_func(void *dbhandle,
175                                  void *new_dhandle,
176                                  struct wal_item *item,
177                                  fdb_doc *doc);
178typedef fdb_status wal_commit_mark_func(void *dbhandle,
179                                        uint64_t offset);
180
181#define WAL_FLAG_INITIALIZED 0x1
182
183
184typedef uint8_t wal_dirty_t;
185enum {
186    FDB_WAL_CLEAN = 0,
187    FDB_WAL_DIRTY = 1,
188    FDB_WAL_PENDING = 2
189};
190
191struct wal_shard {
192    struct avl_tree _map;
193    spin_t lock;
194};
195
196struct wal {
197    uint8_t flag;
198    atomic_uint32_t size; // total # entries in WAL (uint32_t)
199    atomic_uint32_t num_flushable; // # flushable entries in WAL (uint32_t)
200    atomic_uint64_t datasize; // total data size in WAL (uint64_t)
201    atomic_uint64_t mem_overhead; // memory overhead of all WAL entries
202    struct list txn_list; // list of active transactions
203    wal_dirty_t wal_dirty;
204    // tree of all 'wal_item_header' (keys) in shard
205    struct wal_shard *key_shards;
206    // indexes 'wal_item's seq num in WAL shard
207    struct wal_shard *seq_shards;
208    size_t num_shards;
209    // Global shared WAL Snapshot Data
210    struct avl_tree wal_snapshot_tree;
211    spin_t lock;
212};
213
214struct wal_cursor {
215    struct avl_node avl_merge; // avl node for merge sort across all shards
216    struct wal_item *item; // pointer to the shared WAL snapshot item
217    struct wal_item *first_item; // first key/seqnum item in snapshot of shard
218    struct wal_item *last_item; // last key/seqnum item in snapshot of shard
219};
220
221struct wal_iterator {
222    struct wal *_wal; // Pointer to global WAL
223    struct snap_handle *shandle; // Pointer to KVS snapshot handle.
224    struct wal_shard *map_shards; // pointer to the shared WAL key/seq shards
225    size_t num_shards; // number of shards in the global shared WAL key/seq
226    bool by_key; // if not set means iteration is by sequence number range
227    bool multi_kvs; // single kv mode vs multi kv instance mode
228    uint8_t direction; // forward/backward/none to avoid grabbing all locks
229    struct avl_tree merge_tree; // AVL tree to perform merge-sort over cursors
230    struct avl_node *cursor_pos; // points to shard that returns current item
231    struct wal_item *item_prev; // points to previous iterator item returned
232    struct wal_cursor *cursors; // cursor to item from each shard's tree
233};
234
235struct wal_txn_wrapper {
236    fdb_txn *txn;
237    struct list_elem le;
238};
239
240union wal_flush_items {
241    struct avl_tree tree; // if WAL items are to be sorted by offset
242    struct list list; // if WAL items need not be sorted
243};
244
245
246fdb_status wal_init(struct filemgr *file, int nbucket);
247int wal_is_initialized(struct filemgr *file);
248fdb_status wal_insert(fdb_txn *txn,
249                      struct filemgr *file,
250                      struct _fdb_key_cmp_info *cmp_info,
251                      fdb_doc *doc,
252                      uint64_t offset,
253                      wal_insert_by caller);
254fdb_status wal_immediate_remove(fdb_txn *txn,
255                                struct filemgr *file,
256                                struct _fdb_key_cmp_info *cmp_info,
257                                fdb_doc *doc,
258                                uint64_t offset,
259                                wal_insert_by caller);
260fdb_status wal_find(fdb_txn *txn, struct filemgr *file,
261                    struct _fdb_key_cmp_info *cmp_info,
262                    struct snap_handle *shandle,
263                    fdb_doc *doc, uint64_t *offset);
264fdb_status wal_find_kv_id(fdb_txn *txn,
265                          struct filemgr *file,
266                          fdb_kvs_id_t kv_id,
267                          struct _fdb_key_cmp_info *cmp_info,
268                          struct snap_handle *shandle,
269                          fdb_doc *doc,
270                          uint64_t *offset);
271
272fdb_status wal_txn_migration(void *dbhandle,
273                             void *new_dhandle,
274                             struct filemgr *old_file,
275                             struct filemgr *new_file,
276                             wal_doc_move_func *move_doc);
277fdb_status wal_commit(fdb_txn *txn, struct filemgr *file, wal_commit_mark_func *func,
278                      err_log_callback *log_callback);
279fdb_status wal_release_flushed_items(struct filemgr *file,
280                                     union wal_flush_items *flush_items);
281
282/**
283 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
284 *
285 * @param file Pointer to the file manager
286 * @param dbhandle Pointer to the KV store handle
287 * @param flush_func Pointer of function that flushes each WAL entry into the
288 *                   main indexes
289 * @param get_old_offset Pointer of function that retrieves an offset of the
290 *                       old KV item from the hbtrie
291 * @param seq_purge_func Pointer of function that purges an old entry with the
292 *                       same key from the sequence tree
293 * @param delta_stats_func Pointer of function that updates each KV store's stats
294 * @param flush_items Pointer to the list that contains the list of all WAL entries
295 *                    that are flushed into the main indexes
296 * @return FDB_RESULT upon successful WAL flush
297 */
298fdb_status wal_flush(struct filemgr *file,
299                     void *dbhandle,
300                     wal_flush_func *flush_func,
301                     wal_get_old_offset_func *get_old_offset,
302                     wal_flush_seq_purge_func *seq_purge_func,
303                     wal_flush_kvs_delta_stats_func *delta_stats_func,
304                     union wal_flush_items *flush_items);
305
306/**
307 * Flush WAL entries into the main indexes (i.e., hbtrie and sequence tree)
308 * by the compactor
309 *
310 * @param file Pointer to the file manager
311 * @param dbhandle Pointer to the KV store handle
312 * @param flush_func Pointer of function that flushes each WAL entry into the
313 *                   main indexes
314 * @param get_old_offset Pointer of function that retrieves an offset of the
315 *                       old KV item from the hbtrie
316 * @param seq_purge_func Pointer of function that purges an old entry with the
317 *                       same key from the sequence tree
318 * @param delta_stats_func Pointer of function that updates each KV store's stats
319 * @param flush_items Pointer to the list that contains the list of all WAL entries
320 *                    that are flushed into the main indexes
321 * @return FDB_RESULT upon successful WAL flush
322 */
323fdb_status wal_flush_by_compactor(struct filemgr *file,
324                                  void *dbhandle,
325                                  wal_flush_func *flush_func,
326                                  wal_get_old_offset_func *get_old_offset,
327                                  wal_flush_seq_purge_func *seq_purge_func,
328                                  wal_flush_kvs_delta_stats_func *delta_stats_func,
329                                  union wal_flush_items *flush_items);
330/**
331 * Create a WAL snapshot for a specific KV Store
332 * @param file - the underlying file for the database
333 * @param txn - transaction that the snapshot is to be taken on
334 * @param kv_id - KV Store ID whose snapshot is to be taken
335 * @param seqnum - The sequence number at which the snapshot is to be taken
336 * @param key_cmp_info - custom comparison function.
337 * @param shandle - WAL snapshot handle result
338 */
339fdb_status wal_snapshot_open(struct filemgr *file,
340                             fdb_txn *txn,
341                             fdb_kvs_id_t kv_id,
342                             fdb_seqnum_t seqnum,
343                             _fdb_key_cmp_info *key_cmp_info,
344                             struct snap_handle **shandle);
345/**
346 * Clone from an existing WAL snapshot
347 * @param shandle_in - incoming snapshot handle
348 * @param shandle_out - cloned snapshot handle out
349 * @param seqnum - The sequence number at which the snapshot is to be taken
350 */
351fdb_status wal_snapshot_clone(struct snap_handle *shandle_in,
352                              struct snap_handle **shandle_out,
353                              fdb_seqnum_t seqnum);
354/**
355 * Create a persisted (durable) WAL snapshot for a specific KV Store
356 * @param seqnum - the highest sequence number for this persisted snapshot.
357 * @param key_cmp_info - custom comparison function.
358 * @param file - the underlying file
359 * @param txn - the current active transaction at time of snapshot creation
360 * @param shandle - WAL snapshot handle result
361 */
362fdb_status wal_dur_snapshot_open(fdb_seqnum_t seqnum,
363                                 _fdb_key_cmp_info *key_cmp_info,
364                                 struct filemgr *file, fdb_txn *txn,
365                                 struct snap_handle **shandle);
366/**
367 * Create an exclusive Snapshot of the WAL by copying all entries to
368 * immutable AVL trees
369 * @param file - the underlying file
370 * @param shandle - WAL snapshot handle created by wal_dur_snapshot_open()
371 * @param is_multi_kv - Does the WAL have multiple KV Stores
372 */
373
374fdb_status wal_copyto_snapshot(struct filemgr *file,
375                               struct snap_handle *shandle,
376                               bool is_multi_kv);
377
378/**
379 * Closes a WAL snapshot
380 * @param shandle - the snapshot handle to be closed
381 * @param file - the underlying file for the database
382 */
383fdb_status wal_snapshot_close(struct snap_handle *shandle,
384                              struct filemgr *file);
385/**
386 * Retrieve the KV Store stats of this KV Store from the WAL Snapshot
387 * @param shandle - the WAL snapshot handle
388 * @param stat - (OUT) returned stat
389 */
390fdb_status snap_get_stat(struct snap_handle *shandle, struct kvs_stat *stat);
391
392/**
393 * Persisted snapshots opened from disk needs to have its own immutable tree
394 * This routine is used by _fdb_restore_wal() to load immutable items into it
395 * @param shandle - the WAL snapshot handle
396 * @param doc - the immutable document to be inserted into the snapshot
397 * @param offset - offset of the immutable doc
398 */
399fdb_status wal_snap_insert(struct snap_handle *shandle, fdb_doc *doc,
400                           uint64_t offset);
401/**
402 * Initialize a WAL iterator snapshot by creating a barrier to future writes
403 * @param file - underlying ForestDB database file shared by all KV Stores
404 * @param shandle - pointer to snap_handle created by snapshot_open
405 * @param by_key - is the iteration done by key or by sequence number
406 * @param wal_iterator - Return pointer to initialized wal_iterator
407 */
408fdb_status wal_itr_init(struct filemgr *file,
409                        struct snap_handle *shandle,
410                        bool by_key,
411                        struct wal_iterator **wal_iterator);
412/**
413 * Initialize the boundaries of the iteration by setting first element
414 * @param - wal_itr - the WAL iterator whose bounds need to be set
415 * @param - first_elem - could be key or seqnum, NULL means first in snapshot
416 */
417fdb_status wal_itr_set_first(struct wal_iterator *wal_itr,
418                             struct wal_item *first_elem);
419/**
420 * Initialize the boundaries of the iteration by setting last element
421 * @param - wal_itr - the WAL iterator whose bounds need to be set
422 * @param - last_elem - could be key or seqnum, NULL means last in snapshot
423 */
424fdb_status wal_itr_set_last(struct wal_iterator *wal_itr,
425                            struct wal_item *last_elem);
426/**
427 * Position the sharded WAL iterator to a key/seqnum greater than the query
428 * if the queried key/seqnum does not exist.
429 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
430 * @param wal_item - the avl pointer of the query wal_item.
431 */
432struct wal_item *wal_itr_search_greater(struct wal_iterator *wal_itr,
433                                        struct wal_item *query_item);
434/**
435 * Position the sharded WAL iterator to a key/seqnum smaller than the query
436 * if the queried key/seqnum does not exist.
437 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
438 * @param wal_item - the pointer of the query wal_item.
439 */
440struct wal_item *wal_itr_search_smaller(struct wal_iterator *wal_itr,
441                                        struct wal_item *query_item);
442/**
443 * Position the sharded WAL iterator to the next key/seqnum than current pos
444 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
445 */
446struct wal_item *wal_itr_next(struct wal_iterator *wal_itr);
447
448/**
449 * Position the sharded WAL iterator to the previous key/seqnum from current pos
450 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
451 */
452struct wal_item *wal_itr_prev(struct wal_iterator *wal_itr);
453/**
454 * Position the sharded WAL iterator to the first key/seqnum in KV Store snapshot
455 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
456 */
457struct wal_item *wal_itr_first(struct wal_iterator *wal_itr);
458/**
459 * Position the sharded WAL iterator to the last key/seqnum in KV Store snapshot
460 * @param wal_itr - the wal iterator snapshot whose cursor needs to be set.
461 */
462struct wal_item *wal_itr_last(struct wal_iterator *wal_itr);
463/**
464 * Free memory associated with the wal iteration
465 * @param wal_itr - the wal iterator whose memory needs to be freed.
466 */
467fdb_status wal_itr_close(struct wal_iterator *wal_itr);
468
469fdb_status wal_discard(struct filemgr *file, fdb_txn *txn);
470fdb_status wal_close(struct filemgr *file, err_log_callback *log_callback);
471fdb_status wal_shutdown(struct filemgr *file, err_log_callback *log_callback);
472
473/**
474 * Free memory associated with wal data structures.
475 */
476fdb_status wal_destroy(struct filemgr *file);
477
478fdb_status wal_close_kv_ins(struct filemgr *file,
479                            fdb_kvs_id_t kv_id, err_log_callback *log_callback);
480
481size_t wal_get_size(struct filemgr *file);
482size_t wal_get_num_shards(struct filemgr *file);
483size_t wal_get_num_flushable(struct filemgr *file);
484size_t wal_get_num_docs(struct filemgr *file);
485size_t wal_get_num_deletes(struct filemgr *file);
486size_t wal_get_datasize(struct filemgr *file);
487
488/**
489 * Set the dirty status of the WAL
490 *
491 * @param file Pointer to the file manager instance
492 * @param status New dirty status to be set
493 * @param set_on_non_pending Flag indicating the status can be only overriden
494 *        if the current status is not in FDB_WAL_PENDING
495 */
496void wal_set_dirty_status(struct filemgr *file,
497                          wal_dirty_t status,
498                          bool set_on_non_pending = false);
499
500wal_dirty_t wal_get_dirty_status(struct filemgr *file);
501
502void wal_add_transaction(struct filemgr *file, fdb_txn *txn);
503void wal_remove_transaction(struct filemgr *file, fdb_txn *txn);
504fdb_txn * wal_earliest_txn(struct filemgr *file, fdb_txn *cur_txn);
505bool wal_txn_exists(struct filemgr *file);
506
507/**
508 * Get the memory overhead of the WAL index for a given file manager
509 * @param file the instance of a file manager whose WAL index memory overhead
510 *        should be returned
511 * @return the memory overhead of a given file manager's WAL index
512 */
513size_t wal_get_mem_overhead(struct filemgr *file);
514
515#ifdef __cplusplus
516}
517#endif
518
519#endif
520