xref: /6.6.0/subjson/subdoc/match.cc (revision aef8453a)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3*     Copyright 2015 Couchbase, Inc
4*
5*   Licensed under the Apache License, Version 2.0 (the "License");
6*   you may not use this file except in compliance with the License.
7*   You may obtain a copy of the License at
8*
9*       http://www.apache.org/licenses/LICENSE-2.0
10*
11*   Unless required by applicable law or agreed to in writing, software
12*   distributed under the License is distributed on an "AS IS" BASIS,
13*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*   See the License for the specific language governing permissions and
15*   limitations under the License.
16*/
17
18/* This file does the brunt of iterating through a buffer and determining the
19 * offset where a given path may begin or end.. */
20
21#define INCLUDE_JSONSL_SRC
22#include "jsonsl_header.h"
23#include "subdoc-api.h"
24#include "match.h"
25#include "hkesc.h"
26#include "validate.h"
27#include "util.h"
28
29using namespace Subdoc;
30
31namespace {
32struct ParseContext : public HashKey {
33    ParseContext(Match *match, Path::CompInfo *jpr) : jpr(jpr), match(match){
34    }
35
36    Path::CompInfo* jpr;
37
38    Match *match;
39
40    // Internal pointer/length pair used to hold the pointer of the
41    // unique array value (if in "unique" mode).
42    const char *uniquebuf = NULL;
43
44    void set_unique_begin(const jsonsl_state_st *, const jsonsl_char_t *at) {
45        uniquebuf = at;
46    }
47
48    const char *get_unique() const { return uniquebuf; }
49};
50}
51
52static void push_callback(jsonsl_t jsn,jsonsl_action_t, struct jsonsl_state_st *, const jsonsl_char_t *);
53static void pop_callback(jsonsl_t jsn,jsonsl_action_t, struct jsonsl_state_st *, const jsonsl_char_t *);
54
55static ParseContext *get_ctx(const jsonsl_t jsn)
56{
57    return (ParseContext *)jsn->data;
58}
59
60static int
61err_callback(jsonsl_t jsn, jsonsl_error_t err, struct jsonsl_state_st *state,
62    jsonsl_char_t *at)
63{
64    ParseContext *ctx = get_ctx(jsn);
65    ctx->match->status = err;
66    (void)state; (void)at;
67    return 0;
68}
69
70static void
71unique_callback(jsonsl_t jsn, jsonsl_action_t action, struct jsonsl_state_st *st,
72    const jsonsl_char_t *at)
73{
74    ParseContext *ctx = get_ctx(jsn);
75    Match *m = ctx->match;
76    int rv;
77    size_t slen;
78
79    if (action == JSONSL_ACTION_PUSH) {
80        /* abs. beginning of token */
81        ctx->set_unique_begin(st, at);
82        return;
83    }
84
85    if (st->mres == JSONSL_MATCH_COMPLETE) {
86        // Popping the parent level!
87        jsn->action_callback_POP = pop_callback;
88        jsn->action_callback_PUSH = push_callback;
89
90        // Prevent descent into parser again
91        jsn->max_callback_level = st->level+1;
92        pop_callback(jsn, action, st, at);
93        return;
94    }
95
96    slen = st->pos_cur - st->pos_begin;
97
98    SUBDOC_ASSERT(st->level == m->match_level + 1U);
99
100    if (st->type == JSONSL_T_STRING) {
101        slen++;
102
103        if (m->ensure_unique.length < 2) {
104            /* not a string */
105            return;
106        } else if (slen != m->ensure_unique.length) {
107            return; /* Length mismatch */
108        }
109
110        rv = strncmp(ctx->get_unique() + 1, m->ensure_unique.at + 1, slen-2);
111
112    } else if (st->type == JSONSL_T_SPECIAL) {
113        if (m->ensure_unique.length != slen) {
114            return;
115        } else if (slen != m->ensure_unique.length) {
116            return;
117        }
118        rv = strncmp(ctx->get_unique(), m->ensure_unique.at, slen);
119    } else {
120        /* We can't reliably indicate uniqueness for non-primitives */
121        m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
122        jsonsl_stop(jsn);
123        return;
124    }
125
126    if (rv == 0) {
127        m->unique_item_found = 1;
128        jsonsl_stop(jsn);
129    }
130}
131
132/* Make code a bit more readable */
133#define M_POSSIBLE JSONSL_MATCH_POSSIBLE
134
135static void
136push_callback(jsonsl_t jsn, jsonsl_action_t action, struct jsonsl_state_st *st,
137    const jsonsl_char_t *at)
138{
139    ParseContext *ctx = get_ctx(jsn);
140    Match *m = ctx->match;
141    struct jsonsl_state_st *parent = jsonsl_last_state(jsn, st);
142
143    if (st->type == JSONSL_T_HKEY) {
144        ctx->set_hk_begin(st, at);
145        return;
146    }
147
148    // If the parent is a match candidate
149    if (parent == NULL || parent->mres == M_POSSIBLE) {
150        unsigned prtype = parent ? parent->type : JSONSL_T_UNKNOWN;
151        size_t nkey;
152        const char *key;
153        key = ctx->get_hk(nkey);
154
155        /* Run the match */
156        st->mres = jsonsl_path_match(ctx->jpr, parent, st, key, nkey);
157
158        if (st->mres == JSONSL_MATCH_COMPLETE) {
159            m->matchres = JSONSL_MATCH_COMPLETE;
160            m->type = st->type;
161            m->loc_deepest.at = at;
162            m->match_level = st->level;
163
164            if (prtype == JSONSL_T_OBJECT) {
165                m->has_key = true;
166
167                // The beginning of the key (for "subdoc" purposes) actually
168                // _includes_ the opening and closing quotes
169                ctx->hk_rawloc(m->loc_key);
170
171                // I'm not sure if it's used.
172                m->position = static_cast<unsigned>((parent->nelem - 1) / 2);
173            } else if (prtype == JSONSL_T_LIST) {
174
175                // Array doesn't have a key
176                m->has_key = 0;
177                // array[n]
178                m->position = static_cast<unsigned>(parent->nelem - 1);
179            }
180
181            if (m->ensure_unique.at) {
182                if (st->type != JSONSL_T_LIST) {
183                    // Can't check "uniquness" in an array!
184                    m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
185                    jsonsl_stop(jsn);
186                    return;
187                } else {
188                    // Set up callbacks for string comparison. This will
189                    // be invoked for each push/pop combination.
190                    jsn->action_callback_POP = unique_callback;
191                    jsn->action_callback_PUSH = unique_callback;
192                    jsn->max_callback_level = st->level + 2;
193                }
194            }
195
196        } else if (st->mres == JSONSL_MATCH_NOMATCH) {
197            // Can't have a match on this tree. Ignore subsequent callbacks here.
198            st->ignore_callback = 1;
199
200        } else if (st->mres == JSONSL_MATCH_POSSIBLE) {
201            // Update our depth thus far
202            m->match_level = st->level;
203            m->loc_deepest.at = at;
204        } else if (st->mres == JSONSL_MATCH_TYPE_MISMATCH) {
205            st->ignore_callback = 1;
206            m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
207        }
208    }
209    (void)action; /* always push */
210}
211
212/*
213 * This callback is only invoked if GET_FOLLOWING_SIBLINGS is the option
214 * mode.
215 */
216static void
217next_sibling_push_callback(jsonsl_t jsn, jsonsl_action_t action,
218    struct jsonsl_state_st *state, const jsonsl_char_t *at)
219{
220    const jsonsl_state_st *parent;
221    ParseContext *ctx = get_ctx(jsn);
222    Match *m = ctx->match;
223
224    if (action == JSONSL_ACTION_POP) {
225        // We can get a POP callback before a PUSH callback if the match
226        // was the last child in the parent. In this case, we are being
227        // invoked on a child, not a sibling.
228        SUBDOC_ASSERT(state->level == m->match_level - 1U);
229        parent = state;
230
231    } else {
232        parent = jsonsl_last_state(jsn, state);
233    }
234
235    SUBDOC_ASSERT(ctx->match->matchres == JSONSL_MATCH_COMPLETE);
236    SUBDOC_ASSERT(ctx->match->extra_options == Match::GET_FOLLOWING_SIBLINGS);
237
238    if (parent != NULL && parent->mres == JSONSL_MATCH_POSSIBLE) {
239        /* Is the immediate parent! */
240        if (parent->type == JSONSL_T_OBJECT) {
241            unsigned nobj_elem = static_cast<unsigned>(parent->nelem);
242            if (action == JSONSL_ACTION_PUSH) {
243                SUBDOC_ASSERT(state->type == JSONSL_T_HKEY);
244                nobj_elem++;
245            }
246            m->num_siblings = static_cast<unsigned>(nobj_elem / 2);
247
248        } else if (parent->type == JSONSL_T_LIST) {
249            m->num_siblings = static_cast<unsigned>(parent->nelem);
250        }
251        m->num_siblings--;
252    }
253    jsonsl_stop(jsn);
254}
255
256static void
257pop_callback(jsonsl_t jsn, jsonsl_action_t, struct jsonsl_state_st *state,
258    const jsonsl_char_t *)
259{
260    ParseContext *ctx = get_ctx(jsn);
261    Match *m = ctx->match;
262
263    if (state->type == JSONSL_T_HKEY) {
264        // All we care about is recording the length of the key. We'll use
265        // this later on when matching (in the PUSH callback of a new element)
266        ctx->set_hk_end(state);
267        return;
268    }
269
270    if (state->mres == JSONSL_MATCH_COMPLETE) {
271        // This is the matched element. Record the end location of the
272        // match.
273        m->loc_deepest.length = jsn->pos - state->pos_begin;
274        m->immediate_parent_found = 1;
275        m->num_children = state->nelem;
276
277        if (state->type != JSONSL_T_SPECIAL) {
278            m->loc_deepest.length++; /* Include the terminating token */
279        } else {
280            m->sflags = state->special_flags;
281        }
282
283        if (m->get_last) {
284            if (state->type != JSONSL_T_LIST) {
285                // Not a list!
286                m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
287                jsonsl_stop(jsn);
288                return;
289
290            } else if (!state->nelem) {
291                // List is empty
292                m->matchres = JSONSL_MATCH_UNKNOWN;
293                jsonsl_stop(jsn);
294                return;
295            }
296
297            // Transpose the match
298            const jsonsl_state_st *child = jsonsl_last_child(jsn, state);
299
300            // For containers, pos does not include the terminating token
301            size_t child_endpos = jsn->pos;
302
303            m->loc_deepest.length = child_endpos - child->pos_begin;
304            m->loc_deepest.at = jsn->base + child->pos_begin;
305
306            // Remove trailing whitespace. This is because the child is
307            // deemed to end one character before the parent does, however
308            // there may be whitespace. This is usually OK but confuses tests.
309            while (isspace(m->loc_deepest.at[m->loc_deepest.length-1])) {
310                --m->loc_deepest.length;
311            }
312
313            m->match_level = child->level;
314            m->sflags = child->special_flags;
315            m->type = child->type;
316            m->num_children = child->nelem;
317
318            // Parent is an array, get pos/sibling information directly from it
319            m->num_siblings = state->nelem - 1;
320            m->position = m->num_siblings;
321            m->has_key = 0;
322            jsonsl_stop(jsn);
323
324        } else {
325            // Maybe we need another round to check for siblings?
326            if (m->extra_options == Match::GET_FOLLOWING_SIBLINGS) {
327                jsn->action_callback_POP = NULL;
328                jsn->action_callback_PUSH = NULL;
329
330                // We only care for this on push, but in the off chance where
331                // this is the final element in the parent *anyway*, the parsing
332                // itself should just stop.
333                jsn->action_callback = next_sibling_push_callback;
334            } else {
335                jsonsl_stop(jsn);
336            }
337        }
338
339        return;
340    }
341
342    // TODO: Determine if all these checks are necessary, since if everything
343    // is correct, parsing should stop as soon as the deepest match is
344    // terminated
345
346    // Not a container and not a full match. We can't do anything here.
347    if (!JSONSL_STATE_IS_CONTAINER(state)) {
348        return;
349    }
350
351    // This isn't part of the match tree, so we just ignore it
352    if (state->mres != JSONSL_MATCH_POSSIBLE) {
353        return;
354    }
355
356    // If we haven't bailed out yet, it means this is the deepest most parent
357    // match. This can either be the actual parent of the match (if a match
358    // is found), or the deepest existing parent which exists in the document.
359    // The latter information is needed for MKDIR_P semantics.
360    // Record the length of the parent
361    m->loc_deepest.length = jsn->pos - state->pos_begin;
362    m->loc_deepest.length++; // Always a container. Use trailing token.
363
364    // Record the number of "siblings" in the container. This is used by
365    // insertion/removal operations to determine comma placement, if any,
366    // before or after the current match.
367    if (state->type == JSONSL_T_OBJECT) {
368        m->num_siblings = static_cast<unsigned>(state->nelem / 2);
369    } else {
370        m->num_siblings = static_cast<unsigned>(state->nelem);
371    }
372
373    m->type = state->type;
374
375    // Is this the actual parent of the match?
376    if (state->level == ctx->jpr->ncomponents-1) {
377        m->immediate_parent_found = 1;
378    }
379
380    // Since this is the deepest parent we've found, we exit here!
381    jsonsl_stop(jsn);
382}
383
384int
385Match::exec_match_simple(const char *value, size_t nvalue,
386    const Path::CompInfo *jpr, jsonsl_t jsn)
387{
388    ParseContext ctx(this, const_cast<Path::CompInfo*>(jpr));
389    status = JSONSL_ERROR_SUCCESS;
390
391    jsonsl_enable_all_callbacks(jsn);
392    jsn->action_callback_PUSH = push_callback;
393    jsn->action_callback_POP = pop_callback;
394    jsn->error_callback = err_callback;
395    jsn->max_callback_level = ctx.jpr->ncomponents + 1;
396    jsn->data = &ctx;
397
398    jsonsl_feed(jsn, value, nvalue);
399    jsonsl_reset(jsn);
400    return 0;
401}
402
403int
404Match::exec_match_negix(const char *value, size_t nvalue, const Path *pth,
405    jsonsl_t jsn)
406{
407    /* First component to scan in next iteration */
408    size_t cur_start = 1;
409
410    /* For levels, compensate this much for the match level */
411    size_t level_offset = 0;
412
413    const Path::CompInfo& orig = *pth;
414    Path::Component comp_s[Limits::PATH_COMPONENTS_ALLOC];
415
416    /* Last length of the subdocument */
417    size_t last_len = nvalue;
418    /* Pointer to the beginning of the current subdocument */
419    const char *last_start = value;
420    std::copy(orig.begin(), orig.end(), comp_s);
421
422    while (cur_start < orig.size()) {
423        size_t ii;
424        int rv, is_last_neg = 0;
425
426        for (ii = cur_start; ii < orig.size(); ii++) {
427            /* Seek to the next negative index */
428            if (orig[ii].is_neg) {
429                /* Convert this to the first array index; then switch it
430                 * around to extract parent information */
431                is_last_neg = 1;
432                break;
433            }
434        }
435
436        // XXX: The next few lines could probably be more concise, but I find
437        // it better to explain what's going on.
438
439        // Set the components to use for the current match. These are the
440        // components between the termination of the *last* match and the
441        // next negative index (or the end!)
442        Path::Component *tmpcomps = &comp_s[cur_start];
443        size_t ntmpcomp = ii - cur_start;
444
445        // We need the root element here as well, so add one more component
446        // at the beginning
447        tmpcomps--;
448        ntmpcomp++;
449
450        // Declare the component info itself, this is fed as the path
451        // to exec_match
452        Path::CompInfo tmp(tmpcomps, ntmpcomp);
453
454        tmp[0].ptype = JSONSL_PATH_ROOT;
455
456        /* Clear the match. There's no good way to preserve info here,
457         * unfortunately. */
458        clear();
459
460        // Transpose array's last element as the match itself
461        if (is_last_neg) {
462            get_last = 1;
463        }
464
465        rv = exec_match_simple(last_start, last_len, &tmp, jsn);
466        match_level += level_offset;
467        if (level_offset) {
468            // 'nth iteration
469            match_level--;
470        }
471
472        level_offset = match_level;
473
474        if (rv != 0) { /* error */
475            return rv;
476        } else if (status != JSONSL_ERROR_SUCCESS) {
477            return 0;
478        } else if (matchres != JSONSL_MATCH_COMPLETE) {
479            return 0;
480        }
481
482        last_start = loc_deepest.at;
483        last_len = loc_deepest.length;
484
485        /* Chomp off the current component */
486        cur_start = ii + 1;
487    }
488
489    return 0;
490}
491
492/**
493 * Retrieves a buffer for the designated _path_ in the JSON document. The
494 * actual length of the remaining components may be determined by using
495 * pointer arithmetic.
496 *
497 * @param value The value buffer to search
498 * @param nvalue Length of the value buffer
499 * @param nj The path component to search for
500 * @param[out] a result structure for the path
501 * @return 0 if found, otherwise an error code
502 */
503int
504Match::exec_match(const char *value, size_t nvalue, const Path *pth, jsonsl_t jsn)
505{
506    if (!pth->has_negix) {
507        return exec_match_simple(value, nvalue, pth, jsn);
508    } else {
509        return exec_match_negix(value, nvalue, pth, jsn);
510    }
511}
512
513Match::Match()
514{
515    clear();
516}
517
518Match::~Match()
519{
520}
521
522void
523Match::clear()
524{
525    // TODO: Memberwise reset
526    memset(this, 0, sizeof(*this));
527}
528
529jsonsl_t
530Match::jsn_alloc()
531{
532    return jsonsl_new(Limits::PARSER_DEPTH);
533}
534
535void
536Match::jsn_free(jsonsl_t jsn)
537{
538    jsonsl_destroy(jsn);
539}
540
541typedef struct {
542    int err;
543    int rootcount;
544    int flags;
545    int maxdepth;
546} validate_ctx;
547
548static int
549validate_err_callback(jsonsl_t jsn,
550    jsonsl_error_t err, jsonsl_state_st *, jsonsl_char_t *)
551{
552    validate_ctx *ctx = (validate_ctx *)jsn->data;
553    ctx->err = err;
554    // printf("ERROR: AT=%s\n", at);
555    return 0;
556}
557
558static void
559validate_callback(jsonsl_t jsn, jsonsl_action_t action,
560    jsonsl_state_st *state, const jsonsl_char_t *)
561{
562    validate_ctx *ctx = reinterpret_cast<validate_ctx*>(jsn->data);
563
564    if (ctx->maxdepth > -1 &&
565            state->level - 1 == static_cast<unsigned>(ctx->maxdepth)) {
566        ctx->err = Validator::ETOODEEP;
567        jsonsl_stop(jsn);
568        return;
569    }
570
571    if (state->level == 1) {
572        // Root element
573        ctx->rootcount++;
574
575    } else if (state->level == 2 && action == JSONSL_ACTION_PUSH) {
576        if (ctx->flags & Validator::VALUE_PRIMITIVE) {
577            if (JSONSL_STATE_IS_CONTAINER(state)) {
578                ctx->err = Validator::ENOTPRIMITIVE;
579                jsonsl_stop(jsn);
580            }
581        }
582
583        if (ctx->flags & Validator::VALUE_SINGLE) {
584            auto *parent = jsonsl_last_state(jsn, state);
585            int has_multielem = 0;
586            if (parent->type == JSONSL_T_LIST && parent->nelem > 1) {
587                has_multielem = 1;
588            } else if (parent->type == JSONSL_T_OBJECT && parent->nelem > 2) {
589                has_multielem = 1;
590            }
591
592            if (has_multielem) {
593                ctx->err = Validator::EMULTIELEM;
594                jsonsl_stop(jsn);
595            }
596        }
597    }
598}
599
600static const Loc validate_ARRAY_PRE("[", 1);
601static const Loc validate_ARRAY_POST("]", 1);
602static const Loc validate_DICT_PRE("{\"k\":", 5);
603static const Loc validate_DICT_POST("}", 1);
604static const Loc validate_NOOP(NULL, 0);
605
606int
607Validator::validate(const char *s, size_t n, jsonsl_t jsn, int maxdepth, int mode)
608{
609    int need_free_jsn = 0;
610    int type = mode & PARENT_MASK;
611    int flags = mode & VALUE_MASK;
612    const Loc *l_pre, *l_post;
613
614    validate_ctx ctx = { 0,0 };
615    if (jsn == NULL) {
616        jsn = jsonsl_new(Limits::PARSER_DEPTH);
617        need_free_jsn = 1;
618    }
619
620    jsn->action_callback_POP = NULL;
621    jsn->action_callback_PUSH = NULL;
622
623    jsn->action_callback = validate_callback;
624    jsn->error_callback = validate_err_callback;
625
626    // Set the maximum level. This name is a bit misleading as it's
627    // actually the level number (inclusive) beyond which we won't get
628    // called. Cutting down on the number of callbacks can significantly
629    // benefit CPU, especially for larger JSON documents with many tokens.
630    if (maxdepth != -1) {
631        if (type != PARENT_NONE) {
632            // Don't count the wrapper in the depth
633            maxdepth++;
634        }
635        // Set the callback to be 1 more (i.e. max_level+2) so we can
636        // get notified about depth violations
637        jsn->max_callback_level = maxdepth + 2;
638    } else if (flags) {
639        // Set the callback to include the "wrapper" container and our
640        // actual value
641        jsn->max_callback_level = 3;
642    } else {
643        // only care about the actual value.
644        jsn->max_callback_level = 2;
645    }
646
647    ctx.maxdepth = maxdepth;
648    ctx.flags = flags;
649
650    jsn->call_OBJECT = 1;
651    jsn->call_LIST = 1;
652    jsn->call_STRING = 1;
653    jsn->call_SPECIAL = 1;
654    jsn->call_HKEY = 0;
655    jsn->call_UESCAPE = 0;
656    jsn->data = &ctx;
657
658    if (type == PARENT_NONE) {
659        l_pre = l_post = &validate_NOOP;
660    } else if (type == PARENT_ARRAY) {
661        l_pre = &validate_ARRAY_PRE;
662        l_post = &validate_ARRAY_POST;
663    } else if (type == PARENT_DICT) {
664        l_pre = &validate_DICT_PRE;
665        l_post = &validate_DICT_POST;
666    } else {
667        ctx.err = JSONSL_ERROR_GENERIC;
668        goto GT_ERR;
669    }
670
671    jsonsl_feed(jsn, l_pre->at, l_pre->length);
672    jsonsl_feed(jsn, s, n);
673
674    if (ctx.err == JSONSL_ERROR_SUCCESS) {
675        jsonsl_feed(jsn, l_post->at, l_post->length);
676    }
677
678    if (ctx.err == JSONSL_ERROR_SUCCESS) {
679        if (ctx.rootcount < 2) {
680            ctx.err = Validator::EPARTIAL;
681        } else if (ctx.rootcount > 2) {
682            ctx.err = Validator::EMULTIELEM;
683        }
684    }
685
686    GT_ERR:
687    if (need_free_jsn) {
688        jsonsl_destroy(jsn);
689    } else {
690        jsonsl_reset(jsn);
691    }
692    return ctx.err;
693}
694
695const char *
696Validator::errstr(int rv)
697{
698    if (rv <= JSONSL_ERROR_GENERIC) {
699        return jsonsl_strerror(static_cast<jsonsl_error_t>(rv));
700    }
701    switch (rv) {
702    case EMULTIELEM:
703        return "Found multiple elements (single expected)";
704    case EPARTIAL:
705        return "Found incomplete JSON";
706    default:
707        return "UNKNOWN";
708    }
709}
710