xref: /5.5.2/subjson/subdoc/match.cc (revision e7ba1a03)
1/* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3*     Copyright 2015 Couchbase, Inc
4*
5*   Licensed under the Apache License, Version 2.0 (the "License");
6*   you may not use this file except in compliance with the License.
7*   You may obtain a copy of the License at
8*
9*       http://www.apache.org/licenses/LICENSE-2.0
10*
11*   Unless required by applicable law or agreed to in writing, software
12*   distributed under the License is distributed on an "AS IS" BASIS,
13*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*   See the License for the specific language governing permissions and
15*   limitations under the License.
16*/
17
18/* This file does the brunt of iterating through a buffer and determining the
19 * offset where a given path may begin or end.. */
20
21#define INCLUDE_JSONSL_SRC
22#include "jsonsl_header.h"
23#include "subdoc-api.h"
24#include "match.h"
25#include "hkesc.h"
26#include "validate.h"
27#include "util.h"
28
29using namespace Subdoc;
30
31namespace {
32struct ParseContext : public HashKey {
33    ParseContext(Match *match, Path::CompInfo *jpr) : jpr(jpr), match(match){
34    }
35
36    Path::CompInfo* jpr;
37
38    Match *match;
39
40    // Internal pointer/length pair used to hold the pointer of the
41    // unique array value (if in "unique" mode).
42    const char *uniquebuf = NULL;
43
44    void set_unique_begin(const jsonsl_state_st *, const jsonsl_char_t *at) {
45        uniquebuf = at;
46    }
47
48    const char *get_unique() const { return uniquebuf; }
49};
50}
51
52static void push_callback(jsonsl_t jsn,jsonsl_action_t, struct jsonsl_state_st *, const jsonsl_char_t *);
53static void pop_callback(jsonsl_t jsn,jsonsl_action_t, struct jsonsl_state_st *, const jsonsl_char_t *);
54
55static ParseContext *get_ctx(const jsonsl_t jsn)
56{
57    return (ParseContext *)jsn->data;
58}
59
60static int
61err_callback(jsonsl_t jsn, jsonsl_error_t err, struct jsonsl_state_st *state,
62    jsonsl_char_t *at)
63{
64    ParseContext *ctx = get_ctx(jsn);
65    ctx->match->status = err;
66    (void)state; (void)at;
67    return 0;
68}
69
70static void
71unique_callback(jsonsl_t jsn, jsonsl_action_t action, struct jsonsl_state_st *st,
72    const jsonsl_char_t *at)
73{
74    ParseContext *ctx = get_ctx(jsn);
75    Match *m = ctx->match;
76    int rv;
77    size_t slen;
78
79    if (action == JSONSL_ACTION_PUSH) {
80        /* abs. beginning of token */
81        ctx->set_unique_begin(st, at);
82        return;
83    }
84
85    if (st->mres == JSONSL_MATCH_COMPLETE) {
86        // Popping the parent level!
87        jsn->action_callback_POP = pop_callback;
88        jsn->action_callback_PUSH = push_callback;
89
90        // Prevent descent into parser again
91        jsn->max_callback_level = st->level+1;
92        pop_callback(jsn, action, st, at);
93        return;
94    }
95
96    slen = st->pos_cur - st->pos_begin;
97
98    SUBDOC_ASSERT(st->level == m->match_level + 1U);
99
100    if (st->type == JSONSL_T_STRING) {
101        slen++;
102
103        if (m->ensure_unique.length < 2) {
104            /* not a string */
105            return;
106        } else if (slen != m->ensure_unique.length) {
107            return; /* Length mismatch */
108        }
109
110        rv = strncmp(ctx->get_unique() + 1, m->ensure_unique.at + 1, slen-2);
111
112    } else if (st->type == JSONSL_T_SPECIAL) {
113        if (m->ensure_unique.length != slen) {
114            return;
115        }
116        rv = strncmp(ctx->get_unique(), m->ensure_unique.at, slen);
117    } else {
118        /* We can't reliably indicate uniqueness for non-primitives */
119        m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
120        jsonsl_stop(jsn);
121        return;
122    }
123
124    if (rv == 0) {
125        m->unique_item_found = 1;
126        jsonsl_stop(jsn);
127    }
128}
129
130/* Make code a bit more readable */
131#define M_POSSIBLE JSONSL_MATCH_POSSIBLE
132
133static void
134push_callback(jsonsl_t jsn, jsonsl_action_t action, struct jsonsl_state_st *st,
135    const jsonsl_char_t *at)
136{
137    ParseContext *ctx = get_ctx(jsn);
138    Match *m = ctx->match;
139    struct jsonsl_state_st *parent = jsonsl_last_state(jsn, st);
140
141    if (st->type == JSONSL_T_HKEY) {
142        ctx->set_hk_begin(st, at);
143        return;
144    }
145
146    // If the parent is a match candidate
147    if (parent == NULL || parent->mres == M_POSSIBLE) {
148        unsigned prtype = parent ? parent->type : JSONSL_T_UNKNOWN;
149        size_t nkey;
150        const char *key;
151        key = ctx->get_hk(nkey);
152
153        /* Run the match */
154        st->mres = jsonsl_path_match(ctx->jpr, parent, st, key, nkey);
155
156        if (st->mres == JSONSL_MATCH_COMPLETE) {
157            m->matchres = JSONSL_MATCH_COMPLETE;
158            m->type = st->type;
159            m->loc_deepest.at = at;
160            m->match_level = st->level;
161
162            if (prtype == JSONSL_T_OBJECT) {
163                // The beginning of the key (for "subdoc" purposes) actually
164                // _includes_ the opening and closing quotes
165                ctx->hk_rawloc(m->loc_key);
166                m->position = static_cast<unsigned>(parent->nelem / 2) - 1;
167
168            } else if (prtype == JSONSL_T_LIST) {
169                // array[n]
170                m->position = static_cast<unsigned>(parent->nelem - 1);
171            }
172
173            if (m->ensure_unique.at) {
174                if (st->type != JSONSL_T_LIST) {
175                    // Can't check "uniquness" in an array!
176                    m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
177                    jsonsl_stop(jsn);
178                    return;
179                } else {
180                    // Set up callbacks for string comparison. This will
181                    // be invoked for each push/pop combination.
182                    jsn->action_callback_POP = unique_callback;
183                    jsn->action_callback_PUSH = unique_callback;
184                    jsn->max_callback_level = st->level + 2;
185                }
186            }
187
188        } else if (st->mres == JSONSL_MATCH_NOMATCH) {
189            // Can't have a match on this tree. Ignore subsequent callbacks here.
190            st->ignore_callback = 1;
191
192        } else if (st->mres == JSONSL_MATCH_POSSIBLE) {
193            // Update our depth thus far
194            m->match_level = st->level;
195            m->loc_deepest.at = at;
196        } else if (st->mres == JSONSL_MATCH_TYPE_MISMATCH) {
197            st->ignore_callback = 1;
198            m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
199        }
200    }
201    (void)action; /* always push */
202}
203
204/*
205 * This callback is only invoked if GET_FOLLOWING_SIBLINGS is the option
206 * mode.
207 */
208static void
209next_sibling_push_callback(jsonsl_t jsn, jsonsl_action_t action,
210    struct jsonsl_state_st *state, const jsonsl_char_t *at)
211{
212    const jsonsl_state_st *parent;
213    ParseContext *ctx = get_ctx(jsn);
214    Match *m = ctx->match;
215
216    if (action == JSONSL_ACTION_POP) {
217        // We can get a POP callback before a PUSH callback if the match
218        // was the last child in the parent. In this case, we are being
219        // invoked on a child, not a sibling.
220        SUBDOC_ASSERT(state->level == m->match_level - 1U);
221        parent = state;
222
223    } else {
224        parent = jsonsl_last_state(jsn, state);
225    }
226
227    SUBDOC_ASSERT(ctx->match->matchres == JSONSL_MATCH_COMPLETE);
228    SUBDOC_ASSERT(ctx->match->extra_options == Match::GET_FOLLOWING_SIBLINGS);
229
230    if (parent != NULL && parent->mres == JSONSL_MATCH_POSSIBLE) {
231        /* Is the immediate parent! */
232        if (parent->type == JSONSL_T_OBJECT) {
233            unsigned nobj_elem = static_cast<unsigned>(parent->nelem);
234            if (action == JSONSL_ACTION_PUSH) {
235                SUBDOC_ASSERT(state->type == JSONSL_T_HKEY);
236                nobj_elem++;
237            }
238            m->num_siblings = static_cast<unsigned>(nobj_elem / 2);
239
240        } else if (parent->type == JSONSL_T_LIST) {
241            m->num_siblings = static_cast<unsigned>(parent->nelem);
242        }
243        m->num_siblings--;
244    }
245    jsonsl_stop(jsn);
246}
247
248static void
249pop_callback(jsonsl_t jsn, jsonsl_action_t, struct jsonsl_state_st *state,
250    const jsonsl_char_t *)
251{
252    ParseContext *ctx = get_ctx(jsn);
253    Match *m = ctx->match;
254
255    if (state->type == JSONSL_T_HKEY) {
256        // All we care about is recording the length of the key. We'll use
257        // this later on when matching (in the PUSH callback of a new element)
258        ctx->set_hk_end(state);
259        return;
260    }
261
262    if (state->mres == JSONSL_MATCH_COMPLETE) {
263        // This is the matched element. Record the end location of the
264        // match.
265        m->loc_deepest.length = jsn->pos - state->pos_begin;
266        m->immediate_parent_found = 1;
267        m->num_children = state->nelem;
268
269        if (state->type != JSONSL_T_SPECIAL) {
270            m->loc_deepest.length++; /* Include the terminating token */
271        } else {
272            m->sflags = state->special_flags;
273        }
274
275        if (m->get_last) {
276            if (state->type != JSONSL_T_LIST) {
277                // Not a list!
278                m->matchres = JSONSL_MATCH_TYPE_MISMATCH;
279                jsonsl_stop(jsn);
280                return;
281
282            } else if (!state->nelem) {
283                // List is empty
284                m->matchres = JSONSL_MATCH_UNKNOWN;
285                jsonsl_stop(jsn);
286                return;
287            }
288
289            // Transpose the match
290            const jsonsl_state_st *child = jsonsl_last_child(jsn, state);
291
292            // For containers, pos does not include the terminating token
293            size_t child_endpos = jsn->pos;
294
295            m->loc_deepest.length = child_endpos - child->pos_begin;
296            m->loc_deepest.at = jsn->base + child->pos_begin;
297
298            // Remove trailing whitespace. This is because the child is
299            // deemed to end one character before the parent does, however
300            // there may be whitespace. This is usually OK but confuses tests.
301            while (isspace(m->loc_deepest.at[m->loc_deepest.length-1])) {
302                --m->loc_deepest.length;
303            }
304
305            m->match_level = child->level;
306            m->sflags = child->special_flags;
307            m->type = child->type;
308            m->num_children = child->nelem;
309
310            // Parent is an array, get pos/sibling information directly from it
311            m->num_siblings = state->nelem - 1;
312            m->position = m->num_siblings;
313            m->loc_key.clear();
314            jsonsl_stop(jsn);
315
316        } else {
317            // Maybe we need another round to check for siblings?
318            if (m->extra_options == Match::GET_FOLLOWING_SIBLINGS) {
319                jsn->action_callback_POP = NULL;
320                jsn->action_callback_PUSH = NULL;
321
322                // We only care for this on push, but in the off chance where
323                // this is the final element in the parent *anyway*, the parsing
324                // itself should just stop.
325                jsn->action_callback = next_sibling_push_callback;
326            } else {
327                jsonsl_stop(jsn);
328            }
329        }
330
331        return;
332    }
333
334    // TODO: Determine if all these checks are necessary, since if everything
335    // is correct, parsing should stop as soon as the deepest match is
336    // terminated
337
338    // Not a container and not a full match. We can't do anything here.
339    if (!JSONSL_STATE_IS_CONTAINER(state)) {
340        return;
341    }
342
343    // This isn't part of the match tree, so we just ignore it
344    if (state->mres != JSONSL_MATCH_POSSIBLE) {
345        return;
346    }
347
348    // If we haven't bailed out yet, it means this is the deepest most parent
349    // match. This can either be the actual parent of the match (if a match
350    // is found), or the deepest existing parent which exists in the document.
351    // The latter information is needed for MKDIR_P semantics.
352    // Record the length of the parent
353    m->loc_deepest.length = jsn->pos - state->pos_begin;
354    m->loc_deepest.length++; // Always a container. Use trailing token.
355
356    // Record the number of "siblings" in the container. This is used by
357    // insertion/removal operations to determine comma placement, if any,
358    // before or after the current match.
359    if (state->type == JSONSL_T_OBJECT) {
360        m->num_siblings = static_cast<unsigned>(state->nelem / 2);
361    } else {
362        m->num_siblings = static_cast<unsigned>(state->nelem);
363    }
364
365    m->type = state->type;
366
367    // Is this the actual parent of the match?
368    if (state->level == ctx->jpr->ncomponents-1) {
369        m->immediate_parent_found = 1;
370    }
371
372    // Since this is the deepest parent we've found, we exit here!
373    jsonsl_stop(jsn);
374}
375
376int
377Match::exec_match_simple(const char *value, size_t nvalue,
378    const Path::CompInfo *jpr, jsonsl_t jsn)
379{
380    ParseContext ctx(this, const_cast<Path::CompInfo*>(jpr));
381    status = JSONSL_ERROR_SUCCESS;
382
383    jsonsl_enable_all_callbacks(jsn);
384    jsn->action_callback_PUSH = push_callback;
385    jsn->action_callback_POP = pop_callback;
386    jsn->error_callback = err_callback;
387    jsn->max_callback_level = ctx.jpr->ncomponents + 1;
388    jsn->data = &ctx;
389
390    jsonsl_feed(jsn, value, nvalue);
391    jsonsl_reset(jsn);
392    return 0;
393}
394
395int
396Match::exec_match_negix(const char *value, size_t nvalue, const Path *pth,
397    jsonsl_t jsn)
398{
399    /* First component to scan in next iteration */
400    size_t cur_start = 1;
401
402    /* For levels, compensate this much for the match level */
403    size_t level_offset = 0;
404
405    const Path::CompInfo& orig = *pth;
406    Path::Component comp_s[Limits::PATH_COMPONENTS_ALLOC];
407
408    /* Last length of the subdocument */
409    size_t last_len = nvalue;
410    /* Pointer to the beginning of the current subdocument */
411    const char *last_start = value;
412    std::copy(orig.begin(), orig.end(), comp_s);
413
414    while (cur_start < orig.size()) {
415        size_t ii;
416        int rv, is_last_neg = 0;
417
418        for (ii = cur_start; ii < orig.size(); ii++) {
419            /* Seek to the next negative index */
420            if (orig[ii].is_neg) {
421                /* Convert this to the first array index; then switch it
422                 * around to extract parent information */
423                is_last_neg = 1;
424                break;
425            }
426        }
427
428        // XXX: The next few lines could probably be more concise, but I find
429        // it better to explain what's going on.
430
431        // Set the components to use for the current match. These are the
432        // components between the termination of the *last* match and the
433        // next negative index (or the end!)
434        Path::Component *tmpcomps = &comp_s[cur_start];
435        size_t ntmpcomp = ii - cur_start;
436
437        // We need the root element here as well, so add one more component
438        // at the beginning
439        tmpcomps--;
440        ntmpcomp++;
441
442        // Declare the component info itself, this is fed as the path
443        // to exec_match
444        Path::CompInfo tmp(tmpcomps, ntmpcomp);
445
446        tmp[0].ptype = JSONSL_PATH_ROOT;
447
448        /* Clear the match. There's no good way to preserve info here,
449         * unfortunately. */
450        clear();
451
452        // Transpose array's last element as the match itself
453        if (is_last_neg) {
454            get_last = 1;
455        }
456
457        rv = exec_match_simple(last_start, last_len, &tmp, jsn);
458        match_level += level_offset;
459        if (level_offset) {
460            // 'nth iteration
461            match_level--;
462        }
463
464        level_offset = match_level;
465
466        if (rv != 0) { /* error */
467            return rv;
468        } else if (status != JSONSL_ERROR_SUCCESS) {
469            return 0;
470        } else if (matchres != JSONSL_MATCH_COMPLETE) {
471            break;
472        }
473
474        last_start = loc_deepest.at;
475        last_len = loc_deepest.length;
476
477        /* Chomp off the current component */
478        cur_start = ii + 1;
479    }
480
481    // This is currently only used by GET_COUNT, in which an element is
482    // artificially added.
483    immediate_parent_found = match_level >= pth->ncomponents-1;
484    return 0;
485}
486
487/**
488 * Retrieves a buffer for the designated _path_ in the JSON document. The
489 * actual length of the remaining components may be determined by using
490 * pointer arithmetic.
491 *
492 * @param value The value buffer to search
493 * @param nvalue Length of the value buffer
494 * @param nj The path component to search for
495 * @param[out] a result structure for the path
496 * @return 0 if found, otherwise an error code
497 */
498int
499Match::exec_match(const char *value, size_t nvalue, const Path *pth, jsonsl_t jsn)
500{
501    if (!pth->has_negix) {
502        return exec_match_simple(value, nvalue, pth, jsn);
503    } else {
504        return exec_match_negix(value, nvalue, pth, jsn);
505    }
506}
507
508Match::Match()
509{
510    clear();
511}
512
513Match::~Match()
514{
515}
516
517void
518Match::clear()
519{
520    // TODO: Memberwise reset
521    memset(this, 0, sizeof(*this));
522}
523
524jsonsl_t
525Match::jsn_alloc()
526{
527    return jsonsl_new(Limits::PARSER_DEPTH);
528}
529
530void
531Match::jsn_free(jsonsl_t jsn)
532{
533    jsonsl_destroy(jsn);
534}
535
536typedef struct {
537    int err;
538    int rootcount;
539    int flags;
540    int maxdepth;
541} validate_ctx;
542
543static int
544validate_err_callback(jsonsl_t jsn,
545    jsonsl_error_t err, jsonsl_state_st *, jsonsl_char_t *)
546{
547    validate_ctx *ctx = (validate_ctx *)jsn->data;
548    ctx->err = err;
549    // printf("ERROR: AT=%s\n", at);
550    return 0;
551}
552
553static void
554validate_callback(jsonsl_t jsn, jsonsl_action_t action,
555    jsonsl_state_st *state, const jsonsl_char_t *)
556{
557    validate_ctx *ctx = reinterpret_cast<validate_ctx*>(jsn->data);
558
559    if (ctx->maxdepth > -1 &&
560            state->level - 1 == static_cast<unsigned>(ctx->maxdepth)) {
561        ctx->err = Validator::ETOODEEP;
562        jsonsl_stop(jsn);
563        return;
564    }
565
566    if (state->level == 1) {
567        // Root element
568        ctx->rootcount++;
569
570    } else if (state->level == 2 && action == JSONSL_ACTION_PUSH) {
571        if (ctx->flags & Validator::VALUE_PRIMITIVE) {
572            if (JSONSL_STATE_IS_CONTAINER(state)) {
573                ctx->err = Validator::ENOTPRIMITIVE;
574                jsonsl_stop(jsn);
575            }
576        }
577
578        if (ctx->flags & Validator::VALUE_SINGLE) {
579            auto *parent = jsonsl_last_state(jsn, state);
580            int has_multielem = 0;
581            if (parent->type == JSONSL_T_LIST && parent->nelem > 1) {
582                has_multielem = 1;
583            } else if (parent->type == JSONSL_T_OBJECT && parent->nelem > 2) {
584                has_multielem = 1;
585            }
586
587            if (has_multielem) {
588                ctx->err = Validator::EMULTIELEM;
589                jsonsl_stop(jsn);
590            }
591        }
592    }
593}
594
595static const Loc validate_ARRAY_PRE("[", 1);
596static const Loc validate_ARRAY_POST("]", 1);
597static const Loc validate_DICT_PRE("{\"k\":", 5);
598static const Loc validate_DICT_POST("}", 1);
599static const Loc validate_NOOP(NULL, 0);
600
601int
602Validator::validate(const char *s, size_t n, jsonsl_t jsn, int maxdepth, int mode)
603{
604    int need_free_jsn = 0;
605    int type = mode & PARENT_MASK;
606    int flags = mode & VALUE_MASK;
607    const Loc *l_pre, *l_post;
608
609    validate_ctx ctx = { 0,0 };
610    if (jsn == NULL) {
611        jsn = jsonsl_new(Limits::PARSER_DEPTH);
612        need_free_jsn = 1;
613    }
614
615    jsn->action_callback_POP = NULL;
616    jsn->action_callback_PUSH = NULL;
617
618    jsn->action_callback = validate_callback;
619    jsn->error_callback = validate_err_callback;
620
621    // Set the maximum level. This name is a bit misleading as it's
622    // actually the level number (inclusive) beyond which we won't get
623    // called. Cutting down on the number of callbacks can significantly
624    // benefit CPU, especially for larger JSON documents with many tokens.
625    if (maxdepth != -1) {
626        if (type != PARENT_NONE) {
627            // Don't count the wrapper in the depth
628            maxdepth++;
629        }
630        // Set the callback to be 1 more (i.e. max_level+2) so we can
631        // get notified about depth violations
632        jsn->max_callback_level = maxdepth + 2;
633    } else if (flags) {
634        // Set the callback to include the "wrapper" container and our
635        // actual value
636        jsn->max_callback_level = 3;
637    } else {
638        // only care about the actual value.
639        jsn->max_callback_level = 2;
640    }
641
642    ctx.maxdepth = maxdepth;
643    ctx.flags = flags;
644
645    jsn->call_OBJECT = 1;
646    jsn->call_LIST = 1;
647    jsn->call_STRING = 1;
648    jsn->call_SPECIAL = 1;
649    jsn->call_HKEY = 0;
650    jsn->call_UESCAPE = 0;
651    jsn->data = &ctx;
652
653    if (type == PARENT_NONE) {
654        l_pre = l_post = &validate_NOOP;
655    } else if (type == PARENT_ARRAY) {
656        l_pre = &validate_ARRAY_PRE;
657        l_post = &validate_ARRAY_POST;
658    } else if (type == PARENT_DICT) {
659        l_pre = &validate_DICT_PRE;
660        l_post = &validate_DICT_POST;
661    } else {
662        ctx.err = JSONSL_ERROR_GENERIC;
663        goto GT_ERR;
664    }
665
666    jsonsl_feed(jsn, l_pre->at, l_pre->length);
667    jsonsl_feed(jsn, s, n);
668
669    if (ctx.err == JSONSL_ERROR_SUCCESS) {
670        jsonsl_feed(jsn, l_post->at, l_post->length);
671    }
672
673    if (ctx.err == JSONSL_ERROR_SUCCESS) {
674        if (ctx.rootcount < 2) {
675            ctx.err = Validator::EPARTIAL;
676        } else if (ctx.rootcount > 2) {
677            ctx.err = Validator::EMULTIELEM;
678        }
679    }
680
681    GT_ERR:
682    if (need_free_jsn) {
683        jsonsl_destroy(jsn);
684    } else {
685        jsonsl_reset(jsn);
686    }
687    return ctx.err;
688}
689
690const char *
691Validator::errstr(int rv)
692{
693    if (rv <= JSONSL_ERROR_GENERIC) {
694        return jsonsl_strerror(static_cast<jsonsl_error_t>(rv));
695    }
696    switch (rv) {
697    case EMULTIELEM:
698        return "Found multiple elements (single expected)";
699    case EPARTIAL:
700        return "Found incomplete JSON";
701    default:
702        return "UNKNOWN";
703    }
704}
705