mirror of https://mirror.osredm.com/root/redis.git
442 lines
16 KiB
C
442 lines
16 KiB
C
/* Ultra‑lightweight top‑level JSON field extractor.
|
||
* Return the element directly as an expr.c token.
|
||
* This code is directly included inside expr.c.
|
||
*
|
||
* Copyright (c) 2025-Present, Redis Ltd.
|
||
* All rights reserved.
|
||
*
|
||
* Licensed under your choice of the Redis Source Available License 2.0
|
||
* (RSALv2) or the Server Side Public License v1 (SSPLv1).
|
||
*
|
||
* Originally authored by: Salvatore Sanfilippo.
|
||
*
|
||
* ------------------------------------------------------------------
|
||
*
|
||
* DESIGN GOALS:
|
||
*
|
||
* 1. Zero heap allocations while seeking the requested key.
|
||
* 2. A single parse (and therefore a single allocation, if needed)
|
||
* when the key finally matches.
|
||
* 3. Same subset‑of‑JSON coverage needed by expr.c:
|
||
* - Strings (escapes: \" \\ \n \r \t).
|
||
* - Numbers (double).
|
||
* - Booleans.
|
||
* - Null.
|
||
* - Flat arrays of the above primitives.
|
||
*
|
||
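* Any other value (nested object, etc.) returns NULL. Unsupported string
* escapes (\uXXXX, \b, \f) are not decoded: the character that follows
* the backslash is copied as-is.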
|
||
* Should be very easy to extend it in case in the future we want
|
||
* more for the FILTER option of VSIM.
|
||
* 4. No global state, so this file can be #included directly in expr.c.
|
||
*
|
||
* The only API expr.c uses directly is:
|
||
*
|
||
* exprtoken *jsonExtractField(const char *json, size_t json_len,
|
||
* const char *field, size_t field_len);
|
||
* ------------------------------------------------------------------ */
|
||
|
||
#include <ctype.h>
|
||
#include <string.h>
|
||
|
||
/* Forward declarations. jsonParseValueToken() must be declared here because
 * jsonParseArrayToken() calls it before its definition: the two functions
 * are mutually recursive. */
|
||
static int jsonSkipValue(const char **p, const char *end);
|
||
static exprtoken *jsonParseValueToken(const char **p, const char *end);
|
||
|
||
/* Return 1 if 'c' may appear inside a JSON number literal (decimal digit,
 * sign, decimal point or exponent marker), 0 otherwise.
 *
 * Callers hand us plain 'char' values read from the input; where char is
 * signed, bytes >= 0x80 arrive as negative ints, and passing those to
 * isdigit() is undefined behavior. JSON digits are always ASCII '0'..'9',
 * so a direct range test is both exact and defined for every int. */
static int jsonIsNumberChar(int c) {
    switch (c) {
    case '-':
    case '+':
    case '.':
    case 'e':
    case 'E':
        return 1;
    default:
        return c >= '0' && c <= '9';
    }
}
|
||
|
||
/* ========================== Fast skipping of JSON =========================
|
||
* The helpers here are designed to skip values without performing any
|
||
* allocation. This way, for the use case of this JSON parser, we are able
|
||
* to easily (and with good speed) skip fields and values we are not
|
||
* interested in. Then, later in the code, when we find the field we want
|
||
* to obtain, we finally call the functions that turn a given JSON value
|
||
* associated to a field into one of our expression tokens.
|
||
* ========================================================================== */
|
||
|
||
/* Move *p forward over any run of whitespace (as defined by isspace()),
 * stopping at the first non-space byte or at 'end', whichever comes first. */
static inline void jsonSkipWhiteSpaces(const char **p, const char *end) {
    const char *cur = *p;
    for (; cur < end; cur++) {
        if (!isspace((unsigned char)*cur)) break;
    }
    *p = cur;
}
|
||
|
||
/* Advance *p past a JSON string: opening quote, content, closing quote.
 * Escape sequences are not decoded, a backslash just makes the byte that
 * follows it opaque so that \" does not terminate the string.
 *
 * Returns 1 on success, with *p right after the closing quote.
 * Returns 0 if *p is not on a '"' or the string is unterminated; in that
 * case *p is never moved beyond 'end'. */
static int jsonSkipString(const char **p, const char *end) {
    if (*p >= end || **p != '"') return 0;
    (*p)++; /* Skip opening quote. */
    while (*p < end) {
        char c = **p;
        if (c == '\\') {
            /* A backslash as the very last byte has nothing to escape.
             * Blindly jumping two bytes here would push *p past 'end'. */
            if (end - *p < 2) return 0;
            (*p) += 2;
            continue;
        }
        (*p)++;
        if (c == '"') return 1; /* Closing quote consumed. */
    }
    return 0; /* Unterminated. */
}
|
||
|
||
/* Skip a whole array or object without looking at its content.
 *
 * On entry *p is expected to be on 'opener' ('[' or '{'); the function
 * steps over it and then scans forward keeping a nesting counter for
 * the opener/closer pair only. Strings are skipped as a unit through
 * jsonSkipString(), so brackets inside string content are not counted.
 * Brackets of the other kind are treated like any other byte.
 *
 * Returns 1 with *p just after the matching closer, 0 if the input ends
 * first or a string inside the aggregate is unterminated. */
static int jsonSkipBracketed(const char **p, const char *end,
                             char opener, char closer) {
    int nesting = 1; /* Accounts for the opener we are about to step over. */

    for ((*p)++; *p < end && nesting > 0; ) {
        const char ch = **p;

        if (ch == '"') {
            /* jsonSkipString() leaves *p after the closing quote, so we
             * go straight back to the loop test without advancing. */
            if (!jsonSkipString(p, end)) return 0;
            continue;
        }

        if (ch == opener)
            nesting++;
        else if (ch == closer)
            nesting--;

        (*p)++; /* Any non-string byte is consumed one at a time. */
    }

    /* Zero means the matching closer was seen before running out of input. */
    return nesting == 0;
}
|
||
|
||
/* Skip the literal 'lit' ("true", "false", "null", ...) if the input at *p
 * starts with it.
 *
 * Returns 1 and advances *p by strlen(lit) on a match, otherwise returns 0
 * and leaves *p untouched. What follows the literal is not inspected.
 *
 * The bounds check compares lengths rather than computing *p + len, since
 * forming a pointer beyond one-past-the-end of the buffer is undefined
 * behavior even if it is never dereferenced. */
static int jsonSkipLiteral(const char **p, const char *end, const char *lit) {
    size_t litlen = strlen(lit);

    if (*p > end || (size_t)(end - *p) < litlen) return 0;
    if (memcmp(*p, lit, litlen) != 0) return 0;
    *p += litlen;
    return 1;
}
|
||
|
||
/* Skip over a number by consuming every consecutive byte accepted by
 * jsonIsNumberChar(). The format of the number is deliberately not
 * validated: for skipping purposes it is enough to step over anything
 * that looks like part of a numeric literal.
 *
 * Returns 1 if at least one byte was consumed, 0 otherwise. */
static int jsonSkipNumber(const char **p, const char *end) {
    const char *cur = *p;

    while (cur < end && jsonIsNumberChar(*cur)) cur++;

    int consumed = cur > *p;
    *p = cur;
    return consumed;
}
|
||
|
||
/* Skip one JSON value of any kind, after skipping leading whitespace.
 * The first byte of the value selects the specialized skipper; anything
 * that is not a string, aggregate or known literal is handed to the
 * number skipper. Returns 1 on success, 0 on error or empty input. */
static int jsonSkipValue(const char **p, const char *end) {
    jsonSkipWhiteSpaces(p, end);
    if (*p >= end) return 0;

    const char first = **p;
    if (first == '"') return jsonSkipString(p, end);
    if (first == '{') return jsonSkipBracketed(p, end, '{', '}');
    if (first == '[') return jsonSkipBracketed(p, end, '[', ']');
    if (first == 't') return jsonSkipLiteral(p, end, "true");
    if (first == 'f') return jsonSkipLiteral(p, end, "false");
    if (first == 'n') return jsonSkipLiteral(p, end, "null");
    return jsonSkipNumber(p, end);
}
|
||
|
||
/* =========================== JSON to exprtoken ============================
|
||
* The functions below convert a given json value to the equivalent
|
||
* expression token structure.
|
||
* ========================================================================== */
|
||
|
||
static exprtoken *jsonParseStringToken(const char **p, const char *end) {
|
||
if (*p >= end || **p != '"') return NULL;
|
||
const char *start = ++(*p);
|
||
int esc = 0; size_t len = 0; int has_esc = 0;
|
||
const char *q = *p;
|
||
while (q < end) {
|
||
if (esc) { esc = 0; q++; len++; has_esc = 1; continue; }
|
||
if (*q == '\\') { esc = 1; q++; continue; }
|
||
if (*q == '"') break;
|
||
q++; len++;
|
||
}
|
||
if (q >= end || *q != '"') return NULL; // Unterminated string
|
||
exprtoken *t = exprNewToken(EXPR_TOKEN_STR);
|
||
|
||
if (!has_esc) {
|
||
// No escapes, we can point directly into the original JSON string.
|
||
t->str.start = (char*)start; t->str.len = len; t->str.heapstr = NULL;
|
||
} else {
|
||
// Escapes present, need to allocate and copy/process escapes.
|
||
char *dst = RedisModule_Alloc(len + 1);
|
||
|
||
t->str.start = t->str.heapstr = dst; t->str.len = len;
|
||
const char *r = start; esc = 0;
|
||
while (r < q) {
|
||
if (esc) {
|
||
switch (*r) {
|
||
// Supported escapes from Goal 3.
|
||
case 'n': *dst='\n'; break;
|
||
case 'r': *dst='\r'; break;
|
||
case 't': *dst='\t'; break;
|
||
case '\\': *dst='\\'; break;
|
||
case '"': *dst='\"'; break;
|
||
// Escapes (like \uXXXX, \b, \f) are not supported for now,
|
||
// we just copy them verbatim.
|
||
default: *dst=*r; break;
|
||
}
|
||
dst++; esc = 0; r++; continue;
|
||
}
|
||
if (*r == '\\') { esc = 1; r++; continue; }
|
||
*dst++ = *r++;
|
||
}
|
||
*dst = '\0'; // Null-terminate the allocated string.
|
||
}
|
||
*p = q + 1; // Advance the main pointer past the closing quote.
|
||
return t;
|
||
}
|
||
|
||
static exprtoken *jsonParseNumberToken(const char **p, const char *end) {
|
||
// Use a buffer to extract the number literal for parsing with strtod().
|
||
char buf[256]; int idx = 0;
|
||
const char *start = *p; // For strtod partial failures check.
|
||
|
||
// Copy potential number characters to buffer.
|
||
while (*p < end && idx < (int)sizeof(buf)-1 && jsonIsNumberChar(**p)) {
|
||
buf[idx++] = **p;
|
||
(*p)++;
|
||
}
|
||
buf[idx]='\0'; // Null-terminate buffer.
|
||
|
||
if (idx==0) return NULL; // No number characters found.
|
||
|
||
char *ep; // End pointer for strtod validation.
|
||
double v = strtod(buf, &ep);
|
||
|
||
/* Check if strtod() consumed the entire buffer content.
|
||
* If not, the number format was invalid. */
|
||
if (*ep!='\0') {
|
||
// strtod() failed; rewind p to the start and return NULL
|
||
*p = start;
|
||
return NULL;
|
||
}
|
||
|
||
// If strtod() succeeded, create and return the token..
|
||
exprtoken *t = exprNewToken(EXPR_TOKEN_NUM);
|
||
t->num = v;
|
||
return t;
|
||
}
|
||
|
||
static exprtoken *jsonParseLiteralToken(const char **p, const char *end, const char *lit, int type, double num) {
|
||
size_t l = strlen(lit);
|
||
|
||
// Ensure we don't read past 'end'.
|
||
if ((*p + l) > end) return NULL;
|
||
|
||
if (strncmp(*p, lit, l) != 0) return NULL; // Literal doesn't match.
|
||
|
||
// Check that the character *after* the literal is a valid JSON delimiter
|
||
// (whitespace, comma, closing bracket/brace, or end of input)
|
||
// This prevents matching "trueblabla" as "true".
|
||
if ((*p + l) < end) {
|
||
char next_char = *(*p + l);
|
||
if (!isspace((unsigned char)next_char) && next_char!=',' &&
|
||
next_char!=']' && next_char!='}') {
|
||
return NULL; // Invalid character following literal.
|
||
}
|
||
}
|
||
|
||
// Literal matched and is correctly terminated.
|
||
*p += l;
|
||
exprtoken *t = exprNewToken(type);
|
||
t->num = num;
|
||
return t;
|
||
}
|
||
|
||
static exprtoken *jsonParseArrayToken(const char **p, const char *end) {
|
||
if (*p >= end || **p != '[') return NULL;
|
||
(*p)++; // Skip '['.
|
||
jsonSkipWhiteSpaces(p,end);
|
||
|
||
exprtoken *t = exprNewToken(EXPR_TOKEN_TUPLE);
|
||
t->tuple.len = 0; t->tuple.ele = NULL; size_t alloc = 0;
|
||
|
||
// Handle empty array [].
|
||
if (*p < end && **p == ']') {
|
||
(*p)++; // Skip ']'.
|
||
return t;
|
||
}
|
||
|
||
// Parse array elements.
|
||
while (1) {
|
||
exprtoken *ele = jsonParseValueToken(p,end);
|
||
if (!ele) {
|
||
exprTokenRelease(t); // Clean up partially built array token.
|
||
return NULL;
|
||
}
|
||
|
||
// Grow allocated space for elements if needed.
|
||
if (t->tuple.len == alloc) {
|
||
size_t newsize = alloc ? alloc * 2 : 4;
|
||
// Check for potential overflow if newsize becomes huge.
|
||
if (newsize < alloc) {
|
||
exprTokenRelease(ele);
|
||
exprTokenRelease(t);
|
||
return NULL;
|
||
}
|
||
exprtoken **newele = RedisModule_Realloc(t->tuple.ele,
|
||
sizeof(exprtoken*)*newsize);
|
||
t->tuple.ele = newele;
|
||
alloc = newsize;
|
||
}
|
||
t->tuple.ele[t->tuple.len++] = ele; // Add element.
|
||
|
||
jsonSkipWhiteSpaces(p,end);
|
||
if (*p>=end) {
|
||
// Unterminated array. Note that this check is crucial because
|
||
// previous value parsed may seek 'p' to 'end'.
|
||
exprTokenRelease(t);
|
||
return NULL;
|
||
}
|
||
|
||
// Check for comma (more elements) or closing bracket.
|
||
if (**p == ',') {
|
||
(*p)++; // Skip ','
|
||
jsonSkipWhiteSpaces(p,end); // Skip whitespace before next element
|
||
continue; // Parse next element
|
||
} else if (**p == ']') {
|
||
(*p)++; // Skip ']'
|
||
return t; // End of array
|
||
} else {
|
||
// Unexpected character (not ',' or ']')
|
||
exprTokenRelease(t);
|
||
return NULL;
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Turn a JSON value into an expr token. */
|
||
static exprtoken *jsonParseValueToken(const char **p, const char *end) {
|
||
jsonSkipWhiteSpaces(p,end);
|
||
if (*p >= end) return NULL;
|
||
|
||
switch (**p) {
|
||
case '"': return jsonParseStringToken(p,end);
|
||
case '[': return jsonParseArrayToken(p,end);
|
||
case '{': return NULL; // No nested elements support for now.
|
||
case 't': return jsonParseLiteralToken(p,end,"true",EXPR_TOKEN_NUM,1);
|
||
case 'f': return jsonParseLiteralToken(p,end,"false",EXPR_TOKEN_NUM,0);
|
||
case 'n': return jsonParseLiteralToken(p,end,"null",EXPR_TOKEN_NULL,0);
|
||
default:
|
||
// Check if it starts like a number.
|
||
if (isdigit((unsigned char)**p) || **p=='-' || **p=='+') {
|
||
return jsonParseNumberToken(p,end);
|
||
}
|
||
// Anything else is an unsupported type or malformed JSON.
|
||
return NULL;
|
||
}
|
||
}
|
||
|
||
/* ============================== Fast key seeking ========================== */
|
||
|
||
/* Locate the value associated with the *toplevel* field 'field' (of length
 * 'flen') inside the JSON object stored in [json, end).
 *
 * Returns a pointer to the first byte of the value, or NULL if the field
 * is not found or the JSON is malformed. No allocation is performed: the
 * keys are compared in place and the values of non matching keys are
 * skipped with jsonSkipValue().
 *
 * WARNING: keys are compared with memcmp() against their raw content, so
 * a key containing escape sequences is not matched against its unescaped
 * form in 'field'. */
static const char *jsonSeekField(const char *json, const char *end,
                                 const char *field, size_t flen) {
    const char *cur = json;

    jsonSkipWhiteSpaces(&cur, end);
    if (cur >= end || *cur != '{') return NULL; /* Not an object. */
    cur++;

    for (;;) {
        /* Here we expect a key. Both '}' (no more members, so the field
         * was not found) and any other non-string byte stop the search. */
        jsonSkipWhiteSpaces(&cur, end);
        if (cur >= end || *cur != '"') return NULL;

        /* Let jsonSkipString() find the end of the key: afterwards 'cur'
         * is one past the closing quote, so the content is what lies
         * between 'key' and cur-1. */
        const char *key = cur + 1;
        if (!jsonSkipString(&cur, end)) return NULL;
        size_t keylen = (size_t)(cur - key) - 1;

        int found = (keylen == flen) && memcmp(key, field, flen) == 0;

        /* A ':' followed by a value must come next. */
        jsonSkipWhiteSpaces(&cur, end);
        if (cur >= end || *cur != ':') return NULL;
        cur++;

        jsonSkipWhiteSpaces(&cur, end);
        if (cur >= end) return NULL;

        if (found) return cur; /* 'cur' is at the start of the value. */

        /* Not our field: jump over its value. */
        if (!jsonSkipValue(&cur, end)) return NULL;

        /* Only a ',' lets the scan go on. A '}' means the object ended
         * without the field, anything else is malformed JSON. */
        jsonSkipWhiteSpaces(&cur, end);
        if (cur >= end || *cur != ',') return NULL;
        cur++;
    }
}
|
||
|
||
/* This is the only real API that this file conceptually exports (it is
|
||
* inlined, actually). */
|
||
exprtoken *jsonExtractField(const char *json, size_t json_len,
|
||
const char *field, size_t field_len)
|
||
{
|
||
const char *end = json + json_len;
|
||
const char *valptr = jsonSeekField(json,end,field,field_len);
|
||
if (!valptr) return NULL;
|
||
|
||
/* Key found, valptr points to the start of the value.
|
||
* Convert it into an expression token object. */
|
||
return jsonParseValueToken(&valptr,end);
|
||
}
|