From 611c950293ae34dcef148ec62c9dd9626d7dc9e3 Mon Sep 17 00:00:00 2001
From: Shockingly Good
Date: Mon, 28 Oct 2024 14:26:29 +0100
Subject: [PATCH 01/42] Fix crash in RM_GetCurrentUserName() when the user
 isn't accessible (#13619)

The crash happens whenever the user isn't accessible, for example, when
it isn't set for the context (when the context is temporary) or in some
other cases, like `notifyKeyspaceEvent`. To properly check for ACL
compliance, we need to get the user name and then the user in order to
invoke other APIs. However, that is not possible if the call crashes,
and it cannot be worked around in module code, since we don't know (and
**shouldn't know**!) when the user is available and when it is not.
---
 src/module.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/module.c b/src/module.c
index 5238408c7..2b6e625f1 100644
--- a/src/module.c
+++ b/src/module.c
@@ -9678,6 +9678,12 @@ RedisModuleString *RM_GetModuleUserACLString(RedisModuleUser *user) {
  * The returned string must be released with RedisModule_FreeString() or by
  * enabling automatic memory management. */
 RedisModuleString *RM_GetCurrentUserName(RedisModuleCtx *ctx) {
+    /* Sometimes, the user isn't passed along the call stack or isn't
+     * even set, so we need to check for the members to avoid crashes. */
+    if (ctx->client == NULL || ctx->client->user == NULL || ctx->client->user->name == NULL) {
+        return NULL;
+    }
+
     return RM_CreateString(ctx,ctx->client->user->name,sdslen(ctx->client->user->name));
 }

From 2ec78d262d06e8097e12278bd10c4c0216f4d1c9 Mon Sep 17 00:00:00 2001
From: Moti Cohen
Date: Tue, 29 Oct 2024 13:07:26 +0200
Subject: [PATCH 02/42] Add KEYSIZES section to INFO (#13592)

This PR adds a new section to the `INFO` command output, called
`keysizes`. This section provides detailed statistics on the
distribution of key sizes for each data type (strings, lists, sets,
hashes and zsets) within the dataset. The distribution is tracked using
a base-2 logarithmic histogram.

# Motivation
Currently, Redis lacks a built-in feature to track key sizes and item
sizes per data type at a granular level. Understanding the distribution
of key sizes is critical for monitoring memory usage and optimizing
performance, particularly in large datasets. This enhancement will
allow users to inspect the size distribution of keys directly from the
`INFO` command, assisting with performance analysis and capacity
planning.

# Changes
New Section in `INFO` Command: A new section called `keysizes` has been
added to the `INFO` command output. This section reports a per-database,
per-type histogram of key sizes. It provides insights into how many keys
fall into specific size ranges (represented as powers of 2).

**Example output:**
```
127.0.0.1:6379> INFO keysizes
# Keysizes
db0_distrib_strings_sizes:1=19,2=655,512=100899,1K=31,2K=29,4K=23,8K=16,16K=3,32K=2
db0_distrib_lists_items:1=5784492,32=3558,64=1047,128=676,256=533,512=218,4K=1,8K=42
db0_distrib_sets_items:1=735564,2=50612,8=21462,64=1365,128=974,2K=292,4K=154,8K=89,
db0_distrib_hashes_items:2=1,4=544,32=141169,64=207329,128=4349,256=136226,1K=1
```

## Future Use Cases:
The key size distribution is collected per slot as well, laying the
groundwork for future enhancements related to Redis Cluster.
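For reference, each reported bin is simply floor(log2(len)), matching the
branch-free log2ceil() helper this patch adds to util.h. A minimal
illustrative sketch of the mapping (the `keysizes_bin` name is hypothetical,
and `__builtin_clzll` assumes GCC/Clang):

```c
#include <stdio.h>

/* Which KEYSIZES bin a length falls into: floor(log2(len)).
 * Valid for 0 < len < 2^63, like the util.h helper. */
static int keysizes_bin(unsigned long long len) {
    return 63 - __builtin_clzll(len);
}

int main(void) {
    /* A 1400-byte string satisfies 1024 <= 1400 < 2048, so it lands in
     * bin 10 and is reported under the "1K" label. */
    printf("%d\n", keysizes_bin(1400)); /* prints 10 */
    return 0;
}
```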
--- src/bitops.c | 48 +++- src/db.c | 58 ++++- src/kvstore.c | 73 ++++-- src/kvstore.h | 18 ++ src/lazyfree.c | 2 +- src/module.c | 10 +- src/object.c | 12 + src/server.c | 58 ++++- src/server.h | 9 +- src/t_hash.c | 34 ++- src/t_list.c | 28 ++- src/t_set.c | 24 +- src/t_string.c | 19 +- src/t_zset.c | 23 +- src/util.c | 7 + src/util.h | 13 + tests/unit/info-keysizes.tcl | 454 +++++++++++++++++++++++++++++++++++ tests/unit/memefficiency.tcl | 4 +- tests/unit/other.tcl | 17 +- 19 files changed, 836 insertions(+), 75 deletions(-) create mode 100644 tests/unit/info-keysizes.tcl diff --git a/src/bitops.c b/src/bitops.c index 2222c05ea..44312c6af 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -489,22 +489,27 @@ int getBitfieldTypeFromArgument(client *c, robj *o, int *sign, int *bits) { * bits to a string object. The command creates or pad with zeroes the string * so that the 'maxbit' bit can be addressed. The object is finally * returned. Otherwise if the key holds a wrong type NULL is returned and - * an error is sent to the client. */ -robj *lookupStringForBitCommand(client *c, uint64_t maxbit, int *dirty) { + * an error is sent to the client. + * + * (Must provide all the arguments to the function) + */ +static robj *lookupStringForBitCommand(client *c, uint64_t maxbit, + size_t *strOldSize, size_t *strGrowSize) +{ size_t byte = maxbit >> 3; robj *o = lookupKeyWrite(c->db,c->argv[1]); if (checkType(c,o,OBJ_STRING)) return NULL; - if (dirty) *dirty = 0; if (o == NULL) { o = createObject(OBJ_STRING,sdsnewlen(NULL, byte+1)); dbAdd(c->db,c->argv[1],o); - if (dirty) *dirty = 1; + *strGrowSize = byte + 1; + *strOldSize = 0; } else { o = dbUnshareStringValue(c->db,c->argv[1],o); - size_t oldlen = sdslen(o->ptr); + *strOldSize = sdslen(o->ptr); o->ptr = sdsgrowzero(o->ptr,byte+1); - if (dirty && oldlen != sdslen(o->ptr)) *dirty = 1; + *strGrowSize = sdslen(o->ptr) - *strOldSize; } return o; } @@ -561,8 +566,9 @@ void setbitCommand(client *c) { return; } - int dirty; - if ((o = lookupStringForBitCommand(c,bitoffset,&dirty)) == NULL) return; + size_t strOldSize, strGrowSize; + if ((o = lookupStringForBitCommand(c,bitoffset,&strOldSize,&strGrowSize)) == NULL) + return; /* Get current values */ byte = bitoffset >> 3; @@ -573,7 +579,7 @@ void setbitCommand(client *c) { /* Either it is newly created, changed length, or the bit changes before and after. * Note that the bitval here is actually a decimal number. * So we need to use `!!` to convert it to 0 or 1 for comparison. */ - if (dirty || (!!bitval != on)) { + if (strGrowSize || (!!bitval != on)) { /* Update byte with new bit value. */ byteval &= ~(1 << bit); byteval |= ((on & 0x1) << bit); @@ -581,6 +587,13 @@ void setbitCommand(client *c) { signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_STRING,"setbit",c->argv[1],c->db->id); server.dirty++; + + /* If this is not a new key (old size not 0) and size changed, then + * update the keysizes histogram. Otherwise, the histogram already + * updated in lookupStringForBitCommand() by calling dbAdd(). */ + if ((strOldSize > 0) && (strGrowSize != 0)) + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_STRING, + strOldSize, strOldSize + strGrowSize); } /* Return original value. */ @@ -1065,7 +1078,8 @@ struct bitfieldOp { void bitfieldGeneric(client *c, int flags) { robj *o; uint64_t bitoffset; - int j, numops = 0, changes = 0, dirty = 0; + int j, numops = 0, changes = 0; + size_t strOldSize, strGrowSize = 0; struct bitfieldOp *ops = NULL; /* Array of ops to execute at end. 
*/
     int owtype = BFOVERFLOW_WRAP; /* Overflow type. */
     int readonly = 1;
@@ -1159,7 +1173,7 @@ void bitfieldGeneric(client *c, int flags) {
         /* Lookup by making room up to the farthest bit reached by
          * this operation. */
         if ((o = lookupStringForBitCommand(c,
-            highest_write_offset,&dirty)) == NULL) {
+            highest_write_offset,&strOldSize,&strGrowSize)) == NULL) {
             zfree(ops);
             return;
         }
@@ -1209,7 +1223,7 @@ void bitfieldGeneric(client *c, int flags) {
                 setSignedBitfield(o->ptr,thisop->offset,
                                   thisop->bits,newval);
-                if (dirty || (oldval != newval))
+                if (strGrowSize || (oldval != newval))
                     changes++;
             } else {
                 addReplyNull(c);
             }
@@ -1243,7 +1257,7 @@ void bitfieldGeneric(client *c, int flags) {
                 setUnsignedBitfield(o->ptr,thisop->offset,
                                     thisop->bits,newval);
-                if (dirty || (oldval != newval))
+                if (strGrowSize || (oldval != newval))
                     changes++;
             } else {
                 addReplyNull(c);
             }
@@ -1286,6 +1300,14 @@ void bitfieldGeneric(client *c, int flags) {
     }
 
     if (changes) {
+
+        /* If this is not a new key (old size not 0) and the size changed, then
+         * update the keysizes histogram. Otherwise, the histogram was already
+         * updated in lookupStringForBitCommand() by calling dbAdd(). */
+        if ((strOldSize > 0) && (strGrowSize != 0))
+            updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_STRING,
+                               strOldSize, strOldSize + strGrowSize);
+
         signalModifiedKey(c,c->db,c->argv[1]);
         notifyKeyspaceEvent(NOTIFY_STRING,"setbit",c->argv[1],c->db->id);
         server.dirty += changes;
diff --git a/src/db.c b/src/db.c
index 49c374fb1..42248a20f 100644
--- a/src/db.c
+++ b/src/db.c
@@ -21,6 +21,8 @@
  * C-level DB API
  *----------------------------------------------------------------------------*/
 
+static_assert(MAX_KEYSIZES_TYPES == OBJ_TYPE_BASIC_MAX, "Must be equal");
+
 /* Flags for expireIfNeeded */
 #define EXPIRE_FORCE_DELETE_EXPIRED 1
 #define EXPIRE_AVOID_DELETE_EXPIRED 2
@@ -46,6 +48,48 @@ void updateLFU(robj *val) {
     val->lru = (LFUGetTimeInMinutes()<<8) | counter;
 }
 
+/*
+ * Update the histogram of key sizes
+ *
+ * It is used to track the distribution of key sizes in the dataset. It is
+ * updated every time a key's length is modified. Available to the user via
+ * the INFO command.
+ *
+ * The histogram is a base-2 logarithmic histogram with 48 bins (see
+ * MAX_KEYSIZES_BINS). The i'th bin counts the keys whose size falls in the
+ * range [2^i, 2^(i+1)). oldLen/newLen must be smaller than 2^48; a value of
+ * 0 means the key is being created (oldLen) or deleted (newLen),
+ * respectively. Each data type has its own histogram, kept per database (in
+ * addition, there is a histogram per slot for future cluster use).
+ *
+ * Examples of LEN values and their corresponding bins in the histogram:
+ * [1,2)->0 [2,4)->1 [4,8)->2 [8,16)->3
+ */
+void updateKeysizesHist(redisDb *db, int didx, uint32_t type, uint64_t oldLen, uint64_t newLen) {
+    if(unlikely(type >= OBJ_TYPE_BASIC_MAX))
+        return;
+
+    kvstoreDictMetadata *dictMeta = kvstoreGetDictMetadata(db->keys, didx);
+    kvstoreMetadata *kvstoreMeta = kvstoreGetMetadata(db->keys);
+
+    if (oldLen != 0) {
+        int old_bin = log2ceil(oldLen);
+        debugServerAssertWithInfo(server.current_client, NULL, old_bin < MAX_KEYSIZES_BINS);
+        /* If, following a key deletion, this was the last key in the slot's
+         * dict, the slot's dict might get released as well, so verify that
+         * the metadata is not NULL.
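+         * (Illustrative example of the update as a whole, using the binning
+         * described above: updateKeysizesHist(db, didx, OBJ_STRING, 10, 100)
+         * decrements bin 3, i.e. [8,16), here, and increments bin 6, i.e.
+         * [64,128), in the block below.)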
*/ + if(dictMeta) dictMeta->keysizes_hist[type][old_bin]--; + kvstoreMeta->keysizes_hist[type][old_bin]--; + } + + if (newLen != 0) { + int new_bin = log2ceil(newLen); + debugServerAssertWithInfo(server.current_client, NULL, new_bin < MAX_KEYSIZES_BINS); + /* If following a key deletion it is last one in slot's dict, then + * slot's dict might get released as well. Verify if metadata is not NULL. */ + if(dictMeta) dictMeta->keysizes_hist[type][new_bin]++; + kvstoreMeta->keysizes_hist[type][new_bin]++; + } +} + /* Lookup a key for read or write operations, or return NULL if the key is not * found in the specified DB. This function implements the functionality of * lookupKeyRead(), lookupKeyWrite() and their ...WithFlags() variants. @@ -205,6 +249,7 @@ static dictEntry *dbAddInternal(redisDb *db, robj *key, robj *val, int update_if kvstoreDictSetVal(db->keys, slot, de, val); signalKeyAsReady(db, key, val->type); notifyKeyspaceEvent(NOTIFY_NEW,"new",key,db->id); + updateKeysizesHist(db, slot, val->type, 0, getObjectLength(val)); /* add hist */ return de; } @@ -250,6 +295,7 @@ int dbAddRDBLoad(redisDb *db, sds key, robj *val) { int slot = getKeySlot(key); dictEntry *de = kvstoreDictAddRaw(db->keys, slot, key, NULL); if (de == NULL) return 0; + updateKeysizesHist(db, slot, val->type, 0, getObjectLength(val)); /* add hist */ initObjectLRUOrLFU(val); kvstoreDictSetVal(db->keys, slot, de, val); return 1; @@ -273,6 +319,9 @@ static void dbSetValue(redisDb *db, robj *key, robj *val, int overwrite, dictEnt serverAssertWithInfo(NULL,key,de != NULL); robj *old = dictGetVal(de); + /* Remove old key from keysizes histogram */ + updateKeysizesHist(db, slot, old->type, getObjectLength(old), 0); /* remove hist */ + val->lru = old->lru; if (overwrite) { @@ -291,6 +340,9 @@ static void dbSetValue(redisDb *db, robj *key, robj *val, int overwrite, dictEnt } kvstoreDictSetVal(db->keys, slot, de, val); + /* Add new key to keysizes histogram */ + updateKeysizesHist(db, slot, val->type, 0, getObjectLength(val)); + /* if hash with HFEs, take care to remove from global HFE DS */ if (old->type == OBJ_HASH) hashTypeRemoveFromExpires(&db->hexpires, old); @@ -404,6 +456,9 @@ int dbGenericDelete(redisDb *db, robj *key, int async, int flags) { if (de) { robj *val = dictGetVal(de); + /* remove key from histogram */ + updateKeysizesHist(db, slot, val->type, getObjectLength(val), 0); + /* If hash object with expiry on fields, remove it from HFE DS of DB */ if (val->type == OBJ_HASH) hashTypeRemoveFromExpires(&db->hexpires, val); @@ -599,7 +654,8 @@ redisDb *initTempDb(void) { redisDb *tempDb = zcalloc(sizeof(redisDb)*server.dbnum); for (int i=0; itype->userdata; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + kvstoreDictMetaBase *metadata = (kvstoreDictMetaBase *)dictMetadata(d); listAddNodeTail(kvs->rehashing, d); metadata->rehashing_node = listLast(kvs->rehashing); @@ -201,7 +209,7 @@ static void kvstoreDictRehashingStarted(dict *d) { * the old ht size of the dictionary from the total sum of buckets for a DB. 
*/ static void kvstoreDictRehashingCompleted(dict *d) { kvstore *kvs = d->type->userdata; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + kvstoreDictMetaBase *metadata = (kvstoreDictMetaBase *)dictMetadata(d); if (metadata->rehashing_node) { listDelNode(kvs->rehashing, metadata->rehashing_node); metadata->rehashing_node = NULL; @@ -214,10 +222,15 @@ static void kvstoreDictRehashingCompleted(dict *d) { kvs->overhead_hashtable_rehashing -= from; } -/* Returns the size of the DB dict metadata in bytes. */ -static size_t kvstoreDictMetadataSize(dict *d) { +/* Returns the size of the DB dict base metadata in bytes. */ +static size_t kvstoreDictMetaBaseSize(dict *d) { UNUSED(d); - return sizeof(kvstoreDictMetadata); + return sizeof(kvstoreDictMetaBase); +} +/* Returns the size of the DB dict extended metadata in bytes. */ +static size_t kvstoreDictMetadataExtendSize(dict *d) { + UNUSED(d); + return sizeof(kvstoreDictMetaEx); } /**********************************/ @@ -232,7 +245,13 @@ kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { * for the dict cursor, see kvstoreScan */ assert(num_dicts_bits <= 16); - kvstore *kvs = zcalloc(sizeof(*kvs)); + /* Calc kvstore size */ + size_t kvsize = sizeof(kvstore); + /* Conditionally calc also histogram size */ + if (flags & KVSTORE_ALLOC_META_KEYS_HIST) + kvsize += sizeof(kvstoreMetadata); + + kvstore *kvs = zcalloc(kvsize); memcpy(&kvs->dtype, type, sizeof(kvs->dtype)); kvs->flags = flags; @@ -243,7 +262,10 @@ kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { assert(!type->rehashingStarted); assert(!type->rehashingCompleted); kvs->dtype.userdata = kvs; - kvs->dtype.dictMetadataBytes = kvstoreDictMetadataSize; + if (flags & KVSTORE_ALLOC_META_KEYS_HIST) + kvs->dtype.dictMetadataBytes = kvstoreDictMetadataExtendSize; + else + kvs->dtype.dictMetadataBytes = kvstoreDictMetaBaseSize; kvs->dtype.rehashingStarted = kvstoreDictRehashingStarted; kvs->dtype.rehashingCompleted = kvstoreDictRehashingCompleted; @@ -263,7 +285,6 @@ kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { kvs->bucket_count = 0; kvs->overhead_hashtable_lut = 0; kvs->overhead_hashtable_rehashing = 0; - return kvs; } @@ -272,9 +293,13 @@ void kvstoreEmpty(kvstore *kvs, void(callback)(dict*)) { dict *d = kvstoreGetDict(kvs, didx); if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + kvstoreDictMetaBase *metadata = (kvstoreDictMetaBase *)dictMetadata(d); if (metadata->rehashing_node) metadata->rehashing_node = NULL; + if (kvs->flags & KVSTORE_ALLOC_META_KEYS_HIST) { + kvstoreDictMetaEx *metaExt = (kvstoreDictMetaEx *) metadata; + memset(&metaExt->meta.keysizes_hist, 0, sizeof(metaExt->meta.keysizes_hist)); + } dictEmpty(d, callback); freeDictIfNeeded(kvs, didx); } @@ -296,7 +321,7 @@ void kvstoreRelease(kvstore *kvs) { dict *d = kvstoreGetDict(kvs, didx); if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + kvstoreDictMetaBase *metadata = (kvstoreDictMetaBase *)dictMetadata(d); if (metadata->rehashing_node) metadata->rehashing_node = NULL; dictRelease(d); @@ -330,11 +355,15 @@ unsigned long kvstoreBuckets(kvstore *kvs) { size_t kvstoreMemUsage(kvstore *kvs) { size_t mem = sizeof(*kvs); + size_t metaSize = sizeof(kvstoreDictMetaBase); + if (kvs->flags & KVSTORE_ALLOC_META_KEYS_HIST) + metaSize = sizeof(kvstoreDictMetaEx); + unsigned long long keys_count = kvstoreSize(kvs); mem += keys_count * dictEntryMemUsage() + 
kvstoreBuckets(kvs) * sizeof(dictEntry*) +
-           kvs->allocated_dicts * (sizeof(dict) + kvstoreDictMetadataSize(NULL));
+           kvs->allocated_dicts * (sizeof(dict) + metaSize);
 
     /* Values are dict* shared with kvs->dicts */
     mem += listLength(kvs->rehashing) * sizeof(listNode);
@@ -785,7 +814,7 @@ void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn)
 
         /* After defragmenting the dict, update its corresponding
          * rehashing node in the kvstore's rehashing list. */
-        kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(*d);
+        kvstoreDictMetaBase *metadata = (kvstoreDictMetaBase *)dictMetadata(*d);
         if (metadata->rehashing_node)
             metadata->rehashing_node->value = *d;
 }
@@ -856,6 +885,19 @@ int kvstoreDictDelete(kvstore *kvs, int didx, const void *key) {
     return ret;
 }
 
+kvstoreDictMetadata *kvstoreGetDictMetadata(kvstore *kvs, int didx) {
+    dict *d = kvstoreGetDict(kvs, didx);
+    if ((!d) || (!(kvs->flags & KVSTORE_ALLOC_META_KEYS_HIST)))
+        return NULL;
+
+    kvstoreDictMetaEx *metadata = (kvstoreDictMetaEx *)dictMetadata(d);
+    return &(metadata->meta);
+}
+
+kvstoreMetadata *kvstoreGetMetadata(kvstore *kvs) {
+    return (kvstoreMetadata *) &kvs->metadata;
+}
+
 #ifdef REDIS_TEST
 #include
 #include "testhelp.h"
@@ -1029,7 +1071,8 @@ int kvstoreTest(int argc, char **argv, int flags) {
     }
 
     TEST("Verify non-empty dict count is correctly updated") {
-        kvstore *kvs = kvstoreCreate(&KvstoreDictTestType, 2, KVSTORE_ALLOCATE_DICTS_ON_DEMAND);
+        kvstore *kvs = kvstoreCreate(&KvstoreDictTestType, 2,
+                                     KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_ALLOC_META_KEYS_HIST);
         for (int idx = 0; idx < 4; idx++) {
             for (i = 0; i < 16; i++) {
                 de = kvstoreDictAddRaw(kvs, idx, stringFromInt(i), NULL);
diff --git a/src/kvstore.h b/src/kvstore.h
index bce45fe4c..3c3f7948c 100644
--- a/src/kvstore.h
+++ b/src/kvstore.h
@@ -4,6 +4,21 @@
 #include "dict.h"
 #include "adlist.h"
 
+/* Maximum number of bins in the keysizes histogram */
+#define MAX_KEYSIZES_BINS 48
+#define MAX_KEYSIZES_TYPES 5 /* static_assert at db.c verifies == OBJ_TYPE_BASIC_MAX */
+
+/* When a kvstore is created with the flag `KVSTORE_ALLOC_META_KEYS_HIST`, the
+ * kvstore allocates and memsets a struct kvstoreMetadata on init, yet its
+ * contents are managed outside the kvstore. */
+typedef struct {
+    uint64_t keysizes_hist[MAX_KEYSIZES_TYPES][MAX_KEYSIZES_BINS];
+} kvstoreMetadata;
+
+/* Like kvstoreMetadata, but kept per dict */
+typedef struct {
+    uint64_t keysizes_hist[MAX_KEYSIZES_TYPES][MAX_KEYSIZES_BINS];
+} kvstoreDictMetadata;
+
 typedef struct _kvstore kvstore;
 typedef struct _kvstoreIterator kvstoreIterator;
 typedef struct _kvstoreDictIterator kvstoreDictIterator;
 
 typedef int (kvstoreExpandShouldSkipDictIndex)(int didx);
 
 #define KVSTORE_ALLOCATE_DICTS_ON_DEMAND (1<<0)
 #define KVSTORE_FREE_EMPTY_DICTS (1<<1)
+#define KVSTORE_ALLOC_META_KEYS_HIST (1<<2) /* Alloc keysizes histogram */
 kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags);
 void kvstoreEmpty(kvstore *kvs, void(callback)(dict*));
 void kvstoreRelease(kvstore *kvs);
@@ -71,6 +87,8 @@ void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val);
 dictEntry *kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index);
 void kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index);
 int kvstoreDictDelete(kvstore *kvs, int didx, const void *key);
+kvstoreDictMetadata *kvstoreGetDictMetadata(kvstore *kvs, int didx);
+kvstoreMetadata *kvstoreGetMetadata(kvstore *kvs);
 
 #ifdef REDIS_TEST
int kvstoreTest(int argc, char *argv[], int flags); diff --git a/src/lazyfree.c b/src/lazyfree.c index 858751757..c33bc923e 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -207,7 +207,7 @@ void emptyDbAsync(redisDb *db) { } kvstore *oldkeys = db->keys, *oldexpires = db->expires; ebuckets oldHfe = db->hexpires; - db->keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); + db->keys = kvstoreCreate(&dbDictType, slot_count_bits, flags | KVSTORE_ALLOC_META_KEYS_HIST); db->expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); db->hexpires = ebCreate(); atomicIncr(lazyfree_objects, kvstoreSize(oldkeys)); diff --git a/src/module.c b/src/module.c index 2b6e625f1..032c3557c 100644 --- a/src/module.c +++ b/src/module.c @@ -4171,15 +4171,7 @@ int RM_KeyType(RedisModuleKey *key) { * If the key pointer is NULL or the key is empty, zero is returned. */ size_t RM_ValueLength(RedisModuleKey *key) { if (key == NULL || key->value == NULL) return 0; - switch(key->value->type) { - case OBJ_STRING: return stringObjectLen(key->value); - case OBJ_LIST: return listTypeLength(key->value); - case OBJ_SET: return setTypeSize(key->value); - case OBJ_ZSET: return zsetLength(key->value); - case OBJ_HASH: return hashTypeLength(key->value, 0); /* OPEN: To subtract expired fields? */ - case OBJ_STREAM: return streamLength(key->value); - default: return 0; - } + return getObjectLength(key->value); } /* If the key is open for writing, remove it, and setup the key to diff --git a/src/object.c b/src/object.c index 2b42e7b3e..d065359fa 100644 --- a/src/object.c +++ b/src/object.c @@ -680,6 +680,18 @@ robj *tryObjectEncoding(robj *o) { return tryObjectEncodingEx(o, 1); } +size_t getObjectLength(robj *o) { + switch(o->type) { + case OBJ_STRING: return stringObjectLen(o); + case OBJ_LIST: return listTypeLength(o); + case OBJ_SET: return setTypeSize(o); + case OBJ_ZSET: return zsetLength(o); + case OBJ_HASH: return hashTypeLength(o, 0); + case OBJ_STREAM: return streamLength(o); + default: return 0; + } +} + /* Get a decoded version of an encoded object (returned as a new object). * If the object is already raw-encoded just increment the ref count. 
*/ robj *getDecodedObject(robj *o) { diff --git a/src/server.c b/src/server.c index 72208c7e2..054ad171f 100644 --- a/src/server.c +++ b/src/server.c @@ -2690,7 +2690,7 @@ void initServer(void) { flags |= KVSTORE_FREE_EMPTY_DICTS; } for (j = 0; j < server.dbnum; j++) { - server.db[j].keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); + server.db[j].keys = kvstoreCreate(&dbDictType, slot_count_bits, flags | KVSTORE_ALLOC_META_KEYS_HIST); server.db[j].expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); server.db[j].hexpires = ebCreate(); server.db[j].expires_cursor = 0; @@ -5521,7 +5521,7 @@ void releaseInfoSectionDict(dict *sec) { dict *genInfoSectionDict(robj **argv, int argc, char **defaults, int *out_all, int *out_everything) { char *default_sections[] = { "server", "clients", "memory", "persistence", "stats", "replication", - "cpu", "module_list", "errorstats", "cluster", "keyspace", NULL}; + "cpu", "module_list", "errorstats", "cluster", "keyspace", "keysizes", NULL}; if (!defaults) defaults = default_sections; @@ -6149,6 +6149,60 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { } } + /* keysizes */ + if (all_sections || (dictFind(section_dict,"keysizes") != NULL)) { + if (sections++) info = sdscat(info,"\r\n"); + info = sdscatprintf(info, "# Keysizes\r\n"); + + char *typestr[] = { + [OBJ_STRING] = "distrib_strings_sizes", + [OBJ_LIST] = "distrib_lists_items", + [OBJ_SET] = "distrib_sets_items", + [OBJ_ZSET] = "distrib_zsets_items", + [OBJ_HASH] = "distrib_hashes_items" + }; + serverAssert(sizeof(typestr)/sizeof(typestr[0]) == OBJ_TYPE_BASIC_MAX); + + for (int dbnum = 0; dbnum < server.dbnum; dbnum++) { + char *expSizeLabels[] = { + "1", "2", "4", "8", "16", "32", "64", "128", "256", "512", /* Byte */ + "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", /* Kilo */ + "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", /* Mega */ + "1G", "2G", "4G", "8G", "16G", "32G", "64G", "128G", "256G", "512G", /* Giga */ + "1T", "2T", "4T", "8T", "16T", "32T", "64T", "128T", "256T", "512T", /* Tera */ + "1P", "2P", "4P", "8P", "16P", "32P", "64P", "128P", "256P", "512P", /* Peta */ + "1E", "2E", "4E", "8E" /* Exa */ + }; + + if (kvstoreSize(server.db[dbnum].keys) == 0) + continue; + + for (int type = 0; type < OBJ_TYPE_BASIC_MAX; type++) { + uint64_t *kvstoreHist = kvstoreGetMetadata(server.db[dbnum].keys)->keysizes_hist[type]; + char buf[10000]; + int cnt = 0, buflen = 0; + + /* Print histogram to temp buf[]. First bin is garbage */ + buflen += snprintf(buf + buflen, sizeof(buf) - buflen, "db%d_%s:", dbnum, typestr[type]); + + for (int i = 0; i < MAX_KEYSIZES_BINS; i++) { + if (kvstoreHist[i] == 0) + continue; + + int res = snprintf(buf + buflen, sizeof(buf) - buflen, + (cnt == 0) ? "%s=%llu" : ",%s=%llu", + expSizeLabels[i], (unsigned long long) kvstoreHist[i]); + if (res < 0) break; + buflen += res; + cnt += kvstoreHist[i]; + } + + /* Print the temp buf[] to the info string */ + if (cnt) info = sdscatprintf(info, "%s\r\n", buf); + } + } + } + /* Get info from modules. * Returned when the user asked for "everything", "modules", or a specific module section. * We're not aware of the module section names here, and we rather avoid the search when we can. diff --git a/src/server.h b/src/server.h index b650f2699..4f5686192 100644 --- a/src/server.h +++ b/src/server.h @@ -41,10 +41,6 @@ #include #endif -#ifndef static_assert -#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 
1:-1] -#endif - typedef long long mstime_t; /* millisecond time type. */ typedef long long ustime_t; /* microsecond time type. */ @@ -698,6 +694,7 @@ typedef enum { #define OBJ_SET 2 /* Set object. */ #define OBJ_ZSET 3 /* Sorted set object. */ #define OBJ_HASH 4 /* Hash object. */ +#define OBJ_TYPE_BASIC_MAX 5 /* Max number of basic object types. */ /* The "module" object type is a special one that signals that the object * is one directly managed by a Redis module. In this case the value points @@ -969,7 +966,7 @@ typedef struct replBufBlock { * by integers from 0 (the default database) up to the max configured * database. The database number is the 'id' field in the structure. */ typedef struct redisDb { - kvstore *keys; /* The keyspace for this DB */ + kvstore *keys; /* The keyspace for this DB. As metadata, holds keysizes histogram */ kvstore *expires; /* Timeout of keys with a timeout set */ ebuckets hexpires; /* Hash expiration DS. Single TTL per hash (of next min field to expire) */ dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/ @@ -2799,6 +2796,7 @@ int isSdsRepresentableAsLongLong(sds s, long long *llval); int isObjectRepresentableAsLongLong(robj *o, long long *llongval); robj *tryObjectEncoding(robj *o); robj *tryObjectEncodingEx(robj *o, int try_trim); +size_t getObjectLength(robj *o); robj *getDecodedObject(robj *o); size_t stringObjectLen(robj *o); robj *createStringObjectFromLongLong(long long value); @@ -3363,6 +3361,7 @@ long long getModuleNumericConfig(ModuleConfig *module_config); int setModuleNumericConfig(ModuleConfig *config, long long val, const char **err); /* db.c -- Keyspace access API */ +void updateKeysizesHist(redisDb *db, int didx, uint32_t type, uint64_t oldLen, uint64_t newLen); int removeExpire(redisDb *db, robj *key); void deleteExpiredKeyAndPropagate(redisDb *db, robj *keyobj); void deleteEvictedKeyAndPropagate(redisDb *db, robj *keyobj, long long *key_mem_freed); diff --git a/src/t_hash.c b/src/t_hash.c index 1625513eb..f114fa9b9 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -422,8 +422,13 @@ void listpackExExpire(redisDb *db, robj *o, ExpireInfo *info) { expired++; } - if (expired) + if (expired) { lpt->lp = lpDeleteRange(lpt->lp, 0, expired * 3); + + /* update keysizes */ + unsigned long l = lpLength(lpt->lp) / 3; + updateKeysizesHist(db, getKeySlot(lpt->key), OBJ_HASH, l + expired, l); + } min = hashTypeGetMinExpire(o, 1 /*accurate*/); info->nextExpireTime = min; @@ -546,6 +551,11 @@ SetExRes hashTypeSetExpiryListpack(HashTypeSetEx *ex, sds field, if (unlikely(checkAlreadyExpired(expireAt))) { propagateHashFieldDeletion(ex->db, ex->key->ptr, field, sdslen(field)); hashTypeDelete(ex->hashObj, field, 1); + + /* get listpack length */ + listpackEx *lpt = ((listpackEx *) ex->hashObj->ptr); + unsigned long length = lpLength(lpt->lp) / 3; + updateKeysizesHist(ex->db, getKeySlot(ex->key->ptr), OBJ_HASH, length+1, length); server.stat_expired_subkeys++; ex->fieldDeleted++; return HSETEX_DELETED; @@ -1042,6 +1052,8 @@ SetExRes hashTypeSetExpiryHT(HashTypeSetEx *exInfo, sds field, uint64_t expireAt /* If expired, then delete the field and propagate the deletion. 
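 * (Note the ordering in the expired-field path below: the keysizes histogram
 * is updated while the old length is still observable. Illustrative example:
 * a hash shrinking from 5 to 4 fields stays in bin 2, i.e. [4,8), so that
 * particular update is a net no-op on the counters.)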
* If replica, continue like the field is valid */ if (unlikely(checkAlreadyExpired(expireAt))) { + unsigned long length = dictSize(ht); + updateKeysizesHist(exInfo->db, getKeySlot(exInfo->key->ptr), OBJ_HASH, length, length-1); /* replicas should not initiate deletion of fields */ propagateHashFieldDeletion(exInfo->db, exInfo->key->ptr, field, sdslen(field)); hashTypeDelete(exInfo->hashObj, field, 1); @@ -2132,6 +2144,7 @@ ebuckets *hashTypeGetDictMetaHFE(dict *d) { *----------------------------------------------------------------------------*/ void hsetnxCommand(client *c) { + unsigned long hlen; int isHashDeleted; robj *o; if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; @@ -2152,6 +2165,8 @@ void hsetnxCommand(client *c) { addReply(c, shared.cone); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"hset",c->argv[1],c->db->id); + hlen = hashTypeLength(o, 0); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, hlen - 1, hlen); server.dirty++; } @@ -2180,6 +2195,8 @@ void hsetCommand(client *c) { addReply(c, shared.ok); } signalModifiedKey(c,c->db,c->argv[1]); + unsigned long l = hashTypeLength(o, 0); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, l - created, l); notifyKeyspaceEvent(NOTIFY_HASH,"hset",c->argv[1],c->db->id); server.dirty += (c->argc - 2)/2; } @@ -2205,11 +2222,14 @@ void hincrbyCommand(client *c) { } /* Else hashTypeGetValue() already stored it into &value */ } else if ((res == GETF_NOT_FOUND) || (res == GETF_EXPIRED)) { value = 0; + unsigned long l = hashTypeLength(o, 0); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, l, l + 1); } else { /* Field expired and in turn hash deleted. Create new one! */ o = createHashObject(); dbAdd(c->db,c->argv[1],o); value = 0; + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, 0, 1); } oldvalue = value; @@ -2254,11 +2274,14 @@ void hincrbyfloatCommand(client *c) { } } else if ((res == GETF_NOT_FOUND) || (res == GETF_EXPIRED)) { value = 0; + unsigned long l = hashTypeLength(o, 0); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, l, l + 1); } else { /* Field expired and in turn hash deleted. Create new one! */ o = createHashObject(); dbAdd(c->db,c->argv[1],o); value = 0; + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, 0, 1); } value += incr; @@ -2356,6 +2379,8 @@ void hdelCommand(client *c) { if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || checkType(c,o,OBJ_HASH)) return; + unsigned long oldLen = hashTypeLength(o, 0); + /* Hash field expiration is optimized to avoid frequent update global HFE DS for * each field deletion. Eventually active-expiration will run and update or remove * the hash from global HFE DS gracefully. 
Nevertheless, statistic "subexpiry" @@ -2375,6 +2400,8 @@ void hdelCommand(client *c) { } } if (deleted) { + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH, oldLen, oldLen - deleted); + signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"hdel",c->argv[1],c->db->id); if (keyremoved) { @@ -2943,6 +2970,11 @@ static ExpireAction onFieldExpire(eItem item, void *ctx) { dict *d = expCtx->hashObj->ptr; dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); propagateHashFieldDeletion(expCtx->db, dictExpireMeta->key, hf, hfieldlen(hf)); + + /* update keysizes */ + unsigned long l = hashTypeLength(expCtx->hashObj, 0); + updateKeysizesHist(expCtx->db, getKeySlot(dictExpireMeta->key), OBJ_HASH, l, l - 1); + serverAssert(hashTypeDelete(expCtx->hashObj, hf, 0) == 1); server.stat_expired_subkeys++; return ACT_REMOVE_EXP_ITEM; diff --git a/src/t_list.c b/src/t_list.c index 98b180aa1..9263cbd12 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -7,6 +7,7 @@ */ #include "server.h" +#include "util.h" /*----------------------------------------------------------------------------- * List API @@ -462,6 +463,7 @@ void listTypeDelRange(robj *subject, long start, long count) { /* Implements LPUSH/RPUSH/LPUSHX/RPUSHX. * 'xx': push if key exists. */ void pushGenericCommand(client *c, int where, int xx) { + unsigned long llen; int j; robj *lobj = lookupKeyWrite(c->db, c->argv[1]); @@ -482,11 +484,13 @@ void pushGenericCommand(client *c, int where, int xx) { server.dirty++; } - addReplyLongLong(c, listTypeLength(lobj)); + llen = listTypeLength(lobj); + addReplyLongLong(c, llen); char *event = (where == LIST_HEAD) ? "lpush" : "rpush"; signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_LIST,event,c->argv[1],c->db->id); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_LIST, llen - (c->argc - 2), llen); } /* LPUSH [ ...] */ @@ -553,6 +557,8 @@ void linsertCommand(client *c) { notifyKeyspaceEvent(NOTIFY_LIST,"linsert", c->argv[1],c->db->id); server.dirty++; + unsigned long ll = listTypeLength(subject); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_LIST, ll-1, ll); } else { /* Notify client of a failed insert */ addReplyLongLong(c,-1); @@ -736,9 +742,11 @@ void addListRangeReply(client *c, robj *o, long start, long end, int reverse) { * if the key got deleted by this function. */ void listElementsRemoved(client *c, robj *key, int where, robj *o, long count, int signal, int *deleted) { char *event = (where == LIST_HEAD) ? 
"lpop" : "rpop"; - + unsigned long llen = listTypeLength(o); + notifyKeyspaceEvent(NOTIFY_LIST, event, key, c->db->id); - if (listTypeLength(o) == 0) { + updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_LIST, llen + count, llen); + if (llen == 0) { if (deleted) *deleted = 1; dbDelete(c->db, key); @@ -870,7 +878,7 @@ void lrangeCommand(client *c) { /* LTRIM */ void ltrimCommand(client *c) { robj *o; - long start, end, llen, ltrim, rtrim; + long start, end, llen, ltrim, rtrim, llenNew;; if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != C_OK) || (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != C_OK)) return; @@ -908,12 +916,13 @@ void ltrimCommand(client *c) { } notifyKeyspaceEvent(NOTIFY_LIST,"ltrim",c->argv[1],c->db->id); - if (listTypeLength(o) == 0) { + if ((llenNew = listTypeLength(o)) == 0) { dbDelete(c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC,"del",c->argv[1],c->db->id); } else { listTypeTryConversion(o,LIST_CONV_SHRINKING,NULL,NULL); } + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_LIST, llen, llenNew); signalModifiedKey(c,c->db,c->argv[1]); server.dirty += (ltrim + rtrim); addReply(c,shared.ok); @@ -1066,8 +1075,11 @@ void lremCommand(client *c) { listTypeReleaseIterator(li); if (removed) { + long ll = listTypeLength(subject); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_LIST, ll + removed, ll); notifyKeyspaceEvent(NOTIFY_LIST,"lrem",c->argv[1],c->db->id); - if (listTypeLength(subject) == 0) { + + if (ll == 0) { dbDelete(c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC,"del",c->argv[1],c->db->id); } else { @@ -1089,6 +1101,10 @@ void lmoveHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value, listTypeTryConversionAppend(dstobj,&value,0,0,NULL,NULL); listTypePush(dstobj,value,where); signalModifiedKey(c,c->db,dstkey); + + long ll = listTypeLength(dstobj); + updateKeysizesHist(c->db, getKeySlot(dstkey->ptr), OBJ_LIST, ll - 1, ll); + notifyKeyspaceEvent(NOTIFY_LIST, where == LIST_HEAD ? 
"lpush" : "rpush", dstkey, diff --git a/src/t_set.c b/src/t_set.c index 7b62f94b6..37055275c 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -603,6 +603,8 @@ void saddCommand(client *c) { if (setTypeAdd(set,c->argv[j]->ptr)) added++; } if (added) { + unsigned long size = setTypeSize(set); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_SET, size - added, size); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_SET,"sadd",c->argv[1],c->db->id); } @@ -617,6 +619,8 @@ void sremCommand(client *c) { if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || checkType(c,set,OBJ_SET)) return; + unsigned long oldSize = setTypeSize(set); + for (j = 2; j < c->argc; j++) { if (setTypeRemove(set,c->argv[j]->ptr)) { deleted++; @@ -628,6 +632,8 @@ void sremCommand(client *c) { } } if (deleted) { + + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_SET, oldSize, oldSize - deleted); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_SET,"srem",c->argv[1],c->db->id); if (keyremoved) @@ -669,8 +675,12 @@ void smoveCommand(client *c) { } notifyKeyspaceEvent(NOTIFY_SET,"srem",c->argv[1],c->db->id); + /* Update keysizes histogram */ + unsigned long srcLen = setTypeSize(srcset); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_SET, srcLen + 1, srcLen); + /* Remove the src set from the database when empty */ - if (setTypeSize(srcset) == 0) { + if (srcLen == 0) { dbDelete(c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC,"del",c->argv[1],c->db->id); } @@ -686,6 +696,8 @@ void smoveCommand(client *c) { /* An extra key has changed when ele was successfully added to dstset */ if (setTypeAdd(dstset,ele->ptr)) { + unsigned long dstLen = setTypeSize(dstset); + updateKeysizesHist(c->db, getKeySlot(c->argv[2]->ptr), OBJ_SET, dstLen - 1, dstLen); server.dirty++; signalModifiedKey(c,c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_SET,"sadd",c->argv[2],c->db->id); @@ -743,7 +755,7 @@ void scardCommand(client *c) { void spopWithCountCommand(client *c) { long l; - unsigned long count, size; + unsigned long count, size, toRemove; robj *set; /* Get the count argument */ @@ -763,10 +775,12 @@ void spopWithCountCommand(client *c) { } size = setTypeSize(set); + toRemove = (count >= size) ? size : count; /* Generate an SPOP keyspace notification */ notifyKeyspaceEvent(NOTIFY_SET,"spop",c->argv[1],c->db->id); - server.dirty += (count >= size) ? 
size : count; + server.dirty += toRemove; + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_SET, size, size - toRemove); /* CASE 1: * The number of requested elements is greater than or equal to @@ -949,6 +963,7 @@ void spopWithCountCommand(client *c) { } void spopCommand(client *c) { + unsigned long size; robj *set, *ele; if (c->argc == 3) { @@ -964,6 +979,9 @@ void spopCommand(client *c) { if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.null[c->resp])) == NULL || checkType(c,set,OBJ_SET)) return; + size = setTypeSize(set); + updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_SET, size, size-1); + /* Pop a random element from the set */ ele = setTypePopRandom(set); diff --git a/src/t_string.c b/src/t_string.c index d1d6dce39..c96f5e89e 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -420,6 +420,7 @@ void getsetCommand(client *c) { } void setrangeCommand(client *c) { + size_t oldLen = 0, newLen; robj *o; long offset; sds value = c->argv[3]->ptr; @@ -449,16 +450,14 @@ void setrangeCommand(client *c) { o = createObject(OBJ_STRING,sdsnewlen(NULL, offset+value_len)); dbAdd(c->db,c->argv[1],o); } else { - size_t olen; - /* Key exists, check type */ if (checkType(c,o,OBJ_STRING)) return; /* Return existing string length when setting nothing */ - olen = stringObjectLen(o); + oldLen = stringObjectLen(o); if (value_len == 0) { - addReplyLongLong(c,olen); + addReplyLongLong(c, oldLen); return; } @@ -478,7 +477,10 @@ void setrangeCommand(client *c) { "setrange",c->argv[1],c->db->id); server.dirty++; } - addReplyLongLong(c,sdslen(o->ptr)); + + newLen = sdslen(o->ptr); + updateKeysizesHist(c->db,getKeySlot(c->argv[1]->ptr),OBJ_STRING,oldLen,newLen); + addReplyLongLong(c,newLen); } void getrangeCommand(client *c) { @@ -669,7 +671,7 @@ void incrbyfloatCommand(client *c) { } void appendCommand(client *c) { - size_t totlen; + size_t totlen, append_len; robj *o, *append; dictEntry *de; @@ -679,7 +681,7 @@ void appendCommand(client *c) { c->argv[2] = tryObjectEncoding(c->argv[2]); dbAdd(c->db,c->argv[1],c->argv[2]); incrRefCount(c->argv[2]); - totlen = stringObjectLen(c->argv[2]); + append_len = totlen = stringObjectLen(c->argv[2]); } else { /* Key exists, check type */ if (checkType(c,o,OBJ_STRING)) @@ -687,7 +689,7 @@ void appendCommand(client *c) { /* "append" is an argument, so always an sds */ append = c->argv[2]; - const size_t append_len = sdslen(append->ptr); + append_len = sdslen(append->ptr); if (checkStringLength(c,stringObjectLen(o),append_len) != C_OK) return; @@ -699,6 +701,7 @@ void appendCommand(client *c) { signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_STRING,"append",c->argv[1],c->db->id); server.dirty++; + updateKeysizesHist(c->db,getKeySlot(c->argv[1]->ptr),OBJ_STRING, totlen - append_len, totlen); addReplyLongLong(c,totlen); } diff --git a/src/t_zset.c b/src/t_zset.c index 590f21f86..7b014f0d0 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1843,6 +1843,7 @@ void zaddGenericCommand(client *c, int flags) { zsetTypeMaybeConvert(zobj, elements); } + unsigned long llen = zsetLength(zobj); for (j = 0; j < elements; j++) { double newscore; score = scores[j]; @@ -1860,6 +1861,7 @@ void zaddGenericCommand(client *c, int flags) { score = newscore; } server.dirty += (added+updated); + updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_ZSET, llen, llen+added); reply_to_client: if (incr) { /* ZINCRBY or INCR option. 
*/
@@ -1907,8 +1909,13 @@ void zremCommand(client *c) {
     if (deleted) {
         notifyKeyspaceEvent(NOTIFY_ZSET,"zrem",key,c->db->id);
-        if (keyremoved)
-            notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
+        if (keyremoved) {
+            notifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, c->db->id);
+            /* No need to call updateKeysizesHist(); dbDelete() already did it. */
+        } else {
+            unsigned long len = zsetLength(zobj);
+            updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_ZSET, len + deleted, len);
+        }
         signalModifiedKey(c,c->db,key);
         server.dirty += deleted;
     }
@@ -2023,8 +2030,13 @@ void zremrangeGenericCommand(client *c, zrange_type rangetype) {
     if (deleted) {
         signalModifiedKey(c,c->db,key);
         notifyKeyspaceEvent(NOTIFY_ZSET,notify_type,key,c->db->id);
-        if (keyremoved)
-            notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
+        if (keyremoved) {
+            notifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, c->db->id);
+            /* No need to call updateKeysizesHist(); dbDelete() already did it. */
+        } else {
+            unsigned long len = zsetLength(zobj);
+            updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_ZSET, len + deleted, len);
+        }
     }
     server.dirty += deleted;
     addReplyLongLong(c,deleted);
@@ -4031,6 +4043,9 @@ void genericZpopCommand(client *c, robj **keyv, int keyc, int where, int emitkey
             dbDelete(c->db,key);
             notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
+            /* No need to call updateKeysizesHist(); dbDelete() already did it. */
+        } else {
+            updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_ZSET, llen, llen - result_count);
         }
 
         signalModifiedKey(c,c->db,key);
diff --git a/src/util.c b/src/util.c
index c0e69af4d..ec0a3fb0e 100644
--- a/src/util.c
+++ b/src/util.c
@@ -54,6 +54,13 @@
 
 #define UNUSED(x) ((void)(x))
 
+/* Selectively define static_assert, to avoid including server.h in this file. */
+#ifndef static_assert
+#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1]
+#endif
+
+static_assert(UINTPTR_MAX == 0xffffffffffffffff || UINTPTR_MAX == 0xffffffff, "Unsupported pointer size");
+
 /* Glob-style pattern matching. */
 static int stringmatchlen_impl(const char *pattern, int patternLen,
         const char *string, int stringLen, int nocase, int *skipLongerMatches, int nesting)
diff --git a/src/util.h b/src/util.h
index 518342f02..07cfb61dc 100644
--- a/src/util.h
+++ b/src/util.h
@@ -79,6 +79,19 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...);
 size_t redis_strlcpy(char *dst, const char *src, size_t dsize);
 size_t redis_strlcat(char *dst, const char *src, size_t dsize);
 
+/* Returns floor(log2(x)). Kept branch-free for speed. Works only for 0 < x < 2^63. */
+static inline int log2ceil(size_t x) {
+#if UINTPTR_MAX == 0xffffffffffffffff
+    return 63 - __builtin_clzll(x);
+#else
+    return 31 - __builtin_clz(x);
+#endif
+}
+
+#ifndef static_assert
+#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1]
+#endif
+
 #ifdef REDIS_TEST
 int utilTest(int argc, char **argv, int flags);
 #endif
diff --git a/tests/unit/info-keysizes.tcl b/tests/unit/info-keysizes.tcl
new file mode 100644
index 000000000..98d6d4e6f
--- /dev/null
+++ b/tests/unit/info-keysizes.tcl
@@ -0,0 +1,454 @@
+################################################################################
+# Test the "info keysizes" command.
+# The command returns a histogram of the sizes of keys in the database.
+################################################################################
+
+# Query the result of "info keysizes" and strip the header, spaces, and newlines.
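+# For example (illustrative), a raw reply of
+#   "# Keysizes\r\ndb0_distrib_strings_sizes:16=1\r\n"
+# is reduced to "db0_distrib_strings_sizes:16=1" before comparison.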
+proc get_stripped_info {server} {
+    set infoStripped [string map {
+        "# Keysizes" ""
+        " " "" "\n" "" "\r" ""
+    } [$server info keysizes] ]
+    return $infoStripped
+}
+
+# Verify the output of the "info keysizes" command is as expected.
+#
+# Arguments:
+# cmd       - A command that should be run before the verification.
+# expOutput - The expected output, abbreviated: write "STR" instead of
+#             "distrib_strings_sizes", and similarly for LIST, SET, ZSET and
+#             HASH. Spaces and newlines are ignored.
+# waitCond  - If set to 1, wait (up to 50 retries, 50 msec apart) for the
+#             actual output to match expOutput.
+#
+# (replicaMode) - Global variable that indicates if the test is running in
+#             replica mode. If so, run the command on the leader and verify
+#             its output, then wait for the replica to catch up and verify
+#             the output on the replica as well. Otherwise, just run the
+#             command on the leader and verify the output.
+proc run_cmd_verify_hist {cmd expOutput {waitCond 0} } {
+    uplevel 1 $cmd
+    global replicaMode
+
+    # Reference the leader via the `server` variable
+    if {$replicaMode eq 1} { set server [srv -1 client] } else { set server [srv 0 client] }
+
+    # Replace all placeholders with the actual values. Remove spaces & newlines.
+    set expStripped [string map {
+        "STR"  "distrib_strings_sizes"
+        "LIST" "distrib_lists_items"
+        "SET"  "distrib_sets_items"
+        "ZSET" "distrib_zsets_items"
+        "HASH" "distrib_hashes_items"
+        " " "" "\n" "" "\r" ""
+    } $expOutput]
+
+    if {$waitCond} {
+        wait_for_condition 50 50 {
+            $expStripped eq [get_stripped_info $server]
+        } else {
+            fail "Unexpected KEYSIZES. Expected: `$expStripped` \
+                  but got: `[get_stripped_info $server]`. Failed after command: $cmd"
+        }
+    } else {
+        set infoStripped [get_stripped_info $server]
+        if {$expStripped ne $infoStripped} {
+            fail "Unexpected KEYSIZES. Expected: `$expStripped` \
+                  but got: `$infoStripped`. Failed after command: $cmd"
+        }
+    }
+
+    # If we are testing `replicaMode`, we need to wait for the replica to catch up
+    if {$replicaMode eq 1} {
+        wait_for_condition 50 50 {
+            $expStripped eq [get_stripped_info $server]
+        } else {
+            fail "Unexpected replica KEYSIZES. Expected: `$expStripped` \
+                  but got: `[get_stripped_info $server]`. Failed after command: $cmd"
+        }
+    }
+}
+
+proc test_all_keysizes { {replMode 0} } {
+    # If in replica mode then update the global var `replicaMode` so that
+    # `run_cmd_verify_hist` knows to run the command on the leader and then
+    # wait for the replica to catch up.
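+    #
+    # For example (illustrative):
+    #   run_cmd_verify_hist {$server RPUSH mylist a b c} {db0_LIST:2=1}
+    # runs the RPUSH and expects a single list whose 3 items fall in the
+    # [2,4) bin, i.e. db0_distrib_lists_items:2=1.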
+ global replicaMode + set replicaMode $replMode + # ref the leader with `server` variable + if {$replicaMode eq 1} { + set server [srv -1 client] + set suffixRepl "(replica)" + } else { + set server [srv 0 client] + set suffixRepl "" + } + + test "KEYSIZES - Test i'th bin counts keysizes between (2^i) and (2^(i+1)-1) as expected $suffixRepl" { + set base_string "" + run_cmd_verify_hist {$server FLUSHALL} {} + for {set i 1} {$i <= 10} {incr i} { + append base_string "x" + set log_value [expr {1 << int(log($i) / log(2))}] + #puts "Iteration $i: $base_string (Log base 2 pattern: $log_value)" + run_cmd_verify_hist {$server set mykey $base_string} "db0_STR:$log_value=1" + } + } + + test "KEYSIZES - Histogram of values of Bytes, Kilo and Mega $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server set x 0123456789ABCDEF} {db0_STR:16=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:32=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:64=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:128=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:256=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:512=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:1K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:2K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:4K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:8K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:16K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:32K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:64K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:128K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:256K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:512K=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:1M=1} + run_cmd_verify_hist {$server APPEND x [$server get x]} {db0_STR:2M=1} + } + + test "KEYSIZES - Test List $suffixRepl" { + # FLUSHALL + run_cmd_verify_hist {$server FLUSHALL} {} + # RPUSH + run_cmd_verify_hist {$server RPUSH l1 1 2 3 4 5} {db0_LIST:4=1} + run_cmd_verify_hist {$server RPUSH l1 6 7 8 9} {db0_LIST:8=1} + # Test also LPUSH, RPUSH, LPUSHX, RPUSHX + run_cmd_verify_hist {$server LPUSH l2 1} {db0_LIST:1=1,8=1} + run_cmd_verify_hist {$server LPUSH l2 2} {db0_LIST:2=1,8=1} + run_cmd_verify_hist {$server LPUSHX l2 3} {db0_LIST:2=1,8=1} + run_cmd_verify_hist {$server RPUSHX l2 4} {db0_LIST:4=1,8=1} + # RPOP + run_cmd_verify_hist {$server RPOP l1} {db0_LIST:4=1,8=1} + run_cmd_verify_hist {$server RPOP l1} {db0_LIST:4=2} + # DEL + run_cmd_verify_hist {$server DEL l1} {db0_LIST:4=1} + # LINSERT, LTRIM + run_cmd_verify_hist {$server RPUSH l3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14} {db0_LIST:4=1,8=1} + run_cmd_verify_hist {$server LINSERT l3 AFTER 9 10} {db0_LIST:4=1,16=1} + run_cmd_verify_hist {$server LTRIM l3 0 8} {db0_LIST:4=1,8=1} + # DEL + run_cmd_verify_hist {$server DEL l3} {db0_LIST:4=1} + run_cmd_verify_hist {$server DEL l2} {} + # LMOVE, BLMOVE + run_cmd_verify_hist {$server RPUSH l4 1 2 3 4 5 6 7 8} {db0_LIST:8=1} + run_cmd_verify_hist {$server LMOVE l4 l5 LEFT LEFT} {db0_LIST:1=1,4=1} + run_cmd_verify_hist {$server LMOVE l4 l5 RIGHT RIGHT} {db0_LIST:2=1,4=1} + run_cmd_verify_hist {$server LMOVE l4 l5 LEFT RIGHT} {db0_LIST:2=1,4=1} + run_cmd_verify_hist {$server LMOVE l4 l5 RIGHT 
LEFT} {db0_LIST:4=2} + run_cmd_verify_hist {$server BLMOVE l4 l5 RIGHT LEFT 0} {db0_LIST:2=1,4=1} + # DEL + run_cmd_verify_hist {$server DEL l4} {db0_LIST:4=1} + run_cmd_verify_hist {$server DEL l5} {} + # LMPOP + run_cmd_verify_hist {$server RPUSH l6 1 2 3 4 5 6 7 8 9 10} {db0_LIST:8=1} + run_cmd_verify_hist {$server LMPOP 1 l6 LEFT COUNT 2} {db0_LIST:8=1} + run_cmd_verify_hist {$server LMPOP 1 l6 LEFT COUNT 1} {db0_LIST:4=1} + run_cmd_verify_hist {$server LMPOP 1 l6 LEFT COUNT 6} {db0_LIST:1=1} + # LPOP + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l7 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server LPOP l7} {db0_LIST:2=1} + # LREM + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l8 1 x 3 x 5 x 7 x 9 10} {db0_LIST:8=1} + run_cmd_verify_hist {$server LREM l8 3 x} {db0_LIST:4=1} + # EXPIRE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l9 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server PEXPIRE l9 50} {db0_LIST:4=1} + run_cmd_verify_hist {} {} 1 + # SET overwrites + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l9 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server SET l9 1234567} {db0_STR:4=1} + run_cmd_verify_hist {$server DEL l9} {} + } {} {cluster:skip} + + test "KEYSIZES - Test SET $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + # SADD + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5} {db0_SET:4=1} + run_cmd_verify_hist {$server SADD s1 6 7 8} {db0_SET:8=1} + # Test also SADD, SREM, SMOVE, SPOP + run_cmd_verify_hist {$server SADD s2 1} {db0_SET:1=1,8=1} + run_cmd_verify_hist {$server SADD s2 2} {db0_SET:2=1,8=1} + run_cmd_verify_hist {$server SREM s2 3} {db0_SET:2=1,8=1} + run_cmd_verify_hist {$server SMOVE s2 s3 2} {db0_SET:1=2,8=1} + run_cmd_verify_hist {$server SPOP s3} {db0_SET:1=1,8=1} + run_cmd_verify_hist {$server SPOP s2} {db0_SET:8=1} + run_cmd_verify_hist {$server SPOP s1} {db0_SET:4=1} + run_cmd_verify_hist {$server del s1} {} + + # SDIFFSTORE + run_cmd_verify_hist {$server flushall} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server SADD s2 6 7 8 9 A B C D} {db0_SET:8=2} + run_cmd_verify_hist {$server SDIFFSTORE s3 s1 s2} {db0_SET:4=1,8=2} + #SINTERSTORE + run_cmd_verify_hist {$server flushall} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server SADD s2 6 7 8 9 A B C D} {db0_SET:8=2} + run_cmd_verify_hist {$server SINTERSTORE s3 s1 s2} {db0_SET:2=1,8=2} + #SUNIONSTORE + run_cmd_verify_hist {$server flushall} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server SADD s2 6 7 8 9 A B C D} {db0_SET:8=2} + run_cmd_verify_hist {$server SUNIONSTORE s3 s1 s2} {db0_SET:8=3} + run_cmd_verify_hist {$server SADD s4 E F G H} {db0_SET:4=1,8=3} + run_cmd_verify_hist {$server SUNIONSTORE s5 s3 s4} {db0_SET:4=1,8=3,16=1} + # DEL + run_cmd_verify_hist {$server flushall} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server DEL s1} {} + # EXPIRE + run_cmd_verify_hist {$server flushall} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server PEXPIRE s1 50} {db0_SET:8=1} + run_cmd_verify_hist {} {} 1 + # SET overwrites + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5 6 7 8} {db0_SET:8=1} + run_cmd_verify_hist {$server SET s1 1234567} 
{db0_STR:4=1} + run_cmd_verify_hist {$server DEL s1} {} + } {} {cluster:skip} + + test "KEYSIZES - Test ZSET $suffixRepl" { + # ZADD, ZREM + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZADD z1 6 f 7 g 8 h 9 i} {db0_ZSET:8=1} + run_cmd_verify_hist {$server ZADD z2 1 a} {db0_ZSET:1=1,8=1} + run_cmd_verify_hist {$server ZREM z1 a} {db0_ZSET:1=1,8=1} + run_cmd_verify_hist {$server ZREM z1 b} {db0_ZSET:1=1,4=1} + # ZREMRANGEBYSCORE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZREMRANGEBYSCORE z1 -inf (2} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZREMRANGEBYSCORE z1 -inf (3} {db0_ZSET:2=1} + # ZREMRANGEBYRANK + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e 6 f} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZREMRANGEBYRANK z1 0 1} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZREMRANGEBYRANK z1 0 0} {db0_ZSET:2=1} + # ZREMRANGEBYLEX + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 0 a 0 b 0 c 0 d 0 e 0 f} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZREMRANGEBYLEX z1 - (d} {db0_ZSET:2=1} + # ZUNIONSTORE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZADD z2 6 f 7 g 8 h 9 i} {db0_ZSET:4=2} + run_cmd_verify_hist {$server ZUNIONSTORE z3 2 z1 z2} {db0_ZSET:4=2,8=1} + # ZINTERSTORE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZADD z2 3 c 4 d 5 e 6 f} {db0_ZSET:4=2} + run_cmd_verify_hist {$server ZINTERSTORE z3 2 z1 z2} {db0_ZSET:2=1,4=2} + # BZPOPMIN, BZPOPMAX + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server BZPOPMIN z1 0} {db0_ZSET:4=1} + run_cmd_verify_hist {$server BZPOPMAX z1 0} {db0_ZSET:2=1} + # ZDIFFSTORE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZADD z2 3 c 4 d 5 e 6 f} {db0_ZSET:4=2} + run_cmd_verify_hist {$server ZDIFFSTORE z3 2 z1 z2} {db0_ZSET:2=1,4=2} + # ZINTERSTORE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server ZADD z2 3 c 4 d 5 e 6 f} {db0_ZSET:4=2} + run_cmd_verify_hist {$server ZINTERSTORE z3 2 z1 z2} {db0_ZSET:2=1,4=2} + # DEL + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server DEL z1} {} + # EXPIRE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server PEXPIRE z1 50} {db0_ZSET:4=1} + run_cmd_verify_hist {} {} 1 + # SET overwrites + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c 4 d 5 e} {db0_ZSET:4=1} + run_cmd_verify_hist {$server SET z1 1234567} {db0_STR:4=1} + run_cmd_verify_hist {$server DEL z1} {} + } {} {cluster:skip} + + test "KEYSIZES - Test STRING $suffixRepl" { + # SETRANGE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SET s2 1234567890} {db0_STR:8=1} + run_cmd_verify_hist {$server SETRANGE s2 10 
123456} {db0_STR:16=1} + # MSET, MSETNX + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server MSET s3 1 s4 2 s5 3} {db0_STR:1=3} + run_cmd_verify_hist {$server MSETNX s6 1 s7 2 s8 3} {db0_STR:1=6} + # DEL + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SET s9 1234567890} {db0_STR:8=1} + run_cmd_verify_hist {$server DEL s9} {} + #EXPIRE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SET s10 1234567890} {db0_STR:8=1} + run_cmd_verify_hist {$server PEXPIRE s10 50} {db0_STR:8=1} + run_cmd_verify_hist {} {} 1 + # SET (+overwrite) + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SET s1 1024} {db0_STR:4=1} + run_cmd_verify_hist {$server SET s1 842} {db0_STR:2=1} + run_cmd_verify_hist {$server SET s1 2} {db0_STR:1=1} + run_cmd_verify_hist {$server SET s1 1234567} {db0_STR:4=1} + } {} {cluster:skip} + + foreach type {listpackex hashtable} { + # Test different implementations of hash tables and listpacks + if {$type eq "hashtable"} { + $server config set hash-max-listpack-entries 0 + } else { + $server config set hash-max-listpack-entries 512 + } + + test "KEYSIZES - Test HASH ($type) $suffixRepl" { + # HSETNX + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server HSETNX h1 1 1} {db0_HASH:1=1} + run_cmd_verify_hist {$server HSETNX h1 2 2} {db0_HASH:2=1} + # HSET, HDEL + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server HSET h2 1 1} {db0_HASH:1=1} + run_cmd_verify_hist {$server HSET h2 2 2} {db0_HASH:2=1} + run_cmd_verify_hist {$server HDEL h2 1} {db0_HASH:1=1} + run_cmd_verify_hist {$server HDEL h2 2} {} + # HMSET + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server HMSET h1 1 1 2 2 3 3} {db0_HASH:2=1} + run_cmd_verify_hist {$server HMSET h1 1 1 2 2 3 3} {db0_HASH:2=1} + run_cmd_verify_hist {$server HMSET h1 1 1 2 2 3 3 4 4} {db0_HASH:4=1} + + # HINCRBY + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server hincrby h1 f1 10} {db0_HASH:1=1} + run_cmd_verify_hist {$server hincrby h1 f1 10} {db0_HASH:1=1} + run_cmd_verify_hist {$server hincrby h1 f2 20} {db0_HASH:2=1} + # HINCRBYFLOAT + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server hincrbyfloat h1 f1 10.5} {db0_HASH:1=1} + run_cmd_verify_hist {$server hincrbyfloat h1 f1 10.5} {db0_HASH:1=1} + run_cmd_verify_hist {$server hincrbyfloat h1 f2 10.5} {db0_HASH:2=1} + # HEXPIRE + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server HSET h1 f1 1} {db0_HASH:1=1} + run_cmd_verify_hist {$server HSET h1 f2 1} {db0_HASH:2=1} + run_cmd_verify_hist {$server HPEXPIREAT h1 1 FIELDS 1 f1} {db0_HASH:1=1} + run_cmd_verify_hist {$server HSET h1 f3 1} {db0_HASH:2=1} + run_cmd_verify_hist {$server HPEXPIRE h1 50 FIELDS 1 f2} {db0_HASH:2=1} + run_cmd_verify_hist {} {db0_HASH:1=1} 1 + run_cmd_verify_hist {$server HPEXPIRE h1 50 FIELDS 1 f3} {db0_HASH:1=1} + run_cmd_verify_hist {} {} 1 + } + } + + test "KEYSIZES - Test STRING BITS $suffixRepl" { + # BITOPS + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server SET b1 "x123456789"} {db0_STR:8=1} + run_cmd_verify_hist {$server SET b2 "x12345678"} {db0_STR:8=2} + run_cmd_verify_hist {$server BITOP AND b3 b1 b2} {db0_STR:8=3} + run_cmd_verify_hist {$server BITOP OR b4 b1 b2} {db0_STR:8=4} + run_cmd_verify_hist {$server BITOP XOR b5 b1 b2} {db0_STR:8=5} + # SETBIT + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server setbit b1 71 1} {db0_STR:8=1} + 
run_cmd_verify_hist {$server setbit b1 72 1} {db0_STR:8=1} + run_cmd_verify_hist {$server setbit b2 72 1} {db0_STR:8=2} + run_cmd_verify_hist {$server setbit b2 640 0} {db0_STR:8=1,64=1} + # BITFIELD + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server bitfield b3 set u8 6 255} {db0_STR:2=1} + run_cmd_verify_hist {$server bitfield b3 set u8 65 255} {db0_STR:8=1} + run_cmd_verify_hist {$server bitfield b4 set u8 1000 255} {db0_STR:8=1,64=1} + } {} {cluster:skip} + + test "KEYSIZES - Test RESTORE $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l10 1 2 3 4} {db0_LIST:4=1} + set encoded [$server dump l10] + run_cmd_verify_hist {$server del l10} {} + run_cmd_verify_hist {$server restore l11 0 $encoded} {db0_LIST:4=1} + } + + test "KEYSIZES - Test RENAME $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l12 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server RENAME l12 l13} {db0_LIST:4=1} + } {} {cluster:skip} + + test "KEYSIZES - Test MOVE $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l1 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server RPUSH l2 1} {db0_LIST:1=1,4=1} + run_cmd_verify_hist {$server MOVE l1 1} {db0_LIST:1=1 db1_LIST:4=1} + } {} {cluster:skip} + + test "KEYSIZES - Test SWAPDB $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + run_cmd_verify_hist {$server RPUSH l1 1 2 3 4} {db0_LIST:4=1} + $server select 1 + run_cmd_verify_hist {$server ZADD z1 1 A} {db0_LIST:4=1 db1_ZSET:1=1} + run_cmd_verify_hist {$server SWAPDB 0 1} {db0_ZSET:1=1 db1_LIST:4=1} + $server select 0 + } {OK} {singledb:skip} + + test "KEYSIZES - Test RDB $suffixRepl" { + run_cmd_verify_hist {$server FLUSHALL} {} + # Write list, set and zset to db0 + run_cmd_verify_hist {$server RPUSH l1 1 2 3 4} {db0_LIST:4=1} + run_cmd_verify_hist {$server SADD s1 1 2 3 4 5} {db0_LIST:4=1 db0_SET:4=1} + run_cmd_verify_hist {$server ZADD z1 1 a 2 b 3 c} {db0_LIST:4=1 db0_SET:4=1 db0_ZSET:2=1} + run_cmd_verify_hist {$server SAVE} {db0_LIST:4=1 db0_SET:4=1 db0_ZSET:2=1} + if {$replicaMode eq 1} { + run_cmd_verify_hist {restart_server -1 true false} {db0_LIST:4=1 db0_SET:4=1 db0_ZSET:2=1} + } else { + run_cmd_verify_hist {restart_server 0 true false} {db0_LIST:4=1 db0_SET:4=1 db0_ZSET:2=1} + } + } {} {external:skip} +} + +start_server {} { + # Test KEYSIZES on a single server + r select 0 + test_all_keysizes 0 + + # Start another server to test replication of KEYSIZES + start_server {tags {needs:repl external:skip}} { + # Set the outer layer server as primary + set primary [srv -1 client] + set primary_host [srv -1 host] + set primary_port [srv -1 port] + # Set this inner layer server as replica + set replica [srv 0 client] + + # Server should have role replica + $replica replicaof $primary_host $primary_port + wait_for_condition 50 100 { [s 0 role] eq {slave} } else { fail "Replication not started." 
} + + # Test KEYSIZES on leader and replica + $primary select 0 + test_all_keysizes 1 + } +} diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index f04af799a..130289aff 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -306,7 +306,7 @@ run_solo {defrag} { r set "{bigstream}smallitem" val - set expected_frag 1.7 + set expected_frag 1.5 if {$::accurate} { # scale the hash to 1m fields in order to have a measurable latency for {set j 10000} {$j < 1000000} {incr j} { @@ -601,7 +601,7 @@ run_solo {defrag} { # create big keys with 10k items set rd [redis_deferring_client] - set expected_frag 1.7 + set expected_frag 1.5 # add a mass of list nodes to two lists (allocations are interlaced) set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation set elements 500000 diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 52d7c3e98..7ce70cda1 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -508,18 +508,25 @@ start_server {tags {"other external:skip"}} { test "Redis can resize empty dict" { # Write and then delete 128 keys, creating an empty dict r flushall + + # Add one key to the db just to create the dict and get its initial size + r set x 1 + set initial_size [dict get [r memory stats] db.9 overhead.hashtable.main] + + # Now add 128 keys and then delete them for {set j 1} {$j <= 128} {incr j} { r set $j{b} a } + for {set j 1} {$j <= 128} {incr j} { r del $j{b} } - # The dict containing 128 keys must have expanded, - # its hash table itself takes a lot more than 400 bytes + + # dict must have expanded. Verify it eventually shrinks back to its initial size. wait_for_condition 100 50 { - [dict get [r memory stats] db.9 overhead.hashtable.main] < 400 + [dict get [r memory stats] db.9 overhead.hashtable.main] == $initial_size } else { - fail "dict did not resize in time" - } + fail "dict did not resize in time to its initial size" + } } } From 4b29be3f36b05c4e2479a80712f2c73f216ae031 Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Wed, 30 Oct 2024 08:45:25 +0800 Subject: [PATCH 03/42] Avoid redundant lpGet to boost quicklistCompare (#11533) `lpCompare()` in `quicklistCompare()` will call `lpGet()` again, which would be a waste. The change will result in a performance boost for all commands that use `quicklistCompare()`, including `linsert`, `lpos` and `lrem`. --- src/quicklist.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/quicklist.c b/src/quicklist.c index 7fe3430fc..6577525a7 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -1244,10 +1244,17 @@ int quicklistDelRange(quicklist *quicklist, const long start, /* Compare between two entries. */ int quicklistCompare(quicklistEntry* entry, unsigned char *p2, const size_t p2_len) { - if (unlikely(QL_NODE_IS_PLAIN(entry->node))) { + if (entry->value) { return ((entry->sz == p2_len) && (memcmp(entry->value, p2, p2_len) == 0)); + } else { + /* We use string2ll() to get an integer representation of the + * string 'p2' and compare it to 'entry->longval'; it's much + * faster than converting the integer to a string and comparing. */ + long long sval; + if (string2ll((const char*)p2, p2_len, &sval)) + return entry->longval == sval; } - return lpCompare(entry->zi, p2, p2_len); + return 0; } /* Returns a quicklist iterator 'iter'.
After the initialization every From 6437d07b0371047e62c26e88e66bd4df0e2b94ba Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Wed, 30 Oct 2024 10:03:31 +0200 Subject: [PATCH 04/42] Fix memory leak on rdbload error (#13626) On RDB load error, if an invalid `expireAt` value is read, `dupSearchDict` is not released. --- src/rdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rdb.c b/src/rdb.c index 0bd4ee1ba..764b87a48 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2332,6 +2332,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) rdbReportCorruptRDB("invalid expireAt time: %llu", (unsigned long long) expireAt); decrRefCount(o); + if (dupSearchDict != NULL) dictRelease(dupSearchDict); return NULL; } From ded8d993b7eb8e7fbf165da4fcba7f22a44ea95d Mon Sep 17 00:00:00 2001 From: guybe7 Date: Wed, 30 Oct 2024 17:32:51 +0800 Subject: [PATCH 05/42] Modules: defrag CB should take robj, not sds (#13627) Added a log of the key name in the test modules to reproduce the problem (tests crash without the fix) --- src/defrag.c | 9 ++++++--- tests/modules/defragtest.c | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index a819eb8ac..d3f4ceee6 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -729,8 +729,9 @@ void defragStream(redisDb *db, dictEntry *kde) { void defragModule(redisDb *db, dictEntry *kde) { robj *obj = dictGetVal(kde); serverAssert(obj->type == OBJ_MODULE); - - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) + robj keyobj; + initStaticStringObject(keyobj, dictGetKey(kde)); + if (!moduleDefragValue(&keyobj, obj, db->id)) defragLater(db, kde); } @@ -940,7 +941,9 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); + robj keyobj; + initStaticStringObject(keyobj, dictGetKey(de)); + return moduleLateDefrag(&keyobj, ob, cursor, endtime, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c index a27b57e13..597b5aa79 100644 --- a/tests/modules/defragtest.c +++ b/tests/modules/defragtest.c @@ -161,13 +161,14 @@ size_t FragFreeEffort(RedisModuleString *key, const void *value) { } int FragDefrag(RedisModuleDefragCtx *ctx, RedisModuleString *key, void **value) { - REDISMODULE_NOT_USED(key); unsigned long i = 0; int steps = 0; int dbid = RedisModule_GetDbIdFromDefragCtx(ctx); RedisModule_Assert(dbid != -1); + RedisModule_Log(NULL, "notice", "Defrag key: %s", RedisModule_StringPtrLen(key, NULL)); + /* Attempt to get cursor, validate it's what we're expecting */ if (RedisModule_DefragCursorGet(ctx, &i) == REDISMODULE_OK) { if (i > 0) datatype_resumes++; From 9906daf5c9fdb836a5b3f04829c75701a4e90eb4 Mon Sep 17 00:00:00 2001 From: David Dougherty Date: Mon, 4 Nov 2024 08:18:22 -0800 Subject: [PATCH 06/42] Update old links for modules-api-ref.md (#13479) This PR replaces old .../topics/... links with current links, specifically for the modules-api-ref.md file and the new automation that Paolo Lazzari is working on. A few of the topics links have redirects, but some don't. Best to use updated links.
--- README.md | 5 +++-- deps/hiredis/async.c | 2 +- redis.conf | 8 ++++---- sentinel.conf | 6 +++--- src/latency.c | 2 +- src/module.c | 16 ++++++++-------- src/notify.c | 2 +- utils/whatisdoing.sh | 2 +- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index bb866fbb1..4d6d7adb1 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,8 @@ Another good example is to think of Redis as a more complex version of memcached If you want to know more, this is a list of selected starting points: -* Introduction to Redis data types. https://redis.io/topics/data-types-intro +* Introduction to Redis data types. https://redis.io/docs/latest/develop/data-types/ + * The full list of Redis commands. https://redis.io/commands * There is much more inside the official Redis documentation. https://redis.io/documentation @@ -493,7 +494,7 @@ Other C files * `dict.c` is an implementation of a non-blocking hash table which rehashes incrementally. * `cluster.c` implements the Redis Cluster. Probably a good read only after being very familiar with the rest of the Redis code base. If you want to read `cluster.c` make sure to read the [Redis Cluster specification][4]. -[4]: https://redis.io/topics/cluster-spec +[4]: https://redis.io/docs/latest/operate/oss_and_stack/reference/cluster-spec/ Anatomy of a Redis command --- diff --git a/deps/hiredis/async.c b/deps/hiredis/async.c index 3d39cfaf8..ac56353dd 100644 --- a/deps/hiredis/async.c +++ b/deps/hiredis/async.c @@ -478,7 +478,7 @@ static int __redisGetSubscribeCallback(redisAsyncContext *ac, redisReply *reply, /* Match reply with the expected format of a pushed message. * The type and number of elements (3 to 4) are specified at: - * https://redis.io/topics/pubsub#format-of-pushed-messages */ + * https://redis.io/docs/latest/develop/interact/pubsub/#format-of-pushed-messages */ if ((reply->type == REDIS_REPLY_ARRAY && !(c->flags & REDIS_SUPPORTS_PUSH) && reply->elements >= 3) || reply->type == REDIS_REPLY_PUSH) { assert(reply->element[0]->type == REDIS_REPLY_STRING); diff --git a/redis.conf b/redis.conf index a95e2ed37..aadb4acb0 100644 --- a/redis.conf +++ b/redis.conf @@ -838,7 +838,7 @@ replica-priority 100 # this is used in order to send invalidation messages to clients. Please # check this page to understand more about the feature: # -# https://redis.io/topics/client-side-caching +# https://redis.io/docs/latest/develop/use/client-side-caching/ # # When tracking is enabled for a client, all the read only queries are assumed # to be cached: this will force Redis to store information in the invalidation @@ -1016,7 +1016,7 @@ replica-priority 100 # * stream - Data type: streams related. # # For more information about ACL configuration please refer to -# the Redis web site at https://redis.io/topics/acl +# the Redis web site at https://redis.io/docs/latest/operate/oss_and_stack/management/security/acl/ # ACL LOG # @@ -1393,7 +1393,7 @@ disable-thp yes # restarting the server can lead to data loss. A conversion needs to be done # by setting it via CONFIG command on a live server first. # -# Please check https://redis.io/topics/persistence for more information. +# Please check https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/ for more information. appendonly no @@ -1880,7 +1880,7 @@ latency-monitor-threshold 0 ############################# EVENT NOTIFICATION ############################## # Redis can notify Pub/Sub clients about events happening in the key space. 
-# This feature is documented at https://redis.io/topics/notifications +# This feature is documented at https://redis.io/docs/latest/develop/use/keyspace-notifications/ # # For instance if keyspace events notification is enabled, and a client # performs a DEL operation on key "foo" stored in the Database 0, two diff --git a/sentinel.conf b/sentinel.conf index b7b3604f0..c7ce1cba7 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -133,7 +133,7 @@ sentinel monitor mymaster 127.0.0.1 6379 2 sentinel down-after-milliseconds mymaster 30000 # IMPORTANT NOTE: starting with Redis 6.2 ACL capability is supported for -# Sentinel mode, please refer to the Redis website https://redis.io/topics/acl +# Sentinel mode, please refer to the Redis website https://redis.io/docs/latest/operate/oss_and_stack/management/security/acl/ # for more details. # Sentinel's ACL users are defined in the following format: @@ -145,7 +145,7 @@ sentinel down-after-milliseconds mymaster 30000 # user worker +@admin +@connection ~* on >ffa9203c493aa99 # # For more information about ACL configuration please refer to the Redis -# website at https://redis.io/topics/acl and redis server configuration +# website at https://redis.io/docs/latest/operate/oss_and_stack/management/security/acl/ and redis server configuration # template redis.conf. # ACL LOG @@ -174,7 +174,7 @@ acllog-max-len 128 # so Sentinel will try to authenticate with the same password to all the # other Sentinels. So you need to configure all your Sentinels in a given # group with the same "requirepass" password. Check the following documentation -# for more info: https://redis.io/topics/sentinel +# for more info: https://redis.io/docs/latest/operate/oss_and_stack/management/sentinel/ # # IMPORTANT NOTE: starting with Redis 6.2 "requirepass" is a compatibility # layer on top of the ACL system. The option effect will be just setting diff --git a/src/latency.c b/src/latency.c index db4c9044d..69c9f4e5f 100644 --- a/src/latency.c +++ b/src/latency.c @@ -203,7 +203,7 @@ sds createLatencyReport(void) { if (dictSize(server.latency_events) == 0 && server.latency_monitor_threshold == 0) { - report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold .\" in order to enable it. If we weren't in a deep space mission I'd suggest to take a look at https://redis.io/topics/latency-monitor.\n"); + report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold .\" in order to enable it. If we weren't in a deep space mission I'd suggest to take a look at https://redis.io/docs/latest/operate/oss_and_stack/management/optimization/latency-monitor.\n"); return report; } diff --git a/src/module.c b/src/module.c index 032c3557c..883dda26f 100644 --- a/src/module.c +++ b/src/module.c @@ -983,7 +983,7 @@ int moduleGetCommandChannelsViaAPI(struct redisCommand *cmd, robj **argv, int ar * * These functions are used to implement custom Redis commands. * - * For examples, see https://redis.io/topics/modules-intro. + * For examples, see https://redis.io/docs/latest/develop/reference/modules/. * -------------------------------------------------------------------------- */ /* Return non-zero if a module command, that was declared with the @@ -1197,7 +1197,7 @@ RedisModuleCommand *moduleCreateCommandProxy(struct RedisModule *module, sds dec * from the same input arguments and key values. 
* Starting from Redis 7.0 this flag has been deprecated. * Declaring a command as "random" can be done using - * command tips, see https://redis.io/topics/command-tips. + * command tips, see https://redis.io/docs/latest/develop/reference/command-tips/. * * **"allow-stale"**: The command is allowed to run on slaves that don't * serve stale data. Don't use if you don't know what * this means. @@ -1587,7 +1587,7 @@ int RM_SetCommandACLCategories(RedisModuleCommand *command, const char *aclflags * both strings set to NULL. * * - `tips`: A string of space-separated tips regarding this command, meant for - * clients and proxies. See https://redis.io/topics/command-tips. + * clients and proxies. See https://redis.io/docs/latest/develop/reference/command-tips/. * * - `arity`: Number of arguments, including the command name itself. A positive * number specifies an exact number of arguments and a negative number @@ -5425,7 +5425,7 @@ int RM_HashGet(RedisModuleKey *key, int flags, ...) { /* -------------------------------------------------------------------------- * ## Key API for Stream type * - * For an introduction to streams, see https://redis.io/topics/streams-intro. + * For an introduction to streams, see https://redis.io/docs/latest/develop/data-types/streams/. * * The type RedisModuleStreamID, which is used in stream functions, is a struct * with two 64-bit fields and is defined as @@ -6298,7 +6298,7 @@ fmterr: * // Do something with myval. * } * - * This API is documented here: https://redis.io/topics/modules-intro + * This API is documented here: https://redis.io/docs/latest/develop/reference/modules/ */ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const char *fmt, ...) { client *c = NULL; @@ -6808,7 +6808,7 @@ robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj /* Register a new data type exported by the module. The parameters are the * following. Please for in depth documentation check the modules API - * documentation, especially https://redis.io/topics/modules-native-types. + * documentation, especially https://redis.io/docs/latest/develop/reference/modules/modules-native-types/. * * * **name**: A 9 characters data type name that MUST be unique in the Redis * Modules ecosystem. Be creative... and there will be no collisions. Use @@ -7697,7 +7697,7 @@ void RM_LatencyAddSample(const char *event, mstime_t latency) { * ## Blocking clients from modules * * For a guide about blocking commands in modules, see - * https://redis.io/topics/modules-blocking-ops. + * https://redis.io/docs/latest/develop/reference/modules/modules-blocking-ops/. * -------------------------------------------------------------------------- */ /* Returns 1 if the client already in the moduleUnblocked list, 0 otherwise. */ @@ -8709,7 +8709,7 @@ void moduleReleaseGIL(void) { * runs is dangerous and discouraged. In order to react to key space events with * write actions, please refer to `RM_AddPostNotificationJob`. * - * See https://redis.io/topics/notifications for more information. + * See https://redis.io/docs/latest/develop/use/keyspace-notifications/ for more information. 
*/ int RM_SubscribeToKeyspaceEvents(RedisModuleCtx *ctx, int types, RedisModuleNotificationFunc callback) { RedisModuleKeyspaceSubscriber *sub = zmalloc(sizeof(*sub)); diff --git a/src/notify.c b/src/notify.c index 237716699..8c8249131 100644 --- a/src/notify.c +++ b/src/notify.c @@ -9,7 +9,7 @@ #include "server.h" /* This file implements keyspace events notification via Pub/Sub and - * described at https://redis.io/topics/notifications. */ + * described at https://redis.io/docs/latest/develop/use/keyspace-notifications/. */ /* Turn a string representing notification classes into an integer * representing notification classes flags xored. diff --git a/utils/whatisdoing.sh b/utils/whatisdoing.sh index 68d7f7cca..09c0b081f 100755 --- a/utils/whatisdoing.sh +++ b/utils/whatisdoing.sh @@ -4,7 +4,7 @@ # Software Watchdog, which provides a similar functionality but in # a more reliable / easy to use way. # -# Check https://redis.io/topics/latency for more information. +# Check https://redis.io/docs/latest/operate/oss_and_stack/management/optimization/latency/ for more information. #!/bin/bash nsamples=1 From fdeb97629ef8964a5d9040328ee63734884ac874 Mon Sep 17 00:00:00 2001 From: Nugine Date: Fri, 8 Nov 2024 15:19:38 +0800 Subject: [PATCH 07/42] Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#13558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between the raw representation and the dense representation. They are 15 ~ 30 times faster than the scalar implementation. Note that the sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is the default (HLL_REGISTERS == 16384 && HLL_BITS == 6). When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg.
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5570.09 35.89060 32.51100 65.27900 69.11900 299.17 PFCOUNT-avx2 72604.92 2.82072 2.73500 5.50300 7.13500 3899.68 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 7879.13 25.52156 24.19100 46.33500 48.38300 492.45 PFMERGE-avx2 126448.64 1.58120 1.53500 3.08700 4.89500 7903.04 ------------------------------------------------------------------------------------------------------ scalar: redis:unstable 9906daf5c9fdb836a5b3f04829c75701a4e90eb4 avx2: Nugine:hll-simd 02e09f85ac07eace50ebdddd0fd70822f7b9152d CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp resolves #13551 --------- Co-authored-by: Yuan Wang Co-authored-by: debing.sun --- src/config.h | 13 +++ src/hyperloglog.c | 264 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 265 insertions(+), 12 deletions(-) diff --git a/src/config.h b/src/config.h index ae072c9df..c7fadf5ff 100644 --- a/src/config.h +++ b/src/config.h @@ -318,4 +318,17 @@ void setcpuaffinity(const char *cpulist); #define ATTRIBUTE_TARGET_POPCNT #endif +/* Check if we can compile AVX2 code */ +#if defined (__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined (HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 6ed88c13b..aa51d4eab 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -13,6 +13,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The Redis HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -1041,6 +1045,132 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. 
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. + * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
*/ +void hllMergeDense(uint8_t* reg_raw, const uint8_t* reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1054,12 +1184,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val,hdr->registers,i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1091,6 +1216,117 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. + * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. 
+ */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. */ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1350,12 +1586,16 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. */ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; - hdr = o->ptr; - switch(hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; - case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + if (use_dense) { + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; + case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of From 54038811c00e84f1ad5ed5b62f9e96032f5a77df Mon Sep 17 00:00:00 2001 From: Ozan Tezcan Date: Mon, 11 Nov 2024 09:34:18 +0300 Subject: [PATCH 08/42] Print command tokens on a crash when hide-user-data-from-log is enabled (#13639) If the `hide-user-data-from-log` config is enabled, we don't print client argv in the crashlog to avoid leaking user info. Debugging a crash becomes harder, though, as we don't see the command arguments that caused the crash. With this PR, we'll be printing command tokens to the log. As we have command tokens defined in the json schema for each command, we can use this data to find the tokens in the client argv. e.g. `SET key value GET EX 10` ---> we'll print `SET * * GET EX *` in the log. Modules should introduce their command structure via `RM_SetCommandInfo()`. Then, on a crash, we'll be able to know the module command tokens.
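For illustration, here is a minimal standalone sketch of the redaction idea (the token list and names are hypothetical, not the actual Redis code): keep the command name and any argument that case-insensitively matches a known token, and replace everything else with `*`:

```c
#include <stdio.h>
#include <string.h>
#include <strings.h> /* strcasecmp */

/* Hypothetical stand-in for the token list collected from a command's
 * json schema (e.g. for SET: GET, EX, PX, ...). */
static const char *tokens[] = {"GET", "EX", "PX", NULL};

static int is_token(const char *arg) {
    for (int i = 0; tokens[i]; i++)
        if (strcasecmp(tokens[i], arg) == 0) return 1;
    return 0;
}

int main(void) {
    const char *argv[] = {"SET", "key", "value", "GET", "EX", "10"};
    int argc = sizeof(argv) / sizeof(argv[0]);
    for (int j = 0; j < argc; j++) {
        /* Keep argv[0] (the command name) and known tokens, redact the rest. */
        printf("%s ", (j == 0 || is_token(argv[j])) ? argv[j] : "*");
    }
    printf("\n"); /* prints: SET * * GET EX * */
    return 0;
}
```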
--- src/debug.c | 82 ++++++-- tests/modules/crash.c | 288 ++++++++++++++++++++++++++++++++- tests/unit/moduleapi/crash.tcl | 70 +++++++- 3 files changed, 418 insertions(+), 22 deletions(-) diff --git a/src/debug.c b/src/debug.c index e77db51f0..e40375fbe 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1052,6 +1052,46 @@ NULL /* =========================== Crash handling ============================== */ +/* When hide-user-data-from-log is enabled, to avoid leaking user info, we only + * print tokens of the current command into the log. First, we collect command + * tokens into this struct (command tokens are defined in the json schema). + * Later, we check each argument against that token list. */ +#define CMD_TOKEN_MAX_COUNT 128 /* Max token count in a command's json schema */ +struct cmdToken { + const char *tokens[CMD_TOKEN_MAX_COUNT]; + int n_token; +}; + +/* Collect tokens from command arguments recursively. */ +static void cmdTokenCollect(struct cmdToken *tk, redisCommandArg *args, int argc) { + if (args == NULL) + return; + + for (int i = 0; i < argc && tk->n_token < CMD_TOKEN_MAX_COUNT; i++) { + if (args[i].token) + tk->tokens[tk->n_token++] = args[i].token; + cmdTokenCollect(tk, args[i].subargs, args[i].num_args); + } +} + +/* Get the tokens of the command. */ +static void cmdTokenGetFromCommand(struct cmdToken *tk, struct redisCommand *cmd) { + tk->n_token = 0; + cmdTokenCollect(tk, cmd->args, cmd->num_args); +} + +/* Check if the object is one of the command's tokens. */ +static int cmdTokenCheck(struct cmdToken *tk, robj *o) { + if (o->type != OBJ_STRING || !sdsEncodedObject(o)) + return 0; + + for (int i = 0; i < tk->n_token; i++) { + if (strcasecmp(tk->tokens[i], o->ptr) == 0) + return 1; + } + return 0; +} + __attribute__ ((noinline)) void _serverAssert(const char *estr, const char *file, int line) { int new_report = bugReportStart(); @@ -1072,28 +1112,35 @@ void _serverAssert(const char *estr, const char *file, int line) { bugReportEnd(0, 0); } -/* Returns the amount of client's command arguments we allow logging */ -int clientArgsToLog(const client *c) { - return server.hide_user_data_from_log ? 1 : c->argc; -} - void _serverAssertPrintClientInfo(const client *c) { int j; char conninfo[CONN_INFO_LEN]; + struct redisCommand *cmd = NULL; + struct cmdToken tokens = {{0}}; bugReportStart(); serverLog(LL_WARNING,"=== ASSERTION FAILED CLIENT CONTEXT ==="); serverLog(LL_WARNING,"client->flags = %llu", (unsigned long long) c->flags); serverLog(LL_WARNING,"client->conn = %s", connGetInfo(c->conn, conninfo, sizeof(conninfo))); serverLog(LL_WARNING,"client->argc = %d", c->argc); + if (server.hide_user_data_from_log) { + cmd = lookupCommand(c->argv, c->argc); + if (cmd) + cmdTokenGetFromCommand(&tokens, cmd); + } + for (j=0; j < c->argc; j++) { - if (j >= clientArgsToLog(c)) { - serverLog(LL_WARNING,"client->argv[%d] = *redacted*",j); - continue; - } char buf[128]; char *arg; + /* Allow command name, subcommand name and command tokens in the log.
*/ + if (server.hide_user_data_from_log && (j != 0 && !(j == 1 && cmd && cmd->parent))) { + if (!cmdTokenCheck(&tokens, c->argv[j])) { + serverLog(LL_WARNING, "client->argv[%d] = *redacted*", j); + continue; + } + } + if (c->argv[j]->type == OBJ_STRING && sdsEncodedObject(c->argv[j])) { arg = (char*) c->argv[j]->ptr; } else { @@ -2061,16 +2108,27 @@ void logCurrentClient(client *cc, const char *title) { sds client; int j; + struct redisCommand *cmd = NULL; + struct cmdToken tokens = {{0}}; serverLog(LL_WARNING|LL_RAW, "\n------ %s CLIENT INFO ------\n", title); client = catClientInfoString(sdsempty(),cc); serverLog(LL_WARNING|LL_RAW,"%s\n", client); sdsfree(client); serverLog(LL_WARNING|LL_RAW,"argc: '%d'\n", cc->argc); + if (server.hide_user_data_from_log) { + cmd = lookupCommand(cc->argv, cc->argc); + if (cmd) + cmdTokenGetFromCommand(&tokens, cmd); + } + for (j = 0; j < cc->argc; j++) { - if (j >= clientArgsToLog(cc)) { - serverLog(LL_WARNING|LL_RAW,"argv[%d]: *redacted*\n",j); - continue; + /* Allow command name, subcommand name and command tokens in the log. */ + if (server.hide_user_data_from_log && (j != 0 && !(j == 1 && cmd && cmd->parent))) { + if (!cmdTokenCheck(&tokens, cc->argv[j])) { + serverLog(LL_WARNING|LL_RAW, "argv[%d]: '*redacted*'\n", j); + continue; + } } robj *decoded; decoded = getDecodedObject(cc->argv[j]); diff --git a/tests/modules/crash.c b/tests/modules/crash.c index c7eccda52..c5063d037 100644 --- a/tests/modules/crash.c +++ b/tests/modules/crash.c @@ -21,19 +21,291 @@ void segfaultCrash(RedisModuleInfoCtx *ctx, int for_crash_report) { *p = 'x'; } +int cmd_crash(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + UNUSED(ctx); + UNUSED(argv); + UNUSED(argc); + + RedisModule_Assert(0); + return REDISMODULE_OK; +} + int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { REDISMODULE_NOT_USED(argv); REDISMODULE_NOT_USED(argc); - if (RedisModule_Init(ctx,"infocrash",1,REDISMODULE_APIVER_1) + if (RedisModule_Init(ctx,"modulecrash",1,REDISMODULE_APIVER_1) == REDISMODULE_ERR) return REDISMODULE_ERR; - RedisModule_Assert(argc == 1); - if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "segfault")) { - if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR; - } else if(!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "assert")) { - if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR; - } else { - return REDISMODULE_ERR; + + if (argc >= 1) { + if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "segfault")) { + if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR; + } else if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL),"assert")) { + if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR; + } } + /* Create modulecrash.xadd command which is similar to xadd command. 
+ * It will crash in the command handler to verify we print command tokens + * when hide-user-data-from-log config is enabled */ + RedisModuleCommandInfo info = { + .version = REDISMODULE_COMMAND_INFO_VERSION, + .arity = -5, + .key_specs = (RedisModuleCommandKeySpec[]){ + { + .notes = "UPDATE instead of INSERT because of the optional trimming feature", + .flags = REDISMODULE_CMD_KEY_RW | REDISMODULE_CMD_KEY_UPDATE, + .begin_search_type = REDISMODULE_KSPEC_BS_INDEX, + .bs.index.pos = 1, + .find_keys_type = REDISMODULE_KSPEC_FK_RANGE, + .fk.range = {0,1,0} + }, + {0} + }, + .args = (RedisModuleCommandArg[]){ + { + .name = "key", + .type = REDISMODULE_ARG_TYPE_KEY, + .key_spec_index = 0 + }, + { + .name = "nomkstream", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "NOMKSTREAM", + .since = "6.2.0", + .flags = REDISMODULE_CMD_ARG_OPTIONAL + }, + { + .name = "trim", + .type = REDISMODULE_ARG_TYPE_BLOCK, + .flags = REDISMODULE_CMD_ARG_OPTIONAL, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "strategy", + .type = REDISMODULE_ARG_TYPE_ONEOF, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "maxlen", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "MAXLEN", + }, + { + .name = "minid", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "MINID", + .since = "6.2.0", + }, + {0} + } + }, + { + .name = "operator", + .type = REDISMODULE_ARG_TYPE_ONEOF, + .flags = REDISMODULE_CMD_ARG_OPTIONAL, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "equal", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "=" + }, + { + .name = "approximately", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "~" + }, + {0} + } + }, + { + .name = "threshold", + .type = REDISMODULE_ARG_TYPE_STRING, + .display_text = "threshold" /* Just for coverage, doesn't have a visible effect */ + }, + { + .name = "count", + .type = REDISMODULE_ARG_TYPE_INTEGER, + .token = "LIMIT", + .since = "6.2.0", + .flags = REDISMODULE_CMD_ARG_OPTIONAL + }, + {0} + } + }, + { + .name = "id-selector", + .type = REDISMODULE_ARG_TYPE_ONEOF, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "auto-id", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "*" + }, + { + .name = "id", + .type = REDISMODULE_ARG_TYPE_STRING, + }, + {0} + } + }, + { + .name = "data", + .type = REDISMODULE_ARG_TYPE_BLOCK, + .flags = REDISMODULE_CMD_ARG_MULTIPLE, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "field", + .type = REDISMODULE_ARG_TYPE_STRING, + }, + { + .name = "value", + .type = REDISMODULE_ARG_TYPE_STRING, + }, + {0} + } + }, + {0} + } + }; + + RedisModuleCommand *cmd; + + if (RedisModule_CreateCommand(ctx,"modulecrash.xadd", cmd_crash,"write deny-oom random fast",0,0,0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + cmd = RedisModule_GetCommand(ctx,"modulecrash.xadd"); + if (RedisModule_SetCommandInfo(cmd, &info) == REDISMODULE_ERR) + return REDISMODULE_ERR; + + /* Create a subcommand: modulecrash.parent sub + * It will crash in the command handler to verify we print subcommand name + * when hide-user-data-from-log config is enabled */ + RedisModuleCommandInfo subcommand_info = { + .version = REDISMODULE_COMMAND_INFO_VERSION, + .arity = -5, + .key_specs = (RedisModuleCommandKeySpec[]){ + { + .flags = REDISMODULE_CMD_KEY_RW | REDISMODULE_CMD_KEY_UPDATE, + .begin_search_type = REDISMODULE_KSPEC_BS_INDEX, + .bs.index.pos = 1, + .find_keys_type = REDISMODULE_KSPEC_FK_RANGE, + .fk.range = {0,1,0} + }, + {0} + }, + .args = (RedisModuleCommandArg[]){ + { + .name = "key", + .type = REDISMODULE_ARG_TYPE_KEY, + 
.key_spec_index = 0 + }, + { + .name = "token", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "TOKEN", + .flags = REDISMODULE_CMD_ARG_OPTIONAL + }, + { + .name = "data", + .type = REDISMODULE_ARG_TYPE_BLOCK, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "field", + .type = REDISMODULE_ARG_TYPE_STRING, + }, + { + .name = "value", + .type = REDISMODULE_ARG_TYPE_STRING, + }, + {0} + } + }, + {0} + } + }; + + if (RedisModule_CreateCommand(ctx,"modulecrash.parent",NULL,"",0,0,0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + + RedisModuleCommand *parent = RedisModule_GetCommand(ctx,"modulecrash.parent"); + + if (RedisModule_CreateSubcommand(parent,"subcmd",cmd_crash,"",0,0,0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + + cmd = RedisModule_GetCommand(ctx,"modulecrash.parent|subcmd"); + if (RedisModule_SetCommandInfo(cmd, &subcommand_info) == REDISMODULE_ERR) + return REDISMODULE_ERR; + + /* Create modulecrash.zunion command which is similar to zunion command. + * It will crash in the command handler to verify we print command tokens + * when hide-user-data-from-log config is enabled */ + RedisModuleCommandInfo zunioninfo = { + .version = REDISMODULE_COMMAND_INFO_VERSION, + .arity = -5, + .key_specs = (RedisModuleCommandKeySpec[]){ + { + .flags = REDISMODULE_CMD_KEY_RO, + .begin_search_type = REDISMODULE_KSPEC_BS_INDEX, + .bs.index.pos = 1, + .find_keys_type = REDISMODULE_KSPEC_FK_KEYNUM, + .fk.keynum = {0,1,1} + }, + {0} + }, + .args = (RedisModuleCommandArg[]){ + { + .name = "numkeys", + .type = REDISMODULE_ARG_TYPE_INTEGER, + }, + { + .name = "key", + .type = REDISMODULE_ARG_TYPE_KEY, + .key_spec_index = 0, + .flags = REDISMODULE_CMD_ARG_MULTIPLE + }, + { + .name = "weights", + .type = REDISMODULE_ARG_TYPE_INTEGER, + .token = "WEIGHTS", + .flags = REDISMODULE_CMD_ARG_OPTIONAL | REDISMODULE_CMD_ARG_MULTIPLE + }, + { + .name = "aggregate", + .type = REDISMODULE_ARG_TYPE_ONEOF, + .token = "AGGREGATE", + .flags = REDISMODULE_CMD_ARG_OPTIONAL, + .subargs = (RedisModuleCommandArg[]){ + { + .name = "sum", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "sum" + }, + { + .name = "min", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "min" + }, + { + .name = "max", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "max" + }, + {0} + } + }, + { + .name = "withscores", + .type = REDISMODULE_ARG_TYPE_PURE_TOKEN, + .token = "WITHSCORES", + .flags = REDISMODULE_CMD_ARG_OPTIONAL + }, + {0} + } + }; + + if (RedisModule_CreateCommand(ctx,"modulecrash.zunion", cmd_crash,"readonly",0,0,0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + cmd = RedisModule_GetCommand(ctx,"modulecrash.zunion"); + if (RedisModule_SetCommandInfo(cmd, &zunioninfo) == REDISMODULE_ERR) + return REDISMODULE_ERR; + + return REDISMODULE_OK; } diff --git a/tests/unit/moduleapi/crash.tcl b/tests/unit/moduleapi/crash.tcl index dedbb1a1e..8c208b3b8 100644 --- a/tests/unit/moduleapi/crash.tcl +++ b/tests/unit/moduleapi/crash.tcl @@ -8,7 +8,7 @@ if {!$::valgrind} { start_server {tags {"modules"}} { r module load $testmodule assert test {Test module crash when info crashes with an assertion } { - catch {r 0 info infocrash} + catch {r 0 info modulecrash} set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000] set loglines [lindex $res 1] @@ -34,7 +34,7 @@ if {!$::valgrind} { start_server {tags {"modules"}} { r module load $testmodule segfault test {Test module crash when info crashes with a segfault} { - catch {r 0 info infocrash} + catch {r 0 info 
modulecrash} set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000] set loglines [lindex $res 1] @@ -60,4 +60,70 @@ if {!$::valgrind} { assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="] } } + + start_server {tags {"modules"}} { + r module load $testmodule + + # memcheck confuses sanitizer + r config set crash-memcheck-enabled no + + test {Test command tokens are printed when hide-user-data-from-log is enabled (xadd)} { + r config set hide-user-data-from-log yes + catch {r 0 modulecrash.xadd key NOMKSTREAM MAXLEN ~ 1000 * a b} + + wait_for_log_messages 0 {"*argv*0*: *modulecrash.xadd*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*1*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*2*: *NOMKSTREAM*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*3*: *MAXLEN*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*4*: *~*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*5*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*6*: *\**"} 0 10 1000 + wait_for_log_messages 0 {"*argv*7*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*8*: *redacted*"} 0 10 1000 + } + } + + start_server {tags {"modules"}} { + r module load $testmodule + + # memcheck confuses sanitizer + r config set crash-memcheck-enabled no + + test {Test command tokens are printed when hide-user-data-from-log is enabled (zunion)} { + r config set hide-user-data-from-log yes + catch {r 0 modulecrash.zunion 2 zset1 zset2 WEIGHTS 1 2 WITHSCORES somedata} + + wait_for_log_messages 0 {"*argv*0*: *modulecrash.zunion*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*1*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*2*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*3*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*4*: *WEIGHTS*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*5*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*6*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*7*: *WITHSCORES*"} 0 10 1000 + + # We don't expect arguments after WITHSCORES, but just in case there + # are, we'd rather not print them + wait_for_log_messages 0 {"*argv*8*: *redacted*"} 0 10 1000 + } + } + + start_server {tags {"modules"}} { + r module load $testmodule + + # memcheck confuses sanitizer + r config set crash-memcheck-enabled no + + test {Test subcommand name is printed when hide-user-data-from-log is enabled} { + r config set hide-user-data-from-log yes + catch {r 0 modulecrash.parent subcmd key TOKEN a b} + + wait_for_log_messages 0 {"*argv*0*: *modulecrash.parent*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*1*: *subcmd*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*2*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*3*: *TOKEN*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*4*: *redacted*"} 0 10 1000 + wait_for_log_messages 0 {"*argv*5*: *redacted*"} 0 10 1000 + } + } } From cf838038802b6d5049e3368953a3a737dd3aaee5 Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Tue, 12 Nov 2024 09:21:22 +0200 Subject: [PATCH 09/42] CRC64 perf improvements (#13638) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve the performance of crc64 for large batches by processing a large number of bytes in parallel and combining the results.
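To make the split-and-combine idea concrete, here is a hedged sketch (not the committed code): `crc64_parallel()` is a hypothetical helper, and the `crc64_combine()` argument order is inferred from the benchmark code added by this patch. Each half of a buffer is hashed independently, e.g. on two threads, then the results are stitched together as if computed in one pass:

```c
#include <stdint.h>
#include "crc64.h"      /* crc64() */
#include "crccombine.h" /* crc64_combine() */

/* Same value as BENCH_RPOLY in this patch's test code: the reversed
 * form of the crc64 polynomial. */
static const uint64_t CRC64_RPOLY = UINT64_C(0x95ac9329ac4bc9b5);

/* Assumes crc(A||B) == combine(crc(A), crc(B), len(B)), so the two halves
 * below could be hashed concurrently and merged afterwards. */
uint64_t crc64_parallel(const unsigned char *buf, uint64_t len) {
    uint64_t half = len / 2;
    uint64_t c1 = crc64(0, buf, half);              /* first half */
    uint64_t c2 = crc64(0, buf + half, len - half); /* second half */
    return crc64_combine(c1, c2, len - half, CRC64_RPOLY, 64);
}
```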
--------- Co-authored-by: Viktor Söderqvist Co-authored-by: Madelyn Olson Co-authored-by: Josiah Carlson --- src/Makefile | 6 +- src/crc64.c | 275 +++++++++++++++++++++++++++++++++++++++++------ src/crccombine.c | 252 +++++++++++++++++++++++++++++++++++++++++++ src/crccombine.h | 10 ++ src/crcspeed.c | 172 +++++++++++++++++++++++++---- src/crcspeed.h | 2 + 6 files changed, 658 insertions(+), 59 deletions(-) create mode 100644 src/crccombine.c create mode 100644 src/crccombine.h diff --git a/src/Makefile b/src/Makefile index b98908988..8f245d19d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,11 +354,11 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) -REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) -REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o redisassert.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o +REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o redisassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o 
mt19937-64.o strl.o REDIS_CHECK_RDB_NAME=redis-check-rdb$(PROG_SUFFIX) REDIS_CHECK_AOF_NAME=redis-check-aof$(PROG_SUFFIX) ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(REDIS_SERVER_OBJ) $(REDIS_CLI_OBJ) $(REDIS_BENCHMARK_OBJ))) diff --git a/src/crc64.c b/src/crc64.c index 73e039145..c00ead483 100644 --- a/src/crc64.c +++ b/src/crc64.c @@ -26,8 +26,11 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ +#include #include "crc64.h" #include "crcspeed.h" +#include "redisassert.h" +#include "testhelp.h" static uint64_t crc64_table[8][256] = {{0}}; #define POLY UINT64_C(0xad93d23594c935a9) @@ -67,14 +70,33 @@ static uint64_t crc64_table[8][256] = {{0}}; * \return The reflected data. *****************************************************************************/ static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) { - uint_fast64_t ret = data & 0x01; + /* only ever called for data_len == 64 in this codebase + * + * Borrowed from bit twiddling hacks, original in the public domain. + * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + * Extended to 64 bits, and added byteswap for final 3 steps. + * 16-30x 64-bit operations, no comparisons (16 for native byteswap, 30 for pure C) + */ - for (size_t i = 1; i < data_len; i++) { - data >>= 1; - ret = (ret << 1) | (data & 0x01); - } - - return ret; + assert(data_len <= 64); + /* swap odd and even bits */ + data = ((data >> 1) & 0x5555555555555555ULL) | ((data & 0x5555555555555555ULL) << 1); + /* swap consecutive pairs */ + data = ((data >> 2) & 0x3333333333333333ULL) | ((data & 0x3333333333333333ULL) << 2); + /* swap nibbles ... */ + data = ((data >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((data & 0x0F0F0F0F0F0F0F0FULL) << 4); +#if defined(__GNUC__) || defined(__clang__) + data = __builtin_bswap64(data); +#else + /* swap bytes */ + data = ((data >> 8) & 0x00FF00FF00FF00FFULL) | ((data & 0x00FF00FF00FF00FFULL) << 8); + /* swap 2-byte long pairs */ + data = ( data >> 16 & 0xFFFF0000FFFFULL) | ((data & 0xFFFF0000FFFFULL) << 16); + /* swap 4-byte quads */ + data = ( data >> 32 & 0xFFFFFFFFULL) | ((data & 0xFFFFFFFFULL) << 32); +#endif + /* adjust for non-64-bit reversals */ + return data >> (64 - data_len); } /** @@ -126,36 +148,221 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) { #ifdef REDIS_TEST #include +static void genBenchmarkRandomData(char *data, int count); +static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv); +static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv); +long long _ustime(void); + +#include +#include +#include +#include +#include +#include + +#include "zmalloc.h" +#include "crccombine.h" + +long long _ustime(void) { + struct timeval tv; + long long ust; + + gettimeofday(&tv, NULL); + ust = ((long long)tv.tv_sec)*1000000; + ust += tv.tv_usec; + return ust; +} + +static int bench_crc64(unsigned char *data, uint64_t size, long long passes, uint64_t check, char *name, int csv) { + uint64_t min = size, hash; + long long original_start = _ustime(), original_end; + for (long long i=passes; i > 0; i--) { + hash = crc64(0, data, size); + } + original_end = _ustime(); + min = (original_end - original_start) * 1000 / passes; + /* approximate nanoseconds without nstime */ + if (csv) { + printf("%s,%" PRIu64 ",%" PRIu64 ",%d\n", + name, size, (1000 * size) / min, hash == check); + } else { + printf("test size=%" PRIu64 " algorithm=%s %" PRIu64 " M/sec 
matches=%d\n", + size, name, (1000 * size) / min, hash == check); + } + return hash != check; +} + +const uint64_t BENCH_RPOLY = UINT64_C(0x95ac9329ac4bc9b5); + +static void bench_combine(char *label, uint64_t size, uint64_t expect, int csv) { + uint64_t min = size, start = expect, thash = expect ^ (expect >> 17); + long long original_start = _ustime(), original_end; + for (int i=0; i < 1000; i++) { + crc64_combine(thash, start, size, BENCH_RPOLY, 64); + } + original_end = _ustime(); + /* ran 1000 times, want ns per, counted us per 1000 ... */ + min = original_end - original_start; + if (csv) { + printf("%s,%" PRIu64 ",%" PRIu64 "\n", label, size, min); + } else { + printf("%s size=%" PRIu64 " in %" PRIu64 " nsec\n", label, size, min); + } +} + +static void genBenchmarkRandomData(char *data, int count) { + static uint32_t state = 1234; + int i = 0; + + while (count--) { + state = (state*1103515245+12345); + data[i++] = '0'+((state>>16)&63); + } +} + #define UNUSED(x) (void)(x) int crc64Test(int argc, char *argv[], int flags) { - UNUSED(argc); - UNUSED(argv); - UNUSED(flags); - crc64_init(); - printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", - (uint64_t)_crc64(0, "123456789", 9)); - printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", - (uint64_t)crc64(0, (unsigned char*)"123456789", 9)); - char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed " - "do eiusmod tempor incididunt ut labore et dolore magna " - "aliqua. Ut enim ad minim veniam, quis nostrud exercitation " - "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis " - "aute irure dolor in reprehenderit in voluptate velit esse " - "cillum dolore eu fugiat nulla pariatur. Excepteur sint " - "occaecat cupidatat non proident, sunt in culpa qui officia " - "deserunt mollit anim id est laborum."; - printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n", - (uint64_t)_crc64(0, li, sizeof(li))); - printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n", - (uint64_t)crc64(0, (unsigned char*)li, sizeof(li))); + + uint64_t crc64_test_size = 0; + int i, lastarg, csv = 0, loop = 0, combine = 0, testAll = 0; + +again: + if ((argc>=4) && (!strcmp(argv[3],"custom"))) { + for (i = 4; i < argc; i++) { + lastarg = (i == (argc - 1)); + if (!strcmp(argv[i], "--help")) { + goto usage; + } else if (!strcmp(argv[i], "--csv")) { + csv = 1; + } else if (!strcmp(argv[i], "-l")) { + loop = 1; + } else if (!strcmp(argv[i], "--crc")) { + if (lastarg) goto invalid; + crc64_test_size = atoll(argv[++i]); + } else if (!strcmp(argv[i], "--combine")) { + combine = 1; + } else { + invalid: + printf("Invalid option \"%s\" or option argument missing\n\n", + argv[i]); + usage: + printf( + "Usage: crc64 [OPTIONS]\n\n" + " --csv Output in CSV format\n" + " -l Loop. Run the tests forever\n" + " --crc Benchmark crc64 faster options, using a buffer this big, and quit when done.\n" + " --combine Benchmark crc64 combine value ranges and timings.\n" + ); + return 1; + } + } + } else { + crc64_test_size = 50000; + testAll = 1; + if (flags & REDIS_TEST_ACCURATE) crc64_test_size = 5000000; + } + + if ((crc64_test_size == 0 && combine == 0) || testAll) { + crc64_init(); + printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", + (uint64_t)_crc64(0, "123456789", 9)); + printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", + (uint64_t)crc64(0, (unsigned char*)"123456789", 9)); + char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed " + "do eiusmod tempor incididunt ut labore et dolore magna " + "aliqua. 
Ut enim ad minim veniam, quis nostrud exercitation " + "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis " + "aute irure dolor in reprehenderit in voluptate velit esse " + "cillum dolore eu fugiat nulla pariatur. Excepteur sint " + "occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum."; + printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n", + (uint64_t)_crc64(0, li, sizeof(li))); + printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n", + (uint64_t)crc64(0, (unsigned char*)li, sizeof(li))); + + if (!testAll) return 0; + } + + int init_this_loop = 1; + long long init_start, init_end; + + do { + unsigned char* data = NULL; + uint64_t passes = 0; + if (crc64_test_size) { + data = zmalloc(crc64_test_size); + genBenchmarkRandomData((char*)data, crc64_test_size); + /* We want to hash about 1 gig of data in total, looped, to get a good + * idea of our performance. + */ + passes = (UINT64_C(0x100000000) / crc64_test_size); + passes = passes >= 2 ? passes : 2; + passes = passes <= 1000 ? passes : 1000; + } + + crc64_init(); + /* warm up the cache */ + set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1); + uint64_t expect = crc64(0, data, crc64_test_size); + + if ((!combine || testAll) && crc64_test_size) { + if (csv && init_this_loop) printf("algorithm,buffer,performance,crc64_matches\n"); + + /* get the single-character version for single-byte Redis behavior */ + set_crc64_cutoffs(0, crc64_test_size+1); + assert(!bench_crc64(data, crc64_test_size, passes, expect, "crc_1byte", csv)); + + set_crc64_cutoffs(crc64_test_size+1, crc64_test_size+1); + /* run with 8-byte "single" path, crcfaster */ + assert(!(bench_crc64(data, crc64_test_size, passes, expect, "crcspeed", csv))); + + /* run with dual 8-byte paths */ + set_crc64_cutoffs(1, crc64_test_size+1); + assert(!(bench_crc64(data, crc64_test_size, passes, expect, "crcdual", csv))); + + /* run with tri 8-byte paths */ + set_crc64_cutoffs(1, 1); + assert(!(bench_crc64(data, crc64_test_size, passes, expect, "crctri", csv))); + + /* Be free memory region, be free. 
*/ + zfree(data); + data = NULL; + } + + uint64_t INIT_SIZE = UINT64_C(0xffffffffffffffff); + if (combine || testAll) { + if (init_this_loop) { + init_start = _ustime(); + crc64_combine( + UINT64_C(0xdeadbeefdeadbeef), + UINT64_C(0xfeebdaedfeebdaed), + INIT_SIZE, + BENCH_RPOLY, 64); + init_end = _ustime(); + + init_end -= init_start; + init_end *= 1000; + if (csv) { + printf("operation,size,nanoseconds\n"); + printf("init_64,%" PRIu64 ",%" PRIu64 "\n", INIT_SIZE, (uint64_t)init_end); + } else { + printf("init_64 size=%" PRIu64 " in %" PRIu64 " nsec\n", INIT_SIZE, (uint64_t)init_end); + } + /* use the hash itself as the size (unpredictable) */ + bench_combine("hash_as_size_combine", crc64_test_size, expect, csv); + + /* let's do something big (predictable, so fast) */ + bench_combine("largest_combine", INIT_SIZE, expect, csv); + } + bench_combine("combine", crc64_test_size, expect, csv); + } + init_this_loop = 0; + /* step down by ~1.641 for a range of test sizes */ + crc64_test_size -= (crc64_test_size >> 2) + (crc64_test_size >> 3) + (crc64_test_size >> 6); + } while (crc64_test_size > 3); + if (loop) goto again; return 0; } #endif - -#ifdef REDIS_TEST_MAIN -int main(int argc, char *argv[]) { - return crc64Test(argc, argv); -} - -#endif diff --git a/src/crccombine.c b/src/crccombine.c new file mode 100644 index 000000000..20add37ee --- /dev/null +++ b/src/crccombine.c @@ -0,0 +1,252 @@ +#include +#include +#include +#if defined(__i386__) || defined(__X86_64__) +#include +#endif +#include "crccombine.h" + +/* Copyright (C) 2013 Mark Adler + * Copyright (C) 2019-2024 Josiah Carlson + * Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler + * Modifications by Josiah Carlson + * - Added implementation variations with sample timings for gf_matrix_times*() + * - Most folks would be best using gf2_matrix_times_vec or + * gf2_matrix_times_vec2, unless some processor does AVX2 fast. + * - This is the implementation of the MERGE_CRC macro defined in + * crcspeed.c (which calls crc_combine()), and is a specialization of the + * generic crc_combine() (and related from the 2013 edition of Mark Adler's + * crc64.c)) for the sake of clarity and performance. + + This software is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Mark Adler + madler@alumni.caltech.edu +*/ + +#define STATIC_ASSERT(VVV) do {int test = 1 / (VVV);test++;} while (0) + +#if !((defined(__i386__) || defined(__X86_64__))) + +/* This cuts 40% of the time vs bit-by-bit. */ + +uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) { + /* + * Without using any vector math, this handles 4 bits at a time, + * and saves 40+% of the time compared to the bit-by-bit version. Use if you + * have no vector compile option available to you. 
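+ * (Each loop step consumes 4 bits of vec: the switch XORs in the subset
+ * of the next four matrix rows selected by those bits, so a full 64-bit
+ * multiply costs at most 16 dispatches instead of 64 single-bit tests.)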
With cache, we see: + * E5-2670 ~1-2us to extend ~1 meg 64 bit hash + */ + uint64_t sum; + + sum = 0; + while (vec) { + /* reversing the case order is ~10% slower on Xeon E5-2670 */ + switch (vec & 15) { + case 15: + sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 14: + sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 13: + sum ^= *mat ^ *(mat+2) ^ *(mat+3); + break; + case 12: + sum ^= *(mat+2) ^ *(mat+3); + break; + case 11: + sum ^= *mat ^ *(mat+1) ^ *(mat+3); + break; + case 10: + sum ^= *(mat+1) ^ *(mat+3); + break; + case 9: + sum ^= *mat ^ *(mat+3); + break; + case 8: + sum ^= *(mat+3); + break; + case 7: + sum ^= *mat ^ *(mat+1) ^ *(mat+2); + break; + case 6: + sum ^= *(mat+1) ^ *(mat+2); + break; + case 5: + sum ^= *mat ^ *(mat+2); + break; + case 4: + sum ^= *(mat+2); + break; + case 3: + sum ^= *mat ^ *(mat+1); + break; + case 2: + sum ^= *(mat+1); + break; + case 1: + sum ^= *mat; + break; + default: + break; + } + vec >>= 4; + mat += 4; + } + return sum; +} + +#define CRC_MULTIPLY gf2_matrix_times_switch + +#else + +/* + Warning: here there be dragons involving vector math, and macros to save us + from repeating the same information over and over. +*/ + +uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) { + /* + * Uses xmm registers on x86, works basically everywhere fast, doing + * cycles of movqda, mov, shr, pand, and, pxor, at least on gcc 8. + * Is 9-11x faster than original. + * E5-2670 ~29us to extend ~1 meg 64 bit hash + * i3-8130U ~22us to extend ~1 meg 64 bit hash + */ + v2uq sum = {0, 0}, + *mv2 = (v2uq*)mat; + /* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */ + static v2uq masks2[4] = { + {0,0}, + {-1,0}, + {0,-1}, + {-1,-1}, + }; + + /* Almost as beautiful as gf2_matrix_times_vec, but only half as many + * bits per step, so we need 2 per chunk4 operation. Faster in my tests. */ + +#define DO_CHUNK4() \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2; \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2 + +#define DO_CHUNK16() \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4() + + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + + STATIC_ASSERT(sizeof(uint64_t) == 8); + STATIC_ASSERT(sizeof(long long unsigned int) == 8); + return sum[0] ^ sum[1]; +} + +#undef DO_CHUNK16 +#undef DO_CHUNK4 + +#define CRC_MULTIPLY gf2_matrix_times_vec2 +#endif + +static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) { + unsigned n; + + for (n = 0; n < dim; n++) + square[n] = CRC_MULTIPLY(mat, mat[n]); +} + +/* Turns out our Redis / Jones CRC cycles at this point, so we can support + * more than 64 bits of extension if we want. Trivially. */ +static uint64_t combine_cache[64][64]; + +/* Mark Adler has some amazing updates to crc.c in his crcany repository. I + * like static caches, and not worrying about finding cycles generally. We are + * okay to spend the 32k of memory here, leaving the algorithm unchanged from + * as it was a decade ago, and be happy that it costs <200 microseconds to + * init, and that subsequent calls to the combine function take under 100 + * nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and + * we are currently targeting one: Jones CRC64. 
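+ * In short: combine_cache[k] holds the 64x64 GF(2) operator matrix that
+ * appends 2^k zero bytes to a running CRC, so crc64_combine() walks the
+ * set bits of len2, multiplies crc1 by the matching cached matrix for
+ * each, and finally XORs in crc2.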
+ */ + +void init_combine_cache(uint64_t poly, uint8_t dim) { + unsigned n, cache_num = 0; + combine_cache[1][0] = poly; + int prev = 1; + uint64_t row = 1; + for (n = 1; n < dim; n++) + { + combine_cache[1][n] = row; + row <<= 1; + } + + gf2_matrix_square(combine_cache[0], combine_cache[1], dim); + gf2_matrix_square(combine_cache[1], combine_cache[0], dim); + + /* do/while to overwrite the first two layers, they are not used, but are + * re-generated in the last two layers for the Redis polynomial */ + do { + gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim); + prev = -1; + } while (++cache_num < 64); +} + +/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the + * first block, crc2 is the CRC-64 of the second block, and len2 is the length + * of the second block. + * + * If you want reflections on your CRCs; do them outside before / after. + * WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST + * ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results. + * You MAY bzero() the even/odd static arrays, which will induce a re-cache on + * next call as a work-around, but ... maybe just parameterize the cached + * models at that point like Mark Adler does in modern crcany/crc.c . + */ +uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) { + /* degenerate case */ + if (len2 == 0) + return crc1; + + unsigned cache_num = 0; + if (combine_cache[0][0] == 0) { + init_combine_cache(poly, dim); + } + + /* apply len2 zeros to crc1 (first square will put the operator for one + zero byte, eight zero bits, in even) */ + do + { + /* apply zeros operator for this bit of len2 */ + if (len2 & 1) + crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1); + len2 >>= 1; + cache_num = (cache_num + 1) & 63; + /* if no more bits set, then done */ + } while (len2 != 0); + + /* return combined crc */ + crc1 ^= crc2; + return crc1; +} + +#undef CRC_MULTIPLY diff --git a/src/crccombine.h b/src/crccombine.h new file mode 100644 index 000000000..8da7c5fe6 --- /dev/null +++ b/src/crccombine.h @@ -0,0 +1,10 @@ + +#include + + +/* mask types */ +typedef unsigned long long v2uq __attribute__ ((vector_size (16))); + +uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec); +void init_combine_cache(uint64_t poly, uint8_t dim); +uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim); diff --git a/src/crcspeed.c b/src/crcspeed.c index 9682d8e0b..c7073cba2 100644 --- a/src/crcspeed.c +++ b/src/crcspeed.c @@ -1,11 +1,21 @@ /* * Copyright (C) 2013 Mark Adler + * Copyright (C) 2019-2024 Josiah Carlson * Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler * Modifications by Matt Stancliff : * - removed CRC64-specific behavior * - added generation of lookup tables by parameters * - removed inversion of CRC input/result * - removed automatic initialization in favor of explicit initialization + * Modifications by Josiah Carlson + * - Added case/vector/AVX/+ versions of crc combine function; see crccombine.c + * - added optional static cache + * - Modified to use 1 thread to: + * - Partition large crc blobs into 2-3 segments + * - Process the 2-3 segments in parallel + * - Merge the resulting crcs + * -> Resulting in 10-90% performance boost for data > 1 meg + * - macro-ized to reduce copy/pasta This software is provided 'as-is', without any express or implied warranty. 
In no event will the author be held liable for any damages @@ -28,6 +38,10 @@ */ #include "crcspeed.h" +#include "crccombine.h" + +#define CRC64_LEN_MASK UINT64_C(0x7ffffffffffffff8) +#define CRC64_REVERSED_POLY UINT64_C(0x95ac9329ac4bc9b5) /* Fill in a CRC constants table. */ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) { @@ -39,7 +53,7 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) { table[0][n] = crcfn(0, &v, 1); } - /* generate nested CRC table for future slice-by-8 lookup */ + /* generate nested CRC table for future slice-by-8/16/24+ lookup */ for (int n = 0; n < 256; n++) { crc = table[0][n]; for (int k = 1; k < 8; k++) { @@ -47,6 +61,10 @@ void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) { table[k][n] = crc; } } +#if USE_STATIC_COMBINE_CACHE + /* initialize combine cache for CRC stapling for slice-by 16/24+ */ + init_combine_cache(CRC64_REVERSED_POLY, 64); +#endif } void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) { @@ -104,45 +122,151 @@ void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) { } } +/* Note: doing all of our crc/next modifications *before* the crc table + * references is an absolute speedup on all CPUs tested. So... keep these + * macros separate. + */ + +#define DO_8_1(crc, next) \ + crc ^= *(uint64_t *)next; \ + next += 8 + +#define DO_8_2(crc) \ + crc = little_table[7][(uint8_t)crc] ^ \ + little_table[6][(uint8_t)(crc >> 8)] ^ \ + little_table[5][(uint8_t)(crc >> 16)] ^ \ + little_table[4][(uint8_t)(crc >> 24)] ^ \ + little_table[3][(uint8_t)(crc >> 32)] ^ \ + little_table[2][(uint8_t)(crc >> 40)] ^ \ + little_table[1][(uint8_t)(crc >> 48)] ^ \ + little_table[0][crc >> 56] + +#define CRC64_SPLIT(div) \ + olen = len; \ + next2 = next1 + ((len / div) & CRC64_LEN_MASK); \ + len = (next2 - next1) + +#define MERGE_CRC(crcn) \ + crc1 = crc64_combine(crc1, crcn, next2 - next1, CRC64_REVERSED_POLY, 64) + +#define MERGE_END(last, DIV) \ + len = olen - ((next2 - next1) * DIV); \ + next1 = last + +/* Variables so we can change for benchmarking; these seem to be fairly + * reasonable for Intel CPUs made since 2010. Please adjust as necessary if + * or when your CPU has more load / execute units. We've written benchmark code + * to help you tune your platform, see crc64Test. */ +#if defined(__i386__) || defined(__X86_64__) +static size_t CRC64_TRI_CUTOFF = (2*1024); +static size_t CRC64_DUAL_CUTOFF = (128); +#else +static size_t CRC64_TRI_CUTOFF = (16*1024); +static size_t CRC64_DUAL_CUTOFF = (1024); +#endif + + +void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff) { + CRC64_DUAL_CUTOFF = dual_cutoff; + CRC64_TRI_CUTOFF = tri_cutoff; +} + /* Calculate a non-inverted CRC multiple bytes at a time on a little-endian * architecture. If you need inverted CRC, invert *before* calling and invert * *after* calling. 
- * 64 bit crc = process 8 bytes at once; + * 64 bit crc = process 8/16/24 bytes at once; */ -uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc, +uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc1, void *buf, size_t len) { - unsigned char *next = buf; + unsigned char *next1 = buf; + + if (CRC64_DUAL_CUTOFF < 1) { + goto final; + } /* process individual bytes until we reach an 8-byte aligned pointer */ - while (len && ((uintptr_t)next & 7) != 0) { - crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + while (len && ((uintptr_t)next1 & 7) != 0) { + crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8); len--; } - /* fast middle processing, 8 bytes (aligned!) per loop */ - while (len >= 8) { - crc ^= *(uint64_t *)next; - crc = little_table[7][crc & 0xff] ^ - little_table[6][(crc >> 8) & 0xff] ^ - little_table[5][(crc >> 16) & 0xff] ^ - little_table[4][(crc >> 24) & 0xff] ^ - little_table[3][(crc >> 32) & 0xff] ^ - little_table[2][(crc >> 40) & 0xff] ^ - little_table[1][(crc >> 48) & 0xff] ^ - little_table[0][crc >> 56]; - next += 8; - len -= 8; - } + if (len > CRC64_TRI_CUTOFF) { + /* 24 bytes per loop, doing 3 parallel 8 byte chunks at a time */ + unsigned char *next2, *next3; + uint64_t olen, crc2=0, crc3=0; + CRC64_SPLIT(3); + /* len is now the length of the first segment, the 3rd segment possibly + * having extra bytes to clean up at the end + */ + next3 = next2 + len; + while (len >= 8) { + len -= 8; + DO_8_1(crc1, next1); + DO_8_1(crc2, next2); + DO_8_1(crc3, next3); + DO_8_2(crc1); + DO_8_2(crc2); + DO_8_2(crc3); + } + /* merge the 3 crcs */ + MERGE_CRC(crc2); + MERGE_CRC(crc3); + MERGE_END(next3, 3); + } else if (len > CRC64_DUAL_CUTOFF) { + /* 16 bytes per loop, doing 2 parallel 8 byte chunks at a time */ + unsigned char *next2; + uint64_t olen, crc2=0; + CRC64_SPLIT(2); + /* len is now the length of the first segment, the 2nd segment possibly + * having extra bytes to clean up at the end + */ + while (len >= 8) { + len -= 8; + DO_8_1(crc1, next1); + DO_8_1(crc2, next2); + DO_8_2(crc1); + DO_8_2(crc2); + } + + /* merge the 2 crcs */ + MERGE_CRC(crc2); + MERGE_END(next2, 2); + } + /* We fall through here to handle our = 8) { + len -= 8; + DO_8_1(crc1, next1); + DO_8_2(crc1); + } +final: /* process remaining bytes (can't be larger than 8) */ while (len) { - crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + crc1 = little_table[0][(crc1 ^ *next1++) & 0xff] ^ (crc1 >> 8); len--; } - return crc; + return crc1; } +/* clean up our namespace */ +#undef DO_8_1 +#undef DO_8_2 +#undef CRC64_SPLIT +#undef MERGE_CRC +#undef MERGE_END +#undef CRC64_REVERSED_POLY +#undef CRC64_LEN_MASK + + +/* note: similar perf advantages can be had for long strings in crc16 using all + * of the same optimizations as above; though this is unnecessary. crc16 is + * normally used to shard keys; not hash / verify data, so is used on shorter + * data that doesn't warrant such changes. */ + uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc, void *buf, size_t len) { unsigned char *next = buf; @@ -190,6 +314,10 @@ uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf, len--; } + /* note: alignment + 2/3-way processing can probably be handled here nearly + the same as above, using our updated DO_8_2 macro. Not included in these + changes, as other authors, I don't have big-endian to test with. 
*/ + while (len >= 8) { crc ^= *(uint64_t *)next; crc = big_table[0][crc & 0xff] ^ diff --git a/src/crcspeed.h b/src/crcspeed.h index d7ee95ebb..c29f236bc 100644 --- a/src/crcspeed.h +++ b/src/crcspeed.h @@ -34,6 +34,8 @@ typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t); typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t); +void set_crc64_cutoffs(size_t dual_cutoff, size_t tri_cutoff); + /* CRC-64 */ void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]); void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]); From 701f06657d20ec42f0cf78de0ac9d7197e44c00c Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Thu, 14 Nov 2024 20:35:31 +0800 Subject: [PATCH 10/42] Reuse c->argv after command execution to reduce memory allocation overhead (#13521) inspred by https://github.com/redis/redis/pull/12730 Before this PR, we allocate new memory to store the user command arguments, however, if the size of the current `c->argv` is larger than the current command, we can reuse the previously allocated argv to avoid allocating new memory for the current command. And we will free `c->argv` in client cron when the client is idle for 2 seconds. --------- Co-authored-by: Ozan Tezcan --- src/networking.c | 48 ++++++++++++++++++++++++++++++++---------------- src/server.c | 14 ++++++++++++++ 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/networking.c b/src/networking.c index 47312b8d8..95a6ee08e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1441,16 +1441,22 @@ void freeClientOriginalArgv(client *c) { c->original_argc = 0; } -void freeClientArgv(client *c) { +static inline void freeClientArgvInternal(client *c, int free_argv) { int j; for (j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); c->argc = 0; c->cmd = NULL; c->argv_len_sum = 0; - c->argv_len = 0; - zfree(c->argv); - c->argv = NULL; + if (free_argv) { + c->argv_len = 0; + zfree(c->argv); + c->argv = NULL; + } +} + +void freeClientArgv(client *c) { + freeClientArgvInternal(c, 1); } /* Close all the slaves connections. This is useful in chained replication @@ -2152,11 +2158,10 @@ int handleClientsWithPendingWrites(void) { return processed; } -/* resetClient prepare the client to process the next command */ -void resetClient(client *c) { +static inline void resetClientInternal(client *c, int free_argv) { redisCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; - freeClientArgv(c); + freeClientArgvInternal(c, free_argv); c->cur_script = NULL; c->reqtype = 0; c->multibulklen = 0; @@ -2195,6 +2200,11 @@ void resetClient(client *c) { } } +/* resetClient prepare the client to process the next command */ +void resetClient(client *c) { + resetClientInternal(c, 1); +} + /* This function is used when we want to re-enter the event loop but there * is the risk that the client we are dealing with will be freed in some * way. This happens for instance in: @@ -2292,9 +2302,12 @@ int processInlineBuffer(client *c) { /* Setup argv array on client structure */ if (argc) { - if (c->argv) zfree(c->argv); - c->argv_len = argc; - c->argv = zmalloc(sizeof(robj*)*c->argv_len); + /* Create new argv if space is insufficient. 
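+ * Reusing the prior allocation when argc <= c->argv_len saves a
+ * zfree()/zmalloc() pair on every command; clientsCronFreeArgvIfIdle()
+ * returns the memory once the client has been idle for more than
+ * two seconds.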
*/ + if (unlikely(argc > c->argv_len)) { + zfree(c->argv); + c->argv = zmalloc(sizeof(robj*)*argc); + c->argv_len = argc; + } c->argv_len_sum = 0; } @@ -2395,10 +2408,13 @@ int processMultibulkBuffer(client *c) { c->multibulklen = ll; - /* Setup argv array on client structure */ - if (c->argv) zfree(c->argv); - c->argv_len = min(c->multibulklen, 1024); - c->argv = zmalloc(sizeof(robj*)*c->argv_len); + /* Setup argv array on client structure. + * Create new argv if space is insufficient or if we need to allocate it gradually. */ + if (unlikely(c->multibulklen > c->argv_len || c->multibulklen > 1024)) { + zfree(c->argv); + c->argv_len = min(c->multibulklen, 1024); + c->argv = zmalloc(sizeof(robj*)*c->argv_len); + } c->argv_len_sum = 0; } @@ -2530,7 +2546,7 @@ void commandProcessed(client *c) { if (c->flags & CLIENT_BLOCKED) return; reqresAppendResponse(c); - resetClient(c); + resetClientInternal(c, 0); long long prev_offset = c->reploff; if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) { @@ -2661,7 +2677,7 @@ int processInputBuffer(client *c) { /* Multibulk processing could see a <= 0 length. */ if (c->argc == 0) { - resetClient(c); + resetClientInternal(c, 0); } else { /* If we are in the context of an I/O thread, we can't really * execute the command here. All we can do is to flag the client diff --git a/src/server.c b/src/server.c index 054ad171f..c60296a9a 100644 --- a/src/server.c +++ b/src/server.c @@ -784,6 +784,19 @@ int clientsCronResizeQueryBuffer(client *c) { return 0; } +/* If the client has been idle for too long, free the client's arguments. */ +int clientsCronFreeArgvIfIdle(client *c) { + /* If the arguments have already been freed or are still in use, exit ASAP. */ + if (!c->argv || c->argc) return 0; + time_t idletime = server.unixtime - c->lastinteraction; + if (idletime > 2) { + c->argv_len = 0; + zfree(c->argv); + c->argv = NULL; + } + return 0; +} + /* The client output buffer can be adjusted to better fit the memory requirements. * * the logic is: @@ -1050,6 +1063,7 @@ void clientsCron(void) { * terminated. */ if (clientsCronHandleTimeout(c,now)) continue; if (clientsCronResizeQueryBuffer(c)) continue; + if (clientsCronFreeArgvIfIdle(c)) continue; if (clientsCronResizeOutputBuffer(c,now)) continue; if (clientsCronTrackExpansiveClients(c, curr_peak_mem_usage_slot)) continue; From 5b84dc967855eaecd90550584608145037d88ac6 Mon Sep 17 00:00:00 2001 From: nafraf Date: Thu, 21 Nov 2024 01:14:14 -0500 Subject: [PATCH 11/42] Fix module loadex command crash due to invalid config (#13653) Fix to https://github.com/redis/redis/issues/13650 providing an invalid config to a module with datatype crashes when redis tries to unload the module due to the invalid config --------- Co-authored-by: debing.sun --- src/module.c | 13 ++++++++----- src/server.h | 2 +- tests/modules/datatype.c | 9 +++++++++ tests/unit/moduleapi/datatype.tcl | 5 +++++ 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/module.c b/src/module.c index 883dda26f..6f0fa8abf 100644 --- a/src/module.c +++ b/src/module.c @@ -12371,7 +12371,7 @@ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loa } if (post_load_err) { - moduleUnload(ctx.module->name, NULL); + serverAssert(moduleUnload(ctx.module->name, NULL, 1) == C_OK); moduleFreeContext(&ctx); return C_ERR; } @@ -12387,14 +12387,17 @@ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loa /* Unload the module registered with the specified name. 
On success * C_OK is returned, otherwise C_ERR is returned and errmsg is set - * with an appropriate message. */ -int moduleUnload(sds name, const char **errmsg) { + * with an appropriate message. + * Only forcefully unload this module, passing forced_unload != 0, + * if it is certain that it has not yet been in use (e.g., immediate + * unload on failed load). */ +int moduleUnload(sds name, const char **errmsg, int forced_unload) { struct RedisModule *module = dictFetchValue(modules,name); if (module == NULL) { *errmsg = "no such module with that name"; return C_ERR; - } else if (listLength(module->types)) { + } else if (listLength(module->types) && !forced_unload) { *errmsg = "the module exports one or more module-side data " "types, can't unload"; return C_ERR; @@ -13182,7 +13185,7 @@ NULL } else if (!strcasecmp(subcmd,"unload") && c->argc == 3) { const char *errmsg = NULL; - if (moduleUnload(c->argv[2]->ptr, &errmsg) == C_OK) + if (moduleUnload(c->argv[2]->ptr, &errmsg, 0) == C_OK) addReply(c,shared.ok); else { if (errmsg == NULL) errmsg = "operation not possible."; diff --git a/src/server.h b/src/server.h index 4f5686192..bfdbde8da 100644 --- a/src/server.h +++ b/src/server.h @@ -2510,7 +2510,7 @@ void moduleInitModulesSystem(void); void moduleInitModulesSystemLast(void); void modulesCron(void); int moduleLoad(const char *path, void **argv, int argc, int is_loadex); -int moduleUnload(sds name, const char **errmsg); +int moduleUnload(sds name, const char **errmsg, int forced_unload); void moduleLoadFromQueue(void); int moduleGetCommandKeysViaAPI(struct redisCommand *cmd, robj **argv, int argc, getKeysResult *result); int moduleGetCommandChannelsViaAPI(struct redisCommand *cmd, robj **argv, int argc, getKeysResult *result); diff --git a/tests/modules/datatype.c b/tests/modules/datatype.c index 408d1a526..05cf2337c 100644 --- a/tests/modules/datatype.c +++ b/tests/modules/datatype.c @@ -312,3 +312,12 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) return REDISMODULE_OK; } + +int RedisModule_OnUnload(RedisModuleCtx *ctx) { + REDISMODULE_NOT_USED(ctx); + if (datatype) { + RedisModule_Free(datatype); + datatype = NULL; + } + return REDISMODULE_OK; +} diff --git a/tests/unit/moduleapi/datatype.tcl b/tests/unit/moduleapi/datatype.tcl index 951c060e7..5d1722caa 100644 --- a/tests/unit/moduleapi/datatype.tcl +++ b/tests/unit/moduleapi/datatype.tcl @@ -1,6 +1,11 @@ set testmodule [file normalize tests/modules/datatype.so] start_server {tags {"modules"}} { + test {DataType: test loadex with invalid config} { + catch { r module loadex $testmodule CONFIG invalid_config 1 } e + assert_match {*ERR Error loading the extension*} $e + } + r module load $testmodule test {DataType: Test module is sane, GET/SET work.} { From 79fd2558284df54b65b839c2edb5dcb875a5e00c Mon Sep 17 00:00:00 2001 From: Oran Agra Date: Thu, 21 Nov 2024 08:22:17 +0200 Subject: [PATCH 12/42] Add Lua VM memory to memory overhead, now that it's part of zmalloc (#13660) To complement the work done in #13133. it added the script VMs memory to be counted as part of zmalloc, but that means they should be also counted as part of the non-value overhead. this commit contains some refactoring to make variable names and function names less confusing. it also adds a new field named `script.VMs` into the `MEMORY STATS` command. 
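for illustration, the new field appears alongside the existing cache fields (field order as in the code; the reply below is abridged and the values are made up):

```
127.0.0.1:6379> MEMORY STATS
...
"lua.caches"
(integer) 680
"functions.caches"
(integer) 184
"script.VMs"
(integer) 49152
...
```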
additionally, clear scripts and stats between tests in external mode (which is related to how this issue was discovered) --- src/commands/memory-stats.json | 3 +++ src/eval.c | 4 ++-- src/functions.c | 4 ++-- src/functions.h | 4 ++-- src/object.c | 17 ++++++++++++----- src/server.c | 10 +++++----- src/server.h | 7 ++++--- tests/support/server.tcl | 2 ++ 8 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/commands/memory-stats.json b/src/commands/memory-stats.json index 98e49b7d2..23dac5eb2 100644 --- a/src/commands/memory-stats.json +++ b/src/commands/memory-stats.json @@ -44,6 +44,9 @@ "lua.caches": { "type": "integer" }, + "script.VMs": { + "type": "integer" + }, "functions.caches": { "type": "integer" }, diff --git a/src/eval.c b/src/eval.c index b11656f1d..2c83db19e 100644 --- a/src/eval.c +++ b/src/eval.c @@ -730,7 +730,7 @@ NULL } } -unsigned long evalMemory(void) { +unsigned long evalScriptsMemoryVM(void) { return luaMemory(lctx.lua); } @@ -738,7 +738,7 @@ dict* evalScriptsDict(void) { return lctx.lua_scripts; } -unsigned long evalScriptsMemory(void) { +unsigned long evalScriptsMemoryEngine(void) { return lctx.lua_scripts_mem + dictMemUsage(lctx.lua_scripts) + dictSize(lctx.lua_scripts) * sizeof(luaScript) + diff --git a/src/functions.c b/src/functions.c index b74dd9a28..dde42daf6 100644 --- a/src/functions.c +++ b/src/functions.c @@ -1063,7 +1063,7 @@ void functionLoadCommand(client *c) { } /* Return memory usage of all the engines combine */ -unsigned long functionsMemory(void) { +unsigned long functionsMemoryVM(void) { dictIterator *iter = dictGetIterator(engines); dictEntry *entry = NULL; size_t engines_memory = 0; @@ -1078,7 +1078,7 @@ unsigned long functionsMemory(void) { } /* Return memory overhead of all the engines combine */ -unsigned long functionsMemoryOverhead(void) { +unsigned long functionsMemoryEngine(void) { size_t memory_overhead = dictMemUsage(engines); memory_overhead += dictMemUsage(curr_functions_lib_ctx->functions); memory_overhead += sizeof(functionsLibCtx); diff --git a/src/functions.h b/src/functions.h index ed4392db3..4df6d1b59 100644 --- a/src/functions.h +++ b/src/functions.h @@ -102,8 +102,8 @@ struct functionLibInfo { int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibCtx *lib_ctx, size_t timeout); -unsigned long functionsMemory(void); -unsigned long functionsMemoryOverhead(void); +unsigned long functionsMemoryVM(void); +unsigned long functionsMemoryEngine(void); unsigned long functionsNum(void); unsigned long functionsLibNum(void); dict* functionsLibGet(void); diff --git a/src/object.c b/src/object.c index d065359fa..5f43904d6 100644 --- a/src/object.c +++ b/src/object.c @@ -1230,12 +1230,16 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mh->aof_buffer = mem; mem_total+=mem; - mem = evalScriptsMemory(); - mh->lua_caches = mem; + mem = evalScriptsMemoryEngine(); + mh->eval_caches = mem; mem_total+=mem; - mh->functions_caches = functionsMemoryOverhead(); + mh->functions_caches = functionsMemoryEngine(); mem_total+=mh->functions_caches; + mh->script_vm = evalScriptsMemoryVM(); + mh->script_vm += functionsMemoryVM(); + mem_total+=mh->script_vm; + for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; if (!kvstoreNumAllocatedDicts(db->keys)) continue; @@ -1556,7 +1560,7 @@ NULL } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) { struct redisMemOverhead *mh = getMemoryOverheadData(); - 
addReplyMapLen(c,31+mh->num_dbs); + addReplyMapLen(c,32+mh->num_dbs); addReplyBulkCString(c,"peak.allocated"); addReplyLongLong(c,mh->peak_allocated); @@ -1583,11 +1587,14 @@ NULL addReplyLongLong(c,mh->aof_buffer); addReplyBulkCString(c,"lua.caches"); - addReplyLongLong(c,mh->lua_caches); + addReplyLongLong(c,mh->eval_caches); addReplyBulkCString(c,"functions.caches"); addReplyLongLong(c,mh->functions_caches); + addReplyBulkCString(c,"script.VMs"); + addReplyLongLong(c,mh->script_vm); + for (size_t j = 0; j < mh->num_dbs; j++) { char dbname[32]; snprintf(dbname,sizeof(dbname),"db.%zd",mh->db[j].dbid); diff --git a/src/server.c b/src/server.c index c60296a9a..973b02001 100644 --- a/src/server.c +++ b/src/server.c @@ -5700,8 +5700,8 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { size_t zmalloc_used = zmalloc_used_memory(); size_t total_system_mem = server.system_memory_size; const char *evict_policy = evictPolicyToString(); - long long memory_lua = evalMemory(); - long long memory_functions = functionsMemory(); + long long memory_lua = evalScriptsMemoryVM(); + long long memory_functions = functionsMemoryVM(); struct redisMemOverhead *mh = getMemoryOverheadData(); /* Peak memory is updated from time to time by serverCron() so it @@ -5716,7 +5716,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { bytesToHuman(total_system_hmem,sizeof(total_system_hmem),total_system_mem); bytesToHuman(used_memory_lua_hmem,sizeof(used_memory_lua_hmem),memory_lua); bytesToHuman(used_memory_vm_total_hmem,sizeof(used_memory_vm_total_hmem),memory_functions + memory_lua); - bytesToHuman(used_memory_scripts_hmem,sizeof(used_memory_scripts_hmem),mh->lua_caches + mh->functions_caches); + bytesToHuman(used_memory_scripts_hmem,sizeof(used_memory_scripts_hmem),mh->eval_caches + mh->functions_caches); bytesToHuman(used_memory_rss_hmem,sizeof(used_memory_rss_hmem),server.cron_malloc_stats.process_rss); bytesToHuman(maxmemory_hmem,sizeof(maxmemory_hmem),server.maxmemory); @@ -5742,7 +5742,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "used_memory_lua:%lld\r\n", memory_lua, /* deprecated, renamed to used_memory_vm_eval */ "used_memory_vm_eval:%lld\r\n", memory_lua, "used_memory_lua_human:%s\r\n", used_memory_lua_hmem, /* deprecated */ - "used_memory_scripts_eval:%lld\r\n", (long long)mh->lua_caches, + "used_memory_scripts_eval:%lld\r\n", (long long)mh->eval_caches, "number_of_cached_scripts:%lu\r\n", dictSize(evalScriptsDict()), "number_of_functions:%lu\r\n", functionsNum(), "number_of_libraries:%lu\r\n", functionsLibNum(), @@ -5750,7 +5750,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "used_memory_vm_total:%lld\r\n", memory_functions + memory_lua, "used_memory_vm_total_human:%s\r\n", used_memory_vm_total_hmem, "used_memory_functions:%lld\r\n", (long long)mh->functions_caches, - "used_memory_scripts:%lld\r\n", (long long)mh->lua_caches + (long long)mh->functions_caches, + "used_memory_scripts:%lld\r\n", (long long)mh->eval_caches + (long long)mh->functions_caches, "used_memory_scripts_human:%s\r\n", used_memory_scripts_hmem, "maxmemory:%lld\r\n", server.maxmemory, "maxmemory_human:%s\r\n", maxmemory_hmem, diff --git a/src/server.h b/src/server.h index bfdbde8da..5a89c2ad6 100644 --- a/src/server.h +++ b/src/server.h @@ -1401,8 +1401,9 @@ struct redisMemOverhead { size_t clients_normal; size_t cluster_links; size_t aof_buffer; - size_t lua_caches; + size_t eval_caches; size_t 
functions_caches; + size_t script_vm; size_t overhead_total; size_t dataset; size_t total_keys; @@ -3510,9 +3511,9 @@ int ldbIsEnabled(void); void ldbLog(sds entry); void ldbLogRedisReply(char *reply); void sha1hex(char *digest, char *script, size_t len); -unsigned long evalMemory(void); +unsigned long evalScriptsMemoryVM(void); dict* evalScriptsDict(void); -unsigned long evalScriptsMemory(void); +unsigned long evalScriptsMemoryEngine(void); uint64_t evalGetCommandFlags(client *c, uint64_t orig_flags); uint64_t fcallGetCommandFlags(client *c, uint64_t orig_flags); int isInsideYieldingLongCommand(void); diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 0db72cbfe..e429043fc 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -373,6 +373,8 @@ proc run_external_server_test {code overrides} { r flushall r function flush + r script flush + r config resetstat # store configs set saved_config {} From 155634502d15adde8b55f1f6d1f93101f76986a7 Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Thu, 21 Nov 2024 09:55:02 +0200 Subject: [PATCH 13/42] modules API: Support register unprefixed config parameters (#13656) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #10285 introduced support for modules to register four types of configurations — Bool, Numeric, String, and Enum. Accessible through the Redis config file and the CONFIG command. With this PR, it will be possible to register configuration parameters without automatically prefixing the parameter names. This provides greater flexibility in configuration naming, enabling, for instance, both `bf-initial-size` or `initial-size` to be defined in the module without automatically prefixing with `.`. In addition it will also be possible to create a single additional alias via the same API. This brings us another step closer to integrate modules into redis core. **Example:** Register a configuration parameter `bf-initial-size` with an alias `initial-size` without the automatic module name prefix, set with new `REDISMODULE_CONFIG_UNPREFIXED` flag: ``` RedisModule_RegisterBoolConfig(ctx, "bf-initial-size|initial-size", default_val, optflags | REDISMODULE_CONFIG_UNPREFIXED, getfn, setfn, applyfn, privdata); ``` # API changes Related functions that now support unprefixed configuration flag (`REDISMODULE_CONFIG_UNPREFIXED`) along with optional alias: ``` RedisModule_RegisterBoolConfig RedisModule_RegisterEnumConfig RedisModule_RegisterNumericConfig RedisModule_RegisterStringConfig ``` # Implementation Details: `config.c`: On load server configuration, at function `loadServerConfigFromString()`, it collects all unknown configurations into `module_configs_queue` dictionary. These may include valid module configurations or invalid ones. They will be validated later by `loadModuleConfigs()` against the configurations declared by the loaded module(s). `Module.c:` The `ModuleConfig` structure has been modified to store now: (1) Full configuration name (2) Alias (3) Unprefixed flag status - ensuring that configurations retain their original registration format when triggered in notifications. Added error printout: This change introduces an error printout for unresolved configurations, detailing each unresolved parameter detected during startup. 
The last line in the output existed prior to this change and has been retained for systems that rely on it:
```
595011:M 18 Nov 2024 08:26:23.616 # Unresolved Configuration(s) Detected:
595011:M 18 Nov 2024 08:26:23.616 # >>> 'bf-initiel-size 8'
595011:M 18 Nov 2024 08:26:23.616 # >>> 'search-sizex 32'
595011:M 18 Nov 2024 08:26:23.616 # Module Configuration detected without loadmodule directive or no ApplyConfig call: aborting
```

# Backward Compatibility:
Existing modules will function without modification, as the new functionality
only applies if REDISMODULE_CONFIG_UNPREFIXED is explicitly set.

# Module vs. Core API Conflict Behavior
The new API allows modules to load duplicates of the same configuration name
or of the same configuration alias, just like redis core configuration allows
(i.e. the user sets the same config twice with different values, but the two
directives actually refer to the same config). Unlike redis core, given a name
and its alias, it doesn't allow both configurations to be present on load. To
support that, the `module_configs_queue` DS would need to reflect the order of
loading and, later on, during `loadModuleConfigs()`, pairs of names and
aliases would need to be resolved to determine which one is the last to apply.
"Relaxing" this limitation can be deferred to a future update if necessary,
but for now, we error in this case.
---
 src/config.c                           | 152 +++++++++-----
 src/module.c                           | 267 ++++++++++++++++++++-----
 src/redismodule.h                      |   1 +
 src/server.h                           |  11 +-
 tests/modules/moduleconfigs.c          |  67 ++++++-
 tests/unit/moduleapi/moduleconfigs.tcl | 107 +++++++++-
 6 files changed, 494 insertions(+), 111 deletions(-)

diff --git a/src/config.c b/src/config.c
index 720311630..d0d30966c 100644
--- a/src/config.c
+++ b/src/config.c
@@ -268,7 +268,7 @@ dict *configs = NULL; /* Runtime config values */
 
 /* Lookup a config by the provided sds string name, or return NULL
  * if the config does not exist */
-static standardConfig *lookupConfig(sds name) {
+static standardConfig *lookupConfig(const sds name) {
     dictEntry *de = dictFind(configs, name);
     return de ? dictGetVal(de) : NULL;
 }
@@ -552,16 +552,6 @@ void loadServerConfigFromString(char *config) {
             }
         } else if (!strcasecmp(argv[0],"loadmodule") && argc >= 2) {
             queueLoadModule(argv[1],&argv[2],argc-2);
-        } else if (strchr(argv[0], '.')) {
-            if (argc < 2) {
-                err = "Module config specified without value";
-                goto loaderr;
-            }
-            sds name = sdsdup(argv[0]);
-            sds val = sdsdup(argv[1]);
-            for (int i = 2; i < argc; i++)
-                val = sdscatfmt(val, " %S", argv[i]);
-            if (!dictReplace(server.module_configs_queue, name, val)) sdsfree(name);
         } else if (!strcasecmp(argv[0],"sentinel")) {
             /* argc == 1 is handled by main() as we need to enter the sentinel
              * mode ASAP. */
@@ -573,7 +563,20 @@ void loadServerConfigFromString(char *config) {
                 queueSentinelConfig(argv+1,argc-1,linenum,lines[i]);
             }
         } else {
-            err = "Bad directive or wrong number of arguments"; goto loaderr;
+            /* Collect all unknown configurations into `module_configs_queue`.
+             * These may include valid module configurations or invalid ones.
+             * They will be validated later by loadModuleConfigs() against the
+             * configurations declared by the loaded module(s).
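+             * For example, a config-file line "bf-initial-size 8" is queued
+             * here and only resolved once a loaded module registers
+             * bf-initial-size (or an alias for it); anything left unresolved
+             * aborts startup with the "Unresolved Configuration(s)" log above.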
*/ + + if (argc < 2) { + err = "Bad directive or wrong number of arguments"; + goto loaderr; + } + sds name = sdsdup(argv[0]); + sds val = sdsdup(argv[1]); + for (int i = 2; i < argc; i++) + val = sdscatfmt(val, " %S", argv[i]); + if (!dictReplace(server.module_configs_queue, name, val)) sdsfree(name); } sdsfreesplitres(argv,argc); argv = NULL; @@ -3312,16 +3315,34 @@ void removeConfig(sds name) { standardConfig *config = lookupConfig(name); if (!config) return; if (config->flags & MODULE_CONFIG) { + sdsfree((sds) config->name); - if (config->type == ENUM_CONFIG) { - configEnum *enumNode = config->data.enumd.enum_value; - while(enumNode->name != NULL) { - zfree(enumNode->name); - enumNode++; - } - zfree(config->data.enumd.enum_value); - } else if (config->type == SDS_CONFIG) { - if (config->data.sds.default_value) sdsfree((sds)config->data.sds.default_value); + sdsfree((sds) config->alias); + + switch (config->type) { + case BOOL_CONFIG: + break; + case NUMERIC_CONFIG: + break; + case SDS_CONFIG: + if (config->data.sds.default_value) + sdsfree((sds)config->data.sds.default_value); + break; + case ENUM_CONFIG: + { + configEnum *enumNode = config->data.enumd.enum_value; + while(enumNode->name != NULL) { + zfree(enumNode->name); + enumNode++; + } + zfree(config->data.enumd.enum_value); + } + break; + case SPECIAL_CONFIG: /* Not used by modules */ + case STRING_CONFIG: /* Not used by modules */ + default: + serverAssert(0); + break; } } dictDelete(configs, name); @@ -3332,40 +3353,77 @@ void removeConfig(sds name) { *----------------------------------------------------------------------------*/ /* Create a bool/string/enum/numeric standardConfig for a module config in the configs dictionary */ -void addModuleBoolConfig(const char *module_name, const char *name, int flags, void *privdata, int default_val) { - sds config_name = sdscatfmt(sdsempty(), "%s.%s", module_name, name); + +/* On removeConfig(), name and alias will be sdsfree() */ +void addModuleBoolConfig(sds name, sds alias, int flags, void *privdata, int default_val) { int config_dummy_address; - standardConfig module_config = createBoolConfig(config_name, NULL, flags | MODULE_CONFIG, config_dummy_address, default_val, NULL, NULL); - module_config.data.yesno.config = NULL; - module_config.privdata = privdata; - registerConfigValue(config_name, &module_config, 0); + standardConfig sc = createBoolConfig(name, alias, flags | MODULE_CONFIG, config_dummy_address, default_val, NULL, NULL); + sc.data.yesno.config = NULL; + sc.privdata = privdata; + registerConfigValue(name, &sc, 0); + + /* If alias available, deep copy standardConfig and register again */ + if (alias) { + sc.name = sdsdup(name); + sc.alias = sdsdup(alias); + registerConfigValue(sc.alias, &sc, 1); + } } -void addModuleStringConfig(const char *module_name, const char *name, int flags, void *privdata, sds default_val) { - sds config_name = sdscatfmt(sdsempty(), "%s.%s", module_name, name); +/* On removeConfig(), name, default_val, and alias will be sdsfree() */ +void addModuleStringConfig(sds name, sds alias, int flags, void *privdata, sds default_val) { sds config_dummy_address; - standardConfig module_config = createSDSConfig(config_name, NULL, flags | MODULE_CONFIG, 0, config_dummy_address, default_val, NULL, NULL); - module_config.data.sds.config = NULL; - module_config.privdata = privdata; - registerConfigValue(config_name, &module_config, 0); + standardConfig sc = createSDSConfig(name, alias, flags | MODULE_CONFIG, 0, config_dummy_address, default_val, NULL, NULL); + 
sc.data.sds.config = NULL; + sc.privdata = privdata; + registerConfigValue(name, &sc, 0); /* memcpy sc */ + + /* If alias available, deep copy standardConfig and register again */ + if (alias) { + sc.name = sdsdup(name); + sc.alias = sdsdup(alias); + if (default_val) sc.data.sds.default_value = sdsdup(default_val); + registerConfigValue(sc.alias, &sc, 1); + } } -void addModuleEnumConfig(const char *module_name, const char *name, int flags, void *privdata, int default_val, configEnum *enum_vals) { - sds config_name = sdscatfmt(sdsempty(), "%s.%s", module_name, name); +/* On removeConfig(), name, default_val, alias and enum_vals will be freed */ +void addModuleEnumConfig(sds name, sds alias, int flags, void *privdata, int default_val, configEnum *enum_vals, int num_enum_vals) { int config_dummy_address; - standardConfig module_config = createEnumConfig(config_name, NULL, flags | MODULE_CONFIG, enum_vals, config_dummy_address, default_val, NULL, NULL); - module_config.data.enumd.config = NULL; - module_config.privdata = privdata; - registerConfigValue(config_name, &module_config, 0); + standardConfig sc = createEnumConfig(name, alias, flags | MODULE_CONFIG, enum_vals, config_dummy_address, default_val, NULL, NULL); + sc.data.enumd.config = NULL; + sc.privdata = privdata; + registerConfigValue(name, &sc, 0); + + /* If alias available, deep copy standardConfig and register again */ + if (alias) { + sc.name = sdsdup(name); + sc.alias = sdsdup(alias); + sc.data.enumd.enum_value = zmalloc((num_enum_vals + 1) * sizeof(configEnum)); + for (int i = 0; i < num_enum_vals; i++) { + sc.data.enumd.enum_value[i].name = zstrdup(enum_vals[i].name); + sc.data.enumd.enum_value[i].val = enum_vals[i].val; + } + sc.data.enumd.enum_value[num_enum_vals].name = NULL; + sc.data.enumd.enum_value[num_enum_vals].val = 0; + registerConfigValue(sc.alias, &sc, 1); + } } -void addModuleNumericConfig(const char *module_name, const char *name, int flags, void *privdata, long long default_val, int conf_flags, long long lower, long long upper) { - sds config_name = sdscatfmt(sdsempty(), "%s.%s", module_name, name); +/* On removeConfig(), it will free name, and alias if it is not NULL */ +void addModuleNumericConfig(sds name, sds alias, int flags, void *privdata, long long default_val, int conf_flags, long long lower, long long upper) { long long config_dummy_address; - standardConfig module_config = createLongLongConfig(config_name, NULL, flags | MODULE_CONFIG, lower, upper, config_dummy_address, default_val, conf_flags, NULL, NULL); - module_config.data.numeric.config.ll = NULL; - module_config.privdata = privdata; - registerConfigValue(config_name, &module_config, 0); + standardConfig sc = createLongLongConfig(name, alias, flags | MODULE_CONFIG, lower, upper, config_dummy_address, default_val, conf_flags, NULL, NULL); + sc.data.numeric.config.ll = NULL; + sc.privdata = privdata; + registerConfigValue(name, &sc, 0); + + /* If alias available, deep copy standardConfig and register again */ + if (alias) { + sc.name = sdsdup(name); + sc.alias = sdsdup(alias); + registerConfigValue(sc.alias, &sc, 1); + } } /*----------------------------------------------------------------------------- @@ -3418,3 +3476,7 @@ void configRewriteCommand(client *c) { addReply(c,shared.ok); } } + +int configExists(const sds name) { + return lookupConfig(name) != NULL; +} diff --git a/src/module.c b/src/module.c index 6f0fa8abf..a7a4e1f45 100644 --- a/src/module.c +++ b/src/module.c @@ -437,7 +437,13 @@ typedef int 
(*RedisModuleConfigApplyFunc)(RedisModuleCtx *ctx, void *privdata, R /* Struct representing a module config. These are stored in a list in the module struct */ struct ModuleConfig { - sds name; /* Name of config without the module name appended to the front */ + sds name; /* Fullname of the config (as it appears in the config file) */ + sds alias; /* Optional alias for the configuration. NULL if none exists */ + + int unprefixedFlag; /* Indicates if the REDISMODULE_CONFIG_UNPREFIXED flag was set. + * If the configuration name was prefixed,during get_fn/set_fn + * callbacks, it should be reported without the prefix */ + void *privdata; /* Optional data passed into the module config callbacks */ union get_fn { /* The get callback specified by the module */ RedisModuleConfigGetStringFunc get_string; @@ -2253,12 +2259,16 @@ int moduleIsModuleCommand(void *module_handle, struct redisCommand *cmd) { * -------------------------------------------------------------------------- */ int moduleListConfigMatch(void *config, void *name) { - return strcasecmp(((ModuleConfig *) config)->name, (char *) name) == 0; + ModuleConfig *mc = (ModuleConfig *) config; + /* Compare the provided name with the config's name and alias if it exists */ + return strcasecmp(mc->name, (char *) name) == 0 || + ((mc->alias) && strcasecmp(mc->alias, (char *) name) == 0); } void moduleListFree(void *config) { ModuleConfig *module_config = (ModuleConfig *) config; sdsfree(module_config->name); + sdsfree(module_config->alias); zfree(config); } @@ -12090,10 +12100,9 @@ void moduleRemoveConfigs(RedisModule *module) { listRewind(module->module_configs, &li); while ((ln = listNext(&li))) { ModuleConfig *config = listNodeValue(ln); - sds module_name = sdsnew(module->name); - sds full_name = sdscat(sdscat(module_name, "."), config->name); /* ModuleName.ModuleConfig */ - removeConfig(full_name); - sdsfree(full_name); + removeConfig(config->name); + if (config->alias) + removeConfig(config->alias); } } @@ -12132,6 +12141,12 @@ void moduleLoadFromQueue(void) { listDelNode(server.loadmodule_queue, ln); } if (dictSize(server.module_configs_queue)) { + serverLog(LL_WARNING, "Unresolved Configuration(s) Detected:"); + dictIterator *di = dictGetIterator(server.module_configs_queue); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + serverLog(LL_WARNING, ">>> '%s %s'", (char *)dictGetKey(de), (char *)dictGetVal(de)); + } serverLog(LL_WARNING, "Module Configuration detected without loadmodule directive or no ApplyConfig call: aborting"); exit(1); } @@ -12582,7 +12597,8 @@ int moduleVerifyConfigFlags(unsigned int flags, configType type) { | REDISMODULE_CONFIG_PROTECTED | REDISMODULE_CONFIG_DENY_LOADING | REDISMODULE_CONFIG_BITFLAGS - | REDISMODULE_CONFIG_MEMORY))) { + | REDISMODULE_CONFIG_MEMORY + | REDISMODULE_CONFIG_UNPREFIXED))) { serverLogRaw(LL_WARNING, "Invalid flag(s) for configuration"); return REDISMODULE_ERR; } @@ -12619,6 +12635,54 @@ int moduleVerifyResourceName(const char *name) { return REDISMODULE_OK; } +/* Verify unprefixed name config might be a single "" or in the form + * "|". Unlike moduleVerifyResourceName(), unprefixed name config + * allows a single dot in the name or alias. + * + * delim - Updates to point to "|" if it exists, NULL otherwise. 
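+ *
+ * Sketched examples of what the checks below accept and reject:
+ *   "bf-initial-size|initial-size"  -> OK, *delim points at the '|'
+ *   "bf.initial-size"               -> OK, one dot allowed per section
+ *   "bf.initial.size"               -> error, two dots in one section
+ *   "|initial-size"                 -> error, empty name before '|'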
+ */
+int moduleVerifyUnprefixedName(const char *nameAlias, const char **delim) {
+    if (nameAlias[0] == '\0')
+        return REDISMODULE_ERR;
+
+    *delim = NULL;
+    int dot_count = 0, lname = 0;
+
+    for (size_t i = 0; nameAlias[i] != '\0'; i++) {
+        char ch = nameAlias[i];
+
+        if (((*delim) == NULL) && (ch == '|')) {
+            /* Handle single separator between name and alias */
+            if (!lname) {
+                serverLog(LL_WARNING, "Module configuration name is empty: %s", nameAlias);
+                return REDISMODULE_ERR;
+            }
+            *delim = &nameAlias[i];
+            dot_count = lname = 0;
+        } else if ( (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+                    (ch >= '0' && ch <= '9') || (ch == '_') || (ch == '-') )
+        {
+            ++lname;
+        } else if (ch == '.') {
+            /* Allow only one dot per section (name or alias) */
+            if (++dot_count > 1) {
+                serverLog(LL_WARNING, "Invalid character sequence in Module configuration name or alias: %s", nameAlias);
+                return REDISMODULE_ERR;
+            }
+        } else {
+            serverLog(LL_WARNING, "Invalid character %c in Module configuration name or alias %s.", ch, nameAlias);
+            return REDISMODULE_ERR;
+        }
+    }
+
+    if (!lname) {
+        serverLog(LL_WARNING, "Module configuration name or alias is empty: %s", nameAlias);
+        return REDISMODULE_ERR;
+    }
+
+    return REDISMODULE_OK;
+}
+
 /* This is a series of set functions for each type that act as dispatchers for
  * config.c to call module set callbacks. */
 #define CONFIG_ERR_SIZE 256
@@ -12631,9 +12695,24 @@ static void propagateErrorString(RedisModuleString *err_in, const char **err) {
     }
 }
 
+/* If the configuration was registered with a prefixed name, return the name
+ * without the "<module-name>." prefix. Otherwise, return the stored name
+ * as is. */
+static char *getRegisteredConfigName(ModuleConfig *config) {
+    if (config->unprefixedFlag)
+        return config->name;
+
+    /* For prefixed configuration, find the '.' indicating the end of the prefix */
+    char *endOfPrefix = strchr(config->name, '.');
+    serverAssert(endOfPrefix != NULL);
+    return endOfPrefix + 1;
+}
+
 int setModuleBoolConfig(ModuleConfig *config, int val, const char **err) {
     RedisModuleString *error = NULL;
-    int return_code = config->set_fn.set_bool(config->name, val, config->privdata, &error);
+
+    char *rname = getRegisteredConfigName(config);
+    int return_code = config->set_fn.set_bool(rname, val, config->privdata, &error);
     propagateErrorString(error, err);
     return return_code == REDISMODULE_OK ? 1 : 0;
 }
@@ -12641,7 +12720,9 @@ int setModuleBoolConfig(ModuleConfig *config, int val, const char **err) {
 int setModuleStringConfig(ModuleConfig *config, sds strval, const char **err) {
     RedisModuleString *error = NULL;
     RedisModuleString *new = createStringObject(strval, sdslen(strval));
-    int return_code = config->set_fn.set_string(config->name, new, config->privdata, &error);
+
+    char *rname = getRegisteredConfigName(config);
+    int return_code = config->set_fn.set_string(rname, new, config->privdata, &error);
     propagateErrorString(error, err);
     decrRefCount(new);
     return return_code == REDISMODULE_OK ?
1 : 0; @@ -12656,7 +12737,8 @@ int setModuleEnumConfig(ModuleConfig *config, int val, const char **err) { int setModuleNumericConfig(ModuleConfig *config, long long val, const char **err) { RedisModuleString *error = NULL; - int return_code = config->set_fn.set_numeric(config->name, val, config->privdata, &error); + char *rname = getRegisteredConfigName(config); + int return_code = config->set_fn.set_numeric(rname, val, config->privdata, &error); propagateErrorString(error, err); return return_code == REDISMODULE_OK ? 1 : 0; } @@ -12664,20 +12746,24 @@ int setModuleNumericConfig(ModuleConfig *config, long long val, const char **err /* This is a series of get functions for each type that act as dispatchers for * config.c to call module set callbacks. */ int getModuleBoolConfig(ModuleConfig *module_config) { - return module_config->get_fn.get_bool(module_config->name, module_config->privdata); + char *rname = getRegisteredConfigName(module_config); + return module_config->get_fn.get_bool(rname, module_config->privdata); } sds getModuleStringConfig(ModuleConfig *module_config) { - RedisModuleString *val = module_config->get_fn.get_string(module_config->name, module_config->privdata); + char *rname = getRegisteredConfigName(module_config); + RedisModuleString *val = module_config->get_fn.get_string(rname, module_config->privdata); return val ? sdsdup(val->ptr) : NULL; } int getModuleEnumConfig(ModuleConfig *module_config) { - return module_config->get_fn.get_enum(module_config->name, module_config->privdata); + char *rname = getRegisteredConfigName(module_config); + return module_config->get_fn.get_enum(rname, module_config->privdata); } long long getModuleNumericConfig(ModuleConfig *module_config) { - return module_config->get_fn.get_numeric(module_config->name, module_config->privdata); + char *rname = getRegisteredConfigName(module_config); + return module_config->get_fn.get_numeric(rname, module_config->privdata); } /* This function takes a module and a list of configs stored as sds NAME VALUE pairs. @@ -12689,25 +12775,26 @@ int loadModuleConfigs(RedisModule *module) { listRewind(module->module_configs, &li); while ((ln = listNext(&li))) { ModuleConfig *module_config = listNodeValue(ln); - sds config_name = sdscatfmt(sdsempty(), "%s.%s", module->name, module_config->name); - dictEntry *config_argument = dictFind(server.module_configs_queue, config_name); - if (config_argument) { - if (!performModuleConfigSetFromName(dictGetKey(config_argument), dictGetVal(config_argument), &err)) { - serverLog(LL_WARNING, "Issue during loading of configuration %s : %s", (sds) dictGetKey(config_argument), err); - sdsfree(config_name); + dictEntry *de = dictUnlink(server.module_configs_queue, module_config->name); + if ((!de) && (module_config->alias)) + de = dictUnlink(server.module_configs_queue, module_config->alias); + + /* If found in the queue, set the value. Otherwise, set the default value. 
*/
+        if (de) {
+            if (!performModuleConfigSetFromName(dictGetKey(de), dictGetVal(de), &err)) {
+                serverLog(LL_WARNING, "Issue during loading of configuration %s : %s", (sds) dictGetKey(de), err);
+                dictFreeUnlinkedEntry(server.module_configs_queue, de);
                 dictEmpty(server.module_configs_queue, NULL);
                 return REDISMODULE_ERR;
             }
+            dictFreeUnlinkedEntry(server.module_configs_queue, de);
         } else {
-            if (!performModuleConfigSetDefaultFromName(config_name, &err)) {
+            if (!performModuleConfigSetDefaultFromName(module_config->name, &err)) {
                 serverLog(LL_WARNING, "Issue attempting to set default value of configuration %s : %s", module_config->name, err);
-                sdsfree(config_name);
                 dictEmpty(server.module_configs_queue, NULL);
                 return REDISMODULE_ERR;
             }
         }
-        dictDelete(server.module_configs_queue, config_name);
-        sdsfree(config_name);
     }
     module->configs_initialized = 1;
     return REDISMODULE_OK;
@@ -12757,26 +12844,93 @@ int moduleConfigApplyConfig(list *module_configs, const char **err, const char *
  * ## Module Configurations API
  * -------------------------------------------------------------------------- */
 
-/* Create a module config object. */
-ModuleConfig *createModuleConfig(const char *name, RedisModuleConfigApplyFunc apply_fn, void *privdata, RedisModule *module) {
+/* Resolve config name and create a module config object */
+ModuleConfig *createModuleConfig(const char *name, RedisModuleConfigApplyFunc apply_fn,
+                                 void *privdata, RedisModule *module, unsigned int flags)
+{
+    sds cname, alias = NULL;
+
+    /* Determine the configuration name:
+     * - If the unprefixed flag is set, the "<module-name>." prefix is omitted.
+     * - An optional alias can be specified using "<name>|<alias>".
+     *
+     * Examples:
+     * - Unprefixed: "bf.initial_size" or "bf-initial-size|bf.initial_size".
+     * - Prefixed: "initial_size" becomes "<module-name>.initial_size".
+     */
+    if (flags & REDISMODULE_CONFIG_UNPREFIXED) {
+        const char *delim = strchr(name, '|');
+        cname = sdsnew(name);
+        if (delim) { /* Handle "<name>|<alias>" format */
+            sdssubstr(cname, 0, delim - name);
+            alias = sdsnew(delim + 1);
+        }
+    } else {
+        /* Add the module name prefix */
+        cname = sdscatfmt(sdsempty(), "%s.%s", module->name, name);
+    }
+
     ModuleConfig *new_config = zmalloc(sizeof(ModuleConfig));
-    new_config->name = sdsnew(name);
+    new_config->unprefixedFlag = flags & REDISMODULE_CONFIG_UNPREFIXED;
+    new_config->name = cname;
+    new_config->alias = alias;
     new_config->apply_fn = apply_fn;
     new_config->privdata = privdata;
     new_config->module = module;
     return new_config;
 }
 
+/* Verify the configuration name and check for duplicates.
+ *
+ * - If the configuration is flagged as unprefixed, it checks for duplicate
+ *   names and optional aliases in the format "<name>|<alias>".
+ * - If the configuration is prefixed, it ensures the name is unique with
+ *   the module name prepended ("<module-name>.<name>").
+ */
 int moduleConfigValidityCheck(RedisModule *module, const char *name, unsigned int flags, configType type) {
     if (!module->onload) {
         errno = EBUSY;
         return REDISMODULE_ERR;
     }
-    if (moduleVerifyConfigFlags(flags, type) || moduleVerifyResourceName(name)) {
+    if (moduleVerifyConfigFlags(flags, type)) {
         errno = EINVAL;
         return REDISMODULE_ERR;
     }
-    if (isModuleConfigNameRegistered(module, name)) {
+
+    int isdup = 0;
+    if (flags & REDISMODULE_CONFIG_UNPREFIXED) {
+        const char *delim = NULL; /* Pointer to the '|' delimiter in "<name>|<alias>" */
+        if (moduleVerifyUnprefixedName(name, &delim)) {
+            errno = EINVAL;
+            return REDISMODULE_ERR;
+        }
+
+        if (delim) {
+            /* Temporarily split the "<name>|<alias>" for the check */
+            int count;
+            sds *ar = sdssplitlen(name, strlen(name), "|", 1, &count);
+            serverAssert(count == 2); /* Already validated */
+            isdup = configExists(ar[0]) ||
+                    configExists(ar[1]) ||
+                    (sdscmp(ar[0], ar[1]) == 0);
+            sdsfreesplitres(ar, count);
+        } else {
+            sds _name = sdsnew(name);
+            isdup = configExists(_name);
+            sdsfree(_name);
+        }
+    } else {
+        if (moduleVerifyResourceName(name)) {
+            errno = EINVAL;
+            return REDISMODULE_ERR;
+        }
+
+        sds fullname = sdscatfmt(sdsempty(), "%s.%s", module->name, name);
+        isdup = configExists(fullname);
+        sdsfree(fullname);
+    }
+
+    if (isdup) {
         serverLog(LL_WARNING, "Configuration by the name: %s already registered", name);
         errno = EALREADY;
         return REDISMODULE_ERR;
@@ -12886,12 +13040,14 @@ int RM_RegisterStringConfig(RedisModuleCtx *ctx, const char *name, const char *d
     if (moduleConfigValidityCheck(module, name, flags, NUMERIC_CONFIG)) {
         return REDISMODULE_ERR;
     }
-    ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module);
-    new_config->get_fn.get_string = getfn;
-    new_config->set_fn.set_string = setfn;
-    listAddNodeTail(module->module_configs, new_config);
-    flags = maskModuleConfigFlags(flags);
-    addModuleStringConfig(module->name, name, flags, new_config, default_val ? sdsnew(default_val) : NULL);
+
+    ModuleConfig *mc = createModuleConfig(name, applyfn, privdata, module, flags);
+    mc->get_fn.get_string = getfn;
+    mc->set_fn.set_string = setfn;
+    listAddNodeTail(module->module_configs, mc);
+    unsigned int cflags = maskModuleConfigFlags(flags);
+    addModuleStringConfig(sdsdup(mc->name), (mc->alias) ? sdsdup(mc->alias) : NULL,
+                          cflags, mc, default_val ? sdsnew(default_val) : NULL);
     return REDISMODULE_OK;
 }
 
@@ -12903,12 +13059,13 @@ int RM_RegisterBoolConfig(RedisModuleCtx *ctx, const char *name, int default_val
     if (moduleConfigValidityCheck(module, name, flags, BOOL_CONFIG)) {
         return REDISMODULE_ERR;
     }
-    ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module);
-    new_config->get_fn.get_bool = getfn;
-    new_config->set_fn.set_bool = setfn;
-    listAddNodeTail(module->module_configs, new_config);
-    flags = maskModuleConfigFlags(flags);
-    addModuleBoolConfig(module->name, name, flags, new_config, default_val);
+    ModuleConfig *mc = createModuleConfig(name, applyfn, privdata, module, flags);
+    mc->get_fn.get_bool = getfn;
+    mc->set_fn.set_bool = setfn;
+    listAddNodeTail(module->module_configs, mc);
+    unsigned int cflags = maskModuleConfigFlags(flags);
+    addModuleBoolConfig(sdsdup(mc->name), (mc->alias) ?
sdsdup(mc->alias) : NULL, + cflags, mc, default_val); return REDISMODULE_OK; } @@ -12946,9 +13103,9 @@ int RM_RegisterEnumConfig(RedisModuleCtx *ctx, const char *name, int default_val if (moduleConfigValidityCheck(module, name, flags, ENUM_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); - new_config->get_fn.get_enum = getfn; - new_config->set_fn.set_enum = setfn; + ModuleConfig *mc = createModuleConfig(name, applyfn, privdata, module, flags); + mc->get_fn.get_enum = getfn; + mc->set_fn.set_enum = setfn; configEnum *enum_vals = zmalloc((num_enum_vals + 1) * sizeof(configEnum)); for (int i = 0; i < num_enum_vals; i++) { enum_vals[i].name = zstrdup(enum_values[i]); @@ -12956,9 +13113,11 @@ int RM_RegisterEnumConfig(RedisModuleCtx *ctx, const char *name, int default_val } enum_vals[num_enum_vals].name = NULL; enum_vals[num_enum_vals].val = 0; - listAddNodeTail(module->module_configs, new_config); - flags = maskModuleConfigFlags(flags) | maskModuleEnumConfigFlags(flags); - addModuleEnumConfig(module->name, name, flags, new_config, default_val, enum_vals); + listAddNodeTail(module->module_configs, mc); + + unsigned int cflags = maskModuleConfigFlags(flags) | maskModuleEnumConfigFlags(flags); + addModuleEnumConfig(sdsdup(mc->name), (mc->alias) ? sdsdup(mc->alias) : NULL, + cflags, mc, default_val, enum_vals, num_enum_vals); return REDISMODULE_OK; } @@ -12971,13 +13130,15 @@ int RM_RegisterNumericConfig(RedisModuleCtx *ctx, const char *name, long long de if (moduleConfigValidityCheck(module, name, flags, NUMERIC_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); - new_config->get_fn.get_numeric = getfn; - new_config->set_fn.set_numeric = setfn; - listAddNodeTail(module->module_configs, new_config); + ModuleConfig *mc = createModuleConfig(name, applyfn, privdata, module, flags); + mc->get_fn.get_numeric = getfn; + mc->set_fn.set_numeric = setfn; + listAddNodeTail(module->module_configs, mc); unsigned int numeric_flags = maskModuleNumericConfigFlags(flags); - flags = maskModuleConfigFlags(flags); - addModuleNumericConfig(module->name, name, flags, new_config, default_val, numeric_flags, min, max); + + unsigned int cflags = maskModuleConfigFlags(flags); + addModuleNumericConfig(sdsdup(mc->name), (mc->alias) ? sdsdup(mc->alias) : NULL, + cflags, mc, default_val, numeric_flags, min, max); return REDISMODULE_OK; } diff --git a/src/redismodule.h b/src/redismodule.h index b84913b1e..d0b7f7735 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -131,6 +131,7 @@ typedef long long ustime_t; #define REDISMODULE_CONFIG_MEMORY (1ULL<<7) /* Indicates if this value can be set as a memory value */ #define REDISMODULE_CONFIG_BITFLAGS (1ULL<<8) /* Indicates if this value can be set as a multiple enum values */ +#define REDISMODULE_CONFIG_UNPREFIXED (1ULL<<9) /* Provided configuration name won't be prefixed with the module name */ /* StreamID type. */ typedef struct RedisModuleStreamID { diff --git a/src/server.h b/src/server.h index 5a89c2ad6..6d9832b18 100644 --- a/src/server.h +++ b/src/server.h @@ -1590,7 +1590,7 @@ struct redisServer { dict *moduleapi; /* Exported core APIs dictionary for modules. */ dict *sharedapi; /* Like moduleapi but containing the APIs that modules share with each other. */ - dict *module_configs_queue; /* Dict that stores module configurations from .conf file until after modules are loaded during startup or arguments to loadex. 
*/ + dict *module_configs_queue; /* Unmapped configs are queued here, assumed to be module config. Applied after modules are loaded during startup or arguments to loadex. */ list *loadmodule_queue; /* List of modules to load at startup. */ int module_pipe[2]; /* Pipe used to awake the event loop by module threads. */ pid_t child_pid; /* PID of current child */ @@ -3346,10 +3346,10 @@ void freeServerClientMemUsageBuckets(void); typedef struct ModuleConfig ModuleConfig; int performModuleConfigSetFromName(sds name, sds value, const char **err); int performModuleConfigSetDefaultFromName(sds name, const char **err); -void addModuleBoolConfig(const char *module_name, const char *name, int flags, void *privdata, int default_val); -void addModuleStringConfig(const char *module_name, const char *name, int flags, void *privdata, sds default_val); -void addModuleEnumConfig(const char *module_name, const char *name, int flags, void *privdata, int default_val, configEnum *enum_vals); -void addModuleNumericConfig(const char *module_name, const char *name, int flags, void *privdata, long long default_val, int conf_flags, long long lower, long long upper); +void addModuleBoolConfig(sds name, sds alias, int flags, void *privdata, int default_val); +void addModuleStringConfig(sds name, sds alias, int flags, void *privdata, sds default_val); +void addModuleEnumConfig(sds name, sds alias, int flags, void *privdata, int default_val, configEnum *enum_vals, int num_enum_vals); +void addModuleNumericConfig(sds name, sds alias, int flags, void *privdata, long long default_val, int conf_flags, long long lower, long long upper); void addModuleConfigApply(list *module_configs, ModuleConfig *module_config); int moduleConfigApplyConfig(list *module_configs, const char **err, const char **err_arg_name); int getModuleBoolConfig(ModuleConfig *module_config); @@ -3775,6 +3775,7 @@ void configGetCommand(client *c); void configResetStatCommand(client *c); void configRewriteCommand(client *c); void configHelpCommand(client *c); +int configExists(const sds name); void hincrbyCommand(client *c); void hincrbyfloatCommand(client *c); void subscribeCommand(client *c); diff --git a/tests/modules/moduleconfigs.c b/tests/modules/moduleconfigs.c index 2c1737df7..504a7e291 100644 --- a/tests/modules/moduleconfigs.c +++ b/tests/modules/moduleconfigs.c @@ -1,11 +1,12 @@ #include "redismodule.h" #include -int mutable_bool_val; +int mutable_bool_val, no_prefix_bool, no_prefix_bool2; int immutable_bool_val; -long long longval; -long long memval; +long long longval, no_prefix_longval; +long long memval, no_prefix_memval; RedisModuleString *strval = NULL; -int enumval; +RedisModuleString *strval2 = NULL; +int enumval, no_prefix_enumval; int flagsval; /* Series of get and set callbacks for each type of config, these rely on the privdata ptr @@ -103,6 +104,36 @@ int longlongApplyFunc(RedisModuleCtx *ctx, void *privdata, RedisModuleString **e return REDISMODULE_OK; } +RedisModuleString *getStringConfigUnprefix(const char *name, void *privdata) { + REDISMODULE_NOT_USED(name); + REDISMODULE_NOT_USED(privdata); + return strval2; +} + +int setStringConfigUnprefix(const char *name, RedisModuleString *new, void *privdata, RedisModuleString **err) { + REDISMODULE_NOT_USED(name); + REDISMODULE_NOT_USED(err); + REDISMODULE_NOT_USED(privdata); + if (strval2) RedisModule_FreeString(NULL, strval2); + RedisModule_RetainString(NULL, new); + strval2 = new; + return REDISMODULE_OK; +} + +int getEnumConfigUnprefix(const char *name, void *privdata) { + 
REDISMODULE_NOT_USED(name); + REDISMODULE_NOT_USED(privdata); + return no_prefix_enumval; +} + +int setEnumConfigUnprefix(const char *name, int val, void *privdata, RedisModuleString **err) { + REDISMODULE_NOT_USED(name); + REDISMODULE_NOT_USED(err); + REDISMODULE_NOT_USED(privdata); + no_prefix_enumval = val; + return REDISMODULE_OK; +} + int registerBlockCheck(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { REDISMODULE_NOT_USED(argv); REDISMODULE_NOT_USED(argc); @@ -168,6 +199,30 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) if (RedisModule_RegisterNumericConfig(ctx, "numeric", -1, REDISMODULE_CONFIG_DEFAULT, -5, 2000, getNumericConfigCommand, setNumericConfigCommand, longlongApplyFunc, &longval) == REDISMODULE_ERR) { return REDISMODULE_ERR; } + + /*** unprefixed and aliased configuration ***/ + if (RedisModule_RegisterBoolConfig(ctx, "unprefix-bool|unprefix-bool-alias", 1, REDISMODULE_CONFIG_DEFAULT|REDISMODULE_CONFIG_UNPREFIXED, + getBoolConfigCommand, setBoolConfigCommand, NULL, &no_prefix_bool) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + if (RedisModule_RegisterBoolConfig(ctx, "unprefix-noalias-bool", 1, REDISMODULE_CONFIG_DEFAULT|REDISMODULE_CONFIG_UNPREFIXED, + getBoolConfigCommand, setBoolConfigCommand, NULL, &no_prefix_bool2) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + if (RedisModule_RegisterNumericConfig(ctx, "unprefix.numeric|unprefix.numeric-alias", -1, REDISMODULE_CONFIG_DEFAULT|REDISMODULE_CONFIG_UNPREFIXED, + -5, 2000, getNumericConfigCommand, setNumericConfigCommand, NULL, &no_prefix_longval) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + if (RedisModule_RegisterStringConfig(ctx, "unprefix-string|unprefix.string-alias", "secret unprefix", REDISMODULE_CONFIG_DEFAULT|REDISMODULE_CONFIG_UNPREFIXED, + getStringConfigUnprefix, setStringConfigUnprefix, NULL, NULL) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + if (RedisModule_RegisterEnumConfig(ctx, "unprefix-enum|unprefix-enum-alias", 1, REDISMODULE_CONFIG_DEFAULT|REDISMODULE_CONFIG_UNPREFIXED, + enum_vals, int_vals, 5, getEnumConfigUnprefix, setEnumConfigUnprefix, NULL, NULL) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + + size_t len; if (argc && !strcasecmp(RedisModule_StringPtrLen(argv[0], &len), "noload")) { return REDISMODULE_OK; @@ -191,5 +246,9 @@ int RedisModule_OnUnload(RedisModuleCtx *ctx) { RedisModule_FreeString(ctx, strval); strval = NULL; } + if (strval2) { + RedisModule_FreeString(ctx, strval2); + strval2 = NULL; + } return REDISMODULE_OK; } diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 1709e9d99..609732b58 100644 --- a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -13,6 +13,17 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.enum] "moduleconfigs.enum one" assert_equal [r config get moduleconfigs.flags] "moduleconfigs.flags {one two}" assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -1" + + # Check un-prefixed and aliased configuration + assert_equal [r config get unprefix-bool] "unprefix-bool yes" + assert_equal [r config get unprefix-noalias-bool] "unprefix-noalias-bool yes" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias yes" + assert_equal [r config get unprefix.numeric] "unprefix.numeric -1" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias -1" + assert_equal [r config get unprefix-string] "unprefix-string {secret unprefix}" 
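+        # (A config and its alias expose the same underlying value, so the
+        # default is expected to be visible under both names.)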
+ assert_equal [r config get unprefix.string-alias] "unprefix.string-alias {secret unprefix}" + assert_equal [r config get unprefix-enum] "unprefix-enum one" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias one" } test {Config set commands work} { @@ -34,6 +45,25 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.flags] "moduleconfigs.flags two" r config set moduleconfigs.numeric -2 assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -2" + + # Check un-prefixed and aliased configuration + r config set unprefix-bool no + assert_equal [r config get unprefix-bool] "unprefix-bool no" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias no" + r config set unprefix-bool-alias yes + assert_equal [r config get unprefix-bool] "unprefix-bool yes" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias yes" + r config set unprefix.numeric 5 + assert_equal [r config get unprefix.numeric] "unprefix.numeric 5" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias 5" + r config set unprefix.numeric-alias 6 + assert_equal [r config get unprefix.numeric] "unprefix.numeric 6" + r config set unprefix.string-alias "blabla" + assert_equal [r config get unprefix-string] "unprefix-string blabla" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias blabla" + r config set unprefix-enum two + assert_equal [r config get unprefix-enum] "unprefix-enum two" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias two" } test {Config set commands enum flags} { @@ -93,11 +123,30 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.enum] "moduleconfigs.enum one" assert_equal [r config get moduleconfigs.flags] "moduleconfigs.flags {one two}" assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -1" + + # Check un-prefixed and aliased configuration + assert_equal [r config get unprefix-bool] "unprefix-bool yes" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias yes" + assert_equal [r config get unprefix.numeric] "unprefix.numeric -1" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias -1" + assert_equal [r config get unprefix-string] "unprefix-string {secret unprefix}" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias {secret unprefix}" + assert_equal [r config get unprefix-enum] "unprefix-enum one" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias one" + + r module unload moduleconfigs } test {test loadex functionality} { - r module loadex $testmodule CONFIG moduleconfigs.mutable_bool no CONFIG moduleconfigs.immutable_bool yes CONFIG moduleconfigs.memory_numeric 2mb CONFIG moduleconfigs.string tclortickle + r module loadex $testmodule CONFIG moduleconfigs.mutable_bool no \ + CONFIG moduleconfigs.immutable_bool yes \ + CONFIG moduleconfigs.memory_numeric 2mb \ + CONFIG moduleconfigs.string tclortickle \ + CONFIG unprefix-bool no \ + CONFIG unprefix.numeric-alias 123 \ + CONFIG unprefix-string abc_def \ + assert_not_equal [lsearch [lmap x [r module list] {dict get $x name}] moduleconfigs] -1 assert_equal [r config get moduleconfigs.mutable_bool] "moduleconfigs.mutable_bool no" assert_equal [r config get moduleconfigs.immutable_bool] "moduleconfigs.immutable_bool yes" @@ -107,6 +156,18 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.enum] "moduleconfigs.enum one" assert_equal [r config get 
moduleconfigs.flags] "moduleconfigs.flags {one two}" assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -1" + + # Check un-prefixed and aliased configuration + assert_equal [r config get unprefix-bool] "unprefix-bool no" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias no" + assert_equal [r config get unprefix.numeric] "unprefix.numeric 123" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias 123" + assert_equal [r config get unprefix-string] "unprefix-string abc_def" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias abc_def" + assert_equal [r config get unprefix-enum] "unprefix-enum one" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias one" + + } test {apply function works} { @@ -121,9 +182,19 @@ start_server {tags {"modules"}} { } test {test double config argument to loadex} { - r module loadex $testmodule CONFIG moduleconfigs.mutable_bool yes CONFIG moduleconfigs.mutable_bool no - assert_equal [r config get moduleconfigs.mutable_bool] "moduleconfigs.mutable_bool no" - r module unload moduleconfigs + r module loadex $testmodule CONFIG moduleconfigs.mutable_bool yes \ + CONFIG moduleconfigs.mutable_bool no \ + CONFIG unprefix.numeric-alias 1 \ + CONFIG unprefix.numeric-alias 2 \ + CONFIG unprefix-string blabla + + assert_equal [r config get moduleconfigs.mutable_bool] "moduleconfigs.mutable_bool no" + # Check un-prefixed and aliased configuration + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias 2" + assert_equal [r config get unprefix.numeric] "unprefix.numeric 2" + assert_equal [r config get unprefix-string] "unprefix-string blabla" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias blabla" + r module unload moduleconfigs } test {missing loadconfigs call} { @@ -156,6 +227,9 @@ start_server {tags {"modules"}} { assert_match {*ERR*} $e assert_equal [r config get configs.test] "configs.test yes" r module unload configs + # Verify config name and its alias being used together gets failed + catch {[r module loadex $testmodule CONFIG unprefix.numeric 1 CONFIG unprefix.numeric-alias 1]} + assert_match {*ERR*} $e } test {test config rewrite with dynamic load} { @@ -167,6 +241,10 @@ start_server {tags {"modules"}} { r config set moduleconfigs.memory_numeric 750 r config set moduleconfigs.enum two r config set moduleconfigs.flags "four two" + r config set unprefix-bool-alias no + r config set unprefix.numeric 456 + r config set unprefix.string-alias "unprefix" + r config set unprefix-enum two r config rewrite restart_server 0 true false # Ensure configs we rewrote are present and that the conf file is readable @@ -176,6 +254,17 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.enum] "moduleconfigs.enum two" assert_equal [r config get moduleconfigs.flags] "moduleconfigs.flags {two four}" assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -1" + + # Check unprefixed configuration and alias + assert_equal [r config get unprefix-bool] "unprefix-bool no" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias no" + assert_equal [r config get unprefix.numeric] "unprefix.numeric 456" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias 456" + assert_equal [r config get unprefix-string] "unprefix-string unprefix" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias unprefix" + assert_equal [r config get unprefix-enum] "unprefix-enum 
two" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias two" + r module unload moduleconfigs } @@ -241,6 +330,16 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.flags] "moduleconfigs.flags {two four}" assert_equal [r config get moduleconfigs.numeric] "moduleconfigs.numeric -1" assert_equal [r config get moduleconfigs.memory_numeric] "moduleconfigs.memory_numeric 1024" + + # Check un-prefixed and aliased configuration + assert_equal [r config get unprefix-bool] "unprefix-bool yes" + assert_equal [r config get unprefix-bool-alias] "unprefix-bool-alias yes" + assert_equal [r config get unprefix.numeric] "unprefix.numeric -1" + assert_equal [r config get unprefix.numeric-alias] "unprefix.numeric-alias -1" + assert_equal [r config get unprefix-string] "unprefix-string {secret unprefix}" + assert_equal [r config get unprefix.string-alias] "unprefix.string-alias {secret unprefix}" + assert_equal [r config get unprefix-enum] "unprefix-enum one" + assert_equal [r config get unprefix-enum-alias] "unprefix-enum-alias one" } } } From 9ebf80a28c3ba12e9aedc7cee145637d1ed2ab5a Mon Sep 17 00:00:00 2001 From: Ozan Tezcan Date: Thu, 21 Nov 2024 14:12:58 +0300 Subject: [PATCH 14/42] Fix memory leak of jemalloc tcache on function flush command (#13661) Starting from https://github.com/redis/redis/pull/13133, we allocate a jemalloc thread cache and use it for lua vm. On certain cases, like `script flush` or `function flush` command, we free the existing thread cache and create a new one. Though, for `function flush`, we were not actually destroying the existing thread cache itself. Each call creates a new thread cache on jemalloc and we leak the previous thread cache instances. Jemalloc allows maximum 4096 thread cache instances. If we reach this limit, Redis prints "Failed creating the lua jemalloc tcache" log and abort. There are other cases that can cause this memory leak, including replication scenarios when emptyData() is called. The implication is that it looks like redis `used_memory` is low, but `allocator_allocated` and RSS remain high. Co-authored-by: debing.sun --- src/eval.c | 6 +++++- src/function_lua.c | 13 +++++++++++++ src/script.c | 4 ++-- tests/unit/scripting.tcl | 29 +++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/eval.c b/src/eval.c index 2c83db19e..8670a0a69 100644 --- a/src/eval.c +++ b/src/eval.c @@ -259,12 +259,16 @@ void scriptingInit(int setup) { void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { dictRelease(lua_scripts); listRelease(lua_scripts_lru_list); - lua_close(lua); #if defined(USE_JEMALLOC) /* When lua is closed, destroy the previously used private tcache. 
*/ void *ud = (global_State*)G(lua)->ud; unsigned int lua_tcache = (unsigned int)(uintptr_t)ud; +#endif + + lua_close(lua); + +#if defined(USE_JEMALLOC) je_mallctl("tcache.destroy", NULL, NULL, (void *)&lua_tcache, sizeof(unsigned int)); #endif } diff --git a/src/function_lua.c b/src/function_lua.c index 2d05ed7e0..37069ec21 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -23,6 +23,9 @@ #include #include #include +#if defined(USE_JEMALLOC) +#include +#endif #define LUA_ENGINE_NAME "LUA" #define REGISTRY_ENGINE_CTX_NAME "__ENGINE_CTX__" @@ -189,8 +192,18 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { static void luaEngineFreeCtx(void *engine_ctx) { luaEngineCtx *lua_engine_ctx = engine_ctx; +#if defined(USE_JEMALLOC) + /* When lua is closed, destroy the previously used private tcache. */ + void *ud = (global_State*)G(lua_engine_ctx->lua)->ud; + unsigned int lua_tcache = (unsigned int)(uintptr_t)ud; +#endif + lua_close(lua_engine_ctx->lua); zfree(lua_engine_ctx); + +#if defined(USE_JEMALLOC) + je_mallctl("tcache.destroy", NULL, NULL, (void *)&lua_tcache, sizeof(unsigned int)); +#endif } static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, diff --git a/src/script.c b/src/script.c index b2388d739..c88e451f9 100644 --- a/src/script.c +++ b/src/script.c @@ -64,7 +64,7 @@ lua_State *createLuaState(void) { size_t sz = sizeof(unsigned int); int err = je_mallctl("tcache.create", (void *)&tcache, &sz, NULL, 0); if (err) { - serverLog(LL_WARNING, "Failed creating the lua jemalloc tcache."); + serverLog(LL_WARNING, "Failed creating the lua jemalloc tcache (err=%d).", err); exit(1); } @@ -79,7 +79,7 @@ void luaEnvInit(void) { size_t sz = sizeof(unsigned int); int err = je_mallctl("arenas.create", (void *)&arena, &sz, NULL, 0); if (err) { - serverLog(LL_WARNING, "Failed creating the lua jemalloc arena."); + serverLog(LL_WARNING, "Failed creating the lua jemalloc arena (err=%d).", err); exit(1); } server.lua_arena = arena; diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index 023a26c6b..05a37c00e 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -1884,6 +1884,35 @@ start_server {tags {"scripting needs:debug"}} { } start_server {tags {"scripting"}} { + test "Test script flush will not leak memory - script:$is_eval" { + r flushall + r script flush + r function flush + + # This is a best-effort test to check we don't leak some resources on + # script flush and function flush commands. For lua vm, we create a + # jemalloc thread cache. On each script flush command, thread cache is + # destroyed and we create a new one. In this test, running script flush + # many times to verify there is no increase in the memory usage while + # re-creating some of the resources for lua vm. + set used_memory [s used_memory] + set allocator_allocated [s allocator_allocated] + + r multi + for {set j 1} {$j <= 500} {incr j} { + if {$is_eval} { + r SCRIPT FLUSH + } else { + r FUNCTION FLUSH + } + } + r exec + + # Verify used memory is not (much) higher. 
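+        # (The 1.5x bound is deliberately loose: it tolerates allocator noise
+        # while still catching a systematic per-flush leak like the one fixed here.)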
+        assert_lessthan [s used_memory] [expr $used_memory*1.5]
+        assert_lessthan [s allocator_allocated] [expr $allocator_allocated*1.5]
+    }
+
     test "Verify Lua performs GC correctly after script loading" {
         set dummy_script "--[string repeat x 10]\nreturn "
         set n 50000

From 05b99c8f4cca9a5439c54888cf275f8447cd3c31 Mon Sep 17 00:00:00 2001
From: Ali <115415312+xogoodnow@users.noreply.github.com>
Date: Fri, 22 Nov 2024 15:59:17 +0330
Subject: [PATCH 15/42] Fix typo in redis.conf (#12634)

Remove an unnecessary, repetitive "or".
---
 redis.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redis.conf b/redis.conf
index aadb4acb0..6688fdc2a 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1362,7 +1362,7 @@ oom-score-adj-values 0 200 800
 #################### KERNEL transparent hugepage CONTROL ######################
 
 # Usually the kernel Transparent Huge Pages control is set to "madvise" or
-# or "never" by default (/sys/kernel/mm/transparent_hugepage/enabled), in which
+# "never" by default (/sys/kernel/mm/transparent_hugepage/enabled), in which
 # case this config has no effect. On systems in which it is set to "always",
 # redis will attempt to disable it specifically for the redis process in order
 # to avoid latency problems specifically with fork(2) and CoW.

From a106198878f85b91de651dde74587768e81d4506 Mon Sep 17 00:00:00 2001
From: "Filipe Oliveira (Redis)"
Date: Tue, 26 Nov 2024 08:11:01 +0000
Subject: [PATCH 16/42] Optimize addReplyBulk on sds/int encoded strings: 2.2%
 to 4% reduction of CPU Time on GET high pipeline use-cases (#13644)

### Summary

By profiling a 1KiB 100% GET use-case under high pipelining, we can see that
addReplyBulk and its inner calls take 8.30% of the CPU cycles. This PR reduces
the CPU time spent on addReplyBulk by 2.2% to 4%. Specifically for GET
use-cases, we saw an improvement of 2.7% to 9.1% in the achievable ops/sec.

### Improvement

By reducing the duplicate work we can improve by around 2.7% on sds encoded
strings, and around 9% on int encoded strings.
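To make the framing concrete: a RESP bulk reply is `$<len>\r\n<payload>\r\n`, and the integer fast path appends the trailing CRLF directly after the digits instead of issuing a separate write. Below is a self-contained sketch of that bookkeeping (a hypothetical helper, not the upstream code; note the CRLF must land at indexes `len` and `len+1`, as in the hunk further down):

```c
#include <stdio.h>
#include <string.h>

/* Hypothetical sketch: render the RESP frame "$<len>\r\n<digits>\r\n" for a
 * long long in one pass, appending the trailing CRLF right after the digits
 * instead of a separate "\r\n" write. Assumes dst has enough room. */
static size_t bulk_frame_longlong(char *dst, long long v) {
    char digits[32];
    /* snprintf returns the number of digit characters (NUL excluded). */
    int len = snprintf(digits, sizeof(digits), "%lld", v);
    digits[len] = '\r';      /* overwrite the NUL: CRLF follows the digits */
    digits[len + 1] = '\n';
    int hdr = snprintf(dst, 32, "$%d\r\n", len);
    memcpy(dst + hdr, digits, (size_t)(len + 2));
    return (size_t)(hdr + len + 2);
}

int main(void) {
    char frame[64];
    size_t n = bulk_frame_longlong(frame, 1);
    fwrite(frame, 1, n, stdout); /* emits exactly "$1\r\n1\r\n" */
    return 0;
}
```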
This PR does the following:
- Avoid duplicate sdslen() on addReplyBulk() for sds encoded objects
- Avoid duplicate sdigits10() call on int encoded objects on addReplyBulk()
- Avoid the final "\r\n" addReplyProto() call for the OBJ_ENCODING_INT type on addReplyBulk()

Altogether, these improvements result in the following gains in achievable ops/sec:

Encoding | unstable (commit 9906daf5c9fdb836a5b3f04829c75701a4e90eb4) | this PR | % improvement
-- | -- | -- | --
1KiB Values string SDS encoded | 1478081.88 | 1517635.38 | 2.7%
Values string "1" OBJ_ENCODING_INT | 1521139.36 | 1658876.59 | 9.1%

### CPU Time: Total of addReplyBulk

Encoding | unstable (commit 9906daf5c9fdb836a5b3f04829c75701a4e90eb4) | this PR | reduction of CPU Time: Total
-- | -- | -- | --
1KiB Values string SDS encoded | 8.30% | 6.10% | 2.2%
Values string "1" OBJ_ENCODING_INT | 7.20% | 3.20% | 4.0%

### To reproduce

Run redis with unix socket enabled
```
taskset -c 0 /root/redis/src/redis-server --unixsocket /tmp/1.socket --save '' --enable-debug-command local
```

#### 1KiB Values string SDS encoded

Load data
```
taskset -c 2-5 memtier_benchmark --ratio 1:0 -n allkeys --key-pattern P:P --key-maximum 1000000 --hide-histogram --pipeline 10 -S /tmp/1.socket
```

Benchmark
```
taskset -c 2-6 memtier_benchmark --ratio 0:1 -c 1 -t 5 --test-time 60 --hide-histogram -d 1000 --pipeline 500 -S /tmp/1.socket --key-maximum 1000000 --json-out-file results.json
```

#### Values string "1" OBJ_ENCODING_INT

Load data
```
$ taskset -c 2-5 memtier_benchmark --command "SET __key__ 1" -n allkeys --command-key-pattern P --key-maximum 1000000 --hide-histogram -c 1 -t 1 --pipeline 100 -S /tmp/1.socket

# confirm we have the expected reply and format
$ redis-cli get memtier-1
"1"

$ redis-cli debug object memtier-1
Value at:0x7f14cec57570 refcount:2147483647 encoding:int serializedlength:2 lru:2861503 lru_seconds_idle:8
```

Benchmark
```
taskset -c 2-6 memtier_benchmark --ratio 0:1 -c 1 -t 5 --test-time 60 --hide-histogram -d 1000 --pipeline 500 -S /tmp/1.socket --key-maximum 1000000 --json-out-file results.json
```
---
 src/networking.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/networking.c b/src/networking.c
index 95a6ee08e..67e1b139c 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -1078,9 +1078,26 @@ void addReplyBulkLen(client *c, robj *obj) {
 
 /* Add a Redis Object as a bulk reply */
 void addReplyBulk(client *c, robj *obj) {
-    addReplyBulkLen(c,obj);
-    addReply(c,obj);
-    addReplyProto(c,"\r\n",2);
+    if (prepareClientToWrite(c) != C_OK) return;
+
+    if (sdsEncodedObject(obj)) {
+        const size_t len = sdslen(obj->ptr);
+        _addReplyLongLongBulk(c, len);
+        _addReplyToBufferOrList(c,obj->ptr,len);
+        _addReplyToBufferOrList(c,"\r\n",2);
+    } else if (obj->encoding == OBJ_ENCODING_INT) {
+        /* For integer encoded strings we just convert it into a string
+         * using our optimized function, and attach the resulting string
+         * to the output buffer. */
+        char buf[34];
+        size_t len = ll2string(buf,sizeof(buf),(long)obj->ptr);
+        buf[len] = '\r';
+        buf[len+1] = '\n';
+        _addReplyLongLongBulk(c, len);
+        _addReplyToBufferOrList(c,buf,len+2);
+    } else {
+        serverPanic("Wrong obj->encoding in addReply()");
+    }
 }
 
 /* Add a C buffer as bulk reply */

From db33b67d372c7d7b9977e9b1fef86730ac79ceb7 Mon Sep 17 00:00:00 2001
From: Vitah Lin
Date: Thu, 28 Nov 2024 21:59:43 +0800
Subject: [PATCH 17/42] Deprecate ubuntu lunar and macos-12 in workflows
 (#13669)

1.
Ubuntu Lunar reached End of Life on January 25, 2024, so upgrade the ubuntu version to plucky in action `test-ubuntu-jemalloc-fortify` to pass the daily CI 2. The macOS-12 environment is deprecated so upgrade macos-12 to macos-13 in daily CI --------- Co-authored-by: debing.sun --- .github/workflows/daily.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index f1484b102..8d71ce650 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -76,7 +76,6 @@ jobs: if: | (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && !contains(github.event.inputs.skipjobs, 'fortify') - container: ubuntu:lunar timeout-minutes: 14400 steps: - name: prep @@ -94,12 +93,10 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - apt-get update && apt-get install -y make gcc-13 g++-13 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 100 + apt-get update && apt-get install -y make gcc g++ make CC=gcc REDIS_CFLAGS='-Werror -DREDIS_TEST -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' - name: testprep - run: apt-get install -y tcl8.6 tclx procps + run: sudo apt-get install -y tcl8.6 tclx procps - name: test if: true && !contains(github.event.inputs.skiptests, 'redis') run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} @@ -876,7 +873,7 @@ jobs: build-macos: strategy: matrix: - os: [macos-12, macos-14] + os: [macos-13, macos-15] runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && @@ -903,7 +900,7 @@ jobs: run: make REDIS_CFLAGS='-Werror -DREDIS_TEST' test-freebsd: - runs-on: macos-12 + runs-on: macos-13 if: | (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && !contains(github.event.inputs.skipjobs, 'freebsd') From 06b144aa097e968f579048800cd0e5c9336ea1c9 Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Thu, 28 Nov 2024 18:33:58 +0200 Subject: [PATCH 18/42] Modules API: Add RedisModule_ACLCheckKeyPrefixPermissions (#13666) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces a new API function to the Redis Module API: ``` int RedisModule_ACLCheckKeyPrefixPermissions(RedisModuleUser *user, RedisModuleString *prefix, int flags); ``` Purpose: The function checks if a given user has access permissions to any key that match a specific prefix. This validation is based on the user’s ACL permissions and the specified flags. Note, this prefix-based approach API may fail to detect prefixes that are individually uncovered but collectively covered by the patterns. 
For example the prefix `ID-*` is not fully included in pattern `ID-[0]*` and is not fully included in pattern `ID-[^0]*` but it is fully included in the set of patterns `{ID-[0]*, ID-[^0]*}` --- src/acl.c | 12 ++++++-- src/module.c | 40 +++++++++++++++++++++++++ src/redismodule.h | 2 ++ src/server.h | 1 + src/util.c | 37 +++++++++++++++++++++++ src/util.h | 2 ++ tests/modules/aclcheck.c | 49 +++++++++++++++++++++++++++++++ tests/unit/moduleapi/aclcheck.tcl | 38 +++++++++++++++++++++++- 8 files changed, 178 insertions(+), 3 deletions(-) diff --git a/src/acl.c b/src/acl.c index 699a3808a..5af6edbd9 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1577,14 +1577,22 @@ static int ACLSelectorCheckKey(aclSelector *selector, const char *key, int keyle if (keyspec_flags & CMD_KEY_DELETE) key_flags |= ACL_WRITE_PERMISSION; if (keyspec_flags & CMD_KEY_UPDATE) key_flags |= ACL_WRITE_PERMISSION; + /* Is given key represent a prefix of a set of keys */ + int prefix = keyspec_flags & CMD_KEY_PREFIX; + /* Test this key against every pattern. */ while((ln = listNext(&li))) { keyPattern *pattern = listNodeValue(ln); if ((pattern->flags & key_flags) != key_flags) continue; size_t plen = sdslen(pattern->pattern); - if (stringmatchlen(pattern->pattern,plen,key,keylen,0)) - return ACL_OK; + if (prefix) { + if (prefixmatch(pattern->pattern,plen,key,keylen,0)) + return ACL_OK; + } else { + if (stringmatchlen(pattern->pattern, plen, key, keylen, 0)) + return ACL_OK; + } } return ACL_DENIED_KEY; } diff --git a/src/module.c b/src/module.c index a7a4e1f45..35f20f902 100644 --- a/src/module.c +++ b/src/module.c @@ -9774,6 +9774,45 @@ int RM_ACLCheckKeyPermissions(RedisModuleUser *user, RedisModuleString *key, int return REDISMODULE_OK; } +/* Check if the user can access keys matching the given key prefix according to the ACLs + * attached to the user and the flags representing key access. The flags are the same that + * are used in the keyspec for logical operations. These flags are documented in + * RedisModule_SetCommandInfo as the REDISMODULE_CMD_KEY_ACCESS, + * REDISMODULE_CMD_KEY_UPDATE, REDISMODULE_CMD_KEY_INSERT, and REDISMODULE_CMD_KEY_DELETE flags. + * + * If no flags are supplied, the user is still required to have some access to keys matching + * the prefix for this command to return successfully. + * + * If the user is able to access keys matching the prefix, then REDISMODULE_OK is returned. + * Otherwise, REDISMODULE_ERR is returned and errno is set to one of the following values: + * + * * EINVAL: The provided flags are invalid. + * * EACCES: The user does not have permission to access keys matching the prefix. + */ +int RM_ACLCheckKeyPrefixPermissions(RedisModuleUser *user, RedisModuleString *prefix, int flags) { + const int allow_mask = (REDISMODULE_CMD_KEY_ACCESS + | REDISMODULE_CMD_KEY_INSERT + | REDISMODULE_CMD_KEY_DELETE + | REDISMODULE_CMD_KEY_UPDATE); + + if ((flags & allow_mask) != flags) { + errno = EINVAL; + return REDISMODULE_ERR; + } + + int keyspec_flags = moduleConvertKeySpecsFlags(flags, 0); + + /* Add the prefix flag to the keyspec flags */ + keyspec_flags |= CMD_KEY_PREFIX; + + if (ACLUserCheckKeyPerm(user->user, prefix->ptr, sdslen(prefix->ptr), keyspec_flags) != ACL_OK) { + errno = EACCES; + return REDISMODULE_ERR; + } + + return REDISMODULE_OK; +} + /* Check if the pubsub channel can be accessed by the user based off of the given * access flags. See RM_ChannelAtPosWithFlags for more information about the * possible flags that can be passed in. 
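As a usage sketch for the new API (a hypothetical module command; includes, OnLoad registration and the command table entry are omitted; it mirrors the aclcheck test module added later in this patch):

```c
/* Hypothetical command "prefixguard.check <prefix>": succeeds only if the
 * current user may read every key matching <prefix>*. */
int PrefixGuard_Check(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    if (argc != 2) return RedisModule_WrongArity(ctx);

    RedisModuleString *uname = RedisModule_GetCurrentUserName(ctx);
    if (uname == NULL)
        return RedisModule_ReplyWithError(ctx, "ERR no user attached to context");
    RedisModuleUser *user = RedisModule_GetModuleUserFromUserName(uname);

    int rc = RedisModule_ACLCheckKeyPrefixPermissions(user, argv[1],
                                                      REDISMODULE_CMD_KEY_ACCESS);
    if (rc == REDISMODULE_OK) {
        RedisModule_ReplyWithSimpleString(ctx, "OK");
    } else {
        /* errno is EINVAL for bad flags, EACCES for a denied prefix. */
        RedisModule_ReplyWithError(ctx, "DENIED PREFIX");
    }

    RedisModule_FreeModuleUser(user);
    RedisModule_FreeString(ctx, uname);
    return REDISMODULE_OK;
}
```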
@@ -14186,6 +14225,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(GetModuleUserFromUserName); REGISTER_API(ACLCheckCommandPermissions); REGISTER_API(ACLCheckKeyPermissions); + REGISTER_API(ACLCheckKeyPrefixPermissions); REGISTER_API(ACLCheckChannelPermissions); REGISTER_API(ACLAddLogEntry); REGISTER_API(ACLAddLogEntryByUserName); diff --git a/src/redismodule.h b/src/redismodule.h index d0b7f7735..2c8ff09b5 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -1288,6 +1288,7 @@ REDISMODULE_API RedisModuleString * (*RedisModule_GetCurrentUserName)(RedisModul REDISMODULE_API RedisModuleUser * (*RedisModule_GetModuleUserFromUserName)(RedisModuleString *name) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_ACLCheckCommandPermissions)(RedisModuleUser *user, RedisModuleString **argv, int argc) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_ACLCheckKeyPermissions)(RedisModuleUser *user, RedisModuleString *key, int flags) REDISMODULE_ATTR; +REDISMODULE_API int (*RedisModule_ACLCheckKeyPrefixPermissions)(RedisModuleUser *user, RedisModuleString *prefix, int flags) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_ACLCheckChannelPermissions)(RedisModuleUser *user, RedisModuleString *ch, int literal) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_ACLAddLogEntry)(RedisModuleCtx *ctx, RedisModuleUser *user, RedisModuleString *object, RedisModuleACLLogEntryReason reason) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_ACLAddLogEntryByUserName)(RedisModuleCtx *ctx, RedisModuleString *user, RedisModuleString *object, RedisModuleACLLogEntryReason reason) REDISMODULE_ATTR; @@ -1657,6 +1658,7 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int REDISMODULE_GET_API(GetModuleUserFromUserName); REDISMODULE_GET_API(ACLCheckCommandPermissions); REDISMODULE_GET_API(ACLCheckKeyPermissions); + REDISMODULE_GET_API(ACLCheckKeyPrefixPermissions); REDISMODULE_GET_API(ACLCheckChannelPermissions); REDISMODULE_GET_API(ACLAddLogEntry); REDISMODULE_GET_API(ACLAddLogEntryByUserName); diff --git a/src/server.h b/src/server.h index 6d9832b18..c76ff2fcc 100644 --- a/src/server.h +++ b/src/server.h @@ -274,6 +274,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; * out to all keys it should cover */ #define CMD_KEY_VARIABLE_FLAGS (1ULL<<10) /* Means that some keys might have * different flags depending on arguments */ +#define CMD_KEY_PREFIX (1ULL<<11) /* Given key represents a prefix of a set of keys */ /* Key flags for when access type is unknown */ #define CMD_KEY_FULL_ACCESS (CMD_KEY_RW | CMD_KEY_ACCESS | CMD_KEY_UPDATE) diff --git a/src/util.c b/src/util.c index ec0a3fb0e..ee67be418 100644 --- a/src/util.c +++ b/src/util.c @@ -198,6 +198,43 @@ static int stringmatchlen_impl(const char *pattern, int patternLen, return 0; } +/* + * glob-style pattern matching to check if a given pattern fully includes + * the prefix of a string. For the match to succeed, the pattern must end with + * an unescaped '*' character. + * + * Returns: 1 if the `pattern` fully matches the `prefixStr`. Returns 0 otherwise. + */ +int prefixmatch(const char *pattern, int patternLen, + const char *prefixStr, int prefixStrLen, int nocase) { + int skipLongerMatches = 0; + + /* Step 1: Verify if the pattern matches the prefix string completely. 
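+     * For example, the pattern "ID-*" matches the prefix string "ID-" here,
+     * since the trailing '*' is allowed to match the empty suffix.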
*/ + if (!stringmatchlen_impl(pattern, patternLen, prefixStr, prefixStrLen, nocase, &skipLongerMatches, 0)) + return 0; + + /* Step 2: Verify that the pattern ends with an unescaped '*', indicating + * it can match any suffix of the string beyond the prefix. This check + * remains outside stringmatchlen_impl() to keep its complexity manageable. + */ + if (pattern[patternLen - 1] != '*' || patternLen == 0) + return 0; + + /* Count backward the number of consecutive backslashes preceding the '*' + * to determine if the '*' is escaped. */ + int backslashCount = 0; + for (int i = patternLen - 2; i >= 0; i--) { + if (pattern[i] == '\\') + ++backslashCount; + else + break; /* Stop counting when a non-backslash character is found. */ + } + + /* Return 1 if the '*' is not escaped (i.e., even count), 0 otherwise. */ + return (backslashCount % 2 == 0); +} + +/* Glob-style pattern matching to a string. */ int stringmatchlen(const char *pattern, int patternLen, const char *string, int stringLen, int nocase) { int skipLongerMatches = 0; diff --git a/src/util.h b/src/util.h index 07cfb61dc..9745fe282 100644 --- a/src/util.h +++ b/src/util.h @@ -36,6 +36,8 @@ typedef enum { LD_STR_HEX /* %La */ } ld2string_mode; +int prefixmatch(const char *pattern, int patternLen, const char *prefixStr, + int prefixStrLen, int nocase); int stringmatchlen(const char *p, int plen, const char *s, int slen, int nocase); int stringmatch(const char *p, const char *s, int nocase); int stringmatchlen_fuzz_test(void); diff --git a/tests/modules/aclcheck.c b/tests/modules/aclcheck.c index b74651804..0fb876c52 100644 --- a/tests/modules/aclcheck.c +++ b/tests/modules/aclcheck.c @@ -51,6 +51,52 @@ int set_aclcheck_key(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { return REDISMODULE_OK; } +/* A wrap for SET command with ACL check on the key. */ +int set_aclcheck_prefixkey(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + if (argc < 4) { + return RedisModule_WrongArity(ctx); + } + + int permissions; + const char *flags = RedisModule_StringPtrLen(argv[1], NULL); + + if (!strcasecmp(flags, "W")) { + permissions = REDISMODULE_CMD_KEY_UPDATE; + } else if (!strcasecmp(flags, "R")) { + permissions = REDISMODULE_CMD_KEY_ACCESS; + } else if (!strcasecmp(flags, "*")) { + permissions = REDISMODULE_CMD_KEY_UPDATE | REDISMODULE_CMD_KEY_ACCESS; + } else if (!strcasecmp(flags, "~")) { + permissions = 0; /* Requires either read or write */ + } else { + RedisModule_ReplyWithError(ctx, "INVALID FLAGS"); + return REDISMODULE_OK; + } + + /* Check that the key can be accessed */ + RedisModuleString *user_name = RedisModule_GetCurrentUserName(ctx); + RedisModuleUser *user = RedisModule_GetModuleUserFromUserName(user_name); + int ret = RedisModule_ACLCheckKeyPrefixPermissions(user, argv[2], permissions); + if (ret != 0) { + RedisModule_ReplyWithError(ctx, "DENIED KEY"); + RedisModule_FreeModuleUser(user); + RedisModule_FreeString(ctx, user_name); + return REDISMODULE_OK; + } + + RedisModuleCallReply *rep = RedisModule_Call(ctx, "SET", "v", argv + 3, argc - 3); + if (!rep) { + RedisModule_ReplyWithError(ctx, "NULL reply returned"); + } else { + RedisModule_ReplyWithCallReply(ctx, rep); + RedisModule_FreeCallReply(rep); + } + + RedisModule_FreeModuleUser(user); + RedisModule_FreeString(ctx, user_name); + return REDISMODULE_OK; +} + /* A wrap for PUBLISH command with ACL check on the channel. 
*/ int publish_aclcheck_channel(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { if (argc != 3) { @@ -247,6 +293,9 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) if (RedisModule_CreateCommand(ctx,"aclcheck.set.check.key", set_aclcheck_key,"write",0,0,0) == REDISMODULE_ERR) return REDISMODULE_ERR; + if (RedisModule_CreateCommand(ctx,"aclcheck.set.check.prefixkey", set_aclcheck_prefixkey,"write",0,0,0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + if (RedisModule_CreateCommand(ctx,"block.commands.outside.onload", commandBlockCheck,"write",0,0,0) == REDISMODULE_ERR) return REDISMODULE_ERR; diff --git a/tests/unit/moduleapi/aclcheck.tcl b/tests/unit/moduleapi/aclcheck.tcl index 1ea09a232..063c5c5b0 100644 --- a/tests/unit/moduleapi/aclcheck.tcl +++ b/tests/unit/moduleapi/aclcheck.tcl @@ -18,10 +18,43 @@ start_server {tags {"modules acl"}} { assert {[dict get $entry object] eq {set}} assert {[dict get $entry reason] eq {command}} } + + test {test module check acl for key prefix permission} { + r acl setuser default +set resetkeys ~CART* %W~ORDER* %R~PRODUCT* ~ESCAPED_STAR\\* ~NON_ESCAPED_STAR\\\\* + + # check for key permission of prefix CART* (READ+WRITE) + catch {r aclcheck.set.check.prefixkey "~" CAR CART_CLOTHES_7 5} e + assert_match "*DENIED KEY*" $e + assert_equal [r aclcheck.set.check.prefixkey "~" CART CART 5] OK + assert_equal [r aclcheck.set.check.prefixkey "W" CART_BOOKS CART_BOOKS_12 5] OK + assert_equal [r aclcheck.set.check.prefixkey "R" CART_CLOTHES CART_CLOTHES_7 5] OK + + # check for key permission of prefix ORDER* (WRITE) + catch {r aclcheck.set.check.prefixkey "~" ORDE ORDER_2024_155351 5} e + assert_match "*DENIED KEY*" $e + assert_equal [r aclcheck.set.check.prefixkey "~" ORDER ORDER 5] OK + assert_equal [r aclcheck.set.check.prefixkey "W" ORDER_2024 ORDER_2024_564879 5] OK + assert_equal [r aclcheck.set.check.prefixkey "~" ORDER_2023 ORDER_2023_564879 5] OK + catch {r aclcheck.set.check.prefixkey "R" ORDER_2023 ORDER_2023_564879 5} + assert_match "*DENIED KEY*" $e + + # check for key permission of prefix PRODUCT* (READ) + catch {r aclcheck.set.check.prefixkey "~" PRODUC PRODUCT_CLOTHES_753376 5} e + assert_match "*DENIED KEY*" $e + assert_equal [r aclcheck.set.check.prefixkey "~" PRODUCT PRODUCT 5] OK + assert_equal [r aclcheck.set.check.prefixkey "~" PRODUCT_BOOKS PRODUCT_BOOKS_753376 5] OK + + # pattern ends with a escaped '*' character should not be counted as a prefix + catch {r aclcheck.set.check.prefixkey "~" ESCAPED_STAR ESCAPED_STAR_12 5} e + assert_match "*DENIED KEY*" $e + catch {r aclcheck.set.check.prefixkey "~" ESCAPED_STAR* ESCAPED_STAR* 5} e + assert_match "*DENIED KEY*" $e + assert_equal [r aclcheck.set.check.prefixkey "~" NON_ESCAPED_STAR\\ NON_ESCAPED_STAR\\clothes 5] OK + } test {test module check acl for key perm} { # give permission for SET and block all keys but x(READ+WRITE), y(WRITE), z(READ) - r acl setuser default +set resetkeys ~x %W~y %R~z + r acl setuser default +set resetkeys ~x %W~y %R~z ~ESCAPED_STAR\\* assert_equal [r aclcheck.set.check.key "*" x 5] OK catch {r aclcheck.set.check.key "*" v 5} e @@ -40,6 +73,9 @@ start_server {tags {"modules acl"}} { assert_equal [r aclcheck.set.check.key "R" z 5] OK catch {r aclcheck.set.check.key "R" v 5} e assert_match "*DENIED KEY*" $e + + # check pattern ends with escaped '*' character + assert_equal [r aclcheck.set.check.key "~" ESCAPED_STAR* 5] OK } test {test module check acl for module user} { From 2af69a931ab45aad22204763bf93e1050629eede Mon Sep 17 
00:00:00 2001 From: Ozan Tezcan Date: Tue, 3 Dec 2024 09:26:19 +0300 Subject: [PATCH 19/42] Do not call _dictClear()'s callback for the first 65k items (#13674) In https://github.com/redis/redis/pull/13495, we introduced a feature to reply -LOADING while flushing a large db on a replica. While `_dictClear()` is in progress, it calls a callback for every 65k items and we yield back to the event loop to reply -LOADING. This change has made some tests unstable, as those tests don't expect the new -LOADING reply. One observation: inside `_dictClear()`, we call the callback even if the db has only a few keys. Most tests run with a small number of keys, so each replication and cluster test would now have to handle a potential -LOADING reply. This PR changes this behavior and skips calling the callback when `i=0` to stabilize the replication tests. The callback will be called after the first 65k items. Most tests use fewer than 65k keys, so they won't get a -LOADING reply. --- src/dict.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dict.c b/src/dict.c index 2928d8af5..82c64b248 100644 --- a/src/dict.c +++ b/src/dict.c @@ -697,8 +697,9 @@ int _dictClear(dict *d, int htidx, void(callback)(dict*)) { /* Free all the elements */ for (i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]) && d->ht_used[htidx] > 0; i++) { dictEntry *he, *nextHe; - - if (callback && (i & 65535) == 0) callback(d); + /* The callback will be called once every 65536 buckets scanned. Beware: + * if the dict has fewer than 65536 buckets, it will not be called at all. */ + if (callback && i != 0 && (i & 65535) == 0) callback(d); if ((he = d->ht_table[htidx][i]) == NULL) continue; while(he) { From ddafac4c6cef358441ce0ddb9a46a3a0378478d1 Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" Date: Wed, 4 Dec 2024 09:16:14 +0000 Subject: [PATCH 20/42] Optimize dictFind with prefetching and branch prediction hints (#13646) This pull request optimizes the `dictFind` function by adding software prefetching and branch prediction hints to improve cache efficiency and reduce memory latency. It introduces two prefetch hints (read/write) that become no-ops if the compiler does not support them. Baseline profiling with Intel VTune indicated that dictFind was significantly back-end bound, with memory latency accounting for 59.6% of clockticks, and frequent stalls from DRAM-bound operations due to cache misses during hash table lookups. ![microarch](https://github.com/user-attachments/assets/9e3cf334-ae6b-4767-b568-713a4ac24e87) --------- Co-authored-by: Yuan Wang --- src/config.h | 19 +++++++++++++++ src/dict.c | 11 ++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/config.h b/src/config.h index c7fadf5ff..e8f77a350 100644 --- a/src/config.h +++ b/src/config.h @@ -101,6 +101,25 @@ #endif #endif +/* Test for __builtin_prefetch() + * Supported in LLVM since 2.9: https://releases.llvm.org/2.9/docs/ReleaseNotes.html + * Supported in GCC since 3.1, but we require 4.9 since 3.1 is too old: https://gcc.gnu.org/gcc-3.1/changes.html.
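+ * A data prefetch is only a hint to the CPU: on targets with prefetch + * instructions it generates no fault even for an invalid address (e.g. NULL), + * which is what makes prefetching a speculative pointer such as a hash + * entry's next link safe.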
*/ +#if defined(__clang__) && (__clang_major__ > 2 || (__clang_major__ == 2 && __clang_minor__ >= 9)) +#define HAS_BUILTIN_PREFETCH 1 +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) +#define HAS_BUILTIN_PREFETCH 1 +#else +#define HAS_BUILTIN_PREFETCH 0 +#endif + +#if HAS_BUILTIN_PREFETCH +#define redis_prefetch_read(addr) __builtin_prefetch(addr, 0, 3) /* Read with high locality */ +#define redis_prefetch_write(addr) __builtin_prefetch(addr, 1, 3) /* Write with high locality */ +#else +#define redis_prefetch_read(addr) ((void)(addr)) /* No-op if unsupported */ +#define redis_prefetch_write(addr) ((void)(addr)) /* No-op if unsupported */ +#endif + /* Define redis_fsync to fdatasync() in Linux and fsync() for all the rest */ #if defined(__linux__) #define redis_fsync(fd) fdatasync(fd) diff --git a/src/dict.c b/src/dict.c index 82c64b248..0b72506ce 100644 --- a/src/dict.c +++ b/src/dict.c @@ -760,14 +760,23 @@ dictEntry *dictFind(dict *d, const void *key) for (table = 0; table <= 1; table++) { if (table == 0 && (long)idx < d->rehashidx) continue; idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); + + /* Prefetch the bucket at the calculated index */ + redis_prefetch_read(&d->ht_table[table][idx]); + he = d->ht_table[table][idx]; while(he) { void *he_key = dictGetKey(he); + + /* Prefetch the next entry to improve cache efficiency */ + redis_prefetch_read(dictGetNext(he)); + if (key == he_key || cmpFunc(d, key, he_key)) return he; he = dictGetNext(he); } - if (!dictIsRehashing(d)) return NULL; + /* Use unlikely to optimize branch prediction for the common case */ + if (unlikely(!dictIsRehashing(d))) return NULL; } return NULL; } From 59953d2df62b0e3c4996b8068836591e96349720 Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" Date: Wed, 4 Dec 2024 10:04:37 +0000 Subject: [PATCH 21/42] Improve listpack Handling and Decoding Efficiency: 16.3% improvement on LRANGE command (#13652) This PR focused on refining listpack encoding/decoding functions and optimizing reply handling mechanisms related to them. Each commit below lists its measured improvement, up to the final accumulated improvement of 16.3% on the [memtier_benchmark-1key-list-100-elements-lrange-all-elements-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1key-list-100-elements-lrange-all-elements-pipeline-10.yml) benchmark. Connection mode | CE Baseline (Nov 14th) 701f06657d20ec42f0cf78de0ac9d7197e44c00c | CE PR #13652 | CE PR vs CE Unstable -- | -- | -- | -- TCP | 155696 | 178874 | 14.9% Unix socket | 169743 | 197428 | 16.3% To test it we can simply run the replybufsize test: ``` tclsh tests/test_helper.tcl --single unit/replybufsize ``` ### Commit details: - 2e58d048fd9d1365c56292ebd69f2ed141bfeda1 + 29c6c86c6b96376f807656f203b61d0e7fcb65a4 : Eliminate an indirect memory access on lpCurrentEncodedSizeBytes and completely avoid passing p* fully to lpCurrentEncodedSizeBytes + Add lpNextWithBytes helper function and optimize addListListpackRangeReply **- Improvement of 3.1%, from 168969.88 ops/sec to 174239.75 ops/sec** - af52aacff86e2809f3451e2561043d532b9ee223 Refactor lpDecodeBacklen for loop-based decoding, improving readability and branch efficiency. **- NO CHANGE.
REVERTED in 09f6680ba0d0b5acabca537c651008f0c8ec061b** - 048bfe4edaaf5671f7f06b06d1492f81e6af3e59 + 03e8ff3af70891cbd3d53ca865558976459bb38e : reduce condition checks in _addReplyToBuffer, inline it, avoid entering it when there are already entries in the reply list, and check if the reply length exceeds the available buffer space before calling _addReplyToBuffer **- accumulated Improvement of 12.4%, from 168969.88 ops/sec to 189726.81 ops/sec** - 9a63d4d6a9fa946505e31ecce4c7796845fc022c: always update the buf_peak on _addReplyToBufferOrList **- accumulated Improvement of 14.2%, from 168969.88 ops/sec to 193887 ops/sec** - b544ade67628a1feaf714d6cfd114930e0c7670b: Introduce lpEncodeBacklenBytes to avoid any indirect memory access on previous usage of lpEncodeBacklen(NULL,...), and inline lpEncodeBacklenBytes(). **- accumulated Improvement of 16.3%, from 168969.88 ops/sec to 197427.70 ops/sec** --------- Co-authored-by: debing.sun --- src/listpack.c | 166 ++++++++++++++++++++++++++++++++++++++++------- src/listpack.h | 1 + src/networking.c | 38 ++++------- src/t_list.c | 19 +++--- 4 files changed, 165 insertions(+), 59 deletions(-) diff --git a/src/listpack.c b/src/listpack.c index 8b9ae196e..8b733d8b8 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -369,6 +369,23 @@ static inline unsigned long lpEncodeBacklen(unsigned char *buf, uint64_t l) { } } +/* Calculate the number of bytes required to reverse-encode a variable length + * field representing the length of the previous element of size 'l', ranging + * from 1 to 5. */ +static inline unsigned long lpEncodeBacklenBytes(uint64_t l) { + if (l <= 127) { + return 1; + } else if (l < 16383) { + return 2; + } else if (l < 2097151) { + return 3; + } else if (l < 268435455) { + return 4; + } else { + return 5; + } +} + /* Decode the backlen and returns it. If the encoding looks invalid (more than * 5 bytes are used), UINT64_MAX is returned to report the problem. */ static inline uint64_t lpDecodeBacklen(unsigned char *p) { @@ -431,17 +448,17 @@ static inline uint32_t lpCurrentEncodedSizeUnsafe(unsigned char *p) { * This includes just the encoding byte, and the bytes needed to encode the length * of the element (excluding the element data itself) * If the element encoding is wrong then 0 is returned.
*/ -static inline uint32_t lpCurrentEncodedSizeBytes(unsigned char *p) { - if (LP_ENCODING_IS_7BIT_UINT(p[0])) return 1; - if (LP_ENCODING_IS_6BIT_STR(p[0])) return 1; - if (LP_ENCODING_IS_13BIT_INT(p[0])) return 1; - if (LP_ENCODING_IS_16BIT_INT(p[0])) return 1; - if (LP_ENCODING_IS_24BIT_INT(p[0])) return 1; - if (LP_ENCODING_IS_32BIT_INT(p[0])) return 1; - if (LP_ENCODING_IS_64BIT_INT(p[0])) return 1; - if (LP_ENCODING_IS_12BIT_STR(p[0])) return 2; - if (LP_ENCODING_IS_32BIT_STR(p[0])) return 5; - if (p[0] == LP_EOF) return 1; +static inline uint32_t lpCurrentEncodedSizeBytes(const unsigned char encoding) { + if (LP_ENCODING_IS_7BIT_UINT(encoding)) return 1; + if (LP_ENCODING_IS_6BIT_STR(encoding)) return 1; + if (LP_ENCODING_IS_13BIT_INT(encoding)) return 1; + if (LP_ENCODING_IS_16BIT_INT(encoding)) return 1; + if (LP_ENCODING_IS_24BIT_INT(encoding)) return 1; + if (LP_ENCODING_IS_32BIT_INT(encoding)) return 1; + if (LP_ENCODING_IS_64BIT_INT(encoding)) return 1; + if (LP_ENCODING_IS_12BIT_STR(encoding)) return 2; + if (LP_ENCODING_IS_32BIT_STR(encoding)) return 5; + if (encoding == LP_EOF) return 1; return 0; } @@ -449,13 +466,22 @@ static inline uint32_t lpCurrentEncodedSizeBytes(unsigned char *p) { * function if the current element is the EOF element at the end of the * listpack, however, while this function is used to implement lpNext(), * it does not return NULL when the EOF element is encountered. */ -unsigned char *lpSkip(unsigned char *p) { +static inline unsigned char *lpSkip(unsigned char *p) { unsigned long entrylen = lpCurrentEncodedSizeUnsafe(p); - entrylen += lpEncodeBacklen(NULL,entrylen); + entrylen += lpEncodeBacklenBytes(entrylen); p += entrylen; return p; } +/* This is similar to lpNext() but avoids the inner call to lpBytes when you already know the listpack size. */ +unsigned char *lpNextWithBytes(unsigned char *lp, unsigned char *p, const size_t lpbytes) { + assert(p); + p = lpSkip(p); + if (p[0] == LP_EOF) return NULL; + lpAssertValidEntry(lp, lpbytes, p); + return p; +} + /* If 'p' points to an element of the listpack, calling lpNext() will return * the pointer to the next element (the one on the right), or NULL if 'p' * already pointed to the last element of the listpack. */ @@ -475,7 +501,7 @@ unsigned char *lpPrev(unsigned char *lp, unsigned char *p) { if (p-lp == LP_HDR_SIZE) return NULL; p--; /* Seek the first backlen byte of the last element. */ uint64_t prevlen = lpDecodeBacklen(p); - prevlen += lpEncodeBacklen(NULL,prevlen); + prevlen += lpEncodeBacklenBytes(prevlen); p -= prevlen-1; /* Seek the first byte of the previous entry. 
*/ lpAssertValidEntry(lp, lpBytes(lp), p); return p; @@ -569,7 +595,7 @@ static inline unsigned char *lpGetWithSize(unsigned char *p, int64_t *count, uns if (entry_size) *entry_size = LP_ENCODING_7BIT_UINT_ENTRY_SIZE; } else if (LP_ENCODING_IS_6BIT_STR(p[0])) { *count = LP_ENCODING_6BIT_STR_LEN(p); - if (entry_size) *entry_size = 1 + *count + lpEncodeBacklen(NULL, *count + 1); + if (entry_size) *entry_size = 1 + *count + lpEncodeBacklenBytes(*count + 1); return p+1; } else if (LP_ENCODING_IS_13BIT_INT(p[0])) { uval = ((p[0]&0x1f)<<8) | p[1]; @@ -611,11 +637,11 @@ static inline unsigned char *lpGetWithSize(unsigned char *p, int64_t *count, uns if (entry_size) *entry_size = LP_ENCODING_64BIT_INT_ENTRY_SIZE; } else if (LP_ENCODING_IS_12BIT_STR(p[0])) { *count = LP_ENCODING_12BIT_STR_LEN(p); - if (entry_size) *entry_size = 2 + *count + lpEncodeBacklen(NULL, *count + 2); + if (entry_size) *entry_size = 2 + *count + lpEncodeBacklenBytes(*count + 2); return p+2; } else if (LP_ENCODING_IS_32BIT_STR(p[0])) { *count = LP_ENCODING_32BIT_STR_LEN(p); - if (entry_size) *entry_size = 5 + *count + lpEncodeBacklen(NULL, *count + 5); + if (entry_size) *entry_size = 5 + *count + lpEncodeBacklenBytes(*count + 5); return p+5; } else { uval = 12345678900000000ULL + p[0]; @@ -647,8 +673,99 @@ static inline unsigned char *lpGetWithSize(unsigned char *p, int64_t *count, uns } } +/* Return the listpack element pointed by 'p'. + * + * The function has the same behaviour as lpGetWithSize when 'entry_size' is NULL, + * but avoids a lot of unnecessary branching performance penalties. */ +static inline unsigned char *lpGetWithBuf(unsigned char *p, int64_t *count, unsigned char *intbuf) { + int64_t val; + uint64_t uval, negstart, negmax; + assert(p); /* assertion for valgrind (avoid NPD) */ + const unsigned char encoding = p[0]; + + /* string encoding */ + if (LP_ENCODING_IS_6BIT_STR(encoding)) { + *count = LP_ENCODING_6BIT_STR_LEN(p); + return p+1; + } + if (LP_ENCODING_IS_12BIT_STR(encoding)) { + *count = LP_ENCODING_12BIT_STR_LEN(p); + return p+2; + } + if (LP_ENCODING_IS_32BIT_STR(encoding)) { + *count = LP_ENCODING_32BIT_STR_LEN(p); + return p+5; + } + /* int encoding */ + if (LP_ENCODING_IS_7BIT_UINT(encoding)) { + negstart = UINT64_MAX; /* 7 bit ints are always positive. */ + negmax = 0; + uval = encoding & 0x7f; + } else if (LP_ENCODING_IS_13BIT_INT(encoding)) { + uval = ((encoding&0x1f)<<8) | p[1]; + negstart = (uint64_t)1<<12; + negmax = 8191; + } else if (LP_ENCODING_IS_16BIT_INT(encoding)) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8; + negstart = (uint64_t)1<<15; + negmax = UINT16_MAX; + } else if (LP_ENCODING_IS_24BIT_INT(encoding)) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16; + negstart = (uint64_t)1<<23; + negmax = UINT32_MAX>>8; + } else if (LP_ENCODING_IS_32BIT_INT(encoding)) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16 | + (uint64_t)p[4]<<24; + negstart = (uint64_t)1<<31; + negmax = UINT32_MAX; + } else if (LP_ENCODING_IS_64BIT_INT(encoding)) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16 | + (uint64_t)p[4]<<24 | + (uint64_t)p[5]<<32 | + (uint64_t)p[6]<<40 | + (uint64_t)p[7]<<48 | + (uint64_t)p[8]<<56; + negstart = (uint64_t)1<<63; + negmax = UINT64_MAX; + } else { + uval = 12345678900000000ULL + encoding; + negstart = UINT64_MAX; + negmax = 0; + } + + /* We reach this code path only for integer encodings. + * Convert the unsigned value to the signed one using two's complement + * rule.
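+ * For example, with the 13-bit encoding negstart is 4096 and negmax is 8191, + * so a stored raw value of 8191 becomes negmax-8191 = 0 and then -0-1 = -1.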
*/ + if (uval >= negstart) { + /* This three steps conversion should avoid undefined behaviors + * in the unsigned -> signed conversion. */ + uval = negmax-uval; + val = uval; + val = -val-1; + } else { + val = uval; + } + + /* Return the string representation of the integer or the value itself + * depending on intbuf being NULL or not. */ + if (intbuf) { + *count = ll2string((char*)intbuf,LP_INTBUF_SIZE,(long long)val); + return intbuf; + } else { + *count = val; + return NULL; + } +} + unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf) { - return lpGetWithSize(p, count, intbuf, NULL); + return lpGetWithBuf(p, count, intbuf); } /* This is just a wrapper to lpGet() that is able to get entry value directly. @@ -880,7 +997,7 @@ unsigned char *lpInsert(unsigned char *lp, unsigned char *elestr, unsigned char uint32_t replaced_len = 0; if (where == LP_REPLACE) { replaced_len = lpCurrentEncodedSizeUnsafe(p); - replaced_len += lpEncodeBacklen(NULL,replaced_len); + replaced_len += lpEncodeBacklenBytes(replaced_len); ASSERT_INTEGRITY_LEN(lp, p, replaced_len); } @@ -1420,7 +1537,7 @@ size_t lpBytes(unsigned char *lp) { size_t lpEntrySizeInteger(long long lval) { uint64_t enclen; lpEncodeIntegerGetType(lval, NULL, &enclen); - unsigned long backlen = lpEncodeBacklen(NULL, enclen); + unsigned long backlen = lpEncodeBacklenBytes(enclen); return enclen + backlen; } @@ -1487,6 +1604,7 @@ unsigned char *lpValidateFirst(unsigned char *lp) { /* Validate the integrity of a single listpack entry and move to the next one. * The input argument 'pp' is a reference to the current record and is advanced on exit. + * the data pointed to by 'lp' will not be modified by the function. * Returns 1 if valid, 0 if invalid. */ int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes) { #define OUT_OF_RANGE(p) ( \ @@ -1506,7 +1624,7 @@ int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes) { } /* check that we can read the encoded size */ - uint32_t lenbytes = lpCurrentEncodedSizeBytes(p); + uint32_t lenbytes = lpCurrentEncodedSizeBytes(p[0]); if (!lenbytes) return 0; @@ -1516,7 +1634,7 @@ int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes) { /* get the entry length and encoded backlen. 
*/ unsigned long entrylen = lpCurrentEncodedSizeUnsafe(p); - unsigned long encodedBacklen = lpEncodeBacklen(NULL,entrylen); + unsigned long encodedBacklen = lpEncodeBacklenBytes(entrylen); entrylen += encodedBacklen; /* make sure the entry doesn't reach outside the edge of the listpack */ @@ -1859,9 +1977,9 @@ void lpRepr(unsigned char *lp) { p = lpFirst(lp); while(p) { - uint32_t encoded_size_bytes = lpCurrentEncodedSizeBytes(p); + uint32_t encoded_size_bytes = lpCurrentEncodedSizeBytes(p[0]); uint32_t encoded_size = lpCurrentEncodedSizeUnsafe(p); - unsigned long back_len = lpEncodeBacklen(NULL, encoded_size); + unsigned long back_len = lpEncodeBacklenBytes(encoded_size); printf( "{\n" "\taddr: 0x%08lx,\n" diff --git a/src/listpack.h b/src/listpack.h index c9fbc5624..bfddbd73b 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -65,6 +65,7 @@ unsigned char *lpFindCb(unsigned char *lp, unsigned char *p, void *user, lpCmp c unsigned char *lpFirst(unsigned char *lp); unsigned char *lpLast(unsigned char *lp); unsigned char *lpNext(unsigned char *lp, unsigned char *p); +unsigned char *lpNextWithBytes(unsigned char *lp, unsigned char *p, const size_t lpbytes); unsigned char *lpPrev(unsigned char *lp, unsigned char *p); size_t lpBytes(unsigned char *lp); size_t lpEntrySizeInteger(long long lval); diff --git a/src/networking.c b/src/networking.c index 67e1b139c..9a9515f77 100644 --- a/src/networking.c +++ b/src/networking.c @@ -317,29 +317,6 @@ int prepareClientToWrite(client *c) { * Low level functions to add more data to output buffers. * -------------------------------------------------------------------------- */ -/* Attempts to add the reply to the static buffer in the client struct. - * Returns the length of data that is added to the reply buffer. - * - * Sanitizer suppression: client->buf_usable_size determined by - * zmalloc_usable_size() call. Writing beyond client->buf boundaries confuses - * sanitizer and generates a false positive out-of-bounds error */ -REDIS_NO_SANITIZE("bounds") -size_t _addReplyToBuffer(client *c, const char *s, size_t len) { - size_t available = c->buf_usable_size - c->bufpos; - - /* If there already are entries in the reply list, we cannot - * add anything more to the static buffer. */ - if (listLength(c->reply) > 0) return 0; - - size_t reply_len = len > available ? available : len; - memcpy(c->buf+c->bufpos,s,reply_len); - c->bufpos+=reply_len; - /* We update the buffer peak after appending the reply to the buffer */ - if(c->buf_peak < (size_t)c->bufpos) - c->buf_peak = (size_t)c->bufpos; - return reply_len; -} - /* Adds the reply to the reply linked list. * Note: some edits to this function need to be relayed to AddReplyFromClient. */ void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len) { @@ -419,7 +396,20 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { return; } - size_t reply_len = _addReplyToBuffer(c,s,len); + /* We update the buffer peak always */ + const size_t available = c->buf_usable_size - c->bufpos; + + size_t reply_len = 0; + /* If there already are entries in the reply list, we cannot + * add anything more to the static buffer. */ + if (listLength(c->reply) < 1) { + reply_len = len > available ? 
available : len; + memcpy(c->buf+c->bufpos,s,reply_len); + c->bufpos+=reply_len; + /* We update the buffer peak after appending the reply to the buffer */ + c->buf_peak = max(c->buf_peak,(size_t)c->bufpos); + } + if (len > reply_len) _addReplyProtoToList(c,c->reply,s+reply_len,len-reply_len); } diff --git a/src/t_list.c b/src/t_list.c index 9263cbd12..51f82c4bf 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -683,23 +683,20 @@ void addListQuicklistRangeReply(client *c, robj *o, int from, int rangelen, int * Note that the purpose is to make the methods small so that the * code in the loop can be inlined better to improve performance. */ void addListListpackRangeReply(client *c, robj *o, int from, int rangelen, int reverse) { - unsigned char *p = lpSeek(o->ptr, from); - unsigned char *vstr; - unsigned int vlen; - long long lval; + unsigned char *lp = o->ptr; + unsigned char *p = lpSeek(lp, from); + const size_t lpbytes = lpBytes(lp); + int64_t vlen; /* Return the result in form of a multi-bulk reply */ addReplyArrayLen(c,rangelen); while(rangelen--) { serverAssert(p); /* fail on corrupt data */ - vstr = lpGetValue(p, &vlen, &lval); - if (vstr) { - addReplyBulkCBuffer(c,vstr,vlen); - } else { - addReplyBulkLongLong(c,lval); - } - p = reverse ? lpPrev(o->ptr,p) : lpNext(o->ptr,p); + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *vstr = lpGet(p,&vlen,buf); + addReplyBulkCBuffer(c,vstr,vlen); + p = reverse ? lpPrev(lp,p) : lpNextWithBytes(lp,p,lpbytes); } } From 0dd057222bd8b156b113e925745d7b2bf9486056 Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Thu, 5 Dec 2024 11:14:52 +0200 Subject: [PATCH 22/42] Modules API: new HashFieldMinExpire(). Add flag REDISMODULE_HASH_EXPIRE_TIME to HashGet(). (#13676) This PR introduces API to query Expiration time of hash fields. # New `RedisModule_HashFieldMinExpire()` For a given hash, retrieves the minimum expiration time across all fields. If no fields have expiration or if the key is not a hash then return `REDISMODULE_NO_EXPIRE` (-1). ``` mstime_t RM_HashFieldMinExpire(RedisModuleKey *hash); ``` # Extension to `RedisModule_HashGet()` Adds a new flag, `REDISMODULE_HASH_EXPIRE_TIME`, to retrieve the expiration time of a specific hash field. If the field does not exist or has no expiration, returns `REDISMODULE_NO_EXPIRE`. It is fully backward-compatible (RM_HashGet retains its original behavior unless the new flag is used). Example: ``` mstime_t expiry1, expiry2; RedisModule_HashGet(mykey, REDISMODULE_HASH_EXPIRE_TIME, "field1", &expiry1, NULL); RedisModule_HashGet(mykey, REDISMODULE_HASH_EXPIRE_TIME, "field1", &expiry1, "field2", &expiry2, NULL); ``` --- src/module.c | 51 +++++++++++++++++++++++--- src/redismodule.h | 15 ++++---- src/server.h | 3 +- src/sort.c | 5 +-- src/t_hash.c | 59 ++++++++++++++++++++---------- tests/modules/hash.c | 67 +++++++++++++++++++++++++++++++++++ tests/unit/moduleapi/hash.tcl | 49 +++++++++++++++++++++++-- 7 files changed, 216 insertions(+), 33 deletions(-) diff --git a/src/module.c b/src/module.c index 35f20f902..f12d03b47 100644 --- a/src/module.c +++ b/src/module.c @@ -5358,7 +5358,12 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { * expecting a RedisModuleString pointer to pointer, the function just * reports if the field exists or not and expects an integer pointer * as the second element of each pair. - * + * + * REDISMODULE_HASH_EXPIRE_TIME: retrieves the expiration time of a field in the hash. + * The function expects a `mstime_t` pointer as the second element of each pair. 
+ * If the field does not exist or has no expiration, the value is set to + * `REDISMODULE_NO_EXPIRE`. This flag must not be used with `REDISMODULE_HASH_EXISTS`. + * * Example of REDISMODULE_HASH_CFIELDS: * * RedisModuleString *username, *hashedpass; @@ -5367,8 +5372,13 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { * Example of REDISMODULE_HASH_EXISTS: * * int exists; - * RedisModule_HashGet(mykey,REDISMODULE_HASH_EXISTS,argv[1],&exists,NULL); + * RedisModule_HashGet(mykey,REDISMODULE_HASH_EXISTS,"username",&exists,NULL); * + * Example of REDISMODULE_HASH_EXPIRE_TIME: + * + * mstime_t hpExpireTime; + * RedisModule_HashGet(mykey,REDISMODULE_HASH_EXPIRE_TIME,"hp",&hpExpireTime,NULL); + * * The function returns REDISMODULE_OK on success and REDISMODULE_ERR if * the key is not a hash value. * @@ -5385,6 +5395,10 @@ int RM_HashGet(RedisModuleKey *key, int flags, ...) { if (key->mode & REDISMODULE_OPEN_KEY_ACCESS_EXPIRED) hfeFlags = HFE_LAZY_ACCESS_EXPIRED; /* allow read also expired fields */ + /* Verify flag HASH_EXISTS is not set together with HASH_EXPIRE_TIME */ + if ((flags & REDISMODULE_HASH_EXISTS) && (flags & REDISMODULE_HASH_EXPIRE_TIME)) + return REDISMODULE_ERR; + va_start(ap, flags); while(1) { RedisModuleString *field, **valueptr; @@ -5407,11 +5421,22 @@ int RM_HashGet(RedisModuleKey *key, int flags, ...) { } else { *existsptr = 0; } + } else if (flags & REDISMODULE_HASH_EXPIRE_TIME) { + mstime_t *expireptr = va_arg(ap,mstime_t*); + *expireptr = REDISMODULE_NO_EXPIRE; + if (key->value) { + uint64_t expireTime = 0; + /* As an opt, avoid fetching value, only expire time */ + int res = hashTypeGetValueObject(key->db, key->value, field->ptr, + hfeFlags, NULL, &expireTime, NULL); + /* If field has expiration time */ + if (res && expireTime != 0) *expireptr = expireTime; + } } else { valueptr = va_arg(ap,RedisModuleString**); if (key->value) { - *valueptr = hashTypeGetValueObject(key->db, key->value, field->ptr, - hfeFlags, NULL); + hashTypeGetValueObject(key->db, key->value, field->ptr, + hfeFlags, valueptr, NULL, NULL); if (*valueptr) { robj *decoded = getDecodedObject(*valueptr); @@ -5432,6 +5457,23 @@ int RM_HashGet(RedisModuleKey *key, int flags, ...) { return REDISMODULE_OK; } +/** + * Retrieves the minimum expiration time of fields in a hash. + * + * Return: + * - The minimum expiration time (in milliseconds) of the hash fields if at + * least one field has an expiration set. + * - REDISMODULE_NO_EXPIRE if no fields have an expiration set or if the key + * is not a hash. + */ +mstime_t RM_HashFieldMinExpire(RedisModuleKey *key) { + if ((!key->value) || (key->value->type != OBJ_HASH)) + return REDISMODULE_NO_EXPIRE; + + mstime_t min = hashTypeGetMinExpire(key->value, 1); + return (min == EB_EXPIRE_TIME_INVALID) ? REDISMODULE_NO_EXPIRE : min; +} + /* -------------------------------------------------------------------------- * ## Key API for Stream type * @@ -14027,6 +14069,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(ZsetRangeEndReached); REGISTER_API(HashSet); REGISTER_API(HashGet); + REGISTER_API(HashFieldMinExpire); REGISTER_API(StreamAdd); REGISTER_API(StreamDelete); REGISTER_API(StreamIteratorStart); diff --git a/src/redismodule.h b/src/redismodule.h index 2c8ff09b5..b8f00e816 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -115,12 +115,13 @@ typedef long long ustime_t; #define REDISMODULE_ZADD_LT (1<<6) /* Hash API flags. 
*/ -#define REDISMODULE_HASH_NONE 0 -#define REDISMODULE_HASH_NX (1<<0) -#define REDISMODULE_HASH_XX (1<<1) -#define REDISMODULE_HASH_CFIELDS (1<<2) -#define REDISMODULE_HASH_EXISTS (1<<3) -#define REDISMODULE_HASH_COUNT_ALL (1<<4) +#define REDISMODULE_HASH_NONE 0 +#define REDISMODULE_HASH_NX (1<<0) +#define REDISMODULE_HASH_XX (1<<1) +#define REDISMODULE_HASH_CFIELDS (1<<2) +#define REDISMODULE_HASH_EXISTS (1<<3) +#define REDISMODULE_HASH_COUNT_ALL (1<<4) +#define REDISMODULE_HASH_EXPIRE_TIME (1<<5) #define REDISMODULE_CONFIG_DEFAULT 0 /* This is the default for a module config. */ #define REDISMODULE_CONFIG_IMMUTABLE (1ULL<<0) /* Can this value only be set at startup? */ @@ -1083,6 +1084,7 @@ REDISMODULE_API int (*RedisModule_ZsetRangePrev)(RedisModuleKey *key) REDISMODUL REDISMODULE_API int (*RedisModule_ZsetRangeEndReached)(RedisModuleKey *key) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_HashSet)(RedisModuleKey *key, int flags, ...) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_HashGet)(RedisModuleKey *key, int flags, ...) REDISMODULE_ATTR; +REDISMODULE_API mstime_t (*RedisModule_HashFieldMinExpire)(RedisModuleKey *key) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_StreamAdd)(RedisModuleKey *key, int flags, RedisModuleStreamID *id, RedisModuleString **argv, int64_t numfields) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_StreamDelete)(RedisModuleKey *key, RedisModuleStreamID *id) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_StreamIteratorStart)(RedisModuleKey *key, int flags, RedisModuleStreamID *startid, RedisModuleStreamID *endid) REDISMODULE_ATTR; @@ -1453,6 +1455,7 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int REDISMODULE_GET_API(ZsetRangeEndReached); REDISMODULE_GET_API(HashSet); REDISMODULE_GET_API(HashGet); + REDISMODULE_GET_API(HashFieldMinExpire); REDISMODULE_GET_API(StreamAdd); REDISMODULE_GET_API(StreamDelete); REDISMODULE_GET_API(StreamIteratorStart); diff --git a/src/server.h b/src/server.h index c76ff2fcc..205d73c68 100644 --- a/src/server.h +++ b/src/server.h @@ -3242,7 +3242,8 @@ void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll, uint64_t *expireTime); sds hashTypeCurrentObjectNewSds(hashTypeIterator *hi, int what); hfield hashTypeCurrentObjectNewHfield(hashTypeIterator *hi); -robj *hashTypeGetValueObject(redisDb *db, robj *o, sds field, int hfeFlags, int *isHashDeleted); +int hashTypeGetValueObject(redisDb *db, robj *o, sds field, int hfeFlags, + robj **val, uint64_t *expireTime, int *isHashDeleted); int hashTypeSet(redisDb *db, robj *o, sds field, sds value, int flags); robj *hashTypeDup(robj *o, sds newkey, uint64_t *minHashExpire); uint64_t hashTypeRemoveFromExpires(ebuckets *hexpires, robj *o); diff --git a/src/sort.c b/src/sort.c index 01035e218..8b24494e5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -41,7 +41,7 @@ redisSortOperation *createSortOperation(int type, robj *pattern) { robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) { char *p, *f, *k; sds spat, ssub; - robj *keyobj, *fieldobj = NULL, *o; + robj *keyobj, *fieldobj = NULL, *o, *val; int prefixlen, sublen, postfixlen, fieldlen; /* If the pattern is "#" return the substitution object itself in order @@ -95,7 +95,8 @@ robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) { /* Retrieve value from hash by the field name. The returned object * is a new object with refcount already incremented. 
*/ int isHashDeleted; - o = hashTypeGetValueObject(db, o, fieldobj->ptr, HFE_LAZY_EXPIRE, &isHashDeleted); + hashTypeGetValueObject(db, o, fieldobj->ptr, HFE_LAZY_EXPIRE, &val, NULL, &isHashDeleted); + o = val; if (isHashDeleted) goto noobj; diff --git a/src/t_hash.c b/src/t_hash.c index f114fa9b9..f8cfdf8ce 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -716,24 +716,28 @@ GetFieldRes hashTypeGetFromHashTable(robj *o, sds field, sds *value, uint64_t *e * If *vll is populated *vstr is set to NULL, so the caller can * always check the function return by checking the return value * for GETF_OK and checking if vll (or vstr) is NULL. - * + * expiredAt - if the field has an expiration time, it will be set to the expiration + * time of the field. Otherwise, will be set to EB_EXPIRE_TIME_INVALID. */ GetFieldRes hashTypeGetValue(redisDb *db, robj *o, sds field, unsigned char **vstr, - unsigned int *vlen, long long *vll, int hfeFlags) { - uint64_t expiredAt; + unsigned int *vlen, long long *vll, + int hfeFlags, uint64_t *expiredAt) +{ sds key; GetFieldRes res; + uint64_t dummy; + if (expiredAt == NULL) expiredAt = &dummy; if (o->encoding == OBJ_ENCODING_LISTPACK || o->encoding == OBJ_ENCODING_LISTPACK_EX) { *vstr = NULL; - res = hashTypeGetFromListpack(o, field, vstr, vlen, vll, &expiredAt); + res = hashTypeGetFromListpack(o, field, vstr, vlen, vll, expiredAt); if (res == GETF_NOT_FOUND) return GETF_NOT_FOUND; } else if (o->encoding == OBJ_ENCODING_HT) { sds value = NULL; - res = hashTypeGetFromHashTable(o, field, &value, &expiredAt); + res = hashTypeGetFromHashTable(o, field, &value, expiredAt); if (res == GETF_NOT_FOUND) return GETF_NOT_FOUND; @@ -744,7 +748,8 @@ GetFieldRes hashTypeGetValue(redisDb *db, robj *o, sds field, unsigned char **vs serverPanic("Unknown hash encoding"); } - if ((expiredAt >= (uint64_t) commandTimeSnapshot()) || (hfeFlags & HFE_LAZY_ACCESS_EXPIRED)) + if ((*expiredAt >= (uint64_t) commandTimeSnapshot()) || + (hfeFlags & HFE_LAZY_ACCESS_EXPIRED)) return GETF_OK; if (server.masterhost) { @@ -797,29 +802,46 @@ GetFieldRes hashTypeGetValue(redisDb *db, robj *o, sds field, unsigned char **vs * isHashDeleted - If attempted to access expired field and it's the last field * in the hash, then the hash will as well be deleted. In this case, * isHashDeleted will be set to 1. + * val - If the field is found, then val will be set to the value object. + * expireTime - If the field exists (`GETF_OK`) then expireTime will be set to + * the expiration time of the field. Otherwise, it will be set to 0. + * + * Returns 1 if the field exists, and 0 when it doesn't. 
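+ * + * Note: 'val' and 'expireTime' may each be NULL when the caller is not + * interested in that output.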
*/ -robj *hashTypeGetValueObject(redisDb *db, robj *o, sds field, int hfeFlags, int *isHashDeleted) { +int hashTypeGetValueObject(redisDb *db, robj *o, sds field, int hfeFlags, + robj **val, uint64_t *expireTime, int *isHashDeleted) { unsigned char *vstr; unsigned int vlen; long long vll; if (isHashDeleted) *isHashDeleted = 0; - GetFieldRes res = hashTypeGetValue(db,o,field,&vstr,&vlen,&vll, hfeFlags); + if (val) *val = NULL; + GetFieldRes res = hashTypeGetValue(db,o,field,&vstr,&vlen,&vll, + hfeFlags, expireTime); if (res == GETF_OK) { - if (vstr) return createStringObject((char*)vstr,vlen); - else return createStringObjectFromLongLong(vll); + /* expireTime set to 0 if the field has no expiration time */ + if (expireTime && (*expireTime == EB_EXPIRE_TIME_INVALID)) + *expireTime = 0; + + /* If expected to return the value, then create a new object */ + if (val) { + if (vstr) *val = createStringObject((char *) vstr, vlen); + else *val = createStringObjectFromLongLong(vll); + } + return 1; } if ((res == GETF_EXPIRED_HASH) && (isHashDeleted)) *isHashDeleted = 1; /* GETF_EXPIRED_HASH, GETF_EXPIRED, GETF_NOT_FOUND */ - return NULL; + return 0; } /* Test if the specified field exists in the given hash. If the field is - * expired (HFE), then it will be lazy deleted + * expired (HFE), then it will be lazy deleted unless HFE_LAZY_AVOID_FIELD_DEL + * hfeFlags is set. * * hfeFlags - Lookup HFE_LAZY_* flags * isHashDeleted - If attempted to access expired field and it is the last field @@ -833,7 +855,8 @@ int hashTypeExists(redisDb *db, robj *o, sds field, int hfeFlags, int *isHashDel unsigned int vlen = UINT_MAX; long long vll = LLONG_MAX; - GetFieldRes res = hashTypeGetValue(db, o, field, &vstr, &vlen, &vll, hfeFlags); + GetFieldRes res = hashTypeGetValue(db, o, field, &vstr, &vlen, &vll, + hfeFlags, NULL); if (isHashDeleted) *isHashDeleted = (res == GETF_EXPIRED_HASH) ? 1 : 0; return (res == GETF_OK) ? 
1 : 0; @@ -2212,7 +2235,7 @@ void hincrbyCommand(client *c) { if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; GetFieldRes res = hashTypeGetValue(c->db,o,c->argv[2]->ptr,&vstr,&vlen,&value, - HFE_LAZY_EXPIRE); + HFE_LAZY_EXPIRE, NULL); if (res == GETF_OK) { if (vstr) { if (string2ll((char*)vstr,vlen,&value) == 0) { @@ -2262,7 +2285,7 @@ void hincrbyfloatCommand(client *c) { } if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; GetFieldRes res = hashTypeGetValue(c->db, o,c->argv[2]->ptr,&vstr,&vlen,&ll, - HFE_LAZY_EXPIRE); + HFE_LAZY_EXPIRE, NULL); if (res == GETF_OK) { if (vstr) { if (string2ld((char*)vstr,vlen,&value) == 0) { @@ -2319,7 +2342,7 @@ static GetFieldRes addHashFieldToReply(client *c, robj *o, sds field, int hfeFla unsigned int vlen = UINT_MAX; long long vll = LLONG_MAX; - GetFieldRes res = hashTypeGetValue(c->db, o, field, &vstr, &vlen, &vll, hfeFlags); + GetFieldRes res = hashTypeGetValue(c->db, o, field, &vstr, &vlen, &vll, hfeFlags, NULL); if (res == GETF_OK) { if (vstr) { addReplyBulkCBuffer(c, vstr, vlen); @@ -2434,8 +2457,8 @@ void hstrlenCommand(client *c) { if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || checkType(c,o,OBJ_HASH)) return; - GetFieldRes res = hashTypeGetValue(c->db, o, c->argv[2]->ptr, &vstr, &vlen, &vll, - HFE_LAZY_EXPIRE); + GetFieldRes res = hashTypeGetValue(c->db, o, c->argv[2]->ptr, &vstr, + &vlen, &vll, HFE_LAZY_EXPIRE, NULL); if (res == GETF_NOT_FOUND || res == GETF_EXPIRED || res == GETF_EXPIRED_HASH) { addReply(c, shared.czero); diff --git a/tests/modules/hash.c b/tests/modules/hash.c index 0a9477390..462c21e1d 100644 --- a/tests/modules/hash.c +++ b/tests/modules/hash.c @@ -117,6 +117,67 @@ int test_open_key_subexpired_hget(RedisModuleCtx *ctx, RedisModuleString **argv, return REDISMODULE_OK; } +int test_open_key_hget_expire(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + if (argc<3) { + RedisModule_WrongArity(ctx); + return REDISMODULE_OK; + } + + RedisModuleKey *key = openKeyWithMode(ctx, argv[1], REDISMODULE_OPEN_KEY_ACCESS_EXPIRED); + if (!key) return REDISMODULE_OK; + + mstime_t expireAt; + + /* Let's test here that we get error if using invalid flags combination */ + RedisModule_Assert( + RedisModule_HashGet(key, + REDISMODULE_HASH_EXISTS | + REDISMODULE_HASH_EXPIRE_TIME, + argv[2], &expireAt, NULL) == REDISMODULE_ERR); + + /* Now let's get the expire time */ + RedisModule_HashGet(key, REDISMODULE_HASH_EXPIRE_TIME,argv[2],&expireAt,NULL); + RedisModule_ReplyWithLongLong(ctx, expireAt); + RedisModule_CloseKey(key); + return REDISMODULE_OK; +} + +/* Test variadic function to get two expiration times */ +int test_open_key_hget_two_expire(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + if (argc<3) { + RedisModule_WrongArity(ctx); + return REDISMODULE_OK; + } + + RedisModuleKey *key = openKeyWithMode(ctx, argv[1], REDISMODULE_OPEN_KEY_ACCESS_EXPIRED); + if (!key) return REDISMODULE_OK; + + mstime_t expireAt1, expireAt2; + RedisModule_HashGet(key,REDISMODULE_HASH_EXPIRE_TIME,argv[2],&expireAt1,argv[3],&expireAt2,NULL); + + /* return the two expire time */ + RedisModule_ReplyWithArray(ctx, 2); + RedisModule_ReplyWithLongLong(ctx, expireAt1); + RedisModule_ReplyWithLongLong(ctx, expireAt2); + RedisModule_CloseKey(key); + return REDISMODULE_OK; +} + +int test_open_key_hget_min_expire(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + if (argc!=2) { + RedisModule_WrongArity(ctx); + return REDISMODULE_OK; + } + + RedisModuleKey *key = openKeyWithMode(ctx, 
argv[1], REDISMODULE_READ); + if (!key) return REDISMODULE_OK; + + volatile mstime_t minExpire = RedisModule_HashFieldMinExpire(key); + RedisModule_ReplyWithLongLong(ctx, minExpire); + RedisModule_CloseKey(key); + return REDISMODULE_OK; +} + int numReplies; void ScanCallback(RedisModuleKey *key, RedisModuleString *field, RedisModuleString *value, void *privdata) { UNUSED(key); @@ -172,6 +233,12 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) return REDISMODULE_ERR; if (RedisModule_CreateCommand(ctx, "hash.hscan_expired", test_open_key_access_expired_hscan,"", 0, 0, 0) == REDISMODULE_ERR) return REDISMODULE_ERR; + if (RedisModule_CreateCommand(ctx, "hash.hget_expire", test_open_key_hget_expire,"", 0, 0, 0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + if (RedisModule_CreateCommand(ctx, "hash.hget_two_expire", test_open_key_hget_two_expire,"", 0, 0, 0) == REDISMODULE_ERR) + return REDISMODULE_ERR; + if (RedisModule_CreateCommand(ctx, "hash.hget_min_expire", test_open_key_hget_min_expire,"", 0, 0, 0) == REDISMODULE_ERR) + return REDISMODULE_ERR; return REDISMODULE_OK; } diff --git a/tests/unit/moduleapi/hash.tcl b/tests/unit/moduleapi/hash.tcl index 57b067369..0449b45bd 100644 --- a/tests/unit/moduleapi/hash.tcl +++ b/tests/unit/moduleapi/hash.tcl @@ -60,7 +60,7 @@ start_server {tags {"modules"}} { r debug set-active-expire 1 } {OK} {needs:debug} - test {test open key with REDISMODULE_OPEN_KEY_ACCESS_EXPIRED to scan expired fields} { + test {Module hash - test open key with REDISMODULE_OPEN_KEY_ACCESS_EXPIRED to scan expired fields} { r debug set-active-expire 0 r del H1 r hash.set H1 "n" f1 v1 f2 v2 f3 v3 @@ -70,6 +70,9 @@ start_server {tags {"modules"}} { assert_equal "f1 f2 f3 v1 v2 v3" [lsort [r hash.hscan_expired H1]] # Get expired field with flag REDISMODULE_OPEN_KEY_ACCESS_EXPIRED assert_equal {v1} [r hash.hget_expired H1 f1] + # Verify we can get the TTL of the expired field as well + set now [expr [clock seconds]*1000] + assert_range [r hash.hget_expire H1 f2] [expr {$now-1000}] [expr {$now+1000}] # Verify key doesn't exist on normal access without the flag assert_equal 0 [r hexists H1 f1] assert_equal 0 [r hexists H1 f2] @@ -78,7 +81,7 @@ start_server {tags {"modules"}} { r debug set-active-expire 1 } - test {test open key with REDISMODULE_OPEN_KEY_ACCESS_EXPIRED to scan expired key} { + test {Module hash - test open key with REDISMODULE_OPEN_KEY_ACCESS_EXPIRED to scan expired key} { r debug set-active-expire 0 r del H1 r hash.set H1 "n" f1 v1 f2 v2 f3 v3 @@ -92,6 +95,48 @@ start_server {tags {"modules"}} { assert_equal 0 [r exists H1] r debug set-active-expire 1 } + + test {Module hash - Read field expiration time} { + r del H1 + r hash.set H1 "n" f1 v1 f2 v2 f3 v3 f4 v4 + r hexpire H1 10 FIELDS 1 f1 + r hexpire H1 100 FIELDS 1 f2 + r hexpire H1 1000 FIELDS 1 f3 + + # Validate that the expiration times for fields f1, f2, and f3 are correct + set nowMsec [expr [clock seconds]*1000] + assert_range [r hash.hget_expire H1 f1] [expr {$nowMsec+9000}] [expr {$nowMsec+11000}] + assert_range [r hash.hget_expire H1 f2] [expr {$nowMsec+90000}] [expr {$nowMsec+110000}] + assert_range [r hash.hget_expire H1 f3] [expr {$nowMsec+900000}] [expr {$nowMsec+1100000}] + + # Assert that field f4 and f5_not_exist have no expiration (should return -1) + assert_equal [r hash.hget_expire H1 f4] -1 + assert_equal [r hash.hget_expire H1 f5_not_exist] -1 + + # Assert that variadic version of hget_expire works as well + assert_equal [r hash.hget_two_expire H1 f1 f2] [list 
[r hash.hget_expire H1 f1] [r hash.hget_expire H1 f2]] + } + + test {Module hash - Read minimum expiration time} { + r del H1 + r hash.set H1 "n" f1 v1 f2 v2 f3 v3 f4 v4 + r hexpire H1 100 FIELDS 1 f1 + r hexpire H1 10 FIELDS 1 f2 + r hexpire H1 1000 FIELDS 1 f3 + + # Validate that the minimum expiration time is correct + set nowMsec [expr [clock seconds]*1000] + assert_range [r hash.hget_min_expire H1] [expr {$nowMsec+9000}] [expr {$nowMsec+11000}] + assert_equal [r hash.hget_min_expire H1] [r hash.hget_expire H1 f2] + + # Assert error if key not found + assert_error {*key not found*} {r hash.hget_min_expire non_exist_hash} + + # Assert return -1 if no expiration (=REDISMODULE_NO_EXPIRE) + r del H2 + r hash.set H2 "n" f1 v1 + assert_equal [r hash.hget_min_expire H2] -1 + } test "Unload the module - hash" { assert_equal {OK} [r module unload hash] From c51c96656bf1f1801ae90a376f71890cbcdea4b4 Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Tue, 10 Dec 2024 09:16:30 +0200 Subject: [PATCH 23/42] modules API: Add test for ACL check of empty prefix (#13678) - Add empty string test for the new API `RedisModule_ACLCheckKeyPrefixPermissions`. - Fix order of checks: `(pattern[patternLen - 1] != '*' || patternLen == 0)` --------- Co-authored-by: debing.sun --- src/util.c | 2 +- tests/unit/moduleapi/aclcheck.tcl | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/util.c b/src/util.c index ee67be418..43793f15b 100644 --- a/src/util.c +++ b/src/util.c @@ -217,7 +217,7 @@ int prefixmatch(const char *pattern, int patternLen, * it can match any suffix of the string beyond the prefix. This check * remains outside stringmatchlen_impl() to keep its complexity manageable. */ - if (pattern[patternLen - 1] != '*' || patternLen == 0) + if (patternLen == 0 || pattern[patternLen - 1] != '*' ) return 0; /* Count backward the number of consecutive backslashes preceding the '*' diff --git a/tests/unit/moduleapi/aclcheck.tcl b/tests/unit/moduleapi/aclcheck.tcl index 063c5c5b0..cf89ea52e 100644 --- a/tests/unit/moduleapi/aclcheck.tcl +++ b/tests/unit/moduleapi/aclcheck.tcl @@ -50,7 +50,20 @@ start_server {tags {"modules acl"}} { catch {r aclcheck.set.check.prefixkey "~" ESCAPED_STAR* ESCAPED_STAR* 5} e assert_match "*DENIED KEY*" $e assert_equal [r aclcheck.set.check.prefixkey "~" NON_ESCAPED_STAR\\ NON_ESCAPED_STAR\\clothes 5] OK - } + } + + test {check ACL permissions versus empty string prefix} { + # The empty string should match all keys permissions + r acl setuser default +set resetkeys %R~* %W~* ~* + assert_equal [r aclcheck.set.check.prefixkey "~" "" CART_BOOKS_12 5] OK + assert_equal [r aclcheck.set.check.prefixkey "W" "" ORDER_2024_564879 5] OK + assert_equal [r aclcheck.set.check.prefixkey "R" "" PRODUCT_BOOKS_753376 5] OK + + # The empty string prefix should not match if the user cannot access all keys + r acl setuser default +set resetkeys %R~x* %W~x* ~x* + catch {r aclcheck.set.check.prefixkey "~" "" CART_BOOKS_12 5} e + assert_match "*DENIED KEY*" $e + } test {test module check acl for key perm} { # give permission for SET and block all keys but x(READ+WRITE), y(WRITE), z(READ) From f8942f93a6b156f2b05cd40940b9a23feb68de0c Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" Date: Thu, 12 Dec 2024 17:41:08 +0000 Subject: [PATCH 24/42] Avoid unnecessary hfield Creation/Deletion on updates in hashTypeSet.
HSET updates improvement of ~10% (#13655) This PR eliminates unnecessary creation and destruction of hfield objects, ensuring only required updates or insertions are performed. This reduces overhead and improves performance by streamlining field management in hash dictionaries, particularly in scenarios involving frequent updates, like the benchmarks in: - [memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values.yml) - [memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10.yml) To test it we can simply focus on the hfield related tests ``` tclsh tests/test_helper.tcl --single unit/type/hash-field-expire tclsh tests/test_helper.tcl --single unit/type/hash tclsh tests/test_helper.tcl --dump-logs --single unit/other ``` Extra check on full CI: - [x] https://github.com/filipecosta90/redis/actions/runs/12225788759 ## microbenchmark results 16.7% improvement (drop in time) in dictAddNonExistingRaw vs dictAddRaw ``` make REDIS_CFLAGS="-g -fno-omit-frame-pointer -O3 -DREDIS_TEST" -j $ ./src/redis-server test dict --accurate (...) Inserting via dictAddRaw() non existing: 5000000 items in 2592 ms (...) Inserting via dictAddNonExistingRaw() non existing: 5000000 items in 2160 ms ``` 8% improvement (drop in time) in find (non existing) and adding via `dictGetHash()+dictFindWithHash()+dictAddNonExistingRaw()` vs `dictFind()+dictAddRaw()` ``` make REDIS_CFLAGS="-g -fno-omit-frame-pointer -O3 -DREDIS_TEST" -j $ ./src/redis-server test dict --accurate (...) 
Find() and inserting via dictFind()+dictAddRaw() non existing: 5000000 items in 2983 ms Find() and inserting via dictGetHash()+dictFindWithHash()+dictAddNonExistingRaw() non existing: 5000000 items in 2740 ms ``` ## benchmark results To benchmark: ``` pip3 install redis-benchmarks-specification==0.1.250 taskset -c 0 ./src/redis-server --save '' --protected-mode no --daemonize yes redis-benchmarks-spec-client-runner --tests-regexp ".*load-hash.*" --flushall_on_every_test_start --flushall_on_every_test_end --cpuset_start_pos 2 --override-memtier-test-time 60 ``` Improvements on achievable throughput in: test | ops/sec unstable (59953d2df62b0e3c4996b8068836591e96349720) | ops/sec this PR (24af7190fdc0ad3c6d5957c17853490990c35dcc) | % change -- | -- | -- | -- memtier_benchmark-1key-load-hash-1K-fields-with-5B-values | 4097 | 5032 | 22.8% memtier_benchmark-100Kkeys-load-hash-50-fields-with-100B-values | 37658 | 44688 | 18.7% memtier_benchmark-100Kkeys-load-hash-50-fields-with-1000B-values | 14736 | 17350 | 17.7% memtier_benchmark-1Mkeys-load-hash-5-fields-with-1000B-values-pipeline-10 | 131848 | 143485 | 8.8% memtier_benchmark-1Mkeys-load-hash-hmset-5-fields-with-1000B-values | 82071 | 85681 | 4.4% memtier_benchmark-1Mkeys-load-hash-5-fields-with-1000B-values | 82882 | 86336 | 4.2% memtier_benchmark-10Mkeys-load-hash-5-fields-with-100B-values-pipeline-10 | 262502 | 273376 | 4.1% memtier_benchmark-10Kkeys-load-hash-50-fields-with-10000B-values | 2821 | 2936 | 4.1% --------- Co-authored-by: Moti Cohen --- src/defrag.c | 4 +- src/dict.c | 224 +++++++++++++++++++++++++++++++++++++++++--------- src/dict.h | 4 +- src/kvstore.c | 4 +- src/kvstore.h | 2 +- src/t_hash.c | 30 ++++--- 6 files changed, 205 insertions(+), 63 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index d3f4ceee6..71aa580f3 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -296,7 +296,7 @@ void activeDefragHfieldDictCallback(void *privdata, const dictEntry *de) { dictUseStoredKeyApi(d, 1); uint64_t hash = dictGetHash(d, newhf); dictUseStoredKeyApi(d, 0); - dictEntry *de = dictFindEntryByPtrAndHash(d, hf, hash); + dictEntry *de = dictFindByHashAndPtr(d, hf, hash); serverAssert(de); dictSetKey(d, de, newhf); } @@ -753,7 +753,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { * the pointer it holds, since it won't be able to do the string * compare, but we can find the entry using key hash and pointer. */ uint64_t hash = kvstoreGetHash(db->expires, newsds); - dictEntry *expire_de = kvstoreDictFindEntryByPtrAndHash(db->expires, slot, keysds, hash); + dictEntry *expire_de = kvstoreDictFindByHashAndPtr(db->expires, slot, keysds, hash); if (expire_de) kvstoreDictSetKey(db->expires, slot, expire_de, newsds); } diff --git a/src/dict.c b/src/dict.c index 0b72506ce..24b1eb80d 100644 --- a/src/dict.c +++ b/src/dict.c @@ -62,6 +62,7 @@ typedef struct { static void _dictExpandIfNeeded(dict *d); static void _dictShrinkIfNeeded(dict *d); +static void _dictRehashStepIfNeeded(dict *d, uint64_t visitedIdx); static signed char _dictNextExp(unsigned long size); static int _dictInit(dict *d, dictType *type); static dictEntry *dictGetNext(const dictEntry *de); @@ -509,6 +510,39 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) return dictInsertAtPosition(d, key, position); } +/* Low-level add function for non-existing keys: + * This function adds a new entry to the dictionary, assuming the key does not + * already exist. + * Parameters: + * - `dict *d`: Pointer to the dictionary structure. 
+ * - `void *key`: Pointer to the key being added. + * - `const uint64_t hash`: hash of the key being added. + * Guarantees: + * - The key is assumed to be non-existing. + * Note: + * Ensure that the key's uniqueness is managed externally before calling this function. */ +dictEntry *dictAddNonExistsByHash(dict *d, void *key, const uint64_t hash) { + /* Get the position for the new key, it should never be NULL. */ + unsigned long idx, table; + idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + + /* Rehash the hash table if needed */ + _dictRehashStepIfNeeded(d,idx); + + /* Expand the hash table if needed */ + _dictExpandIfNeeded(d); + + table = dictIsRehashing(d) ? 1 : 0; + idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); + void *position = &d->ht_table[table][idx]; + assert(position!=NULL); + + /* Dup the key if necessary. */ + if (d->type->keyDup) key = d->type->keyDup(d, key); + + return dictInsertAtPosition(d, key, position); +} + /* Adds a key in the dict's hashtable at the position returned by a preceding * call to dictFindPositionForInsert. This is a low level function which allows * splitting dictAddRaw in two parts. Normally, dictAddRaw or dictAdd should be @@ -608,17 +642,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { h = dictHashKey(d, key, d->useStoredKeyApi); idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); - if (dictIsRehashing(d)) { - if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { - /* If we have a valid hash entry at `idx` in ht0, we perform - * rehash on the bucket at `idx` (being more CPU cache friendly) */ - _dictBucketRehash(d, idx); - } else { - /* If the hash entry is not in ht0, we rehash the buckets based - * on the rehashidx (not CPU cache friendly). */ - _dictRehashStep(d); - } - } + /* Rehash the hash table if needed */ + _dictRehashStepIfNeeded(d,idx); keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); @@ -734,32 +759,21 @@ void dictRelease(dict *d) zfree(d); } -dictEntry *dictFind(dict *d, const void *key) -{ +dictEntry *dictFindByHash(dict *d, const void *key, const uint64_t hash) { dictEntry *he; - uint64_t h, idx, table; + uint64_t idx, table; if (dictSize(d) == 0) return NULL; /* dict is empty */ - h = dictHashKey(d, key, d->useStoredKeyApi); - idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]); keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); - if (dictIsRehashing(d)) { - if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { - /* If we have a valid hash entry at `idx` in ht0, we perform - * rehash on the bucket at `idx` (being more CPU cache friendly) */ - _dictBucketRehash(d, idx); - } else { - /* If the hash entry is not in ht0, we rehash the buckets based - * on the rehashidx (not CPU cache friendly). 
*/ - _dictRehashStep(d); - } - } + /* Rehash the hash table if needed */ + _dictRehashStepIfNeeded(d,idx); for (table = 0; table <= 1; table++) { if (table == 0 && (long)idx < d->rehashidx) continue; - idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); + idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); /* Prefetch the bucket at the calculated index */ redis_prefetch_read(&d->ht_table[table][idx]); @@ -781,6 +795,13 @@ dictEntry *dictFind(dict *d, const void *key) return NULL; } +dictEntry *dictFind(dict *d, const void *key) +{ + if (dictSize(d) == 0) return NULL; /* dict is empty */ + const uint64_t hash = dictHashKey(d, key, d->useStoredKeyApi); + return dictFindByHash(d,key,hash); +} + void *dictFetchValue(dict *d, const void *key) { dictEntry *he; @@ -1566,6 +1587,21 @@ static void _dictShrinkIfNeeded(dict *d) dictShrinkIfNeeded(d); } +static void _dictRehashStepIfNeeded(dict *d, uint64_t visitedIdx) { + if ((!dictIsRehashing(d)) || (d->pauserehash != 0)) + return; + /* rehashing not in progress if rehashidx == -1 */ + if ((long)visitedIdx >= d->rehashidx && d->ht_table[0][visitedIdx]) { + /* If we have a valid hash entry at `idx` in ht0, we perform + * rehash on the bucket at `idx` (being more CPU cache friendly) */ + _dictBucketRehash(d, visitedIdx); + } else { + /* If the hash entry is not in ht0, we rehash the buckets based + * on the rehashidx (not CPU cache friendly). */ + dictRehash(d,1); + } +} + /* Our hash table capability is a power of two */ static signed char _dictNextExp(unsigned long size) { @@ -1586,17 +1622,8 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) if (existing) *existing = NULL; idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]); - if (dictIsRehashing(d)) { - if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { - /* If we have a valid hash entry at `idx` in ht0, we perform - * rehash on the bucket at `idx` (being more CPU cache friendly) */ - _dictBucketRehash(d, idx); - } else { - /* If the hash entry is not in ht0, we rehash the buckets based - * on the rehashidx (not CPU cache friendly). */ - _dictRehashStep(d); - } - } + /* Rehash the hash table if needed */ + _dictRehashStepIfNeeded(d,idx); /* Expand the hash table if needed */ _dictExpandIfNeeded(d); @@ -1624,6 +1651,7 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) return bucket; } + void dictEmpty(dict *d, void(callback)(dict*)) { /* Someone may be monitoring a dict that started rehashing, before * destroying the dict fake completion. */ @@ -1649,7 +1677,7 @@ uint64_t dictGetHash(dict *d, const void *key) { * the hash value should be provided using dictGetHash. * no string / key comparison is performed. * return value is a pointer to the dictEntry if found, or NULL if not found. 
*/ -dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) { +dictEntry *dictFindByHashAndPtr(dict *d, const void *oldptr, const uint64_t hash) { dictEntry *he; unsigned long idx, table; @@ -1831,6 +1859,32 @@ char *stringFromLongLong(long long value) { return s; } +char *stringFromSubstring(void) { + #define LARGE_STRING_SIZE 10000 + #define MIN_STRING_SIZE 100 + #define MAX_STRING_SIZE 500 + static char largeString[LARGE_STRING_SIZE + 1]; + static int init = 0; + if (init == 0) { + /* Generate a large string */ + for (size_t i = 0; i < LARGE_STRING_SIZE; i++) { + /* Random printable ASCII character (33 to 126) */ + largeString[i] = 33 + (rand() % 94); + } + /* Null-terminate the large string */ + largeString[LARGE_STRING_SIZE] = '\0'; + init = 1; + } + /* Randomly choose a size between minSize and maxSize */ + size_t substringSize = MIN_STRING_SIZE + (rand() % (MAX_STRING_SIZE - MIN_STRING_SIZE + 1)); + size_t startIndex = rand() % (LARGE_STRING_SIZE - substringSize + 1); + /* Allocate memory for the substring (+1 for null terminator) */ + char *s = zmalloc(substringSize + 1); + memcpy(s, largeString + startIndex, substringSize); // Copy the substring + s[substringSize] = '\0'; // Null-terminate the string + return s; +} + dictType BenchmarkDictType = { hashCallback, NULL, @@ -1853,6 +1907,8 @@ int dictTest(int argc, char **argv, int flags) { long long start, elapsed; int retval; dict *dict = dictCreate(&BenchmarkDictType); + dictEntry* de = NULL; + dictEntry* existing = NULL; long count = 0; unsigned long new_dict_size, current_dict_used, remain_keys; int accurate = (flags & REDIS_TEST_ACCURATE); @@ -1992,13 +2048,99 @@ int dictTest(int argc, char **argv, int flags) { dictEmpty(dict, NULL); dictSetResizeEnabled(DICT_RESIZE_ENABLE); } + srand(12345); + start_benchmark(); + for (j = 0; j < count; j++) { + /* Create a dynamically allocated substring */ + char *key = stringFromSubstring(); + + /* Insert the range directly from the large string */ + de = dictAddRaw(dict, key, &existing); + assert(de != NULL || existing != NULL); + /* If key already exists NULL is returned so we need to free the temp key string */ + if (de == NULL) zfree(key); + } + end_benchmark("Inserting random substrings (100-500B) from large string with symbols"); + assert((long)dictSize(dict) <= count); + dictEmpty(dict, NULL); start_benchmark(); for (j = 0; j < count; j++) { retval = dictAdd(dict,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } - end_benchmark("Inserting"); + end_benchmark("Inserting via dictAdd() non existing"); + assert((long)dictSize(dict) == count); + + dictEmpty(dict, NULL); + + start_benchmark(); + for (j = 0; j < count; j++) { + de = dictAddRaw(dict,stringFromLongLong(j),NULL); + assert(de != NULL); + } + end_benchmark("Inserting via dictAddRaw() non existing"); + assert((long)dictSize(dict) == count); + + start_benchmark(); + for (j = 0; j < count; j++) { + void *key = stringFromLongLong(j); + de = dictAddRaw(dict,key,&existing); + assert(existing != NULL); + zfree(key); + } + end_benchmark("Inserting via dictAddRaw() existing (no insertion)"); + assert((long)dictSize(dict) == count); + + dictEmpty(dict, NULL); + + start_benchmark(); + for (j = 0; j < count; j++) { + void *key = stringFromLongLong(j); + const uint64_t hash = dictGetHash(dict, key); + de = dictAddNonExistsByHash(dict,key,hash); + assert(de != NULL); + } + end_benchmark("Inserting via dictAddNonExistsByHash() non existing"); + assert((long)dictSize(dict) == count); + + /* Wait for rehashing. 
*/ + while (dictIsRehashing(dict)) { + dictRehashMicroseconds(dict,100*1000); + } + + dictEmpty(dict, NULL); + + start_benchmark(); + for (j = 0; j < count; j++) { + /* Create a key */ + void *key = stringFromLongLong(j); + + /* Check if the key exists */ + dictEntry *entry = dictFind(dict, key); + assert(entry == NULL); + + /* Add the key */ + dictEntry *de = dictAddRaw(dict, key, NULL); + assert(de != NULL); + } + end_benchmark("Find() and inserting via dictFind()+dictAddRaw() non existing"); + + dictEmpty(dict, NULL); + + start_benchmark(); + for (j = 0; j < count; j++) { + /* Create a key */ + void *key = stringFromLongLong(j); + uint64_t hash = dictGetHash(dict, key); + + /* Check if the key exists */ + dictEntry *entry = dictFindByHash(dict, key, hash); + assert(entry == NULL); + de = dictAddNonExistsByHash(dict, key, hash); + assert(de != NULL); + } + end_benchmark("Find() and inserting via dictGetHash()+dictFindByHash()+dictAddNonExistsByHash() non existing"); assert((long)dictSize(dict) == count); /* Wait for rehashing. */ diff --git a/src/dict.h b/src/dict.h index e78833066..bcc207c47 100644 --- a/src/dict.h +++ b/src/dict.h @@ -196,6 +196,7 @@ int dictTryExpand(dict *d, unsigned long size); int dictShrink(dict *d, unsigned long size); int dictAdd(dict *d, void *key, void *val); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); +dictEntry *dictAddNonExistsByHash(dict *d, void *key, const uint64_t hash); void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing); dictEntry *dictInsertAtPosition(dict *d, void *key, void *position); dictEntry *dictAddOrFind(dict *d, void *key); @@ -207,6 +208,8 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table_index); void dictRelease(dict *d); dictEntry * dictFind(dict *d, const void *key); +dictEntry *dictFindByHash(dict *d, const void *key, const uint64_t hash); +dictEntry *dictFindByHashAndPtr(dict *d, const void *oldptr, const uint64_t hash); void *dictFetchValue(dict *d, const void *key); int dictShrinkIfNeeded(dict *d); int dictExpandIfNeeded(dict *d); @@ -249,7 +252,6 @@ uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); -dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size); size_t dictGetStatsMsg(char *buf, size_t bufsize, dictStats *stats, int full); diff --git a/src/kvstore.c b/src/kvstore.c index 34e73d6c6..6a4d123ad 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -766,12 +766,12 @@ dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx) return dictGetFairRandomKey(d); } -dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash) +dictEntry *kvstoreDictFindByHashAndPtr(kvstore *kvs, int didx, const void *oldptr, uint64_t hash) { dict *d = kvstoreGetDict(kvs, didx); if (!d) return NULL; - return dictFindEntryByPtrAndHash(d, oldptr, hash); + return dictFindByHashAndPtr(d, oldptr, hash); } unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count) diff --git a/src/kvstore.h b/src/kvstore.h index 3c3f7948c..9e2e4afe0 100644 --- 
a/src/kvstore.h +++ b/src/kvstore.h @@ -73,7 +73,7 @@ void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id); dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di); dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx); dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx); -dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash); +dictEntry *kvstoreDictFindByHashAndPtr(kvstore *kvs, int didx, const void *oldptr, uint64_t hash); unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count); int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size); unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); diff --git a/src/t_hash.c b/src/t_hash.c index f8cfdf8ce..c6e48b77a 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -966,27 +966,25 @@ int hashTypeSet(redisDb *db, robj *o, sds field, sds value, int flags) { hashTypeConvert(o, OBJ_ENCODING_HT, &db->hexpires); } else if (o->encoding == OBJ_ENCODING_HT) { - hfield newField = hfieldNew(field, sdslen(field), 0); dict *ht = o->ptr; dictEntry *de, *existing; - - /* stored key is different than lookup key */ - dictUseStoredKeyApi(ht, 1); - de = dictAddRaw(ht, newField, &existing); - dictUseStoredKeyApi(ht, 0); - - /* If field already exists, then update "field". "Value" will be set afterward */ - if (de == NULL) { - if (flags & HASH_SET_KEEP_TTL) { - /* keep old field along with TTL */ - hfieldFree(newField); - } else { - /* If attached TTL to the old field, then remove it from hash's private ebuckets */ + const uint64_t hash = dictGetHash(ht,field); + /* check if field already exists */ + existing = dictFindByHash(ht, field, hash); + /* check if field already exists */ + if (existing == NULL) { + hfield newField = hfieldNew(field, sdslen(field), 0); + dictUseStoredKeyApi(ht, 1); + de = dictAddNonExistsByHash(ht, newField, hash); + dictUseStoredKeyApi(ht, 0); + } else { + /* If attached TTL to the old field, then remove it from hash's + * private ebuckets when HASH_SET_KEEP_TTL is not set. */ + if (!(flags & HASH_SET_KEEP_TTL)) { hfield oldField = dictGetKey(existing); hfieldPersist(o, oldField); - hfieldFree(oldField); - dictSetKey(ht, existing, newField); } + /* Free the old value */ sdsfree(dictGetVal(existing)); update = 1; de = existing; From 684077682e5826ab658da975c9536df1584b425f Mon Sep 17 00:00:00 2001 From: Nugine Date: Wed, 18 Dec 2024 14:41:04 +0800 Subject: [PATCH 25/42] Fix bug in PFMERGE command (#13672) The bug was introduced in #13558 . When merging dense hll structures, `hllDenseCompress` writes to wrong location and the result will be zero. The unit tests didn't cover this case. 
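For context, a minimal sketch of the stale-pointer pattern behind the bug (abbreviated; the reason `o->ptr` can move, e.g. a sparse-to-dense promotion earlier in the command, is an assumption, but the refresh itself is exactly the one-line fix in the diff below):

```c
/* 'hdr' is captured early, but later steps may reallocate or swap o->ptr,
 * leaving 'hdr' dangling and making hllDenseCompress() write to the wrong
 * location. Re-reading it just before the final write is the fix. */
struct hllhdr *hdr = o->ptr;
/* ... merge all source HLLs into the 'max' register array ... */
hdr = o->ptr;                           /* refresh: o->ptr may have changed */
hllDenseCompress(hdr->registers, max);  /* now targets the live buffer */
```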
This PR + fixes the bug + adds `PFDEBUG SIMD (ON|OFF)` for unit tests + adds a new TCL test to cover the cases Synchronized from https://github.com/valkey-io/valkey/pull/1293 --------- Signed-off-by: Xuyang Wang Co-authored-by: debing.sun --- src/hyperloglog.c | 33 +++++++++++++++++++++++++++++-- tests/unit/hyperloglog.tcl | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index aa51d4eab..742c47b1a 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -190,6 +190,13 @@ struct hllhdr { static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected"; +#ifdef HAVE_AVX2 +static int simd_enabled = 1; +#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2")) +#else +#define HLL_USE_AVX2 0 +#endif + /* =========================== Low level bit macros ========================= */ /* Macros to access the dense representation. @@ -1155,7 +1162,7 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { void hllMergeDense(uint8_t* reg_raw, const uint8_t* reg_dense) { #ifdef HAVE_AVX2 if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { - if (__builtin_cpu_supports("avx2")) { + if (HLL_USE_AVX2) { hllMergeDenseAVX2(reg_raw, reg_dense); return; } @@ -1315,7 +1322,7 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { #ifdef HAVE_AVX2 if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { - if (__builtin_cpu_supports("avx2")) { + if (HLL_USE_AVX2) { hllDenseCompressAVX2(reg_dense, reg_raw); return; } @@ -1587,6 +1594,7 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. */ if (use_dense) { + hdr = o->ptr; hllDenseCompress(hdr->registers, max); } else { for (j = 0; j < HLL_REGISTERS; j++) { @@ -1724,6 +1732,7 @@ cleanup: * PFDEBUG DECODE * PFDEBUG ENCODING * PFDEBUG TODENSE + * PFDEBUG SIMD (ON|OFF) */ void pfdebugCommand(client *c) { char *cmd = c->argv[1]->ptr; @@ -1731,6 +1740,26 @@ void pfdebugCommand(client *c) { robj *o; int j; + if (!strcasecmp(cmd, "simd")) { + if (c->argc != 3) goto arityerr; + + if (!strcasecmp(c->argv[2]->ptr, "on")) { +#ifdef HAVE_AVX2 + simd_enabled = 1; +#endif + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { +#ifdef HAVE_AVX2 + simd_enabled = 0; +#endif + } else { + addReplyError(c, "Argument must be ON or OFF"); + } + + addReplyStatus(c, HLL_USE_AVX2 ? 
"enabled" : "disabled"); + + return; + } + o = lookupKeyWrite(c->db,c->argv[2]); if (o == NULL) { addReplyError(c,"The specified key does not exist"); diff --git a/tests/unit/hyperloglog.tcl b/tests/unit/hyperloglog.tcl index ee437189f..f1bbeace9 100644 --- a/tests/unit/hyperloglog.tcl +++ b/tests/unit/hyperloglog.tcl @@ -222,6 +222,46 @@ start_server {tags {"hll"}} { assert_equal 3 [r pfcount destkey] } + test {PFMERGE results with simd} { + r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t} + for {set x 1} {$x < 2000} {incr x} { + r pfadd hll1{t} [expr rand()] + } + for {set x 1} {$x < 4000} {incr x} { + r pfadd hll2{t} [expr rand()] + } + for {set x 1} {$x < 8000} {incr x} { + r pfadd hll3{t} [expr rand()] + } + assert {[r pfcount hll1{t}] > 0} + assert {[r pfcount hll2{t}] > 0} + assert {[r pfcount hll3{t}] > 0} + + r pfdebug simd off + set scalar [r pfcount hll1{t} hll2{t} hll3{t}] + r pfdebug simd on + set simd [r pfcount hll1{t} hll2{t} hll3{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + r pfdebug simd off + r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t} + r pfdebug simd on + r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t} + + set scalar [r pfcount hllscalar{t}] + set simd [r pfcount hllsimd{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + set scalar [r get hllscalar{t}] + set simd [r get hllsimd{t}] + assert_equal $scalar $simd + + } {} {needs:pfdebug} + test {PFCOUNT multiple-keys merge returns cardinality of union #1} { r del hll1{t} hll2{t} hll3{t} for {set x 1} {$x < 10000} {incr x} { From 1f09a55ebae402f4031d12e2c7c06fb64fdd7ed1 Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Fri, 20 Dec 2024 09:39:14 +0800 Subject: [PATCH 26/42] Avoid importing memory aligned malloc (#13693) This PR is based on the commits from PR https://github.com/valkey-io/valkey/pull/1442. We deprecate the usage of classic malloc and free, but under certain circumstances they might get imported from intrinsics. The original thought is we should just override malloc and free to use zmalloc and zfree, but I think we should continue to deprecate it to avoid accidental imports of allocations. --------- Co-authored-by: Madelyn Olson --- src/hyperloglog.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 742c47b1a..0533e0f75 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -4,8 +4,13 @@ * Copyright (c) 2014-Present, Redis Ltd. * All rights reserved. * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2) or the Server Side Public License v1 (SSPLv1). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. */ #include "server.h" @@ -14,6 +19,9 @@ #include #ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H #include #endif From 08c2b276fbf8ecb13692449ef9c2cb43aa82ba8e Mon Sep 17 00:00:00 2001 From: Moti Cohen Date: Sun, 22 Dec 2024 14:10:07 +0200 Subject: [PATCH 27/42] Optimize dict `no_value` also for even addresses (#13683) This pull request enhances the no_value flag option in the dict implementation, which is used to store keys without associated values. 
Previously, when a key had an odd memory address and was the only item in a table entry, it could be directly stored as a pointer without requiring an intermediate dictEntry. With this update, the optimization has been extended to also handle keys with even memory addresses in the same manner. --- src/dict.c | 258 ++++++++++++++++++++++++++++++--------------------- src/dict.h | 10 +- src/server.c | 3 +- 3 files changed, 157 insertions(+), 114 deletions(-) diff --git a/src/dict.c b/src/dict.c index 24b1eb80d..3bfcc7017 100644 --- a/src/dict.c +++ b/src/dict.c @@ -120,14 +120,16 @@ uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len) { * pointer actually points to. If the least bit is set, it's a key. Otherwise, * the bit pattern of the least 3 significant bits mark the kind of entry. */ -#define ENTRY_PTR_MASK 7 /* 111 */ -#define ENTRY_PTR_NORMAL 0 /* 000 */ -#define ENTRY_PTR_NO_VALUE 2 /* 010 */ +#define ENTRY_PTR_MASK 7 /* 111 */ +#define ENTRY_PTR_NORMAL 0 /* 000 : If a pointer to an entry with value. */ +#define ENTRY_PTR_IS_ODD_KEY 1 /* XX1 : If a pointer to odd key address (must be 1). */ +#define ENTRY_PTR_IS_EVEN_KEY 2 /* 010 : If a pointer to even key address. (must be 2 or 4). */ +#define ENTRY_PTR_NO_VALUE 4 /* 100 : If a pointer to an entry without value. */ /* Returns 1 if the entry pointer is a pointer to a key, rather than to an * allocated entry. Returns 0 otherwise. */ static inline int entryIsKey(const dictEntry *de) { - return (uintptr_t)(void *)de & 1; + return ((uintptr_t)de & (ENTRY_PTR_IS_ODD_KEY | ENTRY_PTR_IS_EVEN_KEY)); } /* Returns 1 if the pointer is actually a pointer to a dictEntry struct. Returns @@ -156,7 +158,6 @@ static inline dictEntry *encodeMaskedPtr(const void *ptr, unsigned int bits) { } static inline void *decodeMaskedPtr(const dictEntry *de) { - assert(!entryIsKey(de)); return (void *)((uintptr_t)(void *)de & ~ENTRY_PTR_MASK); } @@ -327,18 +328,17 @@ static void rehashEntriesInBucketAtIndex(dict *d, uint64_t idx) { h = idx & DICTHT_SIZE_MASK(d->ht_size_exp[1]); } if (d->type->no_value) { - if (d->type->keys_are_odd && !d->ht_table[1][h]) { - /* Destination bucket is empty and we can store the key - * directly without an allocated entry. Free the old entry - * if it's an allocated entry. - * - * TODO: Add a flag 'keys_are_even' and if set, we can use - * this optimization for these dicts too. We can set the LSB - * bit when stored as a dict entry and clear it again when - * we need the key back. */ - assert(entryIsKey(key)); + if (!d->ht_table[1][h]) { + /* The destination bucket is empty, allowing the key to be stored + * directly without allocating a dictEntry. If an old entry was + * previously allocated, free its memory. */ if (!entryIsKey(de)) zfree(decodeMaskedPtr(de)); - de = key; + + if (d->type->keys_are_odd) + de = key; /* ENTRY_PTR_IS_ODD_KEY trivially set by the odd key. */ + else + de = encodeMaskedPtr(key, ENTRY_PTR_IS_EVEN_KEY); + } else if (entryIsKey(de)) { /* We don't have an allocated entry but we need one. */ de = createEntryNoValue(key, d->ht_table[1][h]); @@ -556,16 +556,17 @@ dictEntry *dictInsertAtPosition(dict *d, void *key, void *position) { assert(bucket >= &d->ht_table[htidx][0] && bucket <= &d->ht_table[htidx][DICTHT_SIZE_MASK(d->ht_size_exp[htidx])]); if (d->type->no_value) { - if (d->type->keys_are_odd && !*bucket) { - /* We can store the key directly in the destination bucket without the - * allocated entry. 
- * - * TODO: Add a flag 'keys_are_even' and if set, we can use this - * optimization for these dicts too. We can set the LSB bit when - * stored as a dict entry and clear it again when we need the key - * back. */ - entry = key; - assert(entryIsKey(entry)); + if (!*bucket) { + /* We can store the key directly in the destination bucket without + * allocating dictEntry. + */ + if (d->type->keys_are_odd) { + entry = key; + assert(entryIsKey(entry)); + /* The flag ENTRY_PTR_IS_ODD_KEY (=0x1) is already aligned with LSB bit */ + } else { + entry = encodeMaskedPtr(key, ENTRY_PTR_IS_EVEN_KEY); + } } else { /* Allocate an entry without value. */ entry = createEntryNoValue(key, *bucket); @@ -908,7 +909,10 @@ double dictIncrDoubleVal(dictEntry *de, double val) { } void *dictGetKey(const dictEntry *de) { - if (entryIsKey(de)) return (void*)de; + /* if entryIsKey() */ + if ((uintptr_t)de & ENTRY_PTR_IS_ODD_KEY) return (void *) de; + if ((uintptr_t)de & ENTRY_PTR_IS_EVEN_KEY) return decodeMaskedPtr(de); + /* Regular entry */ if (entryIsNoValue(de)) return decodeEntryNoValue(de)->key; return de->key; } @@ -1906,7 +1910,7 @@ int dictTest(int argc, char **argv, int flags) { long j; long long start, elapsed; int retval; - dict *dict = dictCreate(&BenchmarkDictType); + dict *d = dictCreate(&BenchmarkDictType); dictEntry* de = NULL; dictEntry* existing = NULL; long count = 0; @@ -1926,12 +1930,12 @@ int dictTest(int argc, char **argv, int flags) { TEST("Add 16 keys and verify dict resize is ok") { dictSetResizeEnabled(DICT_RESIZE_ENABLE); for (j = 0; j < 16; j++) { - retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + retval = dictAdd(d,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } - while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == 16); - assert(dictBuckets(dict) == 16); + while (dictIsRehashing(d)) dictRehashMicroseconds(d,1000); + assert(dictSize(d) == 16); + assert(dictBuckets(d) == 16); } TEST("Use DICT_RESIZE_AVOID to disable the dict resize and pad to (dict_force_resize_ratio * 16)") { @@ -1940,112 +1944,112 @@ int dictTest(int argc, char **argv, int flags) { * dict_force_resize_ratio in next test. */ dictSetResizeEnabled(DICT_RESIZE_AVOID); for (j = 16; j < (long)dict_force_resize_ratio * 16; j++) { - retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + retval = dictAdd(d,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } current_dict_used = dict_force_resize_ratio * 16; - assert(dictSize(dict) == current_dict_used); - assert(dictBuckets(dict) == 16); + assert(dictSize(d) == current_dict_used); + assert(dictBuckets(d) == 16); } TEST("Add one more key, trigger the dict resize") { - retval = dictAdd(dict,stringFromLongLong(current_dict_used),(void*)(current_dict_used)); + retval = dictAdd(d,stringFromLongLong(current_dict_used),(void*)(current_dict_used)); assert(retval == DICT_OK); current_dict_used++; new_dict_size = 1UL << _dictNextExp(current_dict_used); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == 16); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. 
*/ dictSetResizeEnabled(DICT_RESIZE_ENABLE); - while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + while (dictIsRehashing(d)) dictRehashMicroseconds(d,1000); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == 0); } TEST("Delete keys until we can trigger shrink in next test") { /* Delete keys until we can satisfy (1 / HASHTABLE_MIN_FILL) in the next test. */ for (j = new_dict_size / HASHTABLE_MIN_FILL + 1; j < (long)current_dict_used; j++) { char *key = stringFromLongLong(j); - retval = dictDelete(dict, key); + retval = dictDelete(d, key); zfree(key); assert(retval == DICT_OK); } current_dict_used = new_dict_size / HASHTABLE_MIN_FILL + 1; - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == 0); } TEST("Delete one more key, trigger the dict resize") { current_dict_used--; char *key = stringFromLongLong(current_dict_used); - retval = dictDelete(dict, key); + retval = dictDelete(d, key); zfree(key); unsigned long oldDictSize = new_dict_size; new_dict_size = 1UL << _dictNextExp(current_dict_used); assert(retval == DICT_OK); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == oldDictSize); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == oldDictSize); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. */ - while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + while (dictIsRehashing(d)) dictRehashMicroseconds(d,1000); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == 0); } TEST("Empty the dictionary and add 128 keys") { - dictEmpty(dict, NULL); + dictEmpty(d, NULL); for (j = 0; j < 128; j++) { - retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + retval = dictAdd(d,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } - while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == 128); - assert(dictBuckets(dict) == 128); + while (dictIsRehashing(d)) dictRehashMicroseconds(d,1000); + assert(dictSize(d) == 128); + assert(dictBuckets(d) == 128); } TEST("Use DICT_RESIZE_AVOID to disable the dict resize and reduce to 3") { /* Use DICT_RESIZE_AVOID to disable the dict reset, and reduce * the number of keys until we can trigger shrinking in next test. 
*/ dictSetResizeEnabled(DICT_RESIZE_AVOID); - remain_keys = DICTHT_SIZE(dict->ht_size_exp[0]) / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) + 1; + remain_keys = DICTHT_SIZE(d->ht_size_exp[0]) / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) + 1; for (j = remain_keys; j < 128; j++) { char *key = stringFromLongLong(j); - retval = dictDelete(dict, key); + retval = dictDelete(d, key); zfree(key); assert(retval == DICT_OK); } current_dict_used = remain_keys; - assert(dictSize(dict) == remain_keys); - assert(dictBuckets(dict) == 128); + assert(dictSize(d) == remain_keys); + assert(dictBuckets(d) == 128); } TEST("Delete one more key, trigger the dict resize") { current_dict_used--; char *key = stringFromLongLong(current_dict_used); - retval = dictDelete(dict, key); + retval = dictDelete(d, key); zfree(key); new_dict_size = 1UL << _dictNextExp(current_dict_used); assert(retval == DICT_OK); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == 128); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == new_dict_size); /* Wait for rehashing. */ dictSetResizeEnabled(DICT_RESIZE_ENABLE); - while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); - assert(dictSize(dict) == current_dict_used); - assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); - assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + while (dictIsRehashing(d)) dictRehashMicroseconds(d,1000); + assert(dictSize(d) == current_dict_used); + assert(DICTHT_SIZE(d->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(d->ht_size_exp[1]) == 0); } TEST("Restore to original state") { - dictEmpty(dict, NULL); + dictEmpty(d, NULL); dictSetResizeEnabled(DICT_RESIZE_ENABLE); } srand(12345); @@ -2055,61 +2059,61 @@ int dictTest(int argc, char **argv, int flags) { char *key = stringFromSubstring(); /* Insert the range directly from the large string */ - de = dictAddRaw(dict, key, &existing); + de = dictAddRaw(d, key, &existing); assert(de != NULL || existing != NULL); /* If key already exists NULL is returned so we need to free the temp key string */ if (de == NULL) zfree(key); } end_benchmark("Inserting random substrings (100-500B) from large string with symbols"); - assert((long)dictSize(dict) <= count); - dictEmpty(dict, NULL); + assert((long)dictSize(d) <= count); + dictEmpty(d, NULL); start_benchmark(); for (j = 0; j < count; j++) { - retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + retval = dictAdd(d,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } end_benchmark("Inserting via dictAdd() non existing"); - assert((long)dictSize(dict) == count); + assert((long)dictSize(d) == count); - dictEmpty(dict, NULL); + dictEmpty(d, NULL); start_benchmark(); for (j = 0; j < count; j++) { - de = dictAddRaw(dict,stringFromLongLong(j),NULL); + de = dictAddRaw(d,stringFromLongLong(j),NULL); assert(de != NULL); } end_benchmark("Inserting via dictAddRaw() non existing"); - assert((long)dictSize(dict) == count); + assert((long)dictSize(d) == count); start_benchmark(); for (j = 0; j < count; j++) { void *key = stringFromLongLong(j); - de = dictAddRaw(dict,key,&existing); + de = dictAddRaw(d,key,&existing); assert(existing != NULL); zfree(key); } end_benchmark("Inserting via dictAddRaw() existing (no insertion)"); - assert((long)dictSize(dict) == count); + assert((long)dictSize(d) == count); - dictEmpty(dict, NULL); + dictEmpty(d, NULL); 
start_benchmark(); for (j = 0; j < count; j++) { void *key = stringFromLongLong(j); - const uint64_t hash = dictGetHash(dict, key); - de = dictAddNonExistsByHash(dict,key,hash); + const uint64_t hash = dictGetHash(d, key); + de = dictAddNonExistsByHash(d,key,hash); assert(de != NULL); } end_benchmark("Inserting via dictAddNonExistsByHash() non existing"); - assert((long)dictSize(dict) == count); + assert((long)dictSize(d) == count); /* Wait for rehashing. */ - while (dictIsRehashing(dict)) { - dictRehashMicroseconds(dict,100*1000); + while (dictIsRehashing(d)) { + dictRehashMicroseconds(d,100*1000); } - dictEmpty(dict, NULL); + dictEmpty(d, NULL); start_benchmark(); for (j = 0; j < count; j++) { @@ -2117,41 +2121,41 @@ int dictTest(int argc, char **argv, int flags) { void *key = stringFromLongLong(j); /* Check if the key exists */ - dictEntry *entry = dictFind(dict, key); + dictEntry *entry = dictFind(d, key); assert(entry == NULL); /* Add the key */ - dictEntry *de = dictAddRaw(dict, key, NULL); + dictEntry *de = dictAddRaw(d, key, NULL); assert(de != NULL); } end_benchmark("Find() and inserting via dictFind()+dictAddRaw() non existing"); - dictEmpty(dict, NULL); + dictEmpty(d, NULL); start_benchmark(); for (j = 0; j < count; j++) { /* Create a key */ void *key = stringFromLongLong(j); - uint64_t hash = dictGetHash(dict, key); + uint64_t hash = dictGetHash(d, key); - /* Check if the key exists */ - dictEntry *entry = dictFindByHash(dict, key, hash); + /* Check if the key exists */ + dictEntry *entry = dictFindByHash(d, key, hash); assert(entry == NULL); - de = dictAddNonExistsByHash(dict, key, hash); + de = dictAddNonExistsByHash(d, key, hash); assert(de != NULL); } end_benchmark("Find() and inserting via dictGetHash()+dictFindByHash()+dictAddNonExistsByHash() non existing"); - assert((long)dictSize(dict) == count); + assert((long)dictSize(d) == count); /* Wait for rehashing. 
*/ - while (dictIsRehashing(dict)) { - dictRehashMicroseconds(dict,100*1000); + while (dictIsRehashing(d)) { + dictRehashMicroseconds(d,100*1000); } start_benchmark(); for (j = 0; j < count; j++) { char *key = stringFromLongLong(j); - dictEntry *de = dictFind(dict,key); + dictEntry *de = dictFind(d,key); assert(de != NULL); zfree(key); } @@ -2160,7 +2164,7 @@ int dictTest(int argc, char **argv, int flags) { start_benchmark(); for (j = 0; j < count; j++) { char *key = stringFromLongLong(j); - dictEntry *de = dictFind(dict,key); + dictEntry *de = dictFind(d,key); assert(de != NULL); zfree(key); } @@ -2169,7 +2173,7 @@ int dictTest(int argc, char **argv, int flags) { start_benchmark(); for (j = 0; j < count; j++) { char *key = stringFromLongLong(rand() % count); - dictEntry *de = dictFind(dict,key); + dictEntry *de = dictFind(d,key); assert(de != NULL); zfree(key); } @@ -2177,7 +2181,7 @@ int dictTest(int argc, char **argv, int flags) { start_benchmark(); for (j = 0; j < count; j++) { - dictEntry *de = dictGetRandomKey(dict); + dictEntry *de = dictGetRandomKey(d); assert(de != NULL); } end_benchmark("Accessing random keys"); @@ -2186,7 +2190,7 @@ int dictTest(int argc, char **argv, int flags) { for (j = 0; j < count; j++) { char *key = stringFromLongLong(rand() % count); key[0] = 'X'; - dictEntry *de = dictFind(dict,key); + dictEntry *de = dictFind(d,key); assert(de == NULL); zfree(key); } @@ -2195,14 +2199,52 @@ int dictTest(int argc, char **argv, int flags) { start_benchmark(); for (j = 0; j < count; j++) { char *key = stringFromLongLong(j); - retval = dictDelete(dict,key); + retval = dictDelete(d,key); assert(retval == DICT_OK); key[0] += 17; /* Change first number to letter. */ - retval = dictAdd(dict,key,(void*)j); + retval = dictAdd(d,key,(void*)j); assert(retval == DICT_OK); } end_benchmark("Removing and adding"); - dictRelease(dict); + dictRelease(d); + + TEST("Use dict without values (no_value=1)") { + dictType dt = BenchmarkDictType; + dt.no_value = 1; + + /* Allocate array of size count and fill it with keys (stringFromLongLong(j) */ + char **lookupKeys = zmalloc(sizeof(char*) * count); + for (long j = 0; j < count; j++) + lookupKeys[j] = stringFromLongLong(j); + + + /* Add keys without values. */ + dict *d = dictCreate(&dt); + for (j = 0; j < count; j++) { + retval = dictAdd(d,lookupKeys[j],NULL); + assert(retval == DICT_OK); + } + + /* Now, we should be able to find the keys. */ + for (j = 0; j < count; j++) { + dictEntry *de = dictFind(d,lookupKeys[j]); + assert(de != NULL); + } + + /* Find non exists keys. */ + for (j = 0; j < count; j++) { + /* Temporarily override first char of key */ + char tmp = lookupKeys[j][0]; + lookupKeys[j][0] = 'X'; + dictEntry *de = dictFind(d,lookupKeys[j]); + lookupKeys[j][0] = tmp; + assert(de == NULL); + } + + dictRelease(d); + zfree(lookupKeys); + } + return 0; } #endif diff --git a/src/dict.h b/src/dict.h index bcc207c47..12a5c9918 100644 --- a/src/dict.h +++ b/src/dict.h @@ -53,12 +53,12 @@ typedef struct dictType { /* Flags */ /* The 'no_value' flag, if set, indicates that values are not used, i.e. the * dict is a set. When this flag is set, it's not possible to access the - * value of a dictEntry and it's also impossible to use dictSetKey(). Entry - * metadata can also not be used. */ + * value of a dictEntry and it's also impossible to use dictSetKey(). It + * enables an optimization to store a key directly without an allocating + * dictEntry in between, if it is the only key in the bucket. 
*/ unsigned int no_value:1; - /* If no_value = 1 and all keys are odd (LSB=1), setting keys_are_odd = 1 - * enables one more optimization: to store a key without an allocated - * dictEntry. */ + /* This flag is required for `no_value` optimization since the optimization + * reuses LSB bits as metadata */ unsigned int keys_are_odd:1; /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that * flag is set. */ diff --git a/src/server.c b/src/server.c index 973b02001..4b729fede 100644 --- a/src/server.c +++ b/src/server.c @@ -636,7 +636,8 @@ dictType clientDictType = { NULL, /* key dup */ NULL, /* val dup */ dictClientKeyCompare, /* key compare */ - .no_value = 1 /* no values in this dict */ + .no_value = 1, /* no values in this dict */ + .keys_are_odd = 0 /* a client pointer is not an odd pointer */ }; /* This function is called once a background process of some kind terminates, From 64a40b20d906f019a32fcb227c938768079dd9ed Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Mon, 23 Dec 2024 14:16:40 +0800 Subject: [PATCH 28/42] Async IO Threads (#13695) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Introduction Redis introduced IO Thread in 6.0, allowing IO threads to handle client request reading, command parsing and reply writing, thereby improving performance. The current IO thread implementation has a few drawbacks. - The main thread is blocked during IO thread read/write operations and must wait for all IO threads to complete their current tasks before it can continue execution. In other words, the entire process is synchronous. This prevents the efficient utilization of multi-core CPUs for parallel processing. - When the number of clients and requests increases moderately, it causes all IO threads to reach full CPU utilization due to the busy wait mechanism used by the IO threads. This makes it challenging for us to determine which part of Redis has reached its bottleneck. - When IO threads are enabled with TLS and io-threads-do-reads, a disconnection of a connection with pending data may result in it being assigned to multiple IO threads simultaneously. This can cause race conditions and trigger assertion failures. Related issue: redis#12540 Therefore, we designed an asynchronous IO threads solution. The IO threads adopt an event-driven model, with the main thread dedicated to command processing, meanwhile, the IO threads handle client read and write operations in parallel. ## Implementation ### Overall As before, we did not change the fact that all client commands must be executed on the main thread, because Redis was originally designed to be single-threaded, and processing commands in a multi-threaded manner would inevitably introduce numerous race and synchronization issues. But now each IO thread has independent event loop, therefore, IO threads can use a multiplexing approach to handle client read and write operations, eliminating the CPU overhead caused by busy-waiting. the execution process can be briefly described as follows: the main thread assigns clients to IO threads after accepting connections, IO threads will notify the main thread when clients finish reading and parsing queries, then the main thread processes queries from IO threads and generates replies, IO threads handle writing reply to clients after receiving clients list from main thread, and then continue to handle client read and write events. ### Each IO thread has independent event loop We now assign each IO thread its own event loop. 
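A skeletal illustration of that layout (the struct and thread function here are illustrative sketches, not the actual iothread.c definitions):

```c
#include <pthread.h>
#include "ae.h"

/* Illustrative only: each IO thread owns a private aeEventLoop and
 * multiplexes the read/write events of the clients assigned to it,
 * independently of the main thread's loop. */
typedef struct IOThreadSketch {
    int id;
    pthread_t tid;
    aeEventLoop *el;   /* this thread's own event loop */
} IOThreadSketch;

static void *ioThreadMainSketch(void *arg) {
    IOThreadSketch *t = arg;
    aeMain(t->el);     /* blocks, dispatching events until stopped */
    return NULL;
}
```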
This approach eliminates the need for the main thread to perform the costly
`epoll_wait` operation for handling connections (except for specific ones).
Instead, the main thread processes requests from the IO threads and hands them
back once completed, fully offloading read and write events to the IO threads.

Additionally, all TLS operations, including handling pending data, have been
moved entirely to the IO threads. This resolves the issue where
io-threads-do-reads could not be used with TLS.

### Event-notified client queue
To facilitate communication between the IO threads and the main thread, we
designed an event-notified client queue. Each IO thread and the main thread
have two such queues to store clients waiting to be processed. These queues
are also integrated with the event loop to enable handling.
We use pthread_mutex to ensure the safety of queue operations, as well as data
visibility and ordering, and race conditions are minimized, as each IO thread
and the main thread operate on independent queues, avoiding thread suspension
due to lock contention. We also implemented an event notifier based on
`eventfd` or `pipe` to support event-driven handling.

### Thread safety
Since the main thread and IO threads can execute in parallel, we must handle
data race issues carefully.

**client->flags**
The primary tasks of IO threads are reading and writing, i.e.
`readQueryFromClient` and `writeToClient`. However, IO threads and the main
thread may concurrently modify or access `client->flags`, leading to potential
race conditions. To address this, we introduced an io-flags variable to record
operations performed by IO threads, thereby avoiding race conditions on
`client->flags`.

**Pause IO thread**
In the main thread, we may want to operate on data owned by an IO thread, for
example to uninstall an event handler, access or modify the query/output
buffer, or resize the event loop; we need a clean and safe context to do that.
We pause the IO thread in `IOThreadBeforeSleep`, do the work, and then resume
it. To avoid leaving threads suspended, we use busy waiting to confirm the
target status, and we use atomic variables to ensure memory visibility and
ordering. We introduce the following functions to pause/resume IO threads:
```
pauseIOThread, resumeIOThread
pauseAllIOThreads, resumeAllIOThreads
pauseIOThreadsRange, resumeIOThreadsRange
```
Testing has shown that `pauseIOThread` is highly efficient, allowing the main
thread to execute nearly 200,000 operations per second during stress tests.
Similarly, `pauseAllIOThreads` with 8 IO threads can handle up to nearly
56,000 operations per second. But operations performed between pausing and
resuming IO threads must be quick; otherwise, they could cause the IO threads
to reach full CPU utilization.

**freeClient and freeClientAsync**
The main thread may need to terminate a client currently running on an IO
thread, for example, due to ACL rule changes, reaching the output buffer
limit, or evicting a client. In such cases, we need to pause the IO thread to
safely operate on the client.

**maxclients and maxmemory-clients updating**
When adjusting `maxclients`, we need to resize the event loop for all IO
threads. Similarly, when modifying `maxmemory-clients`, we need to traverse
all clients to calculate their memory usage. To ensure safe operations, we
pause all IO threads during these adjustments.

**Client info reading**
The main thread may need to read a client's fields to generate a descriptive
string, such as for the `CLIENT LIST` command or logging purposes.
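The pause/resume handshake described above can be sketched with a single atomic state variable; this is a hedged illustration of the busy-wait protocol, not the actual iothread.c implementation:

```c
#include <stdatomic.h>

typedef enum { IO_RUN, IO_PAUSING, IO_PAUSED, IO_RESUMING } pausestate;
static _Atomic pausestate state = IO_RUN;

void pauseIOThreadSketch(void) {               /* called by the main thread */
    atomic_store(&state, IO_PAUSING);
    while (atomic_load(&state) != IO_PAUSED);  /* busy-wait for the ack */
}

void resumeIOThreadSketch(void) {              /* called by the main thread */
    atomic_store(&state, IO_RESUMING);
    while (atomic_load(&state) != IO_RUN);     /* busy-wait for the ack */
}

void ioThreadBeforeSleepSketch(void) {         /* run by the IO thread */
    if (atomic_load(&state) == IO_PAUSING) {
        atomic_store(&state, IO_PAUSED);       /* safe for the main thread */
        while (atomic_load(&state) != IO_RESUMING); /* spin until resumed */
        atomic_store(&state, IO_RUN);
    }
}
```

Busy waiting keeps the handshake cheap for short critical sections, which is why the work done between pause and resume must stay brief.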
To read those fields safely, we pause the IO thread handling that client; if
information for all clients needs to be displayed, all IO threads must be
paused.

**Tracking redirect**
Redis supports the tracking feature and can even send invalidation messages
to a connection with a specified ID. But the target client may be running on
an IO thread, directly manipulating the client's output buffer is not
thread-safe, and the IO thread may not be aware that the client requires a
response. In such cases, we pause the IO thread handling the client, modify
the output buffer, and install a write event handler to ensure proper
handling.

**clientsCron**
In the `clientsCron` function, the main thread needs to traverse all clients
to perform operations such as timeout checks, verifying whether they have
reached the soft output buffer limit, resizing the output/query buffer, or
updating memory usage. To safely operate on a client, the IO thread handling
that client must be paused. If we were to pause the IO thread for each client
individually, the efficiency would be very low. Conversely, pausing all IO
threads simultaneously would be costly, especially when there are many IO
threads, as clientsCron is invoked relatively frequently. To address this, we
adopted a batched approach for pausing IO threads: at most 8 IO threads are
paused at a time. The operations mentioned above are only performed on
clients running in the paused IO threads, significantly reducing overhead
while maintaining safety.

### Observability
In the current design, the main thread always assigns clients to the IO
thread with the fewest clients. To clearly observe the number of clients
handled by each IO thread, we added a new section to the INFO output: the
`INFO THREADS` section shows the client count for each IO thread.
```
# Threads
io_thread_0:clients=0
io_thread_1:clients=2
io_thread_2:clients=2
```
Additionally, in the `CLIENT LIST` output, we also added a field to indicate
the thread to which each client is assigned.

`id=244 addr=127.0.0.1:41870 laddr=127.0.0.1:6379 ... resp=2 lib-name= lib-ver= io-thread=1`

## Trade-off
### Special Clients
For certain special types of clients, keeping them running on IO threads
would result in severe race issues that are difficult to resolve. Therefore,
we chose not to offload these clients to the IO threads.

For replica, monitor, subscribe, and tracking clients, the main thread may
directly write them a reply when conditions are met. The race issues are
difficult to resolve, so we have them processed in the main thread. This
includes the Lua debug clients as well, since we may operate on the
connection directly.

For blocking clients, after the IO thread reads and parses a command and
hands it over to the main thread, if the client is identified as a blocking
type, it remains in the main thread. Once the blocking operation completes
and the reply is generated, the client is transferred back to the IO thread
to send the reply and wait for event triggers.

### Clients Eviction
To support client eviction, it is necessary to update each client's memory
usage promptly during operations such as read, write, or command execution.
However, when a client operates on an IO thread, it is not feasible to update
the memory usage immediately due to the risk of data races. As a result,
memory usage can only be updated either in the main thread while processing
commands or in `clientsCron` periodically.
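A hedged sketch of the deferral guard (the ownership helper, constant, and accounting helpers below are assumptions for illustration; only the `updateClientMemUsageAndBucket()` name itself is taken from the diff):

```c
/* Assumed declarations for illustration only. */
typedef struct client client;
extern int clientOwnerThread(client *c);                  /* hypothetical */
extern size_t clientMemUsage(client *c);                  /* hypothetical */
extern void updateClientMemBucket(client *c, size_t mem); /* hypothetical */
#define MAIN_THREAD_ID 0                                  /* assumed convention */

/* Skip immediate accounting while an IO thread owns the client; the main
 * thread refreshes the usage during command processing or in clientsCron. */
int updateClientMemUsageAndBucketSketch(client *c) {
    if (clientOwnerThread(c) != MAIN_THREAD_ID) return 0; /* deferred */
    updateClientMemBucket(c, clientMemUsage(c));
    return 1;
}
```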
The downside of this approach is that updates might experience a delay of up
to one second, which could impact the precision of memory management for
eviction. To avoid incorrectly evicting clients, we adopted a best-effort
compensation solution: when we decide to evict a client, we update its memory
usage once more before evicting it. If the memory used by the client has not
decreased, or its memory usage bucket has not changed, we evict it;
otherwise, we do not. However, this does not completely solve the problem:
due to the delay in memory usage updates, we may still make incorrect
decisions about the need to evict clients.

### Defragment
In the majority of cases we do NOT use the data from argv directly in the db.
1. key names
   We store a copy that we allocate in the main thread, see `sdsdup()` in
   `dbAdd()`.
2. hash key and value
   We store the key as hfield and the value as sds, see `hfieldNew()` and
   `sdsdup()` in `hashTypeSet()`.
3. other datatypes
   They don't even use SDS, so there are no reference issues.

But in some cases the data from a client's argv may be retained by the main
thread. As a result, during fragmentation cleanup, we need to move
allocations from the IO thread's arena to the main thread's arena. We always
allocate new memory in the main thread's arena, but the memory released by IO
threads may not yet have been reclaimed. This ultimately causes the
fragmentation rate to be higher compared to creating and allocating entirely
within a single thread.
The following cases will lead to memory allocated by an IO thread being kept
by the main thread:
1. string related commands: `append`, `getset`, `mset` and `set`.
   If `tryObjectEncoding()` does not change argv, we will keep it directly in
   the main thread, see the code in `tryObjectEncoding()` (specifically
   `trimStringObjectIfNeeded()`).
2. blocking related commands: the key names will be kept in
   `c->db->blocking_keys`.
3. watch command: the key names will be kept in `c->db->watched_keys`.
4. [s]subscribe command: the channel name will be kept in
   `serverPubSubChannels`.
5. script load command: the script will be kept in `server.lua_scripts`.
6. some module APIs: `RM_RetainString`, `RM_HoldString`.

Those issues will be handled in other PRs.

## Testing
### Functional Testing
The commit with IO Threads enabled has passed all TCL tests, but we made some
changes:

**Client query buffer**: In the original code, when using a reusable query
buffer, ownership of the query buffer would be released after the command was
processed. However, with IO threads enabled, the client transitions from an
IO thread to the main thread for processing. This causes the ownership
release to occur earlier than the command execution. As a result, when IO
threads are enabled, the client's information will never indicate that a
shared query buffer is in use. Therefore, we skip the corresponding query
buffer tests in this case.

**Defragment**: Added a new defragmentation test to verify the effect of IO
threads on defragmentation.

**Command delay**: For deferred clients in TCL tests, delays may occur
because clients are assigned to different threads for execution. To address
this, we introduced conditional waiting: the process proceeds to the next
step only when the `client list` contains the corresponding commands.

### Sanitizer Testing
The commit passed all TCL tests and reported no errors when compiled with the
`fsanitizer=thread` and `fsanitizer=address` options enabled.
But we made the following modifications: we suppressed the sanitizer warnings
for clients with watched keys when updating `client->flags`. We believe there
is no actual data race there: IO threads read `client->flags` but never
modify it or read the `CLIENT_DIRTY_CAS` bit, and only the main thread
modifies this bit.

## Others
### IO thread number
In the new multi-threaded design, the main thread is primarily focused on
command processing to improve performance. Typically, the main thread does
not handle regular client I/O operations but is responsible for clients such
as replication and tracking clients. To avoid breaking changes, we still
consider the main thread as the first IO thread.

When the io-threads configuration is set to a low value (e.g., 2),
performance does not show a significant improvement compared to a
single-threaded setup for simple commands (such as SET or GET), as the main
thread does not consume much CPU for these simple operations. This results in
underutilized multi-core capacity. However, for more complex commands, having
a low number of IO threads may still be beneficial. Therefore, it's important
to adjust `io-threads` based on your own performance tests.

Additionally, you can clearly monitor the CPU utilization of the main thread
and IO threads using `top -H -p $redis_pid`. This allows you to easily
identify where the bottleneck is. If the IO threads are the bottleneck,
increasing `io-threads` will improve performance. If the main thread is the
bottleneck, the overall performance can only be scaled by increasing the
number of shards or replicas.

---------

Co-authored-by: debing.sun
Co-authored-by: oranagra
---
 redis.conf                             |  31 +-
 src/Makefile                           |   2 +-
 src/ae.c                               |  41 +-
 src/ae.h                               |   2 +
 src/cluster.c                          |   2 +-
 src/cluster_legacy.c                   |   4 +-
 src/commands.def                       |   5 +-
 src/commands/client-list.json          |  12 +
 src/config.c                           |  12 +-
 src/config.h                           |   1 +
 src/connection.c                       |   8 +-
 src/connection.h                       |  47 +-
 src/debug.c                            |   2 +
 src/eventnotifier.c                    |  97 ++++
 src/eventnotifier.h                    |  33 ++
 src/iothread.c                         | 631 ++++++++++++++++++++++
 src/multi.c                            |   8 +-
 src/networking.c                       | 708 ++++++++-----------------
 src/replication.c                      |   2 +-
 src/server.c                           |  83 ++-
 src/server.h                           |  95 +++-
 src/socket.c                           |  37 +-
 src/tls.c                              | 114 ++--
 src/tracking.c                         |  19 +-
 src/unix.c                             |  17 +-
 tests/integration/shutdown.tcl         |  10 +
 tests/support/util.tcl                 |  10 +
 tests/unit/client-eviction.tcl         |  34 +-
 tests/unit/info.tcl                    |   2 +-
 tests/unit/introspection.tcl           |  85 ++-
 tests/unit/lazyfree.tcl                |   2 +
 tests/unit/maxmemory.tcl               |   6 +-
 tests/unit/memefficiency.tcl           | 122 ++++-
 tests/unit/moduleapi/blockedclient.tcl |   7 +-
 tests/unit/pubsub.tcl                  |  10 +
 tests/unit/pubsubshard.tcl             |   5 +
 tests/unit/querybuf.tcl                |   7 +-
 tests/unit/type/list.tcl               |   5 +
 38 files changed, 1683 insertions(+), 635 deletions(-)
 create mode 100644 src/eventnotifier.c
 create mode 100644 src/eventnotifier.h
 create mode 100644 src/iothread.c

diff --git a/redis.conf b/redis.conf
index 6688fdc2a..a1cbedd34 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1291,38 +1291,27 @@ lazyfree-lazy-user-flush no
 # in different I/O threads. Since especially writing is so slow, normally
 # Redis users use pipelining in order to speed up the Redis performances per
 # core, and spawn multiple instances in order to scale more. Using I/O
-# threads it is possible to easily speedup two times Redis without resorting
+# threads it is possible to easily speedup several times Redis without resorting
# # By default threading is disabled, we suggest enabling it only in machines # that have at least 4 or more cores, leaving at least one spare core. -# Using more than 8 threads is unlikely to help much. We also recommend using -# threaded I/O only if you actually have performance problems, with Redis -# instances being able to use a quite big percentage of CPU time, otherwise -# there is no point in using this feature. +# We also recommend using threaded I/O only if you actually have performance +# problems, with Redis instances being able to use a quite big percentage of +# CPU time, otherwise there is no point in using this feature. # -# So for instance if you have a four cores boxes, try to use 2 or 3 I/O -# threads, if you have a 8 cores, try to use 6 threads. In order to +# So for instance if you have a four cores boxes, try to use 3 I/O +# threads, if you have a 8 cores, try to use 7 threads. In order to # enable I/O threads use the following configuration directive: # # io-threads 4 # # Setting io-threads to 1 will just use the main thread as usual. -# When I/O threads are enabled, we only use threads for writes, that is -# to thread the write(2) syscall and transfer the client buffers to the -# socket. However it is also possible to enable threading of reads and -# protocol parsing using the following configuration directive, by setting -# it to yes: +# When I/O threads are enabled, we not only use threads for writes, that +# is to thread the write(2) syscall and transfer the client buffers to the +# socket, but also use threads for reads and protocol parsing. # -# io-threads-do-reads no -# -# Usually threading reads doesn't help much. -# -# NOTE 1: This configuration directive cannot be changed at runtime via -# CONFIG SET. Also, this feature currently does not work when SSL is -# enabled. -# -# NOTE 2: If you want to test the Redis speedup using redis-benchmark, make +# NOTE: If you want to test the Redis speedup using redis-benchmark, make # sure you also run the benchmark itself in threaded mode, using the # --threads option to match the number of Redis threads, otherwise you'll not # be able to notice the improvements. 
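Since the commit message above describes an event notifier built on `eventfd` or `pipe` (src/eventnotifier.c in the diffstat), here is a minimal sketch of the eventfd flavor; the function names are illustrative, not the actual eventnotifier.c API:

```c
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

/* Create a non-blocking wakeup fd; register it with the receiving thread's
 * event loop so a write from another thread fires a readable event. */
int notifierCreateSketch(void) {
    return eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
}

/* Producer side: signal the other thread that its queue has new clients. */
int notifierTriggerSketch(int efd) {
    uint64_t one = 1;
    return write(efd, &one, sizeof(one)) == sizeof(one) ? 0 : -1;
}

/* Consumer side: drain the counter inside the event-loop callback. */
int notifierAckSketch(int efd) {
    uint64_t cnt;
    return read(efd, &cnt, sizeof(cnt)) == sizeof(cnt) ? 0 : -1;
}
```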
diff --git a/src/Makefile b/src/Makefile index 8f245d19d..4f394782d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,7 +354,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/ae.c b/src/ae.c index 3d3569865..ac4422398 100644 --- a/src/ae.c +++ b/src/ae.c @@ -42,7 +42,7 @@ #endif #endif - +#define INITIAL_EVENT 1024 aeEventLoop *aeCreateEventLoop(int setsize) { aeEventLoop *eventLoop; int i; @@ -50,8 +50,9 @@ aeEventLoop *aeCreateEventLoop(int setsize) { monotonicInit(); /* just in case the calling app didn't initialize */ if ((eventLoop = zmalloc(sizeof(*eventLoop))) == NULL) goto err; - eventLoop->events = zmalloc(sizeof(aeFileEvent)*setsize); - eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*setsize); + eventLoop->nevents = setsize < INITIAL_EVENT ? 
setsize : INITIAL_EVENT; + eventLoop->events = zmalloc(sizeof(aeFileEvent)*eventLoop->nevents); + eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*eventLoop->nevents); if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; @@ -61,10 +62,11 @@ aeEventLoop *aeCreateEventLoop(int setsize) { eventLoop->beforesleep = NULL; eventLoop->aftersleep = NULL; eventLoop->flags = 0; + memset(eventLoop->privdata, 0, sizeof(eventLoop->privdata)); if (aeApiCreate(eventLoop) == -1) goto err; /* Events with mask == AE_NONE are not set. So let's initialize the * vector with it. */ - for (i = 0; i < setsize; i++) + for (i = 0; i < eventLoop->nevents; i++) eventLoop->events[i].mask = AE_NONE; return eventLoop; @@ -102,20 +104,19 @@ void aeSetDontWait(aeEventLoop *eventLoop, int noWait) { * * Otherwise AE_OK is returned and the operation is successful. */ int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) { - int i; - if (setsize == eventLoop->setsize) return AE_OK; if (eventLoop->maxfd >= setsize) return AE_ERR; if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR; - eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); - eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); eventLoop->setsize = setsize; - /* Make sure that if we created new slots, they are initialized with - * an AE_NONE mask. */ - for (i = eventLoop->maxfd+1; i < setsize; i++) - eventLoop->events[i].mask = AE_NONE; + /* If the current allocated space is larger than the requested size, + * we need to shrink it to the requested size. */ + if (setsize < eventLoop->nevents) { + eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); + eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); + eventLoop->nevents = setsize; + } return AE_OK; } @@ -147,6 +148,22 @@ int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, errno = ERANGE; return AE_ERR; } + + /* Resize the events and fired arrays if the file + * descriptor exceeds the current number of events. */ + if (unlikely(fd >= eventLoop->nevents)) { + int newnevents = eventLoop->nevents; + newnevents = (newnevents * 2 > fd + 1) ? newnevents * 2 : fd + 1; + newnevents = (newnevents > eventLoop->setsize) ? 
eventLoop->setsize : newnevents; + eventLoop->events = zrealloc(eventLoop->events, sizeof(aeFileEvent) * newnevents); + eventLoop->fired = zrealloc(eventLoop->fired, sizeof(aeFiredEvent) * newnevents); + + /* Initialize new slots with an AE_NONE mask */ + for (int i = eventLoop->nevents; i < newnevents; i++) + eventLoop->events[i].mask = AE_NONE; + eventLoop->nevents = newnevents; + } + aeFileEvent *fe = &eventLoop->events[fd]; if (aeApiAddEvent(eventLoop, fd, mask) == -1) diff --git a/src/ae.h b/src/ae.h index 5f1e17f7d..16c5fcc5c 100644 --- a/src/ae.h +++ b/src/ae.h @@ -79,6 +79,7 @@ typedef struct aeEventLoop { int maxfd; /* highest file descriptor currently registered */ int setsize; /* max number of file descriptors tracked */ long long timeEventNextId; + int nevents; /* Size of Registered events */ aeFileEvent *events; /* Registered events */ aeFiredEvent *fired; /* Fired events */ aeTimeEvent *timeEventHead; @@ -87,6 +88,7 @@ typedef struct aeEventLoop { aeBeforeSleepProc *beforesleep; aeBeforeSleepProc *aftersleep; int flags; + void *privdata[2]; } aeEventLoop; /* Prototypes */ diff --git a/src/cluster.c b/src/cluster.c index 876b1327f..6c0bf75cc 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -317,7 +317,7 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti } /* Create the connection */ - conn = connCreate(connTypeOfCluster()); + conn = connCreate(server.el, connTypeOfCluster()); if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) != C_OK) { addReplyError(c,"-IOERR error or timeout connecting to the client"); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ead19ac71..d707d863d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1262,7 +1262,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { return; } - connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); + connection *conn = connCreateAccepted(server.el, connTypeOfCluster(), cfd, &require_auth); /* Make sure connection is not in an error state */ if (connGetState(conn) != CONN_STATE_ACCEPTING) { @@ -4583,7 +4583,7 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ if (node->link == NULL) { clusterLink *link = createClusterLink(node); - link->conn = connCreate(connTypeOfCluster()); + link->conn = connCreate(server.el, connTypeOfCluster()); connSetPrivateData(link->conn, link); if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, clusterLinkConnectHandler) == C_ERR) { diff --git a/src/commands.def b/src/commands.def index ef42fb8da..53be28942 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1239,6 +1239,9 @@ commandHistory CLIENT_LIST_History[] = { {"6.2.0","Added `argv-mem`, `tot-mem`, `laddr` and `redir` fields and the optional `ID` filter."}, {"7.0.0","Added `resp`, `multi-mem`, `rbs` and `rbp` fields."}, {"7.0.3","Added `ssub` field."}, +{"7.2.0","Added `lib-name` and `lib-ver` fields."}, +{"7.4.0","Added `watch` field."}, +{"8.0.0","Added `io-thread` field."}, }; #endif @@ -1546,7 +1549,7 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, {MAKE_CMD("info","Returns information about the 
connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,6,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, -{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,6,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, +{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,9,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, {MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, {MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, {MAKE_CMD("pause","Suspends commands processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, diff --git a/src/commands/client-list.json b/src/commands/client-list.json index f72ffaf40..08305216c 100644 --- a/src/commands/client-list.json +++ b/src/commands/client-list.json @@ -31,6 +31,18 @@ [ "7.0.3", "Added `ssub` field." + ], + [ + "7.2.0", + "Added `lib-name` and `lib-ver` fields." + ], + [ + "7.4.0", + "Added `watch` field." + ], + [ + "8.0.0", + "Added `io-thread` field." 
] ], "command_flags": [ diff --git a/src/config.c b/src/config.c index d0d30966c..797284347 100644 --- a/src/config.c +++ b/src/config.c @@ -430,6 +430,7 @@ void loadServerConfigFromString(char *config) { {"list-max-ziplist-entries", 2, 2}, {"list-max-ziplist-value", 2, 2}, {"lua-replicate-commands", 2, 2}, + {"io-threads-do-reads", 2, 2}, {NULL, 0}, }; char buf[1024]; @@ -2550,11 +2551,10 @@ static int updateMaxclients(const char **err) { *err = msg; return 0; } - if ((unsigned int) aeGetSetSize(server.el) < - server.maxclients + CONFIG_FDSET_INCR) - { - if (aeResizeSetSize(server.el, - server.maxclients + CONFIG_FDSET_INCR) == AE_ERR) + size_t newsize = server.maxclients + CONFIG_FDSET_INCR; + if ((unsigned int) aeGetSetSize(server.el) < newsize) { + if (aeResizeSetSize(server.el, newsize) == AE_ERR || + resizeAllIOThreadsEventLoops(newsize) == AE_ERR) { *err = "The event loop API used by Redis is not able to handle the specified number of clients"; return 0; @@ -3035,6 +3035,7 @@ static int applyClientMaxMemoryUsage(const char **err) { if (server.maxmemory_clients != 0) initServerClientMemUsageBuckets(); + pauseAllIOThreads(); /* When client eviction is enabled update memory buckets for all clients. * When disabled, clear that data structure. */ listRewind(server.clients, &li); @@ -3048,6 +3049,7 @@ static int applyClientMaxMemoryUsage(const char **err) { updateClientMemUsageAndBucket(c); } } + resumeAllIOThreads(); if (server.maxmemory_clients == 0) freeServerClientMemUsageBuckets(); diff --git a/src/config.h b/src/config.h index e8f77a350..ec0fb1529 100644 --- a/src/config.h +++ b/src/config.h @@ -47,6 +47,7 @@ #define HAVE_PROC_SMAPS 1 #define HAVE_PROC_SOMAXCONN 1 #define HAVE_PROC_OOM_SCORE_ADJ 1 +#define HAVE_EVENT_FD 1 #endif /* Test for task_info() */ diff --git a/src/connection.c b/src/connection.c index fd9d5d17a..6ac1b99d9 100644 --- a/src/connection.c +++ b/src/connection.c @@ -156,14 +156,14 @@ void connTypeCleanupAll(void) { } /* walk all the connection types until has pending data */ -int connTypeHasPendingData(void) { +int connTypeHasPendingData(struct aeEventLoop *el) { ConnectionType *ct; int type; int ret = 0; for (type = 0; type < CONN_TYPE_MAX; type++) { ct = connTypes[type]; - if (ct && ct->has_pending_data && (ret = ct->has_pending_data())) { + if (ct && ct->has_pending_data && (ret = ct->has_pending_data(el))) { return ret; } } @@ -172,7 +172,7 @@ int connTypeHasPendingData(void) { } /* walk all the connection types and process pending data for each connection type */ -int connTypeProcessPendingData(void) { +int connTypeProcessPendingData(struct aeEventLoop *el) { ConnectionType *ct; int type; int ret = 0; @@ -180,7 +180,7 @@ int connTypeProcessPendingData(void) { for (type = 0; type < CONN_TYPE_MAX; type++) { ct = connTypes[type]; if (ct && ct->process_pending_data) { - ret += ct->process_pending_data(); + ret += ct->process_pending_data(el); } } diff --git a/src/connection.h b/src/connection.h index a8c296d15..0ebc84489 100644 --- a/src/connection.h +++ b/src/connection.h @@ -60,8 +60,8 @@ typedef struct ConnectionType { int (*listen)(connListener *listener); /* create/shutdown/close connection */ - connection* (*conn_create)(void); - connection* (*conn_create_accepted)(int fd, void *priv); + connection* (*conn_create)(struct aeEventLoop *el); + connection* (*conn_create_accepted)(struct aeEventLoop *el, int fd, void *priv); void (*shutdown)(struct connection *conn); void (*close)(struct connection *conn); @@ -81,9 +81,13 @@ typedef struct ConnectionType { 
ssize_t (*sync_read)(struct connection *conn, char *ptr, ssize_t size, long long timeout); ssize_t (*sync_readline)(struct connection *conn, char *ptr, ssize_t size, long long timeout); + /* event loop */ + void (*unbind_event_loop)(struct connection *conn); + int (*rebind_event_loop)(struct connection *conn, aeEventLoop *el); + /* pending data */ - int (*has_pending_data)(void); - int (*process_pending_data)(void); + int (*has_pending_data)(struct aeEventLoop *el); + int (*process_pending_data)(struct aeEventLoop *el); /* TLS specified methods */ sds (*get_peer_cert)(struct connection *conn); @@ -98,6 +102,7 @@ struct connection { short int refs; unsigned short int iovcnt; void *private_data; + struct aeEventLoop *el; ConnectionCallbackFunc conn_handler; ConnectionCallbackFunc write_handler; ConnectionCallbackFunc read_handler; @@ -319,6 +324,28 @@ static inline int connHasReadHandler(connection *conn) { return conn->read_handler != NULL; } +/* Returns true if the connection is bound to an event loop */ +static inline int connHasEventLoop(connection *conn) { + return conn->el != NULL; +} + +/* Unbind the current event loop from the connection, so that it can be + * rebound to a different event loop in the future. */ +static inline void connUnbindEventLoop(connection *conn) { + if (conn->el == NULL) return; + connSetReadHandler(conn, NULL); + connSetWriteHandler(conn, NULL); + if (conn->type->unbind_event_loop) + conn->type->unbind_event_loop(conn); + conn->el = NULL; +} + +/* Rebind the connection to another event loop; read/write handlers must not + * be installed in the current event loop. */ +static inline int connRebindEventLoop(connection *conn, aeEventLoop *el) { + return conn->type->rebind_event_loop(conn, el); +} + /* Associate a private data pointer with the connection */ static inline void connSetPrivateData(connection *conn, void *data) { conn->private_data = data; @@ -379,14 +406,14 @@ ConnectionType *connectionTypeUnix(void); int connectionIndexByType(const char *typename); /* Create a connection of specified type */ -static inline connection *connCreate(ConnectionType *ct) { - return ct->conn_create(); +static inline connection *connCreate(struct aeEventLoop *el, ConnectionType *ct) { + return ct->conn_create(el); } /* Create an accepted connection of specified type. * priv is connection type specified argument */ -static inline connection *connCreateAccepted(ConnectionType *ct, int fd, void *priv) { - return ct->conn_create_accepted(fd, priv); +static inline connection *connCreateAccepted(struct aeEventLoop *el, ConnectionType *ct, int fd, void *priv) { + return ct->conn_create_accepted(el, fd, priv); } /* Configure a connection type. A typical case is to configure TLS. @@ -400,10 +427,10 @@ static inline int connTypeConfigure(ConnectionType *ct, void *priv, int reconfig void connTypeCleanupAll(void); /* Test all the connection type has pending data or not.
*/ -int connTypeHasPendingData(void); +int connTypeHasPendingData(struct aeEventLoop *el); /* walk all the connection types and process pending data for each connection type */ -int connTypeProcessPendingData(void); +int connTypeProcessPendingData(struct aeEventLoop *el); /* Listen on an initialized listener */ static inline int connListen(connListener *listener) { diff --git a/src/debug.c b/src/debug.c index e40375fbe..c4d184b15 100644 --- a/src/debug.c +++ b/src/debug.c @@ -2451,6 +2451,8 @@ void removeSigSegvHandlers(void) { } void printCrashReport(void) { + server.crashing = 1; + /* Log INFO and CLIENT LIST */ logServerInfo(); diff --git a/src/eventnotifier.c b/src/eventnotifier.c new file mode 100644 index 000000000..6dc3cf990 --- /dev/null +++ b/src/eventnotifier.c @@ -0,0 +1,97 @@ +/* eventnotifier.c -- An event notifier based on eventfd or pipe. + * + * Copyright (c) 2024-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). + */ + +#include "eventnotifier.h" + +#include +#include +#include +#ifdef HAVE_EVENT_FD +#include +#endif + +#include "anet.h" +#include "zmalloc.h" + +eventNotifier* createEventNotifier(void) { + eventNotifier *en = zmalloc(sizeof(eventNotifier)); + if (!en) return NULL; + +#ifdef HAVE_EVENT_FD + if ((en->efd = eventfd(0, EFD_NONBLOCK| EFD_CLOEXEC)) != -1) { + return en; + } +#else + if (anetPipe(en->pipefd, O_CLOEXEC|O_NONBLOCK, O_CLOEXEC|O_NONBLOCK) != -1) { + return en; + } +#endif + + /* Clean up if error. */ + zfree(en); + return NULL; +} + +int getReadEventFd(struct eventNotifier *en) { +#ifdef HAVE_EVENT_FD + return en->efd; +#else + return en->pipefd[0]; +#endif +} + +int getWriteEventFd(struct eventNotifier *en) { +#ifdef HAVE_EVENT_FD + return en->efd; +#else + return en->pipefd[1]; +#endif +} + +int triggerEventNotifier(struct eventNotifier *en) { +#ifdef HAVE_EVENT_FD + uint64_t u = 1; + if (write(en->efd, &u, sizeof(uint64_t)) == -1) { + return EN_ERR; + } +#else + char buf[1] = {'R'}; + if (write(en->pipefd[1], buf, 1) == -1) { + return EN_ERR; + } +#endif + return EN_OK; +} + +int handleEventNotifier(struct eventNotifier *en) { +#ifdef HAVE_EVENT_FD + uint64_t u; + if (read(en->efd, &u, sizeof(uint64_t)) == -1) { + return EN_ERR; + } +#else + char buf[1]; + if (read(en->pipefd[0], buf, 1) == -1) { + return EN_ERR; + } +#endif + return EN_OK; +} + +void freeEventNotifier(struct eventNotifier *en) { +#ifdef HAVE_EVENT_FD + close(en->efd); +#else + close(en->pipefd[0]); + close(en->pipefd[1]); +#endif + + /* Free memory */ + zfree(en); +} diff --git a/src/eventnotifier.h b/src/eventnotifier.h new file mode 100644 index 000000000..39e3b5113 --- /dev/null +++ b/src/eventnotifier.h @@ -0,0 +1,33 @@ +/* eventnotifier.h -- An event notifier based on eventfd or pipe. + * + * Copyright (c) 2024-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
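+ * + * The notifier wraps an eventfd when HAVE_EVENT_FD is defined (Linux), or + * a pipe pair otherwise, exposing a uniform fd that can be registered with + * an event loop and triggered from another thread.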
+ */ + +#ifndef EVENTNOTIFIER_H +#define EVENTNOTIFIER_H + +#include "config.h" + +#define EN_OK 0 +#define EN_ERR -1 + +typedef struct eventNotifier { +#ifdef HAVE_EVENT_FD + int efd; +#else + int pipefd[2]; +#endif +} eventNotifier; + +eventNotifier* createEventNotifier(void); +int getReadEventFd(struct eventNotifier *en); +int getWriteEventFd(struct eventNotifier *en); +int triggerEventNotifier(struct eventNotifier *en); +int handleEventNotifier(struct eventNotifier *en); +void freeEventNotifier(struct eventNotifier *en); + +#endif diff --git a/src/iothread.c b/src/iothread.c new file mode 100644 index 000000000..2e5c98a28 --- /dev/null +++ b/src/iothread.c @@ -0,0 +1,631 @@ +/* iothread.c -- The threaded io implementation. + * + * Copyright (c) 2024-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). + */ + +#include "server.h" + +/* IO threads. */ +static IOThread IOThreads[IO_THREADS_MAX_NUM]; + +/* For main thread */ +static list *mainThreadPendingClientsToIOThreads[IO_THREADS_MAX_NUM]; /* Clients to IO threads */ +static list *mainThreadProcessingClients[IO_THREADS_MAX_NUM]; /* Clients in processing */ +static list *mainThreadPendingClients[IO_THREADS_MAX_NUM]; /* Pending clients from IO threads */ +static pthread_mutex_t mainThreadPendingClientsMutexes[IO_THREADS_MAX_NUM]; /* Mutex for pending clients */ +static eventNotifier* mainThreadPendingClientsNotifiers[IO_THREADS_MAX_NUM]; /* Notifier for pending clients */ + +/* When an IO thread reads a complete query from a client or wants to free a + * client, it should remove the client from its clients list and put it in the + * list for the main thread; we will send these clients to the main thread in + * IOThreadBeforeSleep. */ +void enqueuePendingClientsToMainThread(client *c, int unbind) { + /* If the IO thread will no longer manage the client, e.g. when closing it, + * we should unbind the client from the event loop here, so the main thread + * doesn't need to do this costly operation. */ + if (unbind) connUnbindEventLoop(c->conn); + /* Just skip if it has already been transferred. */ + if (c->io_thread_client_list_node) { + listDelNode(IOThreads[c->tid].clients, c->io_thread_client_list_node); + c->io_thread_client_list_node = NULL; + /* Disable read and write to avoid races when the main thread processes it. */ + c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED); + listAddNodeTail(IOThreads[c->tid].pending_clients_to_main_thread, c); + } +} + +/* Unbind the client's connection from the io thread event loop; the write and + * read handlers are also removed, ensuring that we can operate on the client + * safely. */ +void unbindClientFromIOThreadEventLoop(client *c) { + serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID && + c->running_tid == IOTHREAD_MAIN_THREAD_ID); + if (!connHasEventLoop(c->conn)) return; + /* As we are calling from the main thread, we should pause the io thread to make this safe. */ + pauseIOThread(c->tid); + connUnbindEventLoop(c->conn); + resumeIOThread(c->tid); +} + +/* When the main thread is processing a client from an IO thread and wants to + * keep it, we should unbind the client's connection from the io thread event + * loop first, and then bind it into the server's event loop. */ +void keepClientInMainThread(client *c) { + serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID && + c->running_tid == IOTHREAD_MAIN_THREAD_ID); + /* The IO thread no longer manages it. */ + server.io_threads_clients_num[c->tid]--; + /* Unbind the client's connection from the io thread event loop.
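+ * (unbindClientFromIOThreadEventLoop pauses the io thread internally, + * so it is safe to touch the connection here).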
*/ + unbindClientFromIOThreadEventLoop(c); + /* Let the main thread run it: rebind the event loop and read handler. */ + connRebindEventLoop(c->conn, server.el); + connSetReadHandler(c->conn, readQueryFromClient); + c->io_flags |= CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED; + c->running_tid = IOTHREAD_MAIN_THREAD_ID; + c->tid = IOTHREAD_MAIN_THREAD_ID; + /* The main thread starts to manage it. */ + server.io_threads_clients_num[c->tid]++; +} + +/* If the client is managed by an IO thread, we should fetch it from the IO + * thread, and then the main thread can process it, just like when the IO + * thread transfers the client to the main thread for processing. */ +void fetchClientFromIOThread(client *c) { + serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID && + c->running_tid != IOTHREAD_MAIN_THREAD_ID); + pauseIOThread(c->tid); + /* Remove the client from the clients list of the IO thread or main thread. */ + if (c->io_thread_client_list_node) { + listDelNode(IOThreads[c->tid].clients, c->io_thread_client_list_node); + c->io_thread_client_list_node = NULL; + } else { + list *clients[5] = { + IOThreads[c->tid].pending_clients, + IOThreads[c->tid].pending_clients_to_main_thread, + mainThreadPendingClients[c->tid], + mainThreadProcessingClients[c->tid], + mainThreadPendingClientsToIOThreads[c->tid] + }; + for (int i = 0; i < 5; i++) { + listNode *ln = listSearchKey(clients[i], c); + if (ln) { + listDelNode(clients[i], ln); + /* A client can only be in one client list. */ + break; + } + } + } + /* Unbind the client's connection from the io thread event loop. */ + connUnbindEventLoop(c->conn); + /* Now the main thread can process it. */ + c->running_tid = IOTHREAD_MAIN_THREAD_ID; + resumeIOThread(c->tid); +} + +/* Some clients must be handled in the main thread, since processing them in + * IO threads would cause data races. + * + * - Close ASAP: we must free the client in the main thread. + * - Replica, pubsub, monitor, blocked, tracking clients: the main thread may + * directly write them a reply when conditions are met. + * - Script commands with debug may operate on the connection directly. */ +int isClientMustHandledByMainThread(client *c) { + if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_MASTER | CLIENT_SLAVE | + CLIENT_PUBSUB | CLIENT_MONITOR | CLIENT_BLOCKED | + CLIENT_UNBLOCKED | CLIENT_TRACKING | CLIENT_LUA_DEBUG | + CLIENT_LUA_DEBUG_SYNC)) + { + return 1; + } + return 0; +} + +/* When the main thread accepts a new client or transfers clients to IO threads, + * it assigns the client to the IO thread with the fewest clients. */ +void assignClientToIOThread(client *c) { + serverAssert(c->tid == IOTHREAD_MAIN_THREAD_ID); + /* Find the IO thread with the fewest clients. */ + int min_id = 0; + int min = INT_MAX; + for (int i = 1; i < server.io_threads_num; i++) { + if (server.io_threads_clients_num[i] < min) { + min = server.io_threads_clients_num[i]; + min_id = i; + } + } + + /* Assign the client to the IO thread. */ + server.io_threads_clients_num[c->tid]--; + c->tid = min_id; + c->running_tid = min_id; + server.io_threads_clients_num[min_id]++; + + /* Unbind the client's connection from the main thread event loop, disable + * read and write, and then put it in the list; the main thread will send + * these clients to the IO thread in beforeSleep.
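+ * (see sendPendingClientsToIOThreads, which joins these clients into the + * io thread's pending list under its mutex and fires its notifier).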
*/ + connUnbindEventLoop(c->conn); + c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED); + listAddNodeTail(mainThreadPendingClientsToIOThreads[c->tid], c); +} + +/* When updating the maxclients config, we resize not only the event loop of + * the main thread but also the event loops of all io threads; if resizing + * fails for any thread, the whole operation fails, since an fd can be + * distributed into any IO thread. */ +int resizeAllIOThreadsEventLoops(size_t newsize) { + int result = AE_OK; + if (server.io_threads_num <= 1) return result; + + /* To make the context safe. */ + pauseAllIOThreads(); + for (int i = 1; i < server.io_threads_num; i++) { + IOThread *t = &IOThreads[i]; + if (aeResizeSetSize(t->el, newsize) == AE_ERR) + result = AE_ERR; + } + resumeAllIOThreads(); + return result; +} + +/* In the main thread, we may want to operate on data of io threads, e.g. + * uninstall an event handler, access the query/output buffer or resize the + * event loop; we need a clean and safe context to do that. We pause the io + * thread in IOThreadBeforeSleep, do some jobs and then resume it. To avoid + * the thread being suspended, we use busy waiting to confirm the target + * status. Besides, we use atomic variables to make sure of memory visibility + * and ordering. + * + * Make sure that only the main thread can call these functions: + * - pauseIOThread, resumeIOThread + * - pauseAllIOThreads, resumeAllIOThreads + * - pauseIOThreadsRange, resumeIOThreadsRange + * + * The main thread will pause the io thread, and then wait for the io thread to + * be paused. The io thread will check the paused status in IOThreadBeforeSleep, + * and then pause itself. + * + * The main thread will resume the io thread, and then wait for the io thread to + * be resumed. The io thread will check the paused status in IOThreadBeforeSleep, + * and then resume itself. + */ + +/* We may pause the same io thread in a nested way, so we need to record the + * number of pause calls; we actually pause the io thread only when the count + * rises from 0, and actually resume it only when the count drops back to 0. */ +static int PausedIOThreads[IO_THREADS_MAX_NUM] = {0}; + +/* Pause the specific range of io threads, and wait for them to be paused. */ +void pauseIOThreadsRange(int start, int end) { + if (server.io_threads_num <= 1) return; + serverAssert(start >= 1 && end < server.io_threads_num && start <= end); + serverAssert(pthread_equal(pthread_self(), server.main_thread_id)); + + /* Try to make all io threads paused in parallel */ + for (int i = start; i <= end; i++) { + PausedIOThreads[i]++; + /* Skip if already paused */ + if (PausedIOThreads[i] > 1) continue; + + int paused; + atomicGetWithSync(IOThreads[i].paused, paused); + /* Reentrant pausing is not supported */ + serverAssert(paused == IO_THREAD_UNPAUSED); + atomicSetWithSync(IOThreads[i].paused, IO_THREAD_PAUSING); + /* Just notify the io thread, with no actual job attached, since io + * threads check the paused status in IOThreadBeforeSleep; this only + * wakes it up if it is waiting in the poll. */ + triggerEventNotifier(IOThreads[i].pending_clients_notifier); + } + + /* Wait for all io threads to be paused */ + for (int i = start; i <= end; i++) { + if (PausedIOThreads[i] > 1) continue; + int paused = IO_THREAD_PAUSING; + while (paused != IO_THREAD_PAUSED) { + atomicGetWithSync(IOThreads[i].paused, paused); + } + } +} + +/* Resume the specific range of io threads, and wait for them to be resumed.
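+ * Pause and resume calls nest: a thread is actually resumed only when its + * pause count in PausedIOThreads drops back to zero.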
*/ +void resumeIOThreadsRange(int start, int end) { + if (server.io_threads_num <= 1) return; + serverAssert(start >= 1 && end < server.io_threads_num && start <= end); + serverAssert(pthread_equal(pthread_self(), server.main_thread_id)); + + for (int i = start; i <= end; i++) { + serverAssert(PausedIOThreads[i] > 0); + PausedIOThreads[i]--; + if (PausedIOThreads[i] > 0) continue; + + int paused; + /* Check that it is paused, since we must call 'pause' and + * 'resume' in pairs */ + atomicGetWithSync(IOThreads[i].paused, paused); + serverAssert(paused == IO_THREAD_PAUSED); + /* Resume */ + atomicSetWithSync(IOThreads[i].paused, IO_THREAD_RESUMING); + while (paused != IO_THREAD_UNPAUSED) { + atomicGetWithSync(IOThreads[i].paused, paused); + } + } +} + +/* The IO thread checks whether it is being paused, and if so, it pauses itself + * and waits to be resumed, corresponding to the pause/resumeIOThread* functions. + * Currently, this is only called in IOThreadBeforeSleep, as there are no pending + * I/O events at this point, so we have a clean context. */ +void handlePauseAndResume(IOThread *t) { + int paused; + /* Check if I am being paused. */ + atomicGetWithSync(t->paused, paused); + if (paused == IO_THREAD_PAUSING) { + atomicSetWithSync(t->paused, IO_THREAD_PAUSED); + /* Wait to be resumed */ + while (paused != IO_THREAD_RESUMING) { + atomicGetWithSync(t->paused, paused); + } + atomicSetWithSync(t->paused, IO_THREAD_UNPAUSED); + } +} + +/* Pause the specific io thread, and wait for it to be paused. */ +void pauseIOThread(int id) { + pauseIOThreadsRange(id, id); +} + +/* Resume the specific io thread, and wait for it to be resumed. */ +void resumeIOThread(int id) { + resumeIOThreadsRange(id, id); +} + +/* Pause all io threads, and wait for them to be paused. */ +void pauseAllIOThreads(void) { + pauseIOThreadsRange(1, server.io_threads_num-1); +} + +/* Resume all io threads, and wait for them to be resumed. */ +void resumeAllIOThreads(void) { + resumeIOThreadsRange(1, server.io_threads_num-1); +} + +/* Add the pending clients to the lists of the IO threads, and trigger an + * event to notify the io threads to handle them. */ +int sendPendingClientsToIOThreads(void) { + int processed = 0; + for (int i = 1; i < server.io_threads_num; i++) { + int len = listLength(mainThreadPendingClientsToIOThreads[i]); + if (len > 0) { + IOThread *t = &IOThreads[i]; + pthread_mutex_lock(&t->pending_clients_mutex); + listJoin(t->pending_clients, mainThreadPendingClientsToIOThreads[i]); + pthread_mutex_unlock(&t->pending_clients_mutex); + /* Trigger an event; an error may be returned when the buffer is + * full if using a pipe, but no worry, the io thread will handle + * all clients in the list when it receives a notification. */ + triggerEventNotifier(t->pending_clients_notifier); + } + processed += len; + } + return processed; +} + +extern int ProcessingEventsWhileBlocked; + +/* The main thread processes the clients from IO threads; these clients may have + * a complete command to execute or need to be freed. Note that IO threads never + * free a client, since this operation accesses a lot of server data. + * + * Please note that this function may be called reentrantly; the same goes + * for handleClientsFromIOThread and processClientsOfAllIOThreads. For example, + * when processing a script command, it may call processEventsWhileBlocked to + * process new events; if the clients with fired events are from the same io + * thread, it may call this function reentrantly.
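+ * That is why the loop below unlinks and processes a single client node at + * a time instead of iterating the list with a cached iterator.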
*/ +void processClientsFromIOThread(IOThread *t) { + listNode *node = NULL; + + while (listLength(mainThreadProcessingClients[t->id])) { + /* Each time we pop only the first client to process, to guarantee + * reentrancy safety. */ + if (node) zfree(node); + node = listFirst(mainThreadProcessingClients[t->id]); + listUnlinkNode(mainThreadProcessingClients[t->id], node); + client *c = listNodeValue(node); + + /* Make sure the client is not readable or writable in the io thread, + * to avoid data races. */ + serverAssert(!(c->io_flags & (CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED))); + serverAssert(!(c->flags & CLIENT_CLOSE_ASAP)); + + /* Let the main thread run it; set the running thread id first. */ + c->running_tid = IOTHREAD_MAIN_THREAD_ID; + + /* If a read error occurs, handle it in the main thread first, since we + * want to print logs about client information before freeing. */ + if (c->read_error) handleClientReadError(c); + + /* The client is asked to close in the IO thread. */ + if (c->io_flags & CLIENT_IO_CLOSE_ASAP) { + freeClient(c); + continue; + } + + /* Update the client in the mem usage */ + updateClientMemUsageAndBucket(c); + + /* Process the pending command and input buffer. */ + if (!c->read_error && c->io_flags & CLIENT_IO_PENDING_COMMAND) { + c->flags |= CLIENT_PENDING_COMMAND; + if (processPendingCommandAndInputBuffer(c) == C_ERR) { + /* If the client is no longer valid, it must be freed safely. */ + continue; + } + } + + /* We may have pending replies if the io thread did not finish writing + * the reply to the client, in which case the client was not put in the + * pending write queue. We should do that first, since we may keep the + * client in the main thread instead of returning it to the io threads. */ + if (!(c->flags & CLIENT_PENDING_WRITE) && clientHasPendingReplies(c)) + putClientInPendingWriteQueue(c); + + /* Some clients can only be processed in the main thread, otherwise data + * races will happen, since we may touch the client's data in the main + * thread. */ + if (isClientMustHandledByMainThread(c)) { + keepClientInMainThread(c); + continue; + } + + /* Remove this client from the main thread's pending write clients + * queue. Some clients may not have a reply if CLIENT REPLY OFF/SKIP + * is set. */ + if (c->flags & CLIENT_PENDING_WRITE) { + c->flags &= ~CLIENT_PENDING_WRITE; + listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); + } + c->running_tid = c->tid; + listLinkNodeHead(mainThreadPendingClientsToIOThreads[c->tid], node); + node = NULL; + } + if (node) zfree(node); + + /* Trigger the io thread to handle these clients ASAP, so that they are + * processed in parallel. + * + * If the AOF fsync policy is 'always', we should not let the io thread + * handle these clients now, since we have not flushed the AOF buffer to + * the file and synced yet. So sending these clients to the io threads is + * delayed until beforeSleep, after flushAppendOnlyFile. + * + * If we are in processEventsWhileBlocked, we don't send clients to io threads + * now, as we want to update server.events_processed_while_blocked accurately.
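+ * In both cases the clients stay queued and are handed over to the io + * threads later by sendPendingClientsToIOThreads in beforeSleep.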
*/ + if (listLength(mainThreadPendingClientsToIOThreads[t->id]) && + server.aof_fsync != AOF_FSYNC_ALWAYS && + !ProcessingEventsWhileBlocked) + { + pthread_mutex_lock(&(t->pending_clients_mutex)); + listJoin(t->pending_clients, mainThreadPendingClientsToIOThreads[t->id]); + pthread_mutex_unlock(&(t->pending_clients_mutex)); + triggerEventNotifier(t->pending_clients_notifier); + } +} + +/* When the io thread finishes processing clients with read events, it notifies + * the main thread by triggering an event in IOThreadBeforeSleep. The main + * thread handles the event through this function. */ +void handleClientsFromIOThread(struct aeEventLoop *el, int fd, void *ptr, int mask) { + UNUSED(el); + UNUSED(mask); + + IOThread *t = ptr; + + /* Handle the fd event first. */ + serverAssert(fd == getReadEventFd(mainThreadPendingClientsNotifiers[t->id])); + handleEventNotifier(mainThreadPendingClientsNotifiers[t->id]); + + /* Get the list of clients to process. */ + pthread_mutex_lock(&mainThreadPendingClientsMutexes[t->id]); + listJoin(mainThreadProcessingClients[t->id], mainThreadPendingClients[t->id]); + pthread_mutex_unlock(&mainThreadPendingClientsMutexes[t->id]); + if (listLength(mainThreadProcessingClients[t->id]) == 0) return; + + /* Process the clients from IO threads. */ + processClientsFromIOThread(t); +} + +/* In the new threaded io design, one thread may process multiple clients, so when + * an io thread notifies the main thread of an event, there may be multiple clients + * with commands that need to be processed. But the event handler function + * handleClientsFromIOThread may block while processing a specific command, so the + * previous clients cannot get a reply and the subsequent clients cannot be + * processed; we need to handle this scenario in beforeSleep. This function + * processes the commands of the remaining clients from io threads, and another + * function, sendPendingClientsToIOThreads, makes sure clients from io threads can + * get their replies. See also beforeSleep. */ +void processClientsOfAllIOThreads(void) { + for (int i = 1; i < server.io_threads_num; i++) { + processClientsFromIOThread(&IOThreads[i]); + } +} + +/* After the main thread processes the clients, it sends them back to the io + * threads to handle and fires an event; the io thread handles the event with + * this function. If the client is not bound to the event loop, we should bind + * it first and install the read handler; we don't uninstall the client read + * handler unless the client is being freed. If the client has pending replies, + * we just reply to the client first, and then install the write handler if + * needed. */ +void handleClientsFromMainThread(struct aeEventLoop *ae, int fd, void *ptr, int mask) { + UNUSED(ae); + UNUSED(mask); + + IOThread *t = ptr; + + /* Handle the fd event first. */ + serverAssert(fd == getReadEventFd(t->pending_clients_notifier)); + handleEventNotifier(t->pending_clients_notifier); + + pthread_mutex_lock(&t->pending_clients_mutex); + listJoin(t->processing_clients, t->pending_clients); + pthread_mutex_unlock(&t->pending_clients_mutex); + if (listLength(t->processing_clients) == 0) return; + + listIter li; + listNode *ln; + listRewind(t->processing_clients, &li); + while((ln = listNext(&li))) { + client *c = listNodeValue(ln); + serverAssert(!(c->io_flags & (CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED))); + /* The main thread must handle clients with the CLIENT_CLOSE_ASAP flag, + * since we only set io_flags when clients in an io thread are freed ASAP.
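+ * (see freeClientAsync, which sets CLIENT_IO_CLOSE_ASAP in io_flags for + * clients running in io threads instead of CLIENT_CLOSE_ASAP).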
*/ + serverAssert(!(c->flags & CLIENT_CLOSE_ASAP)); + + /* Link the client in the IO thread clients list first. */ + serverAssert(c->io_thread_client_list_node == NULL); + listAddNodeTail(t->clients, c); + c->io_thread_client_list_node = listLast(t->clients); + + /* The client is asked to close; we just let the main thread free it. */ + if (c->io_flags & CLIENT_IO_CLOSE_ASAP) { + enqueuePendingClientsToMainThread(c, 1); + continue; + } + + /* Enable read and write and reset some flags. */ + c->io_flags |= CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED; + c->io_flags &= ~CLIENT_IO_PENDING_COMMAND; + + /* Only bind once; we never remove the read handler unless freeing the client. */ + if (!connHasEventLoop(c->conn)) { + connRebindEventLoop(c->conn, t->el); + serverAssert(!connHasReadHandler(c->conn)); + connSetReadHandler(c->conn, readQueryFromClient); + } + + /* If the client has pending replies, write the replies to the client. */ + if (clientHasPendingReplies(c)) { + writeToClient(c, 0); + if (!(c->io_flags & CLIENT_IO_CLOSE_ASAP) && clientHasPendingReplies(c)) { + connSetWriteHandler(c->conn, sendReplyToClient); + } + } + } + listEmpty(t->processing_clients); +} + +void IOThreadBeforeSleep(struct aeEventLoop *el) { + IOThread *t = el->privdata[0]; + + /* Handle pending data (typically TLS). */ + connTypeProcessPendingData(el); + + /* If any connection type (typically TLS) still has pending unread data, don't sleep at all. */ + aeSetDontWait(el, connTypeHasPendingData(el)); + + /* Check if I am being paused; if so, pause myself and wait to be resumed. */ + handlePauseAndResume(t); + + /* Check if there are clients to be processed in the main thread, and then + * join them to the main thread's list. */ + if (listLength(t->pending_clients_to_main_thread) > 0) { + pthread_mutex_lock(&mainThreadPendingClientsMutexes[t->id]); + listJoin(mainThreadPendingClients[t->id], t->pending_clients_to_main_thread); + pthread_mutex_unlock(&mainThreadPendingClientsMutexes[t->id]); + /* Trigger an event; an error may be returned when the buffer is full + * if using a pipe, but no worry, the main thread will handle all + * clients in the list when it receives a notification. */ + triggerEventNotifier(mainThreadPendingClientsNotifiers[t->id]); + } +} + +/* The main function of the IO thread; it runs an event loop. The main thread + * and the IO thread communicate through the event notifier. */ +void *IOThreadMain(void *ptr) { + IOThread *t = ptr; + char thdname[16]; + snprintf(thdname, sizeof(thdname), "io_thd_%d", t->id); + redis_set_thread_title(thdname); + redisSetCpuAffinity(server.server_cpulist); + makeThreadKillable(); + aeSetBeforeSleepProc(t->el, IOThreadBeforeSleep); + aeMain(t->el); + return NULL; +} + +/* Initialize the data structures needed for threaded I/O. */ +void initThreadedIO(void) { + if (server.io_threads_num <= 1) return; + + server.io_threads_active = 1; + + if (server.io_threads_num > IO_THREADS_MAX_NUM) { + serverLog(LL_WARNING,"Fatal: too many I/O threads configured. " + "The maximum number is %d.", IO_THREADS_MAX_NUM); + exit(1); + } + + /* Spawn and initialize the I/O threads.
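+ * Each thread gets its own event loop, client lists, mutex and event + * notifier; index 0 is reserved for the main thread itself.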
*/ + for (int i = 1; i < server.io_threads_num; i++) { + IOThread *t = &IOThreads[i]; + t->id = i; + t->el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR); + t->el->privdata[0] = t; + t->pending_clients = listCreate(); + t->processing_clients = listCreate(); + t->pending_clients_to_main_thread = listCreate(); + t->clients = listCreate(); + atomicSetWithSync(t->paused, IO_THREAD_UNPAUSED); + + pthread_mutexattr_t *attr = NULL; + #if defined(__linux__) && defined(__GLIBC__) + attr = zmalloc(sizeof(pthread_mutexattr_t)); + pthread_mutexattr_init(attr); + pthread_mutexattr_settype(attr, PTHREAD_MUTEX_ADAPTIVE_NP); + #endif + pthread_mutex_init(&t->pending_clients_mutex, attr); + + t->pending_clients_notifier = createEventNotifier(); + if (aeCreateFileEvent(t->el, getReadEventFd(t->pending_clients_notifier), + AE_READABLE, handleClientsFromMainThread, t) != AE_OK) + { + serverLog(LL_WARNING, "Fatal: Can't register file event for IO thread notifications."); + exit(1); + } + + /* Create the IO thread */ + if (pthread_create(&t->tid, NULL, IOThreadMain, (void*)t) != 0) { + serverLog(LL_WARNING, "Fatal: Can't initialize IO thread."); + exit(1); + } + + /* For the main thread */ + mainThreadPendingClientsToIOThreads[i] = listCreate(); + mainThreadPendingClients[i] = listCreate(); + mainThreadProcessingClients[i] = listCreate(); + pthread_mutex_init(&mainThreadPendingClientsMutexes[i], attr); + mainThreadPendingClientsNotifiers[i] = createEventNotifier(); + if (aeCreateFileEvent(server.el, getReadEventFd(mainThreadPendingClientsNotifiers[i]), + AE_READABLE, handleClientsFromIOThread, t) != AE_OK) + { + serverLog(LL_WARNING, "Fatal: Can't register file event for main thread notifications."); + exit(1); + } + if (attr) zfree(attr); + } +} + +/* Kill the IO threads. TODO: release the allocated resources. */ +void killIOThreads(void) { + if (server.io_threads_num <= 1) return; + + int err, j; + for (j = 1; j < server.io_threads_num; j++) { + if (IOThreads[j].tid == pthread_self()) continue; + if (IOThreads[j].tid && pthread_cancel(IOThreads[j].tid) == 0) { + if ((err = pthread_join(IOThreads[j].tid,NULL)) != 0) { + serverLog(LL_WARNING, + "IO thread(tid:%lu) can not be joined: %s", + (unsigned long)IOThreads[j].tid, strerror(err)); + } else { + serverLog(LL_WARNING, + "IO thread(tid:%lu) terminated",(unsigned long)IOThreads[j].tid); + } + } + } +} diff --git a/src/multi.c b/src/multi.c index 6d1ba5697..1956c3dd8 100644 --- a/src/multi.c +++ b/src/multi.c @@ -355,7 +355,12 @@ int isWatchedKeyExpired(client *c) { } /* "Touch" a key, so that if this key is being WATCHed by some client the - * next EXEC will fail. */ + * next EXEC will fail. + * + * Sanitizer suppression: IO threads also read c->flags, but never modify + * it or read the CLIENT_DIRTY_CAS bit; only the main thread modifies + * this bit, so there is actually no real data race. */ +REDIS_NO_SANITIZE("thread") void touchWatchedKey(redisDb *db, robj *key) { list *clients; listIter li; @@ -404,6 +409,7 @@ void touchWatchedKey(redisDb *db, robj *key) { * replaced_with: for SWAPDB, the WATCH should be invalidated if * the key exists in either of them, and skipped only if it * doesn't exist in both.
*/ +REDIS_NO_SANITIZE("thread") void touchAllWatchedKeysInDb(redisDb *emptied, redisDb *replaced_with) { listIter li; listNode *ln; diff --git a/src/networking.c b/src/networking.c index 9a9515f77..8fb37af08 100644 --- a/src/networking.c +++ b/src/networking.c @@ -24,7 +24,6 @@ static void setProtocolError(const char *errstr, client *c); static void pauseClientsByClient(mstime_t end, int isPauseClientAll); -int postponeClientRead(client *c); char *getClientSockname(client *c); static inline int clientTypeIsSlave(client *c); int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ @@ -132,6 +131,9 @@ client *createClient(connection *conn) { uint64_t client_id; atomicGetIncr(server.next_client_id, client_id, 1); c->id = client_id; + c->tid = IOTHREAD_MAIN_THREAD_ID; + c->running_tid = IOTHREAD_MAIN_THREAD_ID; + if (conn) server.io_threads_clients_num[c->tid]++; #ifdef LOG_REQ_RES reqresReset(c, 0); c->resp = server.client_default_resp; @@ -163,6 +165,8 @@ client *createClient(connection *conn) { c->bulklen = -1; c->sentlen = 0; c->flags = 0; + c->io_flags = CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED; + c->read_error = 0; c->slot = -1; c->ctime = c->lastinteraction = server.unixtime; c->duration = 0; @@ -195,8 +199,8 @@ client *createClient(connection *conn) { c->peerid = NULL; c->sockname = NULL; c->client_list_node = NULL; + c->io_thread_client_list_node = NULL; c->postponed_list_node = NULL; - c->pending_read_list_node = NULL; c->client_tracking_redirection = 0; c->client_tracking_prefixes = NULL; c->last_memory_usage = 0; @@ -300,13 +304,8 @@ int prepareClientToWrite(client *c) { if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ /* Schedule the client to write the output buffers to the socket, unless - * it should already be setup to do so (it has already pending data). - * - * If CLIENT_PENDING_READ is set, we're in an IO thread and should - * not put the client in pending write queue. Instead, it will be - * done by handleClientsWithPendingReadsUsingThreads() upon return. - */ - if (!clientHasPendingReplies(c) && io_threads_op == IO_THREADS_OP_IDLE) + * it should already be setup to do so (it has already pending data). */ + if (!clientHasPendingReplies(c) && likely(c->running_tid == IOTHREAD_MAIN_THREAD_ID)) putClientInPendingWriteQueue(c); /* Authorize the caller to queue in the output buffer of this client. */ @@ -1359,6 +1358,9 @@ void clientAcceptHandler(connection *conn) { moduleFireServerEvent(REDISMODULE_EVENT_CLIENT_CHANGE, REDISMODULE_SUBEVENT_CLIENT_CHANGE_CONNECTED, c); + + /* Assign the client to an IO thread */ + if (server.io_threads_num > 1) assignClientToIOThread(c); } void acceptCommonHandler(connection *conn, int flags, char *ip) { @@ -1547,14 +1549,6 @@ void unlinkClient(client *c) { c->flags &= ~CLIENT_PENDING_WRITE; } - /* Remove from the list of pending reads if needed. */ - serverAssert(!c->conn || io_threads_op == IO_THREADS_OP_IDLE); - if (c->pending_read_list_node != NULL) { - listDelNode(server.clients_pending_read,c->pending_read_list_node); - c->pending_read_list_node = NULL; - } - - /* When client was just unblocked because of a blocking operation, * remove it from the list of unblocked clients. */ if (c->flags & CLIENT_UNBLOCKED) { @@ -1631,7 +1625,7 @@ void deauthenticateAndCloseClient(client *c) { * If any data remained in the buffer, the client will take ownership of the buffer * and a new empty buffer will be allocated for the reusable buffer. 
*/ static void resetReusableQueryBuf(client *c) { - serverAssert(c->flags & CLIENT_REUSABLE_QUERYBUFFER); + serverAssert(c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER); if (c->querybuf != thread_reusable_qb || sdslen(c->querybuf) > c->qb_pos) { /* If querybuf has been reallocated or there is still data left, * let the client take ownership of the reusable buffer. */ @@ -1645,7 +1639,7 @@ static void resetReusableQueryBuf(client *c) { /* Mark that the client is no longer using the reusable query buffer * and indicate that it is no longer used by any client. */ - c->flags &= ~CLIENT_REUSABLE_QUERYBUFFER; + c->io_flags &= ~CLIENT_IO_REUSABLE_QUERYBUFFER; thread_reusable_qb_used = 0; } @@ -1659,6 +1653,19 @@ void freeClient(client *c) { return; } + /* If the client is running in an io thread, we can't free it directly. */ + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + fetchClientFromIOThread(c); + } + + /* We need to unbind the client's connection from the io thread event loop first. */ + if (c->tid != IOTHREAD_MAIN_THREAD_ID) { + unbindClientFromIOThreadEventLoop(c); + } + + /* Update the number of clients in the IO thread. */ + if (c->conn) server.io_threads_clients_num[c->tid]--; + /* For connected clients, call the disconnection event of modules hooks. */ if (c->conn) { moduleFireServerEvent(REDISMODULE_EVENT_CLIENT_CHANGE, @@ -1703,7 +1710,7 @@ void freeClient(client *c) { } /* Free the query buffer */ - if (c->flags & CLIENT_REUSABLE_QUERYBUFFER) + if (c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER) resetReusableQueryBuf(c); sdsfree(c->querybuf); c->querybuf = NULL; @@ -1816,25 +1823,24 @@ void freeClient(client *c) { * a context where calling freeClient() is not possible, because the client * should be valid for the continuation of the flow of the program. */ void freeClientAsync(client *c) { - /* We need to handle concurrent access to the server.clients_to_close list - * only in the freeClientAsync() function, since it's the only function that - * may access the list while Redis uses I/O threads. All the other accesses - * are in the context of the main thread while the other threads are - * idle. */ + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + int main_thread = pthread_equal(pthread_self(), server.main_thread_id); + /* Make sure the main thread can access IO thread data safely. */ + if (main_thread) pauseIOThread(c->tid); + if (!(c->io_flags & CLIENT_IO_CLOSE_ASAP)) { + c->io_flags |= CLIENT_IO_CLOSE_ASAP; + enqueuePendingClientsToMainThread(c, 1); + } + if (main_thread) resumeIOThread(c->tid); + return; + } + if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_SCRIPT) return; c->flags |= CLIENT_CLOSE_ASAP; /* Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the * replication backlog from been trimmed. */ if (c->flags & CLIENT_SLAVE) freeReplicaReferencedReplBuffer(c); - if (server.io_threads_num == 1) { - /* no need to bother with locking if there's just one thread (the main thread) */ - listAddNodeTail(server.clients_to_close,c); - return; - } - static pthread_mutex_t async_free_queue_mutex = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&async_free_queue_mutex); listAddNodeTail(server.clients_to_close,c); - pthread_mutex_unlock(&async_free_queue_mutex); } /* Log errors for invalid use and free the client in async way.
@@ -1867,7 +1873,7 @@ int beforeNextClient(client *c) { /* Skip the client processing if we're in an IO thread, in that case we'll perform this operation later (this function is called again) in the fan-in stage of the threading mechanism */ - if (io_threads_op != IO_THREADS_OP_IDLE) + if (c && c->running_tid != IOTHREAD_MAIN_THREAD_ID) return C_OK; /* Handle async frees */ /* Note: this doesn't make the server.clients_to_close list redundant because of @@ -2052,8 +2058,12 @@ int _writeToClient(client *c, ssize_t *nwritten) { * set to 0. So when handler_installed is set to 0 the function must be * thread safe. */ int writeToClient(client *c, int handler_installed) { + if (!(c->io_flags & CLIENT_IO_WRITE_ENABLED)) return C_OK; /* Update total number of writes on server */ atomicIncr(server.stat_total_writes_processed, 1); + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + atomicIncr(server.stat_io_writes_processed, 1); + } ssize_t nwritten = 0, totwritten = 0; @@ -2107,7 +2117,7 @@ int writeToClient(client *c, int handler_installed) { * is always called with handler_installed set to 0 from threads * so we are fine. */ if (handler_installed) { - serverAssert(io_threads_op == IO_THREADS_OP_IDLE); + /* IO Thread also can do that now. */ connSetWriteHandler(c->conn, NULL); } @@ -2118,10 +2128,10 @@ int writeToClient(client *c, int handler_installed) { } } /* Update client's memory usage after writing. - * Since this isn't thread safe we do this conditionally. In case of threaded writes this is done in - * handleClientsWithPendingWritesUsingThreads(). */ - if (io_threads_op == IO_THREADS_OP_IDLE) + * Since this isn't thread safe we do this conditionally. */ + if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) { updateClientMemUsageAndBucket(c); + } return C_OK; } @@ -2153,6 +2163,15 @@ int handleClientsWithPendingWrites(void) { /* Don't write to clients that are going to be closed anyway. */ if (c->flags & CLIENT_CLOSE_ASAP) continue; + /* Let IO thread handle the client if possible. */ + if (server.io_threads_num > 1 && + !(c->flags & CLIENT_CLOSE_AFTER_REPLY) && + !isClientMustHandledByMainThread(c)) + { + assignClientToIOThread(c); + continue; + } + /* Try to write buffers to the client socket. */ if (writeToClient(c,0) == C_ERR) continue; @@ -2227,7 +2246,7 @@ void resetClient(client *c) { * path, it is not really released, but only marked for later release. 
*/ void protectClient(client *c) { c->flags |= CLIENT_PROTECTED; - if (c->conn) { + if (c->conn && c->tid == IOTHREAD_MAIN_THREAD_ID) { connSetReadHandler(c->conn,NULL); connSetWriteHandler(c->conn,NULL); } @@ -2238,7 +2257,8 @@ void unprotectClient(client *c) { if (c->flags & CLIENT_PROTECTED) { c->flags &= ~CLIENT_PROTECTED; if (c->conn) { - connSetReadHandler(c->conn,readQueryFromClient); + if (c->tid == IOTHREAD_MAIN_THREAD_ID) + connSetReadHandler(c->conn,readQueryFromClient); if (clientHasPendingReplies(c)) putClientInPendingWriteQueue(c); } } @@ -2263,8 +2283,7 @@ int processInlineBuffer(client *c) { /* Nothing to do without a \r\n */ if (newline == NULL) { if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c,"Protocol error: too big inline request"); - setProtocolError("too big inline request",c); + c->read_error = CLIENT_READ_TOO_BIG_INLINE_REQUEST; } return C_ERR; } @@ -2279,8 +2298,7 @@ int processInlineBuffer(client *c) { argv = sdssplitargs(aux,&argc); sdsfree(aux); if (argv == NULL) { - addReplyError(c,"Protocol error: unbalanced quotes in request"); - setProtocolError("unbalanced quotes in inline request",c); + c->read_error = CLIENT_READ_UNBALANCED_QUOTES; return C_ERR; } @@ -2299,8 +2317,7 @@ int processInlineBuffer(client *c) { * to keep the connection active. */ if (querylen != 0 && c->flags & CLIENT_MASTER) { sdsfreesplitres(argv,argc); - serverLog(LL_WARNING,"WARNING: Receiving inline protocol from master, master stream corruption? Closing the master connection and discarding the cached master."); - setProtocolError("Master using the inline protocol. Desync?",c); + c->read_error = CLIENT_READ_MASTER_USING_INLINE_PROTOCAL; return C_ERR; } @@ -2385,8 +2402,7 @@ int processMultibulkBuffer(client *c) { newline = strchr(c->querybuf+c->qb_pos,'\r'); if (newline == NULL) { if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c,"Protocol error: too big mbulk count string"); - setProtocolError("too big mbulk count string",c); + c->read_error = CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING; } return C_ERR; } @@ -2400,12 +2416,10 @@ int processMultibulkBuffer(client *c) { serverAssertWithInfo(c,NULL,c->querybuf[c->qb_pos] == '*'); ok = string2ll(c->querybuf+1+c->qb_pos,newline-(c->querybuf+1+c->qb_pos),&ll); if (!ok || ll > INT_MAX) { - addReplyError(c,"Protocol error: invalid multibulk length"); - setProtocolError("invalid mbulk count",c); + c->read_error = CLIENT_READ_INVALID_MULTIBUCK_LENGTH; return C_ERR; } else if (ll > 10 && authRequired(c)) { - addReplyError(c, "Protocol error: unauthenticated multibulk length"); - setProtocolError("unauth mbulk count", c); + c->read_error = CLIENT_READ_UNAUTH_MBUCK_COUNT; return C_ERR; } @@ -2432,9 +2446,7 @@ int processMultibulkBuffer(client *c) { newline = strchr(c->querybuf+c->qb_pos,'\r'); if (newline == NULL) { if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c, - "Protocol error: too big bulk count string"); - setProtocolError("too big bulk count string",c); + c->read_error = CLIENT_READ_TOO_BIG_BUCK_COUNT_STRING; return C_ERR; } break; @@ -2445,22 +2457,17 @@ int processMultibulkBuffer(client *c) { break; if (c->querybuf[c->qb_pos] != '$') { - addReplyErrorFormat(c, - "Protocol error: expected '$', got '%c'", - c->querybuf[c->qb_pos]); - setProtocolError("expected $ but got something else",c); + c->read_error = CLIENT_READ_EXPECTED_DOLLAR; return C_ERR; } ok = string2ll(c->querybuf+c->qb_pos+1,newline-(c->querybuf+c->qb_pos+1),&ll); if (!ok || ll < 0 || 
(!(c->flags & CLIENT_MASTER) && ll > server.proto_max_bulk_len)) { - addReplyError(c,"Protocol error: invalid bulk length"); - setProtocolError("invalid bulk length",c); + c->read_error = CLIENT_READ_INVALID_BUCK_LENGTH; return C_ERR; } else if (ll > 16384 && authRequired(c)) { - addReplyError(c, "Protocol error: unauthenticated bulk length"); - setProtocolError("unauth bulk length", c); + c->read_error = CLIENT_READ_UNAUTH_BUCK_LENGTH; return C_ERR; } @@ -2637,6 +2644,74 @@ int processPendingCommandAndInputBuffer(client *c) { return C_OK; } +void handleClientReadError(client *c) { + switch (c->read_error) { + case CLIENT_READ_TOO_BIG_INLINE_REQUEST: + addReplyError(c,"Protocol error: too big inline request"); + setProtocolError("too big inline request",c); + break; + case CLIENT_READ_UNBALANCED_QUOTES: + addReplyError(c,"Protocol error: unbalanced quotes in request"); + setProtocolError("unbalanced quotes in request",c); + break; + case CLIENT_READ_MASTER_USING_INLINE_PROTOCAL: + serverLog(LL_WARNING,"WARNING: Receiving inline protocol from master, master stream corruption? Closing the master connection and discarding the cached master."); + setProtocolError("Master using the inline protocol. Desync?",c); + break; + case CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING: + addReplyError(c,"Protocol error: too big mbulk count string"); + setProtocolError("too big mbulk count string",c); + break; + case CLIENT_READ_TOO_BIG_BUCK_COUNT_STRING: + addReplyError(c, "Protocol error: too big bulk count string"); + setProtocolError("too big bulk count string",c); + break; + case CLIENT_READ_EXPECTED_DOLLAR: + addReplyErrorFormat(c, + "Protocol error: expected '$', got '%c'", + c->querybuf[c->qb_pos]); + setProtocolError("expected $ but got something else",c); + break; + case CLIENT_READ_INVALID_BUCK_LENGTH: + addReplyError(c,"Protocol error: invalid bulk length"); + setProtocolError("invalid bulk length",c); + break; + case CLIENT_READ_UNAUTH_BUCK_LENGTH: + addReplyError(c, "Protocol error: unauthenticated bulk length"); + setProtocolError("unauth bulk length", c); + break; + case CLIENT_READ_INVALID_MULTIBUCK_LENGTH: + addReplyError(c,"Protocol error: invalid multibulk length"); + setProtocolError("invalid mbulk count",c); + break; + case CLIENT_READ_UNAUTH_MBUCK_COUNT: + addReplyError(c, "Protocol error: unauthenticated multibulk length"); + setProtocolError("unauth mbulk count", c); + break; + case CLIENT_READ_CONN_DISCONNECTED: + serverLog(LL_VERBOSE, "Reading from client: %s",connGetLastError(c->conn)); + break; + case CLIENT_READ_CONN_CLOSED: + if (server.verbosity <= LL_VERBOSE) { + sds info = catClientInfoString(sdsempty(), c); + serverLog(LL_VERBOSE, "Client closed connection %s", info); + sdsfree(info); + } + break; + case CLIENT_READ_REACHED_MAX_QUERYBUF: { + sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty(); + bytes = sdscatrepr(bytes,c->querybuf,64); + serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes); + sdsfree(ci); + sdsfree(bytes); + break; + } + default: + serverPanic("Unknown client read error"); + break; + } +} + /* This function is called every time, in the client structure 'c', there is * more query buffer to process, because we read more data from the socket * or because a client was blocked and later reactivated, so there could be @@ -2656,7 +2731,7 @@ int processInputBuffer(client *c) { * condition on the slave. 
We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flags & CLIENT_MASTER) break; + if (c->flags & CLIENT_MASTER && isInsideYieldingLongCommand()) break; /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -2675,23 +2750,34 @@ int processInputBuffer(client *c) { } if (c->reqtype == PROTO_REQ_INLINE) { - if (processInlineBuffer(c) != C_OK) break; + if (processInlineBuffer(c) != C_OK) { + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID && c->read_error) + enqueuePendingClientsToMainThread(c, 0); + break; + } } else if (c->reqtype == PROTO_REQ_MULTIBULK) { - if (processMultibulkBuffer(c) != C_OK) break; + if (processMultibulkBuffer(c) != C_OK) { + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID && c->read_error) + enqueuePendingClientsToMainThread(c, 0); + break; + } } else { serverPanic("Unknown request type"); } /* Multibulk processing could see a <= 0 length. */ if (c->argc == 0) { - resetClientInternal(c, 0); + freeClientArgvInternal(c, 0); + c->reqtype = 0; + c->multibulklen = 0; + c->bulklen = -1; } else { /* If we are in the context of an I/O thread, we can't really * execute the command here. All we can do is to flag the client * as one that needs to process the command. */ - if (io_threads_op != IO_THREADS_OP_IDLE) { - serverAssert(io_threads_op == IO_THREADS_OP_READ); - c->flags |= CLIENT_PENDING_COMMAND; + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + c->io_flags |= CLIENT_IO_PENDING_COMMAND; + enqueuePendingClientsToMainThread(c, 0); break; } @@ -2732,7 +2818,7 @@ int processInputBuffer(client *c) { /* Update client memory usage after processing the query buffer, this is * important in case the query buffer is big and wasn't drained during * the above loop (because of partially sent big commands). */ - if (io_threads_op == IO_THREADS_OP_IDLE) + if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) updateClientMemUsageAndBucket(c); return C_OK; @@ -2742,13 +2828,14 @@ void readQueryFromClient(connection *conn) { client *c = connGetPrivateData(conn); int nread, big_arg = 0; size_t qblen, readlen; - - /* Check if we want to read from the client later when exiting from - * the event loop. This is the case if threaded I/O is enabled. */ - if (postponeClientRead(c)) return; + if (!(c->io_flags & CLIENT_IO_READ_ENABLED)) return; + c->read_error = 0; /* Update total number of reads on server */ atomicIncr(server.stat_total_reads_processed, 1); + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + atomicIncr(server.stat_io_reads_processed, 1); + } readlen = PROTO_IOBUF_LEN; /* If this is a multi bulk request, and we are processing a bulk reply @@ -2793,7 +2880,7 @@ void readQueryFromClient(connection *conn) { /* Assign the reusable query buffer to the client and mark it as in use. 
*/
         serverAssert(sdslen(thread_reusable_qb) == 0);
         c->querybuf = thread_reusable_qb;
-        c->flags |= CLIENT_REUSABLE_QUERYBUFFER;
+        c->io_flags |= CLIENT_IO_REUSABLE_QUERYBUFFER;
         thread_reusable_qb_used = 1;
     }
 }
@@ -2821,16 +2908,12 @@ void readQueryFromClient(connection *conn) {
         if (connGetState(conn) == CONN_STATE_CONNECTED) {
             goto done;
         } else {
-            serverLog(LL_VERBOSE, "Reading from client: %s",connGetLastError(c->conn));
+            c->read_error = CLIENT_READ_CONN_DISCONNECTED;
             freeClientAsync(c);
             goto done;
         }
     } else if (nread == 0) {
-        if (server.verbosity <= LL_VERBOSE) {
-            sds info = catClientInfoString(sdsempty(), c);
-            serverLog(LL_VERBOSE, "Client closed connection %s", info);
-            sdsfree(info);
-        }
+        c->read_error = CLIENT_READ_CONN_CLOSED;
         freeClientAsync(c);
         goto done;
     }
@@ -2853,13 +2936,9 @@ void readQueryFromClient(connection *conn) {
      *
      * For unauthenticated clients, the query buffer cannot exceed 1MB at most. */
         (c->mstate.argv_len_sums + sdslen(c->querybuf) > server.client_max_querybuf_len ||
-        (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024*1024 && authRequired(c)))) {
-        sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty();
-
-        bytes = sdscatrepr(bytes,c->querybuf,64);
-        serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes);
-        sdsfree(ci);
-        sdsfree(bytes);
+        (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024*1024 && authRequired(c))))
+    {
+        c->read_error = CLIENT_READ_REACHED_MAX_QUERYBUF;
         freeClientAsync(c);
         atomicIncr(server.stat_client_qbuf_limit_disconnections, 1);
         goto done;
@@ -2871,7 +2950,13 @@ void readQueryFromClient(connection *conn) {
     c = NULL;

 done:
-    if (c && (c->flags & CLIENT_REUSABLE_QUERYBUFFER)) {
+    if (c && c->read_error) {
+        if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) {
+            handleClientReadError(c);
+        }
+    }
+
+    if (c && (c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER)) {
         serverAssert(c->qb_pos == 0); /* Ensure the client's query buffer is trimmed in processInputBuffer */
         resetReusableQueryBuf(c);
     }
@@ -2933,6 +3018,16 @@ char *getClientSockname(client *c) {
 sds catClientInfoString(sds s, client *client) {
     char flags[17], events[3], conninfo[CONN_INFO_LEN], *p;

+    /* Pause the IO thread so this client's data can be accessed safely. */
+    int paused = 0;
+    if (client->running_tid != IOTHREAD_MAIN_THREAD_ID &&
+        pthread_equal(server.main_thread_id, pthread_self()) &&
+        !server.crashing)
+    {
+        paused = 1;
+        pauseIOThread(client->running_tid);
+    }
+
     p = flags;
     if (client->flags & CLIENT_SLAVE) {
         if (client->flags & CLIENT_MONITOR)
@@ -3006,7 +3101,10 @@ sds catClientInfoString(sds s, client *client) {
         " redir=%I", (client->flags & CLIENT_TRACKING) ? (long long) client->client_tracking_redirection : -1,
         " resp=%i", client->resp,
         " lib-name=%s", client->lib_name ? (char*)client->lib_name->ptr : "",
-        " lib-ver=%s", client->lib_ver ? (char*)client->lib_ver->ptr : ""));
+        " lib-ver=%s", client->lib_ver ? (char*)client->lib_ver->ptr : "",
+        " io-thread=%i", client->tid));
+
+    if (paused) resumeIOThread(client->running_tid);

     return ret;
 }
@@ -3016,6 +3114,17 @@ sds getAllClientsInfoString(int type) {
     client *client;
     sds o = sdsnewlen(SDS_NOINIT,200*listLength(server.clients));
     sdsclear(o);
+
+    /* Pause all IO threads so clients' data can be accessed safely; with all
+     * threads already paused, catClientInfoString() does not have to pause
+     * each client's IO thread over and over again.
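catClientInfoString() relies on pauseIOThread()/resumeIOThread() to freeze the thread that owns the client before touching its fields. One way to model such a cooperative pause gate with C11 atomics; this is only a sketch under the assumption that the worker polls a safe point between clients, not the Redis implementation:

```
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int pause_req, paused, done;

static void safe_point(void) {
    if (atomic_load(&pause_req)) {
        atomic_store(&paused, 1);                 /* acknowledge the pause */
        while (atomic_load(&pause_req)) usleep(100);
        atomic_store(&paused, 0);
    }
}

static void *io_thread(void *arg) {
    (void)arg;
    while (!atomic_load(&done)) {
        safe_point();          /* between clients is the only safe spot */
        usleep(100);           /* ... serve clients ... */
    }
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, io_thread, NULL);

    atomic_store(&pause_req, 1);                  /* pauseIOThread() analogue */
    while (!atomic_load(&paused)) usleep(100);    /* wait for the ack */
    puts("thread parked: its clients can be inspected safely");
    atomic_store(&pause_req, 0);                  /* resumeIOThread() analogue */

    atomic_store(&done, 1);
    pthread_join(t, NULL);
    return 0;
}
```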
*/ + int allpaused = 0; + if (server.io_threads_num > 1 && !server.crashing && + pthread_equal(server.main_thread_id, pthread_self())) + { + allpaused = 1; + pauseAllIOThreads(); + } + listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { client = listNodeValue(ln); @@ -3023,6 +3132,8 @@ sds getAllClientsInfoString(int type) { o = catClientInfoString(o,client); o = sdscatlen(o,"\n",1); } + + if (allpaused) resumeAllIOThreads(); return o; } @@ -4331,388 +4442,6 @@ void processEventsWhileBlocked(void) { server.cmd_time_snapshot = prev_cmd_time_snapshot; } -/* ========================================================================== - * Threaded I/O - * ========================================================================== */ - -#define IO_THREADS_MAX_NUM 128 - -typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending { - redisAtomic unsigned long value; -} threads_pending; - -pthread_t io_threads[IO_THREADS_MAX_NUM]; -pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; -threads_pending io_threads_pending[IO_THREADS_MAX_NUM]; -int io_threads_op; /* IO_THREADS_OP_IDLE, IO_THREADS_OP_READ or IO_THREADS_OP_WRITE. */ // TODO: should access to this be atomic??! - -/* This is the list of clients each thread will serve when threaded I/O is - * used. We spawn io_threads_num-1 threads, since one is the main thread - * itself. */ -list *io_threads_list[IO_THREADS_MAX_NUM]; - -static inline unsigned long getIOPendingCount(int i) { - unsigned long count = 0; - atomicGetWithSync(io_threads_pending[i].value, count); - return count; -} - -static inline void setIOPendingCount(int i, unsigned long count) { - atomicSetWithSync(io_threads_pending[i].value, count); -} - -void *IOThreadMain(void *myid) { - /* The ID is the thread number (from 0 to server.io_threads_num-1), and is - * used by the thread to just manipulate a single sub-array of clients. */ - long id = (unsigned long)myid; - char thdname[16]; - - snprintf(thdname, sizeof(thdname), "io_thd_%ld", id); - redis_set_thread_title(thdname); - redisSetCpuAffinity(server.server_cpulist); - makeThreadKillable(); - - while(1) { - /* Wait for start */ - for (int j = 0; j < 1000000; j++) { - if (getIOPendingCount(id) != 0) break; - } - - /* Give the main thread a chance to stop this thread. */ - if (getIOPendingCount(id) == 0) { - pthread_mutex_lock(&io_threads_mutex[id]); - pthread_mutex_unlock(&io_threads_mutex[id]); - continue; - } - - serverAssert(getIOPendingCount(id) != 0); - - /* Process: note that the main thread will never touch our list - * before we drop the pending count to 0. */ - listIter li; - listNode *ln; - listRewind(io_threads_list[id],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - if (io_threads_op == IO_THREADS_OP_WRITE) { - writeToClient(c,0); - } else if (io_threads_op == IO_THREADS_OP_READ) { - readQueryFromClient(c->conn); - } else { - serverPanic("io_threads_op value is unknown"); - } - } - listEmpty(io_threads_list[id]); - setIOPendingCount(id, 0); - } -} - -/* Initialize the data structures needed for threaded I/O. */ -void initThreadedIO(void) { - server.io_threads_active = 0; /* We start with threads not active. */ - - /* Indicate that io-threads are currently idle */ - io_threads_op = IO_THREADS_OP_IDLE; - - /* Don't spawn any thread if the user selected a single thread: - * we'll handle I/O directly from the main thread. 
*/ - if (server.io_threads_num == 1) return; - - if (server.io_threads_num > IO_THREADS_MAX_NUM) { - serverLog(LL_WARNING,"Fatal: too many I/O threads configured. " - "The maximum number is %d.", IO_THREADS_MAX_NUM); - exit(1); - } - - /* Spawn and initialize the I/O threads. */ - for (int i = 0; i < server.io_threads_num; i++) { - /* Things we do for all the threads including the main thread. */ - io_threads_list[i] = listCreate(); - if (i == 0) continue; /* Thread 0 is the main thread. */ - - /* Things we do only for the additional threads. */ - pthread_t tid; - pthread_mutex_init(&io_threads_mutex[i],NULL); - setIOPendingCount(i, 0); - pthread_mutex_lock(&io_threads_mutex[i]); /* Thread will be stopped. */ - if (pthread_create(&tid,NULL,IOThreadMain,(void*)(long)i) != 0) { - serverLog(LL_WARNING,"Fatal: Can't initialize IO thread."); - exit(1); - } - io_threads[i] = tid; - } -} - -void killIOThreads(void) { - int err, j; - for (j = 0; j < server.io_threads_num; j++) { - if (io_threads[j] == pthread_self()) continue; - if (io_threads[j] && pthread_cancel(io_threads[j]) == 0) { - if ((err = pthread_join(io_threads[j],NULL)) != 0) { - serverLog(LL_WARNING, - "IO thread(tid:%lu) can not be joined: %s", - (unsigned long)io_threads[j], strerror(err)); - } else { - serverLog(LL_WARNING, - "IO thread(tid:%lu) terminated",(unsigned long)io_threads[j]); - } - } - } -} - -void startThreadedIO(void) { - serverAssert(server.io_threads_active == 0); - for (int j = 1; j < server.io_threads_num; j++) - pthread_mutex_unlock(&io_threads_mutex[j]); - server.io_threads_active = 1; -} - -void stopThreadedIO(void) { - /* We may have still clients with pending reads when this function - * is called: handle them before stopping the threads. */ - handleClientsWithPendingReadsUsingThreads(); - serverAssert(server.io_threads_active == 1); - for (int j = 1; j < server.io_threads_num; j++) - pthread_mutex_lock(&io_threads_mutex[j]); - server.io_threads_active = 0; -} - -/* This function checks if there are not enough pending clients to justify - * taking the I/O threads active: in that case I/O threads are stopped if - * currently active. We track the pending writes as a measure of clients - * we need to handle in parallel, however the I/O threading is disabled - * globally for reads as well if we have too little pending clients. - * - * The function returns 0 if the I/O threading should be used because there - * are enough active threads, otherwise 1 is returned and the I/O threads - * could be possibly stopped (if already active) as a side effect. */ -int stopThreadedIOIfNeeded(void) { - int pending = listLength(server.clients_pending_write); - - /* Return ASAP if IO threads are disabled (single threaded mode). */ - if (server.io_threads_num == 1) return 1; - - if (pending < (server.io_threads_num*2)) { - if (server.io_threads_active) stopThreadedIO(); - return 1; - } else { - return 0; - } -} - -/* This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. */ -int handleClientsWithPendingWritesUsingThreads(void) { - int processed = listLength(server.clients_pending_write); - if (processed == 0) return 0; /* Return ASAP if there are no clients. 
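For contrast with the new per-thread event loops, the legacy model being removed above parks idle I/O threads on a mutex: the main thread keeps io_threads_mutex[id] locked to stop thread id, and unlocks it to let the thread spin again. The idiom in isolation (an illustrative standalone demo, not the removed code itself):

```
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t gate = PTHREAD_MUTEX_INITIALIZER;

static void *io_thread(void *arg) {
    (void)arg;
    for (int round = 0; round < 3; round++) {
        /* Briefly take the gate: blocks here while the main thread holds it. */
        pthread_mutex_lock(&gate);
        pthread_mutex_unlock(&gate);
        printf("round %d: running\n", round);
        usleep(10000);
    }
    return NULL;
}

int main(void) {
    pthread_mutex_lock(&gate);            /* start with the thread parked */
    pthread_t t;
    pthread_create(&t, NULL, io_thread, NULL);
    usleep(50000);
    puts("releasing gate");
    pthread_mutex_unlock(&gate);          /* startThreadedIO() equivalent */
    pthread_join(t, NULL);
    return 0;
}
```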
*/ - - /* If I/O threads are disabled or we have few clients to serve, don't - * use I/O threads, but the boring synchronous code. */ - if (server.io_threads_num == 1 || stopThreadedIOIfNeeded()) { - return handleClientsWithPendingWrites(); - } - - /* Start threads if needed. */ - if (!server.io_threads_active) startThreadedIO(); - - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_write,&li); - int item_id = 0; - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - c->flags &= ~CLIENT_PENDING_WRITE; - - /* Remove clients from the list of pending writes since - * they are going to be closed ASAP. */ - if (c->flags & CLIENT_CLOSE_ASAP) { - listUnlinkNode(server.clients_pending_write, ln); - continue; - } - - /* Since all replicas and replication backlog use global replication - * buffer, to guarantee data accessing thread safe, we must put all - * replicas client into io_threads_list[0] i.e. main thread handles - * sending the output buffer of all replicas. */ - if (unlikely(clientTypeIsSlave(c))) { - listAddNodeTail(io_threads_list[0],c); - continue; - } - - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id],c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_WRITE; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - writeToClient(c,0); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while(1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) - pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to install the write handler where - * needed. */ - listRewind(server.clients_pending_write,&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - - /* Update the client in the mem usage after we're done processing it in the io-threads */ - updateClientMemUsageAndBucket(c); - - /* Install the write handler if there are pending writes in some - * of the clients. */ - if (clientHasPendingReplies(c)) { - installClientWriteHandler(c); - } - } - while(listLength(server.clients_pending_write) > 0) { - listUnlinkNode(server.clients_pending_write, server.clients_pending_write->head); - } - - /* Update processed count on server */ - server.stat_io_writes_processed += processed; - - return processed; -} - -/* Return 1 if we want to handle the client read later using threaded I/O. - * This is called by the readable handler of the event loop. - * As a side effect of calling this function the client is put in the - * pending read clients and flagged as such. 
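The comment above spells out the old fan-out/fan-in handshake: setIOPendingCount() publishes work to each thread, and the main thread spins until every per-thread counter drops back to zero. Reduced to its essentials with C11 atomics (a sketch of the paradigm, not the removed implementation):

```
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define WORKERS 4
static atomic_ulong pending[WORKERS];

static void *worker(void *arg) {
    int id = (int)(long)arg;
    while (atomic_load(&pending[id]) == 0) ;     /* wait for the start signal */
    /* ... drain this worker's client list here ... */
    atomic_store(&pending[id], 0);               /* signal completion */
    return NULL;
}

int main(void) {
    pthread_t tids[WORKERS];
    for (long i = 0; i < WORKERS; i++)
        pthread_create(&tids[i], NULL, worker, (void *)i);

    for (int i = 0; i < WORKERS; i++)            /* fan out */
        atomic_store(&pending[i], 8 /* jobs for worker i */);

    for (;;) {                                   /* fan in */
        unsigned long left = 0;
        for (int i = 0; i < WORKERS; i++) left += atomic_load(&pending[i]);
        if (left == 0) break;
    }
    puts("all workers drained");
    for (int i = 0; i < WORKERS; i++) pthread_join(tids[i], NULL);
    return 0;
}
```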
*/ -int postponeClientRead(client *c) { - if (server.io_threads_active && - server.io_threads_do_reads && - !ProcessingEventsWhileBlocked && - !(c->flags & (CLIENT_MASTER|CLIENT_SLAVE|CLIENT_BLOCKED)) && - io_threads_op == IO_THREADS_OP_IDLE) - { - listAddNodeHead(server.clients_pending_read,c); - c->pending_read_list_node = listFirst(server.clients_pending_read); - return 1; - } else { - return 0; - } -} - -/* When threaded I/O is also enabled for the reading + parsing side, the - * readable handler will just put normal clients into a queue of clients to - * process (instead of serving them synchronously). This function runs - * the queue using the I/O threads, and process them in order to accumulate - * the reads in the buffers, and also parse the first command available - * rendering it in the client structures. - * This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. */ -int handleClientsWithPendingReadsUsingThreads(void) { - if (!server.io_threads_active || !server.io_threads_do_reads) return 0; - int processed = listLength(server.clients_pending_read); - if (processed == 0) return 0; - - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_read,&li); - int item_id = 0; - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id],c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_READ; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - readQueryFromClient(c->conn); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while(1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) - pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to process the new buffers. */ - while(listLength(server.clients_pending_read)) { - ln = listFirst(server.clients_pending_read); - client *c = listNodeValue(ln); - listDelNode(server.clients_pending_read,ln); - c->pending_read_list_node = NULL; - - serverAssert(!(c->flags & CLIENT_BLOCKED)); - - if (beforeNextClient(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. */ - continue; - } - - /* Once io-threads are idle we can update the client in the mem usage */ - updateClientMemUsageAndBucket(c); - - if (processPendingCommandAndInputBuffer(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. */ - continue; - } - - /* We may have pending replies if a thread readQueryFromClient() produced - * replies and did not put the client in pending write queue (it can't). 
- */
-        if (!(c->flags & CLIENT_PENDING_WRITE) && clientHasPendingReplies(c))
-            putClientInPendingWriteQueue(c);
-    }
-
-    /* Update processed count on server */
-    server.stat_io_reads_processed += processed;
-
-    return processed;
-}
-
 /* Returns the actual client eviction limit based on current configuration or
  * 0 if no limit. */
 size_t getClientEvictionLimit(void) {
@@ -4752,11 +4481,34 @@ void evictClients(void) {
             listNode *ln = listNext(&bucket_iter);
             if (ln) {
                 client *c = ln->value;
-                sds ci = catClientInfoString(sdsempty(),c);
-                serverLog(LL_NOTICE, "Evicting client: %s", ci);
-                freeClient(c);
-                sdsfree(ci);
-                server.stat_evictedclients++;
+                size_t last_memory = c->last_memory_usage;
+                int tid = c->running_tid;
+                if (tid != IOTHREAD_MAIN_THREAD_ID) {
+                    pauseIOThread(tid);
+                    /* If the client is running in an IO thread, update its
+                     * memory usage and bucket again before evicting. Both are
+                     * updated only in the main thread (e.g. while processing
+                     * commands or in clientsCron), so they may be stale. To
+                     * avoid evicting a client incorrectly, we re-measure here
+                     * and evict only if the memory used by the client did not
+                     * decrease or its memory usage bucket did not change. */
+                    updateClientMemUsageAndBucket(c);
+                }
+                if (c->last_memory_usage >= last_memory ||
+                    c->mem_usage_bucket == &server.client_mem_usage_buckets[curr_bucket])
+                {
+                    sds ci = catClientInfoString(sdsempty(),c);
+                    serverLog(LL_NOTICE, "Evicting client: %s", ci);
+                    freeClient(c);
+                    sdsfree(ci);
+                    server.stat_evictedclients++;
+                }
+                if (tid != IOTHREAD_MAIN_THREAD_ID) {
+                    resumeIOThread(tid);
+                    /* The 'next' of 'bucket_iter' may have changed after
+                     * updating the client's memory usage and freeing the
+                     * client, so reset 'bucket_iter'. */
+                    listRewind(server.client_mem_usage_buckets[curr_bucket].clients, &bucket_iter);
+                }
             } else {
                 curr_bucket--;
                 if (curr_bucket < 0) {
diff --git a/src/replication.c b/src/replication.c
index abf930e61..79a55d39b 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2925,7 +2925,7 @@ write_error: /* Handle sendCommand() errors. */
 }

 int connectWithMaster(void) {
-    server.repl_transfer_s = connCreate(connTypeOfReplication());
+    server.repl_transfer_s = connCreate(server.el, connTypeOfReplication());
     if (connConnect(server.repl_transfer_s, server.masterhost, server.masterport,
                 server.bind_source_addr, syncWithMaster) == C_ERR) {
         serverLog(LL_WARNING,"Unable to connect to MASTER: %s",
diff --git a/src/server.c b/src/server.c
index 4b729fede..0b4c95ce8 100644
--- a/src/server.c
+++ b/src/server.c
@@ -963,7 +963,7 @@ void removeClientFromMemUsageBucket(client *c, int allow_eviction) {
  * returns 1 if client eviction for this client is allowed, 0 otherwise. */
 int updateClientMemUsageAndBucket(client *c) {
-    serverAssert(io_threads_op == IO_THREADS_OP_IDLE && c->conn);
+    serverAssert(pthread_equal(pthread_self(), server.main_thread_id) && c->conn);
     int allow_eviction = clientEvictionAllowed(c);
     removeClientFromMemUsageBucket(c, allow_eviction);
@@ -1015,6 +1015,7 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) {
  * default server.hz value is 10, so sometimes here we need to process thousands
  * of clients per second, turning this function into a source of latency.
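The eviction path above follows a "pause, re-measure, then decide" discipline so a stale memory reading cannot evict the wrong client. Schematically, with all types and helpers below being hypothetical stand-ins for the Redis ones:

```
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cli { int running_tid; size_t last_memory_usage; };

/* Hypothetical stand-ins for pauseIOThread(), resumeIOThread() and the
 * refresh performed by updateClientMemUsageAndBucket(). */
static void pause_thread(int tid)  { printf("pause io thread %d\n", tid); }
static void resume_thread(int tid) { printf("resume io thread %d\n", tid); }
static void refresh_mem_usage(struct cli *c) { (void)c; /* re-measure here */ }

static bool try_evict(struct cli *c) {
    size_t before = c->last_memory_usage;
    int tid = c->running_tid;
    bool io_owned = (tid != 0);        /* 0 = main thread, as in the patch */
    if (io_owned) {
        pause_thread(tid);             /* freeze the owner first... */
        refresh_mem_usage(c);          /* ...then refresh the stale numbers */
    }
    bool evicted = false;
    if (c->last_memory_usage >= before) {   /* still the right victim? */
        puts("evicting client");
        evicted = true;
    }
    if (io_owned) resume_thread(tid);
    return evicted;
}

int main(void) {
    struct cli c = { .running_tid = 2, .last_memory_usage = 1 << 20 };
    return try_evict(&c) ? 0 : 1;
}
```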
*/
+#define CLIENTS_CRON_PAUSE_IOTHREAD 8
 #define CLIENTS_CRON_MIN_ITERATIONS 5
 void clientsCron(void) {
     /* Try to process at least numclients/server.hz of clients
@@ -1049,6 +1050,15 @@ void clientsCron(void) {
     ClientsPeakMemInput[zeroidx] = 0;
     ClientsPeakMemOutput[zeroidx] = 0;

+    /* Pause the IO threads that are processing clients, so we can access
+     * those clients safely. To avoid driving up CPU usage by pausing all
+     * threads at once when many IO threads are configured, we pause them in
+     * multiple batches. */
+    static int start = 1, end = 0;
+    if (server.io_threads_num >= 1 && listLength(server.clients) > 0) {
+        end = start + CLIENTS_CRON_PAUSE_IOTHREAD - 1;
+        if (end >= server.io_threads_num) end = server.io_threads_num - 1;
+        pauseIOThreadsRange(start, end);
+    }

     while(listLength(server.clients) && iterations--) {
         client *c;
@@ -1059,6 +1069,15 @@ void clientsCron(void) {
         head = listFirst(server.clients);
         c = listNodeValue(head);
         listRotateHeadToTail(server.clients);
+
+        if (c->running_tid != IOTHREAD_MAIN_THREAD_ID &&
+            !(c->running_tid >= start && c->running_tid <= end))
+        {
+            /* Skip clients that are being processed by IO threads that are
+             * not currently paused. */
+            continue;
+        }
+
         /* The following functions do different service checks on the client.
          * The protocol is that they return non-zero if the client was
          * terminated. */
@@ -1080,6 +1099,14 @@ void clientsCron(void) {
             if (closeClientOnOutputBufferLimitReached(c, 0)) continue;
     }
+
+    /* Resume the IO threads that were paused. */
+    if (end) {
+        resumeIOThreadsRange(start, end);
+        start = end + 1;
+        if (start >= server.io_threads_num) start = 1;
+        end = 0;
+    }
 }

 /* This function handles 'background' operations we are required to do
@@ -1528,9 +1555,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
         migrateCloseTimedoutSockets();
     }

-    /* Stop the I/O threads if we don't have enough pending work. */
-    stopThreadedIOIfNeeded();
-
     /* Resize tracking keys table if needed. This is also done at every
      * command execution, but we want to be sure that if the last command
      * executed changes the value via CONFIG SET, the server will perform
@@ -1682,24 +1706,28 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
      * events to handle. */
     if (ProcessingEventsWhileBlocked) {
         uint64_t processed = 0;
-        processed += handleClientsWithPendingReadsUsingThreads();
-        processed += connTypeProcessPendingData();
+        processed += connTypeProcessPendingData(server.el);
         if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE)
             flushAppendOnlyFile(0);
         processed += handleClientsWithPendingWrites();
         processed += freeClientsInAsyncFreeQueue();
+
+        /* Let the clients accumulated during the blocking call be processed. */
+        processClientsOfAllIOThreads();
+        /* New connections may have been established while blocked, and clients
+         * handled by IO threads may have replies to write; ensure they are
+         * promptly sent to the IO threads. */
+        processed += sendPendingClientsToIOThreads();
+
         server.events_processed_while_blocked += processed;
         return;
     }

-    /* We should handle pending reads clients ASAP after event loop. */
-    handleClientsWithPendingReadsUsingThreads();
-
     /* Handle pending data(typical TLS). (must be done before flushAppendOnlyFile) */
-    connTypeProcessPendingData();
+    connTypeProcessPendingData(server.el);

     /* If any connection type(typical TLS) still has pending unread data don't sleep at all. */
-    int dont_sleep = connTypeHasPendingData();
+    int dont_sleep = connTypeHasPendingData(server.el);

     /* Call the Redis Cluster before sleep function.
Note that this function
      * may change the state of Redis Cluster (from ok to fail or vice versa),
@@ -1765,8 +1793,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     long long prev_fsynced_reploff = server.fsynced_reploff;

     /* Write the AOF buffer on disk,
-     * must be done before handleClientsWithPendingWritesUsingThreads,
-     * in case of appendfsync=always. */
+     * must be done before handleClientsWithPendingWrites and
+     * sendPendingClientsToIOThreads, in case of appendfsync=always. */
     if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE)
         flushAppendOnlyFile(0);
@@ -1788,7 +1816,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     }

     /* Handle writes with pending output buffers. */
-    handleClientsWithPendingWritesUsingThreads();
+    handleClientsWithPendingWrites();
+
+    /* Let the IO threads handle their pending clients. */
+    sendPendingClientsToIOThreads();

     /* Record cron time in beforeSleep. This does not include the time consumed by AOF writing and IO writing above. */
     monotime cron_start_time_after_write = getMonotonicUs();
@@ -2117,6 +2148,7 @@ void initServerConfig(void) {
     memset(server.blocked_clients_by_type,0,
            sizeof(server.blocked_clients_by_type));
     server.shutdown_asap = 0;
+    server.crashing = 0;
     server.shutdown_flags = 0;
     server.shutdown_mstime = 0;
     server.cluster_module_flags = CLUSTER_MODULE_FLAG_NONE;
@@ -2583,9 +2615,9 @@ void resetServerStats(void) {
     server.stat_sync_full = 0;
     server.stat_sync_partial_ok = 0;
     server.stat_sync_partial_err = 0;
-    server.stat_io_reads_processed = 0;
+    atomicSet(server.stat_io_reads_processed, 0);
     atomicSet(server.stat_total_reads_processed, 0);
-    server.stat_io_writes_processed = 0;
+    atomicSet(server.stat_io_writes_processed, 0);
     atomicSet(server.stat_total_writes_processed, 0);
     atomicSet(server.stat_client_qbuf_limit_disconnections, 0);
     server.stat_client_outbuf_limit_disconnections = 0;
@@ -2778,6 +2810,7 @@ void initServer(void) {
     server.aof_last_write_errno = 0;
     server.repl_good_slaves_count = 0;
     server.last_sig_received = 0;
+    memset(server.io_threads_clients_num, 0, sizeof(server.io_threads_clients_num));

     /* Initiate acl info struct */
     server.acl_info.invalid_cmd_accesses = 0;
@@ -5535,7 +5568,7 @@ void releaseInfoSectionDict(dict *sec) {
 * The resulting dictionary should be released with releaseInfoSectionDict. */
 dict *genInfoSectionDict(robj **argv, int argc, char **defaults, int *out_all, int *out_everything) {
     char *default_sections[] = {
-        "server", "clients", "memory", "persistence", "stats", "replication",
+        "server", "clients", "memory", "persistence", "stats", "replication", "threads",
         "cpu", "module_list", "errorstats", "cluster", "keyspace", "keysizes", NULL};
     if (!defaults) defaults = default_sections;
@@ -5886,6 +5919,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
         long long current_active_defrag_time = server.stat_last_active_defrag_time ?
(long long) elapsedUs(server.stat_last_active_defrag_time): 0;
         long long stat_client_qbuf_limit_disconnections;
+        long long stat_io_reads_processed, stat_io_writes_processed;
         atomicGet(server.stat_total_reads_processed, stat_total_reads_processed);
         atomicGet(server.stat_total_writes_processed, stat_total_writes_processed);
         atomicGet(server.stat_net_input_bytes, stat_net_input_bytes);
@@ -5893,6 +5927,8 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
         atomicGet(server.stat_net_repl_input_bytes, stat_net_repl_input_bytes);
         atomicGet(server.stat_net_repl_output_bytes, stat_net_repl_output_bytes);
         atomicGet(server.stat_client_qbuf_limit_disconnections, stat_client_qbuf_limit_disconnections);
+        atomicGet(server.stat_io_reads_processed, stat_io_reads_processed);
+        atomicGet(server.stat_io_writes_processed, stat_io_writes_processed);

         if (sections++) info = sdscat(info,"\r\n");
         info = sdscatprintf(info, "# Stats\r\n" FMTARGS(
@@ -5944,8 +5980,8 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
             "dump_payload_sanitizations:%lld\r\n", server.stat_dump_payload_sanitizations,
             "total_reads_processed:%lld\r\n", stat_total_reads_processed,
             "total_writes_processed:%lld\r\n", stat_total_writes_processed,
-            "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed,
-            "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed,
+            "io_threaded_reads_processed:%lld\r\n", stat_io_reads_processed,
+            "io_threaded_writes_processed:%lld\r\n", stat_io_writes_processed,
             "client_query_buffer_limit_disconnections:%lld\r\n", stat_client_qbuf_limit_disconnections,
             "client_output_buffer_limit_disconnections:%lld\r\n", server.stat_client_outbuf_limit_disconnections,
             "reply_buffer_shrinks:%lld\r\n", server.stat_reply_buffer_shrinks,
@@ -6094,6 +6130,15 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
 #endif /* RUSAGE_THREAD */
     }

+    /* Threads */
+    if (all_sections || (dictFind(section_dict,"threads") != NULL)) {
+        if (sections++) info = sdscat(info,"\r\n");
+        info = sdscatprintf(info, "# Threads\r\n");
+        for (j = 0; j < server.io_threads_num; j++) {
+            info = sdscatprintf(info, "io_thread_%d:clients=%d\r\n", j, server.io_threads_clients_num[j]);
+        }
+    }
+
     /* Modules */
     if (all_sections || (dictFind(section_dict,"module_list") != NULL) || (dictFind(section_dict,"modules") != NULL)) {
         if (sections++) info = sdscat(info,"\r\n");
diff --git a/src/server.h b/src/server.h
index 205d73c68..bc965999e 100644
--- a/src/server.h
+++ b/src/server.h
@@ -61,6 +61,7 @@ typedef long long ustime_t; /* microsecond time type. */
                                N-elements flat arrays */
 #include "rax.h"     /* Radix tree */
 #include "connection.h" /* Connection abstraction */
+#include "eventnotifier.h" /* Event notification */

 #define REDISMODULE_CORE 1
 typedef struct redisObject robj;
@@ -184,6 +185,14 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
 /* Hash table parameters */
 #define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* Maximum hash table load factor. */

+/* Max number of IO threads */
+#define IO_THREADS_MAX_NUM 128
+
+/* Main thread ID for doing IO work. Whether IO threads are enabled or not,
+ * the main thread always does some IO work, so it can be considered as
+ * IO thread 0. */
+#define IOTHREAD_MAIN_THREAD_ID 0
+
 /* Command flags. Please check the definition of struct redisCommand in this file
  * for more information about the meaning of every flag.
*/
 #define CMD_WRITE (1ULL<<0)
@@ -385,11 +394,33 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
 #define CLIENT_MODULE_PREVENT_AOF_PROP (1ULL<<48) /* Module client do not want to propagate to AOF */
 #define CLIENT_MODULE_PREVENT_REPL_PROP (1ULL<<49) /* Module client do not want to propagate to replica */
 #define CLIENT_REPROCESSING_COMMAND (1ULL<<50) /* The client is re-processing the command. */
-#define CLIENT_REUSABLE_QUERYBUFFER (1ULL<<51) /* The client is using the reusable query buffer. */

 /* Any flag that does not let optimize FLUSH SYNC to run it in bg as blocking client ASYNC */
 #define CLIENT_AVOID_BLOCKING_ASYNC_FLUSH (CLIENT_DENY_BLOCKING|CLIENT_MULTI|CLIENT_LUA_DEBUG|CLIENT_LUA_DEBUG_SYNC|CLIENT_MODULE)

+/* Client flags for client IO */
+#define CLIENT_IO_READ_ENABLED (1ULL<<0) /* Client can read from socket. */
+#define CLIENT_IO_WRITE_ENABLED (1ULL<<1) /* Client can write to socket. */
+#define CLIENT_IO_PENDING_COMMAND (1ULL<<2) /* Similar to CLIENT_PENDING_COMMAND. */
+#define CLIENT_IO_REUSABLE_QUERYBUFFER (1ULL<<3) /* The client is using the reusable query buffer. */
+#define CLIENT_IO_CLOSE_ASAP (1ULL<<4) /* Close this client ASAP in IO thread. */
+
+/* Definitions for client read errors. These error codes are used to indicate
+ * various issues that can occur while reading or parsing data from a client. */
+#define CLIENT_READ_TOO_BIG_INLINE_REQUEST 1
+#define CLIENT_READ_UNBALANCED_QUOTES 2
+#define CLIENT_READ_MASTER_USING_INLINE_PROTOCOL 3
+#define CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING 4
+#define CLIENT_READ_TOO_BIG_BULK_COUNT_STRING 5
+#define CLIENT_READ_EXPECTED_DOLLAR 6
+#define CLIENT_READ_INVALID_BULK_LENGTH 7
+#define CLIENT_READ_UNAUTH_BULK_LENGTH 8
+#define CLIENT_READ_INVALID_MULTIBULK_LENGTH 9
+#define CLIENT_READ_UNAUTH_MBULK_COUNT 10
+#define CLIENT_READ_CONN_DISCONNECTED 11
+#define CLIENT_READ_CONN_CLOSED 12
+#define CLIENT_READ_REACHED_MAX_QUERYBUF 13
+
 /* Client block type (btype field in client structure)
  * if CLIENT_BLOCKED flag is set. */
 typedef enum blocking_type {
@@ -578,6 +609,12 @@ typedef enum {
 #define SHUTDOWN_NOW 4 /* Don't wait for replicas to catch up. */
 #define SHUTDOWN_FORCE 8 /* Don't let errors prevent shutdown. */

+/* IO thread pause status */
+#define IO_THREAD_UNPAUSED 0
+#define IO_THREAD_PAUSING 1
+#define IO_THREAD_PAUSED 2
+#define IO_THREAD_RESUMING 3
+
 /* Command call flags, see call() function */
 #define CMD_CALL_NONE 0
 #define CMD_CALL_PROPAGATE_AOF (1<<0)
@@ -1159,6 +1196,10 @@ typedef struct client {
     uint64_t id; /* Client incremental unique ID. */
     uint64_t flags; /* Client flags: CLIENT_* macros. */
     connection *conn;
+    uint8_t tid; /* ID of the IO thread this client is bound to. */
+    uint8_t running_tid; /* ID of the IO thread this client is currently running on. */
+    uint8_t io_flags; /* Accessed by both main and IO threads, but not modified concurrently */
+    uint8_t read_error; /* Client read error: CLIENT_READ_* macros. */
     int resp; /* RESP protocol version. Can be 2 or 3. */
     redisDb *db; /* Pointer to currently SELECTed DB. */
     robj *name; /* As set by CLIENT SETNAME. */
@@ -1226,8 +1267,8 @@ typedef struct client {
     sds peerid; /* Cached peer ID. */
     sds sockname; /* Cached connection target address.
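The CLIENT_IO_* flags above fit in the client's uint8_t io_flags field, so checking and toggling them is plain bit arithmetic; for instance, reads are served only while the read-enabled bit is set. A tiny illustration with hypothetical flag names:

```
#include <stdint.h>
#include <stdio.h>

#define IO_READ_ENABLED  (1u << 0)
#define IO_WRITE_ENABLED (1u << 1)

int main(void) {
    uint8_t io_flags = 0;
    io_flags |= IO_READ_ENABLED | IO_WRITE_ENABLED; /* enable both directions */
    io_flags &= ~IO_READ_ENABLED;                   /* e.g. fetch back: stop reads */
    printf("read %s, write %s\n",
           (io_flags & IO_READ_ENABLED)  ? "on" : "off",
           (io_flags & IO_WRITE_ENABLED) ? "on" : "off");
    return 0;
}
```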
*/ listNode *client_list_node; /* list node in client list */ + listNode *io_thread_client_list_node; /* list node in io thread client list */ listNode *postponed_list_node; /* list node within the postponed list */ - listNode *pending_read_list_node; /* list node in clients pending read list */ void *module_blocked_client; /* Pointer to the RedisModuleBlockedClient associated with this * client. This is set in case of module authentication before the * unblocked client is reprocessed to handle reply callbacks. */ @@ -1280,6 +1321,20 @@ typedef struct client { #endif } client; +typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) { + uint8_t id; /* The unique ID assigned, if IO_THREADS_MAX_NUM is more + * than 256, we should also promote the data type. */ + pthread_t tid; /* Pthread ID */ + redisAtomic int paused; /* Paused status for the io thread. */ + aeEventLoop *el; /* Main event loop of io thread. */ + list *pending_clients; /* List of clients with pending writes. */ + list *processing_clients; /* List of clients being processed. */ + eventNotifier *pending_clients_notifier; /* Used to wake up the loop when write should be performed. */ + pthread_mutex_t pending_clients_mutex; /* Mutex for pending write list */ + list *pending_clients_to_main_thread; /* Clients that are waiting to be executed by the main thread. */ + list *clients; /* IO thread managed clients. */ +} IOThread; + /* ACL information */ typedef struct aclInfo { long long user_auth_failures; /* Auth failure counts on user level */ @@ -1568,6 +1623,7 @@ struct redisServer { int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */ unsigned int lruclock; /* Clock for LRU eviction */ volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */ + volatile sig_atomic_t crashing; /* Server is crashing report. */ mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */ int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ @@ -1638,6 +1694,7 @@ struct redisServer { redisAtomic uint64_t next_client_id; /* Next client unique ID. Incremental. */ int protected_mode; /* Don't accept external connections. */ int io_threads_num; /* Number of IO threads to use. */ + int io_threads_clients_num[IO_THREADS_MAX_NUM]; /* Number of clients assigned to each IO thread. */ int io_threads_do_reads; /* Read and parse from IO threads? */ int io_threads_active; /* Is IO threads currently active? */ long long events_processed_while_blocked; /* processEventsWhileBlocked() */ @@ -1710,8 +1767,8 @@ struct redisServer { long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to master, etc.) error replies */ long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. 
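Each IOThread above pairs a mutex-protected pending_clients list with an eventNotifier so the main thread can hand clients over and wake the sleeping event loop. The same shape in miniature, with a plain pipe standing in for the notifier (a sketch, not the eventnotifier.h API):

```
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int wakeup_pipe[2];
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue[16], q_len;

static void *io_thread_loop(void *arg) {
    (void)arg;
    struct pollfd pfd = { .fd = wakeup_pipe[0], .events = POLLIN };
    poll(&pfd, 1, -1);                        /* sleep until notified */
    char b;
    (void)read(wakeup_pipe[0], &b, 1);        /* drain the notification */
    pthread_mutex_lock(&q_lock);              /* then drain pending clients */
    for (int i = 0; i < q_len; i++)
        printf("io thread: processing client %d\n", queue[i]);
    q_len = 0;
    pthread_mutex_unlock(&q_lock);
    return NULL;
}

int main(void) {
    if (pipe(wakeup_pipe) != 0) return 1;
    pthread_t t;
    pthread_create(&t, NULL, io_thread_loop, NULL);

    pthread_mutex_lock(&q_lock);              /* enqueue under the mutex */
    queue[q_len++] = 42;
    pthread_mutex_unlock(&q_lock);
    (void)write(wakeup_pipe[1], "x", 1);      /* wake the io thread's loop */

    pthread_join(t, NULL);
    return 0;
}
```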
- long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */
-    long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */
+    redisAtomic long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */
+    redisAtomic long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */
     redisAtomic long long stat_total_reads_processed; /* Total number of read events processed */
     redisAtomic long long stat_total_writes_processed; /* Total number of write events processed */
     redisAtomic long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */
@@ -2461,11 +2518,6 @@ typedef struct {
 #define OBJ_HASH_KEY 1
 #define OBJ_HASH_VALUE 2

-#define IO_THREADS_OP_IDLE 0
-#define IO_THREADS_OP_READ 1
-#define IO_THREADS_OP_WRITE 2
-extern int io_threads_op;
-
 /* Hash-field data type (of t_hash.c) */
 typedef mstr hfield;
 extern mstrKind mstrFieldKind;
@@ -2680,9 +2732,6 @@ void whileBlockedCron(void);
 void blockingOperationStarts(void);
 void blockingOperationEnds(void);
 int handleClientsWithPendingWrites(void);
-int handleClientsWithPendingWritesUsingThreads(void);
-int handleClientsWithPendingReadsUsingThreads(void);
-int stopThreadedIOIfNeeded(void);
 int clientHasPendingReplies(client *c);
 int updateClientMemUsageAndBucket(client *c);
 void removeClientFromMemUsageBucket(client *c, int allow_eviction);
@@ -2691,13 +2740,32 @@ int writeToClient(client *c, int handler_installed);
 void linkClient(client *c);
 void protectClient(client *c);
 void unprotectClient(client *c);
-void initThreadedIO(void);
 client *lookupClientByID(uint64_t id);
 int authRequired(client *c);
 void putClientInPendingWriteQueue(client *c);

 /* reply macros */
 #define ADD_REPLY_BULK_CBUFFER_STRING_CONSTANT(c, str) addReplyBulkCBuffer(c, str, strlen(str))

+/* iothread.c - the threaded I/O implementation */
+void initThreadedIO(void);
+void killIOThreads(void);
+void pauseIOThread(int id);
+void resumeIOThread(int id);
+void pauseAllIOThreads(void);
+void resumeAllIOThreads(void);
+void pauseIOThreadsRange(int start, int end);
+void resumeIOThreadsRange(int start, int end);
+int resizeAllIOThreadsEventLoops(size_t newsize);
+int sendPendingClientsToIOThreads(void);
+void enqueuePendingClientsToMainThread(client *c, int unbind);
+void putInPendingClientsForIOThreads(client *c);
+void handleClientReadError(client *c);
+void unbindClientFromIOThreadEventLoop(client *c);
+void processClientsOfAllIOThreads(void);
+void assignClientToIOThread(client *c);
+void fetchClientFromIOThread(client *c);
+int isClientMustHandledByMainThread(client *c);
+
 /* logreqres.c - logging of requests and responses */
 void reqresReset(client *c, int free_buf);
 void reqresSaveClientReplyOffset(client *c);
@@ -3901,7 +3969,6 @@ void xorDigest(unsigned char *digest, const void *ptr, size_t len);
 sds catSubCommandFullname(const char *parent_name, const char *sub_name);
 void commandAddSubcommand(struct redisCommand *parent, struct redisCommand *subcommand, const char *declared_name);
 void debugDelay(int usec);
-void killIOThreads(void);
 void killThreads(void);
 void makeThreadKillable(void);
 void swapMainDbWithTempDb(redisDb *tempDb);
diff --git a/src/socket.c b/src/socket.c
index 33c28588a..fd6335251 100644
--- a/src/socket.c
+++ b/src/socket.c
@@ -53,11 +53,12 @@ static ConnectionType CT_Socket;
  * be embedded in different structs, not just client.
*/ -static connection *connCreateSocket(void) { +static connection *connCreateSocket(struct aeEventLoop *el) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Socket; conn->fd = -1; conn->iovcnt = IOV_MAX; + conn->el = el; return conn; } @@ -72,9 +73,9 @@ static connection *connCreateSocket(void) { * is not in an error state (which is not possible for a socket connection, * but could but possible with other protocols). */ -static connection *connCreateAcceptedSocket(int fd, void *priv) { +static connection *connCreateAcceptedSocket(struct aeEventLoop *el, int fd, void *priv) { UNUSED(priv); - connection *conn = connCreateSocket(); + connection *conn = connCreateSocket(el); conn->fd = fd; conn->state = CONN_STATE_ACCEPTING; return conn; @@ -93,7 +94,7 @@ static int connSocketConnect(connection *conn, const char *addr, int port, const conn->state = CONN_STATE_CONNECTING; conn->conn_handler = connect_handler; - aeCreateFileEvent(server.el, conn->fd, AE_WRITABLE, + aeCreateFileEvent(conn->el, conn->fd, AE_WRITABLE, conn->type->ae_handler, conn); return C_OK; @@ -114,7 +115,7 @@ static void connSocketShutdown(connection *conn) { /* Close the connection and free resources. */ static void connSocketClose(connection *conn) { if (conn->fd != -1) { - aeDeleteFileEvent(server.el,conn->fd, AE_READABLE | AE_WRITABLE); + if (conn->el) aeDeleteFileEvent(conn->el, conn->fd, AE_READABLE | AE_WRITABLE); close(conn->fd); conn->fd = -1; } @@ -190,6 +191,15 @@ static int connSocketAccept(connection *conn, ConnectionCallbackFunc accept_hand return ret; } +/* Rebind the connection to another event loop, read/write handlers must not + * be installed in the current event loop, otherwise it will cause two event + * loops to manage the same connection at the same time. */ +static int connSocketRebindEventLoop(connection *conn, aeEventLoop *el) { + serverAssert(!conn->el && !conn->read_handler && !conn->write_handler); + conn->el = el; + return C_OK; +} + /* Register a write handler, to be called when the connection is writable. * If NULL, the existing handler is removed. 
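connSocketRebindEventLoop() above makes a hand-off between event loops nothing more than a pointer swap, which is safe only while no read/write handlers are installed in the old loop, hence the assertion. A toy model of that contract (hypothetical types, not the connection API):

```
#include <assert.h>
#include <stdio.h>

typedef struct event_loop { const char *name; } event_loop;

typedef struct conn {
    event_loop *el;
    void (*read_handler)(struct conn *);
    void (*write_handler)(struct conn *);
} conn;

/* Mirrors the shape of the rebind above: refuse to move a connection that
 * is still wired into a loop. */
static int rebind_event_loop(conn *c, event_loop *el) {
    assert(!c->el && !c->read_handler && !c->write_handler);
    c->el = el;
    return 0;
}

int main(void) {
    event_loop io_el = { "io-thread-1" };
    conn c = { NULL, NULL, NULL };    /* already unbound, handlers removed */
    rebind_event_loop(&c, &io_el);
    printf("connection now served by %s\n", c.el->name);
    return 0;
}
```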
* @@ -207,9 +217,9 @@ static int connSocketSetWriteHandler(connection *conn, ConnectionCallbackFunc fu else conn->flags &= ~CONN_FLAG_WRITE_BARRIER; if (!conn->write_handler) - aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + aeDeleteFileEvent(conn->el,conn->fd,AE_WRITABLE); else - if (aeCreateFileEvent(server.el,conn->fd,AE_WRITABLE, + if (aeCreateFileEvent(conn->el,conn->fd,AE_WRITABLE, conn->type->ae_handler,conn) == AE_ERR) return C_ERR; return C_OK; } @@ -222,9 +232,9 @@ static int connSocketSetReadHandler(connection *conn, ConnectionCallbackFunc fun conn->read_handler = func; if (!conn->read_handler) - aeDeleteFileEvent(server.el,conn->fd,AE_READABLE); + aeDeleteFileEvent(conn->el,conn->fd,AE_READABLE); else - if (aeCreateFileEvent(server.el,conn->fd, + if (aeCreateFileEvent(conn->el,conn->fd, AE_READABLE,conn->type->ae_handler,conn) == AE_ERR) return C_ERR; return C_OK; } @@ -250,7 +260,7 @@ static void connSocketEventHandler(struct aeEventLoop *el, int fd, void *clientD conn->state = CONN_STATE_CONNECTED; } - if (!conn->write_handler) aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + if (!conn->write_handler) aeDeleteFileEvent(conn->el, conn->fd, AE_WRITABLE); if (!callHandler(conn, conn->conn_handler)) return; conn->conn_handler = NULL; @@ -291,7 +301,6 @@ static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int int cport, cfd; int max = server.max_new_conns_per_cycle; char cip[NET_IP_STR_LEN]; - UNUSED(el); UNUSED(mask); UNUSED(privdata); @@ -304,7 +313,7 @@ static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int return; } serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(connCreateAcceptedSocket(cfd, NULL),0,cip); + acceptCommonHandler(connCreateAcceptedSocket(el,cfd,NULL), 0, cip); } } @@ -397,6 +406,10 @@ static ConnectionType CT_Socket = { .blocking_connect = connSocketBlockingConnect, .accept = connSocketAccept, + /* event loop */ + .unbind_event_loop = NULL, + .rebind_event_loop = connSocketRebindEventLoop, + /* IO */ .write = connSocketWrite, .writev = connSocketWritev, diff --git a/src/tls.c b/src/tls.c index 3cc504ad1..3c7b5c0a0 100644 --- a/src/tls.c +++ b/src/tls.c @@ -75,10 +75,6 @@ static int parseProtocolsConfig(const char *str) { return protocols; } -/* list of connections with pending data already read from the socket, but not - * served to the reader yet. */ -static list *pending_list = NULL; - /** * OpenSSL global initialization and locking handling callbacks. * Note that this is only required for OpenSSL < 1.1.0. 
@@ -144,8 +140,6 @@ static void tlsInit(void) { if (!RAND_poll()) { serverLog(LL_WARNING, "OpenSSL: Failed to seed random number generator."); } - - pending_list = listCreate(); } static void tlsCleanup(void) { @@ -435,20 +429,21 @@ typedef struct tls_connection { listNode *pending_list_node; } tls_connection; -static connection *createTLSConnection(int client_side) { +static connection *createTLSConnection(struct aeEventLoop *el, int client_side) { SSL_CTX *ctx = redis_tls_ctx; if (client_side && redis_tls_client_ctx) ctx = redis_tls_client_ctx; tls_connection *conn = zcalloc(sizeof(tls_connection)); conn->c.type = &CT_TLS; conn->c.fd = -1; + conn->c.el = el; conn->c.iovcnt = IOV_MAX; conn->ssl = SSL_new(ctx); return (connection *) conn; } -static connection *connCreateTLS(void) { - return createTLSConnection(1); +static connection *connCreateTLS(struct aeEventLoop *el) { + return createTLSConnection(el, 1); } /* Fetch the latest OpenSSL error and store it in the connection */ @@ -468,10 +463,11 @@ static void updateTLSError(tls_connection *conn) { * Callers should use connGetState() and verify the created connection * is not in an error state. */ -static connection *connCreateAcceptedTLS(int fd, void *priv) { +static connection *connCreateAcceptedTLS(struct aeEventLoop *el, int fd, void *priv) { int require_auth = *(int *)priv; - tls_connection *conn = (tls_connection *) createTLSConnection(0); + tls_connection *conn = (tls_connection *) createTLSConnection(el, 0); conn->c.fd = fd; + conn->c.el = el; conn->c.state = CONN_STATE_ACCEPTING; if (!conn->ssl) { @@ -575,17 +571,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update } static void registerSSLEvent(tls_connection *conn, WantIOType want) { - int mask = aeGetFileEvents(server.el, conn->c.fd); + int mask = aeGetFileEvents(conn->c.el, conn->c.fd); switch (want) { case WANT_READ: - if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); - if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, + if (mask & AE_WRITABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE); + if (!(mask & AE_READABLE)) aeCreateFileEvent(conn->c.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); break; case WANT_WRITE: - if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); - if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, + if (mask & AE_READABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE); + if (!(mask & AE_WRITABLE)) aeCreateFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); break; default: @@ -595,19 +591,42 @@ static void registerSSLEvent(tls_connection *conn, WantIOType want) { } static void updateSSLEvent(tls_connection *conn) { - int mask = aeGetFileEvents(server.el, conn->c.fd); + serverAssert(conn->c.el); + int mask = aeGetFileEvents(conn->c.el, conn->c.fd); int need_read = conn->c.read_handler || (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ); int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE); if (need_read && !(mask & AE_READABLE)) - aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); + aeCreateFileEvent(conn->c.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); if (!need_read && (mask & AE_READABLE)) - aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); + aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE); if (need_write && !(mask & AE_WRITABLE)) - aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, 
tlsEventHandler, conn); + aeCreateFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); if (!need_write && (mask & AE_WRITABLE)) - aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); + aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE); +} + +/* Add a connection to the list of connections with pending data that has + * already been read from the socket but has not yet been served to the reader. */ +static void tlsPendingAdd(tls_connection *conn) { + if (!conn->c.el->privdata[1]) + conn->c.el->privdata[1] = listCreate(); + + list *pending_list = conn->c.el->privdata[1]; + if (!conn->pending_list_node) { + listAddNodeTail(pending_list, conn); + conn->pending_list_node = listLast(pending_list); + } +} + +/* Removes a connection from the list of connections with pending data. */ +static void tlsPendingRemove(tls_connection *conn) { + if (conn->pending_list_node) { + list *pending_list = conn->c.el->privdata[1]; + listDelNode(pending_list, conn->pending_list_node); + conn->pending_list_node = NULL; + } } static void tlsHandleEvent(tls_connection *conn, int mask) { @@ -718,13 +737,9 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { * to a list of pending connection that should be handled anyway. */ if ((mask & AE_READABLE)) { if (SSL_pending(conn->ssl) > 0) { - if (!conn->pending_list_node) { - listAddNodeTail(pending_list, conn); - conn->pending_list_node = listLast(pending_list); - } + tlsPendingAdd(conn); } else if (conn->pending_list_node) { - listDelNode(pending_list, conn->pending_list_node); - conn->pending_list_node = NULL; + tlsPendingRemove(conn); } } @@ -734,7 +749,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { break; } - updateSSLEvent(conn); + /* The event loop may have been unbound during the event processing above. */ + if (conn->c.el) updateSSLEvent(conn); } static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask) { @@ -748,7 +764,6 @@ static void tlsAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) int cport, cfd; int max = server.max_new_tls_conns_per_cycle; char cip[NET_IP_STR_LEN]; - UNUSED(el); UNUSED(mask); UNUSED(privdata); @@ -761,7 +776,7 @@ static void tlsAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) return; } serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(connCreateAcceptedTLS(cfd, &server.tls_auth_clients),0,cip); + acceptCommonHandler(connCreateAcceptedTLS(el,cfd,&server.tls_auth_clients), 0, cip); } } @@ -806,6 +821,7 @@ static void connTLSClose(connection *conn_) { } if (conn->pending_list_node) { + list *pending_list = conn->c.el->privdata[1]; listDelNode(pending_list, conn->pending_list_node); conn->pending_list_node = NULL; } @@ -863,6 +879,33 @@ static int connTLSConnect(connection *conn_, const char *addr, int port, const c return C_OK; } +static void connTLSUnbindEventLoop(connection *conn_) { + tls_connection *conn = (tls_connection *) conn_; + + /* We need to remove all events from the old event loop. The subsequent + * updateSSLEvent() will add the appropriate events to the new event loop. */ + if (conn->c.el) { + int mask = aeGetFileEvents(conn->c.el, conn->c.fd); + if (mask & AE_READABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE); + if (mask & AE_WRITABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE); + + /* Check if there are pending events and handle accordingly. 
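With tlsPendingAdd()/tlsPendingRemove(), the formerly global pending_list becomes per-event-loop state parked in el->privdata[1] and allocated lazily on first use. The lookup-or-create idiom in isolation (a sketch; the list is reduced to a counter and the types are stand-ins):

```
#include <stdio.h>
#include <stdlib.h>

typedef struct { void *privdata[2]; } fake_event_loop;
typedef struct { int len; } fake_list;

static fake_list *loop_pending_list(fake_event_loop *el) {
    if (!el->privdata[1])                 /* lazily create on first access */
        el->privdata[1] = calloc(1, sizeof(fake_list));
    return el->privdata[1];
}

int main(void) {
    fake_event_loop main_el = {{0}}, io_el = {{0}};
    loop_pending_list(&main_el)->len++;   /* each loop tracks its own backlog */
    printf("main=%d io=%d\n",
           loop_pending_list(&main_el)->len, loop_pending_list(&io_el)->len);
    free(main_el.privdata[1]);
    free(io_el.privdata[1]);
    return 0;
}
```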
*/ + int has_pending = conn->pending_list_node != NULL; + if (has_pending) tlsPendingRemove(conn); + } +} + +static int connTLSRebindEventLoop(connection *conn_, aeEventLoop *el) { + tls_connection *conn = (tls_connection *) conn_; + serverAssert(!conn->c.el && !conn->c.read_handler && + !conn->c.write_handler && !conn->pending_list_node); + conn->c.el = el; + if (el && SSL_pending(conn->ssl)) tlsPendingAdd(conn); + /* Add the appropriate events to the new event loop. */ + updateSSLEvent((tls_connection *) conn); + return C_OK; +} + static int connTLSWrite(connection *conn_, const void *data, size_t data_len) { tls_connection *conn = (tls_connection *) conn_; int ret; @@ -1044,16 +1087,19 @@ static const char *connTLSGetType(connection *conn_) { return CONN_TYPE_TLS; } -static int tlsHasPendingData(void) { +static int tlsHasPendingData(struct aeEventLoop *el) { + list *pending_list = el->privdata[1]; if (!pending_list) return 0; return listLength(pending_list) > 0; } -static int tlsProcessPendingData(void) { +static int tlsProcessPendingData(struct aeEventLoop *el) { listIter li; listNode *ln; + list *pending_list = el->privdata[1]; + if (!pending_list) return 0; int processed = listLength(pending_list); listRewind(pending_list,&li); while((ln = listNext(&li))) { @@ -1114,6 +1160,10 @@ static ConnectionType CT_TLS = { .blocking_connect = connTLSBlockingConnect, .accept = connTLSAccept, + /* event loop */ + .unbind_event_loop = connTLSUnbindEventLoop, + .rebind_event_loop = connTLSRebindEventLoop, + /* IO */ .read = connTLSRead, .write = connTLSWrite, diff --git a/src/tracking.c b/src/tracking.c index 8ff14369d..5eec3e1d1 100644 --- a/src/tracking.c +++ b/src/tracking.c @@ -253,6 +253,7 @@ void trackingRememberKeys(client *tracking, client *executing) { * - Following a flush command, to send a single RESP NULL to indicate * that all keys are now invalid. */ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) { + int paused = 0; uint64_t old_flags = c->flags; c->flags |= CLIENT_PUSHING; @@ -275,6 +276,11 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) { if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING; c = redir; using_redirection = 1; + /* Start to touch another client data. */ + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + pauseIOThread(c->running_tid); + paused = 1; + } old_flags = c->flags; c->flags |= CLIENT_PUSHING; } @@ -296,7 +302,7 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) { * it since RESP2 does not support push messages in the same * connection. */ if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING; - return; + goto done; } /* Send the "value" part, which is the array of keys. 
*/ @@ -308,6 +314,17 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) { } updateClientMemUsageAndBucket(c); if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING; + +done: + if (paused) { + if (clientHasPendingReplies(c)) { + serverAssert(!(c->flags & CLIENT_PENDING_WRITE)); + /* Actually we install write handler of client which is in IO thread + * event loop, it is safe since the io thread is paused */ + connSetWriteHandler(c->conn, sendReplyToClient); + } + resumeIOThread(c->running_tid); + } } /* This function is called when a key is modified in Redis and in the case diff --git a/src/unix.c b/src/unix.c index eb5850765..b61cb6d49 100644 --- a/src/unix.c +++ b/src/unix.c @@ -74,18 +74,19 @@ static int connUnixListen(connListener *listener) { return C_OK; } -static connection *connCreateUnix(void) { +static connection *connCreateUnix(struct aeEventLoop *el) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Unix; conn->fd = -1; conn->iovcnt = IOV_MAX; + conn->el = el; return conn; } -static connection *connCreateAcceptedUnix(int fd, void *priv) { +static connection *connCreateAcceptedUnix(struct aeEventLoop *el, int fd, void *priv) { UNUSED(priv); - connection *conn = connCreateUnix(); + connection *conn = connCreateUnix(el); conn->fd = fd; conn->state = CONN_STATE_ACCEPTING; return conn; @@ -107,7 +108,7 @@ static void connUnixAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m return; } serverLog(LL_VERBOSE,"Accepted connection to %s", server.unixsocket); - acceptCommonHandler(connCreateAcceptedUnix(cfd, NULL),CLIENT_UNIX_SOCKET,NULL); + acceptCommonHandler(connCreateAcceptedUnix(el, cfd, NULL),CLIENT_UNIX_SOCKET,NULL); } } @@ -123,6 +124,10 @@ static int connUnixAccept(connection *conn, ConnectionCallbackFunc accept_handle return connectionTypeTcp()->accept(conn, accept_handler); } +static int connUnixRebindEventLoop(connection *conn, aeEventLoop *el) { + return connectionTypeTcp()->rebind_event_loop(conn, el); +} + static int connUnixWrite(connection *conn, const void *data, size_t data_len) { return connectionTypeTcp()->write(conn, data, data_len); } @@ -186,6 +191,10 @@ static ConnectionType CT_Unix = { .blocking_connect = NULL, .accept = connUnixAccept, + /* event loop */ + .unbind_event_loop = NULL, + .rebind_event_loop = connUnixRebindEventLoop, + /* IO */ .write = connUnixWrite, .writev = connUnixWritev, diff --git a/tests/integration/shutdown.tcl b/tests/integration/shutdown.tcl index b2ec32cbd..4169d64b7 100644 --- a/tests/integration/shutdown.tcl +++ b/tests/integration/shutdown.tcl @@ -156,6 +156,11 @@ test "Shutting down master waits for replica then fails" { set rd2 [redis_deferring_client -1] $rd1 shutdown $rd2 shutdown + wait_for_condition 100 10 { + [llength [regexp -all -inline {cmd=shutdown} [$master client list]]] eq 2 + } else { + fail "shutdown did not arrive" + } set info_clients [$master info clients] assert_match "*connected_clients:3*" $info_clients assert_match "*blocked_clients:2*" $info_clients @@ -209,6 +214,11 @@ test "Shutting down master waits for replica then aborted" { set rd2 [redis_deferring_client -1] $rd1 shutdown $rd2 shutdown + wait_for_condition 100 10 { + [llength [regexp -all -inline {cmd=shutdown} [$master client list]]] eq 2 + } else { + fail "shutdown did not arrive" + } set info_clients [$master info clients] assert_match "*connected_clients:3*" $info_clients assert_match "*blocked_clients:2*" $info_clients diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 
f374c3dc9..c240a286c 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -698,6 +698,16 @@ proc latencyrstat_percentiles {cmd r} { } } +proc get_io_thread_clients {id {client r}} { + set pattern "io_thread_$id:clients=(\[0-9\]+)" + set info [$client info threads] + if {[regexp $pattern $info _ value]} { + return $value + } else { + return -1 + } +} + proc generate_fuzzy_traffic_on_key {key type duration} { # Commands per type, blocking commands removed # TODO: extract these from COMMAND DOCS, and improve to include other types diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl index 7e8270aa8..3caaf9bd4 100644 --- a/tests/unit/client-eviction.tcl +++ b/tests/unit/client-eviction.tcl @@ -108,7 +108,11 @@ start_server {} { $rr write [join [list "*1\r\n\$$maxmemory_clients_actual\r\n" [string repeat v $maxmemory_clients_actual]] ""] $rr flush } e - assert {![client_exists $cname]} + wait_for_condition 100 10 { + ![client_exists $cname] + } else { + fail "Failed to evict client" + } $rr close # Restore settings @@ -360,6 +364,13 @@ start_server {} { resume_process $server_pid r ping ;# make sure a full event loop cycle is processed before issuing CLIENT LIST + # wait for get commands to be processed + wait_for_condition 100 10 { + [expr {[regexp {calls=(\d+)} [cmdrstat get r] -> calls] ? $calls : 0}] >= 2 + } else { + fail "get did not arrive" + } + # Validate obuf-clients were disconnected (because of obuf limit) catch {client_field obuf-client1 name} e assert_match {no client named obuf-client1 found*} $e @@ -367,7 +378,9 @@ start_server {} { assert_match {no client named obuf-client2 found*} $e # Validate qbuf-client is still connected and wasn't evicted - assert_equal [client_field qbuf-client name] {qbuf-client} + if {[lindex [r config get io-threads] 1] == 1} { + assert_equal [client_field qbuf-client name] {qbuf-client} + } $rr1 close $rr2 close @@ -404,8 +417,11 @@ start_server {} { # Decrease maxmemory_clients and expect client eviction r config set maxmemory-clients [expr $maxmemory_clients / 2] - set connected_clients [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]] - assert {$connected_clients > 0 && $connected_clients < $client_count} + wait_for_condition 200 10 { + [llength [regexp -all -inline {name=client} [r client list]]] < $client_count + } else { + fail "Failed to evict clients" + } foreach rr $rrs {$rr close} } @@ -463,8 +479,11 @@ start_server {} { assert {$total_client_mem <= $maxmemory_clients} # Make sure we have only half of our clients now - set connected_clients [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]] - assert {$connected_clients == [expr $client_count / 2]} + wait_for_condition 200 100 { + [llength [regexp -all -inline {name=client} [r client list]]] == $client_count / 2 + } else { + fail "Failed to evict clients" + } # Restore the reply buffer resize to default r debug replybuffer resizing 1 @@ -519,7 +538,8 @@ start_server {} { foreach size [lreverse $sizes] { set control_mem [client_field control tot-mem] set total_mem [expr $total_mem - $clients_per_size * $size] - r config set maxmemory-clients [expr $total_mem + $control_mem] + # allow some tolerance when using io threads + r config set maxmemory-clients [expr $total_mem + $control_mem + 1000] set clients [split [string trim [r client list]] "\r\n"] # Verify only relevant clients were evicted for {set i 0} {$i < [llength $sizes]} {incr i} { diff --git a/tests/unit/info.tcl 
b/tests/unit/info.tcl index 6e2d381f5..fc66fb510 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -313,7 +313,7 @@ start_server {tags {"info" "external:skip"}} { assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" } assert_morethan $el_sum2 $el_sum1 - assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance + assert_lessthan $el_sum2 [expr $el_sum1+100000] ;# we expect roughly 100ms here, but allow some tolerance if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" } assert_morethan $cmd_sum2 $cmd_sum1 assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index fbd1d14fe..2ba1a8c96 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -6,8 +6,13 @@ start_server {tags {"introspection"}} { } test {CLIENT LIST} { - r client list - } {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*} + set client_list [r client list] + if {[lindex [r config get io-threads] 1] == 1} { + assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*} $client_list + } else { + assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*} $client_list + } + } test {CLIENT LIST with IDs} { set myid [r client id] @@ -16,8 +21,13 @@ start_server {tags {"introspection"}} { } test {CLIENT INFO} { - r client info - } {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*} + set client [r client info] + if {[lindex [r config get io-threads] 1] == 1} { + assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*} $client + } else { + assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*} $client + } + } test {CLIENT KILL with illegal arguments} { assert_error "ERR wrong number of arguments for 'client|kill' command" {r client kill} @@ -86,6 +96,11 @@ start_server {tags {"introspection"}} { assert {$connected_clients >= 3} set res [r client kill skipme yes] assert {$res == $connected_clients - 1} + wait_for_condition 1000 10 { + [s connected_clients] eq 1 + } else { + fail "Can't kill all clients except the current one" + } # Kill all clients, including `me` set rd3 [redis_deferring_client] @@ -304,6 +319,9 @@ start_server {tags {"introspection"}} { $rd read ; # Discard the OK 
$bc blpop mylist 0 + # make sure the blpop arrives first + $bc flush + after 100 wait_for_blocked_clients_count 1 r lpush mylist 1 wait_for_blocked_clients_count 0 @@ -904,3 +922,62 @@ test {CONFIG REWRITE handles alias config properly} { assert_equal [r config get hash-max-listpack-entries] {hash-max-listpack-entries 100} } } {} {external:skip} + +test {IO threads client number} { + start_server {overrides {io-threads 2} tags {external:skip}} { + set iothread_clients [get_io_thread_clients 1] + assert_equal $iothread_clients [s connected_clients] + assert_equal [get_io_thread_clients 0] 0 + + r script debug yes ; # Transfer to main thread + assert_equal [get_io_thread_clients 0] 1 + assert_equal [get_io_thread_clients 1] [expr $iothread_clients - 1] + + set iothread_clients [get_io_thread_clients 1] + set rd1 [redis_deferring_client] + set rd2 [redis_deferring_client] + assert_equal [get_io_thread_clients 1] [expr $iothread_clients + 2] + $rd1 close + $rd2 close + wait_for_condition 1000 10 { + [get_io_thread_clients 1] eq $iothread_clients + } else { + fail "Fail to close clients of io thread 1" + } + assert_equal [get_io_thread_clients 0] 1 + + r script debug no ; # Transfer to io thread + assert_equal [get_io_thread_clients 0] 0 + assert_equal [get_io_thread_clients 1] [expr $iothread_clients + 1] + } +} + +test {Clients are evenly distributed among io threads} { + start_server {overrides {io-threads 4} tags {external:skip}} { + set cur_clients [s connected_clients] + assert_equal $cur_clients 1 + global rdclients + for {set i 1} {$i < 9} {incr i} { + set rdclients($i) [redis_deferring_client] + } + for {set i 1} {$i <= 3} {incr i} { + assert_equal [get_io_thread_clients $i] 3 + } + + $rdclients(3) close + $rdclients(4) close + wait_for_condition 1000 10 { + [get_io_thread_clients 1] eq 2 && + [get_io_thread_clients 2] eq 2 && + [get_io_thread_clients 3] eq 3 + } else { + fail "Fail to close clients" + } + + set $rdclients(3) [redis_deferring_client] + set $rdclients(4) [redis_deferring_client] + for {set i 1} {$i <= 3} {incr i} { + assert_equal [get_io_thread_clients $i] 3 + } + } +} diff --git a/tests/unit/lazyfree.tcl b/tests/unit/lazyfree.tcl index b4ade4031..cb3a4b014 100644 --- a/tests/unit/lazyfree.tcl +++ b/tests/unit/lazyfree.tcl @@ -10,6 +10,7 @@ start_server {tags {"lazyfree"}} { set peak_mem [s used_memory] assert {[r unlink myset] == 1} assert {$peak_mem > $orig_mem+1000000} + reconnect ;# free the memory of reused argv of client wait_for_condition 50 100 { [s used_memory] < $peak_mem && [s used_memory] < $orig_mem*2 @@ -32,6 +33,7 @@ start_server {tags {"lazyfree"}} { set peak_mem [s used_memory] r flushdb async assert {$peak_mem > $orig_mem+1000000} + reconnect ;# free the memory of reused argv of client wait_for_condition 50 100 { [s used_memory] < $peak_mem && [s used_memory] < $orig_mem*2 diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 363dab472..966ac4487 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -29,7 +29,11 @@ start_server {tags {"maxmemory" "external:skip"}} { set dbsize [r dbsize] if $client_eviction { - return [expr $evicted_clients > 0 && $evicted_keys == 0 && $dbsize == 50] + if {[lindex [r config get io-threads] 1] == 1} { + return [expr $evicted_clients > 0 && $evicted_keys == 0 && $dbsize == 50] + } else { + return [expr $evicted_clients >= 0 && $evicted_keys >= 0 && $dbsize <= 50] + } } else { return [expr $evicted_clients == 0 && $evicted_keys > 0 && $dbsize < 50] } diff --git 
a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 130289aff..92c1f572c 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -420,7 +420,10 @@ run_solo {defrag} { $rd_pubsub read ; # Discard subscribe replies $rd_pubsub ssubscribe $channel_name $rd_pubsub read ; # Discard ssubscribe replies - $rd set k$j $channel_name + # Pub/Sub clients are handled in the main thread, so their memory is + # allocated there. Using the SETBIT command avoids the main thread + # referencing argv from IO threads. + $rd setbit k$j [expr {[string length $channel_name] * 8}] 1 $rd read ; # Discard set replies } @@ -583,6 +586,123 @@ run_solo {defrag} { } } + test "Active defrag for argv retained by the main thread from IO thread: $type" { + r flushdb + r config set hz 100 + r config set activedefrag no + wait_for_defrag_stop 500 100 + r config resetstat + set io_threads [lindex [r config get io-threads] 1] + if {$io_threads == 1} { + r config set active-defrag-threshold-lower 5 + } else { + r config set active-defrag-threshold-lower 10 + } + r config set active-defrag-cycle-min 65 + r config set active-defrag-cycle-max 75 + r config set active-defrag-ignore-bytes 1000kb + r config set maxmemory 0 + + # Create some clients so that they are distributed among different io threads. + set clients {} + for {set i 0} {$i < 8} {incr i} { + lappend clients [redis_client] + } + + # Populate memory with interleaving key pattern of same size + set dummy "[string repeat x 400]" + set n 10000 + for {set i 0} {$i < [llength $clients]} {incr i} { + set rr [lindex $clients $i] + for {set j 0} {$j < $n} {incr j} { + $rr set "k$i-$j" $dummy + } + } + + # If io-threads is enabled, verify that memory allocation is not from the main thread. + if {$io_threads != 1} { + # At least make sure that bin 448 is created in the main thread's arena. + r set k dummy + r del k + + # We created 10000 string keys of 400 bytes each for each client, so when the memory + # allocation for the 448 bin in the main thread is significantly smaller than this, + # we can conclude that the memory allocation is not coming from it. + set malloc_stats [r memory malloc-stats] + if {[regexp {(?s)arenas\[0\]:.*?448[ ]+[\d]+[ ]+([\d]+)[ ]} $malloc_stats - allocated]} { + # Ensure the allocation for bin 448 in the main thread’s arena + # is far less than 4375k (10000 * 448 bytes). + assert_lessthan $allocated 200000 + } else { + fail "Failed to get the main thread's malloc stats." + } + } + + after 120 ;# serverCron only updates the info once in 100ms + if {$::verbose} { + puts "used [s allocator_allocated]" + puts "rss [s allocator_active]" + puts "frag [s allocator_frag_ratio]" + puts "frag_bytes [s allocator_frag_bytes]" + } + assert_lessthan [s allocator_frag_ratio] 1.05 + + # Delete keys with even indices to create fragmentation.
+ for {set i 0} {$i < [llength $clients]} {incr i} { + set rd [lindex $clients $i] + for {set j 0} {$j < $n} {incr j 2} { + $rd del "k$i-$j" + } + } + for {set i 0} {$i < [llength $clients]} {incr i} { + [lindex $clients $i] close + } + + after 120 ;# serverCron only updates the info once in 100ms + if {$::verbose} { + puts "used [s allocator_allocated]" + puts "rss [s allocator_active]" + puts "frag [s allocator_frag_ratio]" + puts "frag_bytes [s allocator_frag_bytes]" + } + assert_morethan [s allocator_frag_ratio] 1.4 + + catch {r config set activedefrag yes} e + if {[r config get activedefrag] eq "activedefrag yes"} { + + # wait for the active defrag to start working (decision once a second) + wait_for_condition 50 100 { + [s total_active_defrag_time] ne 0 + } else { + after 120 ;# serverCron only updates the info once in 100ms + puts [r info memory] + puts [r info stats] + puts [r memory malloc-stats] + fail "defrag not started." + } + + # wait for the active defrag to stop working + wait_for_defrag_stop 500 100 + + # test the fragmentation is lower + after 120 ;# serverCron only updates the info once in 100ms + if {$::verbose} { + puts "used [s allocator_allocated]" + puts "rss [s allocator_active]" + puts "frag [s allocator_frag_ratio]" + puts "frag_bytes [s allocator_frag_bytes]" + } + + if {$io_threads == 1} { + assert_lessthan_equal [s allocator_frag_ratio] 1.05 + } else { + # TODO: When multithreading is enabled, argv may be created in the io thread + # and kept in the main thread, which can cause fragmentation to become worse. + assert_lessthan_equal [s allocator_frag_ratio] 1.1 + } + } + } + if {$type eq "standalone"} { ;# skip in cluster mode test "Active defrag big list: $type" { r flushdb diff --git a/tests/unit/moduleapi/blockedclient.tcl b/tests/unit/moduleapi/blockedclient.tcl index 22b2c4bae..28cc76fe8 100644 --- a/tests/unit/moduleapi/blockedclient.tcl +++ b/tests/unit/moduleapi/blockedclient.tcl @@ -130,7 +130,12 @@ foreach call_type {nested normal} { $rd flush # make sure we get BUSY error, and that we didn't get it too early - assert_error {*BUSY Slow module operation*} {r ping} + wait_for_condition 50 100 { + ([catch {r ping} reply] == 1) && + ([string match {*BUSY Slow module operation*} $reply]) + } else { + fail "Failed waiting for busy slow response" + } assert_morethan_equal [expr [clock clicks -milliseconds]-$start] $busy_time_limit # abort the blocking operation diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 1defb5158..9a4f1196b 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -85,6 +85,11 @@ start_server {tags {"pubsub network"}} { set rd1 [redis_deferring_client] assert_equal {1 2 3} [subscribe $rd1 {chan1 chan2 chan3}] unsubscribe $rd1 + wait_for_condition 100 10 { + [regexp {cmd=unsubscribe} [r client list]] eq 1 + } else { + fail "unsubscribe did not arrive" + } assert_equal 0 [r publish chan1 hello] assert_equal 0 [r publish chan2 hello] assert_equal 0 [r publish chan3 hello] @@ -158,6 +163,11 @@ start_server {tags {"pubsub network"}} { set rd1 [redis_deferring_client] assert_equal {1 2 3} [psubscribe $rd1 {chan1.* chan2.* chan3.*}] punsubscribe $rd1 + wait_for_condition 100 10 { + [regexp {cmd=punsubscribe} [r client list]] eq 1 + } else { + fail "punsubscribe did not arrive" + } assert_equal 0 [r publish chan1.hi hello] assert_equal 0 [r publish chan2.hi hello] assert_equal 0 [r publish chan3.hi hello] diff --git a/tests/unit/pubsubshard.tcl b/tests/unit/pubsubshard.tcl index 6e3fb61c1..a3c841d36 100644 --- 
a/tests/unit/pubsubshard.tcl +++ b/tests/unit/pubsubshard.tcl @@ -46,6 +46,11 @@ start_server {tags {"pubsubshard external:skip"}} { assert_equal {2} [ssubscribe $rd1 {chan2}] assert_equal {3} [ssubscribe $rd1 {chan3}] sunsubscribe $rd1 + wait_for_condition 100 10 { + [regexp {cmd=sunsubscribe} [r client list]] eq 1 + } else { + fail "sunsubscribe did not arrive" + } assert_equal 0 [r SPUBLISH chan1 hello] assert_equal 0 [r SPUBLISH chan2 hello] assert_equal 0 [r SPUBLISH chan3 hello] diff --git a/tests/unit/querybuf.tcl b/tests/unit/querybuf.tcl index d05911156..9dcf986e8 100644 --- a/tests/unit/querybuf.tcl +++ b/tests/unit/querybuf.tcl @@ -166,7 +166,12 @@ start_server {tags {"querybuf"}} { # The client executing the command is currently using the reusable query buffer, # so the size shown is that of the reusable query buffer. It will be returned # to the reusable query buffer after command execution. - assert_match {*qbuf=26 qbuf-free=* cmd=client|list *} $res + # Note that if IO threads are enabled, the reusable query buffer will be dereferenced earlier. + if {[lindex [r config get io-threads] 1] == 1} { + assert_match {*qbuf=26 qbuf-free=* cmd=client|list *} $res + } else { + assert_match {*qbuf=0 qbuf-free=* cmd=client|list *} $res + } $rd close } diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl index 9f46a8beb..fad95b970 100644 --- a/tests/unit/type/list.tcl +++ b/tests/unit/type/list.tcl @@ -1100,6 +1100,11 @@ foreach {pop} {BLPOP BLMPOP_LEFT} { $watching_client get somekey{t} $watching_client read $watching_client exec + wait_for_condition 100 10 { + [regexp {cmd=exec} [r client list]] eq 1 + } else { + fail "exec did not arrive" + } # Blocked BLPOPLPUSH may create problems, unblock it. r lpush srclist{t} element set res [$watching_client read] From 7665bdc91aa6f98289eefdcbdb2def4467864b7a Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Wed, 25 Dec 2024 16:03:22 +0800 Subject: [PATCH 29/42] Offload `lookupCommand` into IO threads when threaded IO is enabled (#13696) From a flame graph, we could see that `lookupCommand` in the main thread costs much CPU, so we can let IO threads perform `lookupCommand`. To avoid race conditions among multiple IO threads, we made the following changes: - Pause all IO threads when registering or unregistering commands - Force a full rehashing of the command table dict when resizing --- src/dict.c | 6 ++++++ src/dict.h | 4 ++++ src/iothread.c | 4 ++-- src/module.c | 11 ++++++++++- src/networking.c | 4 +++- src/server.c | 6 ++++-- src/server.h | 1 + 7 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/dict.c b/src/dict.c index 3bfcc7017..bf183422a 100644 --- a/src/dict.c +++ b/src/dict.c @@ -277,6 +277,12 @@ int _dictResize(dict *d, unsigned long size, int* malloc_failed) return DICT_OK; } + /* Force a full rehashing of the dictionary */ + if (d->type->force_full_rehash) { + while (dictRehash(d, 1000)) { + /* Continue rehashing */ + } + } return DICT_OK; } diff --git a/src/dict.h b/src/dict.h index 12a5c9918..fc4554ae2 100644 --- a/src/dict.h +++ b/src/dict.h @@ -62,6 +62,10 @@ typedef struct dictType { unsigned int keys_are_odd:1; /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that * flag is set. */ + + /* Ensures that the entire hash table is rehashed at once if set. */ + unsigned int force_full_rehash:1; + /* Sometimes we want the ability to store a key in a given way inside the hash * function, and lookup it in some other way without resorting to any kind of * conversion.
For instance the key may be stored as a structure also diff --git a/src/iothread.c b/src/iothread.c index 2e5c98a28..e3da683d5 100644 --- a/src/iothread.c +++ b/src/iothread.c @@ -195,7 +195,7 @@ static int PausedIOThreads[IO_THREADS_MAX_NUM] = {0}; /* Pause the specific range of io threads, and wait for them to be paused. */ void pauseIOThreadsRange(int start, int end) { - if (server.io_threads_num <= 1) return; + if (!server.io_threads_active) return; serverAssert(start >= 1 && end < server.io_threads_num && start <= end); serverAssert(pthread_equal(pthread_self(), server.main_thread_id)); @@ -227,7 +227,7 @@ void pauseIOThreadsRange(int start, int end) { /* Resume the specific range of io threads, and wait for them to be resumed. */ void resumeIOThreadsRange(int start, int end) { - if (server.io_threads_num <= 1) return; + if (!server.io_threads_active) return; serverAssert(start >= 1 && end < server.io_threads_num && start <= end); serverAssert(pthread_equal(pthread_self(), server.main_thread_id)); diff --git a/src/module.c b/src/module.c index f12d03b47..f662ebb30 100644 --- a/src/module.c +++ b/src/module.c @@ -664,7 +664,7 @@ void moduleReleaseTempClient(client *c) { c->bufpos = 0; c->flags = CLIENT_MODULE; c->user = NULL; /* Root user */ - c->cmd = c->lastcmd = c->realcmd = NULL; + c->cmd = c->lastcmd = c->realcmd = c->iolookedcmd = NULL; if (c->bstate.async_rm_call_handle) { RedisModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; promise->c = NULL; /* Remove the client from the promise so it will no longer be possible to abort it. */ @@ -1276,8 +1276,11 @@ int RM_CreateCommand(RedisModuleCtx *ctx, const char *name, RedisModuleCmdFunc c RedisModuleCommand *cp = moduleCreateCommandProxy(ctx->module, declared_name, sdsdup(declared_name), cmdfunc, flags, firstkey, lastkey, keystep); cp->rediscmd->arity = cmdfunc ? -1 : -2; /* Default value, can be changed later via dedicated API */ + pauseAllIOThreads(); serverAssert(dictAdd(server.commands, sdsdup(declared_name), cp->rediscmd) == DICT_OK); serverAssert(dictAdd(server.orig_commands, sdsdup(declared_name), cp->rediscmd) == DICT_OK); + resumeAllIOThreads(); + cp->rediscmd->id = ACLGetCommandID(declared_name); /* ID used for ACL. */ return REDISMODULE_OK; } @@ -10905,6 +10908,10 @@ void moduleCallCommandFilters(client *c) { f->callback(&filter); } + /* If the filter sets a new command, including command or subcommand, + * the command looked up in IO threads will be invalid. */ + c->iolookedcmd = NULL; + c->argv = filter.argv; c->argv_len = filter.argv_len; c->argc = filter.argc; @@ -12321,6 +12328,7 @@ int moduleFreeCommand(struct RedisModule *module, struct redisCommand *cmd) { } void moduleUnregisterCommands(struct RedisModule *module) { + pauseAllIOThreads(); /* Unregister all the commands registered by this module. */ dictIterator *di = dictGetSafeIterator(server.commands); dictEntry *de; @@ -12335,6 +12343,7 @@ void moduleUnregisterCommands(struct RedisModule *module) { zfree(cmd); } dictReleaseIterator(di); + resumeAllIOThreads(); } /* We parse argv to add sds "NAME VALUE" pairs to the server.module_configs_queue list of configs. 
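The module.c hunks above rely on a simple synchronization contract: IO threads may look up commands concurrently without locks, while any mutation of the shared command table first parks every IO thread. Below is a minimal self-contained sketch of that invariant; it uses a pthread rwlock as a stand-in for the pause/resume handshake, purely as an illustration, not how iothread.c actually implements the pause:

```
#include <pthread.h>
#include <stdio.h>

static int command_count = 0; /* stand-in for the server.commands dict */
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

/* IO threads: concurrent read-only lookups. */
static void *io_thread_lookup(void *arg) {
    (void)arg;
    pthread_rwlock_rdlock(&table_lock);
    printf("lookup sees %d commands\n", command_count);
    pthread_rwlock_unlock(&table_lock);
    return NULL;
}

/* Main thread: mutating the table first excludes all IO threads,
 * mirroring the pauseAllIOThreads()/resumeAllIOThreads() calls above. */
static void register_command(void) {
    pthread_rwlock_wrlock(&table_lock); /* "pause": no lookups in flight */
    command_count++;                    /* safe to mutate the table */
    pthread_rwlock_unlock(&table_lock); /* "resume" */
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, io_thread_lookup, NULL);
    register_command();
    pthread_join(t, NULL);
    return 0;
}
```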
diff --git a/src/networking.c b/src/networking.c index 8fb37af08..e3d0d25e4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -159,7 +159,7 @@ client *createClient(connection *conn) { c->argv_len_sum = 0; c->original_argc = 0; c->original_argv = NULL; - c->cmd = c->lastcmd = c->realcmd = NULL; + c->cmd = c->lastcmd = c->realcmd = c->iolookedcmd = NULL; c->cur_script = NULL; c->multibulklen = 0; c->bulklen = -1; @@ -1456,6 +1456,7 @@ static inline void freeClientArgvInternal(client *c, int free_argv) { decrRefCount(c->argv[j]); c->argc = 0; c->cmd = NULL; + c->iolookedcmd = NULL; c->argv_len_sum = 0; if (free_argv) { c->argv_len = 0; @@ -2777,6 +2778,7 @@ int processInputBuffer(client *c) { * as one that needs to process the command. */ if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { c->io_flags |= CLIENT_IO_PENDING_COMMAND; + c->iolookedcmd = lookupCommand(c->argv, c->argc); enqueuePendingClientsToMainThread(c, 0); break; } diff --git a/src/server.c b/src/server.c index 0b4c95ce8..a9b2b637c 100644 --- a/src/server.c +++ b/src/server.c @@ -521,7 +521,8 @@ dictType commandTableDictType = { dictSdsKeyCaseCompare, /* key compare */ dictSdsDestructor, /* key destructor */ NULL, /* val destructor */ - NULL /* allow to expand */ + NULL, /* allow to expand */ + .force_full_rehash = 1, /* force full rehashing */ }; /* Hash type hash table (note that small hashes are represented with listpacks) */ @@ -3988,7 +3989,8 @@ int processCommand(client *c) { * In case we are reprocessing a command after it was blocked, * we do not have to repeat the same checks */ if (!client_reprocessing_command) { - c->cmd = c->lastcmd = c->realcmd = lookupCommand(c->argv,c->argc); + c->cmd = c->lastcmd = c->realcmd = c->iolookedcmd ? c->iolookedcmd : + lookupCommand(c->argv,c->argc); sds err; if (!commandCheckExistence(c, &err)) { rejectCommandSds(c, err); diff --git a/src/server.h b/src/server.h index bc965999e..3fe562184 100644 --- a/src/server.h +++ b/src/server.h @@ -1215,6 +1215,7 @@ typedef struct client { robj **original_argv; /* Arguments of original command if arguments were rewritten. */ size_t argv_len_sum; /* Sum of lengths of objects in argv list. */ struct redisCommand *cmd, *lastcmd; /* Last command executed. */ + struct redisCommand *iolookedcmd; /* Command looked up in IO threads. */ struct redisCommand *realcmd; /* The original command that was executed by the client, Used to update error stats in case the c->cmd was modified during the command invocation (like on GEOADD for example). */ From 8144019a13434717e22c44657064b2ea91863372 Mon Sep 17 00:00:00 2001 From: Thalia Archibald Date: Wed, 25 Dec 2024 20:37:23 -0800 Subject: [PATCH 30/42] Check length before reading in `stringmatchlen` (#13690) Fixes four cases where `stringmatchlen` could overrun the pattern if it is not terminated with NUL. These commits are cherry-picked from my [fork](https://github.com/thaliaarchi/antirez-stringmatch) which extracts `stringmatch` as a library and compares it to other projects by antirez which uses the same matcher. 
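To make the fixed failure mode concrete, here is a reduced, self-contained sketch (a stand-in, not the actual `stringmatchlen` code) of the rule these fixes enforce: since the pattern buffer is not guaranteed to be NUL-terminated, the remaining length must be checked before every read.

```
#include <stdio.h>

/* Skip a run of '*' in a pattern that may lack a trailing NUL.
 * The length check must come before the dereference. */
static int skip_stars(const char *pattern, int patternLen) {
    int skipped = 0;
    while (patternLen && *pattern == '*') { /* check length first */
        pattern++;
        patternLen--;
        skipped++;
    }
    return skipped;
}

int main(void) {
    char buf[3] = {'*', '*', '*'}; /* deliberately not NUL-terminated */
    printf("skipped %d\n", skip_stars(buf, 3)); /* prints: skipped 3 */
    return 0;
}
```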
--- src/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/util.c b/src/util.c index 43793f15b..f48bd1287 100644 --- a/src/util.c +++ b/src/util.c @@ -109,24 +109,24 @@ static int stringmatchlen_impl(const char *pattern, int patternLen, pattern++; patternLen--; - not = pattern[0] == '^'; + not = patternLen && pattern[0] == '^'; if (not) { pattern++; patternLen--; } match = 0; while(1) { - if (pattern[0] == '\\' && patternLen >= 2) { + if (patternLen >= 2 && pattern[0] == '\\') { pattern++; patternLen--; if (pattern[0] == string[0]) match = 1; - } else if (pattern[0] == ']') { - break; } else if (patternLen == 0) { pattern--; patternLen++; break; + } else if (pattern[0] == ']') { + break; } else if (patternLen >= 3 && pattern[1] == '-') { int start = pattern[0]; int end = pattern[2]; @@ -186,7 +186,7 @@ static int stringmatchlen_impl(const char *pattern, int patternLen, pattern++; patternLen--; if (stringLen == 0) { - while(*pattern == '*') { + while(patternLen && *pattern == '*') { pattern++; patternLen--; } From dc57ee03b1c5b8f646718e362f3a809a7511ad36 Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Thu, 26 Dec 2024 12:51:44 +0800 Subject: [PATCH 31/42] Do security attack check only when command not found to reduce the critical path (#13702) This PR is based on the commits from PR https://github.com/valkey-io/valkey/pull/1212. When we explored the cycle distribution of the main thread with io-threads enabled, we found that this security attack check takes significant time: **~3%** of the main thread's cycles were spent on the command security check. This patch tries to avoid doing it in the hot path entirely: we can do it only after we have looked up the command and it wasn't found, just before we call commandCheckExistence. --------- Co-authored-by: Lipeng Zhu Co-authored-by: Wangyang Guo --- src/server.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/server.c b/src/server.c index a9b2b637c..e9ab2a74b 100644 --- a/src/server.c +++ b/src/server.c @@ -3969,12 +3969,6 @@ int processCommand(client *c) { reqresAppendRequest(c); } - /* Handle possible security attacks. */ - if (!strcasecmp(c->argv[0]->ptr,"host:") || !strcasecmp(c->argv[0]->ptr,"post")) { - securityWarningCommand(c); - return C_ERR; - } - /* If we're inside a module blocked context yielding that wants to avoid * processing clients, postpone the command. */ if (server.busy_module_yield_flags != BUSY_MODULE_YIELD_NONE && @@ -3989,8 +3983,15 @@ int processCommand(client *c) { * In case we are reprocessing a command after it was blocked, * we do not have to repeat the same checks */ if (!client_reprocessing_command) { - c->cmd = c->lastcmd = c->realcmd = c->iolookedcmd ? c->iolookedcmd : - lookupCommand(c->argv,c->argc); + struct redisCommand *cmd = c->iolookedcmd ? c->iolookedcmd : lookupCommand(c->argv, c->argc); + if (!cmd) { + /* Handle possible security attacks.
*/ + if (!strcasecmp(c->argv[0]->ptr,"host:") || !strcasecmp(c->argv[0]->ptr,"post")) { + securityWarningCommand(c); + return C_ERR; + } + } + c->cmd = c->lastcmd = c->realcmd = cmd; sds err; if (!commandCheckExistence(c, &err)) { rejectCommandSds(c, err); From 04f63d4af74cb5aa0d1e12e05fa8f7f92cb2ef94 Mon Sep 17 00:00:00 2001 From: raffertyyu Date: Tue, 31 Dec 2024 21:41:10 +0800 Subject: [PATCH 32/42] Fix index error of CRLF when replying with integer-encoded strings (#13711) close #13709 Fix the index error of CRLF character for integer-encoded strings in addReplyBulk function --------- Co-authored-by: debing.sun --- src/networking.c | 4 +-- tests/unit/protocol.tcl | 62 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index e3d0d25e4..589b3ed24 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1080,8 +1080,8 @@ void addReplyBulk(client *c, robj *obj) { * to the output buffer. */ char buf[34]; size_t len = ll2string(buf,sizeof(buf),(long)obj->ptr); - buf[len+1] = '\r'; - buf[len+2] = '\n'; + buf[len] = '\r'; + buf[len+1] = '\n'; _addReplyLongLongBulk(c, len); _addReplyToBufferOrList(c,buf,len+2); } else { diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index e3b4115a8..a98b124b7 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -135,6 +135,68 @@ start_server {tags {"protocol network"}} { assert_equal [r read] {a} } + test "bulk reply protocol" { + # value=2 (int encoding) + r set crlf 2 + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 7] "\$1\r\n2\r\n" + r object encoding crlf + assert_equal [r rawread 9] "\$3\r\nint\r\n" + + # value=2147483647 (int encoding) + r set crlf 2147483647 + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 17] "\$10\r\n2147483647\r\n" + r object encoding crlf + assert_equal [r rawread 9] "\$3\r\nint\r\n" + + # value=-2147483648 (int encoding) + r set crlf -2147483648 + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 18] "\$11\r\n-2147483648\r\n" + r object encoding crlf + assert_equal [r rawread 9] "\$3\r\nint\r\n" + + # value=-9223372036854775809 (embstr encoding) + r set crlf -9223372036854775809 + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 27] "\$20\r\n-9223372036854775809\r\n" + r object encoding crlf + assert_equal [r rawread 12] "\$6\r\nembstr\r\n" + + # value=9223372036854775808 (embstr encoding) + r set crlf 9223372036854775808 + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 26] "\$19\r\n9223372036854775808\r\n" + r object encoding crlf + assert_equal [r rawread 12] "\$6\r\nembstr\r\n" + + # normal sds (embstr encoding) + r set crlf aaaaaaaaaaaaaaaa + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 23] "\$16\r\naaaaaaaaaaaaaaaa\r\n" + r object encoding crlf + assert_equal [r rawread 12] "\$6\r\nembstr\r\n" + + # normal sds (raw string encoding) with 45 'a' + set rawstr [string repeat "a" 45] + r set crlf $rawstr + assert_equal [r rawread 5] "+OK\r\n" + r get crlf + assert_equal [r rawread 52] "\$45\r\n$rawstr\r\n" + r object encoding crlf + assert_equal [r rawread 9] "\$3\r\nraw\r\n" + + r del crlf + assert_equal [r rawread 4] ":1\r\n" + } + # restore connection settings r readraw 0 r deferred 0 From 8e9f5146dd15eb8b934d4fe6c561639c034f78e8 Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Mon, 6 Jan 2025 15:59:02 +0800 Subject: [PATCH 33/42] Add 
reads/writes metrics for IO threads (#13703) The main job of the IO threads is to read queries and write replies, so read/write metrics can reflect the workload of the IO threads; we now also report these metrics in detail for each IO thread via `io_threaded_reads/writes_processed`. Of course, to avoid breaking changes, the aggregate `io_threaded_reads/writes_processed` fields are still there. However, before the async IO threads commit these could also include IO done by the main thread while IO threads were active; now they only sum the IO done by IO threads. Now the threads section in the `info` command output is as follows: ``` # Threads io_thread_0:clients=0,reads=0,writes=0 io_thread_1:clients=54,reads=6546940,writes=6546919 io_thread_2:clients=54,reads=6513650,writes=6513625 io_thread_3:clients=54,reads=6396571,writes=6396525 io_thread_4:clients=53,reads=6511120,writes=6511097 io_thread_5:clients=53,reads=6539302,writes=6539280 io_thread_6:clients=53,reads=6502269,writes=6502248 ``` --- src/networking.c | 14 ++++-------- src/server.c | 58 ++++++++++++++++++++++++++++++++---------------- src/server.h | 6 ++--- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/networking.c b/src/networking.c index 589b3ed24..94303e1b9 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2060,11 +2060,8 @@ int _writeToClient(client *c, ssize_t *nwritten) { * thread safe. */ int writeToClient(client *c, int handler_installed) { if (!(c->io_flags & CLIENT_IO_WRITE_ENABLED)) return C_OK; - /* Update total number of writes on server */ - atomicIncr(server.stat_total_writes_processed, 1); - if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { - atomicIncr(server.stat_io_writes_processed, 1); - } + /* Update the number of writes of io threads on server */ + atomicIncr(server.stat_io_writes_processed[c->running_tid], 1); ssize_t nwritten = 0, totwritten = 0; @@ -2833,11 +2830,8 @@ void readQueryFromClient(connection *conn) { if (!(c->io_flags & CLIENT_IO_READ_ENABLED)) return; c->read_error = 0; - /* Update total number of reads on server */ - atomicIncr(server.stat_total_reads_processed, 1); - if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { - atomicIncr(server.stat_io_reads_processed, 1); - } + /* Update the number of reads of io threads on server */ + atomicIncr(server.stat_io_reads_processed[c->running_tid], 1); readlen = PROTO_IOBUF_LEN; /* If this is a multi bulk request, and we are processing a bulk reply diff --git a/src/server.c b/src/server.c index e9ab2a74b..7bc39c411 100644 --- a/src/server.c +++ b/src/server.c @@ -2616,10 +2616,10 @@ void resetServerStats(void) { server.stat_sync_full = 0; server.stat_sync_partial_ok = 0; server.stat_sync_partial_err = 0; - atomicSet(server.stat_io_reads_processed, 0); - atomicSet(server.stat_total_reads_processed, 0); - atomicSet(server.stat_io_writes_processed, 0); - atomicSet(server.stat_total_writes_processed, 0); + for (j = 0; j < IO_THREADS_MAX_NUM; j++) { + atomicSet(server.stat_io_reads_processed[j], 0); + atomicSet(server.stat_io_writes_processed[j], 0); + } atomicSet(server.stat_client_qbuf_limit_disconnections, 0); server.stat_client_outbuf_limit_disconnections = 0; for (j = 0; j < STATS_METRIC_COUNT; j++) { @@ -5912,9 +5912,29 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { } } + /* Threads */ + int stat_io_ops_processed_calculated = 0; + long long stat_io_reads_processed = 0, stat_io_writes_processed = 0; + long long stat_total_reads_processed = 0, stat_total_writes_processed = 0; + if (all_sections || (dictFind(section_dict,"threads") != NULL)) { + if
(sections++) info = sdscat(info,"\r\n"); + info = sdscatprintf(info, "# Threads\r\n"); + long long reads, writes; + for (j = 0; j < server.io_threads_num; j++) { + atomicGet(server.stat_io_reads_processed[j], reads); + atomicGet(server.stat_io_writes_processed[j], writes); + info = sdscatprintf(info, "io_thread_%d:clients=%d,reads=%lld,writes=%lld\r\n", + j, server.io_threads_clients_num[j], reads, writes); + stat_total_reads_processed += reads; + if (j != 0) stat_io_reads_processed += reads; /* Skip the main thread */ + stat_total_writes_processed += writes; + if (j != 0) stat_io_writes_processed += writes; /* Skip the main thread */ + } + stat_io_ops_processed_calculated = 1; + } + /* Stats */ if (all_sections || (dictFind(section_dict,"stats") != NULL)) { - long long stat_total_reads_processed, stat_total_writes_processed; long long stat_net_input_bytes, stat_net_output_bytes; long long stat_net_repl_input_bytes, stat_net_repl_output_bytes; long long current_eviction_exceeded_time = server.stat_last_eviction_exceeded_time ? @@ -5922,16 +5942,25 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { long long current_active_defrag_time = server.stat_last_active_defrag_time ? (long long) elapsedUs(server.stat_last_active_defrag_time): 0; long long stat_client_qbuf_limit_disconnections; - long long stat_io_reads_processed, stat_io_writes_processed; - atomicGet(server.stat_total_reads_processed, stat_total_reads_processed); - atomicGet(server.stat_total_writes_processed, stat_total_writes_processed); atomicGet(server.stat_net_input_bytes, stat_net_input_bytes); atomicGet(server.stat_net_output_bytes, stat_net_output_bytes); atomicGet(server.stat_net_repl_input_bytes, stat_net_repl_input_bytes); atomicGet(server.stat_net_repl_output_bytes, stat_net_repl_output_bytes); atomicGet(server.stat_client_qbuf_limit_disconnections, stat_client_qbuf_limit_disconnections); - atomicGet(server.stat_io_reads_processed, stat_io_reads_processed); - atomicGet(server.stat_io_writes_processed, stat_io_writes_processed); + + /* If we calculated the total reads and writes in the threads section, + * we don't need to do it again, and also keep the values consistent. 
*/ + if (!stat_io_ops_processed_calculated) { + long long reads, writes; + for (j = 0; j < server.io_threads_num; j++) { + atomicGet(server.stat_io_reads_processed[j], reads); + stat_total_reads_processed += reads; + if (j != 0) stat_io_reads_processed += reads; /* Skip the main thread */ + atomicGet(server.stat_io_writes_processed[j], writes); + stat_total_writes_processed += writes; + if (j != 0) stat_io_writes_processed += writes; /* Skip the main thread */ + } + } if (sections++) info = sdscat(info,"\r\n"); info = sdscatprintf(info, "# Stats\r\n" FMTARGS( @@ -6133,15 +6162,6 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { #endif /* RUSAGE_THREAD */ } - /* Threads */ - if (all_sections || (dictFind(section_dict,"threads") != NULL)) { - if (sections++) info = sdscat(info,"\r\n"); - info = sdscatprintf(info, "# Threads\r\n"); - for (j = 0; j < server.io_threads_num; j++) { - info = sdscatprintf(info, "io_thread_%d:clients=%d\r\n", j, server.io_threads_clients_num[j]); - } - } - /* Modules */ if (all_sections || (dictFind(section_dict,"module_list") != NULL) || (dictFind(section_dict,"modules") != NULL)) { if (sections++) info = sdscat(info,"\r\n"); diff --git a/src/server.h b/src/server.h index 3fe562184..49ab7b708 100644 --- a/src/server.h +++ b/src/server.h @@ -1768,10 +1768,8 @@ struct redisServer { long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to master, etc.) error replies */ long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */ - redisAtomic long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */ - redisAtomic long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */ - redisAtomic long long stat_total_reads_processed; /* Total number of read events processed */ - redisAtomic long long stat_total_writes_processed; /* Total number of write events processed */ + redisAtomic long long stat_io_reads_processed[IO_THREADS_MAX_NUM]; /* Number of read events processed by IO / Main threads */ + redisAtomic long long stat_io_writes_processed[IO_THREADS_MAX_NUM]; /* Number of write events processed by IO / Main threads */ redisAtomic long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ long long stat_client_outbuf_limit_disconnections; /* Total number of clients reached output buf length limit */ /* The following two are used to track instantaneous metrics, like From 4a12291765c215d8ab9263f533cb9b9c8135729e Mon Sep 17 00:00:00 2001 From: RQfreefly <53940557+RQfreefly@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:35:47 +0800 Subject: [PATCH 34/42] Fix typos in multiple Redis source files (#13716) --- src/atomicvar.h | 2 +- src/eval.c | 2 +- src/quicklist.c | 4 ++-- src/sds.c | 30 +++++++++++++++--------------- src/t_list.c | 2 +- src/t_stream.c | 6 +++--- src/t_string.c | 4 ++-- src/t_zset.c | 2 +- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/atomicvar.h b/src/atomicvar.h index b8529ba27..834e77880 100644 --- a/src/atomicvar.h +++ b/src/atomicvar.h @@ -32,7 +32,7 @@ * (if the flag was 0 -> set to 1, if it's already 1 -> do nothing, but the final result is that the flag is set), * and also it has a full barrier (__sync_lock_test_and_set has acquire barrier). 
* - * NOTE2: Unlike other atomic type, which aren't guaranteed to be lock free, c11 atmoic_flag does. + * NOTE2: Unlike other atomic type, which aren't guaranteed to be lock free, c11 atomic_flag does. * To check whether a type is lock free, atomic_is_lock_free() can be used. * It can be considered to limit the flag type to atomic_flag to improve performance. * diff --git a/src/eval.c b/src/eval.c index 8670a0a69..47fb43464 100644 --- a/src/eval.c +++ b/src/eval.c @@ -93,7 +93,7 @@ struct ldbState { * bodies in order to obtain the Lua function name, and in the implementation * of redis.sha1(). * - * 'digest' should point to a 41 bytes buffer: 40 for SHA1 converted into an + * 'digest' should point to a 41 bytes buffer: 40 for SHA1 converted into a * hexadecimal number, plus 1 byte for null term. */ void sha1hex(char *digest, char *script, size_t len) { SHA1_CTX ctx; diff --git a/src/quicklist.c b/src/quicklist.c index 6577525a7..81bc70646 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -2126,7 +2126,7 @@ int quicklistTest(int argc, char *argv[], int flags) { quicklistRelease(ql); } - TEST("Comprassion Plain node") { + TEST("Compression Plain node") { for (int f = 0; f < fill_count; f++) { size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; @@ -3301,7 +3301,7 @@ int quicklistTest(int argc, char *argv[], int flags) { } #if ULONG_MAX >= 0xffffffffffffffff - TEST("compress and decomress quicklist plain node large than UINT32_MAX") { + TEST("compress and decompress quicklist plain node larger than UINT32_MAX") { size_t sz = (1ull << 32); unsigned char *s = zmalloc(sz); randstring(s, sz); diff --git a/src/sds.c b/src/sds.c index 53bafffe5..6d9e63af5 100644 --- a/src/sds.c +++ b/src/sds.c @@ -1443,29 +1443,29 @@ int sdsTest(int argc, char **argv, int flags) { /* Test sdsresize - extend */ x = sdsnew("1234567890123456789012345678901234567890"); x = sdsResize(x, 200, 1); - test_cond("sdsrezie() expand len", sdslen(x) == 40); - test_cond("sdsrezie() expand strlen", strlen(x) == 40); - test_cond("sdsrezie() expand alloc", sdsalloc(x) == 200); + test_cond("sdsresize() expand len", sdslen(x) == 40); + test_cond("sdsresize() expand strlen", strlen(x) == 40); + test_cond("sdsresize() expand alloc", sdsalloc(x) == 200); /* Test sdsresize - trim free space */ x = sdsResize(x, 80, 1); - test_cond("sdsrezie() shrink len", sdslen(x) == 40); - test_cond("sdsrezie() shrink strlen", strlen(x) == 40); - test_cond("sdsrezie() shrink alloc", sdsalloc(x) == 80); + test_cond("sdsresize() shrink len", sdslen(x) == 40); + test_cond("sdsresize() shrink strlen", strlen(x) == 40); + test_cond("sdsresize() shrink alloc", sdsalloc(x) == 80); /* Test sdsresize - crop used space */ x = sdsResize(x, 30, 1); - test_cond("sdsrezie() crop len", sdslen(x) == 30); - test_cond("sdsrezie() crop strlen", strlen(x) == 30); - test_cond("sdsrezie() crop alloc", sdsalloc(x) == 30); + test_cond("sdsresize() crop len", sdslen(x) == 30); + test_cond("sdsresize() crop strlen", strlen(x) == 30); + test_cond("sdsresize() crop alloc", sdsalloc(x) == 30); /* Test sdsresize - extend to different class */ x = sdsResize(x, 400, 1); - test_cond("sdsrezie() expand len", sdslen(x) == 30); - test_cond("sdsrezie() expand strlen", strlen(x) == 30); - test_cond("sdsrezie() expand alloc", sdsalloc(x) == 400); + test_cond("sdsresize() expand len", sdslen(x) == 30); + test_cond("sdsresize() expand strlen", strlen(x) == 30); + test_cond("sdsresize() expand alloc", sdsalloc(x) == 400); /* Test sdsresize - 
shrink to different class */ x = sdsResize(x, 4, 1); - test_cond("sdsrezie() crop len", sdslen(x) == 4); - test_cond("sdsrezie() crop strlen", strlen(x) == 4); - test_cond("sdsrezie() crop alloc", sdsalloc(x) == 4); + test_cond("sdsresize() crop len", sdslen(x) == 4); + test_cond("sdsresize() crop strlen", strlen(x) == 4); + test_cond("sdsresize() crop alloc", sdsalloc(x) == 4); sdsfree(x); } return 0; diff --git a/src/t_list.c b/src/t_list.c index 51f82c4bf..8d990ded7 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -875,7 +875,7 @@ void lrangeCommand(client *c) { /* LTRIM */ void ltrimCommand(client *c) { robj *o; - long start, end, llen, ltrim, rtrim, llenNew;; + long start, end, llen, ltrim, rtrim, llenNew; if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != C_OK) || (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != C_OK)) return; diff --git a/src/t_stream.c b/src/t_stream.c index 6a36bb69d..8578551a8 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -241,7 +241,7 @@ robj *streamDup(robj *o) { /* This is a wrapper function for lpGet() to directly get an integer value * from the listpack (that may store numbers as a string), converting * the string if needed. - * The 'valid" argument is an optional output parameter to get an indication + * The 'valid' argument is an optional output parameter to get an indication * if the record was valid, when this parameter is NULL, the function will * fail with an assertion. */ static inline int64_t lpGetIntegerIfValid(unsigned char *ele, int *valid) { @@ -1742,7 +1742,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end /* Try to add a new NACK. Most of the time this will work and * will not require extra lookups. We'll fix the problem later - * if we find that there is already a entry for this ID. */ + * if we find that there is already an entry for this ID. */ streamNACK *nack = streamCreateNACK(consumer); int group_inserted = raxTryInsert(group->pel,buf,sizeof(buf),nack,NULL); @@ -1875,7 +1875,7 @@ robj *streamTypeLookupWriteOrCreate(client *c, robj *key, int no_create) { * that can be represented. If 'strict' is set to 1, "-" and "+" will be * treated as an invalid ID. * - * The ID form -* specifies a millisconds-only ID, leaving the sequence part + * The ID form -* specifies a milliseconds-only ID, leaving the sequence part * to be autogenerated. When a non-NULL 'seq_given' argument is provided, this * form is accepted and the argument is set to 0 unless the sequence part is * specified. diff --git a/src/t_string.c b/src/t_string.c index c96f5e89e..04fd2b6bd 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -21,7 +21,7 @@ static int checkStringLength(client *c, long long size, long long append) { return C_OK; /* 'uint64_t' cast is there just to prevent undefined behavior on overflow */ long long total = (uint64_t)size + append; - /* Test configured max-bulk-len represending a limit of the biggest string object, + /* Test configured max-bulk-len representing a limit of the biggest string object, * and also test for overflow. 
*/ if (total > server.proto_max_bulk_len || total < size || total < append) { addReplyError(c,"string exceeds maximum allowed size (proto-max-bulk-len)"); @@ -61,7 +61,7 @@ static int checkStringLength(client *c, long long size, long long append) { static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int unit, long long *milliseconds); void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire, int unit, robj *ok_reply, robj *abort_reply) { - long long milliseconds = 0; /* initialized to avoid any harmness warning */ + long long milliseconds = 0; /* initialized to avoid any harmless warning */ int found = 0; int setkey_flags = 0; diff --git a/src/t_zset.c b/src/t_zset.c index 7b014f0d0..f0172b1bb 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -728,7 +728,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { x = x->level[0].forward; } } else { - /* If offset is big, we caasn jump from the last zsl->level-1 node. */ + /* If offset is big, we can jump from the last zsl->level-1 node. */ rank_diff = edge_rank + 1 + n - last_highest_level_rank; x = zslGetElementByRankFromNode(last_highest_level_node, zsl->level - 1, rank_diff); } From 08d714d0e5d59f1c1ca37cff1724849220fb6daf Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Wed, 8 Jan 2025 09:57:23 +0800 Subject: [PATCH 35/42] Fix crash due to cron argv release (#13725) Introduced by https://github.com/redis/redis/issues/13521 If the client argv was released due to a timeout before sending the complete command, `argv_len` will be reset to 0. When argv is parsed again and resized, requesting a length of 0 may result in argv being NULL, leading to a crash. Also fix a bug where `argv_len` is not updated correctly in `replaceClientCommandVector()`. --------- Co-authored-by: ShooterIT Co-authored-by: meiravgri <109056284+meiravgri@users.noreply.github.com> --- src/networking.c | 3 ++- src/server.c | 5 +++-- tests/unit/protocol.tcl | 11 +++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/networking.c b/src/networking.c index 94303e1b9..fd9905e2f 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2505,6 +2505,7 @@ int processMultibulkBuffer(client *c) { } else { /* Check if we have space in argv, grow if needed */ if (c->argc >= c->argv_len) { + serverAssert(c->argv_len); /* Ensure argv is not freed while the client is in the middle of parsing a command. */ c->argv_len = min(c->argv_len < INT_MAX/2 ? c->argv_len*2 : INT_MAX, c->argc+c->multibulklen); c->argv = zrealloc(c->argv, sizeof(robj*)*c->argv_len); } @@ -3990,7 +3991,7 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { retainOriginalCommandVector(c); freeClientArgv(c); c->argv = argv; - c->argc = argc; + c->argc = c->argv_len = argc; c->argv_len_sum = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) diff --git a/src/server.c b/src/server.c index 7bc39c411..a4c8f1152 100644 --- a/src/server.c +++ b/src/server.c @@ -788,8 +788,9 @@ int clientsCronResizeQueryBuffer(client *c) { /* If the client has been idle for too long, free the client's arguments. */ int clientsCronFreeArgvIfIdle(client *c) { - /* If the arguments have already been freed or are still in use, exit ASAP. */ - if (!c->argv || c->argc) return 0; + /* If the client is in the middle of parsing a command, or if argv is in use * (e.g. parsed in the IO thread but not yet executed, or blocked), exit ASAP.
*/ + if (!c->argv || c->multibulklen || c->argc) return 0; time_t idletime = server.unixtime - c->lastinteraction; if (idletime > 2) { c->argv_len = 0; zfree(c->argv); c->argv = NULL; diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index a98b124b7..8a6aeed4a 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -310,3 +310,14 @@ start_server {tags {"regression"}} { $rd close } } + +start_server {tags {"regression"}} { + test "Regression for a crash with cron release of client arguments" { + r write "*3\r\n" + r flush + after 3000 ;# wait for c->argv to be released due to timeout + r write "\$3\r\nSET\r\n\$3\r\nkey\r\n\$1\r\n0\r\n" + r flush + r read + } {OK} +} From 21aee83abdbfe8878d8b870b9783ce52ec8fe0f2 Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Wed, 8 Jan 2025 16:12:52 +0800 Subject: [PATCH 36/42] Fix issue with argv not being shrunk (#13698) Found by @ShooterIT ## Describe If a client first sends a command with a very large number of parameters, such as 10,000 parameters, argv will be expanded to accommodate 10,000. If subsequent commands have fewer than 10,000 parameters, this argv will continue to be reused and will never be shrunk. ## Solution When determining whether it is necessary to rebuild argv, if the length of the previous argv has already exceeded 1024, we recreate argv regardless (allocating it progressively). ## Free argv in cron Add a new condition to determine whether argv needs to be resized in cron: when the number of parameters exceeds 128, we resize it regardless to avoid a single client consuming too much memory. It will now occupy a maximum of (128 * 8 bytes). --------- Co-authored-by: Yuan Wang --- src/networking.c | 8 ++++++-- src/server.c | 5 ++++- tests/unit/lazyfree.tcl | 2 -- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/networking.c b/src/networking.c index fd9905e2f..8331db267 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2428,8 +2428,12 @@ int processMultibulkBuffer(client *c) { c->multibulklen = ll; /* Setup argv array on client structure. - * Create new argv if space is insufficient or if we need to allocate it gradually. */ - if (unlikely(c->multibulklen > c->argv_len || c->multibulklen > 1024)) { + * Create new argv in the following cases: + * 1) When the requested size is greater than the current size. + * 2) When the requested size is less than the current size, because + * we always allocate argv gradually with a maximum size of 1024. + * Therefore, if argv_len exceeds this limit, we always reallocate. */ + if (unlikely(c->multibulklen > c->argv_len || c->argv_len > 1024)) { zfree(c->argv); c->argv_len = min(c->multibulklen, 1024); c->argv = zmalloc(sizeof(robj*)*c->argv_len); diff --git a/src/server.c b/src/server.c index a4c8f1152..7a422bb46 100644 --- a/src/server.c +++ b/src/server.c @@ -791,8 +791,11 @@ int clientsCronFreeArgvIfIdle(client *c) { /* If the client is in the middle of parsing a command, or if argv is in use * (e.g. parsed in the IO thread but not yet executed, or blocked), exit ASAP. */ + + /* Free argv if the client has been idle for more than 2 seconds or if argv + * size is too large.
*/ time_t idletime = server.unixtime - c->lastinteraction; - if (idletime > 2) { + if (idletime > 2 || c->argv_len > 128) { c->argv_len = 0; zfree(c->argv); c->argv = NULL; diff --git a/tests/unit/lazyfree.tcl b/tests/unit/lazyfree.tcl index cb3a4b014..b4ade4031 100644 --- a/tests/unit/lazyfree.tcl +++ b/tests/unit/lazyfree.tcl @@ -10,7 +10,6 @@ start_server {tags {"lazyfree"}} { set peak_mem [s used_memory] assert {[r unlink myset] == 1} assert {$peak_mem > $orig_mem+1000000} - reconnect ;# free the memory of reused argv of client wait_for_condition 50 100 { [s used_memory] < $peak_mem && [s used_memory] < $orig_mem*2 @@ -33,7 +32,6 @@ start_server {tags {"lazyfree"}} { set peak_mem [s used_memory] r flushdb async assert {$peak_mem > $orig_mem+1000000} - reconnect ;# free the memory of reused argv of client wait_for_condition 50 100 { [s used_memory] < $peak_mem && [s used_memory] < $orig_mem*2 From dc0ee51cb19403607991b060b9f1c71c71548404 Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" Date: Mon, 13 Jan 2025 07:40:36 +0000 Subject: [PATCH 37/42] Refactor Client Write Preparation and Handling (#13721) This update refactors prepareClientToWrite by introducing _prepareClientToWrite for inline checks within the networking.c file, and separates replica and non-replica handling for pending replies and writes (_clientHasPendingRepliesSlave/NonSlave and _writeToClientSlave/NonSlave). --------- Co-authored-by: debing.sun Co-authored-by: Yuan Wang --- src/networking.c | 249 +++++++++++++++++++++++++++-------------------- 1 file changed, 143 insertions(+), 106 deletions(-) diff --git a/src/networking.c b/src/networking.c index 8331db267..20452726a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -26,6 +26,10 @@ static void setProtocolError(const char *errstr, client *c); static void pauseClientsByClient(mstime_t end, int isPauseClientAll); char *getClientSockname(client *c); static inline int clientTypeIsSlave(client *c); +static inline int _clientHasPendingRepliesSlave(client *c); +static inline int _clientHasPendingRepliesNonSlave(client *c); +static inline int _writeToClientNonSlave(client *c, ssize_t *nwritten); +static inline int _writeToClientSlave(client *c, ssize_t *nwritten); int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ __thread sds thread_reusable_qb = NULL; __thread int thread_reusable_qb_used = 0; /* Avoid multiple clients using reusable query @@ -261,6 +265,40 @@ void putClientInPendingWriteQueue(client *c) { } } +static inline int _prepareClientToWrite(client *c) { + const uint64_t _flags = c->flags; + /* If it's the Lua client we always return ok without installing any + * handler since there is no socket at all. */ + if (unlikely(_flags & (CLIENT_SCRIPT|CLIENT_MODULE))) return C_OK; + + /* If CLIENT_CLOSE_ASAP flag is set, we need not write anything. */ + if (unlikely(_flags & CLIENT_CLOSE_ASAP)) return C_ERR; + + /* CLIENT REPLY OFF / SKIP handling: don't send replies. + * CLIENT_PUSHING handling: disables the reply silencing flags. */ + if (unlikely((_flags & (CLIENT_REPLY_OFF|CLIENT_REPLY_SKIP)) && + !(_flags & CLIENT_PUSHING))) return C_ERR; + + /* Masters don't receive replies, unless CLIENT_MASTER_FORCE_REPLY flag + * is set. */ + if (unlikely((_flags & CLIENT_MASTER) && + !(_flags & CLIENT_MASTER_FORCE_REPLY))) return C_ERR; + + if (unlikely(!c->conn)) return C_ERR; /* Fake client for AOF loading.
*/ + + /* Schedule the client to write the output buffers to the socket, unless + * it should already be setup to do so (it has already pending data). + * + * If the client runs in an IO thread, we should not put the client in the + * pending write queue. Instead, we will install the write handler to the + * corresponding IO thread’s event loop and let it handle the reply. */ + if (!clientHasPendingReplies(c) && likely(c->running_tid == IOTHREAD_MAIN_THREAD_ID)) + putClientInPendingWriteQueue(c); + + /* Authorize the caller to queue in the output buffer of this client. */ + return C_OK; +} + /* This function is called every time we are going to transmit new data * to the client. The behavior is the following: * @@ -284,32 +322,7 @@ void putClientInPendingWriteQueue(client *c) { * data to the clients output buffers. If the function returns C_ERR no * data should be appended to the output buffers. */ int prepareClientToWrite(client *c) { - /* If it's the Lua client we always return ok without installing any - * handler since there is no socket at all. */ - if (c->flags & (CLIENT_SCRIPT|CLIENT_MODULE)) return C_OK; - - /* If CLIENT_CLOSE_ASAP flag is set, we need not write anything. */ - if (c->flags & CLIENT_CLOSE_ASAP) return C_ERR; - - /* CLIENT REPLY OFF / SKIP handling: don't send replies. - * CLIENT_PUSHING handling: disables the reply silencing flags. */ - if ((c->flags & (CLIENT_REPLY_OFF|CLIENT_REPLY_SKIP)) && - !(c->flags & CLIENT_PUSHING)) return C_ERR; - - /* Masters don't receive replies, unless CLIENT_MASTER_FORCE_REPLY flag - * is set. */ - if ((c->flags & CLIENT_MASTER) && - !(c->flags & CLIENT_MASTER_FORCE_REPLY)) return C_ERR; - - if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ - - /* Schedule the client to write the output buffers to the socket, unless - * it should already be setup to do so (it has already pending data). */ - if (!clientHasPendingReplies(c) && likely(c->running_tid == IOTHREAD_MAIN_THREAD_ID)) - putClientInPendingWriteQueue(c); - - /* Authorize the caller to queue in the output buffer of this client. */ - return C_OK; + return _prepareClientToWrite(c); } /* ----------------------------------------------------------------------------- @@ -419,7 +432,7 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { /* Add the object 'obj' string representation to the client output buffer. */ void addReply(client *c, robj *obj) { - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; if (sdsEncodedObject(obj)) { _addReplyToBufferOrList(c,obj->ptr,sdslen(obj->ptr)); @@ -438,7 +451,7 @@ void addReply(client *c, robj *obj) { /* Add the SDS 's' string to the client output buffer, as a side effect * the SDS string is freed. */ void addReplySds(client *c, sds s) { - if (prepareClientToWrite(c) != C_OK) { + if (_prepareClientToWrite(c) != C_OK) { /* The caller expects the sds to be free'd. */ sdsfree(s); return; @@ -456,7 +469,7 @@ void addReplySds(client *c, sds s) { * _addReplyProtoToList() if we fail to extend the existing tail object * in the list of objects. */ void addReplyProto(client *c, const char *s, size_t len) { - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyToBufferOrList(c,s,len); } @@ -720,7 +733,7 @@ void *addReplyDeferredLen(client *c) { /* Note that we install the write event here even if the object is not * ready to be sent, since we are sure that before returning to the * event loop setDeferredAggregateLen() will be called. 
*/ - if (prepareClientToWrite(c) != C_OK) return NULL; + if (_prepareClientToWrite(c) != C_OK) return NULL; /* Replicas should normally not cause any writes to the reply buffer. In case a rogue replica sent a command on the * replication link that caused a reply to be generated we'll simply disconnect it. @@ -985,7 +998,7 @@ void addReplyLongLong(client *c, long long ll) { else if (ll == 1) addReply(c, shared.cone); else { - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyLongLongWithPrefix(c, ll, ':'); } } @@ -998,13 +1011,13 @@ void addReplyLongLongFromStr(client *c, robj *str) { void addReplyAggregateLen(client *c, long length, int prefix) { serverAssert(length >= 0); - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyLongLongWithPrefix(c, length, prefix); } void addReplyArrayLen(client *c, long length) { serverAssert(length >= 0); - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyLongLongMBulk(c, length); } @@ -1061,13 +1074,13 @@ void addReplyNullArray(client *c) { /* Create the length prefix of a bulk reply, example: $2234 */ void addReplyBulkLen(client *c, robj *obj) { size_t len = stringObjectLen(obj); - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyLongLongBulk(c, len); } /* Add a Redis Object as a bulk reply */ void addReplyBulk(client *c, robj *obj) { - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; if (sdsEncodedObject(obj)) { const size_t len = sdslen(obj->ptr); @@ -1091,7 +1104,7 @@ void addReplyBulk(client *c, robj *obj) { /* Add a C buffer as bulk reply */ void addReplyBulkCBuffer(client *c, const void *p, size_t len) { - if (prepareClientToWrite(c) != C_OK) return; + if (_prepareClientToWrite(c) != C_OK) return; _addReplyLongLongBulk(c, len); _addReplyToBufferOrList(c, p, len); _addReplyToBufferOrList(c, "\r\n", 2); @@ -1099,7 +1112,7 @@ void addReplyBulkCBuffer(client *c, const void *p, size_t len) { /* Add sds to reply (takes ownership of sds and frees it) */ void addReplyBulkSds(client *c, sds s) { - if (prepareClientToWrite(c) != C_OK) { + if (_prepareClientToWrite(c) != C_OK) { sdsfree(s); return; } @@ -1235,9 +1248,9 @@ void AddReplyFromClient(client *dst, client *src) { /* First add the static buffer (either into the static buffer or reply list) */ addReplyProto(dst,src->buf, src->bufpos); - /* We need to check with prepareClientToWrite again (after addReplyProto) + /* We need to check with _prepareClientToWrite again (after addReplyProto) * since addReplyProto may have changed something (like CLIENT_CLOSE_ASAP) */ - if (prepareClientToWrite(dst) != C_OK) + if (_prepareClientToWrite(dst) != C_OK) return; /* We're bypassing _addReplyProtoToList, so we need to add the pre/post @@ -1284,26 +1297,32 @@ void copyReplicaOutputBuffer(client *dst, client *src) { ((replBufBlock *)listNodeValue(dst->ref_repl_buf_node))->refcount++; } +static inline int _clientHasPendingRepliesNonSlave(client *c) { + return c->bufpos || listLength(c->reply); +} + +static inline int _clientHasPendingRepliesSlave(client *c) { + /* Replicas use global shared replication buffer instead of + * private output buffer. */ + serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); + if (c->ref_repl_buf_node == NULL) return 0; + + /* If the last replication buffer block content is totally sent, + * we have nothing to send. 
*/ + listNode *ln = listLast(server.repl_buffer_blocks); + replBufBlock *tail = listNodeValue(ln); + if (ln == c->ref_repl_buf_node && + c->ref_block_pos == tail->used) return 0; + return 1; +} + /* Return true if the specified client has pending reply buffers to write to * the socket. */ int clientHasPendingReplies(client *c) { if (unlikely(clientTypeIsSlave(c))) { - /* Replicas use global shared replication buffer instead of - * private output buffer. */ - serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - if (c->ref_repl_buf_node == NULL) return 0; - - /* If the last replication buffer block content is totally sent, - * we have nothing to send. */ - listNode *ln = listLast(server.repl_buffer_blocks); - replBufBlock *tail = listNodeValue(ln); - if (ln == c->ref_repl_buf_node && - c->ref_block_pos == tail->used) return 0; - - return 1; - } else { - return c->bufpos || listLength(c->reply); + return _clientHasPendingRepliesSlave(c); } + return _clientHasPendingRepliesNonSlave(c); } void clientAcceptHandler(connection *conn) { @@ -1992,38 +2011,13 @@ static int _writevToClient(client *c, ssize_t *nwritten) { return C_OK; } -/* This function does actual writing output buffers to different types of - * clients, it is called by writeToClient. +/* This function does actual writing output buffers for non slave client types, + * it is called by writeToClient. * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, * and 'nwritten' is an output parameter, it means how many bytes server write * to client. */ -int _writeToClient(client *c, ssize_t *nwritten) { +static inline int _writeToClientNonSlave(client *c, ssize_t *nwritten) { *nwritten = 0; - if (unlikely(clientTypeIsSlave(c))) { - serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - - replBufBlock *o = listNodeValue(c->ref_repl_buf_node); - serverAssert(o->used >= c->ref_block_pos); - /* Send current block if it is not fully sent. */ - if (o->used > c->ref_block_pos) { - *nwritten = connWrite(c->conn, o->buf+c->ref_block_pos, - o->used-c->ref_block_pos); - if (*nwritten <= 0) return C_ERR; - c->ref_block_pos += *nwritten; - } - - /* If we fully sent the object on head, go to the next one. */ - listNode *next = listNextNode(c->ref_repl_buf_node); - if (next && c->ref_block_pos == o->used) { - o->refcount--; - ((replBufBlock *)(listNodeValue(next)))->refcount++; - c->ref_repl_buf_node = next; - c->ref_block_pos = 0; - incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); - } - return C_OK; - } - /* When the reply list is not empty, it's better to use writev to save us some * system calls and TCP packets. */ if (listLength(c->reply) > 0) { @@ -2045,8 +2039,36 @@ int _writeToClient(client *c, ssize_t *nwritten) { c->bufpos = 0; c->sentlen = 0; } - } + } + return C_OK; +} +/* This function does actual writing output buffers for slave client types, + * it is called by writeToClient. + * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, + * and 'nwritten' is an output parameter, it means how many bytes server write + * to client. */ +static inline int _writeToClientSlave(client *c, ssize_t *nwritten) { + *nwritten = 0; + serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); + replBufBlock *o = listNodeValue(c->ref_repl_buf_node); + serverAssert(o->used >= c->ref_block_pos); + /* Send current block if it is not fully sent. 
*/ + if (o->used > c->ref_block_pos) { + *nwritten = connWrite(c->conn, o->buf+c->ref_block_pos, + o->used-c->ref_block_pos); + if (*nwritten <= 0) return C_ERR; + c->ref_block_pos += *nwritten; + } + /* If we fully sent the object on head, go to the next one. */ + listNode *next = listNextNode(c->ref_repl_buf_node); + if (next && c->ref_block_pos == o->used) { + o->refcount--; + ((replBufBlock *)(listNodeValue(next)))->refcount++; + c->ref_repl_buf_node = next; + c->ref_block_pos = 0; + incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); + } return C_OK; } @@ -2064,32 +2086,45 @@ int writeToClient(client *c, int handler_installed) { atomicIncr(server.stat_io_writes_processed[c->running_tid], 1); ssize_t nwritten = 0, totwritten = 0; + const int is_slave = clientTypeIsSlave(c); - while(clientHasPendingReplies(c)) { - int ret = _writeToClient(c, &nwritten); - if (ret == C_ERR) break; - totwritten += nwritten; - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver. - * - * Moreover, we also send as much as possible if the client is - * a slave or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (server.maxmemory == 0 || - zmalloc_used_memory() < server.maxmemory) && - !(c->flags & CLIENT_SLAVE)) break; - } - - if (unlikely(clientTypeIsSlave(c))) { + if (unlikely(is_slave)) { + /* We send as much as possible if the client is + * a slave (otherwise, on high-speed traffic, the + * replication buffer will grow indefinitely) */ + while(_clientHasPendingRepliesSlave(c)) { + int ret = _writeToClientSlave(c, &nwritten); + if (ret == C_ERR) break; + totwritten += nwritten; + } atomicIncr(server.stat_net_repl_output_bytes, totwritten); } else { + /* If we reach this block and client is marked with CLIENT_SLAVE flag + * it's because it's a MONITOR client, which are marked as replicas, + * but exposed as normal clients */ + const int is_normal_client = !(c->flags & CLIENT_SLAVE); + while (_clientHasPendingRepliesNonSlave(c)) { + int ret = _writeToClientNonSlave(c, &nwritten); + if (ret == C_ERR) break; + totwritten += nwritten; + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. + * + * Moreover, we also send as much as possible if the client is + * a slave (covered above) or a monitor (covered here). 
+             * (otherwise, on high-speed traffic, the
+             * output buffer will grow indefinitely) */
+            if (totwritten > NET_MAX_WRITES_PER_EVENT &&
+                (server.maxmemory == 0 ||
+                 zmalloc_used_memory() < server.maxmemory) &&
+                is_normal_client) break;
+        }
         atomicIncr(server.stat_net_output_bytes, totwritten);
     }

@@ -4127,6 +4162,8 @@ int getClientType(client *c) {
 }

 static inline int clientTypeIsSlave(client *c) {
+    /* Even though MONITOR clients are marked as replicas, we
+     * want to expose them as normal clients. */
     if (unlikely((c->flags & CLIENT_SLAVE) && !(c->flags & CLIENT_MONITOR)))
         return 1;
     return 0;

From 73a9b916c9f42f2e07b9338a975f9a473ad0cd9b Mon Sep 17 00:00:00 2001
From: Ozan Tezcan
Date: Mon, 13 Jan 2025 15:09:52 +0300
Subject: [PATCH 38/42] Rdb channel replication (#13732)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60

Closes: https://github.com/redis/redis/issues/11678

**Motivation**

During a full sync, while the master is delivering the RDB to the
replica, incoming write commands are kept in a replication buffer in
order to be sent to the replica once RDB delivery is completed. If RDB
delivery takes a long time, this can create memory pressure on the
master. Also, once a replica connection accumulates replication data
beyond the output buffer limits, the master will kill the replica
connection, which may cause the replication attempt to fail.

The main benefit of rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on the
master. We do this by opening another connection for RDB delivery. The
main channel on the replica receives the replication stream while the
rdb channel receives the RDB.

This feature also helps to reduce the master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and streams the RDB directly to the
replicas. Before this change, due to a TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding them to the replica. This is no longer necessary: the main
process can avoid these expensive socket read/write syscalls. It also
means RDB delivery to the replica will be faster, as this step is
skipped.

In summary, replication will be faster and the master's performance
during full syncs will improve.

**Implementation steps**

1. When the replica connects to the master, it sends 'rdb-channel-repl'
   as part of the capability exchange to let the master know that the
   replica supports rdb channel replication.
2. When the replica lacks sufficient data for PSYNC, the master sends a
   +RDBCHANNELSYNC reply with the replica's client id. As the next step,
   the replica opens a new connection (rdb-channel) and configures it
   against the master with the appropriate capabilities and
   requirements. It also sends the given client id back to the master
   over the rdbchannel, so that the master can associate these channels
   (the initial replica connection will be referred to as the
   main-channel). Then, the replica requests a fullsync using the RDB
   channel.
3. Prior to forking, the master attaches the replica's main channel to
   the replication backlog to deliver the replication stream starting at
   the snapshot end offset.
4. The master main process sends the replication stream via the main
   channel, while the bgsave process sends the RDB directly to the
   replica via the rdb-channel.
   The replica accumulates the replication stream in a local buffer
   while the RDB is being loaded into memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
   and streams the accumulated replication data into the db. Sync is
   completed.

**Some details**

- Currently, rdbchannel replication is supported only if
  `repl-diskless-sync` is enabled on the master. Otherwise, replication
  happens over a single connection as before.
- On the replica, there is a limit to replication stream buffering. The
  replica uses a new config, `replica-full-sync-buffer-limit`, to limit
  the number of bytes to accumulate. If it is not set, the replica
  inherits the `client-output-buffer-limit ` hard limit config. If this
  limit is reached, the replica stops accumulating. This is not a
  failure scenario though: further accumulation will happen on the
  master side, and depending on the limits configured on the master, the
  master may kill the replica connection.

**API changes in INFO output:**

1. New replica state: `send_bulk_and_stream`. Indicates a full sync is
   still in progress for this replica and that it is receiving the
   replication stream and the rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
   Replica state changes in steps:
   - First, the replica sends psync and receives +RDBCHANNELSYNC :
     `state=wait_bgsave`
   - After the replica connects with the rdbchannel and delivery starts:
     `state=send_bulk_and_stream`
   - After full sync: `state=online`

2. On the replica side, replication stream buffering metrics:
   - replica_full_sync_buffer_size: Currently accumulated replication
     stream data in bytes.
   - replica_full_sync_buffer_peak: Peak number of bytes that this
     instance accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```

**API changes in CLIENT LIST**

In `client list` output, rdbchannel clients will have a 'C' flag in
addition to the 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```

**Config changes:**

- `replica-full-sync-buffer-limit`: Controls how much replication data
  the replica can accumulate during rdbchannel replication. A value of 0
  means the replica inherits the `client-output-buffer-limit ` hard
  limit config to limit the accumulated data.
- `repl-rdb-channel` is added as a hidden config. This is mostly for
  testing, as we need to support both rdbchannel replication and the
  older single connection replication (to keep compatibility with older
  versions; also, rdbchannel replication will not be enabled if
  repl-diskless-sync is not enabled). It affects both the master (not to
  respond to rdb channel requests) and the replica (not to declare the
  capability).

**Internal API changes:**

Changes that were introduced to Redis replication:

- A new replication capability is added to the replconf command:
  `capa rdb-channel-repl`. It indicates the replica is capable of rdb
  channel replication. The replica sends it when it connects to the
  master, along with its other capabilities.
- If the replica needs a fullsync, the master replies `+RDBCHANNELSYNC `
  to the replica's PSYNC request.
- When the replica opens the rdbchannel connection, as part of the
  replconf command, it sends `rdb-channel 1` to let the master know this
  is an rdb channel. Also, it sends `main-ch-client-id ` as part of the
  replconf command so the master can associate the channels.
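To make the exchange concrete, here is a minimal, illustrative sketch
(not part of this patch; the helper name is hypothetical) of how a
replica can extract the client id from the new reply before echoing it
back over the rdb channel as `REPLCONF main-ch-client-id <id>`:

```
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Hypothetical helper: parse "+RDBCHANNELSYNC <id>\r\n" and return the
 * main channel client id, or -1 if the reply does not match. */
static long long parseRdbChannelSyncReply(const char *reply) {
    const char *prefix = "+RDBCHANNELSYNC ";
    if (strncmp(reply, prefix, strlen(prefix)) != 0) return -1;
    return strtoll(reply + strlen(prefix), NULL, 10);
}

int main(void) {
    /* Prints 11, matching the id the master embedded in its reply. */
    printf("%lld\n", parseRdbChannelSyncReply("+RDBCHANNELSYNC 11\r\n"));
    return 0;
}
```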
**Testing:** As rdbchannel replication is enabled by default, we run whole test suite with it. Though, as we need to support both rdbchannel and single connection replication, we'll be running some tests twice with `repl-rdb-channel yes/no` config. **Replica state diagram** ``` * * Replica state machine * * * Main channel state * ┌───────────────────┐ * │RECEIVE_PING_REPLY │ * └────────┬──────────┘ * │ +PONG * ┌────────▼──────────┐ * │SEND_HANDSHAKE │ RDB channel state * └────────┬──────────┘ ┌───────────────────────────────┐ * │+OK ┌───► RDB_CH_SEND_HANDSHAKE │ * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ * │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ * │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │ * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ * │RECEIVE_PORT_REPLY │ │ │ +OK * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ * │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│ * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ * │RECEIVE_IP_REPLY │ │ │ +OK * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ * │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │ * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ * │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC * └────────┬──────────┘ │ │Rdb delivery * │ │ ┌──────────────▼────────────────┐ * ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │ * │SEND_PSYNC │ │ └──────────────┬────────────────┘ * └─┬─────────────────┘ │ │ Done loading * │PSYNC (use cached-master) │ │ * ┌─▼─────────────────┐ │ │ * │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication * └─┬─────────────────┘ │ │ │ buffer into memory * │ │ │ │ * │+RDBCHANNELSYNC client-id │ │ │ * ├──────┬───────────────────┘ │ │ * │ │ Main channel │ │ * │ │ accumulates repl data │ │ * │ ┌──▼────────────────┐ │ ┌───────▼───────────┐ * │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │ * │ └───────────────────┘ └────▲───▲──────────┘ * │ │ │ * │ │ │ * │ +FULLRESYNC ┌───────────────────┐ │ │ * ├────────────────► REPL_TRANSFER ├────┘ │ * │ └───────────────────┘ │ * │ +CONTINUE │ * └──────────────────────────────────────────────┘ */ ``` ----- This PR also contains changes and ideas from: https://github.com/valkey-io/valkey/pull/837 https://github.com/valkey-io/valkey/pull/1173 https://github.com/valkey-io/valkey/pull/804 https://github.com/valkey-io/valkey/pull/945 https://github.com/valkey-io/valkey/pull/989 --------- Co-authored-by: Yuan Wang Co-authored-by: debing.sun Co-authored-by: Moti Cohen Co-authored-by: naglera Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com> Co-authored-by: Madelyn Olson Co-authored-by: Binbin Co-authored-by: Viktor Söderqvist Co-authored-by: Ping Xie Co-authored-by: Ran Shidlansik Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com> --- redis.conf | 18 + src/config.c | 5 + src/debug.c | 25 + src/networking.c | 19 +- src/rdb.c | 137 +-- src/replication.c | 829 +++++++++++++++++- src/rio.c | 174 +++- src/rio.h | 53 +- src/server.c | 23 +- src/server.h | 65 +- .../cluster/tests/12-replica-migration-2.tcl | 2 +- tests/helpers/gen_write_load.tcl | 25 +- tests/integration/replication-buffer.tcl | 69 +- tests/integration/replication-psync.tcl | 54 +- tests/integration/replication-rdbchannel.tcl | 795 +++++++++++++++++ tests/integration/replication.tcl | 39 +- tests/support/util.tcl | 34 +- tests/unit/auth.tcl | 50 +- 18 files changed, 2204 insertions(+), 212 deletions(-) create mode 100644 
tests/integration/replication-rdbchannel.tcl diff --git a/redis.conf b/redis.conf index a1cbedd34..7184c44a1 100644 --- a/redis.conf +++ b/redis.conf @@ -727,6 +727,24 @@ repl-disable-tcp-nodelay no # # repl-backlog-ttl 3600 +# During a fullsync, the master may decide to send both the RDB file and the +# replication stream to the replica in parallel. This approach shifts the +# responsibility of buffering the replication stream to the replica during the +# fullsync process. The replica accumulates the replication stream data until +# the RDB file is fully loaded. Once the RDB delivery is completed and +# successfully loaded, the replica begins processing and applying the +# accumulated replication data to the db. The configuration below controls how +# much replication data the replica can accumulate during a fullsync. +# +# When the replica reaches this limit, it will stop accumulating further data. +# At this point, additional data accumulation may occur on the master side +# depending on the 'client-output-buffer-limit ' config of master. +# +# A value of 0 means replica inherits hard limit of +# 'client-output-buffer-limit ' config to limit accumulation size. +# +# replica-full-sync-buffer-limit 0 + # The replica priority is an integer number published by Redis in the INFO # output. It is used by Redis Sentinel in order to select a replica to promote # into a master if the master is no longer working correctly. diff --git a/src/config.c b/src/config.c index 797284347..9d287dd99 100644 --- a/src/config.c +++ b/src/config.c @@ -3,6 +3,9 @@ * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* @@ -3076,6 +3079,7 @@ standardConfig static_configs[] = { createBoolConfig("lazyfree-lazy-user-flush", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.lazyfree_lazy_user_flush , 0, NULL, NULL), createBoolConfig("repl-disable-tcp-nodelay", NULL, MODIFIABLE_CONFIG, server.repl_disable_tcp_nodelay, 0, NULL, NULL), createBoolConfig("repl-diskless-sync", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_diskless_sync, 1, NULL, NULL), + createBoolConfig("repl-rdb-channel", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, server.repl_rdb_channel, 1, NULL, NULL), createBoolConfig("aof-rewrite-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.aof_rewrite_incremental_fsync, 1, NULL, NULL), createBoolConfig("no-appendfsync-on-rewrite", NULL, MODIFIABLE_CONFIG, server.aof_no_fsync_on_rewrite, 0, NULL, NULL), createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL), @@ -3218,6 +3222,7 @@ standardConfig static_configs[] = { createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024*1024, LONG_MAX, server.proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ createLongLongConfig("stream-node-max-entries", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.stream_node_max_entries, 100, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("repl-backlog-size", NULL, MODIFIABLE_CONFIG, 1, LLONG_MAX, server.repl_backlog_size, 1024*1024, MEMORY_CONFIG, NULL, updateReplBacklogSize), /* Default: 1mb */ + createLongLongConfig("replica-full-sync-buffer-limit", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.repl_full_sync_buffer_limit, 0, MEMORY_CONFIG, NULL, NULL), /* Default: Inherits 'client-output-buffer-limit ' */ /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), diff --git a/src/debug.c b/src/debug.c index c4d184b15..4ecfed5e3 100644 --- a/src/debug.c +++ b/src/debug.c @@ -2,6 +2,9 @@ * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* @@ -483,6 +486,8 @@ void debugCommand(client *c) { " In case RESET is provided the peak reset time will be restored to the default value", "REPLYBUFFER RESIZING <0|1>", " Enable or disable the reply buffer resize cron job", +"REPL-PAUSE ", +" Pause the server's main process during various replication steps.", "DICT-RESIZING <0|1>", " Enable or disable the main dict and expire dict resizing.", "SCRIPT >", @@ -1018,6 +1023,20 @@ NULL return; } addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "repl-pause") && c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr, "clear")) { + server.repl_debug_pause = REPL_DEBUG_PAUSE_NONE; + } else if (!strcasecmp(c->argv[2]->ptr,"after-fork")) { + server.repl_debug_pause |= REPL_DEBUG_AFTER_FORK; + } else if (!strcasecmp(c->argv[2]->ptr,"before-rdb-channel")) { + server.repl_debug_pause |= REPL_DEBUG_BEFORE_RDB_CHANNEL; + } else if (!strcasecmp(c->argv[2]->ptr, "on-streaming-repl-buf")) { + server.repl_debug_pause |= REPL_DEBUG_ON_STREAMING_REPL_BUF; + } else { + addReplySubcommandSyntaxError(c); + return; + } + addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "dict-resizing") && c->argc == 3) { server.dict_resizing = atoi(c->argv[2]->ptr); addReply(c, shared.ok); @@ -2583,6 +2602,12 @@ void applyWatchdogPeriod(void) { } } +void debugPauseProcess(void) { + serverLog(LL_NOTICE, "Process is about to stop."); + raise(SIGSTOP); + serverLog(LL_NOTICE, "Process has been continued."); +} + /* Positive input is sleep time in microseconds. Negative input is fractions * of microseconds, i.e. -10 means 100 nanoseconds. */ void debugDelay(int usec) { diff --git a/src/networking.c b/src/networking.c index 20452726a..31799a406 100644 --- a/src/networking.c +++ b/src/networking.c @@ -188,6 +188,7 @@ client *createClient(connection *conn) { c->slave_addr = NULL; c->slave_capa = SLAVE_CAPA_NONE; c->slave_req = SLAVE_REQ_NONE; + c->main_ch_client_id = 0; c->reply = listCreate(); c->deferred_reply_errors = NULL; c->reply_bytes = 0; @@ -252,6 +253,7 @@ void putClientInPendingWriteQueue(client *c) { * writes at this stage. */ if (!(c->flags & CLIENT_PENDING_WRITE) && (c->replstate == REPL_STATE_NONE || + c->replstate == SLAVE_STATE_SEND_BULK_AND_STREAM || (c->replstate == SLAVE_STATE_ONLINE && !c->repl_start_cmd_stream_on_ack))) { /* Here instead of installing the write handler, we just flag the @@ -1556,7 +1558,16 @@ void unlinkClient(client *c) { } } /* Only use shutdown when the fork is active and we are the parent. */ - if (server.child_type) connShutdown(c->conn); + if (server.child_type) { + /* connShutdown() may access TLS state. If this is a rdbchannel + * client, bgsave fork is writing to the connection and TLS state in + * the main process is stale. SSL_shutdown() involves a handshake, + * and it may block the caller when used with stale TLS state.*/ + if (c->flags & CLIENT_REPL_RDB_CHANNEL) + shutdown(c->conn->fd, SHUT_RDWR); + else + connShutdown(c->conn); + } connClose(c->conn); c->conn = NULL; } @@ -1725,7 +1736,8 @@ void freeClient(client *c) { /* Log link disconnection with slave */ if (clientTypeIsSlave(c)) { - serverLog(LL_NOTICE,"Connection with replica %s lost.", + const char *type = c->flags & CLIENT_REPL_RDB_CHANNEL ? 
" (rdbchannel)" : ""; + serverLog(LL_NOTICE,"Connection with replica%s %s lost.", type, replicationGetSlaveName(c)); } @@ -3086,6 +3098,7 @@ sds catClientInfoString(sds s, client *client) { if (client->flags & CLIENT_READONLY) *p++ = 'r'; if (client->flags & CLIENT_NO_EVICT) *p++ = 'e'; if (client->flags & CLIENT_NO_TOUCH) *p++ = 'T'; + if (client->flags & CLIENT_REPL_RDB_CHANNEL) *p++ = 'C'; if (p == flags) *p++ = 'N'; *p++ = '\0'; @@ -4309,7 +4322,7 @@ void flushSlavesOutputBuffers(void) { * * 3. Obviously if the slave is not ONLINE. */ - if (slave->replstate == SLAVE_STATE_ONLINE && + if ((slave->replstate == SLAVE_STATE_ONLINE || slave->replstate == SLAVE_STATE_SEND_BULK_AND_STREAM) && !(slave->flags & CLIENT_CLOSE_ASAP) && can_receive_writes && !slave->repl_start_cmd_stream_on_ack && diff --git a/src/rdb.c b/src/rdb.c index 764b87a48..7a072cad5 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2,8 +2,13 @@ * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2) or the Server Side Public License v1 (SSPLv1). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. */ #include "server.h" @@ -3810,8 +3815,10 @@ static void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { } if (server.rdb_child_exit_pipe!=-1) close(server.rdb_child_exit_pipe); - aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); - close(server.rdb_pipe_read); + if (server.rdb_pipe_read != -1) { + aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); + close(server.rdb_pipe_read); + } server.rdb_child_exit_pipe = -1; server.rdb_pipe_read = -1; zfree(server.rdb_pipe_conns); @@ -3875,7 +3882,8 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { listNode *ln; listIter li; pid_t childpid; - int pipefds[2], rdb_pipe_write, safe_to_exit_pipe; + int pipefds[2], rdb_pipe_write = 0, safe_to_exit_pipe = 0; + int rdb_channel = (req & SLAVE_REQ_RDB_CHANNEL); if (hasActiveChildProcess()) return C_ERR; @@ -3883,29 +3891,30 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { * drained the pipe. */ if (server.rdb_pipe_conns) return C_ERR; - /* Before to fork, create a pipe that is used to transfer the rdb bytes to - * the parent, we can't let it write directly to the sockets, since in case - * of TLS we must let the parent handle a continuous TLS state when the - * child terminates and parent takes over. */ - if (anetPipe(pipefds, O_NONBLOCK, 0) == -1) return C_ERR; - server.rdb_pipe_read = pipefds[0]; /* read end */ - rdb_pipe_write = pipefds[1]; /* write end */ + if (!rdb_channel) { + /* Before to fork, create a pipe that is used to transfer the rdb bytes to + * the parent, we can't let it write directly to the sockets, since in case + * of TLS we must let the parent handle a continuous TLS state when the + * child terminates and parent takes over. */ + if (anetPipe(pipefds, O_NONBLOCK, 0) == -1) return C_ERR; + server.rdb_pipe_read = pipefds[0]; /* read end */ + rdb_pipe_write = pipefds[1]; /* write end */ - /* create another pipe that is used by the parent to signal to the child - * that it can exit. */ - if (anetPipe(pipefds, 0, 0) == -1) { - close(rdb_pipe_write); - close(server.rdb_pipe_read); - return C_ERR; + /* create another pipe that is used by the parent to signal to the child + * that it can exit. 
*/ + if (anetPipe(pipefds, 0, 0) == -1) { + close(rdb_pipe_write); + close(server.rdb_pipe_read); + return C_ERR; + } + safe_to_exit_pipe = pipefds[0]; /* read end */ + server.rdb_child_exit_pipe = pipefds[1]; /* write end */ } - safe_to_exit_pipe = pipefds[0]; /* read end */ - server.rdb_child_exit_pipe = pipefds[1]; /* write end */ /* Collect the connections of the replicas we want to transfer - * the RDB to, which are i WAIT_BGSAVE_START state. */ - server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves)); - server.rdb_pipe_numconns = 0; - server.rdb_pipe_numconns_writing = 0; + * the RDB to, which are in WAIT_BGSAVE_START state. */ + int numconns = 0; + connection **conns = zmalloc(sizeof(*conns) * listLength(server.slaves)); listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = ln->value; @@ -3913,22 +3922,36 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { /* Check slave has the exact requirements */ if (slave->slave_req != req) continue; - server.rdb_pipe_conns[server.rdb_pipe_numconns++] = slave->conn; - replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset()); + replicationSetupSlaveForFullResync(slave, getPsyncInitialOffset()); + conns[numconns++] = slave->conn; + if (rdb_channel) { + /* Put the socket in blocking mode to simplify RDB transfer. */ + connSendTimeout(slave->conn, server.repl_timeout * 1000); + connBlock(slave->conn); + } } } + if (!rdb_channel) { + server.rdb_pipe_conns = conns; + server.rdb_pipe_numconns = numconns; + server.rdb_pipe_numconns_writing = 0; + } + /* Create the child process. */ if ((childpid = redisFork(CHILD_TYPE_RDB)) == 0) { /* Child */ int retval, dummy; rio rdb; - rioInitWithFd(&rdb,rdb_pipe_write); - - /* Close the reading part, so that if the parent crashes, the child will - * get a write error and exit. */ - close(server.rdb_pipe_read); + if (rdb_channel) { + rioInitWithConnset(&rdb, conns, numconns); + } else { + rioInitWithFd(&rdb,rdb_pipe_write); + /* Close the reading part, so that if the parent crashes, the child + * will get a write error and exit. */ + close(server.rdb_pipe_read); + } redisSetProcTitle("redis-rdb-to-slaves"); redisSetCpuAffinity(server.bgsave_cpulist); @@ -3941,14 +3964,19 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB"); } - rioFreeFd(&rdb); - /* wake up the reader, tell it we're done. */ - close(rdb_pipe_write); - close(server.rdb_child_exit_pipe); /* close write end so that we can detect the close on the parent. */ - /* hold exit until the parent tells us it's safe. we're not expecting - * to read anything, just get the error when the pipe is closed. */ - dummy = read(safe_to_exit_pipe, pipefds, 1); - UNUSED(dummy); + if (rdb_channel) { + rioFreeConnset(&rdb); + } else { + rioFreeFd(&rdb); + /* wake up the reader, tell it we're done. */ + close(rdb_pipe_write); + close(server.rdb_child_exit_pipe); /* close write end so that we can detect the close on the parent. */ + /* hold exit until the parent tells us it's safe. we're not expecting + * to read anything, just get the error when the pipe is closed. */ + dummy = read(safe_to_exit_pipe, pipefds, 1); + UNUSED(dummy); + } + zfree(conns); exitFromChild((retval == C_OK) ? 
0 : 1); } else { /* Parent */ @@ -3966,24 +3994,33 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; } } - close(rdb_pipe_write); - close(server.rdb_pipe_read); - close(server.rdb_child_exit_pipe); - zfree(server.rdb_pipe_conns); - server.rdb_pipe_conns = NULL; - server.rdb_pipe_numconns = 0; - server.rdb_pipe_numconns_writing = 0; + + if (!rdb_channel) { + close(rdb_pipe_write); + close(server.rdb_pipe_read); + close(server.rdb_child_exit_pipe); + zfree(server.rdb_pipe_conns); + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; + } } else { - serverLog(LL_NOTICE,"Background RDB transfer started by pid %ld", - (long) childpid); + serverLog(LL_NOTICE, "Background RDB transfer started by pid %ld to %s", (long)childpid, + rdb_channel ? "replica socket" : "parent process pipe"); server.rdb_save_time_start = time(NULL); server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; - close(rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ - if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { - serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + if (!rdb_channel) { + close(rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ + if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { + serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + } } } - close(safe_to_exit_pipe); + if (rdb_channel) + zfree(conns); + else + close(safe_to_exit_pipe); + return (childpid == -1) ? C_ERR : C_OK; } return C_OK; /* Unreached. */ diff --git a/src/replication.c b/src/replication.c index 79a55d39b..635497b67 100644 --- a/src/replication.c +++ b/src/replication.c @@ -12,6 +12,19 @@ * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. */ +/* + * replication.c - Replication Management + * + * This file contains the implementation of Redis's replication logic, which + * enables data synchronization between master and replica instances. + * It handles: + * - Master-to-replica synchronization + * - Full and partial resynchronizations + * - Replication backlog management + * - State machines for replica operations + * - RDB Channel for Full Sync (lookup "rdb channel for full sync") + */ + #include "server.h" #include "cluster.h" #include "bio.h" @@ -31,13 +44,56 @@ void replicationSendAck(void); int replicaPutOnline(client *slave); void replicaStartCommandStream(client *slave); int cancelReplicationHandshake(int reconnect); +static void rdbChannelFullSyncWithMaster(connection *conn); +static int rdbChannelAbortRdbTransfer(void); +static void rdbChannelBufferReplData(connection *conn); +static void rdbChannelReplDataBufInit(void); +static void rdbChannelSuccess(void); /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case * the instance is configured to have no persistence. */ int RDBGeneratedByReplication = 0; + +/* A reference to diskless loading rio to abort it asynchronously. It's needed + * for rdbchannel replication. While loading from rdbchannel connection, we may + * yield back to eventloop. If main channel connection detects a network problem + * we want to abort loading. 
It calls rioAbort() in this case, so next rioRead() + * from rdbchannel connection will return error to cancel loading safely. */ +static rio *disklessLoadingRio = NULL; + /* --------------------------- Utility functions ---------------------------- */ + +/* Returns 1 if the replica is rdbchannel and there is an associated main + * channel slave with that. */ +int replicationCheckHasMainChannel(client *replica) { + if (!(replica->flags & CLIENT_REPL_RDB_CHANNEL) || + !replica->main_ch_client_id || + lookupClientByID(replica->main_ch_client_id) == NULL) + { + return 0; + } + return 1; +} + +/* During rdb channel replication, replica opens two connections. From master + * POV, these connections are distinct replicas in server.slaves. This function + * counts associated replicas as one and returns logical replica count. */ +unsigned long replicationLogicalReplicaCount(void) { + unsigned long count = 0; + listNode *ln; + listIter li; + + listRewind(server.slaves,&li); + while ((ln = listNext(&li))) { + client *replica = listNodeValue(ln); + if (!replicationCheckHasMainChannel(replica)) + count++; + } + return count; +} + static ConnectionType *connTypeOfReplication(void) { if (server.tls_replication) { return connectionTypeTls(); @@ -191,7 +247,8 @@ int canFeedReplicaReplBuffer(client *replica) { if (replica->flags & CLIENT_REPL_RDBONLY) return 0; /* Don't feed replicas that are still waiting for BGSAVE to start. */ - if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) return 0; + if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START || + replica->replstate == SLAVE_STATE_WAIT_RDB_CHANNEL) return 0; /* Don't feed replicas that are going to be closed ASAP. */ if (replica->flags & CLIENT_CLOSE_ASAP) return 0; @@ -199,6 +256,20 @@ int canFeedReplicaReplBuffer(client *replica) { return 1; } +/* Create the replication backlog if needed. */ +void createReplicationBacklogIfNeeded(void) { + if (listLength(server.slaves) == 1 && server.repl_backlog == NULL) { + /* When we create the backlog from scratch, we always use a new + * replication ID and clear the ID2, since there is no valid + * past history. */ + changeReplicationId(); + clearReplicationId2(); + createReplicationBacklog(); + serverLog(LL_NOTICE,"Replication backlog created, my new " + "replication IDs are '%s' and '%s'", + server.replid, server.replid2); + } +} /* Similar with 'prepareClientToWrite', note that we must call this function * before feeding replication stream into global replication buffer, since * clientHasPendingReplies in prepareClientToWrite will access the global @@ -702,6 +773,22 @@ int replicationSetupSlaveForFullResync(client *slave, long long offset) { /* Don't send this reply to slaves that approached us with * the old SYNC command. */ if (!(slave->flags & CLIENT_PRE_PSYNC)) { + if (slave->slave_req & SLAVE_REQ_RDB_CHANNEL) { + /* This slave is rdbchannel. Find its associated main channel and + * change its state so we can deliver replication stream from now + * on, in parallel to rdb. 
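+         * The RDB bytes themselves will be written to this connection by
+         * the bgsave child.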
*/ + uint64_t id = slave->main_ch_client_id; + client *c = lookupClientByID(id); + if (c && c->replstate == SLAVE_STATE_WAIT_RDB_CHANNEL) { + c->replstate = SLAVE_STATE_SEND_BULK_AND_STREAM; + serverLog(LL_NOTICE, "Starting to deliver RDB and replication stream to replica: %s", + replicationGetSlaveName(c)); + } else { + serverLog(LL_WARNING, "Starting to deliver RDB to replica %s" + " but it has no associated main channel", + replicationGetSlaveName(slave)); + } + } buflen = snprintf(buf,sizeof(buf),"+FULLRESYNC %s %lld\r\n", server.replid,offset); if (connWrite(slave->conn,buf,buflen) != buflen) { @@ -748,8 +835,9 @@ int masterTryPartialResynchronization(client *c, long long psync_offset) { "up to %lld", psync_offset, server.second_replid_offset); } } else { - serverLog(LL_NOTICE,"Full resync requested by replica %s", - replicationGetSlaveName(c)); + serverLog(LL_NOTICE,"Full resync requested by replica %s %s", + replicationGetSlaveName(c), + c->flags & CLIENT_REPL_RDB_CHANNEL ? "(rdb-channel)" : ""); } goto need_full_resync; } @@ -846,8 +934,9 @@ int startBgsaveForReplication(int mincapa, int req) { /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & SLAVE_REQ_RDB_MASK)); - serverLog(LL_NOTICE,"Starting BGSAVE for SYNC with target: %s", - socket_target ? "replicas sockets" : "disk"); + serverLog(LL_NOTICE,"Starting BGSAVE for SYNC with target: %s%s", + socket_target ? "replicas sockets" : "disk", + (req & SLAVE_REQ_RDB_CHANNEL) ? " (rdb-channel)" : ""); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); @@ -860,6 +949,8 @@ int startBgsaveForReplication(int mincapa, int req) { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); } + if (server.repl_debug_pause & REPL_DEBUG_AFTER_FORK) + debugPauseProcess(); } else { serverLog(LL_WARNING,"BGSAVE for replication: replication information not available, can't generate the RDB file right now. Try later."); retval = C_ERR; @@ -1007,6 +1098,34 @@ void syncCommand(client *c) { * resync on purpose when they are not able to partially * resync. */ if (master_replid[0] != '?') server.stat_sync_partial_err++; + if (c->slave_capa & SLAVE_CAPA_RDB_CHANNEL_REPL) { + int len; + char buf[128]; + /* Replica is capable of rdbchannel replication. This is + * replica's main channel. Let replica know full sync is needed. + * Replica will open another connection (rdbchannel). Once rdb + * delivery starts, we'll stream repl data to the main channel.*/ + c->flags |= CLIENT_SLAVE; + c->replstate = SLAVE_STATE_WAIT_RDB_CHANNEL; + c->repl_ack_time = server.unixtime; + listAddNodeTail(server.slaves, c); + createReplicationBacklogIfNeeded(); + + serverLog(LL_NOTICE, + "Replica %s is capable of rdb channel synchronization, and partial sync isn't possible. " + "Full sync will continue with dedicated rdb channel.", + replicationGetSlaveName(c)); + + /* Send +RDBCHANNELSYNC with client id. 
Rdbchannel of replica + * will call 'replconf set-main-ch-id ' so we can + * associate replica connections on master.*/ + len = snprintf(buf, sizeof(buf), "+RDBCHANNELSYNC %llu\r\n", + (unsigned long long) c->id); + if (connWrite(c->conn, buf, strlen(buf)) != len) + freeClientAsync(c); + + return; + } } } else { /* If a slave uses SYNC, we are dealing with an old implementation @@ -1028,17 +1147,7 @@ void syncCommand(client *c) { listAddNodeTail(server.slaves,c); /* Create the replication backlog if needed. */ - if (listLength(server.slaves) == 1 && server.repl_backlog == NULL) { - /* When we create the backlog from scratch, we always use a new - * replication ID and clear the ID2, since there is no valid - * past history. */ - changeReplicationId(); - clearReplicationId2(); - createReplicationBacklog(); - serverLog(LL_NOTICE,"Replication backlog created, my new " - "replication IDs are '%s' and '%s'", - server.replid, server.replid2); - } + createReplicationBacklogIfNeeded(); /* CASE 1: BGSAVE is in progress, with disk target. */ if (server.child_type == CHILD_TYPE_RDB && @@ -1126,7 +1235,7 @@ void syncCommand(client *c) { * the master can accurately lists replicas and their listening ports in the * INFO output. * - * - capa + * - capa * What is the capabilities of this instance. * eof: supports EOF-style RDB transfer for diskless replication. * psync2: supports PSYNC v2, so understands +CONTINUE . @@ -1146,7 +1255,11 @@ void syncCommand(client *c) { * - rdb-filter-only * Define "include" filters for the RDB snapshot. Currently we only support * a single include filter: "functions". Passing an empty string "" will - * result in an empty RDB. */ + * result in an empty RDB. + * + * - main-ch-client-id + * Replica's main channel informs master that this is the main channel of the + * rdb channel identified by the client-id. */ void replconfCommand(client *c) { int j; @@ -1182,6 +1295,10 @@ void replconfCommand(client *c) { c->slave_capa |= SLAVE_CAPA_EOF; else if (!strcasecmp(c->argv[j+1]->ptr,"psync2")) c->slave_capa |= SLAVE_CAPA_PSYNC2; + else if (!strcasecmp(c->argv[j+1]->ptr,"rdb-channel-repl") && server.repl_rdb_channel && + server.repl_diskless_sync) { + c->slave_capa |= SLAVE_CAPA_RDB_CHANNEL_REPL; + } } else if (!strcasecmp(c->argv[j]->ptr,"ack")) { /* REPLCONF ACK is used by slave to inform the master the amount * of replication stream that it processed so far. It is an @@ -1212,6 +1329,8 @@ void replconfCommand(client *c) { checkChildrenDone(); if (c->repl_start_cmd_stream_on_ack && c->replstate == SLAVE_STATE_ONLINE) replicaStartCommandStream(c); + if (c->replstate == SLAVE_STATE_SEND_BULK_AND_STREAM) + replicaPutOnline(c); /* Note: this command does not reply anything! */ return; } else if (!strcasecmp(c->argv[j]->ptr,"getack")) { @@ -1255,6 +1374,31 @@ void replconfCommand(client *c) { } } sdsfreesplitres(filters, filter_count); + } else if (!strcasecmp(c->argv[j]->ptr, "rdb-channel")) { + long rdb_channel = 0; + if (getRangeLongFromObjectOrReply(c, c->argv[j + 1], 0, 1, &rdb_channel, NULL) != C_OK) + return; + if (rdb_channel == 1) { + c->flags |= CLIENT_REPL_RDB_CHANNEL; + c->slave_req |= SLAVE_REQ_RDB_CHANNEL; + } else { + c->flags &= ~CLIENT_REPL_RDB_CHANNEL; + c->slave_req &= ~SLAVE_REQ_RDB_CHANNEL; + } + } else if (!strcasecmp(c->argv[j]->ptr, "main-ch-client-id")) { + /* REPLCONF main-ch-client-id is used to identify + * the current replica rdb channel with existing main channel + * connection. 
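+     * The id must refer to a main channel client that is currently waiting
+     * in SLAVE_STATE_WAIT_RDB_CHANNEL, otherwise an error is returned.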
*/ + long long client_id = 0; + client *main_ch; + if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &client_id, NULL) != C_OK) + return; + main_ch = lookupClientByID(client_id); + if (!main_ch || main_ch->replstate != SLAVE_STATE_WAIT_RDB_CHANNEL) { + addReplyErrorFormat(c, "Unrecognized RDB client id: %lld", client_id); + return; + } + c->main_ch_client_id = (uint64_t)client_id; } else { addReplyErrorFormat(c,"Unrecognized REPLCONF option: %s", (char*)c->argv[j]->ptr); @@ -1713,6 +1857,19 @@ void shiftReplicationId(void) { /* ----------------------------------- SLAVE -------------------------------- */ +/* Replication: Replica side. */ +void slaveGetPortStr(char *buf, size_t size) { + long long port; + if (server.slave_announce_port) { + port = server.slave_announce_port; + } else if (server.tls_replication && server.tls_port) { + port = server.tls_port; + } else { + port = server.port; + } + ll2string(buf, size, port); +} + /* Returns 1 if the given replication state is a handshake state, * 0 otherwise. */ int slaveIsInHandshakeState(void) { @@ -1852,6 +2009,7 @@ void readSyncBulkPayload(connection *conn) { int use_diskless_load = useDisklessLoad(); redisDb *diskless_load_tempDb = NULL; functionsLibCtx* temp_functions_lib_ctx = NULL; + int rdbchannel = (conn == server.repl_rdb_transfer_s); int empty_db_flags = server.repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; @@ -2086,6 +2244,7 @@ void readSyncBulkPayload(connection *conn) { functionsLibCtxClear(functions_lib_ctx); } + disklessLoadingRio = &rdb; loadingSetFlags(NULL, server.repl_transfer_size, asyncLoading); if (server.repl_diskless_load != REPL_DISKLESS_LOAD_SWAPDB) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Flushing old data"); @@ -2120,9 +2279,9 @@ void readSyncBulkPayload(connection *conn) { loadingFailed = 1; } } + disklessLoadingRio = NULL; if (loadingFailed) { - cancelReplicationHandshake(1); rioFreeConn(&rdb, NULL); if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { @@ -2144,6 +2303,10 @@ void readSyncBulkPayload(connection *conn) { * stopLoading() must be called after emptyData() above. */ stopLoading(0); + /* This must be called after stopLoading(0) as it checks loading + * flag in case of rdbchannel replication. */ + cancelReplicationHandshake(1); + /* Note that there's no point in restarting the AOF on SYNC * failure, it'll be restarted when sync succeeds or the replica * gets promoted. */ @@ -2281,6 +2444,27 @@ void readSyncBulkPayload(connection *conn) { /* Send the initial ACK immediately to put this replica in online state. */ if (usemark) replicationSendAck(); + if (rdbchannel) { + int close_asap; + + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; + } + /* At this point, RDB is loaded. If state is REPL_RDB_CH_STATE_CLOSE_ASAP, + * it means main channel faced a problem while RDB is being loaded. It + * stopped replication stream buffering. It's okay though. We'll stream + * whatever we have into the db, then replica will try psync from the + * index that it has. */ + close_asap = (server.repl_rdb_ch_state == REPL_RDB_CH_STATE_CLOSE_ASAP); + /* Finalize fullsync */ + rdbChannelSuccess(); + + /* Main channel connection was broken. Let's trigger a psync with master. */ + if (close_asap && server.master) + freeClientAsync(server.master); + } + /* Restart the AOF subsystem now that we finished the sync. This * will trigger an AOF rewrite, and when done will start appending * to the new file. 
*/ @@ -2440,6 +2624,7 @@ char *sendCommandArgv(connection *conn, int argc, char **argv, size_t *argv_lens #define PSYNC_FULLRESYNC 3 #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 +#define PSYNC_FULLRESYNC_RDBCHANNEL 6 int slaveTryPartialResynchronization(connection *conn, int read_reply) { char *psync_replid; char psync_offset[32]; @@ -2531,6 +2716,25 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) { return PSYNC_FULLRESYNC; } + if (!strncmp(reply, "+RDBCHANNELSYNC", strlen("+RDBCHANNELSYNC"))) { + char *client_id = strchr(reply,' '); + if (client_id) + client_id++; + + if (!client_id) { + serverLog(LL_WARNING, + "Master replied with wrong +RDBCHANNELSYNC syntax: %s", reply); + return PSYNC_NOT_SUPPORTED; + } + server.repl_main_ch_client_id = strtoll(client_id, NULL, 10);; + /* A response of +RDBCHANNELSYNC from the master implies that partial + * synchronization is not possible and that the master supports full + * sync using dedicated RDB channel. Full sync will continue that way.*/ + serverLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); + sdsfree(reply); + return PSYNC_FULLRESYNC_RDBCHANNEL; + } + if (!strncmp(reply,"+CONTINUE",9)) { /* Partial resync was accepted. */ serverLog(LL_NOTICE, @@ -2696,17 +2900,11 @@ void syncWithMaster(connection *conn) { /* Set the slave port, so that Master's INFO command can list the * slave listening port correctly. */ { - int port; - if (server.slave_announce_port) - port = server.slave_announce_port; - else if (server.tls_replication && server.tls_port) - port = server.tls_port; - else - port = server.port; - sds portstr = sdsfromlonglong(port); + char buf[LONG_STR_SIZE]; + + slaveGetPortStr(buf, sizeof(buf)); err = sendCommand(conn,"REPLCONF", - "listening-port",portstr, NULL); - sdsfree(portstr); + "listening-port",buf, NULL); if (err) goto write_error; } @@ -2726,7 +2924,9 @@ void syncWithMaster(connection *conn) { * * The master will ignore capabilities it does not understand. */ err = sendCommand(conn,"REPLCONF", - "capa","eof","capa","psync2",NULL); + "capa","eof","capa","psync2", + server.repl_rdb_channel ? "capa" : NULL, "rdb-channel-repl", NULL); + if (err) goto write_error; server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY; @@ -2829,7 +3029,10 @@ void syncWithMaster(connection *conn) { * but there is nothing technically wrong with a full resync which * could happen in edge cases. */ if (server.failover_state == FAILOVER_IN_PROGRESS) { - if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { + if (psync_result == PSYNC_CONTINUE || + psync_result == PSYNC_FULLRESYNC || + psync_result == PSYNC_FULLRESYNC_RDBCHANNEL) + { clearFailoverState(); } else { abortFailover("Failover target rejected psync request"); @@ -2883,6 +3086,27 @@ void syncWithMaster(connection *conn) { server.repl_transfer_fd = dfd; } + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; + + /* Using rdb channel replication, the master responded +RDBCHANNELSYNC. + * We need to initialize the RDB channel. 
*/ + if (psync_result == PSYNC_FULLRESYNC_RDBCHANNEL) { + /* Create RDB connection */ + server.repl_rdb_transfer_s = connCreate(server.el, connTypeOfReplication()); + if (connConnect(server.repl_rdb_transfer_s, server.masterhost, + server.masterport, server.bind_source_addr, + rdbChannelFullSyncWithMaster) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to master: %s", connGetLastError(server.repl_rdb_transfer_s)); + goto error; + } + server.repl_rdb_ch_state = REPL_RDB_CH_SEND_HANDSHAKE; + connSetReadHandler(server.repl_transfer_s, NULL); + return; + } + /* Setup the non blocking download of the bulk file. */ if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) @@ -2895,10 +3119,6 @@ void syncWithMaster(connection *conn) { } server.repl_state = REPL_STATE_TRANSFER; - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; return; no_response_error: /* Handle receiveSynchronousResponse() error when master has no reply */ @@ -2908,6 +3128,9 @@ no_response_error: /* Handle receiveSynchronousResponse() error when master has error: if (dfd != -1) close(dfd); connClose(conn); + if (server.repl_rdb_transfer_s) + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; server.repl_transfer_s = NULL; if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); @@ -2975,6 +3198,9 @@ void replicationAbortSyncTransfer(void) { * * Otherwise zero is returned and no operation is performed at all. */ int cancelReplicationHandshake(int reconnect) { + if (rdbChannelAbortRdbTransfer() != C_OK) + return 1; + if (server.repl_state == REPL_STATE_TRANSFER) { replicationAbortSyncTransfer(); server.repl_state = REPL_STATE_CONNECT; @@ -3135,6 +3361,539 @@ void replicationHandleMasterDisconnection(void) { } } +/* Rdb channel for full sync + * + * - During a full sync, while the master is delivering the RDB to the replica, + * incoming write commands are kept in a replication buffer in order to be + * sent to the replica once RDB delivery is completed. If RDB delivery takes + * a long time, it might create memory pressure on the master. Also, once a + * replica connection accumulates replication data that exceeds the output + * buffer limits, the master will kill the replica connection. This may cause + * a replication failure. + * + * The main benefit of rdb channel replication is streaming incoming commands + * in parallel to the RDB delivery. This approach shifts replication stream + * buffering to the replica and reduces load on the master. We do this by + * opening another connection for RDB delivery. The main channel on the + * replica receives the replication stream while the rdb channel receives the + * RDB. + * + * This feature also helps to reduce the master's main process CPU load. By + * opening a dedicated connection for the RDB transfer, the bgsave process has + * direct access to the new connection and streams the RDB directly to the + * replicas. Before this change, due to a TLS connection restriction, the + * bgsave process was writing RDB bytes to a pipe and the main process was + * forwarding it to the replica. This is no longer necessary; the main process + * can avoid these expensive socket read/write syscalls. + * + * Implementation + * - When the replica connects to the master, it sends 'rdb-channel-repl' as + * part of the capability exchange to let the master know that the replica + * supports rdb channel.
+ * - When the replica lacks sufficient data for PSYNC, the master sends a + * +RDBCHANNELSYNC reply with the replica's client id. As the next step, the + * replica opens a new connection (rdb-channel) and configures it against + * the master with the appropriate capabilities and requirements. It also + * sends the given client id back to the master over the rdb channel so that + * the master can associate these channels (the initial replica connection + * will be referred to as the main channel). Then, the replica requests a + * fullsync using the RDB channel. + * - Prior to forking, the master attaches the replica's main channel to the + * replication backlog to deliver the replication stream starting at the + * snapshot end offset. + * - The master main process sends the replication stream via the main channel, + * while the bgsave process sends the RDB directly to the replica via the + * rdb-channel. The replica accumulates the replication stream in a local + * buffer, while the RDB is being loaded into memory. + * - Once the replica completes loading the rdb, it drops the rdb channel and + * streams the accumulated replication stream into the db. Sync is completed. + * + * * Replica state machine * + * + * Main channel state + * ┌───────────────────┐ + * │RECEIVE_PING_REPLY │ + * └────────┬──────────┘ + * │ +PONG + * ┌────────▼──────────┐ + * │SEND_HANDSHAKE │ RDB channel state + * └────────┬──────────┘ ┌───────────────────────────────┐ + * │+OK ┌───► RDB_CH_SEND_HANDSHAKE │ + * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ + * │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id + * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ + * │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │ + * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ + * │RECEIVE_PORT_REPLY │ │ │ +OK + * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ + * │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│ + * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ + * │RECEIVE_IP_REPLY │ │ │ +OK + * └────────┬──────────┘ │ ┌──────────────▼────────────────┐ + * │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │ + * ┌────────▼──────────┐ │ └──────────────┬────────────────┘ + * │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC + * └────────┬──────────┘ │ │Rdb delivery + * │ │ ┌──────────────▼────────────────┐ + * ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │ + * │SEND_PSYNC │ │ └──────────────┬────────────────┘ + * └─┬─────────────────┘ │ │ Done loading + * │PSYNC (use cached-master) │ │ + * ┌─▼─────────────────┐ │ │ + * │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication + * └─┬─────────────────┘ │ │ │ buffer into memory + * │ │ │ │ + * │+RDBCHANNELSYNC client-id │ │ │ + * ├──────┬───────────────────┘ │ │ + * │ │ Main channel │ │ + * │ │ accumulates repl data │ │ + * │ ┌──▼────────────────┐ │ ┌───────▼───────────┐ + * │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │ + * │ └───────────────────┘ └────▲───▲──────────┘ + * │ │ │ + * │ │ │ + * │ +FULLRESYNC ┌───────────────────┐ │ │ + * ├────────────────► REPL_TRANSFER ├────┘ │ + * │ └───────────────────┘ │ + * │ +CONTINUE │ + * └──────────────────────────────────────────────┘ + */ + +/* Replication: Replica side. */ +static int rdbChannelSendHandshake(connection *conn, sds *err) { + /* AUTH with the master if required.
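+ * As an illustration only (editor's sketch; the <...> values are placeholders, + * not literals from this patch), the exchange driven from this handler over + * the rdb channel looks roughly like: + * + * replica> AUTH <masteruser> <masterauth> (only if masterauth is set) + * replica> REPLCONF capa eof rdb-only 1 rdb-channel 1 + * main-ch-client-id <id> listening-port <port> + * master> +OK + * replica> PSYNC ? -1 + * master> +FULLRESYNC <replid> <offset>, then the RDB payload follows.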
*/ + if (server.masterauth) { + char *args[] = {"AUTH", NULL, NULL}; + size_t lens[] = {4, 0, 0}; + int argc = 1; + if (server.masteruser) { + args[argc] = server.masteruser; + lens[argc] = strlen(server.masteruser); + argc++; + } + args[argc] = server.masterauth; + lens[argc] = sdslen(server.masterauth); + argc++; + *err = sendCommandArgv(conn, argc, args, lens); + if (*err) { + serverLog(LL_WARNING, "Error sending AUTH to master in rdb channel replication handshake: %s", *err); + return C_ERR; + } + } + + char buf[LONG_STR_SIZE]; + slaveGetPortStr(buf, sizeof(buf)); + + char cid[LONG_STR_SIZE]; + ull2string(cid, sizeof(cid), server.repl_main_ch_client_id); + + *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", + "rdb-channel", "1", "main-ch-client-id", cid, + "listening-port", buf, NULL); + if (*err) { + serverLog(LL_WARNING, "Error sending REPLCONF command to master in rdb channel handshake: %s", *err); + return C_ERR; + } + + if (connSetReadHandler(conn, rdbChannelFullSyncWithMaster) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", + strerror(errno), connGetInfo(conn, conninfo, sizeof(conninfo))); + return C_ERR; + } + return C_OK; +} + +/* Replication: Replica side. */ +static int rdbChannelHandleAuthReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); + if (*err == NULL) { + serverLog(LL_WARNING, "Master did not respond to auth command during rdb channel handshake"); + return C_ERR; + } + if ((*err)[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to master: %s", *err); + return C_ERR; + } + server.repl_rdb_ch_state = REPL_RDB_CH_RECEIVE_REPLCONF_REPLY; + return C_OK; +} + +/* Replication: Replica side. */ +static int rdbChannelHandleReplconfReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); + if (*err == NULL) { + serverLog(LL_WARNING, "Master did not respond to replconf command during rdb channel handshake"); + return C_ERR; + } + if (*err[0] == '-') { + serverLog(LL_WARNING, "Master replied error to replconf: %s", *err); + return C_ERR; + } + sdsfree(*err); + + if (server.repl_debug_pause & REPL_DEBUG_BEFORE_RDB_CHANNEL) + debugPauseProcess(); + + /* Request rdb from master */ + *err = sendCommand(conn, "PSYNC", "?", "-1", NULL); + if (*err) { + serverLog(LL_WARNING, "I/O error writing to Master: %s", *err); + return C_ERR; + } + + return C_OK; +} + +/* Replication: Replica side. */ +static int rdbChannelHandleFullresyncReply(connection *conn, sds *err) { + char *replid = NULL, *offset = NULL; + + *err = receiveSynchronousResponse(conn); + if (*err == NULL) + return C_ERR; + + if (*err[0] == '\0') { + /* Retry again later */ + serverLog(LL_DEBUG, "Received empty psync reply"); + return C_RETRY; + } + + /* FULL RESYNC, parse the reply in order to extract the replid + * and the replication offset. 
*/ + replid = strchr(*err,' '); + if (replid) { + replid++; + offset = strchr(replid, ' '); + if (offset) offset++; + } + if (!replid || !offset || (offset-replid-1) != CONFIG_RUN_ID_SIZE) { + serverLog(LL_WARNING, "Received unexpected psync reply: %s", *err); + return C_ERR; + } + memcpy(server.master_replid, replid, offset-replid-1); + server.master_replid[CONFIG_RUN_ID_SIZE] = '\0'; + server.master_initial_offset = strtoll(offset,NULL,10); + + /* Prepare the main and rdb channels for rdb and repl stream delivery.*/ + server.repl_state = REPL_STATE_TRANSFER; + rdbChannelReplDataBufInit(); + + serverLog(LL_NOTICE, "Starting to receive RDB and replication stream in parallel."); + + /* RDB is still loading. Setup connection to accumulate repl data. */ + if (connSetReadHandler(server.repl_transfer_s, + rdbChannelBufferReplData) != C_OK) + { + serverLog(LL_WARNING, "Can't set read handler for main channel: %s", + strerror(errno)); + return C_ERR; + } + + /* Prepare RDB channel connection for RDB download. */ + if (connSetReadHandler(server.repl_rdb_transfer_s, + readSyncBulkPayload) != C_OK) + { + char inf[CONN_INFO_LEN]; + serverLog(LL_WARNING, + "Can't create readable event for rdb channel connection: %s (%s)", + strerror(errno), + connGetInfo(server.repl_rdb_transfer_s, inf, sizeof(inf))); + return C_ERR; + } + + return C_OK; +} + +/* Replication: Replica side. + * This connection handler is used to initialize the RDB channel connection.*/ +static void rdbChannelFullSyncWithMaster(connection *conn) { + int ret = 0; + char *err = NULL; + serverAssert(conn == server.repl_rdb_transfer_s); + + /* Check for errors in the socket: after a non blocking connect() we + * may find that the socket is in error state. */ + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Error condition on socket for rdb channel replication: %s", + connGetLastError(conn)); + goto error; + } + switch (server.repl_rdb_ch_state) { + case REPL_RDB_CH_SEND_HANDSHAKE: + ret = rdbChannelSendHandshake(conn, &err); + if (ret == C_OK) + server.repl_rdb_ch_state = REPL_RDB_CH_RECEIVE_AUTH_REPLY; + break; + case REPL_RDB_CH_RECEIVE_AUTH_REPLY: + if (server.masterauth) { + ret = rdbChannelHandleAuthReply(conn, &err); + if (ret == C_OK) + server.repl_rdb_ch_state = REPL_RDB_CH_RECEIVE_REPLCONF_REPLY; + /* Wait for next bulk before trying to read replconf reply. */ + break; + } + server.repl_rdb_ch_state = REPL_RDB_CH_RECEIVE_REPLCONF_REPLY; + /* fall through */ + case REPL_RDB_CH_RECEIVE_REPLCONF_REPLY: + ret = rdbChannelHandleReplconfReply(conn, &err); + if (ret == C_OK) + server.repl_rdb_ch_state = REPL_RDB_CH_RECEIVE_FULLRESYNC; + break; + case REPL_RDB_CH_RECEIVE_FULLRESYNC: + ret = rdbChannelHandleFullresyncReply(conn, &err); + if (ret == C_OK) + server.repl_rdb_ch_state = REPL_RDB_CH_RDB_LOADING; + break; + default: + serverPanic("Unknown rdb channel state: %d", server.repl_rdb_ch_state); + } + + if (ret == C_ERR) + goto error; + + sdsfree(err); + return; + +error: + if (err) { + serverLog(LL_WARNING, "rdb channel sync failed with error: %s", err); + sdsfree(err); + } + if (server.repl_transfer_s) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; + } + server.repl_state = REPL_STATE_CONNECT; + rdbChannelAbortRdbTransfer(); +} + +/* Replication: Replica side. + * Initialize replica's local replication buffer to accumulate repl stream + * during rdb channel sync. 
*/ +static void rdbChannelReplDataBufInit(void) { + serverAssert(server.repl_full_sync_buffer.blocks == NULL); + server.repl_full_sync_buffer.size = 0; + server.repl_full_sync_buffer.used = 0; + server.repl_full_sync_buffer.blocks = listCreate(); + server.repl_full_sync_buffer.blocks->free = zfree; +} + +/* Replication: Replica side. + * Free replica's local replication buffer */ +static void rdbChannelReplDataBufFree(void) { + listRelease(server.repl_full_sync_buffer.blocks); + server.repl_full_sync_buffer.blocks = NULL; + server.repl_full_sync_buffer.size = 0; + server.repl_full_sync_buffer.used = 0; +} + +/* Replication: Replica side. + * Reads replication data from master connection into the repl buffer block */ +int rdbChannelReadIntoBuf(connection *conn, replDataBufBlock *b) { + atomicIncr(server.stat_io_reads_processed[IOTHREAD_MAIN_THREAD_ID], 1); + + int nread = connRead(conn, b->buf + b->used, b->size - b->used); + if (nread <= 0) { + if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Main channel error while reading from master: %s", + connGetLastError(conn)); + cancelReplicationHandshake(1); + } + return -1; + } + + b->used += nread; + server.repl_full_sync_buffer.used += nread; + atomicIncr(server.stat_net_repl_input_bytes, nread); + + return nread; +} + +/* Replication: Replica side. + * Read handler for buffering incoming repl data during RDB download/loading. */ +void rdbChannelBufferReplData(connection *conn) { + const int buflen = 1024 * 1024; + const int minread = 16 * 1024; + int nread = 0; + int needs_read = 1; + + listNode *ln = listLast(server.repl_full_sync_buffer.blocks); + replDataBufBlock *tail = ln ? listNodeValue(ln) : NULL; + + /* Try to append to the last node. */ + if (tail && tail->size > tail->used) { + nread = rdbChannelReadIntoBuf(conn, tail); + if (nread <= 0) + return; + + /* If the buffer is filled fully, there might be more data in the socket + * buffer. Only read again if we've read a small amount (less than minread). */ + needs_read = (tail->size == tail->used) && nread < minread; + } + + if (needs_read) { + unsigned long long limit; + size_t usable_size; + + /* For the accumulation limit, if 'replica-full-sync-buffer-limit' is set, + * we'll use it. Otherwise, 'client-output-buffer-limit <replica>' is + * the limit. */ + limit = server.repl_full_sync_buffer_limit; + if (limit == 0) + limit = server.client_obuf_limits[CLIENT_TYPE_SLAVE].hard_limit_bytes; + + if (limit != 0 && server.repl_full_sync_buffer.size > limit) { + serverLog(LL_NOTICE, "Replication buffer limit has been reached (%llu bytes), " + "stopped buffering replication stream. Further accumulation may occur on master side. ", limit); + connSetReadHandler(conn, NULL); + return; + } + + tail = zmalloc_usable(buflen, &usable_size); + tail->size = usable_size - sizeof(replDataBufBlock); + tail->used = 0; + + listAddNodeTail(server.repl_full_sync_buffer.blocks, tail); + server.repl_full_sync_buffer.size += tail->size; + + /* Update buffer's peak */ + if (server.repl_full_sync_buffer.peak < server.repl_full_sync_buffer.size) + server.repl_full_sync_buffer.peak = server.repl_full_sync_buffer.size; + + rdbChannelReadIntoBuf(conn, tail); + } +} + +/* Replication: Replica side. + * Streams accumulated replication data into the database.
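+ * Rough shape of the loop below (editor's summary, not additional code): + * + * while ((node = listFirst(blocks))) { + * append block->buf to the master client's querybuf + * processInputBuffer(master) to apply the commands to the db + * yield via processEventsWhileBlocked() every + * loading-process-events-interval-bytes bytes + * }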
*/ +int rdbChannelStreamReplDataToDb(client *c) { + int ret = C_OK; + size_t size, used, offset = 0; + listNode *n = NULL; + replDataBufBlock *o; + + serverAssert(c->flags & CLIENT_MASTER); + + if (!server.repl_full_sync_buffer.blocks) + return C_OK; + + blockingOperationStarts(); + protectClient(c); + while ((n = listFirst(server.repl_full_sync_buffer.blocks))) { + o = listNodeValue(n); + size = o->size; + used = o->used; + c->querybuf = sdscatlen(c->querybuf, o->buf, used); + c->read_reploff += (long long int) used; + listDelNode(server.repl_full_sync_buffer.blocks, n); + + /* We don't expect error return value but just in case. */ + ret = processInputBuffer(c); + if (ret != C_OK) + break; + + server.repl_full_sync_buffer.used -= used; + server.repl_full_sync_buffer.size -= size; + + if (server.repl_debug_pause & REPL_DEBUG_ON_STREAMING_REPL_BUF) + debugPauseProcess(); + + /* Check if we should yield back to the event loop */ + if (server.loading_process_events_interval_bytes && + ((offset + used) / server.loading_process_events_interval_bytes > + offset / server.loading_process_events_interval_bytes)) + { + replicationSendNewlineToMaster(); + processEventsWhileBlocked(); + } + + offset += used; + /* Check if master client was freed in processEventsWhileBlocked(). + * It can happen if we receive 'replicaof' command or 'client kill' + * command for the master. */ + if (c->flags & CLIENT_CLOSE_ASAP || !server.repl_full_sync_buffer.blocks) { + ret = C_ERR; + break; + } + } + unprotectClient(c); + blockingOperationEnds(); + + if (ret != C_OK) { + serverLog(LL_WARNING, "Master client was freed while streaming accumulated replication data to db."); + return C_ERR; + } + + return C_OK; +} + +/* Replication: Replica side. + * On rdb channel failure, close rdb-connection and reset state. + * Return C_OK if cleanup is done. Otherwise, returns C_ERR which means cleanup + * will be done asynchronously. */ +static int rdbChannelAbortRdbTransfer(void) { + if (server.repl_rdb_ch_state == REPL_RDB_CH_STATE_NONE) + return C_OK; + + if (server.repl_rdb_transfer_s) { + if (server.loading) { + /* If loading flag is set, we want to handle cleanup asynchronously. + * We set REPL_RDB_CH_STATE_CLOSE_ASAP so it will be handled just + * outside loading loop.*/ + serverLog(LL_NOTICE, "Aborting rdb channel sync while loading the RDB."); + + if (disklessLoadingRio) + /* Mark rio with abort flag, next rioRead() will return error.*/ + rioAbort(disklessLoadingRio); + else { + /* For disk based loading, we can wait until loading is done. + * This way, replica will have a chance for a successful psync + * later.*/ + serverLog(LL_NOTICE, "After loading RDB, replica will try psync with master."); + } + + if (server.repl_transfer_s) + connSetReadHandler(server.repl_transfer_s, NULL); + + server.repl_rdb_ch_state = REPL_RDB_CH_STATE_CLOSE_ASAP; + return C_ERR; + } + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; + } + + serverLog(LL_NOTICE, "Aborting rdb channel sync"); + + if (server.repl_transfer_fd != -1) { + close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + } + if (server.repl_transfer_tmpfile) { + bg_unlink(server.repl_transfer_tmpfile); + zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + } + rdbChannelReplDataBufFree(); + server.repl_rdb_ch_state = REPL_RDB_CH_STATE_NONE; + return C_OK; +} + +/* Replica side. After loading the rdb, stream replication buffer to the db. 
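+ * For context (editor's note, based on readSyncBulkPayload() above): on a + * successful rdbchannel load, the replica closes repl_rdb_transfer_s, calls + * this function to drain the accumulated stream into the db, and, if the main + * channel broke mid-load (REPL_RDB_CH_STATE_CLOSE_ASAP), frees the master + * client so a psync is retried from the offset it already has.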
*/ +static void rdbChannelSuccess(void) { + serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Starting to stream replication buffer into the db" + " (%zu bytes).", server.repl_full_sync_buffer.used); + + if (rdbChannelStreamReplDataToDb(server.master) == C_ERR) { + serverLog(LL_WARNING, "Failed to stream local replication buffer into the db"); + + rdbChannelAbortRdbTransfer(); + if (server.master) + freeClientAsync(server.master); + return; + } + + serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Successfully streamed replication buffer into the db"); + rdbChannelReplDataBufFree(); + server.repl_rdb_ch_state = REPL_RDB_CH_STATE_NONE; +} + void replicaofCommand(client *c) { /* SLAVEOF is not allowed in cluster mode as replication is automatically * configured using the current address of the master node. */ diff --git a/src/rio.c b/src/rio.c index 9398a3f78..16ec75b07 100644 --- a/src/rio.c +++ b/src/rio.c @@ -1,3 +1,16 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. + */ + /* rio.c is a simple stream-oriented I/O abstraction that provides an interface * to write code that can consume/produce data using different concrete input * and output devices. For instance the same rdb.c code using the rio @@ -14,34 +27,6 @@ * for the current checksum. * * ---------------------------------------------------------------------------- - * - * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-current, Redis Ltd. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
*/ @@ -429,6 +414,139 @@ void rioFreeFd(rio *r) { sdsfree(r->io.fd.buf); } +/* ------------------- Connection set implementation ------------------ + * This target is used to write the RDB file to a set of replica connections as + * part of rdb channel replication. */ + +/* Returns 1 for success, 0 for failure. + * The function returns success as long as we are able to correctly write + * to at least one connection. + * + * When buf is NULL or len is 0, the function performs a flush operation if + * there is some pending buffer, so this function is also used in order to + * implement rioConnsetFlush(). */ +static size_t rioConnsetWrite(rio *r, const void *buf, size_t len) { + const size_t pre_flush_size = 256 * 1024; + unsigned char *p = (unsigned char*) buf; + size_t buflen = len; + + /* For small writes, we'd rather keep the data in the user-space buffer and + * flush it only when it grows. However, for larger writes, we prefer to + * flush any pre-existing buffer and write the new one directly, without + * reallocs and memory copying. */ + if (len > pre_flush_size) { + rioConnsetWrite(r, NULL, 0); + } else { + if (buf && len) { + r->io.connset.buf = sdscatlen(r->io.connset.buf, buf, len); + if (sdslen(r->io.connset.buf) <= PROTO_IOBUF_LEN) + return 1; + } + + p = (unsigned char *)r->io.connset.buf; + buflen = sdslen(r->io.connset.buf); + } + + while (buflen > 0) { + /* Write in little chunks so that when there are big writes we + * parallelize while the kernel is sending data in background to the + * TCP socket. */ + size_t limit = PROTO_IOBUF_LEN * 2; + size_t count = buflen < limit ? buflen : limit; + size_t failed = 0; /* Connections that are marked failed, counted per chunk + * so a single earlier failure can't accumulate into a + * false "all failed" result. */ + + for (size_t i = 0; i < r->io.connset.n_dst; i++) { + size_t n_written = 0; + + if (r->io.connset.dst[i].failed != 0) { + failed++; + continue; /* Skip failed connections. */ + } + + do { + ssize_t ret; + connection *c = r->io.connset.dst[i].conn; + + ret = connWrite(c, p + n_written, count - n_written); + if (ret <= 0) { + if (errno == 0) + errno = EIO; + /* With blocking sockets, which is the sole user of this + * rio target, EWOULDBLOCK is returned only because of + * the SO_SNDTIMEO socket option, so we translate the error + * into one more recognizable by the user. */ + if (ret == -1 && errno == EWOULDBLOCK) + errno = ETIMEDOUT; + + r->io.connset.dst[i].failed = 1; + break; + } + n_written += ret; + } while (n_written != count); + } + if (failed == r->io.connset.n_dst) + return 0; /* All the connections have failed. */ + + p += count; + buflen -= count; + r->io.connset.pos += count; + } + + sdsclear(r->io.connset.buf); + return 1; +} + +/* Returns 1 or 0 for success/failure. */ +static size_t rioConnsetRead(rio *r, void *buf, size_t len) { + UNUSED(r); + UNUSED(buf); + UNUSED(len); + return 0; /* Error, this target does not support reading. */ +} + +/* Returns the number of sent bytes. */ +static off_t rioConnsetTell(rio *r) { + return r->io.connset.pos; +} + +/* Flushes any buffer to the target device if applicable. Returns 1 on success + * and 0 on failure. */ +static int rioConnsetFlush(rio *r) { + /* Our flush is implemented by the write method, which recognizes a + * buffer set to NULL with a count of zero as a flush request.
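+ * A minimal caller-side sketch (editor's illustration; conns, n_conns and + * payload are assumed to exist, and error handling plus the actual RDB + * serialization are omitted). Init fans out to the replicas, writes are + * buffered and chunked, and flush drains any pending buffer: + * + * rio rdb; + * rioInitWithConnset(&rdb, conns, n_conns); + * rioWrite(&rdb, payload, payload_len); + * rioFlush(&rdb); + * rioFreeConnset(&rdb);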
*/ + return rioConnsetWrite(r, NULL, 0); +} + +static const rio rioConnsetIO = { + rioConnsetRead, + rioConnsetWrite, + rioConnsetTell, + rioConnsetFlush, + NULL, /* update_checksum */ + 0, /* current checksum */ + 0, /* flags */ + 0, /* bytes read or written */ + 0, /* read/write chunk size */ + { { NULL, 0 } } /* union for io-specific vars */ +}; + +void rioInitWithConnset(rio *r, connection **conns, size_t n_conns) { + *r = rioConnsetIO; + r->io.connset.dst = zcalloc(sizeof(*r->io.connset.dst) * n_conns); + r->io.connset.n_dst = n_conns; + r->io.connset.pos = 0; + r->io.connset.buf = sdsempty(); + + for (size_t i = 0; i < n_conns; i++) + r->io.connset.dst[i].conn = conns[i]; +} + +/* release the rio stream. */ +void rioFreeConnset(rio *r) { + zfree(r->io.connset.dst); + sdsfree(r->io.connset.buf); +} + /* ---------------------------- Generic functions ---------------------------- */ /* This function can be installed both in memory and file streams when checksum diff --git a/src/rio.h b/src/rio.h index 361d2004c..b868fd366 100644 --- a/src/rio.h +++ b/src/rio.h @@ -1,31 +1,14 @@ /* - * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-current, Redis Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. */ @@ -39,6 +22,7 @@ #define RIO_FLAG_READ_ERROR (1<<0) #define RIO_FLAG_WRITE_ERROR (1<<1) +#define RIO_FLAG_ABORT (1<<2) #define RIO_TYPE_FILE (1<<0) #define RIO_TYPE_BUFFER (1<<1) @@ -97,6 +81,17 @@ struct _rio { off_t pos; sds buf; } fd; + /* Multiple connections target (used to write to N sockets). */ + struct { + struct { + connection *conn; /* Connection */ + int failed; /* If write failed on this connection. 
*/ + } *dst; + + size_t n_dst; /* Number of connections */ + off_t pos; /* Number of sent bytes */ + sds buf; + } connset; } io; }; @@ -107,7 +102,7 @@ typedef struct _rio rio; * if needed. */ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { - if (r->flags & RIO_FLAG_WRITE_ERROR) return 0; + if (r->flags & (RIO_FLAG_WRITE_ERROR | RIO_FLAG_ABORT)) return 0; while (len) { size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; if (r->update_cksum) r->update_cksum(r,buf,bytes_to_write); @@ -123,7 +118,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { } static inline size_t rioRead(rio *r, void *buf, size_t len) { - if (r->flags & RIO_FLAG_READ_ERROR) return 0; + if (r->flags & (RIO_FLAG_READ_ERROR | RIO_FLAG_ABORT)) return 0; while (len) { size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; if (r->read(r,buf,bytes_to_read) == 0) { @@ -146,6 +141,10 @@ static inline int rioFlush(rio *r) { return r->flush(r); } +static inline void rioAbort(rio *r) { + r->flags |= RIO_FLAG_ABORT; +} + /* This function allows to know if there was a read error in any past * operation, since the rio stream was created or since the last call * to rioClearError(). */ @@ -159,16 +158,18 @@ static inline int rioGetWriteError(rio *r) { } static inline void rioClearErrors(rio *r) { - r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR); + r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR|RIO_FLAG_ABORT); } void rioInitWithFile(rio *r, FILE *fp); void rioInitWithBuffer(rio *r, sds s); void rioInitWithConn(rio *r, connection *conn, size_t read_limit); void rioInitWithFd(rio *r, int fd); +void rioInitWithConnset(rio *r, connection **conns, size_t n_conns); void rioFreeFd(rio *r); void rioFreeConn(rio *r, sds* out_remainingBufferedData); +void rioFreeConnset(rio *r); size_t rioWriteBulkCount(rio *r, char prefix, long count); size_t rioWriteBulkString(rio *r, const char *buf, size_t len); diff --git a/src/server.c b/src/server.c index 7a422bb46..fb1fc1f4b 100644 --- a/src/server.c +++ b/src/server.c @@ -1444,7 +1444,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { serverLog(LL_DEBUG, "%lu clients connected (%lu replicas), %zu bytes in use", listLength(server.clients)-listLength(server.slaves), - listLength(server.slaves), + replicationLogicalReplicaCount(), zmalloc_used_memory()); } } @@ -2183,6 +2183,8 @@ void initServerConfig(void) { server.cached_master = NULL; server.master_initial_offset = -1; server.repl_state = REPL_STATE_NONE; + server.repl_rdb_ch_state = REPL_RDB_CH_STATE_NONE; + server.repl_full_sync_buffer = (struct replDataBuf) {0}; server.repl_transfer_tmpfile = NULL; server.repl_transfer_fd = -1; server.repl_transfer_s = NULL; @@ -2679,6 +2681,8 @@ void initServer(void) { server.hz = server.config_hz; server.pid = getpid(); server.in_fork_child = CHILD_TYPE_NONE; + server.rdb_pipe_read = -1; + server.rdb_child_exit_pipe = -1; server.main_thread_id = pthread_self(); server.current_client = NULL; server.errors = raxNew(); @@ -5458,7 +5462,10 @@ const char *replstateToString(int replstate) { switch (replstate) { case SLAVE_STATE_WAIT_BGSAVE_START: case SLAVE_STATE_WAIT_BGSAVE_END: + case SLAVE_STATE_WAIT_RDB_CHANNEL: return "wait_bgsave"; + case SLAVE_STATE_SEND_BULK_AND_STREAM: + return "send_bulk_and_stream"; case SLAVE_STATE_SEND_BULK: return "send_bulk"; case SLAVE_STATE_ONLINE: @@ -6056,7 +6063,9 @@ sds 
genRedisInfoString(dict *section_dict, int all_sections, int everything) { "master_last_io_seconds_ago:%d\r\n", server.master ? ((int)(server.unixtime-server.master->lastinteraction)) : -1, "master_sync_in_progress:%d\r\n", server.repl_state == REPL_STATE_TRANSFER, "slave_read_repl_offset:%lld\r\n", slave_read_repl_offset, - "slave_repl_offset:%lld\r\n", slave_repl_offset)); + "slave_repl_offset:%lld\r\n", slave_repl_offset, + "replica_full_sync_buffer_size:%zu\r\n", server.repl_full_sync_buffer.size, + "replica_full_sync_buffer_peak:%zu\r\n", server.repl_full_sync_buffer.peak)); if (server.repl_state == REPL_STATE_TRANSFER) { double perc = 0; @@ -6085,7 +6094,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf(info, "connected_slaves:%lu\r\n", - listLength(server.slaves)); + replicationLogicalReplicaCount()); /* If min-slaves-to-write is active, write the number of slaves * currently considered 'good'. */ @@ -6108,6 +6117,14 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { int port; long lag = 0; + /* During rdbchannel replication, replica opens two connections. + * These are distinct slaves in server.slaves list from master + * POV. We don't want to list these separately. If a rdbchannel + * replica has an associated main-channel replica in + * server.slaves list, we'll list main channel replica only. */ + if (replicationCheckHasMainChannel(slave)) + continue; + if (!slaveip) { if (connAddrPeerName(slave->conn,ip,sizeof(ip),&port) == -1) continue; diff --git a/src/server.h b/src/server.h index 49ab7b708..5bc8b3f4d 100644 --- a/src/server.h +++ b/src/server.h @@ -92,6 +92,7 @@ struct hdr_histogram; /* Error codes */ #define C_OK 0 #define C_ERR -1 +#define C_RETRY -2 /* Static server configuration */ #define CONFIG_DEFAULT_HZ 10 /* Time interrupt calls/sec. */ @@ -394,6 +395,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CLIENT_MODULE_PREVENT_AOF_PROP (1ULL<<48) /* Module client do not want to propagate to AOF */ #define CLIENT_MODULE_PREVENT_REPL_PROP (1ULL<<49) /* Module client do not want to propagate to replica */ #define CLIENT_REPROCESSING_COMMAND (1ULL<<50) /* The client is re-processing the command. */ +#define CLIENT_REPL_RDB_CHANNEL (1ULL<<51) /* Client which is used for rdb delivery as part of rdb channel replication */ /* Any flag that does not let optimize FLUSH SYNC to run it in bg as blocking client ASYNC */ #define CLIENT_AVOID_BLOCKING_ASYNC_FLUSH (CLIENT_DENY_BLOCKING|CLIENT_MULTI|CLIENT_LUA_DEBUG|CLIENT_LUA_DEBUG_SYNC|CLIENT_MODULE) @@ -473,6 +475,24 @@ typedef enum { REPL_STATE_CONNECTED, /* Connected to master */ } repl_state; +/* Replica rdb channel replication state. Used in server.repl_rdb_ch_state for + * replicas to remember what to do next. */ +typedef enum { + REPL_RDB_CH_STATE_CLOSE_ASAP = -1, /* Async error state */ + REPL_RDB_CH_STATE_NONE = 0, /* No active rdb channel sync */ + REPL_RDB_CH_SEND_HANDSHAKE, /* Send handshake sequence to master */ + REPL_RDB_CH_RECEIVE_AUTH_REPLY, /* Wait for AUTH reply */ + REPL_RDB_CH_RECEIVE_REPLCONF_REPLY, /* Wait for REPLCONF reply */ + REPL_RDB_CH_RECEIVE_FULLRESYNC, /* Wait for +FULLRESYNC reply */ + REPL_RDB_CH_RDB_LOADING, /* Loading rdb using rdb channel */ +} repl_rdb_channel_state; + +/* Replication debug flags for testing. 
*/ +#define REPL_DEBUG_PAUSE_NONE (1 << 0) +#define REPL_DEBUG_AFTER_FORK (1 << 1) +#define REPL_DEBUG_BEFORE_RDB_CHANNEL (1 << 2) +#define REPL_DEBUG_ON_STREAMING_REPL_BUF (1 << 3) + /* The state of an in progress coordinated failover */ typedef enum { NO_FAILOVER = 0, /* No failover in progress */ @@ -491,16 +511,22 @@ typedef enum { #define SLAVE_STATE_ONLINE 9 /* RDB file transmitted, sending just updates. */ #define SLAVE_STATE_RDB_TRANSMITTED 10 /* RDB file transmitted - This state is used only for * a replica that only wants RDB without replication buffer */ +#define SLAVE_STATE_WAIT_RDB_CHANNEL 11 /* Main channel of replica is connected, * we are waiting for the rdbchannel connection to start delivery. */ +#define SLAVE_STATE_SEND_BULK_AND_STREAM 12 /* Main channel of a replica which uses rdb channel replication. * Sending RDB file and replication stream in parallel. */ /* Slave capabilities. */ -#define SLAVE_CAPA_NONE 0 -#define SLAVE_CAPA_EOF (1<<0) /* Can parse the RDB EOF streaming format. */ -#define SLAVE_CAPA_PSYNC2 (1<<1) /* Supports PSYNC2 protocol. */ +#define SLAVE_CAPA_NONE 0 +#define SLAVE_CAPA_EOF (1<<0) /* Can parse the RDB EOF streaming format. */ +#define SLAVE_CAPA_PSYNC2 (1<<1) /* Supports PSYNC2 protocol. */ +#define SLAVE_CAPA_RDB_CHANNEL_REPL (1<<2) /* Supports rdb channel replication during full sync */ /* Slave requirements */ -#define SLAVE_REQ_NONE 0 -#define SLAVE_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ +#define SLAVE_REQ_NONE 0 +#define SLAVE_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ #define SLAVE_REQ_RDB_EXCLUDE_FUNCTIONS (1 << 1) /* Exclude functions from RDB */ +#define SLAVE_REQ_RDB_CHANNEL (1 << 2) /* Use rdb channel replication */ /* Mask of all bits in the slave requirements bitfield that represent non-standard (filtered) RDB requirements */ #define SLAVE_REQ_RDB_MASK (SLAVE_REQ_RDB_EXCLUDE_DATA | SLAVE_REQ_RDB_EXCLUDE_FUNCTIONS) @@ -1162,6 +1188,23 @@ typedef struct replBacklog { * byte in the replication backlog buffer.*/ } replBacklog; +/* Used by replDataBuf during rdb channel replication to accumulate the + * replication stream on the replica side. */ +typedef struct replDataBufBlock { + size_t used; /* Used bytes in the buf */ + size_t size; /* Size of the buf */ + char buf[]; /* Replication data */ +} replDataBufBlock; + +/* Linked list of replDataBufBlock structs, holds the replication stream during + * rdb channel replication on the replica side. */ +typedef struct replDataBuf { + list *blocks; /* List of replDataBufBlock */ + size_t size; /* Total number of bytes available in all blocks. */ + size_t used; /* Total number of bytes actually used in all blocks. */ + size_t peak; /* Peak number of bytes stored in all blocks. */ +} replDataBuf; + typedef struct { list *clients; size_t mem_usage_sum; @@ -1258,6 +1301,7 @@ typedef struct client { char *slave_addr; /* Optionally given by REPLCONF ip-address */ int slave_capa; /* Slave capabilities: SLAVE_CAPA_* bitwise OR. */ int slave_req; /* Slave requirements: SLAVE_REQ_* */ + uint64_t main_ch_client_id; /* The client id of this replica's main channel */ multiState mstate; /* MULTI/EXEC state */ blockingState bstate; /* blocking state */ long long woff; /* Last write global replication offset.
*/ @@ -1936,6 +1980,8 @@ struct redisServer { int repl_ping_slave_period; /* Master pings the slave every N seconds */ replBacklog *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ + long long repl_full_sync_buffer_limit; /* Accumulated repl data limit during rdb channel replication */ + replDataBuf repl_full_sync_buffer; /* Accumulated replication data for rdb channel replication */ time_t repl_backlog_time_limit; /* Time without slaves after the backlog gets released. */ time_t repl_no_slaves_since; /* We have no slaves since that time. @@ -1949,6 +1995,9 @@ struct redisServer { int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ int repl_diskless_sync_max_replicas;/* Max replicas for diskless repl BGSAVE * delay (start sooner if they all connect). */ + int repl_rdb_channel; /* Config used to determine if the replica should + * use rdb channel replication for full syncs. */ + int repl_debug_pause; /* Debug config to force the main process to pause. */ size_t repl_buffer_mem; /* The memory of replication buffer. */ list *repl_buffer_blocks; /* Replication buffers blocks list * (serving replica clients and repl backlog) */ @@ -1962,10 +2011,13 @@ struct redisServer { client *cached_master; /* Cached master to be reused for PSYNC. */ int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ int repl_state; /* Replication status if the instance is a slave */ + int repl_rdb_ch_state; /* State of the replica's rdb channel during rdb channel replication */ + uint64_t repl_main_ch_client_id; /* Main channel client id received in +RDBCHANNELSYNC reply. */ off_t repl_transfer_size; /* Size of RDB to read from master during sync. */ off_t repl_transfer_read; /* Amount of RDB read from master during sync. */ off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ connection *repl_transfer_s; /* Slave -> Master SYNC connection */ + connection *repl_rdb_transfer_s; /* Slave -> Master FULL SYNC connection (RDB download) */ int repl_transfer_fd; /* Slave -> Master SYNC temp file descriptor */ char *repl_transfer_tmpfile; /* Slave-> master SYNC temp file name */ time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ @@ -2948,6 +3000,8 @@ void clearFailoverState(void); void updateFailoverStatus(void); void abortFailover(const char *err); const char *getFailoverStateString(void); +int replicationCheckHasMainChannel(client *slave); +unsigned long replicationLogicalReplicaCount(void); /* Generic persistence functions */ void startLoadingFile(size_t size, char* filename, int rdbflags); @@ -3972,6 +4026,7 @@ void killThreads(void); void makeThreadKillable(void); void swapMainDbWithTempDb(redisDb *tempDb); sds getVersion(void); +void debugPauseProcess(void); /* Use macro for checking log level to avoid evaluating arguments in cases log * should be ignored due to low level. 
*/ diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl index ed680061c..5e2f671ef 100644 --- a/tests/cluster/tests/12-replica-migration-2.tcl +++ b/tests/cluster/tests/12-replica-migration-2.tcl @@ -46,7 +46,7 @@ test "Resharding all the master #0 slots away from it" { } test "Master #0 who lost all slots should turn into a replica without replicas" { - wait_for_condition 1000 50 { + wait_for_condition 2000 50 { [RI 0 role] == "slave" && [RI 0 connected_slaves] == 0 } else { puts [R 0 info replication] diff --git a/tests/helpers/gen_write_load.tcl b/tests/helpers/gen_write_load.tcl index 568f5cde2..5c6563a97 100644 --- a/tests/helpers/gen_write_load.tcl +++ b/tests/helpers/gen_write_load.tcl @@ -1,18 +1,37 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + source tests/support/redis.tcl set ::tlsdir "tests/tls" -proc gen_write_load {host port seconds tls} { +# Continuously sends SET commands to the server. If key is omitted, a random key +# is used for every SET command. The value is always random. +proc gen_write_load {host port seconds tls {key ""}} { set start_time [clock seconds] set r [redis $host $port 1 $tls] $r client setname LOAD_HANDLER $r select 9 while 1 { - $r set [expr rand()] [expr rand()] + if {$key == ""} { + $r set [expr rand()] [expr rand()] + } else { + $r set $key [expr rand()] + } if {[clock seconds]-$start_time > $seconds} { exit 0 } } } -gen_write_load [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] +gen_write_load [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] [lindex $argv 4] diff --git a/tests/integration/replication-buffer.tcl b/tests/integration/replication-buffer.tcl index 64b26ca02..43a6a757b 100644 --- a/tests/integration/replication-buffer.tcl +++ b/tests/integration/replication-buffer.tcl @@ -1,6 +1,20 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + # This test group aims to test that all replicas share one global replication buffer, # two replicas don't make replication buffer size double, and when there is no replica, # replica buffer will shrink. 
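# (Editor's illustration) The gen_write_load helper extended above can now pin # all writes to a single key; a rough, hypothetical invocation (host, port, # duration, tls flag and key are placeholder values) looks like: # # exec tclsh tests/helpers/gen_write_load.tcl 127.0.0.1 6379 10 0 mykey &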
+foreach rdbchannel {"yes" "no"} { start_server {tags {"repl external:skip"}} { start_server {} { start_server {} { @@ -9,6 +23,10 @@ start_server {} { set replica2 [srv -2 client] set replica3 [srv -1 client] + $replica1 config set repl-rdb-channel $rdbchannel + $replica2 config set repl-rdb-channel $rdbchannel + $replica3 config set repl-rdb-channel $rdbchannel + set master [srv 0 client] set master_host [srv 0 host] set master_port [srv 0 port] @@ -18,6 +36,7 @@ start_server {} { $master config set repl-diskless-sync-delay 5 $master config set repl-diskless-sync-max-replicas 1 $master config set client-output-buffer-limit "replica 0 0 0" + $master config set repl-rdb-channel $rdbchannel # Make sure replica3 is synchronized with master $replica3 replicaof $master_host $master_port @@ -39,7 +58,7 @@ start_server {} { fail "fail to sync with replicas" } - test {All replicas share one global replication buffer} { + test "All replicas share one global replication buffer rdbchannel=$rdbchannel" { set before_used [s used_memory] populate 1024 "" 1024 ; # Write extra 1M data # New data uses 1M memory, but all replicas use only one # replication buffer, so the replicas' total output memory is not # more than double the replication buffer. set repl_buf_mem [s mem_total_replication_buffers] set extra_mem [expr {[s used_memory]-$before_used-1024*1024}] - assert {$extra_mem < 2*$repl_buf_mem} + if {$rdbchannel == "yes"} { + # master's replication buffers should not grow + assert {$extra_mem < 1024*1024} + assert {$repl_buf_mem < 1024*1024} + } else { + assert {$extra_mem < 2*$repl_buf_mem} + } # Kill replica1, replication_buffer will not become smaller catch {$replica1 shutdown nosave} @@ -59,7 +84,7 @@ start_server {} { assert_equal $repl_buf_mem [s mem_total_replication_buffers] } - test {Replication buffer will become smaller when no replica uses} { + test "Replication buffer will become smaller when no replica uses rdbchannel=$rdbchannel" { # Make sure replica3 catches up with the master wait_for_ofs_sync $master $replica3 @@ -71,12 +96,18 @@ } else { fail "replica2 doesn't disconnect with master" } - assert {[expr $repl_buf_mem - 1024*1024] > [s mem_total_replication_buffers]} + if {$rdbchannel == "yes"} { + # master's replication buffers should not grow + assert {1024*512 > [s mem_total_replication_buffers]} + } else { + assert {[expr $repl_buf_mem - 1024*1024] > [s mem_total_replication_buffers]} + } } } } } } +} # This test group aims to test that the replication backlog size can outgrow the backlog # limit config if there is a slow replica which keeps massive replication buffers that # can then be used for partial re-synchronization. Of course, replication backlog memory also can # become smaller when the master disconnects from slow replicas since the output buffer # limit is reached. +foreach rdbchannel {"yes" "no"} { start_server {tags {"repl external:skip"}} { start_server {} { start_server {} { @@ -98,6 +130,7 @@ start_server {} { $master config set save "" $master config set repl-backlog-size 16384 + $master config set repl-rdb-channel $rdbchannel $master config set client-output-buffer-limit "replica 0 0 0" # Executing 'debug digest' on master which has many keys costs much time, # so raise repl-timeout to keep the replicas from timing out and disconnecting # with master.
$master config set repl-timeout 1000 $replica1 config set repl-timeout 1000 + $replica1 config set repl-rdb-channel $rdbchannel + $replica1 config set client-output-buffer-limit "replica 1024 0 0" $replica2 config set repl-timeout 1000 + $replica2 config set client-output-buffer-limit "replica 1024 0 0" + $replica2 config set repl-rdb-channel $rdbchannel $replica1 replicaof $master_host $master_port wait_for_sync $replica1 - test {Replication backlog size can outgrow the backlog limit config} { + test "Replication backlog size can outgrow the backlog limit config rdbchannel=$rdbchannel" { # Generating RDB will take 1000 seconds $master config set rdb-key-save-delay 1000000 populate 1000 master 10000 @@ -124,7 +161,7 @@ start_server {} { } # The actual replication backlog grows beyond the backlog setting since # the slow replica2 keeps holding its replication buffer. - populate 10000 master 10000 + populate 20000 master 10000 assert {[s repl_backlog_histlen] > [expr 10000*10000]} } @@ -135,7 +172,7 @@ fail "Replica offset didn't catch up with the master after too long time" } - test {Replica could use replication buffer (beyond backlog config) for partial resynchronization} { + test "Replica could use replication buffer (beyond backlog config) for partial resynchronization rdbchannel=$rdbchannel" { # replica1 disconnects from master $replica1 replicaof [srv -1 host] [srv -1 port] # Write a mass of data that exceeds repl-backlog-size populate 10000 master 10000 # replica1 reconnects with master $replica1 replicaof $master_host $master_port wait_for_sync $replica1 # replica2 still waits for bgsave ending assert {[s rdb_bgsave_in_progress] eq {1}} # no fullsync assert {[s sync_full] eq {1}} assert_equal [$master debug digest] [$replica1 debug digest] } - test {Replication backlog memory will become smaller if disconnecting with replica} { + test "Replication backlog memory will become smaller if disconnecting with replica rdbchannel=$rdbchannel" { assert {[s repl_backlog_histlen] > [expr 2*10000*10000]} assert_equal [s connected_slaves] {2} @@ -165,8 +202,11 @@ r set key [string repeat A [expr 64*1024]] # master will close replica2's connection since replica2's output # buffer limit is reached, so only replica1 is left. + # In case of rdbchannel=yes, only the main channel will be disconnected. wait_for_condition 100 100 { - [s connected_slaves] eq {1} + [s connected_slaves] eq {1} || + ([s connected_slaves] eq {2} && + [string match {*slave*state=wait_bgsave*} [$master info]]) } else { fail "master didn't disconnect with replica2" } @@ -185,15 +225,19 @@ } } } +} -test {Partial resynchronization is successful even client-output-buffer-limit is less than repl-backlog-size} { +foreach rdbchannel {"yes" "no"} { +test "Partial resynchronization is successful even client-output-buffer-limit is less than repl-backlog-size rdbchannel=$rdbchannel" { start_server {tags {"repl external:skip"}} { start_server {} { r config set save "" r config set repl-backlog-size 100mb r config set client-output-buffer-limit "replica 512k 0 0" + r config set repl-rdb-channel $rdbchannel set replica [srv -1 client] + $replica config set repl-rdb-channel $rdbchannel $replica replicaof [srv 0 host] [srv 0 port] wait_for_sync $replica @@ -231,7 +275,7 @@ } # This test was added to make sure big keys added to the backlog do not trigger psync loop.
-test {Replica client-output-buffer size is limited to backlog_limit/16 when no replication data is pending} { +test "Replica client-output-buffer size is limited to backlog_limit/16 when no replication data is pending rdbchannel=$rdbchannel" { proc client_field {r type f} { set client [$r client list type $type] if {![regexp $f=(\[a-zA-Z0-9-\]+) $client - res]} { @@ -252,6 +296,8 @@ r $master config set repl-backlog-size 16384 $master config set client-output-buffer-limit "replica 32768 32768 60" + $master config set repl-rdb-channel $rdbchannel + $replica config set repl-rdb-channel $rdbchannel # Key has to be larger than replica client-output-buffer limit. set keysize [expr 256*1024] @@ -304,4 +350,5 @@ } } } +} diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl index dc1df0fa6..3ffa2db93 100644 --- a/tests/integration/replication-psync.tcl +++ b/tests/integration/replication-psync.tcl @@ -1,3 +1,16 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + # Creates a master-slave pair and breaks the link continuously to force # partial resync attempts, all this while flooding the master with # write queries. @@ -8,7 +21,7 @@ # If reconnect is > 0, the test actually tries to break the connection and # reconnect with the master, otherwise just the initial synchronization is # checked for consistency. -proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} { +proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect rdbchannel} { start_server {tags {"repl"} overrides {save {}}} { start_server {overrides {save {}}} { @@ -21,7 +34,9 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco $master config set repl-backlog-ttl $backlog_ttl $master config set repl-diskless-sync $mdl $master config set repl-diskless-sync-delay 1 + $master config set repl-rdb-channel $rdbchannel $slave config set repl-diskless-load $sdl + $slave config set repl-rdb-channel $rdbchannel set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000] set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000] @@ -46,7 +61,7 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco } } - test "Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect)" { + test "Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect, rdbchannel: $rdbchannel)" { # Now while the clients are writing data, break the master-slave # link multiple times.
if ($reconnect) { @@ -120,24 +135,31 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco tags {"external:skip"} { foreach mdl {no yes} { foreach sdl {disabled swapdb} { - test_psync {no reconnection, just sync} 6 1000000 3600 0 { - } $mdl $sdl 0 + foreach rdbchannel {yes no} { + if {$rdbchannel == "yes" && $mdl == "no"} { + # rdbchannel replication requires repl-diskless-sync enabled + continue + } - test_psync {ok psync} 6 100000000 3600 0 { - assert {[s -1 sync_partial_ok] > 0} - } $mdl $sdl 1 + test_psync {no reconnection, just sync} 6 1000000 3600 0 { + } $mdl $sdl 0 $rdbchannel - test_psync {no backlog} 6 100 3600 0.5 { - assert {[s -1 sync_partial_err] > 0} - } $mdl $sdl 1 + test_psync {ok psync} 6 100000000 3600 0 { + assert {[s -1 sync_partial_ok] > 0} + } $mdl $sdl 1 $rdbchannel - test_psync {ok after delay} 3 100000000 3600 3 { - assert {[s -1 sync_partial_ok] > 0} - } $mdl $sdl 1 + test_psync {no backlog} 6 100 3600 0.5 { + assert {[s -1 sync_partial_err] > 0} + } $mdl $sdl 1 $rdbchannel - test_psync {backlog expired} 3 100000000 1 3 { - assert {[s -1 sync_partial_err] > 0} - } $mdl $sdl 1 + test_psync {ok after delay} 3 100000000 3600 3 { + assert {[s -1 sync_partial_ok] > 0} + } $mdl $sdl 1 $rdbchannel + + test_psync {backlog expired} 3 100000000 1 3 { + assert {[s -1 sync_partial_err] > 0} + } $mdl $sdl 1 $rdbchannel + } } } } diff --git a/tests/integration/replication-rdbchannel.tcl b/tests/integration/replication-rdbchannel.tcl new file mode 100644 index 000000000..212bfbf05 --- /dev/null +++ b/tests/integration/replication-rdbchannel.tcl @@ -0,0 +1,795 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. 
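+# +# (Editor's note) The tests below assume rdbchannel full sync is enabled end +# to end. Per the checks in this file, a minimal master-side setup is roughly: +# +# repl-diskless-sync yes ;# rdbchannel full sync requires diskless sync +# repl-rdb-channel yes ;# opt into the dedicated rdb channel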
+# + +# Returns either main or rdbchannel client id +# Assumes there is one replica with two channels +proc get_replica_client_id {master rdbchannel} { + set input [$master client list type replica] + + foreach line [split $input "\n"] { + if {[regexp {id=(\d+).*flags=(\S+)} $line match id flags]} { + if {$rdbchannel == "yes"} { + # rdbchannel will have C flag + if {[string match *C* $flags]} { + return $id + } + } else { + return $id + } + } + } + + error "Replica not found" +} + +start_server {tags {"repl external:skip"}} { + set replica1 [srv 0 client] + + start_server {} { + set replica2 [srv 0 client] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-diskless-sync yes + $master config set repl-rdb-channel yes + populate 1000 master 10 + + test "Test replication with multiple replicas (rdbchannel enabled on both)" { + $replica1 config set repl-rdb-channel yes + $replica1 replicaof $master_host $master_port + + $replica2 config set repl-rdb-channel yes + $replica2 replicaof $master_host $master_port + + wait_replica_online $master 0 + wait_replica_online $master 1 + + $master set x 1 + + # Wait until replicas catch up with master + wait_for_ofs_sync $master $replica1 + wait_for_ofs_sync $master $replica2 + + # Verify db's are identical + assert_morethan [$master dbsize] 0 + assert_equal [$master get x] 1 + assert_equal [$master debug digest] [$replica1 debug digest] + assert_equal [$master debug digest] [$replica2 debug digest] + } + + test "Test replication with multiple replicas (rdbchannel enabled on one of them)" { + # Allow both replicas to ask for sync + $master config set repl-diskless-sync-delay 5 + + $replica1 replicaof no one + $replica2 replicaof no one + $replica1 config set repl-rdb-channel yes + $replica2 config set repl-rdb-channel no + + set prev_forks [s 0 total_forks] + $master set x 2 + + # There will be two forks subsequently, one for the rdbchannel + # replica and another for the replica without the rdbchannel config. + $replica1 replicaof $master_host $master_port + $replica2 replicaof $master_host $master_port + + set res [wait_for_log_messages 0 {"*Starting BGSAVE* replicas sockets (rdb-channel)*"} 0 2000 10] + set loglines [lindex $res 1] + wait_for_log_messages 0 {"*Starting BGSAVE* replicas sockets*"} $loglines 2000 10 + + wait_replica_online $master 0 100 100 + wait_replica_online $master 1 100 100 + + # Verify two new forks.
+ assert_equal [s 0 total_forks] [expr $prev_forks + 2] + + wait_for_ofs_sync $master $replica1 + wait_for_ofs_sync $master $replica2 + + # Verify db's are identical + assert_equal [$replica1 get x] 2 + assert_equal [$replica2 get x] 2 + assert_equal [$master debug digest] [$replica1 debug digest] + assert_equal [$master debug digest] [$replica2 debug digest] + } + + test "Test rdbchannel is not used if repl-diskless-sync config is disabled on master" { + $replica1 replicaof no one + $replica2 replicaof no one + + $master config set repl-diskless-sync-delay 0 + $master config set repl-diskless-sync no + + $master set x 3 + $replica1 replicaof $master_host $master_port + + # Verify log message does not mention rdbchannel + wait_for_log_messages 0 {"*Starting BGSAVE for SYNC with target: disk*"} 0 2000 1 + + wait_replica_online $master 0 + wait_for_ofs_sync $master $replica1 + + # Verify db's are identical + assert_equal [$replica1 get x] 3 + assert_equal [$master debug digest] [$replica1 debug digest] + } + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-rdb-channel yes + $replica config set repl-rdb-channel yes + + # Reuse this test to verify large key delivery + $master config set rdbcompression no + $master config set rdb-key-save-delay 3000 + populate 1000 prefix1 10 + populate 5 prefix2 3000000 + populate 5 prefix3 2000000 + populate 5 prefix4 1000000 + + # On master info output, we should see state transition in this order: + # 1. wait_bgsave: Replica receives psync error (+RDBCHANNELSYNC) + # 2. send_bulk_and_stream: Replica opens rdbchannel and delivery started + # 3. 
online: Sync is completed + test "Test replica state should start with wait_bgsave" { + $replica config set key-load-delay 100000 + # Pause replica before opening rdb channel conn + $replica debug repl-pause before-rdb-channel + $replica replicaof $master_host $master_port + + wait_for_condition 50 200 { + [s 0 connected_slaves] == 1 && + [string match "*wait_bgsave*" [s 0 slave0]] + } else { + fail "replica did not reach wait_bgsave state" + } + } + + test "Test replica state advances to send_bulk_and_stream when rdbchannel connects" { + $master set x 1 + resume_process $replica_pid + + wait_for_condition 50 200 { + [s 0 connected_slaves] == 1 && + [s 0 rdb_bgsave_in_progress] == 1 && + [string match "*send_bulk_and_stream*" [s 0 slave0]] + } else { + fail "replica did not reach send_bulk_and_stream state" + } + } + + test "Test replica rdbchannel client has SC flag on client list output" { + set input [$master client list type replica] + + # There will be two replicas; the second one should be the rdbchannel + set trimmed_input [string trimright $input] + set lines [split $trimmed_input "\n"] + if {[llength $lines] < 2} { + error "There is no second line in the input: $input" + } + set second_line [lindex $lines 1] + + # Check if 'flags=SC' exists in the second line + if {![regexp {flags=SC} $second_line]} { + error "Flags are not 'SC' in the second line: $second_line" + } + } + + test "Test replica state advances to online when fullsync is completed" { + # Speed up loading + $replica config set key-load-delay 0 + + wait_replica_online $master 0 100 1000 + wait_for_ofs_sync $master $replica + + wait_for_condition 50 200 { + [s 0 rdb_bgsave_in_progress] == 0 && + [s 0 connected_slaves] == 1 && + [string match "*online*" [s 0 slave0]] + } else { + fail "replica did not reach online state" + } + + wait_replica_online $master 0 100 1000 + wait_for_ofs_sync $master $replica + + # Verify db's are identical + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-rdb-channel yes + $replica config set repl-rdb-channel yes + + test "Test master memory does not increase during replication" { + # Put some delay to rdb generation.
If master doesn't forward + # incoming traffic to replica, master's replication buffer will grow + $master config set rdb-key-save-delay 200 + $master config set repl-backlog-size 5mb + populate 10000 master 10000 + + # Start write traffic + set load_handle [start_write_load $master_host $master_port 100 "key1"] + set prev_used [s 0 used_memory] + + $replica replicaof $master_host $master_port + set backlog_size [lindex [$master config get repl-backlog-size] 1] + + # Verify used_memory stays low + set max_retry 1000 + set prev_buf_size 0 + while {$max_retry} { + assert_lessthan [expr [s 0 used_memory] - $prev_used] 20000000 + assert_lessthan_equal [s 0 mem_total_replication_buffers] [expr {$backlog_size + 1000000}] + + # Check replica state + if {[string match *slave0*state=online* [$master info]] && + [s -1 master_link_status] == "up"} { + break + } else { + incr max_retry -1 + after 10 + } + } + if {$max_retry == 0} { + error "assertion:Replica not in sync after 10 seconds" + } + + stop_write_load $load_handle + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-rdb-channel yes + $replica config set repl-rdb-channel yes + + test "Test replication stream buffer becomes full on replica" { + # For replication stream accumulation, replica inherits slave output + # buffer limit as the size limit. In this test, we create traffic to + # fill the buffer fully. Once the limit is reached, accumulation + # will stop. This is not a failure scenario though. From that point, + # further accumulation may occur on master side. Replication should + # be completed successfully. + + # Create some artificial delay for rdb delivery and load. We'll + # generate some traffic to fill the replication buffer. + $master config set rdb-key-save-delay 1000 + $replica config set key-load-delay 1000 + $replica config set client-output-buffer-limit "replica 64kb 64kb 0" + populate 2000 master 1 + + set prev_sync_full [s 0 sync_full] + $replica replicaof $master_host $master_port + + # Wait for replica to establish psync using main channel + wait_for_condition 500 1000 { + [string match "*state=send_bulk_and_stream*" [s 0 slave0]] + } else { + fail "replica didn't start sync" + } + + # Create some traffic on replication stream + populate 100 master 100000 + + # Wait for replica's buffer limit reached + wait_for_log_messages -1 {"*Replication buffer limit has been reached*"} 0 1000 10 + + # Speed up loading + $replica config set key-load-delay 0 + + # Wait until sync is successful + wait_for_condition 200 200 { + [status $master master_repl_offset] eq [status $replica master_repl_offset] && + [status $master master_repl_offset] eq [status $replica slave_repl_offset] + } else { + fail "replica offsets didn't match in time" + } + + # Verify sync was not interrupted. + assert_equal [s 0 sync_full] [expr $prev_sync_full + 1] + + # Verify db's are identical + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + + test "Test replication stream buffer config replica-full-sync-buffer-limit" { + # By default, replica inherits client-output-buffer-limit of replica + # to limit accumulated repl data during rdbchannel sync. + # replica-full-sync-buffer-limit should override it if it is set. + $replica replicaof no one + + # Create some artificial delay for rdb delivery and load. 
We'll + # generate some traffic to fill the replication buffer. + $master config set rdb-key-save-delay 1000 + $replica config set key-load-delay 1000 + $replica config set client-output-buffer-limit "replica 1024 1024 0" + $replica config set replica-full-sync-buffer-limit 20mb + populate 2000 master 1 + + $replica replicaof $master_host $master_port + + # Wait until replication starts + wait_for_condition 500 1000 { + [string match "*state=send_bulk_and_stream*" [s 0 slave0]] + } else { + fail "replica didn't start sync" + } + + # Create some traffic on replication stream + populate 100 master 100000 + + # Make sure the config is used; we accumulated more than + # client-output-buffer-limit + assert_morethan [s -1 replica_full_sync_buffer_size] 1024 + } + } +} + +start_server {tags {"repl external:skip"}} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + set master_pid [srv 0 pid] + set loglines [count_log_lines 0] + + $master config set repl-diskless-sync yes + $master config set repl-rdb-channel yes + $master config set repl-backlog-size 1mb + $master config set client-output-buffer-limit "replica 100k 0 0" + $master config set loglevel debug + $master config set repl-diskless-sync-delay 3 + + start_server {} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + $replica config set repl-rdb-channel yes + $replica config set loglevel debug + $replica config set repl-timeout 10 + $replica config set key-load-delay 10000 + $replica config set loading-process-events-interval-bytes 1024 + + test "Test master disconnects replica when output buffer limit is reached" { + populate 20000 master 100 -1 + + $replica replicaof $master_host $master_port + wait_for_condition 50 200 { + [s 0 loading] == 1 + } else { + fail "replica did not start loading (loading status: [s 0 loading])" + } + + # Generate some traffic for backlog ~2mb + populate 20 master 1000000 -1 + + set res [wait_for_log_messages -1 {"*Client * closed * for overcoming of output buffer limits.*"} $loglines 1000 10] + set loglines [lindex $res 1] + $replica config set key-load-delay 0 + + # Wait until replica loads RDB + wait_for_log_messages 0 {"*Done loading RDB*"} 0 1000 10 + } + + test "Test replication recovers after output buffer failures" { + # Verify system is operational + $master set x 1 + + # Wait until replica catches up + wait_replica_online $master 0 1000 100 + wait_for_ofs_sync $master $replica + + # Verify db's are identical + assert_morethan [$master dbsize] 0 + assert_equal [$replica get x] 1 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} + +start_server {tags {"repl external:skip"}} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-diskless-sync yes + $master config set repl-rdb-channel yes + $master config set rdb-key-save-delay 300 + $master config set client-output-buffer-limit "replica 0 0 0" + $master config set repl-diskless-sync-delay 5 + $master config set loglevel debug + + populate 10000 master 1 + + start_server {} { + set replica1 [srv 0 client] + $replica1 config set repl-rdb-channel yes + $replica1 config set loglevel debug + + start_server {} { + set replica2 [srv 0 client] + $replica2 config set repl-rdb-channel yes + $replica2 config set loglevel debug + + set load_handle [start_write_load $master_host $master_port 100 "key"] + + test "Test master continues RDB delivery if not all replicas are dropped" { + $replica1 replicaof $master_host $master_port + $replica2 replicaof $master_host
$master_port + + wait_for_condition 50 200 { + [s -2 rdb_bgsave_in_progress] == 1 + } else { + fail "Sync did not start" + } + + # Wait for both replicas main conns to establish psync + wait_for_condition 500 100 { + [s -2 connected_slaves] == 2 + } else { + fail "Replicas didn't establish psync: + sync_partial_ok: [s -2 sync_partial_ok]" + } + + # Kill one of the replicas + catch {$replica1 shutdown nosave} + + # Wait until replica completes full sync + # Verify there is no other full sync attempt + wait_for_condition 50 1000 { + [s 0 master_link_status] == "up" && + [s -2 sync_full] == 2 && + [s -2 connected_slaves] == 1 + } else { + fail "Sync session did not continue + master_link_status: [s 0 master_link_status] + sync_full:[s -2 sync_full] + connected_slaves: [s -2 connected_slaves]" + } + } + + test "Test master aborts rdb delivery if all replicas are dropped" { + $replica2 replicaof no one + + # Start replication + $replica2 replicaof $master_host $master_port + + wait_for_condition 50 1000 { + [s -2 rdb_bgsave_in_progress] == 1 + } else { + fail "Sync did not start" + } + set loglines [count_log_lines -2] + + # Kill the replica + catch {$replica2 shutdown nosave} + + # Verify master aborts rdb save + wait_for_condition 50 1000 { + [s -2 rdb_bgsave_in_progress] == 0 && + [s -2 connected_slaves] == 0 + } else { + fail "Master should abort the sync + rdb_bgsave_in_progress:[s -2 rdb_bgsave_in_progress] + connected_slaves: [s -2 connected_slaves]" + } + wait_for_log_messages -2 {"*Background transfer error*"} $loglines 1000 50 + } + + stop_write_load $load_handle + } + } +} + +start_server {tags {"repl external:skip"}} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + $master config set repl-diskless-sync yes + $master config set repl-rdb-channel yes + $master config set loglevel debug + $master config set rdb-key-save-delay 1000 + + populate 3000 prefix1 1 + populate 100 prefix2 100000 + + start_server {} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + $replica config set repl-rdb-channel yes + $replica config set loglevel debug + $replica config set repl-timeout 10 + + set load_handle [start_write_load $master_host $master_port 100 "key"] + + test "Test replica recovers when rdb channel connection is killed" { + $replica replicaof $master_host $master_port + + # Wait for sync session to start + wait_for_condition 500 200 { + [string match "*state=send_bulk_and_stream*" [s -1 slave0]] && + [s -1 rdb_bgsave_in_progress] eq 1 + } else { + fail "replica didn't start sync session in time" + } + + set loglines [count_log_lines -1] + + # Kill rdb channel client + set id [get_replica_client_id $master yes] + $master client kill id $id + + wait_for_log_messages -1 {"*Background transfer error*"} $loglines 1000 10 + + # Verify master rejects main-ch-client-id after connection is killed + assert_error {*Unrecognized*} {$master replconf main-ch-client-id $id} + + # Replica should retry + wait_for_condition 500 200 { + [string match "*state=send_bulk_and_stream*" [s -1 slave0]] && + [s -1 rdb_bgsave_in_progress] eq 1 + } else { + fail "replica didn't retry after connection close" + } + } + + test "Test replica recovers when main channel connection is killed" { + set loglines [count_log_lines -1] + + # Kill main channel client + set id [get_replica_client_id $master no] + $master client kill id $id + + wait_for_log_messages -1 {"*Background transfer error*"} $loglines 1000 20 + + # Replica should retry + wait_for_condition 500 2000 {
[string match "*state=send_bulk_and_stream*" [s -1 slave0]] && + [s -1 rdb_bgsave_in_progress] eq 1 + } else { + fail "replica didn't retry after connection close" + } + } + + stop_write_load $load_handle + + test "Test replica recovers from connection failures" { + # Wait until replica catches up + wait_replica_online $master 0 1000 100 + wait_for_ofs_sync $master $replica + + # Verify db's are identical + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + test "Test master connection drops while streaming repl buffer into the db" { + # Just after replica loads RDB, it will stream repl buffer into the + # db. During streaming, we kill the master connection. Replica + # will abort streaming and then try another psync with master. + $master config set rdb-key-save-delay 1000 + $master config set repl-rdb-channel yes + $master config set repl-diskless-sync yes + $replica config set repl-rdb-channel yes + $replica config set loading-process-events-interval-bytes 1024 + + # Populate db and start write traffic + populate 2000 master 1000 + set load_handle [start_write_load $master_host $master_port 100 "key1"] + + # Replica will pause in the loop of repl buffer streaming + $replica debug repl-pause on-streaming-repl-buf + $replica replicaof $master_host $master_port + + # Check that repl stream accumulation has started. + wait_for_condition 50 1000 { + [s -1 replica_full_sync_buffer_size] > 0 + } else { + fail "repl stream accumulation not started" + } + + # Wait until replica starts streaming repl buffer + wait_for_log_messages -1 {"*Starting to stream replication buffer*"} 0 2000 10 + stop_write_load $load_handle + $master config set rdb-key-save-delay 0 + + # Kill master connection and resume the process + $replica deferred 1 + $replica client kill type master + $replica debug repl-pause clear + resume_process $replica_pid + $replica read + $replica read + $replica deferred 0 + + wait_for_log_messages -1 {"*Master client was freed while streaming*"} 0 500 10 + + # Quick check for stats test coverage + assert_morethan_equal [s -1 replica_full_sync_buffer_peak] [s -1 replica_full_sync_buffer_size] + + # Wait until replica recovers and verify db's are identical + wait_replica_online $master 0 1000 10 + wait_for_ofs_sync $master $replica + + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + test "Test main channel connection drops while loading rdb (disk based)" { + # While loading the rdb, we kill the main channel connection. + # We expect replica to complete loading RDB and then try psync + # with the master.
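+ # (Because this is a disk-based load, the whole RDB has already
+ # been received and saved to disk before loading begins, so losing
+ # the connection does not interrupt the load itself.)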
+ $master config set repl-rdb-channel yes + $replica config set repl-rdb-channel yes + $replica config set repl-diskless-load disabled + $replica config set key-load-delay 10000 + $replica config set loading-process-events-interval-bytes 1024 + + # Populate db and start write traffic + populate 10000 master 100 + $replica replicaof $master_host $master_port + + # Wait until replica starts loading + wait_for_condition 50 200 { + [s -1 loading] == 1 + } else { + fail "replica did not start loading" + } + + # Kill replica connections + $master client kill type replica + $master set x 1 + + # At this point, we expect replica to complete loading RDB. Then, + # it will try psync with master. + wait_for_log_messages -1 {"*Aborting rdb channel sync while loading the RDB*"} 0 2000 10 + wait_for_log_messages -1 {"*After loading RDB, replica will try psync with master*"} 0 2000 10 + + # Speed up loading + $replica config set key-load-delay 0 + + # Wait until replica becomes online + wait_replica_online $master 0 100 100 + + # Verify there is another successful psync and no other full sync + wait_for_condition 50 200 { + [s 0 sync_full] == 1 && + [s 0 sync_partial_ok] == 1 + } else { + fail "psync was not successful [s 0 sync_full] [s 0 sync_partial_ok]" + } + + # Verify db's are identical after recovery + wait_for_ofs_sync $master $replica + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} + +start_server {tags {"repl external:skip"}} { + set replica [srv 0 client] + set replica_pid [srv 0 pid] + + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + test "Test main channel connection drops while loading rdb (diskless)" { + # While loading rdb, kill both main and rdbchannel connections. + # We expect replica to abort sync and later retry again. + $master config set repl-rdb-channel yes + $replica config set repl-rdb-channel yes + $replica config set repl-diskless-load swapdb + $replica config set key-load-delay 10000 + $replica config set loading-process-events-interval-bytes 1024 + + # Populate db and start write traffic + populate 10000 master 100 + + $replica replicaof $master_host $master_port + + # Wait until replica starts loading + wait_for_condition 50 200 { + [s -1 loading] == 1 + } else { + fail "replica did not start loading" + } + + # Kill replica connections + $master client kill type replica + $master set x 1 + + # At this point, we expect replica to abort loading RDB. 
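+ # (With a diskless swapdb load the RDB is streamed directly from
+ # the killed connection, so the load cannot complete; the replica
+ # drops the partial data and retries with a new full sync.)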
+ wait_for_log_messages -1 {"*Aborting rdb channel sync while loading the RDB*"} 0 2000 10 + wait_for_log_messages -1 {"*Failed trying to load the MASTER synchronization DB from socket*"} 0 2000 10 + + # Speed up loading + $replica config set key-load-delay 0 + + # Wait until replica recovers and becomes online + wait_replica_online $master 0 100 100 + + # Verify replica attempts another full sync + wait_for_condition 50 200 { + [s 0 sync_full] == 2 && + [s 0 sync_partial_ok] == 0 + } else { + fail "sync was not successful [s 0 sync_full] [s 0 sync_partial_ok]" + } + + # Verify db's are identical after recovery + wait_for_ofs_sync $master $replica + assert_morethan [$master dbsize] 0 + assert_equal [$master debug digest] [$replica debug digest] + } + } +} diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 6f4c3d43d..0914c0e55 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -1,3 +1,16 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + proc log_file_matches {log pattern} { set fp [open $log r] set content [read $fp] @@ -303,7 +316,7 @@ start_server {tags {"repl external:skip"}} { } } -foreach mdl {no yes} { +foreach mdl {no yes} rdbchannel {no yes} { foreach sdl {disabled swapdb} { start_server {tags {"repl external:skip"} overrides {save {}}} { set master [srv 0 client] @@ -319,7 +332,13 @@ foreach mdl {no yes} { lappend slaves [srv 0 client] start_server {overrides {save {}}} { lappend slaves [srv 0 client] - test "Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl" { + test "Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl, rdbchannel=$rdbchannel" { + + $master config set repl-rdb-channel $rdbchannel + [lindex $slaves 0] config set repl-rdb-channel $rdbchannel + [lindex $slaves 1] config set repl-rdb-channel $rdbchannel + [lindex $slaves 2] config set repl-rdb-channel $rdbchannel + # start load handles only inside the test, so that the test can be skipped set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000000] set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000000] @@ -438,7 +457,7 @@ start_server {tags {"repl external:skip"} overrides {save {}}} { } # Diskless load swapdb when NOT async_loading (different master replid) -foreach testType {Successful Aborted} { +foreach testType {Successful Aborted} rdbchannel {yes no} { start_server {tags {"repl external:skip"}} { set replica [srv 0 client] set replica_host [srv 0 host] @@ -453,6 +472,7 @@ foreach testType {Successful Aborted} { $master config set repl-diskless-sync yes $master config set repl-diskless-sync-delay 0 $master config set save "" + $master config set repl-rdb-channel $rdbchannel $replica config set repl-diskless-load swapdb $replica config set save "" @@ -474,7 +494,7 @@ foreach testType {Successful Aborted} { # Start the replication process $replica replicaof $master_host $master_port - test {Diskless load swapdb (different replid): replica enter loading} { + test "Diskless load swapdb (different replid): replica enter loading
rdbchannel=$rdbchannel" { # Wait for the replica to start reading the rdb wait_for_condition 100 100 { [s -1 loading] eq 1 @@ -498,7 +518,7 @@ foreach testType {Successful Aborted} { fail "Replica didn't disconnect" } - test {Diskless load swapdb (different replid): old database is exposed after replication fails} { + test "Diskless load swapdb (different replid): old database is exposed after replication fails rdbchannel=$rdbchannel" { # Ensure we see old values from replica assert_equal [$replica get mykey] "myvalue" @@ -590,8 +610,8 @@ foreach testType {Successful Aborted} { if {$testType == "Aborted"} { # Set master with a slow rdb generation, so that we can easily intercept loading - # 10ms per key, with 2000 keys is 20 seconds - $master config set rdb-key-save-delay 10000 + # 20ms per key, with 2000 keys is 40 seconds + $master config set rdb-key-save-delay 20000 } # Force the replica to try another full sync (this time it will have matching master replid) @@ -862,6 +882,7 @@ start_server {tags {"repl external:skip"} overrides {save ""}} { # we also need the replica to process requests during transfer (which it does only once in 2mb) $master debug populate 20000 test 10000 $master config set rdbcompression no + $master config set repl-rdb-channel no # If running on Linux, we also measure utime/stime to detect possible I/O handling issues set os [catch {exec uname}] set measure_time [expr {$os == "Linux"} ? 1 : 0] @@ -1009,6 +1030,7 @@ test "diskless replication child being killed is collected" { set master_pid [srv 0 pid] $master config set repl-diskless-sync yes $master config set repl-diskless-sync-delay 0 + $master config set repl-rdb-channel no # put enough data in the db that the rdb file will be bigger than the socket buffers $master debug populate 20000 test 10000 $master config set rdbcompression no @@ -1269,7 +1291,8 @@ start_server {tags {"repl external:skip"}} { r slaveof $master2_host $master2_port wait_for_condition 50 100 { ([s -2 rdb_bgsave_in_progress] == 1) && - ([string match "*wait_bgsave*" [s -2 slave0]]) + ([string match "*wait_bgsave*" [s -2 slave0]] || + [string match "*send_bulk_and_stream*" [s -2 slave0]]) } else { fail "full sync didn't start" } diff --git a/tests/support/util.tcl b/tests/support/util.tcl index c240a286c..a93f01ac4 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -1,3 +1,16 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + proc randstring {min max {type binary}} { set len [expr {$min+int(rand()*($max-$min+1))}] set output {} @@ -118,11 +131,11 @@ proc wait_for_sync r { } } -proc wait_replica_online r { - wait_for_condition 50 100 { - [string match "*slave0:*,state=online*" [$r info replication]] +proc wait_replica_online {r {replica_id 0} {maxtries 50} {delay 100}} { + wait_for_condition $maxtries $delay { + [string match "*slave$replica_id:*,state=online*" [$r info replication]] } else { - fail "replica didn't online in time" + fail "replica $replica_id did not become online in time" } } @@ -565,10 +578,11 @@ proc find_valgrind_errors {stderr on_termination} { } # Execute a background process writing random data for the specified number -# of seconds to the specified Redis instance. 
-proc start_write_load {host port seconds} { +# of seconds to the specified Redis instance. If key is omitted, a random key +# is used for every SET command. +proc start_write_load {host port seconds {key ""}} { set tclsh [info nameofexecutable] - exec $tclsh tests/helpers/gen_write_load.tcl $host $port $seconds $::tls & + exec $tclsh tests/helpers/gen_write_load.tcl $host $port $seconds $::tls $key & } # Stop a process generating write load executed with start_write_load. @@ -677,6 +691,12 @@ proc pause_process pid { } proc resume_process pid { + wait_for_condition 50 1000 { + [string match "T*" [exec ps -o state= -p $pid]] + } else { + puts [exec ps j $pid] + fail "process was not stopped" + } exec kill -SIGCONT $pid } diff --git a/tests/unit/auth.tcl b/tests/unit/auth.tcl index 9532e0bd2..023101fdf 100644 --- a/tests/unit/auth.tcl +++ b/tests/unit/auth.tcl @@ -1,3 +1,16 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + start_server {tags {"auth external:skip"}} { test {AUTH fails if there is no password configured server side} { catch {r auth foo} err @@ -65,24 +78,29 @@ start_server {tags {"auth_binary_password external:skip"}} { set master_port [srv -1 port] set slave [srv 0 client] - test {MASTERAUTH test with binary password} { - $master config set requirepass "abc\x00def" + foreach rdbchannel {yes no} { + test "MASTERAUTH test with binary password rdbchannel=$rdbchannel" { + $slave slaveof no one + $master config set requirepass "abc\x00def" + $master config set repl-rdb-channel $rdbchannel - # Configure the replica with masterauth - set loglines [count_log_lines 0] - $slave config set masterauth "abc" - $slave slaveof $master_host $master_port + # Configure the replica with masterauth + set loglines [count_log_lines 0] + $slave config set masterauth "abc" + $slave config set repl-rdb-channel $rdbchannel + $slave slaveof $master_host $master_port - # Verify replica is not able to sync with master - wait_for_log_messages 0 {"*Unable to AUTH to MASTER*"} $loglines 1000 10 - assert_equal {down} [s 0 master_link_status] - - # Test replica with the correct masterauth - $slave config set masterauth "abc\x00def" - wait_for_condition 50 100 { - [s 0 master_link_status] eq {up} - } else { - fail "Can't turn the instance into a replica" + # Verify replica is not able to sync with master + wait_for_log_messages 0 {"*Unable to AUTH to MASTER*"} $loglines 1000 10 + assert_equal {down} [s 0 master_link_status] + + # Test replica with the correct masterauth + $slave config set masterauth "abc\x00def" + wait_for_condition 50 100 { + [s 0 master_link_status] eq {up} + } else { + fail "Can't turn the instance into a replica" + } } } } From 4a95b3005a140165bbb9df373ba61f775c936554 Mon Sep 17 00:00:00 2001 From: YaacovHazan Date: Sun, 15 Dec 2024 11:27:48 +0200 Subject: [PATCH 39/42] Fix Read/Write key pattern selector (CVE-2024-51741) The '%' rule must contain one or both of R/W --- src/acl.c | 11 ++++++++--- tests/unit/acl-v2.tcl | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/acl.c b/src/acl.c index 5af6edbd9..c4d6bf5e6 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1061,19 +1061,24 @@ int 
ACLSetSelector(aclSelector *selector, const char* op, size_t oplen) { int flags = 0; size_t offset = 1; if (op[0] == '%') { + int perm_ok = 1; for (; offset < oplen; offset++) { if (toupper(op[offset]) == 'R' && !(flags & ACL_READ_PERMISSION)) { flags |= ACL_READ_PERMISSION; } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) { flags |= ACL_WRITE_PERMISSION; - } else if (op[offset] == '~' && flags) { + } else if (op[offset] == '~') { offset++; break; } else { - errno = EINVAL; - return C_ERR; + perm_ok = 0; + break; } } + if (!flags || !perm_ok) { + errno = EINVAL; + return C_ERR; + } } else { flags = ACL_ALL_PERMISSION; } diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index b259c2716..b233118dd 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,6 +116,32 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } + test {Validate read and write permissions format - empty permission} { + catch {r ACL SETUSER key-permission-RW %~} err + set err + } {ERR Error in ACL SETUSER modifier '%~': Syntax error} + + test {Validate read and write permissions format - empty selector} { + catch {r ACL SETUSER key-permission-RW %} err + set err + } {ERR Error in ACL SETUSER modifier '%': Syntax error} + + test {Validate read and write permissions format - empty pattern} { + # Empty pattern results in R/W access to no key + r ACL SETUSER key-permission-RW on nopass %RW~ +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + + test {Validate read and write permissions format - no pattern} { + # No pattern results in R/W access to no key (currently we accept this syntax error) + r ACL SETUSER key-permission-RW on nopass %RW +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" $r2 auth key-permission-RW-selector password From 342ee426ad0d0731b2272553bd4db2cd78e24772 Mon Sep 17 00:00:00 2001 From: YaacovHazan Date: Sun, 15 Dec 2024 21:41:45 +0200 Subject: [PATCH 40/42] Fix LUA garbage collector (CVE-2024-46981) Reset GC state before closing the Lua VM to prevent user data from being wrongly freed while it might still be used by destructor callbacks.
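For context, the Lua C API detail behind this fix: `lua_close()` runs any pending `__gc` finalizers, and those finalizers may call back into host-side data structures, so they must not run after the host has started tearing its state down. Forcing a full collection cycle first makes the finalizers execute while everything they reference is still valid. Below is a minimal standalone sketch of that ordering (illustrative only, not Redis code):

```
#include <lua.h>
#include <lualib.h>
#include <lauxlib.h>

int main(void) {
    lua_State *lua = luaL_newstate();
    luaL_openlibs(lua);

    /* ... scripts may have created userdata whose __gc metamethods
     * reference host-side state ... */

    /* Run a full GC cycle first: finalizers execute while the host
     * state they touch is still alive. */
    lua_gc(lua, LUA_GCCOLLECT, 0);

    /* Only then destroy the VM. */
    lua_close(lua);
    return 0;
}
```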
--- src/eval.c | 1 + src/function_lua.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/eval.c b/src/eval.c index 47fb43464..fed606d2d 100644 --- a/src/eval.c +++ b/src/eval.c @@ -266,6 +266,7 @@ void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State unsigned int lua_tcache = (unsigned int)(uintptr_t)ud; #endif + lua_gc(lua, LUA_GCCOLLECT, 0); lua_close(lua); #if defined(USE_JEMALLOC) diff --git a/src/function_lua.c b/src/function_lua.c index 37069ec21..ff471f2ac 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -198,6 +198,7 @@ static void luaEngineFreeCtx(void *engine_ctx) { unsigned int lua_tcache = (unsigned int)(uintptr_t)ud; #endif + lua_gc(lua_engine_ctx->lua, LUA_GCCOLLECT, 0); lua_close(lua_engine_ctx->lua); zfree(lua_engine_ctx); From 5b8b58e472fc567337429f63e93927f86db7f838 Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Tue, 14 Jan 2025 15:51:05 +0800 Subject: [PATCH 41/42] Fix incorrect parameter type reports (#13744) After upgrading to Ubuntu 24.04, clang 18 can detect the runtime error "call to function XXX through pointer to incorrect function type", and our daily CI reports these errors via UndefinedBehaviorSanitizer (UBSan): https://github.com/redis/redis/actions/runs/12738281720/job/35500380251#step:6:346 We now add generic versions of some existing `free` functions so they can be called through a (void*) pointer; they are simply wrapper functions that cast the data type and call the corresponding typed function. --- src/acl.c | 13 +++++++++---- src/adlist.c | 5 +++++ src/adlist.h | 1 + src/call_reply.c | 2 +- src/db.c | 2 +- src/eval.c | 2 +- src/functions.c | 6 +++++- src/listpack.c | 7 ++++++- src/listpack.h | 1 + src/module.c | 2 +- src/networking.c | 2 +- src/replication.c | 2 +- src/sds.c | 5 +++++ src/sds.h | 1 + src/server.c | 2 +- src/t_stream.c | 24 ++++++++++++++++++++---- src/ziplist.c | 2 +- 17 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/acl.c b/src/acl.c index c4d6bf5e6..7f7e19c76 100644 --- a/src/acl.c +++ b/src/acl.c @@ -277,7 +277,7 @@ int ACLListMatchSds(void *a, void *b) { /* Method to free list elements from ACL users password/patterns lists. */ void ACLListFreeSds(void *item) { - sdsfree(item); + sdsfreegeneric(item); } /* Method to duplicate list elements from ACL users password/patterns lists. */ @@ -469,6 +469,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Generic version of ACLFreeUser. */ +void ACLFreeUserGeneric(void *u) { + ACLFreeUser((user *)u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. */ @@ -2459,12 +2464,12 @@ sds ACLLoadFromFile(const char *filename) { } if (user_channels) - raxFreeWithCallback(user_channels, (void(*)(void*))listRelease); - raxFreeWithCallback(old_users,(void(*)(void*))ACLFreeUser); + raxFreeWithCallback(user_channels, listReleaseGeneric); + raxFreeWithCallback(old_users, ACLFreeUserGeneric); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users,(void(*)(void*))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserGeneric); Users = old_users; errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); return errors; diff --git a/src/adlist.c b/src/adlist.c index 0e8f6d5c1..48f005a5b 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -61,6 +61,11 @@ void listRelease(list *list) zfree(list); } +/* Generic version of listRelease.
*/ +void listReleaseGeneric(void *list) { + listRelease((struct list*)list); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index b91fe5070..a6f235bd2 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -51,6 +51,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseGeneric(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/call_reply.c b/src/call_reply.c index b246361af..af9ebae58 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -533,7 +533,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void*))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreegeneric); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/db.c b/src/db.c index 42248a20f..d8eca59ff 100644 --- a/src/db.c +++ b/src/db.c @@ -1256,7 +1256,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * The exception to the above is ZSET, where we do allocate temporary * strings even when scanning a dict. */ if (o && (!ht || o->type == OBJ_ZSET)) { - listSetFreeMethod(keys, (void (*)(void*))sdsfree); + listSetFreeMethod(keys, sdsfreegeneric); } /* For main dictionary scan or data structure using hashtable. */ diff --git a/src/eval.c b/src/eval.c index fed606d2d..783bc1ebf 100644 --- a/src/eval.c +++ b/src/eval.c @@ -759,7 +759,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs,(void (*)(void*))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreegeneric); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/functions.c b/src/functions.c index dde42daf6..961542a79 100644 --- a/src/functions.c +++ b/src/functions.c @@ -144,6 +144,10 @@ static void engineLibraryFree(functionLibInfo* li) { zfree(li); } +static void engineLibraryFreeGeneric(void *li) { + engineLibraryFree((functionLibInfo *)li); +} + static void engineLibraryDispose(dict *d, void *obj) { UNUSED(d); engineLibraryFree(obj); @@ -338,7 +342,7 @@ static int libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx * } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void*))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryFreeGeneric); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); diff --git a/src/listpack.c b/src/listpack.c index 8b733d8b8..b96ebc763 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -231,6 +231,11 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Generic version of lpFree. */ +void lpFreeGeneric(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. 
*/ unsigned char* lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); @@ -3120,7 +3125,7 @@ int listpackTest(int argc, char *argv[], int flags) { for (i = 0; i < iteration; i++) { lp = lpNew(0); ref = listCreate(); - listSetFreeMethod(ref,(void (*)(void*))sdsfree); + listSetFreeMethod(ref, sdsfreegeneric); len = rand() % 256; /* Create lists */ diff --git a/src/listpack.h b/src/listpack.h index bfddbd73b..1f80d5fc2 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -35,6 +35,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeGeneric(void *lp); unsigned char* lpShrinkToFit(unsigned char *lp); unsigned char *lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index f662ebb30..644d24ad4 100644 --- a/src/module.c +++ b/src/module.c @@ -10557,7 +10557,7 @@ RedisModuleServerInfoData *RM_GetServerInfo(RedisModuleCtx *ctx, const char *sec * context instead of passing NULL. */ void RM_FreeServerInfo(RedisModuleCtx *ctx, RedisModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx,REDISMODULE_AM_INFO,data); - raxFreeWithCallback(data->rax, (void(*)(void*))sdsfree); + raxFreeWithCallback(data->rax, sdsfreegeneric); zfree(data); } diff --git a/src/networking.c b/src/networking.c index 31799a406..80a7b1e36 100644 --- a/src/networking.c +++ b/src/networking.c @@ -502,7 +502,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flags & CLIENT_MODULE) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void*))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreegeneric); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; diff --git a/src/replication.c b/src/replication.c index 635497b67..4e030bada 100644 --- a/src/replication.c +++ b/src/replication.c @@ -239,7 +239,7 @@ void rebaseReplicationBuffer(long long base_repl_offset) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void*))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { diff --git a/src/sds.c b/src/sds.c index 6d9e63af5..97c2d7f79 100644 --- a/src/sds.c +++ b/src/sds.c @@ -174,6 +174,11 @@ void sdsfree(sds s) { s_free((char*)s-sdsHdrSize(s[-1])); } +/* Generic version of sdsfree. */ +void sdsfreegeneric(void *s) { + sdsfree((sds)s); +} + /* Set the sds string length to the length as obtained with strlen(), so * considering as content only up to the first null term character. 
* diff --git a/src/sds.h b/src/sds.h index bf31c7610..b20393bc0 100644 --- a/src/sds.h +++ b/src/sds.h @@ -198,6 +198,7 @@ sds sdsnew(const char *init); sds sdsempty(void); sds sdsdup(const sds s); void sdsfree(sds s); +void sdsfreegeneric(void *s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); diff --git a/src/server.c b/src/server.c index fb1fc1f4b..266da7a69 100644 --- a/src/server.c +++ b/src/server.c @@ -2757,7 +2757,7 @@ void initServer(void) { server.db[j].id = j; server.db[j].avg_ttl = 0; server.db[j].defrag_later = listCreate(); - listSetFreeMethod(server.db[j].defrag_later,(void (*)(void*))sdsfree); + listSetFreeMethod(server.db[j].defrag_later, sdsfreegeneric); } evictionPoolAlloc(); /* Initialize the LRU keys pool. */ /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which diff --git a/src/t_stream.c b/src/t_stream.c index 8578551a8..e78cef6ab 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -33,6 +33,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1<<30) void streamFreeCG(streamCG *cg); +void streamFreeCGGeneric(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start, streamID *end, size_t count, streamConsumer *consumer); int streamParseStrictIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq, int *seq_given); @@ -60,9 +61,9 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { - raxFreeWithCallback(s->rax,(void(*)(void*))lpFree); + raxFreeWithCallback(s->rax, lpFreeGeneric); if (s->cgroups) - raxFreeWithCallback(s->cgroups,(void(*)(void*))streamFreeCG); + raxFreeWithCallback(s->cgroups, streamFreeCGGeneric); zfree(s); } @@ -2478,6 +2479,11 @@ void streamFreeNACK(streamNACK *na) { zfree(na); } +/* Generic version of streamFreeNACK. */ +void streamFreeNACKGeneric(void *na) { + streamFreeNACK((streamNACK *)na); +} + /* Free a consumer and associated data structures. Note that this function * will not reassign the pending messages associated with this consumer * nor will delete them from the stream, so when this function is called @@ -2490,6 +2496,11 @@ void streamFreeConsumer(streamConsumer *sc) { zfree(sc); } +/* Generic version of streamFreeConsumer. */ +void streamFreeConsumerGeneric(void *sc) { + streamFreeConsumer((streamConsumer *)sc); +} + /* Create a new consumer group in the context of the stream 's', having the * specified name, last server ID and reads counter. If a consumer group with * the same name already exists NULL is returned, otherwise the pointer to the @@ -2510,11 +2521,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo /* Free a consumer group and all its associated data. */ void streamFreeCG(streamCG *cg) { - raxFreeWithCallback(cg->pel,(void(*)(void*))streamFreeNACK); - raxFreeWithCallback(cg->consumers,(void(*)(void*))streamFreeConsumer); + raxFreeWithCallback(cg->pel, streamFreeNACKGeneric); + raxFreeWithCallback(cg->consumers, streamFreeConsumerGeneric); zfree(cg); } +/* Generic version of streamFreeCG. */ +void streamFreeCGGeneric(void *cg) { + streamFreeCG((streamCG *)cg); +} + /* Lookup the consumer group in the specified stream and returns its * pointer, otherwise if there is no such group, NULL is returned. 
*/ streamCG *streamLookupCG(stream *s, sds groupname) { diff --git a/src/ziplist.c b/src/ziplist.c index a470c956d..712d259a9 100644 --- a/src/ziplist.c +++ b/src/ziplist.c @@ -2367,7 +2367,7 @@ int ziplistTest(int argc, char **argv, int flags) { for (i = 0; i < iteration; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref,(void (*)(void*))sdsfree); + listSetFreeMethod(ref, sdsfreegeneric); len = rand() % 256; /* Create lists */ From 0f65806b5b0f21b96e9c688ce7d2d00062203a51 Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Tue, 14 Jan 2025 17:30:18 +0800 Subject: [PATCH 42/42] Update info.tcl test to revert client output limits sooner (#13738) This PR is based on: https://github.com/valkey-io/valkey/pull/1462 We set the client output buffer limits to 10 bytes, and then execute info stats, which produces more than 10 bytes of output and can therefore cause that command to throw an error. I'm not sure why it wasn't consistently erroring before; it might have been some change related to the Ubuntu upgrade. failed CI: https://github.com/redis/redis/actions/runs/12738281720/job/35500381299 ------ Co-authored-by: Madelyn Olson <madelyneolson@gmail.com> --- tests/unit/info.tcl | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index fc66fb510..c18c76c88 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -1,3 +1,16 @@ +# +# Copyright (c) 2009-Present, Redis Ltd. +# All rights reserved. +# +# Copyright (c) 2024-present, Valkey contributors. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). +# +# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. +# + proc cmdstat {cmd} { return [cmdrstat $cmd r] } @@ -386,10 +399,10 @@ start_server {tags {"info" "external:skip"}} { r config set client-output-buffer-limit "normal 10 0 0" r set key [string repeat a 100000] ;# to trigger output buffer limit check this needs to be big catch {r get key} + r config set client-output-buffer-limit $org_outbuf_limit set info [r info stats] assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1} - r config set client-output-buffer-limit $org_outbuf_limit - } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres + } {} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres test {clients: pubsub clients} { set info [r info clients]