diff --git a/redis.conf b/redis.conf
index 6688fdc2a..a1cbedd34 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1291,38 +1291,27 @@ lazyfree-lazy-user-flush no
 # in different I/O threads. Since especially writing is so slow, normally
 # Redis users use pipelining in order to speed up the Redis performances per
 # core, and spawn multiple instances in order to scale more. Using I/O
-# threads it is possible to easily speedup two times Redis without resorting
+# threads it is possible to easily speed up Redis several times without resorting
 # to pipelining nor sharding of the instance.
 #
 # By default threading is disabled, we suggest enabling it only in machines
 # that have at least 4 or more cores, leaving at least one spare core.
-# Using more than 8 threads is unlikely to help much. We also recommend using
-# threaded I/O only if you actually have performance problems, with Redis
-# instances being able to use a quite big percentage of CPU time, otherwise
-# there is no point in using this feature.
+# We also recommend using threaded I/O only if you actually have performance
+# problems, with Redis instances being able to use a quite big percentage of
+# CPU time, otherwise there is no point in using this feature.
 #
-# So for instance if you have a four cores boxes, try to use 2 or 3 I/O
-# threads, if you have a 8 cores, try to use 6 threads. In order to
+# So for instance if you have a four core box, try to use 3 I/O
+# threads, if you have an 8 core box, try to use 7 threads. In order to
 # enable I/O threads use the following configuration directive:
 #
 # io-threads 4
 #
 # Setting io-threads to 1 will just use the main thread as usual.
-# When I/O threads are enabled, we only use threads for writes, that is
-# to thread the write(2) syscall and transfer the client buffers to the
-# socket. However it is also possible to enable threading of reads and
-# protocol parsing using the following configuration directive, by setting
-# it to yes:
+# When I/O threads are enabled, we use threads not only for writes, that
+# is to thread the write(2) syscall and transfer the client buffers to the
+# socket, but also for reads and protocol parsing.
 #
-# io-threads-do-reads no
-#
-# Usually threading reads doesn't help much.
-#
-# NOTE 1: This configuration directive cannot be changed at runtime via
-# CONFIG SET. Also, this feature currently does not work when SSL is
-# enabled.
-#
-# NOTE 2: If you want to test the Redis speedup using redis-benchmark, make
+# NOTE: If you want to test the Redis speedup using redis-benchmark, make
 # sure you also run the benchmark itself in threaded mode, using the
 # --threads option to match the number of Redis threads, otherwise you'll not
 # be able to notice the improvements.
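For example, applying the guidance above on an 8-core machine (illustrative values, not tuned recommendations):

    # redis.conf: leave one core spare for the main thread and the system
    io-threads 7

    # benchmark with a matching number of client threads
    redis-benchmark --threads 7 -t set,get -n 1000000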
diff --git a/src/Makefile b/src/Makefile index 8f245d19d..4f394782d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,7 +354,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/ae.c b/src/ae.c index 3d3569865..ac4422398 100644 --- a/src/ae.c +++ b/src/ae.c @@ -42,7 +42,7 @@ #endif #endif - +#define INITIAL_EVENT 1024 aeEventLoop *aeCreateEventLoop(int setsize) { aeEventLoop *eventLoop; int i; @@ -50,8 +50,9 @@ aeEventLoop *aeCreateEventLoop(int setsize) { monotonicInit(); /* just in case the calling app didn't initialize */ if ((eventLoop = zmalloc(sizeof(*eventLoop))) == NULL) goto err; - eventLoop->events = zmalloc(sizeof(aeFileEvent)*setsize); - eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*setsize); + eventLoop->nevents = setsize < INITIAL_EVENT ? 
setsize : INITIAL_EVENT; + eventLoop->events = zmalloc(sizeof(aeFileEvent)*eventLoop->nevents); + eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*eventLoop->nevents); if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; @@ -61,10 +62,11 @@ aeEventLoop *aeCreateEventLoop(int setsize) { eventLoop->beforesleep = NULL; eventLoop->aftersleep = NULL; eventLoop->flags = 0; + memset(eventLoop->privdata, 0, sizeof(eventLoop->privdata)); if (aeApiCreate(eventLoop) == -1) goto err; /* Events with mask == AE_NONE are not set. So let's initialize the * vector with it. */ - for (i = 0; i < setsize; i++) + for (i = 0; i < eventLoop->nevents; i++) eventLoop->events[i].mask = AE_NONE; return eventLoop; @@ -102,20 +104,19 @@ void aeSetDontWait(aeEventLoop *eventLoop, int noWait) { * * Otherwise AE_OK is returned and the operation is successful. */ int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) { - int i; - if (setsize == eventLoop->setsize) return AE_OK; if (eventLoop->maxfd >= setsize) return AE_ERR; if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR; - eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); - eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); eventLoop->setsize = setsize; - /* Make sure that if we created new slots, they are initialized with - * an AE_NONE mask. */ - for (i = eventLoop->maxfd+1; i < setsize; i++) - eventLoop->events[i].mask = AE_NONE; + /* If the current allocated space is larger than the requested size, + * we need to shrink it to the requested size. */ + if (setsize < eventLoop->nevents) { + eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); + eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); + eventLoop->nevents = setsize; + } return AE_OK; } @@ -147,6 +148,22 @@ int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, errno = ERANGE; return AE_ERR; } + + /* Resize the events and fired arrays if the file + * descriptor exceeds the current number of events. */ + if (unlikely(fd >= eventLoop->nevents)) { + int newnevents = eventLoop->nevents; + newnevents = (newnevents * 2 > fd + 1) ? newnevents * 2 : fd + 1; + newnevents = (newnevents > eventLoop->setsize) ? 
eventLoop->setsize : newnevents; + eventLoop->events = zrealloc(eventLoop->events, sizeof(aeFileEvent) * newnevents); + eventLoop->fired = zrealloc(eventLoop->fired, sizeof(aeFiredEvent) * newnevents); + + /* Initialize new slots with an AE_NONE mask */ + for (int i = eventLoop->nevents; i < newnevents; i++) + eventLoop->events[i].mask = AE_NONE; + eventLoop->nevents = newnevents; + } + aeFileEvent *fe = &eventLoop->events[fd]; if (aeApiAddEvent(eventLoop, fd, mask) == -1) diff --git a/src/ae.h b/src/ae.h index 5f1e17f7d..16c5fcc5c 100644 --- a/src/ae.h +++ b/src/ae.h @@ -79,6 +79,7 @@ typedef struct aeEventLoop { int maxfd; /* highest file descriptor currently registered */ int setsize; /* max number of file descriptors tracked */ long long timeEventNextId; + int nevents; /* Size of Registered events */ aeFileEvent *events; /* Registered events */ aeFiredEvent *fired; /* Fired events */ aeTimeEvent *timeEventHead; @@ -87,6 +88,7 @@ typedef struct aeEventLoop { aeBeforeSleepProc *beforesleep; aeBeforeSleepProc *aftersleep; int flags; + void *privdata[2]; } aeEventLoop; /* Prototypes */ diff --git a/src/cluster.c b/src/cluster.c index 876b1327f..6c0bf75cc 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -317,7 +317,7 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti } /* Create the connection */ - conn = connCreate(connTypeOfCluster()); + conn = connCreate(server.el, connTypeOfCluster()); if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) != C_OK) { addReplyError(c,"-IOERR error or timeout connecting to the client"); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ead19ac71..d707d863d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1262,7 +1262,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { return; } - connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); + connection *conn = connCreateAccepted(server.el, connTypeOfCluster(), cfd, &require_auth); /* Make sure connection is not in an error state */ if (connGetState(conn) != CONN_STATE_ACCEPTING) { @@ -4583,7 +4583,7 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ if (node->link == NULL) { clusterLink *link = createClusterLink(node); - link->conn = connCreate(connTypeOfCluster()); + link->conn = connCreate(server.el, connTypeOfCluster()); connSetPrivateData(link->conn, link); if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, clusterLinkConnectHandler) == C_ERR) { diff --git a/src/commands.def b/src/commands.def index ef42fb8da..53be28942 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1239,6 +1239,9 @@ commandHistory CLIENT_LIST_History[] = { {"6.2.0","Added `argv-mem`, `tot-mem`, `laddr` and `redir` fields and the optional `ID` filter."}, {"7.0.0","Added `resp`, `multi-mem`, `rbs` and `rbp` fields."}, {"7.0.3","Added `ssub` field."}, +{"7.2.0","Added `lib-name` and `lib-ver` fields."}, +{"7.4.0","Added `watch` field."}, +{"8.0.0","Added `io-thread` field."}, }; #endif @@ -1546,7 +1549,7 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, {MAKE_CMD("info","Returns information about the 
connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,6,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, -{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,6,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, +{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,9,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, {MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, {MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, {MAKE_CMD("pause","Suspends commands processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, diff --git a/src/commands/client-list.json b/src/commands/client-list.json index f72ffaf40..08305216c 100644 --- a/src/commands/client-list.json +++ b/src/commands/client-list.json @@ -31,6 +31,18 @@ [ "7.0.3", "Added `ssub` field." + ], + [ + "7.2.0", + "Added `lib-name` and `lib-ver` fields." + ], + [ + "7.4.0", + "Added `watch` field." + ], + [ + "8.0.0", + "Added `io-thread` field." 
] ], "command_flags": [ diff --git a/src/config.c b/src/config.c index d0d30966c..797284347 100644 --- a/src/config.c +++ b/src/config.c @@ -430,6 +430,7 @@ void loadServerConfigFromString(char *config) { {"list-max-ziplist-entries", 2, 2}, {"list-max-ziplist-value", 2, 2}, {"lua-replicate-commands", 2, 2}, + {"io-threads-do-reads", 2, 2}, {NULL, 0}, }; char buf[1024]; @@ -2550,11 +2551,10 @@ static int updateMaxclients(const char **err) { *err = msg; return 0; } - if ((unsigned int) aeGetSetSize(server.el) < - server.maxclients + CONFIG_FDSET_INCR) - { - if (aeResizeSetSize(server.el, - server.maxclients + CONFIG_FDSET_INCR) == AE_ERR) + size_t newsize = server.maxclients + CONFIG_FDSET_INCR; + if ((unsigned int) aeGetSetSize(server.el) < newsize) { + if (aeResizeSetSize(server.el, newsize) == AE_ERR || + resizeAllIOThreadsEventLoops(newsize) == AE_ERR) { *err = "The event loop API used by Redis is not able to handle the specified number of clients"; return 0; @@ -3035,6 +3035,7 @@ static int applyClientMaxMemoryUsage(const char **err) { if (server.maxmemory_clients != 0) initServerClientMemUsageBuckets(); + pauseAllIOThreads(); /* When client eviction is enabled update memory buckets for all clients. * When disabled, clear that data structure. */ listRewind(server.clients, &li); @@ -3048,6 +3049,7 @@ static int applyClientMaxMemoryUsage(const char **err) { updateClientMemUsageAndBucket(c); } } + resumeAllIOThreads(); if (server.maxmemory_clients == 0) freeServerClientMemUsageBuckets(); diff --git a/src/config.h b/src/config.h index e8f77a350..ec0fb1529 100644 --- a/src/config.h +++ b/src/config.h @@ -47,6 +47,7 @@ #define HAVE_PROC_SMAPS 1 #define HAVE_PROC_SOMAXCONN 1 #define HAVE_PROC_OOM_SCORE_ADJ 1 +#define HAVE_EVENT_FD 1 #endif /* Test for task_info() */ diff --git a/src/connection.c b/src/connection.c index fd9d5d17a..6ac1b99d9 100644 --- a/src/connection.c +++ b/src/connection.c @@ -156,14 +156,14 @@ void connTypeCleanupAll(void) { } /* walk all the connection types until has pending data */ -int connTypeHasPendingData(void) { +int connTypeHasPendingData(struct aeEventLoop *el) { ConnectionType *ct; int type; int ret = 0; for (type = 0; type < CONN_TYPE_MAX; type++) { ct = connTypes[type]; - if (ct && ct->has_pending_data && (ret = ct->has_pending_data())) { + if (ct && ct->has_pending_data && (ret = ct->has_pending_data(el))) { return ret; } } @@ -172,7 +172,7 @@ int connTypeHasPendingData(void) { } /* walk all the connection types and process pending data for each connection type */ -int connTypeProcessPendingData(void) { +int connTypeProcessPendingData(struct aeEventLoop *el) { ConnectionType *ct; int type; int ret = 0; @@ -180,7 +180,7 @@ int connTypeProcessPendingData(void) { for (type = 0; type < CONN_TYPE_MAX; type++) { ct = connTypes[type]; if (ct && ct->process_pending_data) { - ret += ct->process_pending_data(); + ret += ct->process_pending_data(el); } } diff --git a/src/connection.h b/src/connection.h index a8c296d15..0ebc84489 100644 --- a/src/connection.h +++ b/src/connection.h @@ -60,8 +60,8 @@ typedef struct ConnectionType { int (*listen)(connListener *listener); /* create/shutdown/close connection */ - connection* (*conn_create)(void); - connection* (*conn_create_accepted)(int fd, void *priv); + connection* (*conn_create)(struct aeEventLoop *el); + connection* (*conn_create_accepted)(struct aeEventLoop *el, int fd, void *priv); void (*shutdown)(struct connection *conn); void (*close)(struct connection *conn); @@ -81,9 +81,13 @@ typedef struct ConnectionType { 
 ssize_t (*sync_read)(struct connection *conn, char *ptr, ssize_t size, long long timeout);
 ssize_t (*sync_readline)(struct connection *conn, char *ptr, ssize_t size, long long timeout);
 
+    /* event loop */
+    void (*unbind_event_loop)(struct connection *conn);
+    int (*rebind_event_loop)(struct connection *conn, aeEventLoop *el);
+
     /* pending data */
-    int (*has_pending_data)(void);
-    int (*process_pending_data)(void);
+    int (*has_pending_data)(struct aeEventLoop *el);
+    int (*process_pending_data)(struct aeEventLoop *el);
 
     /* TLS specified methods */
     sds (*get_peer_cert)(struct connection *conn);
@@ -98,6 +102,7 @@ struct connection {
     short int refs;
     unsigned short int iovcnt;
     void *private_data;
+    struct aeEventLoop *el; /* Event loop the connection is currently bound to */
     ConnectionCallbackFunc conn_handler;
     ConnectionCallbackFunc write_handler;
     ConnectionCallbackFunc read_handler;
@@ -319,6 +324,28 @@ static inline int connHasReadHandler(connection *conn) {
     return conn->read_handler != NULL;
 }
 
+/* Returns true if the connection is bound to an event loop */
+static inline int connHasEventLoop(connection *conn) {
+    return conn->el != NULL;
+}
+
+/* Unbind the current event loop from the connection, so that it can be
+ * rebound to a different event loop in the future. */
+static inline void connUnbindEventLoop(connection *conn) {
+    if (conn->el == NULL) return;
+    connSetReadHandler(conn, NULL);
+    connSetWriteHandler(conn, NULL);
+    if (conn->type->unbind_event_loop)
+        conn->type->unbind_event_loop(conn);
+    conn->el = NULL;
+}
+
+/* Rebind the connection to another event loop; read/write handlers must not
+ * be installed in the current event loop. */
+static inline int connRebindEventLoop(connection *conn, aeEventLoop *el) {
+    return conn->type->rebind_event_loop(conn, el);
+}
+
 /* Associate a private data pointer with the connection */
 static inline void connSetPrivateData(connection *conn, void *data) {
     conn->private_data = data;
@@ -379,14 +406,14 @@ ConnectionType *connectionTypeUnix(void);
 int connectionIndexByType(const char *typename);
 
 /* Create a connection of specified type */
-static inline connection *connCreate(ConnectionType *ct) {
-    return ct->conn_create();
+static inline connection *connCreate(struct aeEventLoop *el, ConnectionType *ct) {
+    return ct->conn_create(el);
 }
 
 /* Create an accepted connection of specified type.
  * priv is connection type specified argument */
-static inline connection *connCreateAccepted(ConnectionType *ct, int fd, void *priv) {
-    return ct->conn_create_accepted(fd, priv);
+static inline connection *connCreateAccepted(struct aeEventLoop *el, ConnectionType *ct, int fd, void *priv) {
+    return ct->conn_create_accepted(el, fd, priv);
 }
 
 /* Configure a connection type. A typical case is to configure TLS.
@@ -400,10 +427,10 @@ static inline int connTypeConfigure(ConnectionType *ct, void *priv, int reconfig
 void connTypeCleanupAll(void);
 
 /* Test all the connection type has pending data or not. 
 */
-int connTypeHasPendingData(void);
+int connTypeHasPendingData(struct aeEventLoop *el);
 
 /* walk all the connection types and process pending data for each connection type */
-int connTypeProcessPendingData(void);
+int connTypeProcessPendingData(struct aeEventLoop *el);
 
 /* Listen on an initialized listener */
 static inline int connListen(connListener *listener) {
diff --git a/src/debug.c b/src/debug.c
index e40375fbe..c4d184b15 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -2451,6 +2451,8 @@ void removeSigSegvHandlers(void) {
 }
 
 void printCrashReport(void) {
+    server.crashing = 1;
+
     /* Log INFO and CLIENT LIST */
     logServerInfo();
 
diff --git a/src/eventnotifier.c b/src/eventnotifier.c
new file mode 100644
index 000000000..6dc3cf990
--- /dev/null
+++ b/src/eventnotifier.c
@@ -0,0 +1,97 @@
+/* eventnotifier.c -- An event notifier based on eventfd or pipe.
+ *
+ * Copyright (c) 2024-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "eventnotifier.h"
+
+#include <unistd.h>
+#include <stdint.h>
+#include <fcntl.h>
+#ifdef HAVE_EVENT_FD
+#include <sys/eventfd.h>
+#endif
+
+#include "anet.h"
+#include "zmalloc.h"
+
+eventNotifier* createEventNotifier(void) {
+    eventNotifier *en = zmalloc(sizeof(eventNotifier));
+    if (!en) return NULL;
+
+#ifdef HAVE_EVENT_FD
+    if ((en->efd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC)) != -1) {
+        return en;
+    }
+#else
+    if (anetPipe(en->pipefd, O_CLOEXEC|O_NONBLOCK, O_CLOEXEC|O_NONBLOCK) != -1) {
+        return en;
+    }
+#endif
+
+    /* Clean up if error. */
+    zfree(en);
+    return NULL;
+}
+
+int getReadEventFd(struct eventNotifier *en) {
+#ifdef HAVE_EVENT_FD
+    return en->efd;
+#else
+    return en->pipefd[0];
+#endif
+}
+
+int getWriteEventFd(struct eventNotifier *en) {
+#ifdef HAVE_EVENT_FD
+    return en->efd;
+#else
+    return en->pipefd[1];
+#endif
+}
+
+int triggerEventNotifier(struct eventNotifier *en) {
+#ifdef HAVE_EVENT_FD
+    uint64_t u = 1;
+    if (write(en->efd, &u, sizeof(uint64_t)) == -1) {
+        return EN_ERR;
+    }
+#else
+    char buf[1] = {'R'};
+    if (write(en->pipefd[1], buf, 1) == -1) {
+        return EN_ERR;
+    }
+#endif
+    return EN_OK;
+}
+
+int handleEventNotifier(struct eventNotifier *en) {
+#ifdef HAVE_EVENT_FD
+    uint64_t u;
+    if (read(en->efd, &u, sizeof(uint64_t)) == -1) {
+        return EN_ERR;
+    }
+#else
+    char buf[1];
+    if (read(en->pipefd[0], buf, 1) == -1) {
+        return EN_ERR;
+    }
+#endif
+    return EN_OK;
+}
+
+void freeEventNotifier(struct eventNotifier *en) {
+#ifdef HAVE_EVENT_FD
+    close(en->efd);
+#else
+    close(en->pipefd[0]);
+    close(en->pipefd[1]);
+#endif
+
+    /* Free memory */
+    zfree(en);
+}
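The notifier API is small; a minimal cross-thread usage sketch follows (illustrative only: setupConsumer/notifyConsumer are hypothetical names, and error handling is omitted). It mirrors how initThreadedIO and IOThreadBeforeSleep below wire notifiers into ae event loops:

    #include "ae.h"
    #include "eventnotifier.h"

    static eventNotifier *en;

    /* Consumer side: drain the eventfd/pipe, then process whatever the
     * producer queued (typically items on a mutex-protected list). */
    static void onWakeup(aeEventLoop *el, int fd, void *ptr, int mask) {
        (void)el; (void)fd; (void)mask;
        handleEventNotifier((eventNotifier *)ptr);
        /* ... pop and handle queued items here ... */
    }

    /* Consumer thread: bind the read end into its event loop. */
    static void setupConsumer(aeEventLoop *el) {
        en = createEventNotifier();
        aeCreateFileEvent(el, getReadEventFd(en), AE_READABLE, onWakeup, en);
    }

    /* Producer thread: enqueue work under a lock, then wake the consumer. */
    static void notifyConsumer(void) {
        triggerEventNotifier(en);
    }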
diff --git a/src/eventnotifier.h b/src/eventnotifier.h
new file mode 100644
index 000000000..39e3b5113
--- /dev/null
+++ b/src/eventnotifier.h
@@ -0,0 +1,33 @@
+/* eventnotifier.h -- An event notifier based on eventfd or pipe.
+ *
+ * Copyright (c) 2024-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ */
+
+#ifndef EVENTNOTIFIER_H
+#define EVENTNOTIFIER_H
+
+#include "config.h"
+
+#define EN_OK 0
+#define EN_ERR -1
+
+typedef struct eventNotifier {
+#ifdef HAVE_EVENT_FD
+    int efd;
+#else
+    int pipefd[2];
+#endif
+} eventNotifier;
+
+eventNotifier* createEventNotifier(void);
+int getReadEventFd(struct eventNotifier *en);
+int getWriteEventFd(struct eventNotifier *en);
+int triggerEventNotifier(struct eventNotifier *en);
+int handleEventNotifier(struct eventNotifier *en);
+void freeEventNotifier(struct eventNotifier *en);
+
+#endif
diff --git a/src/iothread.c b/src/iothread.c
new file mode 100644
index 000000000..2e5c98a28
--- /dev/null
+++ b/src/iothread.c
@@ -0,0 +1,631 @@
+/* iothread.c -- The threaded io implementation.
+ *
+ * Copyright (c) 2024-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "server.h"
+
+/* IO threads. */
+static IOThread IOThreads[IO_THREADS_MAX_NUM];
+
+/* For main thread */
+static list *mainThreadPendingClientsToIOThreads[IO_THREADS_MAX_NUM]; /* Clients to IO threads */
+static list *mainThreadProcessingClients[IO_THREADS_MAX_NUM]; /* Clients in processing */
+static list *mainThreadPendingClients[IO_THREADS_MAX_NUM]; /* Pending clients from IO threads */
+static pthread_mutex_t mainThreadPendingClientsMutexes[IO_THREADS_MAX_NUM]; /* Mutex for pending clients */
+static eventNotifier* mainThreadPendingClientsNotifiers[IO_THREADS_MAX_NUM]; /* Notifier for pending clients */
+
+/* When an IO thread reads a complete query from a client, or wants to free a
+ * client, it should remove the client from its clients list and put it in the
+ * list for the main thread; we will send these clients to the main thread in
+ * IOThreadBeforeSleep. */
+void enqueuePendingClientsToMainThread(client *c, int unbind) {
+    /* If the IO thread may no longer manage it, such as when closing the
+     * client, we should unbind the client from the event loop here, so the
+     * main thread doesn't need to do this costly operation. */
+    if (unbind) connUnbindEventLoop(c->conn);
+    /* Just skip if it has already been transferred. */
+    if (c->io_thread_client_list_node) {
+        listDelNode(IOThreads[c->tid].clients, c->io_thread_client_list_node);
+        c->io_thread_client_list_node = NULL;
+        /* Disable read and write to avoid race when main thread processes. */
+        c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED);
+        listAddNodeTail(IOThreads[c->tid].pending_clients_to_main_thread, c);
+    }
+}
+
+/* Unbind the connection of the client from the io thread event loop; write and
+ * read handlers are also removed, ensuring that we can operate on the client
+ * safely. */
+void unbindClientFromIOThreadEventLoop(client *c) {
+    serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID &&
+                 c->running_tid == IOTHREAD_MAIN_THREAD_ID);
+    if (!connHasEventLoop(c->conn)) return;
+    /* As we are calling from the main thread, we should pause the io thread
+     * to make this safe. */
+    pauseIOThread(c->tid);
+    connUnbindEventLoop(c->conn);
+    resumeIOThread(c->tid);
+}
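+
+/* Overall client flow with threaded I/O (a summary of the design described
+ * in the comments throughout this file):
+ *
+ * 1. The main thread accepts a client and assigns it to the least loaded
+ *    io thread (assignClientToIOThread).
+ * 2. The io thread reads and parses the query, then hands the client back
+ *    through its pending_clients_to_main_thread list and notifier.
+ * 3. The main thread executes the command and returns the client to its
+ *    io thread, which writes the reply and keeps polling it. */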
+
+/* When the main thread is processing a client from an IO thread and wants to
+ * keep it, we should unbind the connection of the client from the io thread
+ * event loop first, and then bind the client connection into the server's
+ * event loop. */
+void keepClientInMainThread(client *c) {
+    serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID &&
+                 c->running_tid == IOTHREAD_MAIN_THREAD_ID);
+    /* IO thread no longer manages it. */
+    server.io_threads_clients_num[c->tid]--;
+    /* Unbind connection of client from io thread event loop. */
+    unbindClientFromIOThreadEventLoop(c);
+    /* Let the main thread run it: rebind event loop and read handler */
+    connRebindEventLoop(c->conn, server.el);
+    connSetReadHandler(c->conn, readQueryFromClient);
+    c->io_flags |= CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED;
+    c->running_tid = IOTHREAD_MAIN_THREAD_ID;
+    c->tid = IOTHREAD_MAIN_THREAD_ID;
+    /* Main thread starts to manage it. */
+    server.io_threads_clients_num[c->tid]++;
+}
+
+/* If the client is managed by an IO thread, we should fetch it from the IO
+ * thread, and then the main thread can process it, just like an IO thread
+ * transferring a client to the main thread for processing. */
+void fetchClientFromIOThread(client *c) {
+    serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID &&
+                 c->running_tid != IOTHREAD_MAIN_THREAD_ID);
+    pauseIOThread(c->tid);
+    /* Remove the client from the clients list of the IO thread or the main thread. */
+    if (c->io_thread_client_list_node) {
+        listDelNode(IOThreads[c->tid].clients, c->io_thread_client_list_node);
+        c->io_thread_client_list_node = NULL;
+    } else {
+        list *clients[5] = {
+            IOThreads[c->tid].pending_clients,
+            IOThreads[c->tid].pending_clients_to_main_thread,
+            mainThreadPendingClients[c->tid],
+            mainThreadProcessingClients[c->tid],
+            mainThreadPendingClientsToIOThreads[c->tid]
+        };
+        for (int i = 0; i < 5; i++) {
+            listNode *ln = listSearchKey(clients[i], c);
+            if (ln) {
+                listDelNode(clients[i], ln);
+                /* A client can only be in one client list. */
+                break;
+            }
+        }
+    }
+    /* Unbind connection of client from io thread event loop. */
+    connUnbindEventLoop(c->conn);
+    /* Now the main thread can process it. */
+    c->running_tid = IOTHREAD_MAIN_THREAD_ID;
+    resumeIOThread(c->tid);
+}
+
+/* Some clients must be handled in the main thread, since processing them in
+ * IO threads would cause data races.
+ *
+ * - Close ASAP: we must free the client in the main thread.
+ * - Replica, pubsub, monitor, blocked, tracking clients: the main thread may
+ *   directly write them a reply when conditions are met.
+ * - Script commands with debug may operate on the connection directly. */
+int isClientMustHandledByMainThread(client *c) {
+    if (c->flags & (CLIENT_CLOSE_ASAP | CLIENT_MASTER | CLIENT_SLAVE |
+                    CLIENT_PUBSUB | CLIENT_MONITOR | CLIENT_BLOCKED |
+                    CLIENT_UNBLOCKED | CLIENT_TRACKING | CLIENT_LUA_DEBUG |
+                    CLIENT_LUA_DEBUG_SYNC))
+    {
+        return 1;
+    }
+    return 0;
+}
+
+/* When the main thread accepts a new client or transfers clients to IO threads,
+ * it assigns the client to the IO thread with the fewest clients. */
+void assignClientToIOThread(client *c) {
+    serverAssert(c->tid == IOTHREAD_MAIN_THREAD_ID);
+    /* Find the IO thread with the fewest clients. */
+    int min_id = 0;
+    int min = INT_MAX;
+    for (int i = 1; i < server.io_threads_num; i++) {
+        if (server.io_threads_clients_num[i] < min) {
+            min = server.io_threads_clients_num[i];
+            min_id = i;
+        }
+    }
+
+    /* Assign the client to the IO thread. */
+    server.io_threads_clients_num[c->tid]--;
+    c->tid = min_id;
+    c->running_tid = min_id;
+    server.io_threads_clients_num[min_id]++;
+
+    /* Unbind the connection of the client from the main thread event loop,
+     * disable read and write, and then put it in the list; the main thread
+     * will send these clients to the IO thread in beforeSleep.
+     */
+    connUnbindEventLoop(c->conn);
+    c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED);
+    listAddNodeTail(mainThreadPendingClientsToIOThreads[c->tid], c);
+}
+
+/* When updating the maxclients config, we not only resize the event loop of
+ * the main thread but also the event loops of all io threads; if it fails for
+ * one thread, it fails as a whole, since a fd can be distributed into any IO
+ * thread. */
+int resizeAllIOThreadsEventLoops(size_t newsize) {
+    int result = AE_OK;
+    if (server.io_threads_num <= 1) return result;
+
+    /* To make context safe. */
+    pauseAllIOThreads();
+    for (int i = 1; i < server.io_threads_num; i++) {
+        IOThread *t = &IOThreads[i];
+        if (aeResizeSetSize(t->el, newsize) == AE_ERR)
+            result = AE_ERR;
+    }
+    resumeAllIOThreads();
+    return result;
+}
+
+/* In the main thread, we may want to operate on data of io threads, for
+ * example to uninstall an event handler, access a query/output buffer or
+ * resize the event loop; we need a clean and safe context to do that. We
+ * pause the io thread in IOThreadBeforeSleep, do some jobs and then resume
+ * it. To avoid the thread being suspended, we use busy waiting to confirm the
+ * target status. Besides, we use atomic variables to make sure of memory
+ * visibility and ordering.
+ *
+ * Make sure that only the main thread can call these functions:
+ * - pauseIOThread, resumeIOThread
+ * - pauseAllIOThreads, resumeAllIOThreads
+ * - pauseIOThreadsRange, resumeIOThreadsRange
+ *
+ * The main thread will pause the io thread, and then wait for the io thread to
+ * be paused. The io thread will check the paused status in IOThreadBeforeSleep,
+ * and then pause itself.
+ *
+ * The main thread will resume the io thread, and then wait for the io thread to
+ * be resumed. The io thread will check the paused status in IOThreadBeforeSleep,
+ * and then resume itself.
+ */
+
+/* The same io thread may be paused in a nested way, so we record the number of
+ * pauses; the io thread is actually paused only when the count goes from 0 to
+ * 1, and actually resumed only when the count drops from 1 to 0. */
+static int PausedIOThreads[IO_THREADS_MAX_NUM] = {0};
+
+/* Pause the specific range of io threads, and wait for them to be paused. */
+void pauseIOThreadsRange(int start, int end) {
+    if (server.io_threads_num <= 1) return;
+    serverAssert(start >= 1 && end < server.io_threads_num && start <= end);
+    serverAssert(pthread_equal(pthread_self(), server.main_thread_id));
+
+    /* Try to make all io threads paused in parallel */
+    for (int i = start; i <= end; i++) {
+        PausedIOThreads[i]++;
+        /* Skip if already paused */
+        if (PausedIOThreads[i] > 1) continue;
+
+        int paused;
+        atomicGetWithSync(IOThreads[i].paused, paused);
+        /* Reentrant calls are not supported */
+        serverAssert(paused == IO_THREAD_UNPAUSED);
+        atomicSetWithSync(IOThreads[i].paused, IO_THREAD_PAUSING);
+        /* Just notify the io thread; there is no actual job, since io threads
+         * check the paused status in IOThreadBeforeSleep, so we only need to
+         * wake it up if it is waiting in the poll. */
+        triggerEventNotifier(IOThreads[i].pending_clients_notifier);
+    }
+
+    /* Wait for all io threads paused */
+    for (int i = start; i <= end; i++) {
+        if (PausedIOThreads[i] > 1) continue;
+        int paused = IO_THREAD_PAUSING;
+        while (paused != IO_THREAD_PAUSED) {
+            atomicGetWithSync(IOThreads[i].paused, paused);
+        }
+    }
+}
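+
+/* For example (illustrative), pauses are counted per io thread:
+ *
+ *     pauseIOThread(1);      // thread 1 parks itself in IOThreadBeforeSleep
+ *     pauseIOThread(1);      // nested: only increments PausedIOThreads[1]
+ *     resumeIOThread(1);     // still paused
+ *     resumeIOThread(1);     // now thread 1 actually resumes
+ *
+ * so pause and resume calls must always come in pairs. */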
+
+/* Resume the specific range of io threads, and wait for them to be resumed. */
+void resumeIOThreadsRange(int start, int end) {
+    if (server.io_threads_num <= 1) return;
+    serverAssert(start >= 1 && end < server.io_threads_num && start <= end);
+    serverAssert(pthread_equal(pthread_self(), server.main_thread_id));
+
+    for (int i = start; i <= end; i++) {
+        serverAssert(PausedIOThreads[i] > 0);
+        PausedIOThreads[i]--;
+        if (PausedIOThreads[i] > 0) continue;
+
+        int paused;
+        /* Check if it is paused, since we must call 'pause' and
+         * 'resume' in pairs */
+        atomicGetWithSync(IOThreads[i].paused, paused);
+        serverAssert(paused == IO_THREAD_PAUSED);
+        /* Resume */
+        atomicSetWithSync(IOThreads[i].paused, IO_THREAD_RESUMING);
+        while (paused != IO_THREAD_UNPAUSED) {
+            atomicGetWithSync(IOThreads[i].paused, paused);
+        }
+    }
+}
+
+/* The IO thread checks whether it is being paused, and if so, it pauses itself
+ * and waits for resuming, corresponding to the pause/resumeIOThread* functions.
+ * Currently, this is only called in IOThreadBeforeSleep, as there are no pending
+ * I/O events at this point, so we have a clean context. */
+void handlePauseAndResume(IOThread *t) {
+    int paused;
+    /* Check if this thread is being paused. */
+    atomicGetWithSync(t->paused, paused);
+    if (paused == IO_THREAD_PAUSING) {
+        atomicSetWithSync(t->paused, IO_THREAD_PAUSED);
+        /* Wait for resuming */
+        while (paused != IO_THREAD_RESUMING) {
+            atomicGetWithSync(t->paused, paused);
+        }
+        atomicSetWithSync(t->paused, IO_THREAD_UNPAUSED);
+    }
+}
+
+/* Pause the specific io thread, and wait for it to be paused. */
+void pauseIOThread(int id) {
+    pauseIOThreadsRange(id, id);
+}
+
+/* Resume the specific io thread, and wait for it to be resumed. */
+void resumeIOThread(int id) {
+    resumeIOThreadsRange(id, id);
+}
+
+/* Pause all io threads, and wait for them to be paused. */
+void pauseAllIOThreads(void) {
+    pauseIOThreadsRange(1, server.io_threads_num-1);
+}
+
+/* Resume all io threads, and wait for them to be resumed. */
+void resumeAllIOThreads(void) {
+    resumeIOThreadsRange(1, server.io_threads_num-1);
+}
+
+/* Add the pending clients to the lists of the IO threads, and trigger an
+ * event to notify the io threads to handle them. */
+int sendPendingClientsToIOThreads(void) {
+    int processed = 0;
+    for (int i = 1; i < server.io_threads_num; i++) {
+        int len = listLength(mainThreadPendingClientsToIOThreads[i]);
+        if (len > 0) {
+            IOThread *t = &IOThreads[i];
+            pthread_mutex_lock(&t->pending_clients_mutex);
+            listJoin(t->pending_clients, mainThreadPendingClientsToIOThreads[i]);
+            pthread_mutex_unlock(&t->pending_clients_mutex);
+            /* Trigger an event; an error may be returned when the buffer is
+             * full if a pipe is used, but no worry, the io thread will handle
+             * all clients in the list when receiving a notification. */
+            triggerEventNotifier(t->pending_clients_notifier);
+        }
+        processed += len;
+    }
+    return processed;
+}
+
+extern int ProcessingEventsWhileBlocked;
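+
+/* A sketch of the expected wiring in the main thread's beforeSleep(), based
+ * on the comments in this file (the actual wiring lives in server.c):
+ *
+ *     processClientsOfAllIOThreads();    // fan-in: run parsed commands
+ *     ...
+ *     flushAppendOnlyFile(0);            // flush the AOF buffer first ...
+ *     sendPendingClientsToIOThreads();   // ... then fan the replies out
+ */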
+
+/* The main thread processes the clients from IO threads; these clients may
+ * have a complete command to execute or need to be freed. Note that IO
+ * threads never free a client, since this operation accesses a lot of server
+ * data.
+ *
+ * Please notice that this function may be called reentrantly, i.e., the same
+ * goes for handleClientsFromIOThread and processClientsOfAllIOThreads. For
+ * example, when processing a script command, it may call
+ * processEventsWhileBlocked to process new events; if the clients with fired
+ * events are from the same io thread, this function may be called reentrantly. */
+void processClientsFromIOThread(IOThread *t) {
+    listNode *node = NULL;
+
+    while (listLength(mainThreadProcessingClients[t->id])) {
+        /* Each time we pop only the first client to process, to guarantee
+         * reentrancy safety. */
+        if (node) zfree(node);
+        node = listFirst(mainThreadProcessingClients[t->id]);
+        listUnlinkNode(mainThreadProcessingClients[t->id], node);
+        client *c = listNodeValue(node);
+
+        /* Make sure the io thread can no longer read or write the client,
+         * to avoid data races. */
+        serverAssert(!(c->io_flags & (CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED)));
+        serverAssert(!(c->flags & CLIENT_CLOSE_ASAP));
+
+        /* Let the main thread run it; set the running thread id first. */
+        c->running_tid = IOTHREAD_MAIN_THREAD_ID;
+
+        /* If a read error occurs, handle it in the main thread first, since we
+         * want to print logs about client information before freeing. */
+        if (c->read_error) handleClientReadError(c);
+
+        /* The client was asked to close in the IO thread. */
+        if (c->io_flags & CLIENT_IO_CLOSE_ASAP) {
+            freeClient(c);
+            continue;
+        }
+
+        /* Update the client in the mem usage */
+        updateClientMemUsageAndBucket(c);
+
+        /* Process the pending command and input buffer. */
+        if (!c->read_error && c->io_flags & CLIENT_IO_PENDING_COMMAND) {
+            c->flags |= CLIENT_PENDING_COMMAND;
+            if (processPendingCommandAndInputBuffer(c) == C_ERR) {
+                /* If the client is no longer valid, it must be freed safely. */
+                continue;
+            }
+        }
+
+        /* We may have pending replies if the io thread did not finish writing
+         * the reply to the client, in which case the client was not put in
+         * the pending write queue. We should do that first, since we may keep
+         * the client in the main thread instead of returning it to the io
+         * threads. */
+        if (!(c->flags & CLIENT_PENDING_WRITE) && clientHasPendingReplies(c))
+            putClientInPendingWriteQueue(c);
+
+        /* The client can only be processed in the main thread, otherwise data
+         * races will happen, since we may touch the client's data in the main
+         * thread. */
+        if (isClientMustHandledByMainThread(c)) {
+            keepClientInMainThread(c);
+            continue;
+        }
+
+        /* Remove this client from the pending write clients queue of the main
+         * thread; some clients may not have a reply if CLIENT REPLY OFF/SKIP
+         * is used. */
+        if (c->flags & CLIENT_PENDING_WRITE) {
+            c->flags &= ~CLIENT_PENDING_WRITE;
+            listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node);
+        }
+        c->running_tid = c->tid;
+        listLinkNodeHead(mainThreadPendingClientsToIOThreads[c->tid], node);
+        node = NULL;
+    }
+    if (node) zfree(node);
+
+    /* Trigger the io thread to handle these clients ASAP to make them
+     * processed in parallel.
+     *
+     * If the AOF fsync policy is always, we should not let the io thread
+     * handle these clients now, since we haven't flushed the AOF buffer to
+     * the file and synced yet. So sending these clients to the io threads is
+     * delayed until beforeSleep, after flushAppendOnlyFile.
+     *
+     * If we are in processEventsWhileBlocked, we don't send clients to io
+     * threads now, as we want to update
+     * server.events_processed_while_blocked accurately.
+     */
+    if (listLength(mainThreadPendingClientsToIOThreads[t->id]) &&
+        server.aof_fsync != AOF_FSYNC_ALWAYS &&
+        !ProcessingEventsWhileBlocked)
+    {
+        pthread_mutex_lock(&(t->pending_clients_mutex));
+        listJoin(t->pending_clients, mainThreadPendingClientsToIOThreads[t->id]);
+        pthread_mutex_unlock(&(t->pending_clients_mutex));
+        triggerEventNotifier(t->pending_clients_notifier);
+    }
+}
+
+/* When the io thread finishes processing the client with the read event, it
+ * will notify the main thread through event triggering in IOThreadBeforeSleep.
+ * The main thread handles the event through this function. */
+void handleClientsFromIOThread(struct aeEventLoop *el, int fd, void *ptr, int mask) {
+    UNUSED(el);
+    UNUSED(mask);
+
+    IOThread *t = ptr;
+
+    /* Handle fd event first. */
+    serverAssert(fd == getReadEventFd(mainThreadPendingClientsNotifiers[t->id]));
+    handleEventNotifier(mainThreadPendingClientsNotifiers[t->id]);
+
+    /* Get the list of clients to process. */
+    pthread_mutex_lock(&mainThreadPendingClientsMutexes[t->id]);
+    listJoin(mainThreadProcessingClients[t->id], mainThreadPendingClients[t->id]);
+    pthread_mutex_unlock(&mainThreadPendingClientsMutexes[t->id]);
+    if (listLength(mainThreadProcessingClients[t->id]) == 0) return;
+
+    /* Process the clients from IO threads. */
+    processClientsFromIOThread(t);
+}
+
+/* In the new threaded io design, one thread may process multiple clients, so
+ * when an io thread notifies the main thread of an event, there may be
+ * multiple clients with commands that need to be processed. But the event
+ * handler function handleClientsFromIOThread may be blocked when processing a
+ * specific command; then the previous clients cannot get a reply and the
+ * subsequent clients cannot be processed, so we need to handle this scenario
+ * in beforeSleep. This function processes the commands of the subsequent
+ * clients from io threads, and another function, sendPendingClientsToIOThreads,
+ * makes sure that clients from io threads can get their replies. See also
+ * beforeSleep. */
+void processClientsOfAllIOThreads(void) {
+    for (int i = 1; i < server.io_threads_num; i++) {
+        processClientsFromIOThread(&IOThreads[i]);
+    }
+}
+
+/* After the main thread processes the clients, it sends them back to the io
+ * threads to handle and fires an event; the io thread handles the event with
+ * this function. If the client is not bound to the event loop, we should bind
+ * it first and install the read handler, and we don't uninstall the client
+ * read handler unless we are freeing the client. If the client has pending
+ * replies, we just reply to the client first, and then install the write
+ * handler if needed. */
+void handleClientsFromMainThread(struct aeEventLoop *ae, int fd, void *ptr, int mask) {
+    UNUSED(ae);
+    UNUSED(mask);
+
+    IOThread *t = ptr;
+
+    /* Handle fd event first. */
+    serverAssert(fd == getReadEventFd(t->pending_clients_notifier));
+    handleEventNotifier(t->pending_clients_notifier);
+
+    pthread_mutex_lock(&t->pending_clients_mutex);
+    listJoin(t->processing_clients, t->pending_clients);
+    pthread_mutex_unlock(&t->pending_clients_mutex);
+    if (listLength(t->processing_clients) == 0) return;
+
+    listIter li;
+    listNode *ln;
+    listRewind(t->processing_clients, &li);
+    while((ln = listNext(&li))) {
+        client *c = listNodeValue(ln);
+        serverAssert(!(c->io_flags & (CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED)));
+        /* The main thread must handle clients with the CLIENT_CLOSE_ASAP
+         * flag, since we only set io_flags when clients are freed ASAP in
+         * the io thread.
+         */
+        serverAssert(!(c->flags & CLIENT_CLOSE_ASAP));
+
+        /* Link client in IO thread clients list first. */
+        serverAssert(c->io_thread_client_list_node == NULL);
+        listAddNodeTail(t->clients, c);
+        c->io_thread_client_list_node = listLast(t->clients);
+
+        /* The client was asked to close; we just let the main thread free it. */
+        if (c->io_flags & CLIENT_IO_CLOSE_ASAP) {
+            enqueuePendingClientsToMainThread(c, 1);
+            continue;
+        }
+
+        /* Enable read and write and reset some flags. */
+        c->io_flags |= CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED;
+        c->io_flags &= ~CLIENT_IO_PENDING_COMMAND;
+
+        /* Only bind once; we never remove the read handler unless freeing the client. */
+        if (!connHasEventLoop(c->conn)) {
+            connRebindEventLoop(c->conn, t->el);
+            serverAssert(!connHasReadHandler(c->conn));
+            connSetReadHandler(c->conn, readQueryFromClient);
+        }
+
+        /* If the client has pending replies, write replies to client. */
+        if (clientHasPendingReplies(c)) {
+            writeToClient(c, 0);
+            if (!(c->io_flags & CLIENT_IO_CLOSE_ASAP) && clientHasPendingReplies(c)) {
+                connSetWriteHandler(c->conn, sendReplyToClient);
+            }
+        }
+    }
+    listEmpty(t->processing_clients);
+}
+
+void IOThreadBeforeSleep(struct aeEventLoop *el) {
+    IOThread *t = el->privdata[0];
+
+    /* Handle pending data (typically TLS). */
+    connTypeProcessPendingData(el);
+
+    /* If any connection type (typically TLS) still has pending unread data, don't sleep at all. */
+    aeSetDontWait(el, connTypeHasPendingData(el));
+
+    /* Check if this thread is being paused; if so, pause itself and wait to be resumed. */
+    handlePauseAndResume(t);
+
+    /* Check if there are clients to be processed in the main thread, and then
+     * join them to the list of the main thread. */
+    if (listLength(t->pending_clients_to_main_thread) > 0) {
+        pthread_mutex_lock(&mainThreadPendingClientsMutexes[t->id]);
+        listJoin(mainThreadPendingClients[t->id], t->pending_clients_to_main_thread);
+        pthread_mutex_unlock(&mainThreadPendingClientsMutexes[t->id]);
+        /* Trigger an event; an error may be returned when the buffer is full
+         * if a pipe is used, but no worry, the main thread will handle all
+         * clients in the list when receiving a notification. */
+        triggerEventNotifier(mainThreadPendingClientsNotifiers[t->id]);
+    }
+}
+
+/* The main function of the IO thread; it runs an event loop. The main thread
+ * and the IO thread communicate through the event notifier. */
+void *IOThreadMain(void *ptr) {
+    IOThread *t = ptr;
+    char thdname[16];
+    snprintf(thdname, sizeof(thdname), "io_thd_%d", t->id);
+    redis_set_thread_title(thdname);
+    redisSetCpuAffinity(server.server_cpulist);
+    makeThreadKillable();
+    aeSetBeforeSleepProc(t->el, IOThreadBeforeSleep);
+    aeMain(t->el);
+    return NULL;
+}
+
+/* Initialize the data structures needed for threaded I/O. */
+void initThreadedIO(void) {
+    if (server.io_threads_num <= 1) return;
+
+    server.io_threads_active = 1;
+
+    if (server.io_threads_num > IO_THREADS_MAX_NUM) {
+        serverLog(LL_WARNING,"Fatal: too many I/O threads configured. "
+                             "The maximum number is %d.", IO_THREADS_MAX_NUM);
+        exit(1);
+    }
+
+    /* Spawn and initialize the I/O threads.
+     */
+    for (int i = 1; i < server.io_threads_num; i++) {
+        IOThread *t = &IOThreads[i];
+        t->id = i;
+        t->el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR);
+        t->el->privdata[0] = t;
+        t->pending_clients = listCreate();
+        t->processing_clients = listCreate();
+        t->pending_clients_to_main_thread = listCreate();
+        t->clients = listCreate();
+        atomicSetWithSync(t->paused, IO_THREAD_UNPAUSED);
+
+        pthread_mutexattr_t *attr = NULL;
+        #if defined(__linux__) && defined(__GLIBC__)
+        attr = zmalloc(sizeof(pthread_mutexattr_t));
+        pthread_mutexattr_init(attr);
+        pthread_mutexattr_settype(attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+        #endif
+        pthread_mutex_init(&t->pending_clients_mutex, attr);
+
+        t->pending_clients_notifier = createEventNotifier();
+        if (aeCreateFileEvent(t->el, getReadEventFd(t->pending_clients_notifier),
+                              AE_READABLE, handleClientsFromMainThread, t) != AE_OK)
+        {
+            serverLog(LL_WARNING, "Fatal: Can't register file event for IO thread notifications.");
+            exit(1);
+        }
+
+        /* Create IO thread */
+        if (pthread_create(&t->tid, NULL, IOThreadMain, (void*)t) != 0) {
+            serverLog(LL_WARNING, "Fatal: Can't initialize IO thread.");
+            exit(1);
+        }
+
+        /* For main thread */
+        mainThreadPendingClientsToIOThreads[i] = listCreate();
+        mainThreadPendingClients[i] = listCreate();
+        mainThreadProcessingClients[i] = listCreate();
+        pthread_mutex_init(&mainThreadPendingClientsMutexes[i], attr);
+        mainThreadPendingClientsNotifiers[i] = createEventNotifier();
+        if (aeCreateFileEvent(server.el, getReadEventFd(mainThreadPendingClientsNotifiers[i]),
+                              AE_READABLE, handleClientsFromIOThread, t) != AE_OK)
+        {
+            serverLog(LL_WARNING, "Fatal: Can't register file event for main thread notifications.");
+            exit(1);
+        }
+        if (attr) zfree(attr);
+    }
+}
+
+/* Kill the IO threads. TODO: release the allocated resources. */
+void killIOThreads(void) {
+    if (server.io_threads_num <= 1) return;
+
+    int err, j;
+    for (j = 1; j < server.io_threads_num; j++) {
+        if (IOThreads[j].tid == pthread_self()) continue;
+        if (IOThreads[j].tid && pthread_cancel(IOThreads[j].tid) == 0) {
+            if ((err = pthread_join(IOThreads[j].tid,NULL)) != 0) {
+                serverLog(LL_WARNING,
+                    "IO thread(tid:%lu) can not be joined: %s",
+                    (unsigned long)IOThreads[j].tid, strerror(err));
+            } else {
+                serverLog(LL_WARNING,
+                    "IO thread(tid:%lu) terminated",(unsigned long)IOThreads[j].tid);
+            }
+        }
+    }
+}
diff --git a/src/multi.c b/src/multi.c
index 6d1ba5697..1956c3dd8 100644
--- a/src/multi.c
+++ b/src/multi.c
@@ -355,7 +355,12 @@ int isWatchedKeyExpired(client *c) {
 }
 
 /* "Touch" a key, so that if this key is being WATCHed by some client the
- * next EXEC will fail. */
+ * next EXEC will fail.
+ *
+ * Sanitizer suppression: IO threads also read c->flags, but they never modify
+ * it nor read the CLIENT_DIRTY_CAS bit, and only the main thread modifies
+ * this bit, so there is actually no real data race. */
+REDIS_NO_SANITIZE("thread")
 void touchWatchedKey(redisDb *db, robj *key) {
     list *clients;
     listIter li;
@@ -404,6 +409,7 @@ void touchWatchedKey(redisDb *db, robj *key) {
  * replaced_with: for SWAPDB, the WATCH should be invalidated if
  * the key exists in either of them, and skipped only if it
  * doesn't exist in both.
 */
+REDIS_NO_SANITIZE("thread")
 void touchAllWatchedKeysInDb(redisDb *emptied, redisDb *replaced_with) {
     listIter li;
     listNode *ln;
diff --git a/src/networking.c b/src/networking.c
index 9a9515f77..8fb37af08 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -24,7 +24,6 @@
 static void setProtocolError(const char *errstr, client *c);
 static void pauseClientsByClient(mstime_t end, int isPauseClientAll);
-int postponeClientRead(client *c);
 char *getClientSockname(client *c);
 static inline int clientTypeIsSlave(client *c);
 int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */
@@ -132,6 +131,9 @@ client *createClient(connection *conn) {
     uint64_t client_id;
     atomicGetIncr(server.next_client_id, client_id, 1);
     c->id = client_id;
+    c->tid = IOTHREAD_MAIN_THREAD_ID;
+    c->running_tid = IOTHREAD_MAIN_THREAD_ID;
+    if (conn) server.io_threads_clients_num[c->tid]++;
 #ifdef LOG_REQ_RES
     reqresReset(c, 0);
     c->resp = server.client_default_resp;
@@ -163,6 +165,8 @@ client *createClient(connection *conn) {
     c->bulklen = -1;
     c->sentlen = 0;
     c->flags = 0;
+    c->io_flags = CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED;
+    c->read_error = 0;
     c->slot = -1;
     c->ctime = c->lastinteraction = server.unixtime;
     c->duration = 0;
@@ -195,8 +199,8 @@ client *createClient(connection *conn) {
     c->peerid = NULL;
     c->sockname = NULL;
     c->client_list_node = NULL;
+    c->io_thread_client_list_node = NULL;
     c->postponed_list_node = NULL;
-    c->pending_read_list_node = NULL;
     c->client_tracking_redirection = 0;
     c->client_tracking_prefixes = NULL;
     c->last_memory_usage = 0;
@@ -300,13 +304,8 @@ int prepareClientToWrite(client *c) {
     if (!c->conn) return C_ERR; /* Fake client for AOF loading. */
 
     /* Schedule the client to write the output buffers to the socket, unless
-     * it should already be setup to do so (it has already pending data).
-     *
-     * If CLIENT_PENDING_READ is set, we're in an IO thread and should
-     * not put the client in pending write queue. Instead, it will be
-     * done by handleClientsWithPendingReadsUsingThreads() upon return.
-     */
-    if (!clientHasPendingReplies(c) && io_threads_op == IO_THREADS_OP_IDLE)
+     * it should already be set up to do so (it already has pending data). */
+    if (!clientHasPendingReplies(c) && likely(c->running_tid == IOTHREAD_MAIN_THREAD_ID))
         putClientInPendingWriteQueue(c);
 
     /* Authorize the caller to queue in the output buffer of this client. */
@@ -1359,6 +1358,9 @@ void clientAcceptHandler(connection *conn) {
     moduleFireServerEvent(REDISMODULE_EVENT_CLIENT_CHANGE,
                           REDISMODULE_SUBEVENT_CLIENT_CHANGE_CONNECTED,
                           c);
+
+    /* Assign the client to an IO thread */
+    if (server.io_threads_num > 1) assignClientToIOThread(c);
 }
 
 void acceptCommonHandler(connection *conn, int flags, char *ip) {
@@ -1547,14 +1549,6 @@ void unlinkClient(client *c) {
         c->flags &= ~CLIENT_PENDING_WRITE;
     }
 
-    /* Remove from the list of pending reads if needed. */
-    serverAssert(!c->conn || io_threads_op == IO_THREADS_OP_IDLE);
-    if (c->pending_read_list_node != NULL) {
-        listDelNode(server.clients_pending_read,c->pending_read_list_node);
-        c->pending_read_list_node = NULL;
-    }
-
-
     /* When client was just unblocked because of a blocking operation,
      * remove it from the list of unblocked clients. */
     if (c->flags & CLIENT_UNBLOCKED) {
@@ -1631,7 +1625,7 @@ void deauthenticateAndCloseClient(client *c) {
  * If any data remained in the buffer, the client will take ownership of the buffer
  * and a new empty buffer will be allocated for the reusable buffer.
 */
 static void resetReusableQueryBuf(client *c) {
-    serverAssert(c->flags & CLIENT_REUSABLE_QUERYBUFFER);
+    serverAssert(c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER);
     if (c->querybuf != thread_reusable_qb || sdslen(c->querybuf) > c->qb_pos) {
         /* If querybuf has been reallocated or there is still data left,
          * let the client take ownership of the reusable buffer. */
@@ -1645,7 +1639,7 @@ static void resetReusableQueryBuf(client *c) {
 
     /* Mark that the client is no longer using the reusable query buffer
      * and indicate that it is no longer used by any client. */
-    c->flags &= ~CLIENT_REUSABLE_QUERYBUFFER;
+    c->io_flags &= ~CLIENT_IO_REUSABLE_QUERYBUFFER;
     thread_reusable_qb_used = 0;
 }
 
@@ -1659,6 +1653,19 @@ void freeClient(client *c) {
         return;
     }
 
+    /* If the client is running in an io thread, we can't free it directly. */
+    if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+        fetchClientFromIOThread(c);
+    }
+
+    /* We need to unbind the client's connection from the io thread event loop first. */
+    if (c->tid != IOTHREAD_MAIN_THREAD_ID) {
+        unbindClientFromIOThreadEventLoop(c);
+    }
+
+    /* Update the number of clients in the IO thread. */
+    if (c->conn) server.io_threads_clients_num[c->tid]--;
+
     /* For connected clients, call the disconnection event of modules hooks. */
     if (c->conn) {
         moduleFireServerEvent(REDISMODULE_EVENT_CLIENT_CHANGE,
@@ -1703,7 +1710,7 @@ void freeClient(client *c) {
     }
 
     /* Free the query buffer */
-    if (c->flags & CLIENT_REUSABLE_QUERYBUFFER)
+    if (c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER)
         resetReusableQueryBuf(c);
     sdsfree(c->querybuf);
     c->querybuf = NULL;
@@ -1816,25 +1823,24 @@
  * a context where calling freeClient() is not possible, because the client
  * should be valid for the continuation of the flow of the program. */
 void freeClientAsync(client *c) {
-    /* We need to handle concurrent access to the server.clients_to_close list
-     * only in the freeClientAsync() function, since it's the only function that
-     * may access the list while Redis uses I/O threads. All the other accesses
-     * are in the context of the main thread while the other threads are
-     * idle. */
+    if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+        int main_thread = pthread_equal(pthread_self(), server.main_thread_id);
+        /* Make sure the main thread can access IO thread data safely. */
+        if (main_thread) pauseIOThread(c->tid);
+        if (!(c->io_flags & CLIENT_IO_CLOSE_ASAP)) {
+            c->io_flags |= CLIENT_IO_CLOSE_ASAP;
+            enqueuePendingClientsToMainThread(c, 1);
+        }
+        if (main_thread) resumeIOThread(c->tid);
+        return;
+    }
+
     if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_SCRIPT) return;
     c->flags |= CLIENT_CLOSE_ASAP;
     /* Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the
      * replication backlog from been trimmed. */
     if (c->flags & CLIENT_SLAVE) freeReplicaReferencedReplBuffer(c);
-    if (server.io_threads_num == 1) {
-        /* no need to bother with locking if there's just one thread (the main thread) */
-        listAddNodeTail(server.clients_to_close,c);
-        return;
-    }
-    static pthread_mutex_t async_free_queue_mutex = PTHREAD_MUTEX_INITIALIZER;
-    pthread_mutex_lock(&async_free_queue_mutex);
     listAddNodeTail(server.clients_to_close,c);
-    pthread_mutex_unlock(&async_free_queue_mutex);
 }
 
 /* Log errors for invalid use and free the client in async way.
@@ -1867,7 +1873,7 @@ int beforeNextClient(client *c) {
     /* Skip the client processing if we're in an IO thread, in that case we'll perform
        this operation later (this function is called again) in the fan-in stage of the threading mechanism */
-    if (io_threads_op != IO_THREADS_OP_IDLE)
+    if (c && c->running_tid != IOTHREAD_MAIN_THREAD_ID)
         return C_OK;
     /* Handle async frees */
     /* Note: this doesn't make the server.clients_to_close list redundant because of
@@ -2052,8 +2058,12 @@ int _writeToClient(client *c, ssize_t *nwritten) {
  * set to 0. So when handler_installed is set to 0 the function must be
  * thread safe. */
 int writeToClient(client *c, int handler_installed) {
+    if (!(c->io_flags & CLIENT_IO_WRITE_ENABLED)) return C_OK;
     /* Update total number of writes on server */
     atomicIncr(server.stat_total_writes_processed, 1);
+    if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+        atomicIncr(server.stat_io_writes_processed, 1);
+    }

     ssize_t nwritten = 0, totwritten = 0;

@@ -2107,7 +2117,7 @@ int writeToClient(client *c, int handler_installed) {
          * is always called with handler_installed set to 0 from threads
          * so we are fine. */
         if (handler_installed) {
-            serverAssert(io_threads_op == IO_THREADS_OP_IDLE);
+            /* IO threads may also clear the write handler now. */
             connSetWriteHandler(c->conn, NULL);
         }

@@ -2118,10 +2128,10 @@
         }
     }
     /* Update client's memory usage after writing.
-     * Since this isn't thread safe we do this conditionally. In case of threaded writes this is done in
-     * handleClientsWithPendingWritesUsingThreads(). */
-    if (io_threads_op == IO_THREADS_OP_IDLE)
+     * Since this isn't thread safe we only do it in the main thread. */
+    if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) {
         updateClientMemUsageAndBucket(c);
+    }
     return C_OK;
 }

@@ -2153,6 +2163,15 @@ int handleClientsWithPendingWrites(void) {
         /* Don't write to clients that are going to be closed anyway. */
         if (c->flags & CLIENT_CLOSE_ASAP) continue;

+        /* Let an IO thread handle the client if possible. */
+        if (server.io_threads_num > 1 &&
+            !(c->flags & CLIENT_CLOSE_AFTER_REPLY) &&
+            !isClientMustHandledByMainThread(c))
+        {
+            assignClientToIOThread(c);
+            continue;
+        }
+
         /* Try to write buffers to the client socket. */
         if (writeToClient(c,0) == C_ERR) continue;
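With the fan-out/fan-in machinery gone, handleClientsWithPendingWrites() doubles as the migration point: eligible clients are simply re-bound to an IO thread instead of being written to synchronously. The eligibility test shown above reduces to a small predicate; here is a paraphrased, stand-alone version (the flag value and the must_stay_on_main parameter are local stand-ins for CLIENT_CLOSE_AFTER_REPLY and isClientMustHandledByMainThread(), whose exact rules live in iothread.c and are not visible in this diff):

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_CLOSE_AFTER_REPLY (1u << 0)  /* stand-in for CLIENT_CLOSE_AFTER_REPLY */

    /* Returns 1 when a client may be migrated to an IO thread for its socket
     * I/O, mirroring the conditions in the hunk above. */
    static int demo_can_offload(uint32_t flags, int io_threads_num,
                                int must_stay_on_main) {
        if (io_threads_num <= 1) return 0;             /* no IO threads configured */
        if (flags & DEMO_CLOSE_AFTER_REPLY) return 0;  /* about to close: finish on main */
        if (must_stay_on_main) return 0;  /* e.g. replicas sharing the global repl buffer */
        return 1;
    }

    int main(void) {
        printf("%d\n", demo_can_offload(0, 4, 0));                       /* 1 */
        printf("%d\n", demo_can_offload(DEMO_CLOSE_AFTER_REPLY, 4, 0));  /* 0 */
        return 0;
    }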
@@ -2227,7 +2246,7 @@ void resetClient(client *c) {
  * path, it is not really released, but only marked for later release. */
 void protectClient(client *c) {
     c->flags |= CLIENT_PROTECTED;
-    if (c->conn) {
+    if (c->conn && c->tid == IOTHREAD_MAIN_THREAD_ID) {
         connSetReadHandler(c->conn,NULL);
         connSetWriteHandler(c->conn,NULL);
     }
@@ -2238,7 +2257,8 @@ void unprotectClient(client *c) {
     if (c->flags & CLIENT_PROTECTED) {
         c->flags &= ~CLIENT_PROTECTED;
         if (c->conn) {
-            connSetReadHandler(c->conn,readQueryFromClient);
+            if (c->tid == IOTHREAD_MAIN_THREAD_ID)
+                connSetReadHandler(c->conn,readQueryFromClient);
             if (clientHasPendingReplies(c)) putClientInPendingWriteQueue(c);
         }
     }
@@ -2263,8 +2283,7 @@ int processInlineBuffer(client *c) {
     /* Nothing to do without a \r\n */
     if (newline == NULL) {
         if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) {
-            addReplyError(c,"Protocol error: too big inline request");
-            setProtocolError("too big inline request",c);
+            c->read_error = CLIENT_READ_TOO_BIG_INLINE_REQUEST;
         }
         return C_ERR;
     }
@@ -2279,8 +2298,7 @@ int processInlineBuffer(client *c) {
     argv = sdssplitargs(aux,&argc);
     sdsfree(aux);
     if (argv == NULL) {
-        addReplyError(c,"Protocol error: unbalanced quotes in request");
-        setProtocolError("unbalanced quotes in inline request",c);
+        c->read_error = CLIENT_READ_UNBALANCED_QUOTES;
         return C_ERR;
     }
@@ -2299,8 +2317,7 @@
      * to keep the connection active. */
     if (querylen != 0 && c->flags & CLIENT_MASTER) {
         sdsfreesplitres(argv,argc);
-        serverLog(LL_WARNING,"WARNING: Receiving inline protocol from master, master stream corruption? Closing the master connection and discarding the cached master.");
-        setProtocolError("Master using the inline protocol. Desync?",c);
+        c->read_error = CLIENT_READ_MASTER_USING_INLINE_PROTOCOL;
         return C_ERR;
     }
@@ -2385,8 +2402,7 @@ int processMultibulkBuffer(client *c) {
         newline = strchr(c->querybuf+c->qb_pos,'\r');
         if (newline == NULL) {
             if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) {
-                addReplyError(c,"Protocol error: too big mbulk count string");
-                setProtocolError("too big mbulk count string",c);
+                c->read_error = CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING;
             }
             return C_ERR;
         }
@@ -2400,12 +2416,10 @@
         serverAssertWithInfo(c,NULL,c->querybuf[c->qb_pos] == '*');
         ok = string2ll(c->querybuf+1+c->qb_pos,newline-(c->querybuf+1+c->qb_pos),&ll);
         if (!ok || ll > INT_MAX) {
-            addReplyError(c,"Protocol error: invalid multibulk length");
-            setProtocolError("invalid mbulk count",c);
+            c->read_error = CLIENT_READ_INVALID_MULTIBULK_LENGTH;
             return C_ERR;
         } else if (ll > 10 && authRequired(c)) {
-            addReplyError(c, "Protocol error: unauthenticated multibulk length");
-            setProtocolError("unauth mbulk count", c);
+            c->read_error = CLIENT_READ_UNAUTH_MBULK_COUNT;
             return C_ERR;
         }
@@ -2432,9 +2446,7 @@
             newline = strchr(c->querybuf+c->qb_pos,'\r');
             if (newline == NULL) {
                 if (sdslen(c->querybuf)-c->qb_pos > PROTO_INLINE_MAX_SIZE) {
-                    addReplyError(c,
-                        "Protocol error: too big bulk count string");
-                    setProtocolError("too big bulk count string",c);
+                    c->read_error = CLIENT_READ_TOO_BIG_BULK_COUNT_STRING;
                     return C_ERR;
                 }
                 break;
@@ -2445,22 +2457,17 @@
                 break;

             if (c->querybuf[c->qb_pos] != '$') {
-                addReplyErrorFormat(c,
-                    "Protocol error: expected '$', got '%c'",
-                    c->querybuf[c->qb_pos]);
-                setProtocolError("expected $ but got something else",c);
+                c->read_error = CLIENT_READ_EXPECTED_DOLLAR;
                 return C_ERR;
             }

             ok = string2ll(c->querybuf+c->qb_pos+1,newline-(c->querybuf+c->qb_pos+1),&ll);
             if (!ok || ll < 0 ||
                 (!(c->flags & CLIENT_MASTER) && ll > server.proto_max_bulk_len)) {
-                addReplyError(c,"Protocol error: invalid bulk length");
-                setProtocolError("invalid bulk length",c);
+                c->read_error = CLIENT_READ_INVALID_BULK_LENGTH;
                 return C_ERR;
             } else if (ll > 16384 && authRequired(c)) {
-                addReplyError(c, "Protocol error: unauthenticated bulk length");
-                setProtocolError("unauth bulk length", c);
+                c->read_error = CLIENT_READ_UNAUTH_BULK_LENGTH;
                 return C_ERR;
             }
@@ -2637,6 +2644,74 @@ int processPendingCommandAndInputBuffer(client *c) {
     return C_OK;
 }

+void handleClientReadError(client *c) {
+    switch (c->read_error) {
+    case CLIENT_READ_TOO_BIG_INLINE_REQUEST:
+        addReplyError(c,"Protocol error: too big inline request");
+        setProtocolError("too big inline request",c);
+        break;
+    case CLIENT_READ_UNBALANCED_QUOTES:
+        addReplyError(c,"Protocol error: unbalanced quotes in request");
+        setProtocolError("unbalanced quotes in request",c);
+        break;
+    case CLIENT_READ_MASTER_USING_INLINE_PROTOCOL:
+        serverLog(LL_WARNING,"WARNING: Receiving inline protocol from master, master stream corruption? Closing the master connection and discarding the cached master.");
+        setProtocolError("Master using the inline protocol. Desync?",c);
+        break;
+    case CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING:
+        addReplyError(c,"Protocol error: too big mbulk count string");
+        setProtocolError("too big mbulk count string",c);
+        break;
+    case CLIENT_READ_TOO_BIG_BULK_COUNT_STRING:
+        addReplyError(c, "Protocol error: too big bulk count string");
+        setProtocolError("too big bulk count string",c);
+        break;
+    case CLIENT_READ_EXPECTED_DOLLAR:
+        addReplyErrorFormat(c,
+            "Protocol error: expected '$', got '%c'",
+            c->querybuf[c->qb_pos]);
+        setProtocolError("expected $ but got something else",c);
+        break;
+    case CLIENT_READ_INVALID_BULK_LENGTH:
+        addReplyError(c,"Protocol error: invalid bulk length");
+        setProtocolError("invalid bulk length",c);
+        break;
+    case CLIENT_READ_UNAUTH_BULK_LENGTH:
+        addReplyError(c, "Protocol error: unauthenticated bulk length");
+        setProtocolError("unauth bulk length", c);
+        break;
+    case CLIENT_READ_INVALID_MULTIBULK_LENGTH:
+        addReplyError(c,"Protocol error: invalid multibulk length");
+        setProtocolError("invalid mbulk count",c);
+        break;
+    case CLIENT_READ_UNAUTH_MBULK_COUNT:
+        addReplyError(c, "Protocol error: unauthenticated multibulk length");
+        setProtocolError("unauth mbulk count", c);
+        break;
+    case CLIENT_READ_CONN_DISCONNECTED:
+        serverLog(LL_VERBOSE, "Reading from client: %s",connGetLastError(c->conn));
+        break;
+    case CLIENT_READ_CONN_CLOSED:
+        if (server.verbosity <= LL_VERBOSE) {
+            sds info = catClientInfoString(sdsempty(), c);
+            serverLog(LL_VERBOSE, "Client closed connection %s", info);
+            sdsfree(info);
+        }
+        break;
+    case CLIENT_READ_REACHED_MAX_QUERYBUF: {
+        sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty();
+        bytes = sdscatrepr(bytes,c->querybuf,64);
+        serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes);
+        sdsfree(ci);
+        sdsfree(bytes);
+        break;
+    }
+    default:
+        serverPanic("Unknown client read error");
+        break;
+    }
+}
+
 /* This function is called every time, in the client structure 'c', there is
  * more query buffer to process, because we read more data from the socket
  * or because a client was blocked and later reactivated, so there could be
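Everything handleClientReadError() reports is produced on the main thread; an IO thread only ever records a numeric code and hands the client over. That division of labor, record in the worker, enqueue under a mutex, report in the owner, condenses to a short stand-alone sketch (the patch's real queue lives in the IOThread struct and is drained via processClientsOfAllIOThreads(); it also wakes the main thread with an event notifier, which this demo omits):

    #include <pthread.h>
    #include <stdio.h>

    enum { DEMO_READ_OK, DEMO_READ_UNBALANCED_QUOTES, DEMO_READ_INVALID_BULK_LENGTH };

    typedef struct demo_client {
        int id;
        int read_error;              /* plays the role of c->read_error */
        struct demo_client *next;
    } demo_client;

    static pthread_mutex_t q_mutex = PTHREAD_MUTEX_INITIALIZER;
    static demo_client *pending_for_main;

    /* "IO thread" side: parsing failed, record the code and hand over. */
    static void enqueue_to_main(demo_client *c, int err) {
        c->read_error = err;
        pthread_mutex_lock(&q_mutex);
        c->next = pending_for_main;
        pending_for_main = c;
        pthread_mutex_unlock(&q_mutex);
    }

    /* "Main thread" side: only here are replies and logs produced. */
    static void drain_and_report(void) {
        pthread_mutex_lock(&q_mutex);
        demo_client *c = pending_for_main;
        pending_for_main = NULL;
        pthread_mutex_unlock(&q_mutex);
        for (; c; c = c->next) {
            switch (c->read_error) {
            case DEMO_READ_UNBALANCED_QUOTES:
                printf("client %d: Protocol error: unbalanced quotes in request\n", c->id);
                break;
            case DEMO_READ_INVALID_BULK_LENGTH:
                printf("client %d: Protocol error: invalid bulk length\n", c->id);
                break;
            }
        }
    }

    int main(void) {
        demo_client c1 = { 1, DEMO_READ_OK, NULL };
        enqueue_to_main(&c1, DEMO_READ_INVALID_BULK_LENGTH);
        drain_and_report();
        return 0;
    }

The same enqueue path is reused for fully parsed commands (CLIENT_IO_PENDING_COMMAND, next hunk): the worker never executes commands, it only parses and forwards.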
@@ -2656,7 +2731,7 @@ int processInputBuffer(client *c) {
      * condition on the slave. We want just to accumulate the replication
      * stream (instead of replying -BUSY like we do with other clients) and
      * later resume the processing. */
-        if (isInsideYieldingLongCommand() && c->flags & CLIENT_MASTER) break;
+        if (c->flags & CLIENT_MASTER && isInsideYieldingLongCommand()) break;

         /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is
          * written to the client. Make sure to not let the reply grow after
@@ -2675,23 +2750,34 @@
         }

         if (c->reqtype == PROTO_REQ_INLINE) {
-            if (processInlineBuffer(c) != C_OK) break;
+            if (processInlineBuffer(c) != C_OK) {
+                if (c->running_tid != IOTHREAD_MAIN_THREAD_ID && c->read_error)
+                    enqueuePendingClientsToMainThread(c, 0);
+                break;
+            }
         } else if (c->reqtype == PROTO_REQ_MULTIBULK) {
-            if (processMultibulkBuffer(c) != C_OK) break;
+            if (processMultibulkBuffer(c) != C_OK) {
+                if (c->running_tid != IOTHREAD_MAIN_THREAD_ID && c->read_error)
+                    enqueuePendingClientsToMainThread(c, 0);
+                break;
+            }
         } else {
             serverPanic("Unknown request type");
         }

         /* Multibulk processing could see a <= 0 length. */
         if (c->argc == 0) {
-            resetClientInternal(c, 0);
+            freeClientArgvInternal(c, 0);
+            c->reqtype = 0;
+            c->multibulklen = 0;
+            c->bulklen = -1;
         } else {
             /* If we are in the context of an I/O thread, we can't really
              * execute the command here. All we can do is to flag the client
              * as one that needs to process the command. */
-            if (io_threads_op != IO_THREADS_OP_IDLE) {
-                serverAssert(io_threads_op == IO_THREADS_OP_READ);
-                c->flags |= CLIENT_PENDING_COMMAND;
+            if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+                c->io_flags |= CLIENT_IO_PENDING_COMMAND;
+                enqueuePendingClientsToMainThread(c, 0);
                 break;
             }

@@ -2732,7 +2818,7 @@
     /* Update client memory usage after processing the query buffer, this is
      * important in case the query buffer is big and wasn't drained during
      * the above loop (because of partially sent big commands). */
-    if (io_threads_op == IO_THREADS_OP_IDLE)
+    if (c->running_tid == IOTHREAD_MAIN_THREAD_ID)
         updateClientMemUsageAndBucket(c);

     return C_OK;
@@ -2742,13 +2828,14 @@ void readQueryFromClient(connection *conn) {
     client *c = connGetPrivateData(conn);
     int nread, big_arg = 0;
     size_t qblen, readlen;
-
-    /* Check if we want to read from the client later when exiting from
-     * the event loop. This is the case if threaded I/O is enabled. */
-    if (postponeClientRead(c)) return;
+    if (!(c->io_flags & CLIENT_IO_READ_ENABLED)) return;
+    c->read_error = 0;

     /* Update total number of reads on server */
     atomicIncr(server.stat_total_reads_processed, 1);
+    if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+        atomicIncr(server.stat_io_reads_processed, 1);
+    }

     readlen = PROTO_IOBUF_LEN;
     /* If this is a multi bulk request, and we are processing a bulk reply
@@ -2793,7 +2880,7 @@
         /* Assign the reusable query buffer to the client and mark it as in use.
*/ serverAssert(sdslen(thread_reusable_qb) == 0); c->querybuf = thread_reusable_qb; - c->flags |= CLIENT_REUSABLE_QUERYBUFFER; + c->io_flags |= CLIENT_IO_REUSABLE_QUERYBUFFER; thread_reusable_qb_used = 1; } } @@ -2821,16 +2908,12 @@ void readQueryFromClient(connection *conn) { if (connGetState(conn) == CONN_STATE_CONNECTED) { goto done; } else { - serverLog(LL_VERBOSE, "Reading from client: %s",connGetLastError(c->conn)); + c->read_error = CLIENT_READ_CONN_DISCONNECTED; freeClientAsync(c); goto done; } } else if (nread == 0) { - if (server.verbosity <= LL_VERBOSE) { - sds info = catClientInfoString(sdsempty(), c); - serverLog(LL_VERBOSE, "Client closed connection %s", info); - sdsfree(info); - } + c->read_error = CLIENT_READ_CONN_CLOSED; freeClientAsync(c); goto done; } @@ -2853,13 +2936,9 @@ void readQueryFromClient(connection *conn) { * * For unauthenticated clients, the query buffer cannot exceed 1MB at most. */ (c->mstate.argv_len_sums + sdslen(c->querybuf) > server.client_max_querybuf_len || - (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024*1024 && authRequired(c)))) { - sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty(); - - bytes = sdscatrepr(bytes,c->querybuf,64); - serverLog(LL_WARNING,"Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, bytes); - sdsfree(ci); - sdsfree(bytes); + (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024*1024 && authRequired(c)))) + { + c->read_error = CLIENT_READ_REACHED_MAX_QUERYBUF; freeClientAsync(c); atomicIncr(server.stat_client_qbuf_limit_disconnections, 1); goto done; @@ -2871,7 +2950,13 @@ void readQueryFromClient(connection *conn) { c = NULL; done: - if (c && (c->flags & CLIENT_REUSABLE_QUERYBUFFER)) { + if (c && c->read_error) { + if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) { + handleClientReadError(c); + } + } + + if (c && (c->io_flags & CLIENT_IO_REUSABLE_QUERYBUFFER)) { serverAssert(c->qb_pos == 0); /* Ensure the client's query buffer is trimmed in processInputBuffer */ resetReusableQueryBuf(c); } @@ -2933,6 +3018,16 @@ char *getClientSockname(client *c) { sds catClientInfoString(sds s, client *client) { char flags[17], events[3], conninfo[CONN_INFO_LEN], *p; + /* Pause IO thread to access data of the client safely. */ + int paused = 0; + if (client->running_tid != IOTHREAD_MAIN_THREAD_ID && + pthread_equal(server.main_thread_id, pthread_self()) && + !server.crashing) + { + paused = 1; + pauseIOThread(client->running_tid); + } + p = flags; if (client->flags & CLIENT_SLAVE) { if (client->flags & CLIENT_MONITOR) @@ -3006,7 +3101,10 @@ sds catClientInfoString(sds s, client *client) { " redir=%I", (client->flags & CLIENT_TRACKING) ? (long long) client->client_tracking_redirection : -1, " resp=%i", client->resp, " lib-name=%s", client->lib_name ? (char*)client->lib_name->ptr : "", - " lib-ver=%s", client->lib_ver ? (char*)client->lib_ver->ptr : "")); + " lib-ver=%s", client->lib_ver ? (char*)client->lib_ver->ptr : "", + " io-thread=%i", client->tid)); + + if (paused) resumeIOThread(client->running_tid); return ret; } @@ -3016,6 +3114,17 @@ sds getAllClientsInfoString(int type) { client *client; sds o = sdsnewlen(SDS_NOINIT,200*listLength(server.clients)); sdsclear(o); + + /* Pause all IO threads to access data of clients safely, and pausing the + * specific IO thread will not repeatedly execute in catClientInfoString. 
*/ + int allpaused = 0; + if (server.io_threads_num > 1 && !server.crashing && + pthread_equal(server.main_thread_id, pthread_self())) + { + allpaused = 1; + pauseAllIOThreads(); + } + listRewind(server.clients,&li); while ((ln = listNext(&li)) != NULL) { client = listNodeValue(ln); @@ -3023,6 +3132,8 @@ sds getAllClientsInfoString(int type) { o = catClientInfoString(o,client); o = sdscatlen(o,"\n",1); } + + if (allpaused) resumeAllIOThreads(); return o; } @@ -4331,388 +4442,6 @@ void processEventsWhileBlocked(void) { server.cmd_time_snapshot = prev_cmd_time_snapshot; } -/* ========================================================================== - * Threaded I/O - * ========================================================================== */ - -#define IO_THREADS_MAX_NUM 128 - -typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending { - redisAtomic unsigned long value; -} threads_pending; - -pthread_t io_threads[IO_THREADS_MAX_NUM]; -pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; -threads_pending io_threads_pending[IO_THREADS_MAX_NUM]; -int io_threads_op; /* IO_THREADS_OP_IDLE, IO_THREADS_OP_READ or IO_THREADS_OP_WRITE. */ // TODO: should access to this be atomic??! - -/* This is the list of clients each thread will serve when threaded I/O is - * used. We spawn io_threads_num-1 threads, since one is the main thread - * itself. */ -list *io_threads_list[IO_THREADS_MAX_NUM]; - -static inline unsigned long getIOPendingCount(int i) { - unsigned long count = 0; - atomicGetWithSync(io_threads_pending[i].value, count); - return count; -} - -static inline void setIOPendingCount(int i, unsigned long count) { - atomicSetWithSync(io_threads_pending[i].value, count); -} - -void *IOThreadMain(void *myid) { - /* The ID is the thread number (from 0 to server.io_threads_num-1), and is - * used by the thread to just manipulate a single sub-array of clients. */ - long id = (unsigned long)myid; - char thdname[16]; - - snprintf(thdname, sizeof(thdname), "io_thd_%ld", id); - redis_set_thread_title(thdname); - redisSetCpuAffinity(server.server_cpulist); - makeThreadKillable(); - - while(1) { - /* Wait for start */ - for (int j = 0; j < 1000000; j++) { - if (getIOPendingCount(id) != 0) break; - } - - /* Give the main thread a chance to stop this thread. */ - if (getIOPendingCount(id) == 0) { - pthread_mutex_lock(&io_threads_mutex[id]); - pthread_mutex_unlock(&io_threads_mutex[id]); - continue; - } - - serverAssert(getIOPendingCount(id) != 0); - - /* Process: note that the main thread will never touch our list - * before we drop the pending count to 0. */ - listIter li; - listNode *ln; - listRewind(io_threads_list[id],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - if (io_threads_op == IO_THREADS_OP_WRITE) { - writeToClient(c,0); - } else if (io_threads_op == IO_THREADS_OP_READ) { - readQueryFromClient(c->conn); - } else { - serverPanic("io_threads_op value is unknown"); - } - } - listEmpty(io_threads_list[id]); - setIOPendingCount(id, 0); - } -} - -/* Initialize the data structures needed for threaded I/O. */ -void initThreadedIO(void) { - server.io_threads_active = 0; /* We start with threads not active. */ - - /* Indicate that io-threads are currently idle */ - io_threads_op = IO_THREADS_OP_IDLE; - - /* Don't spawn any thread if the user selected a single thread: - * we'll handle I/O directly from the main thread. 
*/ - if (server.io_threads_num == 1) return; - - if (server.io_threads_num > IO_THREADS_MAX_NUM) { - serverLog(LL_WARNING,"Fatal: too many I/O threads configured. " - "The maximum number is %d.", IO_THREADS_MAX_NUM); - exit(1); - } - - /* Spawn and initialize the I/O threads. */ - for (int i = 0; i < server.io_threads_num; i++) { - /* Things we do for all the threads including the main thread. */ - io_threads_list[i] = listCreate(); - if (i == 0) continue; /* Thread 0 is the main thread. */ - - /* Things we do only for the additional threads. */ - pthread_t tid; - pthread_mutex_init(&io_threads_mutex[i],NULL); - setIOPendingCount(i, 0); - pthread_mutex_lock(&io_threads_mutex[i]); /* Thread will be stopped. */ - if (pthread_create(&tid,NULL,IOThreadMain,(void*)(long)i) != 0) { - serverLog(LL_WARNING,"Fatal: Can't initialize IO thread."); - exit(1); - } - io_threads[i] = tid; - } -} - -void killIOThreads(void) { - int err, j; - for (j = 0; j < server.io_threads_num; j++) { - if (io_threads[j] == pthread_self()) continue; - if (io_threads[j] && pthread_cancel(io_threads[j]) == 0) { - if ((err = pthread_join(io_threads[j],NULL)) != 0) { - serverLog(LL_WARNING, - "IO thread(tid:%lu) can not be joined: %s", - (unsigned long)io_threads[j], strerror(err)); - } else { - serverLog(LL_WARNING, - "IO thread(tid:%lu) terminated",(unsigned long)io_threads[j]); - } - } - } -} - -void startThreadedIO(void) { - serverAssert(server.io_threads_active == 0); - for (int j = 1; j < server.io_threads_num; j++) - pthread_mutex_unlock(&io_threads_mutex[j]); - server.io_threads_active = 1; -} - -void stopThreadedIO(void) { - /* We may have still clients with pending reads when this function - * is called: handle them before stopping the threads. */ - handleClientsWithPendingReadsUsingThreads(); - serverAssert(server.io_threads_active == 1); - for (int j = 1; j < server.io_threads_num; j++) - pthread_mutex_lock(&io_threads_mutex[j]); - server.io_threads_active = 0; -} - -/* This function checks if there are not enough pending clients to justify - * taking the I/O threads active: in that case I/O threads are stopped if - * currently active. We track the pending writes as a measure of clients - * we need to handle in parallel, however the I/O threading is disabled - * globally for reads as well if we have too little pending clients. - * - * The function returns 0 if the I/O threading should be used because there - * are enough active threads, otherwise 1 is returned and the I/O threads - * could be possibly stopped (if already active) as a side effect. */ -int stopThreadedIOIfNeeded(void) { - int pending = listLength(server.clients_pending_write); - - /* Return ASAP if IO threads are disabled (single threaded mode). */ - if (server.io_threads_num == 1) return 1; - - if (pending < (server.io_threads_num*2)) { - if (server.io_threads_active) stopThreadedIO(); - return 1; - } else { - return 0; - } -} - -/* This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. */ -int handleClientsWithPendingWritesUsingThreads(void) { - int processed = listLength(server.clients_pending_write); - if (processed == 0) return 0; /* Return ASAP if there are no clients. 
*/ - - /* If I/O threads are disabled or we have few clients to serve, don't - * use I/O threads, but the boring synchronous code. */ - if (server.io_threads_num == 1 || stopThreadedIOIfNeeded()) { - return handleClientsWithPendingWrites(); - } - - /* Start threads if needed. */ - if (!server.io_threads_active) startThreadedIO(); - - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_write,&li); - int item_id = 0; - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - c->flags &= ~CLIENT_PENDING_WRITE; - - /* Remove clients from the list of pending writes since - * they are going to be closed ASAP. */ - if (c->flags & CLIENT_CLOSE_ASAP) { - listUnlinkNode(server.clients_pending_write, ln); - continue; - } - - /* Since all replicas and replication backlog use global replication - * buffer, to guarantee data accessing thread safe, we must put all - * replicas client into io_threads_list[0] i.e. main thread handles - * sending the output buffer of all replicas. */ - if (unlikely(clientTypeIsSlave(c))) { - listAddNodeTail(io_threads_list[0],c); - continue; - } - - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id],c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_WRITE; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - writeToClient(c,0); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while(1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) - pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to install the write handler where - * needed. */ - listRewind(server.clients_pending_write,&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - - /* Update the client in the mem usage after we're done processing it in the io-threads */ - updateClientMemUsageAndBucket(c); - - /* Install the write handler if there are pending writes in some - * of the clients. */ - if (clientHasPendingReplies(c)) { - installClientWriteHandler(c); - } - } - while(listLength(server.clients_pending_write) > 0) { - listUnlinkNode(server.clients_pending_write, server.clients_pending_write->head); - } - - /* Update processed count on server */ - server.stat_io_writes_processed += processed; - - return processed; -} - -/* Return 1 if we want to handle the client read later using threaded I/O. - * This is called by the readable handler of the event loop. - * As a side effect of calling this function the client is put in the - * pending read clients and flagged as such. 
*/ -int postponeClientRead(client *c) { - if (server.io_threads_active && - server.io_threads_do_reads && - !ProcessingEventsWhileBlocked && - !(c->flags & (CLIENT_MASTER|CLIENT_SLAVE|CLIENT_BLOCKED)) && - io_threads_op == IO_THREADS_OP_IDLE) - { - listAddNodeHead(server.clients_pending_read,c); - c->pending_read_list_node = listFirst(server.clients_pending_read); - return 1; - } else { - return 0; - } -} - -/* When threaded I/O is also enabled for the reading + parsing side, the - * readable handler will just put normal clients into a queue of clients to - * process (instead of serving them synchronously). This function runs - * the queue using the I/O threads, and process them in order to accumulate - * the reads in the buffers, and also parse the first command available - * rendering it in the client structures. - * This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. */ -int handleClientsWithPendingReadsUsingThreads(void) { - if (!server.io_threads_active || !server.io_threads_do_reads) return 0; - int processed = listLength(server.clients_pending_read); - if (processed == 0) return 0; - - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_read,&li); - int item_id = 0; - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id],c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_READ; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0],&li); - while((ln = listNext(&li))) { - client *c = listNodeValue(ln); - readQueryFromClient(c->conn); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while(1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) - pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to process the new buffers. */ - while(listLength(server.clients_pending_read)) { - ln = listFirst(server.clients_pending_read); - client *c = listNodeValue(ln); - listDelNode(server.clients_pending_read,ln); - c->pending_read_list_node = NULL; - - serverAssert(!(c->flags & CLIENT_BLOCKED)); - - if (beforeNextClient(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. */ - continue; - } - - /* Once io-threads are idle we can update the client in the mem usage */ - updateClientMemUsageAndBucket(c); - - if (processPendingCommandAndInputBuffer(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. */ - continue; - } - - /* We may have pending replies if a thread readQueryFromClient() produced - * replies and did not put the client in pending write queue (it can't). 
- */
-        if (!(c->flags & CLIENT_PENDING_WRITE) && clientHasPendingReplies(c))
-            putClientInPendingWriteQueue(c);
-    }
-
-    /* Update processed count on server */
-    server.stat_io_reads_processed += processed;
-
-    return processed;
-}
-
 /* Returns the actual client eviction limit based on current configuration or
  * 0 if no limit. */
 size_t getClientEvictionLimit(void) {
@@ -4752,11 +4481,34 @@ void evictClients(void) {
         listNode *ln = listNext(&bucket_iter);
         if (ln) {
             client *c = ln->value;
-            sds ci = catClientInfoString(sdsempty(),c);
-            serverLog(LL_NOTICE, "Evicting client: %s", ci);
-            freeClient(c);
-            sdsfree(ci);
-            server.stat_evictedclients++;
+            size_t last_memory = c->last_memory_usage;
+            int tid = c->running_tid;
+            if (tid != IOTHREAD_MAIN_THREAD_ID) {
+                pauseIOThread(tid);
+                /* If the client is running in an IO thread, refresh its memory
+                 * usage and bucket before deciding. They are updated only in
+                 * the main thread (e.g. while processing commands and in
+                 * clientsCron), so they may be stale. To avoid incorrectly
+                 * evicting a client, update them again here and evict only if
+                 * its memory usage did not decrease or its bucket did not
+                 * change. */
+                updateClientMemUsageAndBucket(c);
+            }
+            if (c->last_memory_usage >= last_memory ||
+                c->mem_usage_bucket == &server.client_mem_usage_buckets[curr_bucket])
+            {
+                sds ci = catClientInfoString(sdsempty(),c);
+                serverLog(LL_NOTICE, "Evicting client: %s", ci);
+                freeClient(c);
+                sdsfree(ci);
+                server.stat_evictedclients++;
+            }
+            if (tid != IOTHREAD_MAIN_THREAD_ID) {
+                resumeIOThread(tid);
+                /* The 'next' of 'bucket_iter' may have changed after updating the
+                 * client's memory usage and freeing the client, so reset 'bucket_iter'. */
+                listRewind(server.client_mem_usage_buckets[curr_bucket].clients, &bucket_iter);
+            }
         } else {
             curr_bucket--;
             if (curr_bucket < 0) {
diff --git a/src/replication.c b/src/replication.c
index abf930e61..79a55d39b 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2925,7 +2925,7 @@ write_error: /* Handle sendCommand() errors. */
 int connectWithMaster(void) {
-    server.repl_transfer_s = connCreate(connTypeOfReplication());
+    server.repl_transfer_s = connCreate(server.el, connTypeOfReplication());
     if (connConnect(server.repl_transfer_s, server.masterhost, server.masterport,
                 server.bind_source_addr, syncWithMaster) == C_ERR) {
         serverLog(LL_WARNING,"Unable to connect to MASTER: %s",
diff --git a/src/server.c b/src/server.c
index 4b729fede..0b4c95ce8 100644
--- a/src/server.c
+++ b/src/server.c
@@ -963,7 +963,7 @@ void removeClientFromMemUsageBucket(client *c, int allow_eviction) {
  * returns 1 if client eviction for this client is allowed, 0 otherwise. */
 int updateClientMemUsageAndBucket(client *c) {
-    serverAssert(io_threads_op == IO_THREADS_OP_IDLE && c->conn);
+    serverAssert(pthread_equal(pthread_self(), server.main_thread_id) && c->conn);
     int allow_eviction = clientEvictionAllowed(c);
     removeClientFromMemUsageBucket(c, allow_eviction);
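The eviction hunk above is a check-then-recheck: the accounting that nominated a candidate may be stale when the client lives on an IO thread, so the main thread pauses that thread, refreshes the numbers, and acts only if they still justify eviction. Reduced to its decision function (an illustrative sketch, not the patch's code):

    #include <stdio.h>
    #include <stddef.h>

    /* Mirrors the condition in evictClients(): proceed only if the refreshed
     * usage is no smaller than the stale value that nominated the client, or
     * the client still falls in the bucket currently being evicted from. */
    static int still_evictable(size_t stale_mem, size_t fresh_mem,
                               int nominated_bucket, int fresh_bucket) {
        return fresh_mem >= stale_mem || fresh_bucket == nominated_bucket;
    }

    int main(void) {
        /* Usage shrank and the bucket changed: the candidate is spared. */
        printf("%d\n", still_evictable(1 << 20, 1 << 18, 7, 5)); /* prints 0 */
        /* Usage did not shrink: still evictable. */
        printf("%d\n", still_evictable(1 << 20, 1 << 20, 7, 7)); /* prints 1 */
        return 0;
    }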
@@ -1015,6 +1015,7 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) {
  * default server.hz value is 10, so sometimes here we need to process thousands
  * of clients per second, turning this function into a source of latency.
  */
+#define CLIENTS_CRON_PAUSE_IOTHREAD 8
 #define CLIENTS_CRON_MIN_ITERATIONS 5
 void clientsCron(void) {
     /* Try to process at least numclients/server.hz of clients
@@ -1049,6 +1050,15 @@
     ClientsPeakMemInput[zeroidx] = 0;
     ClientsPeakMemOutput[zeroidx] = 0;

+    /* Pause the IO threads that are processing clients, to let us access clients
+     * safely. To avoid burning CPU by pausing all threads at once when there
+     * are many IO threads, we pause them in batches. */
+    static int start = 1, end = 0;
+    if (server.io_threads_num > 1 && listLength(server.clients) > 0) {
+        end = start + CLIENTS_CRON_PAUSE_IOTHREAD - 1;
+        if (end >= server.io_threads_num) end = server.io_threads_num - 1;
+        pauseIOThreadsRange(start, end);
+    }

     while(listLength(server.clients) && iterations--) {
         client *c;
@@ -1059,6 +1069,15 @@
         head = listFirst(server.clients);
         c = listNodeValue(head);
         listRotateHeadToTail(server.clients);
+
+        if (c->running_tid != IOTHREAD_MAIN_THREAD_ID &&
+            !(c->running_tid >= start && c->running_tid <= end))
+        {
+            /* Skip clients that are being processed by the IO threads that
+             * are not paused. */
+            continue;
+        }
+
         /* The following functions do different service checks on the client.
          * The protocol is that they return non-zero if the client was
          * terminated. */
@@ -1080,6 +1099,14 @@
         if (closeClientOnOutputBufferLimitReached(c, 0)) continue;
     }
+
+    /* Resume the IO threads that were paused. */
+    if (end) {
+        resumeIOThreadsRange(start, end);
+        start = end + 1;
+        if (start >= server.io_threads_num) start = 1;
+        end = 0;
+    }
 }

 /* This function handles 'background' operations we are required to do
@@ -1528,9 +1555,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
         migrateCloseTimedoutSockets();
     }

-    /* Stop the I/O threads if we don't have enough pending work. */
-    stopThreadedIOIfNeeded();
-
     /* Resize tracking keys table if needed. This is also done at every
      * command execution, but we want to be sure that if the last command
      * executed changes the value via CONFIG SET, the server will perform
@@ -1682,24 +1706,28 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
      * events to handle. */
     if (ProcessingEventsWhileBlocked) {
         uint64_t processed = 0;
-        processed += handleClientsWithPendingReadsUsingThreads();
-        processed += connTypeProcessPendingData();
+        processed += connTypeProcessPendingData(server.el);
         if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE)
             flushAppendOnlyFile(0);
         processed += handleClientsWithPendingWrites();
         processed += freeClientsInAsyncFreeQueue();
+
+        /* Process clients handed over by IO threads during the blocking call. */
+        processClientsOfAllIOThreads();
+        /* New connections may have been established while blocked, and clients
+         * from IO threads may have replies to write; make sure they are
+         * promptly sent back to the IO threads. */
+        processed += sendPendingClientsToIOThreads();
+
         server.events_processed_while_blocked += processed;
         return;
     }

-    /* We should handle pending reads clients ASAP after event loop. */
-    handleClientsWithPendingReadsUsingThreads();
-
     /* Handle pending data(typical TLS). (must be done before flushAppendOnlyFile) */
-    connTypeProcessPendingData();
+    connTypeProcessPendingData(server.el);

     /* If any connection type(typical TLS) still has pending unread data don't sleep at all. */
-    int dont_sleep = connTypeHasPendingData();
+    int dont_sleep = connTypeHasPendingData(server.el);

     /* Call the Redis Cluster before sleep function. Note that this function
      * may change the state of Redis Cluster (from ok to fail or vice versa),
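The batched pausing added to clientsCron() above walks a window of at most CLIENTS_CRON_PAUSE_IOTHREAD thread ids per cron run and wraps back to thread 1, since thread 0 is the main thread and never needs pausing. The window arithmetic can be checked in isolation with a stand-alone loop (io_threads_num below is a made-up configuration):

    #include <stdio.h>

    #define CLIENTS_CRON_PAUSE_IOTHREAD 8

    int main(void) {
        int io_threads_num = 20;   /* hypothetical configuration */
        int start = 1, end = 0;
        for (int cron = 0; cron < 6; cron++) {
            end = start + CLIENTS_CRON_PAUSE_IOTHREAD - 1;
            if (end >= io_threads_num) end = io_threads_num - 1;
            printf("cron #%d: pause io threads %d..%d\n", cron, start, end);
            /* ... inspect the clients owned by threads [start, end] ... */
            start = end + 1;                        /* next batch */
            if (start >= io_threads_num) start = 1; /* wrap, skipping thread 0 */
            end = 0;
        }
        return 0;
    }

With 19 worker threads this prints batches 1..8, 9..16, 17..19 and then starts over, so every IO thread is inspected once per three cron runs instead of all of them being stalled on every run.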
@@ -1765,8 +1793,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     long long prev_fsynced_reploff = server.fsynced_reploff;

     /* Write the AOF buffer on disk,
-     * must be done before handleClientsWithPendingWritesUsingThreads,
-     * in case of appendfsync=always. */
+     * must be done before handleClientsWithPendingWrites and
+     * sendPendingClientsToIOThreads, in case of appendfsync=always. */
     if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE)
         flushAppendOnlyFile(0);

@@ -1788,7 +1816,10 @@
     }

     /* Handle writes with pending output buffers. */
-    handleClientsWithPendingWritesUsingThreads();
+    handleClientsWithPendingWrites();
+
+    /* Let the IO threads handle their pending clients. */
+    sendPendingClientsToIOThreads();

     /* Record cron time in beforeSleep. This does not include the time consumed by AOF writing and IO writing above. */
     monotime cron_start_time_after_write = getMonotonicUs();
@@ -2117,6 +2148,7 @@ void initServerConfig(void) {
     memset(server.blocked_clients_by_type,0,
            sizeof(server.blocked_clients_by_type));
     server.shutdown_asap = 0;
+    server.crashing = 0;
     server.shutdown_flags = 0;
     server.shutdown_mstime = 0;
     server.cluster_module_flags = CLUSTER_MODULE_FLAG_NONE;
@@ -2583,9 +2615,9 @@ void resetServerStats(void) {
     server.stat_sync_full = 0;
     server.stat_sync_partial_ok = 0;
     server.stat_sync_partial_err = 0;
-    server.stat_io_reads_processed = 0;
+    atomicSet(server.stat_io_reads_processed, 0);
     atomicSet(server.stat_total_reads_processed, 0);
-    server.stat_io_writes_processed = 0;
+    atomicSet(server.stat_io_writes_processed, 0);
     atomicSet(server.stat_total_writes_processed, 0);
     atomicSet(server.stat_client_qbuf_limit_disconnections, 0);
     server.stat_client_outbuf_limit_disconnections = 0;
@@ -2778,6 +2810,7 @@ void initServer(void) {
     server.aof_last_write_errno = 0;
     server.repl_good_slaves_count = 0;
     server.last_sig_received = 0;
+    memset(server.io_threads_clients_num, 0, sizeof(server.io_threads_clients_num));

     /* Initiate acl info struct */
     server.acl_info.invalid_cmd_accesses = 0;
@@ -5535,7 +5568,7 @@ void releaseInfoSectionDict(dict *sec) {
  * The resulting dictionary should be released with releaseInfoSectionDict. */
 dict *genInfoSectionDict(robj **argv, int argc, char **defaults, int *out_all, int *out_everything) {
     char *default_sections[] = {
-        "server", "clients", "memory", "persistence", "stats", "replication",
+        "server", "clients", "memory", "persistence", "stats", "replication", "threads",
         "cpu", "module_list", "errorstats", "cluster", "keyspace", "keysizes", NULL};
     if (!defaults) defaults = default_sections;
@@ -5886,6 +5919,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
     long long current_active_defrag_time = server.stat_last_active_defrag_time ?
     (long long) elapsedUs(server.stat_last_active_defrag_time): 0;
     long long stat_client_qbuf_limit_disconnections;
+    long long stat_io_reads_processed, stat_io_writes_processed;
     atomicGet(server.stat_total_reads_processed, stat_total_reads_processed);
     atomicGet(server.stat_total_writes_processed, stat_total_writes_processed);
     atomicGet(server.stat_net_input_bytes, stat_net_input_bytes);
@@ -5893,6 +5927,8 @@
     atomicGet(server.stat_net_repl_input_bytes, stat_net_repl_input_bytes);
     atomicGet(server.stat_net_repl_output_bytes, stat_net_repl_output_bytes);
     atomicGet(server.stat_client_qbuf_limit_disconnections, stat_client_qbuf_limit_disconnections);
+    atomicGet(server.stat_io_reads_processed, stat_io_reads_processed);
+    atomicGet(server.stat_io_writes_processed, stat_io_writes_processed);

     if (sections++) info = sdscat(info,"\r\n");
     info = sdscatprintf(info, "# Stats\r\n" FMTARGS(
@@ -5944,8 +5980,8 @@
         "dump_payload_sanitizations:%lld\r\n", server.stat_dump_payload_sanitizations,
         "total_reads_processed:%lld\r\n", stat_total_reads_processed,
         "total_writes_processed:%lld\r\n", stat_total_writes_processed,
-        "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed,
-        "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed,
+        "io_threaded_reads_processed:%lld\r\n", stat_io_reads_processed,
+        "io_threaded_writes_processed:%lld\r\n", stat_io_writes_processed,
         "client_query_buffer_limit_disconnections:%lld\r\n", stat_client_qbuf_limit_disconnections,
         "client_output_buffer_limit_disconnections:%lld\r\n", server.stat_client_outbuf_limit_disconnections,
         "reply_buffer_shrinks:%lld\r\n", server.stat_reply_buffer_shrinks,
@@ -6094,6 +6130,15 @@
 #endif /* RUSAGE_THREAD */
     }

+    /* Threads */
+    if (all_sections || (dictFind(section_dict,"threads") != NULL)) {
+        if (sections++) info = sdscat(info,"\r\n");
+        info = sdscatprintf(info, "# Threads\r\n");
+        for (j = 0; j < server.io_threads_num; j++) {
+            info = sdscatprintf(info, "io_thread_%d:clients=%d\r\n", j, server.io_threads_clients_num[j]);
+        }
+    }
+
     /* Modules */
     if (all_sections || (dictFind(section_dict,"module_list") != NULL) || (dictFind(section_dict,"modules") != NULL)) {
         if (sections++) info = sdscat(info,"\r\n");
diff --git a/src/server.h b/src/server.h
index 205d73c68..bc965999e 100644
--- a/src/server.h
+++ b/src/server.h
@@ -61,6 +61,7 @@ typedef long long ustime_t; /* microsecond time type. */
                              N-elements flat arrays */
 #include "rax.h"     /* Radix tree */
 #include "connection.h" /* Connection abstraction */
+#include "eventnotifier.h" /* Event notification */

 #define REDISMODULE_CORE 1
 typedef struct redisObject robj;
@@ -184,6 +185,14 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
 /* Hash table parameters */
 #define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* Maximum hash table load factor. */

+/* Max number of IO threads */
+#define IO_THREADS_MAX_NUM 128
+
+/* Main thread id for doing IO work. Whether or not IO threads are enabled,
+ * the main thread always does some IO work, so it can be considered as IO
+ * thread 0. */
+#define IOTHREAD_MAIN_THREAD_ID 0
+
 /* Command flags. Please check the definition of struct redisCommand in this file
  * for more information about the meaning of every flag. */
 #define CMD_WRITE (1ULL<<0)
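Given the format string in the new Threads section above, an INFO threads reply from a hypothetical 4-thread instance would look roughly like this (the client counts are invented for illustration):

    # Threads
    io_thread_0:clients=25
    io_thread_1:clients=16
    io_thread_2:clients=17
    io_thread_3:clients=15

io_thread_0 is the main thread's share, per the IOTHREAD_MAIN_THREAD_ID convention just introduced.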
@@ -385,11 +394,33 @@
 #define CLIENT_MODULE_PREVENT_AOF_PROP (1ULL<<48) /* Module client do not want to propagate to AOF */
 #define CLIENT_MODULE_PREVENT_REPL_PROP (1ULL<<49) /* Module client do not want to propagate to replica */
 #define CLIENT_REPROCESSING_COMMAND (1ULL<<50) /* The client is re-processing the command. */
-#define CLIENT_REUSABLE_QUERYBUFFER (1ULL<<51) /* The client is using the reusable query buffer. */

 /* Any flag that does not let optimize FLUSH SYNC to run it in bg as blocking client ASYNC */
 #define CLIENT_AVOID_BLOCKING_ASYNC_FLUSH (CLIENT_DENY_BLOCKING|CLIENT_MULTI|CLIENT_LUA_DEBUG|CLIENT_LUA_DEBUG_SYNC|CLIENT_MODULE)

+/* Client flags for client IO */
+#define CLIENT_IO_READ_ENABLED (1ULL<<0) /* Client can read from socket. */
+#define CLIENT_IO_WRITE_ENABLED (1ULL<<1) /* Client can write to socket. */
+#define CLIENT_IO_PENDING_COMMAND (1ULL<<2) /* Similar to CLIENT_PENDING_COMMAND. */
+#define CLIENT_IO_REUSABLE_QUERYBUFFER (1ULL<<3) /* The client is using the reusable query buffer. */
+#define CLIENT_IO_CLOSE_ASAP (1ULL<<4) /* Close this client ASAP in IO thread. */
+
+/* Definitions for client read errors. These error codes are used to indicate
+ * various issues that can occur while reading or parsing data from a client. */
+#define CLIENT_READ_TOO_BIG_INLINE_REQUEST 1
+#define CLIENT_READ_UNBALANCED_QUOTES 2
+#define CLIENT_READ_MASTER_USING_INLINE_PROTOCOL 3
+#define CLIENT_READ_TOO_BIG_MBULK_COUNT_STRING 4
+#define CLIENT_READ_TOO_BIG_BULK_COUNT_STRING 5
+#define CLIENT_READ_EXPECTED_DOLLAR 6
+#define CLIENT_READ_INVALID_BULK_LENGTH 7
+#define CLIENT_READ_UNAUTH_BULK_LENGTH 8
+#define CLIENT_READ_INVALID_MULTIBULK_LENGTH 9
+#define CLIENT_READ_UNAUTH_MBULK_COUNT 10
+#define CLIENT_READ_CONN_DISCONNECTED 11
+#define CLIENT_READ_CONN_CLOSED 12
+#define CLIENT_READ_REACHED_MAX_QUERYBUF 13
+
 /* Client block type (btype field in client structure)
  * if CLIENT_BLOCKED flag is set. */
 typedef enum blocking_type {
@@ -578,6 +609,12 @@ typedef enum {
 #define SHUTDOWN_NOW 4 /* Don't wait for replicas to catch up. */
 #define SHUTDOWN_FORCE 8 /* Don't let errors prevent shutdown. */

+/* IO thread pause status */
+#define IO_THREAD_UNPAUSED 0
+#define IO_THREAD_PAUSING 1
+#define IO_THREAD_PAUSED 2
+#define IO_THREAD_RESUMING 3
+
 /* Command call flags, see call() function */
 #define CMD_CALL_NONE 0
 #define CMD_CALL_PROPAGATE_AOF (1<<0)
@@ -1159,6 +1196,10 @@ typedef struct client {
     uint64_t id; /* Client incremental unique ID. */
     uint64_t flags; /* Client flags: CLIENT_* macros. */
     connection *conn;
+    uint8_t tid; /* ID of the IO thread this client is bound to. */
+    uint8_t running_tid; /* ID of the IO thread this client is currently running on. */
+    uint8_t io_flags; /* Accessed by both main and IO threads, but not modified concurrently. */
+    uint8_t read_error; /* Client read error: CLIENT_READ_* macros. */
     int resp; /* RESP protocol version. Can be 2 or 3. */
     redisDb *db; /* Pointer to currently SELECTed DB. */
     robj *name; /* As set by CLIENT SETNAME. */
@@ -1226,8 +1267,8 @@ typedef struct client {
     sds peerid; /* Cached peer ID. */
     sds sockname; /* Cached connection target address.
*/ listNode *client_list_node; /* list node in client list */ + listNode *io_thread_client_list_node; /* list node in io thread client list */ listNode *postponed_list_node; /* list node within the postponed list */ - listNode *pending_read_list_node; /* list node in clients pending read list */ void *module_blocked_client; /* Pointer to the RedisModuleBlockedClient associated with this * client. This is set in case of module authentication before the * unblocked client is reprocessed to handle reply callbacks. */ @@ -1280,6 +1321,20 @@ typedef struct client { #endif } client; +typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) { + uint8_t id; /* The unique ID assigned, if IO_THREADS_MAX_NUM is more + * than 256, we should also promote the data type. */ + pthread_t tid; /* Pthread ID */ + redisAtomic int paused; /* Paused status for the io thread. */ + aeEventLoop *el; /* Main event loop of io thread. */ + list *pending_clients; /* List of clients with pending writes. */ + list *processing_clients; /* List of clients being processed. */ + eventNotifier *pending_clients_notifier; /* Used to wake up the loop when write should be performed. */ + pthread_mutex_t pending_clients_mutex; /* Mutex for pending write list */ + list *pending_clients_to_main_thread; /* Clients that are waiting to be executed by the main thread. */ + list *clients; /* IO thread managed clients. */ +} IOThread; + /* ACL information */ typedef struct aclInfo { long long user_auth_failures; /* Auth failure counts on user level */ @@ -1568,6 +1623,7 @@ struct redisServer { int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */ unsigned int lruclock; /* Clock for LRU eviction */ volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */ + volatile sig_atomic_t crashing; /* Server is crashing report. */ mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */ int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ @@ -1638,6 +1694,7 @@ struct redisServer { redisAtomic uint64_t next_client_id; /* Next client unique ID. Incremental. */ int protected_mode; /* Don't accept external connections. */ int io_threads_num; /* Number of IO threads to use. */ + int io_threads_clients_num[IO_THREADS_MAX_NUM]; /* Number of clients assigned to each IO thread. */ int io_threads_do_reads; /* Read and parse from IO threads? */ int io_threads_active; /* Is IO threads currently active? */ long long events_processed_while_blocked; /* processEventsWhileBlocked() */ @@ -1710,8 +1767,8 @@ struct redisServer { long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to master, etc.) error replies */ long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. 
*/ - long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */ - long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */ + redisAtomic long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */ + redisAtomic long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */ redisAtomic long long stat_total_reads_processed; /* Total number of read events processed */ redisAtomic long long stat_total_writes_processed; /* Total number of write events processed */ redisAtomic long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ @@ -2461,11 +2518,6 @@ typedef struct { #define OBJ_HASH_KEY 1 #define OBJ_HASH_VALUE 2 -#define IO_THREADS_OP_IDLE 0 -#define IO_THREADS_OP_READ 1 -#define IO_THREADS_OP_WRITE 2 -extern int io_threads_op; - /* Hash-field data type (of t_hash.c) */ typedef mstr hfield; extern mstrKind mstrFieldKind; @@ -2680,9 +2732,6 @@ void whileBlockedCron(void); void blockingOperationStarts(void); void blockingOperationEnds(void); int handleClientsWithPendingWrites(void); -int handleClientsWithPendingWritesUsingThreads(void); -int handleClientsWithPendingReadsUsingThreads(void); -int stopThreadedIOIfNeeded(void); int clientHasPendingReplies(client *c); int updateClientMemUsageAndBucket(client *c); void removeClientFromMemUsageBucket(client *c, int allow_eviction); @@ -2691,13 +2740,32 @@ int writeToClient(client *c, int handler_installed); void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); -void initThreadedIO(void); client *lookupClientByID(uint64_t id); int authRequired(client *c); void putClientInPendingWriteQueue(client *c); /* reply macros */ #define ADD_REPLY_BULK_CBUFFER_STRING_CONSTANT(c, str) addReplyBulkCBuffer(c, str, strlen(str)) +/* iothread.c - the threaded io implementation */ +void initThreadedIO(void); +void killIOThreads(void); +void pauseIOThread(int id); +void resumeIOThread(int id); +void pauseAllIOThreads(void); +void resumeAllIOThreads(void); +void pauseIOThreadsRange(int start, int end); +void resumeIOThreadsRange(int start, int end); +int resizeAllIOThreadsEventLoops(size_t newsize); +int sendPendingClientsToIOThreads(void); +void enqueuePendingClientsToMainThread(client *c, int unbind); +void putInPendingClienstForIOThreads(client *c); +void handleClientReadError(client *c); +void unbindClientFromIOThreadEventLoop(client *c); +void processClientsOfAllIOThreads(void); +void assignClientToIOThread(client *c); +void fetchClientFromIOThread(client *c); +int isClientMustHandledByMainThread(client *c); + /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); void reqresSaveClientReplyOffset(client *c); @@ -3901,7 +3969,6 @@ void xorDigest(unsigned char *digest, const void *ptr, size_t len); sds catSubCommandFullname(const char *parent_name, const char *sub_name); void commandAddSubcommand(struct redisCommand *parent, struct redisCommand *subcommand, const char *declared_name); void debugDelay(int usec); -void killIOThreads(void); void killThreads(void); void makeThreadKillable(void); void swapMainDbWithTempDb(redisDb *tempDb); diff --git a/src/socket.c b/src/socket.c index 33c28588a..fd6335251 100644 --- a/src/socket.c +++ b/src/socket.c @@ -53,11 +53,12 @@ static ConnectionType CT_Socket; * be embedded in different structs, not just client. 
*/ -static connection *connCreateSocket(void) { +static connection *connCreateSocket(struct aeEventLoop *el) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Socket; conn->fd = -1; conn->iovcnt = IOV_MAX; + conn->el = el; return conn; } @@ -72,9 +73,9 @@ static connection *connCreateSocket(void) { * is not in an error state (which is not possible for a socket connection, * but could but possible with other protocols). */ -static connection *connCreateAcceptedSocket(int fd, void *priv) { +static connection *connCreateAcceptedSocket(struct aeEventLoop *el, int fd, void *priv) { UNUSED(priv); - connection *conn = connCreateSocket(); + connection *conn = connCreateSocket(el); conn->fd = fd; conn->state = CONN_STATE_ACCEPTING; return conn; @@ -93,7 +94,7 @@ static int connSocketConnect(connection *conn, const char *addr, int port, const conn->state = CONN_STATE_CONNECTING; conn->conn_handler = connect_handler; - aeCreateFileEvent(server.el, conn->fd, AE_WRITABLE, + aeCreateFileEvent(conn->el, conn->fd, AE_WRITABLE, conn->type->ae_handler, conn); return C_OK; @@ -114,7 +115,7 @@ static void connSocketShutdown(connection *conn) { /* Close the connection and free resources. */ static void connSocketClose(connection *conn) { if (conn->fd != -1) { - aeDeleteFileEvent(server.el,conn->fd, AE_READABLE | AE_WRITABLE); + if (conn->el) aeDeleteFileEvent(conn->el, conn->fd, AE_READABLE | AE_WRITABLE); close(conn->fd); conn->fd = -1; } @@ -190,6 +191,15 @@ static int connSocketAccept(connection *conn, ConnectionCallbackFunc accept_hand return ret; } +/* Rebind the connection to another event loop, read/write handlers must not + * be installed in the current event loop, otherwise it will cause two event + * loops to manage the same connection at the same time. */ +static int connSocketRebindEventLoop(connection *conn, aeEventLoop *el) { + serverAssert(!conn->el && !conn->read_handler && !conn->write_handler); + conn->el = el; + return C_OK; +} + /* Register a write handler, to be called when the connection is writable. * If NULL, the existing handler is removed. 
* @@ -207,9 +217,9 @@ static int connSocketSetWriteHandler(connection *conn, ConnectionCallbackFunc fu else conn->flags &= ~CONN_FLAG_WRITE_BARRIER; if (!conn->write_handler) - aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + aeDeleteFileEvent(conn->el,conn->fd,AE_WRITABLE); else - if (aeCreateFileEvent(server.el,conn->fd,AE_WRITABLE, + if (aeCreateFileEvent(conn->el,conn->fd,AE_WRITABLE, conn->type->ae_handler,conn) == AE_ERR) return C_ERR; return C_OK; } @@ -222,9 +232,9 @@ static int connSocketSetReadHandler(connection *conn, ConnectionCallbackFunc fun conn->read_handler = func; if (!conn->read_handler) - aeDeleteFileEvent(server.el,conn->fd,AE_READABLE); + aeDeleteFileEvent(conn->el,conn->fd,AE_READABLE); else - if (aeCreateFileEvent(server.el,conn->fd, + if (aeCreateFileEvent(conn->el,conn->fd, AE_READABLE,conn->type->ae_handler,conn) == AE_ERR) return C_ERR; return C_OK; } @@ -250,7 +260,7 @@ static void connSocketEventHandler(struct aeEventLoop *el, int fd, void *clientD conn->state = CONN_STATE_CONNECTED; } - if (!conn->write_handler) aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + if (!conn->write_handler) aeDeleteFileEvent(conn->el, conn->fd, AE_WRITABLE); if (!callHandler(conn, conn->conn_handler)) return; conn->conn_handler = NULL; @@ -291,7 +301,6 @@ static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int int cport, cfd; int max = server.max_new_conns_per_cycle; char cip[NET_IP_STR_LEN]; - UNUSED(el); UNUSED(mask); UNUSED(privdata); @@ -304,7 +313,7 @@ static void connSocketAcceptHandler(aeEventLoop *el, int fd, void *privdata, int return; } serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(connCreateAcceptedSocket(cfd, NULL),0,cip); + acceptCommonHandler(connCreateAcceptedSocket(el,cfd,NULL), 0, cip); } } @@ -397,6 +406,10 @@ static ConnectionType CT_Socket = { .blocking_connect = connSocketBlockingConnect, .accept = connSocketAccept, + /* event loop */ + .unbind_event_loop = NULL, + .rebind_event_loop = connSocketRebindEventLoop, + /* IO */ .write = connSocketWrite, .writev = connSocketWritev, diff --git a/src/tls.c b/src/tls.c index 3cc504ad1..3c7b5c0a0 100644 --- a/src/tls.c +++ b/src/tls.c @@ -75,10 +75,6 @@ static int parseProtocolsConfig(const char *str) { return protocols; } -/* list of connections with pending data already read from the socket, but not - * served to the reader yet. */ -static list *pending_list = NULL; - /** * OpenSSL global initialization and locking handling callbacks. * Note that this is only required for OpenSSL < 1.1.0. 
@@ -144,8 +140,6 @@ static void tlsInit(void) {
     if (!RAND_poll()) {
         serverLog(LL_WARNING, "OpenSSL: Failed to seed random number generator.");
     }
-
-    pending_list = listCreate();
 }
 
 static void tlsCleanup(void) {
@@ -435,20 +429,21 @@ typedef struct tls_connection {
     listNode *pending_list_node;
 } tls_connection;
 
-static connection *createTLSConnection(int client_side) {
+static connection *createTLSConnection(struct aeEventLoop *el, int client_side) {
     SSL_CTX *ctx = redis_tls_ctx;
     if (client_side && redis_tls_client_ctx)
         ctx = redis_tls_client_ctx;
     tls_connection *conn = zcalloc(sizeof(tls_connection));
     conn->c.type = &CT_TLS;
     conn->c.fd = -1;
+    conn->c.el = el;
     conn->c.iovcnt = IOV_MAX;
     conn->ssl = SSL_new(ctx);
     return (connection *) conn;
 }
 
-static connection *connCreateTLS(void) {
-    return createTLSConnection(1);
+static connection *connCreateTLS(struct aeEventLoop *el) {
+    return createTLSConnection(el, 1);
 }
 
 /* Fetch the latest OpenSSL error and store it in the connection */
@@ -468,10 +463,11 @@ static void updateTLSError(tls_connection *conn) {
  * Callers should use connGetState() and verify the created connection
  * is not in an error state. */
-static connection *connCreateAcceptedTLS(int fd, void *priv) {
+static connection *connCreateAcceptedTLS(struct aeEventLoop *el, int fd, void *priv) {
     int require_auth = *(int *)priv;
-    tls_connection *conn = (tls_connection *) createTLSConnection(0);
+    tls_connection *conn = (tls_connection *) createTLSConnection(el, 0);
     conn->c.fd = fd;
+    conn->c.el = el;
     conn->c.state = CONN_STATE_ACCEPTING;
 
     if (!conn->ssl) {
@@ -575,17 +571,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update
 }
 
 static void registerSSLEvent(tls_connection *conn, WantIOType want) {
-    int mask = aeGetFileEvents(server.el, conn->c.fd);
+    int mask = aeGetFileEvents(conn->c.el, conn->c.fd);
 
     switch (want) {
         case WANT_READ:
-            if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE);
-            if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE,
+            if (mask & AE_WRITABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE);
+            if (!(mask & AE_READABLE)) aeCreateFileEvent(conn->c.el, conn->c.fd, AE_READABLE,
                     tlsEventHandler, conn);
             break;
         case WANT_WRITE:
-            if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE);
-            if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE,
+            if (mask & AE_READABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE);
+            if (!(mask & AE_WRITABLE)) aeCreateFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE,
                     tlsEventHandler, conn);
             break;
         default:
@@ -595,19 +591,42 @@ static void registerSSLEvent(tls_connection *conn, WantIOType want) {
 }
 
 static void updateSSLEvent(tls_connection *conn) {
-    int mask = aeGetFileEvents(server.el, conn->c.fd);
+    serverAssert(conn->c.el);
+    int mask = aeGetFileEvents(conn->c.el, conn->c.fd);
     int need_read = conn->c.read_handler || (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ);
     int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE);
 
     if (need_read && !(mask & AE_READABLE))
-        aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn);
+        aeCreateFileEvent(conn->c.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn);
 
     if (!need_read && (mask & AE_READABLE))
-        aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE);
+        aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE);
 
     if (need_write && !(mask & AE_WRITABLE))
-        aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE,
-                          tlsEventHandler, conn);
+        aeCreateFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn);
 
     if (!need_write && (mask & AE_WRITABLE))
-        aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE);
+        aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE);
+}
+
+/* Add a connection to the list of connections with pending data that has
+ * already been read from the socket but has not yet been served to the reader. */
+static void tlsPendingAdd(tls_connection *conn) {
+    if (!conn->c.el->privdata[1])
+        conn->c.el->privdata[1] = listCreate();
+
+    list *pending_list = conn->c.el->privdata[1];
+    if (!conn->pending_list_node) {
+        listAddNodeTail(pending_list, conn);
+        conn->pending_list_node = listLast(pending_list);
+    }
+}
+
+/* Removes a connection from the list of connections with pending data. */
+static void tlsPendingRemove(tls_connection *conn) {
+    if (conn->pending_list_node) {
+        list *pending_list = conn->c.el->privdata[1];
+        listDelNode(pending_list, conn->pending_list_node);
+        conn->pending_list_node = NULL;
+    }
 }
 
 static void tlsHandleEvent(tls_connection *conn, int mask) {
@@ -718,13 +737,9 @@ static void tlsHandleEvent(tls_connection *conn, int mask) {
      * to a list of pending connection that should be handled anyway. */
    if ((mask & AE_READABLE)) {
        if (SSL_pending(conn->ssl) > 0) {
-            if (!conn->pending_list_node) {
-                listAddNodeTail(pending_list, conn);
-                conn->pending_list_node = listLast(pending_list);
-            }
+            tlsPendingAdd(conn);
        } else if (conn->pending_list_node) {
-            listDelNode(pending_list, conn->pending_list_node);
-            conn->pending_list_node = NULL;
+            tlsPendingRemove(conn);
        }
    }
 
@@ -734,7 +749,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) {
             break;
     }
 
-    updateSSLEvent(conn);
+    /* The event loop may have been unbound during the event processing above. */
+    if (conn->c.el) updateSSLEvent(conn);
 }
 
 static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask) {
@@ -748,7 +764,6 @@ static void tlsAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask)
     int cport, cfd;
     int max = server.max_new_tls_conns_per_cycle;
     char cip[NET_IP_STR_LEN];
-    UNUSED(el);
     UNUSED(mask);
     UNUSED(privdata);
 
@@ -761,7 +776,7 @@ static void tlsAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask)
             return;
         }
         serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport);
-        acceptCommonHandler(connCreateAcceptedTLS(cfd, &server.tls_auth_clients),0,cip);
+        acceptCommonHandler(connCreateAcceptedTLS(el,cfd,&server.tls_auth_clients), 0, cip);
     }
 }
 
@@ -806,6 +821,7 @@ static void connTLSClose(connection *conn_) {
     }
 
     if (conn->pending_list_node) {
+        list *pending_list = conn->c.el->privdata[1];
         listDelNode(pending_list, conn->pending_list_node);
         conn->pending_list_node = NULL;
     }
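`tlsPendingAdd()`/`tlsPendingRemove()` above keep the pending-data list per event loop, in `el->privdata[1]`, instead of in one global list owned by the main thread. The hunks below change `tlsHasPendingData()`/`tlsProcessPendingData()` to take the loop as an argument. A sketch of how a thread's loop is expected to drain its own list each iteration; the `drainTLSPendingData()` call site is an assumption (in the server these functions are reached through the `ConnectionType` wrappers), only the two functions and `aeSetDontWait()` exist as shown:

    /* Before blocking in poll, serve any bytes OpenSSL has already buffered:
     * the kernel will not report the socket readable for them again. */
    static void drainTLSPendingData(aeEventLoop *el) {
        tlsProcessPendingData(el);                /* fire read handlers once  */
        aeSetDontWait(el, tlsHasPendingData(el)); /* more left? do not block  */
    }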
@@ -863,6 +879,33 @@ static int connTLSConnect(connection *conn_, const char *addr, int port, const c
     return C_OK;
 }
 
+static void connTLSUnbindEventLoop(connection *conn_) {
+    tls_connection *conn = (tls_connection *) conn_;
+
+    /* We need to remove all events from the old event loop. The subsequent
+     * updateSSLEvent() will add the appropriate events to the new event loop. */
+    if (conn->c.el) {
+        int mask = aeGetFileEvents(conn->c.el, conn->c.fd);
+        if (mask & AE_READABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_READABLE);
+        if (mask & AE_WRITABLE) aeDeleteFileEvent(conn->c.el, conn->c.fd, AE_WRITABLE);
+
+        /* Remove the connection from the old loop's pending-data list, if present. */
+        int has_pending = conn->pending_list_node != NULL;
+        if (has_pending) tlsPendingRemove(conn);
+    }
+}
+
+static int connTLSRebindEventLoop(connection *conn_, aeEventLoop *el) {
+    tls_connection *conn = (tls_connection *) conn_;
+    serverAssert(!conn->c.el && !conn->c.read_handler &&
+                 !conn->c.write_handler && !conn->pending_list_node);
+    conn->c.el = el;
+    if (el && SSL_pending(conn->ssl)) tlsPendingAdd(conn);
+    /* Add the appropriate events to the new event loop. */
+    updateSSLEvent((tls_connection *) conn);
+    return C_OK;
+}
+
 static int connTLSWrite(connection *conn_, const void *data, size_t data_len) {
     tls_connection *conn = (tls_connection *) conn_;
     int ret;
 
@@ -1044,16 +1087,19 @@ static const char *connTLSGetType(connection *conn_) {
     return CONN_TYPE_TLS;
 }
 
-static int tlsHasPendingData(void) {
+static int tlsHasPendingData(struct aeEventLoop *el) {
+    list *pending_list = el->privdata[1];
     if (!pending_list)
         return 0;
     return listLength(pending_list) > 0;
 }
 
-static int tlsProcessPendingData(void) {
+static int tlsProcessPendingData(struct aeEventLoop *el) {
     listIter li;
     listNode *ln;
 
+    list *pending_list = el->privdata[1];
+    if (!pending_list) return 0;
     int processed = listLength(pending_list);
     listRewind(pending_list,&li);
     while((ln = listNext(&li))) {
@@ -1114,6 +1160,10 @@ static ConnectionType CT_TLS = {
     .blocking_connect = connTLSBlockingConnect,
     .accept = connTLSAccept,
 
+    /* event loop */
+    .unbind_event_loop = connTLSUnbindEventLoop,
+    .rebind_event_loop = connTLSRebindEventLoop,
+
     /* IO */
     .read = connTLSRead,
     .write = connTLSWrite,
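The tracking.c change below has the main thread push an invalidation message to a redirect client that may be owned by an IO thread, so the access is bracketed with `pauseIOThread()`/`resumeIOThread()` from the patch's new iothread.c. Roughly, as a sketch — the wrapper function here is illustrative and not part of the diff; `running_tid`, `IOTHREAD_MAIN_THREAD_ID`, and the pause/resume calls are:

    /* Safely touch a client that may be owned by an IO thread: park the
     * owner first so the two threads never access the client concurrently. */
    static void withPausedOwner(client *c, void (*fn)(client *c)) {
        int paused = 0;
        if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
            pauseIOThread(c->running_tid); /* returns once the owner parks */
            paused = 1;
        }
        fn(c);                             /* exclusive access to c */
        if (paused) resumeIOThread(c->running_tid);
    }

This is also why the hunk below may install a write handler on the paused thread's event loop: while the owner is parked, touching its loop from the main thread cannot race with it.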
diff --git a/src/tracking.c b/src/tracking.c
index 8ff14369d..5eec3e1d1 100644
--- a/src/tracking.c
+++ b/src/tracking.c
@@ -253,6 +253,7 @@ void trackingRememberKeys(client *tracking, client *executing) {
  * - Following a flush command, to send a single RESP NULL to indicate
  *   that all keys are now invalid. */
 void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) {
+    int paused = 0;
     uint64_t old_flags = c->flags;
     c->flags |= CLIENT_PUSHING;
 
@@ -275,6 +276,11 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) {
         if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING;
         c = redir;
         using_redirection = 1;
+        /* We are about to touch another client's data. */
+        if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) {
+            pauseIOThread(c->running_tid);
+            paused = 1;
+        }
         old_flags = c->flags;
         c->flags |= CLIENT_PUSHING;
     }
@@ -296,7 +302,7 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) {
          * it since RESP2 does not support push messages in the same
          * connection. */
         if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING;
-        return;
+        goto done;
     }
 
     /* Send the "value" part, which is the array of keys. */
@@ -308,6 +314,17 @@ void sendTrackingMessage(client *c, char *keyname, size_t keylen, int proto) {
     }
     updateClientMemUsageAndBucket(c);
     if (!(old_flags & CLIENT_PUSHING)) c->flags &= ~CLIENT_PUSHING;
+
+done:
+    if (paused) {
+        if (clientHasPendingReplies(c)) {
+            serverAssert(!(c->flags & CLIENT_PENDING_WRITE));
+            /* We install the write handler on a connection that lives in the IO
+             * thread's event loop; this is safe because that thread is paused. */
+            connSetWriteHandler(c->conn, sendReplyToClient);
+        }
+        resumeIOThread(c->running_tid);
+    }
 }
 
 /* This function is called when a key is modified in Redis and in the case
diff --git a/src/unix.c b/src/unix.c
index eb5850765..b61cb6d49 100644
--- a/src/unix.c
+++ b/src/unix.c
@@ -74,18 +74,19 @@ static int connUnixListen(connListener *listener) {
     return C_OK;
 }
 
-static connection *connCreateUnix(void) {
+static connection *connCreateUnix(struct aeEventLoop *el) {
     connection *conn = zcalloc(sizeof(connection));
     conn->type = &CT_Unix;
     conn->fd = -1;
     conn->iovcnt = IOV_MAX;
+    conn->el = el;
     return conn;
 }
 
-static connection *connCreateAcceptedUnix(int fd, void *priv) {
+static connection *connCreateAcceptedUnix(struct aeEventLoop *el, int fd, void *priv) {
     UNUSED(priv);
-    connection *conn = connCreateUnix();
+    connection *conn = connCreateUnix(el);
     conn->fd = fd;
     conn->state = CONN_STATE_ACCEPTING;
     return conn;
@@ -107,7 +108,7 @@ static void connUnixAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m
             return;
         }
         serverLog(LL_VERBOSE,"Accepted connection to %s", server.unixsocket);
-        acceptCommonHandler(connCreateAcceptedUnix(cfd, NULL),CLIENT_UNIX_SOCKET,NULL);
+        acceptCommonHandler(connCreateAcceptedUnix(el, cfd, NULL),CLIENT_UNIX_SOCKET,NULL);
     }
 }
 
@@ -123,6 +124,10 @@ static int connUnixAccept(connection *conn, ConnectionCallbackFunc accept_handle
     return connectionTypeTcp()->accept(conn, accept_handler);
 }
 
+static int connUnixRebindEventLoop(connection *conn, aeEventLoop *el) {
+    return connectionTypeTcp()->rebind_event_loop(conn, el);
+}
+
 static int connUnixWrite(connection *conn, const void *data, size_t data_len) {
     return connectionTypeTcp()->write(conn, data, data_len);
 }
@@ -186,6 +191,10 @@ static ConnectionType CT_Unix = {
     .blocking_connect = NULL,
     .accept = connUnixAccept,
 
+    /* event loop */
+    .unbind_event_loop = NULL,
+    .rebind_event_loop = connUnixRebindEventLoop,
+
     /* IO */
     .write = connUnixWrite,
     .writev = connUnixWritev,
diff --git a/tests/integration/shutdown.tcl b/tests/integration/shutdown.tcl
index b2ec32cbd..4169d64b7 100644
--- a/tests/integration/shutdown.tcl
+++ b/tests/integration/shutdown.tcl
@@ -156,6 +156,11 @@ test "Shutting down master waits for replica then fails" {
     set rd2 [redis_deferring_client -1]
     $rd1 shutdown
     $rd2 shutdown
+    wait_for_condition 100 10 {
+        [llength [regexp -all -inline {cmd=shutdown} [$master client list]]] eq 2
+    } else {
+        fail "shutdown did not arrive"
+    }
     set info_clients [$master info clients]
     assert_match "*connected_clients:3*" $info_clients
     assert_match "*blocked_clients:2*" $info_clients
@@ -209,6 +214,11 @@ test "Shutting down master waits for replica then aborted" {
     set rd2 [redis_deferring_client -1]
     $rd1 shutdown
     $rd2 shutdown
+    wait_for_condition 100 10 {
+        [llength [regexp -all -inline {cmd=shutdown} [$master client list]]] eq 2
+    } else {
+        fail "shutdown did not arrive"
+    }
     set info_clients [$master info clients]
     assert_match "*connected_clients:3*" $info_clients
     assert_match "*blocked_clients:2*" $info_clients
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index f374c3dc9..c240a286c 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -698,6 +698,16 @@ proc latencyrstat_percentiles {cmd r} {
     }
 }
 
+proc get_io_thread_clients {id {client r}} {
+    set pattern "io_thread_$id:clients=(\[0-9\]+)"
+    set info [$client info threads]
+    if {[regexp $pattern $info _ value]} {
+        return $value
+    } else {
+        return -1
+    }
+}
+
 proc generate_fuzzy_traffic_on_key {key type duration} {
     # Commands per type, blocking commands removed
     # TODO: extract these from COMMAND DOCS, and improve to include other types
diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl
index 7e8270aa8..3caaf9bd4 100644
--- a/tests/unit/client-eviction.tcl
+++ b/tests/unit/client-eviction.tcl
@@ -108,7 +108,11 @@ start_server {} {
             $rr write [join [list "*1\r\n\$$maxmemory_clients_actual\r\n" [string repeat v $maxmemory_clients_actual]] ""]
             $rr flush
         } e
-        assert {![client_exists $cname]}
+        wait_for_condition 100 10 {
+            ![client_exists $cname]
+        } else {
+            fail "Failed to evict client"
+        }
         $rr close
 
         # Restore settings
@@ -360,6 +364,13 @@ start_server {} {
         resume_process $server_pid
         r ping ;# make sure a full event loop cycle is processed before issuing CLIENT LIST
 
+        # wait for get commands to be processed
+        wait_for_condition 100 10 {
+            [expr {[regexp {calls=(\d+)} [cmdrstat get r] -> calls] ? $calls : 0}] >= 2
+        } else {
+            fail "get did not arrive"
+        }
+
         # Validate obuf-clients were disconnected (because of obuf limit)
         catch {client_field obuf-client1 name} e
         assert_match {no client named obuf-client1 found*} $e
@@ -367,7 +378,9 @@ start_server {} {
         catch {client_field obuf-client2 name} e
         assert_match {no client named obuf-client2 found*} $e
 
         # Validate qbuf-client is still connected and wasn't evicted
-        assert_equal [client_field qbuf-client name] {qbuf-client}
+        if {[lindex [r config get io-threads] 1] == 1} {
+            assert_equal [client_field qbuf-client name] {qbuf-client}
+        }
 
         $rr1 close
         $rr2 close
@@ -404,8 +417,11 @@ start_server {} {
 
         # Decrease maxmemory_clients and expect client eviction
         r config set maxmemory-clients [expr $maxmemory_clients / 2]
-        set connected_clients [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]]
-        assert {$connected_clients > 0 && $connected_clients < $client_count}
+        wait_for_condition 200 10 {
+            [llength [regexp -all -inline {name=client} [r client list]]] < $client_count
+        } else {
+            fail "Failed to evict clients"
+        }
 
         foreach rr $rrs {$rr close}
     }
@@ -463,8 +479,11 @@ start_server {} {
         assert {$total_client_mem <= $maxmemory_clients}
 
         # Make sure we have only half of our clients now
-        set connected_clients [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]]
-        assert {$connected_clients == [expr $client_count / 2]}
+        wait_for_condition 200 100 {
+            [llength [regexp -all -inline {name=client} [r client list]]] == $client_count / 2
+        } else {
+            fail "Failed to evict clients"
+        }
 
         # Restore the reply buffer resize to default
         r debug replybuffer resizing 1
@@ -519,7 +538,8 @@ start_server {} {
         foreach size [lreverse $sizes] {
             set control_mem [client_field control tot-mem]
             set total_mem [expr $total_mem - $clients_per_size * $size]
-            r config set maxmemory-clients [expr $total_mem + $control_mem]
+            # allow some tolerance when using io threads
+            r config set maxmemory-clients [expr $total_mem + $control_mem + 1000]
             set clients [split [string trim [r client list]] "\r\n"]
             # Verify only relevant clients were evicted
             for {set i 0} {$i < [llength $sizes]} {incr i} {
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 6e2d381f5..fc66fb510 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -313,7 +313,7 @@ start_server {tags {"info" "external:skip"}} {
             assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
             if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
             assert_morethan $el_sum2 $el_sum1
-            assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
+            assert_lessthan $el_sum2 [expr $el_sum1+100000] ;# we expect roughly 100ms here, but allow some tolerance
             if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
             assert_morethan $cmd_sum2 $cmd_sum1
             assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl
index fbd1d14fe..2ba1a8c96 100644
--- a/tests/unit/introspection.tcl
+++ b/tests/unit/introspection.tcl
@@ -6,8 +6,13 @@ start_server {tags {"introspection"}} {
     }
 
     test {CLIENT LIST} {
-        r client list
-    } {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*}
+        set client_list [r client list]
+        if {[lindex [r config get io-threads] 1] == 1} {
+            assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*} $client_list
+        } else {
+            assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|list user=* redir=-1 resp=*} $client_list
+        }
+    }
 
     test {CLIENT LIST with IDs} {
         set myid [r client id]
@@ -16,8 +21,13 @@ start_server {tags {"introspection"}} {
     }
 
     test {CLIENT INFO} {
-        r client info
-    } {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*}
+        set client [r client info]
+        if {[lindex [r config get io-threads] 1] == 1} {
+            assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=26 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*} $client
+        } else {
+            assert_match {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=*} $client
+        }
+    }
 
     test {CLIENT KILL with illegal arguments} {
         assert_error "ERR wrong number of arguments for 'client|kill' command" {r client kill}
@@ -86,6 +96,11 @@ start_server {tags {"introspection"}} {
         assert {$connected_clients >= 3}
         set res [r client kill skipme yes]
         assert {$res == $connected_clients - 1}
+        wait_for_condition 1000 10 {
+            [s connected_clients] eq 1
+        } else {
+            fail "Can't kill all clients except the current one"
+        }
 
         # Kill all clients, including `me`
         set rd3 [redis_deferring_client]
@@ -304,6 +319,9 @@ start_server {tags {"introspection"}} {
         $rd read ; # Discard the OK
         $bc blpop mylist 0
+        # make sure the blpop arrives first
+        $bc flush
+        after 100
         wait_for_blocked_clients_count 1
         r lpush mylist 1
         wait_for_blocked_clients_count 0
@@ -904,3 +922,62 @@ test {CONFIG REWRITE handles alias config properly} {
         assert_equal [r config get hash-max-listpack-entries] {hash-max-listpack-entries 100}
     }
 } {} {external:skip}
+
+test {IO threads client number} {
+    start_server {overrides {io-threads 2} tags {external:skip}} {
+        set iothread_clients [get_io_thread_clients 1]
+        assert_equal $iothread_clients [s connected_clients]
+        assert_equal [get_io_thread_clients 0] 0
+
+        r script debug yes ; # Transfer to main thread
+        assert_equal [get_io_thread_clients 0] 1
+        assert_equal [get_io_thread_clients 1] [expr $iothread_clients - 1]
+
+        set iothread_clients [get_io_thread_clients 1]
+        set rd1 [redis_deferring_client]
+        set rd2 [redis_deferring_client]
+        assert_equal [get_io_thread_clients 1] [expr $iothread_clients + 2]
+        $rd1 close
+        $rd2 close
+        wait_for_condition 1000 10 {
+            [get_io_thread_clients 1] eq $iothread_clients
+        } else {
+            fail "Failed to close clients of io thread 1"
+        }
+        assert_equal [get_io_thread_clients 0] 1
+
+        r script debug no ; # Transfer to io thread
+        assert_equal [get_io_thread_clients 0] 0
+        assert_equal [get_io_thread_clients 1] [expr $iothread_clients + 1]
+    }
+}
+
+test {Clients are evenly distributed among io threads} {
+    start_server {overrides {io-threads 4} tags {external:skip}} {
+        set cur_clients [s connected_clients]
+        assert_equal $cur_clients 1
+        global rdclients
+        for {set i 1} {$i < 9} {incr i} {
+            set rdclients($i) [redis_deferring_client]
+        }
+        for {set i 1} {$i <= 3} {incr i} {
+            assert_equal [get_io_thread_clients $i] 3
+        }
+
+        $rdclients(3) close
+        $rdclients(4) close
+        wait_for_condition 1000 10 {
+            [get_io_thread_clients 1] eq 2 &&
+            [get_io_thread_clients 2] eq 2 &&
+            [get_io_thread_clients 3] eq 3
+        } else {
+            fail "Failed to close clients"
+        }
+
+        set rdclients(3) [redis_deferring_client]
+        set rdclients(4) [redis_deferring_client]
+        for {set i 1} {$i <= 3} {incr i} {
+            assert_equal [get_io_thread_clients $i] 3
+        }
+    }
+}
diff --git a/tests/unit/lazyfree.tcl b/tests/unit/lazyfree.tcl
index b4ade4031..cb3a4b014 100644
--- a/tests/unit/lazyfree.tcl
+++ b/tests/unit/lazyfree.tcl
@@ -10,6 +10,7 @@ start_server {tags {"lazyfree"}} {
         set peak_mem [s used_memory]
         assert {[r unlink myset] == 1}
         assert {$peak_mem > $orig_mem+1000000}
+        reconnect ;# free the memory of the client's reused argv
        wait_for_condition 50 100 {
            [s used_memory] < $peak_mem &&
            [s used_memory] < $orig_mem*2
@@ -32,6 +33,7 @@ start_server {tags {"lazyfree"}} {
         set peak_mem [s used_memory]
         r flushdb async
         assert {$peak_mem > $orig_mem+1000000}
+        reconnect ;# free the memory of the client's reused argv
        wait_for_condition 50 100 {
            [s used_memory] < $peak_mem &&
            [s used_memory] < $orig_mem*2
diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl
index 363dab472..966ac4487 100644
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@@ -29,7 +29,11 @@ start_server {tags {"maxmemory" "external:skip"}} {
         set dbsize [r dbsize]
 
         if $client_eviction {
-            return [expr $evicted_clients > 0 && $evicted_keys == 0 && $dbsize == 50]
+            if {[lindex [r config get io-threads] 1] == 1} {
+                return [expr $evicted_clients > 0 && $evicted_keys == 0 && $dbsize == 50]
+            } else {
+                return [expr $evicted_clients >= 0 && $evicted_keys >= 0 && $dbsize <= 50]
+            }
         } else {
             return [expr $evicted_clients == 0 && $evicted_keys > 0 && $dbsize < 50]
         }
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index 130289aff..92c1f572c 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -420,7 +420,10 @@ run_solo {defrag} {
             $rd_pubsub read ; # Discard subscribe replies
             $rd_pubsub ssubscribe $channel_name
             $rd_pubsub read ; # Discard ssubscribe replies
-            $rd set k$j $channel_name
+            # Pub/Sub clients are handled in the main thread, so their memory is
+            # allocated there. Using the SETBIT command avoids the main thread
+            # referencing argv from IO threads.
+            $rd setbit k$j [expr {[string length $channel_name] * 8}] 1
             $rd read ; # Discard set replies
         }
@@ -583,6 +586,123 @@ run_solo {defrag} {
         }
     }
 
+    test "Active defrag for argv retained by the main thread from IO thread: $type" {
+        r flushdb
+        r config set hz 100
+        r config set activedefrag no
+        wait_for_defrag_stop 500 100
+        r config resetstat
+        set io_threads [lindex [r config get io-threads] 1]
+        if {$io_threads == 1} {
+            r config set active-defrag-threshold-lower 5
+        } else {
+            r config set active-defrag-threshold-lower 10
+        }
+        r config set active-defrag-cycle-min 65
+        r config set active-defrag-cycle-max 75
+        r config set active-defrag-ignore-bytes 1000kb
+        r config set maxmemory 0
+
+        # Create some clients so that they are distributed among different io threads.
+        set clients {}
+        for {set i 0} {$i < 8} {incr i} {
+            lappend clients [redis_client]
+        }
+
+        # Populate memory with an interleaving key pattern of the same size
+        set dummy "[string repeat x 400]"
+        set n 10000
+        for {set i 0} {$i < [llength $clients]} {incr i} {
+            set rr [lindex $clients $i]
+            for {set j 0} {$j < $n} {incr j} {
+                $rr set "k$i-$j" $dummy
+            }
+        }
+
+        # If io-threads are enabled, verify that memory allocation is not from the main thread.
+        if {$io_threads != 1} {
+            # At least make sure that bin 448 is created in the main thread's arena.
+            r set k dummy
+            r del k
+
+            # We created 10000 string keys of 400 bytes each for each client, so when the memory
+            # allocation for the 448 bin in the main thread is significantly smaller than this,
+            # we can conclude that the memory allocation is not coming from it.
+            set malloc_stats [r memory malloc-stats]
+            if {[regexp {(?s)arenas\[0\]:.*?448[ ]+[\d]+[ ]+([\d]+)[ ]} $malloc_stats - allocated]} {
+                # Ensure the allocation for bin 448 in the main thread's arena
+                # is far less than 4375k (10000 * 448 bytes).
+                assert_lessthan $allocated 200000
+            } else {
+                fail "Failed to get the main thread's malloc stats."
+            }
+        }
+
+        after 120 ;# serverCron only updates the info once in 100ms
+        if {$::verbose} {
+            puts "used [s allocator_allocated]"
+            puts "rss [s allocator_active]"
+            puts "frag [s allocator_frag_ratio]"
+            puts "frag_bytes [s allocator_frag_bytes]"
+        }
+        assert_lessthan [s allocator_frag_ratio] 1.05
+
+        # Delete keys with even indices to create fragmentation.
+        for {set i 0} {$i < [llength $clients]} {incr i} {
+            set rd [lindex $clients $i]
+            for {set j 0} {$j < $n} {incr j 2} {
+                $rd del "k$i-$j"
+            }
+        }
+        for {set i 0} {$i < [llength $clients]} {incr i} {
+            [lindex $clients $i] close
+        }
+
+        after 120 ;# serverCron only updates the info once in 100ms
+        if {$::verbose} {
+            puts "used [s allocator_allocated]"
+            puts "rss [s allocator_active]"
+            puts "frag [s allocator_frag_ratio]"
+            puts "frag_bytes [s allocator_frag_bytes]"
+        }
+        assert_morethan [s allocator_frag_ratio] 1.4
+
+        catch {r config set activedefrag yes} e
+        if {[r config get activedefrag] eq "activedefrag yes"} {
+
+            # wait for the active defrag to start working (decision once a second)
+            wait_for_condition 50 100 {
+                [s total_active_defrag_time] ne 0
+            } else {
+                after 120 ;# serverCron only updates the info once in 100ms
+                puts [r info memory]
+                puts [r info stats]
+                puts [r memory malloc-stats]
+                fail "defrag not started."
+            }
+
+            # wait for the active defrag to stop working
+            wait_for_defrag_stop 500 100
+
+            # test that the fragmentation is lower
+            after 120 ;# serverCron only updates the info once in 100ms
+            if {$::verbose} {
+                puts "used [s allocator_allocated]"
+                puts "rss [s allocator_active]"
+                puts "frag [s allocator_frag_ratio]"
+                puts "frag_bytes [s allocator_frag_bytes]"
+            }
+
+            if {$io_threads == 1} {
+                assert_lessthan_equal [s allocator_frag_ratio] 1.05
+            } else {
+                # TODO: When multithreading is enabled, argv may be created in the io thread
+                # and kept in the main thread, which can cause fragmentation to become worse.
+                assert_lessthan_equal [s allocator_frag_ratio] 1.1
+            }
+        }
+    }
+
     if {$type eq "standalone"} { ;# skip in cluster mode
     test "Active defrag big list: $type" {
         r flushdb
diff --git a/tests/unit/moduleapi/blockedclient.tcl b/tests/unit/moduleapi/blockedclient.tcl
index 22b2c4bae..28cc76fe8 100644
--- a/tests/unit/moduleapi/blockedclient.tcl
+++ b/tests/unit/moduleapi/blockedclient.tcl
@@ -130,7 +130,12 @@ foreach call_type {nested normal} {
         $rd flush
 
         # make sure we get BUSY error, and that we didn't get it too early
-        assert_error {*BUSY Slow module operation*} {r ping}
+        wait_for_condition 50 100 {
+            ([catch {r ping} reply] == 1) &&
+            ([string match {*BUSY Slow module operation*} $reply])
+        } else {
+            fail "Failed waiting for busy slow response"
+        }
         assert_morethan_equal [expr [clock clicks -milliseconds]-$start] $busy_time_limit
 
         # abort the blocking operation
diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl
index 1defb5158..9a4f1196b 100644
--- a/tests/unit/pubsub.tcl
+++ b/tests/unit/pubsub.tcl
@@ -85,6 +85,11 @@ start_server {tags {"pubsub network"}} {
         set rd1 [redis_deferring_client]
         assert_equal {1 2 3} [subscribe $rd1 {chan1 chan2 chan3}]
         unsubscribe $rd1
+        wait_for_condition 100 10 {
+            [regexp {cmd=unsubscribe} [r client list]] eq 1
+        } else {
+            fail "unsubscribe did not arrive"
+        }
         assert_equal 0 [r publish chan1 hello]
         assert_equal 0 [r publish chan2 hello]
         assert_equal 0 [r publish chan3 hello]
@@ -158,6 +163,11 @@ start_server {tags {"pubsub network"}} {
         set rd1 [redis_deferring_client]
         assert_equal {1 2 3} [psubscribe $rd1 {chan1.* chan2.* chan3.*}]
         punsubscribe $rd1
+        wait_for_condition 100 10 {
+            [regexp {cmd=punsubscribe} [r client list]] eq 1
+        } else {
+            fail "punsubscribe did not arrive"
+        }
         assert_equal 0 [r publish chan1.hi hello]
         assert_equal 0 [r publish chan2.hi hello]
         assert_equal 0 [r publish chan3.hi hello]
diff --git a/tests/unit/pubsubshard.tcl b/tests/unit/pubsubshard.tcl
index 6e3fb61c1..a3c841d36 100644
--- a/tests/unit/pubsubshard.tcl
+++ b/tests/unit/pubsubshard.tcl
@@ -46,6 +46,11 @@ start_server {tags {"pubsubshard external:skip"}} {
         assert_equal {2} [ssubscribe $rd1 {chan2}]
         assert_equal {3} [ssubscribe $rd1 {chan3}]
         sunsubscribe $rd1
+        wait_for_condition 100 10 {
+            [regexp {cmd=sunsubscribe} [r client list]] eq 1
+        } else {
+            fail "sunsubscribe did not arrive"
+        }
         assert_equal 0 [r SPUBLISH chan1 hello]
         assert_equal 0 [r SPUBLISH chan2 hello]
         assert_equal 0 [r SPUBLISH chan3 hello]
diff --git a/tests/unit/querybuf.tcl b/tests/unit/querybuf.tcl
index d05911156..9dcf986e8 100644
--- a/tests/unit/querybuf.tcl
+++ b/tests/unit/querybuf.tcl
@@ -166,7 +166,12 @@ start_server {tags {"querybuf"}} {
         # The client executing the command is currently using the reusable query buffer,
         # so the size shown is that of the reusable query buffer. It will be returned
         # to the reusable query buffer after command execution.
-        assert_match {*qbuf=26 qbuf-free=* cmd=client|list *} $res
+        # Note that if IO threads are enabled, the reusable query buffer will be dereferenced earlier.
+        if {[lindex [r config get io-threads] 1] == 1} {
+            assert_match {*qbuf=26 qbuf-free=* cmd=client|list *} $res
+        } else {
+            assert_match {*qbuf=0 qbuf-free=* cmd=client|list *} $res
+        }
 
         $rd close
     }
diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl
index 9f46a8beb..fad95b970 100644
--- a/tests/unit/type/list.tcl
+++ b/tests/unit/type/list.tcl
@@ -1100,6 +1100,11 @@ foreach {pop} {BLPOP BLMPOP_LEFT} {
         $watching_client get somekey{t}
         $watching_client read
         $watching_client exec
+        wait_for_condition 100 10 {
+            [regexp {cmd=exec} [r client list]] eq 1
+        } else {
+            fail "exec did not arrive"
+        }
         # Blocked BLPOPLPUSH may create problems, unblock it.
         r lpush srclist{t} element
         set res [$watching_client read]