mirror of https://mirror.osredm.com/root/redis.git
reduce getNodeByQuery CPU time by using less cache lines (from 2064 Bytes struct to 64 Bytes): reduces LLC misses and Memory Loads (#13296)
The following PR goes from 33 cacheline on getKeysResult struct (by default has 256 static buffer) ``` root@hpe10:~/redis# pahole -p ./src/server.o -C getKeysResult typedef struct { keyReference keysbuf[256]; /* 0 2048 */ /* --- cacheline 32 boundary (2048 bytes) --- */ /* typedef keyReference */ struct { int pos; int flags; } *keys; /* 2048 8 */ int numkeys; /* 2056 4 */ int size; /* 2060 4 */ /* size: 2064, cachelines: 33, members: 4 */ /* last cacheline: 16 bytes */ } getKeysResult; ``` to 1 cacheline with a static buffer of 6 keys per command): ``` root@hpe10:~/redis# pahole -p ./src/server.o -C getKeysResult typedef struct { int numkeys; /* 0 4 */ int size; /* 4 4 */ keyReference keysbuf[6]; /* 8 48 */ /* typedef keyReference */ struct { int pos; int flags; } *keys; /* 56 8 */ /* size: 64, cachelines: 1, members: 4 */ } getKeysResult; ``` we get around 1.5% higher ops/sec, and a confirmation of around 15% less LLC loads on getNodeByQuery and 37% less Stores. Function / Call Stack | CPU Time: Difference | CPU Time:9462436fa4
| CPU Time: this PR | Loads: Difference | Loads:9462436fa4
| Loads: this PR | Stores: Difference | Stores:9462436fa4
| Stores: This PR -- | -- | -- | -- | -- | -- | -- | -- | -- | -- getNodeByQuery | 0.753767 | 1.57118 | 0.817416 | 144297829 (15% less loads) | 920575969 | 776278140 | 367607824 (37% less stores) | 991642384 | 624034560 ## results on client side ### baseline ``` taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram Writing results to stdout [RUN #1] Preparing benchmark client... [RUN #1] Launching threads now... [RUN #1 100%, 180 secs] 0 threads: 110333450 ops, 604992 (avg: 612942) ops/sec, 84.75MB/sec (avg: 85.86MB/sec), 0.82 (avg: 0.81) msec latency 2 Threads 25 Connections per thread 180 Seconds ALL STATS ====================================================================================================================================================== Type Ops/sec Hits/sec Misses/sec MOVED/sec ASK/sec Avg. Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------------------------------------------------------ Sets 612942.14 --- --- 0.00 0.00 0.81332 0.80700 1.26300 2.92700 87924.12 Gets 0.00 0.00 0.00 0.00 0.00 --- --- --- --- 0.00 Waits 0.00 --- --- --- --- --- --- --- --- --- Totals 612942.14 0.00 0.00 0.00 0.00 0.81332 0.80700 1.26300 2.92700 87924.12 ``` ### comparison ``` taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram Writing results to stdout [RUN #1] Preparing benchmark client... [RUN #1] Launching threads now... [RUN #1 100%, 180 secs] 0 threads: 111731310 ops, 610195 (avg: 620707) ops/sec, 85.48MB/sec (avg: 86.95MB/sec), 0.82 (avg: 0.80) msec latency 2 Threads 25 Connections per thread 180 Seconds ALL STATS ====================================================================================================================================================== Type Ops/sec Hits/sec Misses/sec MOVED/sec ASK/sec Avg. Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------------------------------------------------------ Sets 620707.72 --- --- 0.00 0.00 0.80312 0.79900 1.23900 2.87900 89037.78 Gets 0.00 0.00 0.00 0.00 0.00 --- --- --- --- 0.00 Waits 0.00 --- --- --- --- --- --- --- --- --- Totals 620707.72 0.00 0.00 0.00 0.00 0.80312 0.79900 1.23900 2.87900 89037.78 ``` Co-authored-by: filipecosta90 <filipecosta.90@gmail.com>
This commit is contained in:
parent
4aa25d042c
commit
24c85cc368
|
@ -2071,7 +2071,8 @@ struct redisServer {
|
|||
char *locale_collate;
|
||||
};
|
||||
|
||||
#define MAX_KEYS_BUFFER 256
|
||||
/* we use 6 so that all getKeyResult fits a cacheline */
|
||||
#define MAX_KEYS_BUFFER 6
|
||||
|
||||
typedef struct {
|
||||
int pos; /* The position of the key within the client array */
|
||||
|
@ -2084,12 +2085,12 @@ typedef struct {
|
|||
* for returning channel information.
|
||||
*/
|
||||
typedef struct {
|
||||
int numkeys; /* Number of key indices return */
|
||||
int size; /* Available array size */
|
||||
keyReference keysbuf[MAX_KEYS_BUFFER]; /* Pre-allocated buffer, to save heap allocations */
|
||||
keyReference *keys; /* Key indices array, points to keysbuf or heap */
|
||||
int numkeys; /* Number of key indices return */
|
||||
int size; /* Available array size */
|
||||
} getKeysResult;
|
||||
#define GETKEYS_RESULT_INIT { {{0}}, NULL, 0, MAX_KEYS_BUFFER }
|
||||
#define GETKEYS_RESULT_INIT { 0, MAX_KEYS_BUFFER, {{0}}, NULL }
|
||||
|
||||
/* Key specs definitions.
|
||||
*
|
||||
|
|
Loading…
Reference in New Issue