Reduce getNodeByQuery CPU time by using fewer cache lines (from a 2064-byte struct to 64 bytes): reduces LLC misses and memory loads (#13296)

This PR shrinks the getKeysResult struct from 33 cache lines (by default
it embeds a static buffer of 256 key references)

```
root@hpe10:~/redis# pahole -p   ./src/server.o -C getKeysResult
typedef struct {
	keyReference               keysbuf[256];         /*     0  2048 */
	/* --- cacheline 32 boundary (2048 bytes) --- */
	/* typedef keyReference */ struct {
		int                pos;
		int                flags;
	} *keys; /*  2048     8 */
	int                        numkeys;              /*  2056     4 */
	int                        size;                 /*  2060     4 */

	/* size: 2064, cachelines: 33, members: 4 */
	/* last cacheline: 16 bytes */
} getKeysResult;
```


to 1 cache line (with a static buffer of 6 keys per command):
```
root@hpe10:~/redis# pahole -p   ./src/server.o -C getKeysResult
typedef struct {
	int                        numkeys;              /*     0     4 */
	int                        size;                 /*     4     4 */
	keyReference               keysbuf[6];           /*     8    48 */
	/* typedef keyReference */ struct {
		int                pos;
		int                flags;
	} *keys; /*    56     8 */

	/* size: 64, cachelines: 1, members: 4 */
} getKeysResult; 
```

We get around 1.5% higher ops/sec, and can confirm around 15% fewer
LLC loads and 37% fewer stores in getNodeByQuery.

| Function / Call Stack | CPU Time: Difference | CPU Time: 9462436fa4 | CPU Time: this PR | Loads: Difference | Loads: 9462436fa4 | Loads: this PR | Stores: Difference | Stores: 9462436fa4 | Stores: this PR |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| getNodeByQuery | 0.753767 | 1.57118 | 0.817416 | 144297829 (15% less loads) | 920575969 | 776278140 | 367607824 (37% less stores) | 991642384 | 624034560 |

## results on client side

### baseline 
```
taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram 
Writing results to stdout
[RUN #1] Preparing benchmark client...
[RUN #1] Launching threads now...
[RUN #1 100%, 180 secs]  0 threads:   110333450 ops,  604992 (avg:  612942) ops/sec, 84.75MB/sec (avg: 85.86MB/sec),  0.82 (avg:  0.81) msec latency

2         Threads
25        Connections per thread
180       Seconds


ALL STATS
======================================================================================================================================================
Type         Ops/sec     Hits/sec   Misses/sec    MOVED/sec      ASK/sec    Avg. Latency     p50 Latency     p99 Latency   p99.9 Latency       KB/sec 
------------------------------------------------------------------------------------------------------------------------------------------------------
Sets       612942.14          ---          ---         0.00         0.00         0.81332         0.80700         1.26300         2.92700     87924.12 
Gets            0.00         0.00         0.00         0.00         0.00             ---             ---             ---             ---         0.00 
Waits           0.00          ---          ---          ---          ---             ---             ---             ---             ---          --- 
Totals     612942.14         0.00         0.00         0.00         0.00         0.81332         0.80700         1.26300         2.92700     87924.12 
```

### comparison 
```
taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram 
Writing results to stdout
[RUN #1] Preparing benchmark client...
[RUN #1] Launching threads now...
[RUN #1 100%, 180 secs]  0 threads:   111731310 ops,  610195 (avg:  620707) ops/sec, 85.48MB/sec (avg: 86.95MB/sec),  0.82 (avg:  0.80) msec latency

2         Threads
25        Connections per thread
180       Seconds


ALL STATS
======================================================================================================================================================
Type         Ops/sec     Hits/sec   Misses/sec    MOVED/sec      ASK/sec    Avg. Latency     p50 Latency     p99 Latency   p99.9 Latency       KB/sec 
------------------------------------------------------------------------------------------------------------------------------------------------------
Sets       620707.72          ---          ---         0.00         0.00         0.80312         0.79900         1.23900         2.87900     89037.78 
Gets            0.00         0.00         0.00         0.00         0.00             ---             ---             ---             ---         0.00 
Waits           0.00          ---          ---          ---          ---             ---             ---             ---             ---          --- 
Totals     620707.72         0.00         0.00         0.00         0.00         0.80312         0.79900         1.23900         2.87900     89037.78
```

Co-authored-by: filipecosta90 <filipecosta.90@gmail.com>
This commit is contained in:
Filipe Oliveira (Redis) 2024-06-18 11:00:47 +01:00 committed by GitHub
parent 4aa25d042c
commit 24c85cc368
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 5 additions and 4 deletions

View File

@ -2071,7 +2071,8 @@ struct redisServer {
char *locale_collate;
};
#define MAX_KEYS_BUFFER 256
/* We use 6 so that the whole getKeysResult struct fits in a single cache line. */
#define MAX_KEYS_BUFFER 6
typedef struct {
int pos; /* The position of the key within the client array */
@ -2084,12 +2085,12 @@ typedef struct {
* for returning channel information.
*/
typedef struct {
keyReference keysbuf[MAX_KEYS_BUFFER]; /* Pre-allocated buffer, to save heap allocations */
keyReference *keys; /* Key indices array, points to keysbuf or heap */
int numkeys; /* Number of key indices return */
int size; /* Available array size */
keyReference keysbuf[MAX_KEYS_BUFFER]; /* Pre-allocated buffer, to save heap allocations */
keyReference *keys; /* Key indices array, points to keysbuf or heap */
} getKeysResult;
#define GETKEYS_RESULT_INIT { {{0}}, NULL, 0, MAX_KEYS_BUFFER }
#define GETKEYS_RESULT_INIT { 0, MAX_KEYS_BUFFER, {{0}}, NULL }
/* Key specs definitions.
*