From 662cb2fe75d747734d3ca4bfd062044315fb9dc9 Mon Sep 17 00:00:00 2001
From: Yuan Wang <yuan.wang@redis.com>
Date: Thu, 13 Feb 2025 10:52:19 +0800
Subject: [PATCH] Don't send unnecessary PING to replicas (#13790)

The reason why master sends PING is to keep the connection with replica
active, so master need not send PING to replicas if already sent
replication stream in the past `repl_ping_slave_period` time.

Now master only sends PINGs and increases `master_repl_offset` if there
is no traffic, so this PR also can reduce the impact of issue in
https://github.com/redis/redis/pull/13773, of course, does not resolve
it completely.
> Fix ping causing inconsistency between AOF size and replication offset
in the future PR. Because we increment the replication offset when
sending PING/REPLCONF to the replica but do not write data to the AOF
file, this might cause the starting offset of the AOF file plus its size
to be inconsistent with the actual replication offset.
---
 src/replication.c | 15 +++++++++------
 src/server.c      |  1 +
 src/server.h      |  1 +
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/replication.c b/src/replication.c
index c2a49951a..1c2351c0f 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -519,6 +519,9 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
     /* We can't have slaves attached and no backlog. */
     serverAssert(!(listLength(slaves) != 0 && server.repl_backlog == NULL));
 
+    /* Update the time of sending replication stream to replicas. */
+    server.repl_stream_lastio = server.unixtime;
+
     /* Must install write handler for all replicas first before feeding
      * replication stream. */
     prepareReplicasToWrite();
@@ -4479,8 +4482,6 @@ long long replicationGetSlaveOffset(void) {
 
 /* Replication cron function, called 1 time per second. */
 void replicationCron(void) {
-    static long long replication_cron_loops = 0;
-
     /* Check failover status first, to see if we need to start
      * handling the failover. */
     updateFailoverStatus();
@@ -4533,9 +4534,12 @@ void replicationCron(void) {
     listNode *ln;
     robj *ping_argv[1];
 
-    /* First, send PING according to ping_slave_period. */
-    if ((replication_cron_loops % server.repl_ping_slave_period) == 0 &&
-        listLength(server.slaves))
+    /* First, send PING according to ping_slave_period. The reason why master
+     * sends PING is to keep the connection with replica active, so master need
+     * not send PING to replicas if already sent replication stream in the past
+     * repl_ping_slave_period time. */
+    if (server.masterhost == NULL && listLength(server.slaves) &&
+        server.unixtime >= server.repl_stream_lastio + server.repl_ping_slave_period)
     {
         /* Note that we don't send the PING if the clients are paused during
          * a Redis Cluster manual failover: the PING we send will otherwise
@@ -4672,7 +4676,6 @@ void replicationCron(void) {
 
     /* Refresh the number of slaves with lag <= min-slaves-max-lag. */
     refreshGoodSlavesCount();
-    replication_cron_loops++; /* Incremented with frequency 1 HZ. */
 }
 
 int shouldStartChildReplication(int *mincapa_out, int *req_out) {
diff --git a/src/server.c b/src/server.c
index 5f20900e0..b76b3526c 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2192,6 +2192,7 @@ void initServerConfig(void) {
     server.repl_down_since = 0; /* Never connected, repl is down since EVER. */
     server.master_repl_offset = 0;
     server.fsynced_reploff_pending = 0;
+    server.repl_stream_lastio = server.unixtime;
 
     /* Replication partial resync backlog */
     server.repl_backlog = NULL;
diff --git a/src/server.h b/src/server.h
index 72a6f779d..e0634d77a 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2012,6 +2012,7 @@ struct redisServer {
     size_t repl_buffer_mem;         /* The memory of replication buffer. */
     list *repl_buffer_blocks;       /* Replication buffers blocks list
                                      * (serving replica clients and repl backlog) */
+    time_t repl_stream_lastio;      /* Unix time of the latest sending replication stream. */
     /* Replication (slave) */
     char *masteruser;               /* AUTH with this user and masterauth with master */
     sds masterauth;                 /* AUTH with this password with master */