diff --git a/src/cluster.c b/src/cluster.c index c788194f1..e08ed6acb 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -4198,9 +4198,11 @@ void clusterCron(void) { * received PONG is older than half the cluster timeout, send * a new ping now, to ensure all the nodes are pinged without * a too big delay. */ + mstime_t ping_interval = server.cluster_ping_interval ? + server.cluster_ping_interval : server.cluster_node_timeout/2; if (node->link && node->ping_sent == 0 && - (now - node->pong_received) > server.cluster_node_timeout/2) + (now - node->pong_received) > ping_interval) { clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); continue; diff --git a/src/config.c b/src/config.c index 2a34adfd3..4149e06c3 100644 --- a/src/config.c +++ b/src/config.c @@ -3116,6 +3116,7 @@ standardConfig static_configs[] = { /* Long Long configs */ createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL),/* milliseconds */ createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL), + createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL), createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024*1024, LONG_MAX, server.proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */ diff --git a/src/server.h b/src/server.h index 92acaa9a0..c88087164 100644 --- a/src/server.h +++ b/src/server.h @@ -1852,6 +1852,7 @@ struct 
redisServer { int cluster_enabled; /* Is cluster enabled? */ int cluster_port; /* Set the cluster port for a node. */ mstime_t cluster_node_timeout; /* Cluster node timeout. */ + mstime_t cluster_ping_interval; /* A debug configuration for setting how often cluster nodes send ping messages, in milliseconds. 0 (the default) means use cluster_node_timeout/2. */ char *cluster_configfile; /* Cluster auto-generated config file name. */ struct clusterState *cluster; /* State of the cluster */ int cluster_migration_barrier; /* Cluster replicas migration barrier. */ diff --git a/tests/support/cluster_helper.tcl b/tests/support/cluster_helper.tcl index 6d70e44c1..644eefdae 100644 --- a/tests/support/cluster_helper.tcl +++ b/tests/support/cluster_helper.tcl @@ -103,7 +103,7 @@ proc start_cluster {masters replicas options code {slot_allocator continuous_slo # Configure the starting of multiple servers. Set cluster node timeout # aggressively since many tests depend on ping/pong messages. - set cluster_options [list overrides [list cluster-enabled yes cluster-node-timeout 500]] + set cluster_options [list overrides [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000]] set options [concat $cluster_options $options] # Cluster mode only supports a single database, so before executing the tests diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 6cc846b97..a23224bd7 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -418,7 +418,7 @@ proc start_server {options {code undefined}} { set baseconfig $value } "overrides" { - set overrides $value + set overrides [concat $overrides $value] } "config_lines" { set config_lines $value diff --git a/tests/unit/cluster/hostnames.tcl b/tests/unit/cluster/hostnames.tcl index 02fb83615..031310172 100644 --- a/tests/unit/cluster/hostnames.tcl +++ b/tests/unit/cluster/hostnames.tcl @@ -42,8 +42,9 @@ proc get_slot_field {slot_output shard_id node_id attrib_id} { return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id] } -# Start a 
cluster with 3 masters and 4 replicas. -start_cluster 3 4 {tags {external:skip cluster}} { +# Start a cluster with 3 masters and 4 replicas. +# These tests rely on specific node ordering, so make sure no node fails over. +start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} { test "Set cluster hostnames and verify they are propagated" { for {set j 0} {$j < [llength $::servers]} {incr j} { R $j config set cluster-announce-hostname "host-$j.com" @@ -202,7 +203,9 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1 R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1 - wait_for_condition 50 100 { + # This operation sometimes spikes to around 5 seconds to resolve the state, + # so it has a higher timeout. + wait_for_condition 50 500 { [llength [R 6 CLUSTER SLOTS]] eq 3 } else { fail "Node did not learn about the 2 shards it can talk to" @@ -220,10 +223,6 @@ test "Test restart will keep hostname information" { # Store the hostname in the config R 0 config rewrite - # If the primary is slow to reboot it might get demoted, so prevent the replica - # from nominating itself. - R 3 config set cluster-replica-no-failover yes - restart_server 0 true false set slot_result [R 0 CLUSTER SLOTS] assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "restart-1.com"