Stabilize cluster hostnames tests (#11307)

This PR introduces a couple of changes to improve cluster test stability: 1. Increase the cluster node timeout to 3 seconds, which is similar to the normal cluster tests, and introduce a new config, `cluster-ping-interval`, that overrides the default ping period (half the node timeout) so that nodes still ping each other frequently and the tests stay fast. This new config is hidden and intended only for debugging and testing. 2. Set `cluster-replica-no-failover yes` on a wider array of tests which are sensitive to failovers. Such unwanted failovers were occurring on the ARM CI.
2022-10-02 23:25:16 -07:00 · 2022-10-02 23:25:16 -07:00 · 663fbd3459
parent a549b78c48
commit 663fbd3459
6 changed files with 13 additions and 10 deletions
--- a/src/cluster.c
+++ b/src/cluster.c
@ -4198,9 +4198,11 @@ void clusterCron(void) {
         * received PONG is older than half the cluster timeout, send
         * a new ping now, to ensure all the nodes are pinged without
         * a too big delay. */
+        mstime_t ping_interval = server.cluster_ping_interval ? 
+            server.cluster_ping_interval : server.cluster_node_timeout/2;
        if (node->link &&
            node->ping_sent == 0 &&
-            (now - node->pong_received) > server.cluster_node_timeout/2)
+            (now - node->pong_received) > ping_interval)
        {
            clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
            continue;
--- a/src/config.c
+++ b/src/config.c
@ -3116,6 +3116,7 @@ standardConfig static_configs[] = {
    /* Long Long configs */
    createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL),/* milliseconds */
    createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL),
+    createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL),
    createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL),
    createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL),
    createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024*1024, LONG_MAX, server.proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */
--- a/src/server.h
+++ b/src/server.h
@ -1852,6 +1852,7 @@ struct redisServer {
    int cluster_enabled;      /* Is cluster enabled? */
    int cluster_port;         /* Set the cluster port for a node. */
    mstime_t cluster_node_timeout; /* Cluster node timeout. */
+    mstime_t cluster_ping_interval;    /* A debug configuration for setting how often cluster nodes send ping messages. */
    char *cluster_configfile; /* Cluster auto-generated config file name. */
    struct clusterState *cluster;  /* State of the cluster */
    int cluster_migration_barrier; /* Cluster replicas migration barrier. */
--- a/tests/support/cluster_helper.tcl
+++ b/tests/support/cluster_helper.tcl
@ -103,7 +103,7 @@ proc start_cluster {masters replicas options code {slot_allocator continuous_slo

    # Configure the starting of multiple servers. Set cluster node timeout
    # aggressively since many tests depend on ping/pong messages. 
-    set cluster_options [list overrides [list cluster-enabled yes cluster-node-timeout 500]]
+    set cluster_options [list overrides [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000]]
    set options [concat $cluster_options $options]

    # Cluster mode only supports a single database, so before executing the tests
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@ -418,7 +418,7 @@ proc start_server {options {code undefined}} {
                set baseconfig $value
            }
            "overrides" {
-                set overrides $value
+                set overrides [concat $overrides $value]
            }
            "config_lines" {
                set config_lines $value
--- a/tests/unit/cluster/hostnames.tcl
+++ b/tests/unit/cluster/hostnames.tcl
@ -42,8 +42,9 @@ proc get_slot_field {slot_output shard_id node_id attrib_id} {
    return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id]
 }

-# Start a cluster with 3 masters and 4 replicas. 
-start_cluster 3 4 {tags {external:skip cluster}} {
+# Start a cluster with 3 masters and 4 replicas.
+# These tests rely on specific node ordering, so make sure no node fails over.
+start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
 test "Set cluster hostnames and verify they are propagated" {
    for {set j 0} {$j < [llength $::servers]} {incr j} {
        R $j config set cluster-announce-hostname "host-$j.com"
@ -202,7 +203,9 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
    R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1
    R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1

-    wait_for_condition 50 100 {
+    # This operation sometimes spikes to around 5 seconds to resolve the state,
+    # so it has a higher timeout. 
+    wait_for_condition 50 500 {
        [llength [R 6 CLUSTER SLOTS]] eq 3
    } else {
        fail "Node did not learn about the 2 shards it can talk to"
@ -220,10 +223,6 @@ test "Test restart will keep hostname information" {
    # Store the hostname in the config
    R 0 config rewrite

-    # If the primary is slow to reboot it might get demoted, so prevent the replica
-    # from nominating itself.
-    R 3 config set cluster-replica-no-failover yes
-
    restart_server 0 true false
    set slot_result [R 0 CLUSTER SLOTS]
    assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "restart-1.com"