diff --git a/src/commands.def b/src/commands.def index 5a7545e32..fb7de3cce 100644 --- a/src/commands.def +++ b/src/commands.def @@ -5891,7 +5891,9 @@ struct COMMAND_ARG SENTINEL_DEBUG_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* SENTINEL FAILOVER history */ -#define SENTINEL_FAILOVER_History NULL +commandHistory SENTINEL_FAILOVER_History[] = { +{"8.0.0","Add safe failover option."}, +}; #endif #ifndef SKIP_CMD_TIPS_TABLE @@ -5907,6 +5909,7 @@ struct COMMAND_ARG SENTINEL_DEBUG_Args[] = { /* SENTINEL FAILOVER argument table */ struct COMMAND_ARG SENTINEL_FAILOVER_Args[] = { {MAKE_ARG("master-name",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAFE",NULL,"8.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /********** SENTINEL FLUSHCONFIG ********************/ @@ -6283,7 +6286,7 @@ struct COMMAND_STRUCT SENTINEL_Subcommands[] = { {MAKE_CMD("ckquorum","Checks for a Redis Sentinel quorum.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_CKQUORUM_History,0,SENTINEL_CKQUORUM_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_CKQUORUM_Keyspecs,0,NULL,1),.args=SENTINEL_CKQUORUM_Args}, {MAKE_CMD("config","Configures Redis Sentinel.","O(N) when N is the number of configuration parameters provided","6.2.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_CONFIG_History,1,SENTINEL_CONFIG_Tips,0,sentinelCommand,-4,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_CONFIG_Keyspecs,0,NULL,1),.args=SENTINEL_CONFIG_Args}, {MAKE_CMD("debug","Lists or updates the current configurable parameters of Redis Sentinel.","O(N) where N is the number of configurable parameters","7.0.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_DEBUG_History,0,SENTINEL_DEBUG_Tips,0,sentinelCommand,-2,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_DEBUG_Keyspecs,0,NULL,1),.args=SENTINEL_DEBUG_Args}, -{MAKE_CMD("failover","Forces a Redis Sentinel 
failover.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FAILOVER_History,0,SENTINEL_FAILOVER_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FAILOVER_Keyspecs,0,NULL,1),.args=SENTINEL_FAILOVER_Args}, +{MAKE_CMD("failover","Forces a Redis Sentinel failover.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FAILOVER_History,1,SENTINEL_FAILOVER_Tips,0,sentinelCommand,-3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FAILOVER_Keyspecs,0,NULL,2),.args=SENTINEL_FAILOVER_Args}, {MAKE_CMD("flushconfig","Rewrites the Redis Sentinel configuration file.","O(1)","2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FLUSHCONFIG_History,0,SENTINEL_FLUSHCONFIG_Tips,0,sentinelCommand,2,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FLUSHCONFIG_Keyspecs,0,NULL,0)}, {MAKE_CMD("get-master-addr-by-name","Returns the port and address of a master Redis instance.","O(1)","2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_GET_MASTER_ADDR_BY_NAME_History,0,SENTINEL_GET_MASTER_ADDR_BY_NAME_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_GET_MASTER_ADDR_BY_NAME_Keyspecs,0,NULL,1),.args=SENTINEL_GET_MASTER_ADDR_BY_NAME_Args}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_HELP_History,0,SENTINEL_HELP_Tips,0,sentinelCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_HELP_Keyspecs,0,NULL,0)}, diff --git a/src/commands/sentinel-failover.json b/src/commands/sentinel-failover.json index 8f5037589..0157cddfd 100644 --- a/src/commands/sentinel-failover.json +++ b/src/commands/sentinel-failover.json @@ -3,9 +3,15 @@ "summary": "Forces a Redis Sentinel failover.", "group": "sentinel", "since": "2.8.4", - "arity": 3, + "arity": -3, "container": "SENTINEL", "function": "sentinelCommand", + "history": [ + 
[ + "8.0.0", + "Add safe failover option." + ] + ], "command_flags": [ "ADMIN", "SENTINEL", @@ -13,12 +19,19 @@ ], "reply_schema": { "const": "OK", - "description": "Force a fail over as if the master was not reachable, and without asking for agreement to other Sentinels." + "description": "Force a fail over as if the master was not reachable, and without asking for agreement to other Sentinels. The SAFE option will attempt to perform a fail over in a more safe manner, avoiding a full resynchronization." }, "arguments": [ { "name": "master-name", "type": "string" + }, + { + "token": "SAFE", + "name": "safe", + "type": "pure-token", + "optional": true, + "since": "8.0.0" } ] } diff --git a/src/sentinel.c b/src/sentinel.c index 51df378bf..cff7c6707 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -57,6 +57,7 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<11) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */ #define SRI_MASTER_REBOOT (1<<13) /* Master was detected as rebooting */ +#define SRI_SAFE_FAILOVER (1<<14) /* Force failover in a more safe way. */ /* Note: when adding new flags, please check the flags section in addReplySentinelRedisInstance. */ /* Note: times are in milliseconds. */ @@ -75,6 +76,7 @@ static mstime_t sentinel_election_timeout = 10000; static mstime_t sentinel_script_max_runtime = 60000; /* 60 seconds max exec time. */ static mstime_t sentinel_script_retry_delay = 30000; /* 30 seconds between retries. */ static mstime_t sentinel_default_failover_timeout = 60*3*1000; +static mstime_t sentinel_default_safe_failover_timeout = 60000; /* default 1 minute safe failover time. 
*/ #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" #define SENTINEL_DEFAULT_SLAVE_PRIORITY 100 @@ -91,9 +93,10 @@ static mstime_t sentinel_default_failover_timeout = 60*3*1000; #define SENTINEL_FAILOVER_STATE_WAIT_START 1 /* Wait for failover_start_time*/ #define SENTINEL_FAILOVER_STATE_SELECT_SLAVE 2 /* Select slave to promote */ #define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* Slave -> Master */ -#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait slave to change role */ -#define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* SLAVEOF newmaster */ -#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6 /* Monitor promoted slave. */ +#define SENTINEL_FAILOVER_STATE_SEND_FAILOVER 4 /* Safe failover only: send FAILOVER to the master */ +#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 5 /* Wait slave to change role */ +#define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 6 /* SLAVEOF newmaster */ +#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 7 /* Monitor promoted slave. */ #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 @@ -223,6 +226,7 @@ typedef struct sentinelRedisInstance { mstime_t failover_timeout; /* Max time to refresh failover state. */ mstime_t failover_delay_logged; /* For what failover_start_time value we logged the failover delay. */ + mstime_t safe_failover_timeout; /* Safe failover: milliseconds sent as the FAILOVER TIMEOUT value. */ struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. */ /* Scripts executed to notify admin or reconfigure clients: when they * are set to NULL no script is executed. 
*/ @@ -1338,6 +1342,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->failover_state_change_time = 0; ri->failover_start_time = 0; ri->failover_timeout = sentinel_default_failover_timeout; + ri->safe_failover_timeout = sentinel_default_safe_failover_timeout; ri->failover_delay_logged = 0; ri->promoted_slave = NULL; ri->notification_script = NULL; @@ -1529,6 +1534,7 @@ void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { ri->failover_state_change_time = 0; ri->failover_start_time = 0; /* We can failover again ASAP. */ ri->promoted_slave = NULL; + ri->safe_failover_timeout = sentinel_default_safe_failover_timeout; sdsfree(ri->runid); sdsfree(ri->slave_master_host); ri->runid = NULL; @@ -3369,6 +3375,7 @@ const char *sentinelFailoverStateStr(int state) { case SENTINEL_FAILOVER_STATE_WAIT_START: return "wait_start"; case SENTINEL_FAILOVER_STATE_SELECT_SLAVE: return "select_slave"; case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: return "send_slaveof_noone"; + case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: return "send_failover"; case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion"; case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves"; case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config"; @@ -4001,12 +4008,26 @@ NULL addReplyBulkLongLong(c,addr->port); } } else if (!strcasecmp(c->argv[1]->ptr,"failover")) { - /* SENTINEL FAILOVER */ + /* SENTINEL FAILOVER [SAFE [timeout]]*/ sentinelRedisInstance *ri; + int safeMode = 0; + long long safeFailOverTimeout = sentinel_default_safe_failover_timeout; - if (c->argc != 3) goto numargserr; + if (c->argc < 3 || c->argc > 5) goto numargserr; if ((ri = sentinelGetMasterByNameOrReplyError(c,c->argv[2])) == NULL) return; + if (c->argc > 3) { + if (strcasecmp(c->argv[3]->ptr, "safe") != 0) { + addReplyError(c, "Unknown failover option specified"); + return; + } + safeMode = 1; + if (c->argc == 5 && 
(getLongLongFromObject(c->argv[4],&safeFailOverTimeout) == C_ERR + || safeFailOverTimeout <= 0)) { + addReplyError(c,"Invalid failover timeout specified"); + return; + } + } if (ri->flags & SRI_FAILOVER_IN_PROGRESS) { addReplyError(c,"-INPROG Failover already in progress"); return; @@ -4019,6 +4040,10 @@ NULL ri->name); sentinelStartFailover(ri); ri->flags |= SRI_FORCE_FAILOVER; + if (safeMode) { + ri->flags |= SRI_SAFE_FAILOVER; + ri->safe_failover_timeout = safeFailOverTimeout; + } addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) { /* SENTINEL PENDING-SCRIPTS */ @@ -4924,6 +4949,26 @@ int sentinelSendSlaveOf(sentinelRedisInstance *ri, const sentinelAddr *addr) { return C_OK; } +/* Send FAILOVER TO <host> <port> FORCE TIMEOUT <ms> to 'master', asking it to + * hand over to the replica at 'addr'. 'timeout' is in milliseconds (64 bit, so + * sent as %lld). Returns C_OK if queued for (later) delivery, otherwise C_ERR. */ +int sentinelSendFailover(sentinelRedisInstance *master, const sentinelAddr *addr, mstime_t timeout) { + char portstr[32]; + const char *host; + int retval; + + host = announceSentinelAddr(addr); + ll2string(portstr,sizeof(portstr),addr->port); + + retval = redisAsyncCommand(master->link->cc, sentinelDiscardReplyCallback, master, + "%s TO %s %s FORCE TIMEOUT %lld", sentinelInstanceMapCommand(master, "FAILOVER"), + host, portstr, (long long)timeout); + if (retval == C_ERR) return retval; + master->link->pending_commands++; + + return C_OK; +} + /* Setup the master state to start a failover. 
*/ void sentinelStartFailover(sentinelRedisInstance *master) { serverAssert(master->flags & SRI_MASTER); @@ -5130,10 +5175,15 @@ void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { sentinelEvent(LL_WARNING,"+selected-slave",slave,"%@"); slave->flags |= SRI_PROMOTED; ri->promoted_slave = slave; - ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE; ri->failover_state_change_time = mstime(); - sentinelEvent(LL_NOTICE,"+failover-state-send-slaveof-noone", - slave, "%@"); + if (ri->flags & SRI_SAFE_FAILOVER) { + ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_FAILOVER; + sentinelEvent(LL_WARNING,"+failover-state-send-failover", ri,"%@"); + } else { + ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE; + sentinelEvent(LL_NOTICE,"+failover-state-send-slaveof-noone", + slave, "%@"); + } } } @@ -5163,6 +5213,32 @@ void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { ri->failover_state_change_time = mstime(); } +void sentinelFailoverSendFailover(sentinelRedisInstance *ri) { + int retval; + + /* FAILOVER is sent over the master link and targets the promoted slave, so + * both links must be connected. Otherwise retry in this state until the + * failover timeout is reached, then abort the failover. */ + if (ri->link->disconnected || ri->promoted_slave->link->disconnected) { + if (mstime() - ri->failover_state_change_time > ri->failover_timeout) { + sentinelEvent(LL_WARNING,"-failover-abort-slave-timeout",ri,"%@"); + sentinelAbortFailover(ri); + } + return; + } + + /* Send FAILOVER command to turn the master into a slave. + * We actually register a generic callback for this command as we don't + * really care about the reply. We check if it worked indirectly observing + * if INFO returns a different role (master instead of slave). 
*/ + retval = sentinelSendFailover(ri, ri->promoted_slave->addr, ri->safe_failover_timeout); + if (retval != C_OK) return; + sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion", + ri->promoted_slave,"%@"); + ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION; + ri->failover_state_change_time = mstime(); +} + /* We actually wait for promotion indirectly checking with INFO when the * slave turns into a master. */ void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) { @@ -5323,6 +5399,9 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: sentinelFailoverSendSlaveOfNoOne(ri); break; + case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: + sentinelFailoverSendFailover(ri); + break; case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: sentinelFailoverWaitPromotion(ri); break; @@ -5341,7 +5420,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { serverAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); serverAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); + ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER|SRI_SAFE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { diff --git a/tests/sentinel/tests/15-config-set-config-get.tcl b/tests/sentinel/tests/15-config-set-config-get.tcl index f9831f8e8..ba947bc65 100644 --- a/tests/sentinel/tests/15-config-set-config-get.tcl +++ b/tests/sentinel/tests/15-config-set-config-get.tcl @@ -56,3 +56,10 @@ test "SENTINEL CONFIG SET, wrong number of arguments" { fail "Expected to return Missing argument error" } } + +test "Undo SENTINEL CONFIG SET changes" { + foreach_sentinel_id id { + assert_equal {OK} [S $id SENTINEL CONFIG SET resolve-hostnames no announce-port 0] + } + assert_match {*no*0*} [S 1 SENTINEL CONFIG GET resolve-hostnames announce-port] +} \ No newline at end of file diff --git 
a/tests/sentinel/tests/16-manual-safe.tcl b/tests/sentinel/tests/16-manual-safe.tcl new file mode 100644 index 000000000..968dcdcf8 --- /dev/null +++ b/tests/sentinel/tests/16-manual-safe.tcl @@ -0,0 +1,83 @@ +# Test manual safe failover + +source "../tests/includes/init-tests.tcl" + +foreach_sentinel_id id { + S $id sentinel debug info-period 2000 + S $id sentinel debug default-down-after 6000 + S $id sentinel debug publish-period 1000 +} + +set loop_counter 0 + +test "Manual safe failover works" { + set old_port [RPort $master_id] + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + assert {[lindex $addr 1] == $old_port} + + # Enable the repl-read-only configuration of the master node. + R $master_id config set replica-read-only yes + + R $master_id set counter 0 + + # Perform a safe failover. + catch {S 0 SENTINEL FAILOVER mymaster safe} reply + assert {$reply eq "OK"} + + while {1} { + catch {R $master_id incr counter} reply + if {[string match "*READONLY*" $reply]} { + break + } + incr loop_counter + } + + set old_master_counter [R $master_id get counter] + assert {$old_master_counter == $loop_counter} + + # Wait for all Sentinel nodes to update the master node information. 
+ foreach_sentinel_id id { + wait_for_condition 1000 50 { + [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port + } else { + fail "At least one Sentinel did not receive failover info" + } + } + + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port redis [lindex $addr 1]] +} + +test "safe failover: Check data consistency" { + set master_counter [R $master_id get counter] + foreach_redis_id id { + if {$id != $master_id} { + set slave_counter [R $id get counter] + assert {$slave_counter == $master_counter} + } + } +} + +test "safe failover: New master [join $addr {:}] role matches" { + assert {[RI $master_id role] eq {master}} +} + +test "safe failover: All the other slaves now point to the new master" { + foreach_redis_id id { + if {$id != $master_id && $id != 0} { + wait_for_condition 1000 50 { + [RI $id master_port] == [lindex $addr 1] + } else { + fail "Redis ID $id not configured to replicate with new master" + } + } + } +} + +test "safe failover: The old master eventually gets reconfigured as a slave" { + wait_for_condition 1000 50 { + [RI 0 master_port] == [lindex $addr 1] + } else { + fail "Old master not reconfigured as slave of new master" + } +} \ No newline at end of file