mirror of https://mirror.osredm.com/root/redis.git
Sentinel master reboot fix (#9438)
Add master-reboot-down-after-period as a configurable parameter, to make it possible to trigger a failover from a master that is responding with `-LOADING` for a long time after being restarted.
This commit is contained in:
parent
af072c26bc
commit
2afa41f628
|
@ -339,3 +339,13 @@ SENTINEL resolve-hostnames no
|
||||||
# to retain the hostnames when announced, enable announce-hostnames below.
|
# to retain the hostnames when announced, enable announce-hostnames below.
|
||||||
#
|
#
|
||||||
SENTINEL announce-hostnames no
|
SENTINEL announce-hostnames no
|
||||||
|
|
||||||
|
# When master-reboot-down-after-period is set to 0, Sentinel does not fail over
|
||||||
|
# when receiving a -LOADING response from a master. This was the only supported
|
||||||
|
# behavior before version 7.0.
|
||||||
|
#
|
||||||
|
# Otherwise, Sentinel will use this value as the time (in ms) it is willing to
|
||||||
|
# accept a -LOADING response after a master has been rebooted, before failing
|
||||||
|
# over.
|
||||||
|
|
||||||
|
SENTINEL master-reboot-down-after-period mymaster 0
|
||||||
|
|
|
@ -76,6 +76,7 @@ typedef struct sentinelAddr {
|
||||||
#define SRI_RECONF_DONE (1<<10) /* Slave synchronized with new master. */
|
#define SRI_RECONF_DONE (1<<10) /* Slave synchronized with new master. */
|
||||||
#define SRI_FORCE_FAILOVER (1<<11) /* Force failover with master up. */
|
#define SRI_FORCE_FAILOVER (1<<11) /* Force failover with master up. */
|
||||||
#define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */
|
#define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */
|
||||||
|
#define SRI_MASTER_REBOOT (1<<13) /* Master was detected as rebooting */
|
||||||
|
|
||||||
/* Note: times are in milliseconds. */
|
/* Note: times are in milliseconds. */
|
||||||
#define SENTINEL_PING_PERIOD 1000
|
#define SENTINEL_PING_PERIOD 1000
|
||||||
|
@ -193,6 +194,8 @@ typedef struct sentinelRedisInstance {
|
||||||
mstime_t s_down_since_time; /* Subjectively down since time. */
|
mstime_t s_down_since_time; /* Subjectively down since time. */
|
||||||
mstime_t o_down_since_time; /* Objectively down since time. */
|
mstime_t o_down_since_time; /* Objectively down since time. */
|
||||||
mstime_t down_after_period; /* Consider it down after that period. */
|
mstime_t down_after_period; /* Consider it down after that period. */
|
||||||
|
mstime_t master_reboot_down_after_period; /* Consider master down after that period. */
|
||||||
|
mstime_t master_reboot_since_time; /* master reboot time since time. */
|
||||||
mstime_t info_refresh; /* Time at which we received INFO output from it. */
|
mstime_t info_refresh; /* Time at which we received INFO output from it. */
|
||||||
dict *renamed_commands; /* Commands renamed in this instance:
|
dict *renamed_commands; /* Commands renamed in this instance:
|
||||||
Sentinel will use the alternative commands
|
Sentinel will use the alternative commands
|
||||||
|
@ -1294,8 +1297,8 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *
|
||||||
ri->last_master_down_reply_time = mstime();
|
ri->last_master_down_reply_time = mstime();
|
||||||
ri->s_down_since_time = 0;
|
ri->s_down_since_time = 0;
|
||||||
ri->o_down_since_time = 0;
|
ri->o_down_since_time = 0;
|
||||||
ri->down_after_period = master ? master->down_after_period :
|
ri->down_after_period = master ? master->down_after_period : sentinel_default_down_after;
|
||||||
sentinel_default_down_after;
|
ri->master_reboot_down_after_period = 0;
|
||||||
ri->master_link_down_time = 0;
|
ri->master_link_down_time = 0;
|
||||||
ri->auth_pass = NULL;
|
ri->auth_pass = NULL;
|
||||||
ri->auth_user = NULL;
|
ri->auth_user = NULL;
|
||||||
|
@ -1971,6 +1974,13 @@ const char *sentinelHandleConfiguration(char **argv, int argc) {
|
||||||
if ((sentinel.announce_hostnames = yesnotoi(argv[1])) == -1) {
|
if ((sentinel.announce_hostnames = yesnotoi(argv[1])) == -1) {
|
||||||
return "Please specify yes or no for the announce-hostnames option.";
|
return "Please specify yes or no for the announce-hostnames option.";
|
||||||
}
|
}
|
||||||
|
} else if (!strcasecmp(argv[0],"master-reboot-down-after-period") && argc == 3) {
|
||||||
|
/* master-reboot-down-after-period <name> <milliseconds> */
|
||||||
|
ri = sentinelGetMasterByName(argv[1]);
|
||||||
|
if (!ri) return "No such master with specified name.";
|
||||||
|
ri->master_reboot_down_after_period = atoi(argv[2]);
|
||||||
|
if (ri->master_reboot_down_after_period < 0)
|
||||||
|
return "negative time parameter.";
|
||||||
} else {
|
} else {
|
||||||
return "Unrecognized sentinel configuration statement.";
|
return "Unrecognized sentinel configuration statement.";
|
||||||
}
|
}
|
||||||
|
@ -2090,6 +2100,15 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) {
|
||||||
/* rewriteConfigMarkAsProcessed is handled after the loop */
|
/* rewriteConfigMarkAsProcessed is handled after the loop */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* sentinel master-reboot-down-after-period */
|
||||||
|
if (master->master_reboot_down_after_period != 0) {
|
||||||
|
line = sdscatprintf(sdsempty(),
|
||||||
|
"sentinel master-reboot-down-after-period %s %ld",
|
||||||
|
master->name, (long) master->master_reboot_down_after_period);
|
||||||
|
rewriteConfigRewriteLine(state,"sentinel master-reboot-down-after-period",line,1);
|
||||||
|
/* rewriteConfigMarkAsProcessed is handled after the loop */
|
||||||
|
}
|
||||||
|
|
||||||
/* sentinel config-epoch */
|
/* sentinel config-epoch */
|
||||||
line = sdscatprintf(sdsempty(),
|
line = sdscatprintf(sdsempty(),
|
||||||
"sentinel config-epoch %s %llu",
|
"sentinel config-epoch %s %llu",
|
||||||
|
@ -2214,6 +2233,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) {
|
||||||
rewriteConfigMarkAsProcessed(state,"sentinel known-replica");
|
rewriteConfigMarkAsProcessed(state,"sentinel known-replica");
|
||||||
rewriteConfigMarkAsProcessed(state,"sentinel known-sentinel");
|
rewriteConfigMarkAsProcessed(state,"sentinel known-sentinel");
|
||||||
rewriteConfigMarkAsProcessed(state,"sentinel rename-command");
|
rewriteConfigMarkAsProcessed(state,"sentinel rename-command");
|
||||||
|
rewriteConfigMarkAsProcessed(state,"sentinel master-reboot-down-after-period");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This function uses the config rewriting Redis engine in order to persist
|
/* This function uses the config rewriting Redis engine in order to persist
|
||||||
|
@ -2456,6 +2476,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
|
||||||
} else {
|
} else {
|
||||||
if (strncmp(ri->runid,l+7,40) != 0) {
|
if (strncmp(ri->runid,l+7,40) != 0) {
|
||||||
sentinelEvent(LL_NOTICE,"+reboot",ri,"%@");
|
sentinelEvent(LL_NOTICE,"+reboot",ri,"%@");
|
||||||
|
|
||||||
|
if (ri->flags & SRI_MASTER && ri->master_reboot_down_after_period != 0) {
|
||||||
|
ri->flags |= SRI_MASTER_REBOOT;
|
||||||
|
ri->master_reboot_since_time = mstime();
|
||||||
|
}
|
||||||
|
|
||||||
sdsfree(ri->runid);
|
sdsfree(ri->runid);
|
||||||
ri->runid = sdsnewlen(l+7,40);
|
ri->runid = sdsnewlen(l+7,40);
|
||||||
}
|
}
|
||||||
|
@ -2723,6 +2749,10 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata
|
||||||
{
|
{
|
||||||
link->last_avail_time = mstime();
|
link->last_avail_time = mstime();
|
||||||
link->act_ping_time = 0; /* Flag the pong as received. */
|
link->act_ping_time = 0; /* Flag the pong as received. */
|
||||||
|
|
||||||
|
if (ri->flags & SRI_MASTER_REBOOT && strncmp(r->str,"PONG",4) == 0)
|
||||||
|
ri->flags &= ~SRI_MASTER_REBOOT;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
/* Send a SCRIPT KILL command if the instance appears to be
|
/* Send a SCRIPT KILL command if the instance appears to be
|
||||||
* down because of a busy script. */
|
* down because of a busy script. */
|
||||||
|
@ -4255,6 +4285,15 @@ void sentinelSetCommand(client *c) {
|
||||||
dictAdd(ri->renamed_commands,oldname,newname);
|
dictAdd(ri->renamed_commands,oldname,newname);
|
||||||
}
|
}
|
||||||
changes++;
|
changes++;
|
||||||
|
} else if (!strcasecmp(option,"master-reboot-down-after-period") && moreargs > 0) {
|
||||||
|
/* master-reboot-down-after-period <milliseconds> */
|
||||||
|
robj *o = c->argv[++j];
|
||||||
|
if (getLongLongFromObject(o,&ll) == C_ERR || ll < 0) {
|
||||||
|
badarg = j;
|
||||||
|
goto badfmt;
|
||||||
|
}
|
||||||
|
ri->master_reboot_down_after_period = ll;
|
||||||
|
changes++;
|
||||||
} else {
|
} else {
|
||||||
addReplyErrorFormat(c,"Unknown option or number of arguments for "
|
addReplyErrorFormat(c,"Unknown option or number of arguments for "
|
||||||
"SENTINEL SET '%s'", option);
|
"SENTINEL SET '%s'", option);
|
||||||
|
@ -4358,7 +4397,9 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
|
||||||
(ri->flags & SRI_MASTER &&
|
(ri->flags & SRI_MASTER &&
|
||||||
ri->role_reported == SRI_SLAVE &&
|
ri->role_reported == SRI_SLAVE &&
|
||||||
mstime() - ri->role_reported_time >
|
mstime() - ri->role_reported_time >
|
||||||
(ri->down_after_period+sentinel_info_period*2)))
|
(ri->down_after_period+sentinel_info_period*2)) ||
|
||||||
|
(ri->flags & SRI_MASTER_REBOOT &&
|
||||||
|
mstime()-ri->master_reboot_since_time > ri->master_reboot_down_after_period))
|
||||||
{
|
{
|
||||||
/* Is subjectively down */
|
/* Is subjectively down */
|
||||||
if ((ri->flags & SRI_S_DOWN) == 0) {
|
if ((ri->flags & SRI_S_DOWN) == 0) {
|
||||||
|
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Check that Sentinel fails over a master that keeps replying -LOADING after a reboot.
|
||||||
|
source "../tests/includes/init-tests.tcl"
|
||||||
|
|
||||||
|
|
||||||
|
if {$::simulate_error} {
|
||||||
|
test "This test will fail" {
|
||||||
|
fail "Simulated error"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Restart a previously killed instance right away, without checking whether it is still loading
|
||||||
|
proc reboot_instance {type id} {
|
||||||
|
set dirname "${type}_${id}"
|
||||||
|
set cfgfile [file join $dirname $type.conf]
|
||||||
|
set port [get_instance_attrib $type $id port]
|
||||||
|
|
||||||
|
# Execute the instance with its old setup and append the new pid
|
||||||
|
# file for cleanup.
|
||||||
|
set pid [exec_instance $type $dirname $cfgfile]
|
||||||
|
set_instance_attrib $type $id pid $pid
|
||||||
|
lappend ::pids $pid
|
||||||
|
|
||||||
|
# Check that the instance is running
|
||||||
|
if {[server_is_up 127.0.0.1 $port 100] == 0} {
|
||||||
|
set logfile [file join $dirname log.txt]
|
||||||
|
puts [exec tail $logfile]
|
||||||
|
abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Connect with it with a fresh link
|
||||||
|
set link [redis 127.0.0.1 $port 0 $::tls]
|
||||||
|
$link reconnect 1
|
||||||
|
set_instance_attrib $type $id link $link
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
test "Master reboot in very short time" {
|
||||||
|
set old_port [RPort $master_id]
|
||||||
|
set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
|
||||||
|
assert {[lindex $addr 1] == $old_port}
|
||||||
|
|
||||||
|
R $master_id debug populate 10000
|
||||||
|
R $master_id bgsave
|
||||||
|
R $master_id config set key-load-delay 1500
|
||||||
|
R $master_id config set loading-process-events-interval-bytes 1024
|
||||||
|
R $master_id config rewrite
|
||||||
|
|
||||||
|
foreach_sentinel_id id {
|
||||||
|
S $id SENTINEL SET mymaster master-reboot-down-after-period 5000
|
||||||
|
S $id sentinel debug ping-period 500
|
||||||
|
S $id sentinel debug ask-period 500
|
||||||
|
}
|
||||||
|
|
||||||
|
kill_instance redis $master_id
|
||||||
|
reboot_instance redis $master_id
|
||||||
|
|
||||||
|
foreach_sentinel_id id {
|
||||||
|
wait_for_condition 1000 100 {
|
||||||
|
[lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
|
||||||
|
} else {
|
||||||
|
fail "At least one Sentinel did not receive failover info"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
|
||||||
|
set master_id [get_instance_id_by_port redis [lindex $addr 1]]
|
||||||
|
|
||||||
|
# Make sure the instance has loaded the whole dataset
|
||||||
|
while 1 {
|
||||||
|
catch {[$link ping]} retval
|
||||||
|
if {[string match {*LOADING*} $retval]} {
|
||||||
|
after 100
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test "New master [join $addr {:}] role matches" {
|
||||||
|
assert {[RI $master_id role] eq {master}}
|
||||||
|
}
|
||||||
|
|
||||||
|
test "All the other slaves now point to the new master" {
|
||||||
|
foreach_redis_id id {
|
||||||
|
if {$id != $master_id && $id != 0} {
|
||||||
|
wait_for_condition 1000 50 {
|
||||||
|
[RI $id master_port] == [lindex $addr 1]
|
||||||
|
} else {
|
||||||
|
fail "Redis ID $id not configured to replicate with new master"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test "The old master eventually gets reconfigured as a slave" {
|
||||||
|
wait_for_condition 1000 50 {
|
||||||
|
[RI 0 master_port] == [lindex $addr 1]
|
||||||
|
} else {
|
||||||
|
fail "Old master not reconfigured as slave of new master"
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue