diff --git a/src/replication.c b/src/replication.c index 99e73285d..e1d39582c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1731,6 +1731,18 @@ void readSyncBulkPayload(connection *conn) { * dictionaries. */ diskless_load_backup = disklessLoadMakeBackup(); } + + /* Replica starts to apply data from new master, we must discard the cached + * master structure. */ + serverAssert(server.master == NULL); + replicationDiscardCachedMaster(); + + /* We want our slaves to resync with us as well, if we have any sub-slaves. + * The master already transferred us an entirely different data set and we + * have no way to incrementally feed our slaves after that. */ + disconnectSlaves(); /* Force our slaves to resync with us as well. */ + freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */ + /* We call to emptyDb even in case of REPL_DISKLESS_LOAD_SWAPDB * (Where disklessLoadMakeBackup left server.db empty) because we * want to execute all the auxiliary logic of emptyDb (Namely, @@ -2140,8 +2152,6 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) { server.master_replid, server.master_initial_offset); } - /* We are going to full resync, discard the cached master structure. */ - replicationDiscardCachedMaster(); sdsfree(reply); return PSYNC_FULLRESYNC; } @@ -2221,7 +2231,6 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) { "error state (reply: %s)", reply); } sdsfree(reply); - replicationDiscardCachedMaster(); return PSYNC_NOT_SUPPORTED; } @@ -2463,13 +2472,6 @@ void syncWithMaster(connection *conn) { return; } - /* PSYNC failed or is not supported: we want our slaves to resync with us - * as well, if we have any sub-slaves. The master may transfer us an - * entirely different data set and we have no way to incrementally feed - * our slaves after that. */ - disconnectSlaves(); /* Force our slaves to resync with us as well. */ - freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */ - /* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC * and the server.master_replid and master_initial_offset are * already populated. */ @@ -2631,9 +2633,12 @@ void replicationSetMaster(char *ip, int port) { /* Update oom_score_adj */ setOOMScoreAdj(-1); - /* Force our slaves to resync with us as well. They may hopefully be able - * to partially resync with us, but we can notify the replid change. */ - disconnectSlaves(); + /* Here we don't disconnect with replicas, since they may hopefully be able + * to partially resync with us. We will disconnect with replicas and force + * them to resync with us when changing replid on partially resync with new + * master, or finishing transferring RDB and preparing loading DB on full + * sync with new master. */ + cancelReplicationHandshake(0); /* Before destroying our master state, create a cached master using * our own parameters, to later PSYNC with the new master. */ diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index c0f0240ff..ceb587ff6 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -926,3 +926,71 @@ test {Kill rdb child process if its dumping RDB is not useful} { } } } {} {external:skip} + +start_server {tags {"repl external:skip"}} { + set master1_host [srv 0 host] + set master1_port [srv 0 port] + r set a b + + start_server {} { + set master2 [srv 0 client] + set master2_host [srv 0 host] + set master2_port [srv 0 port] + # Take 10s for dumping RDB + $master2 debug populate 10 master2 10 + $master2 config set rdb-key-save-delay 1000000 + + start_server {} { + set sub_replica [srv 0 client] + + start_server {} { + # Full sync with master1 + r slaveof $master1_host $master1_port + wait_for_sync r + assert_equal "b" [r get a] + + # Let sub replicas sync with me + $sub_replica slaveof [srv 0 host] [srv 0 port] + wait_for_sync $sub_replica + assert_equal "b" [$sub_replica get a] + + # Full sync with master2, and then kill master2 before finishing dumping RDB + r slaveof $master2_host $master2_port + wait_for_condition 50 100 { + ([s -2 rdb_bgsave_in_progress] == 1) && + ([string match "*wait_bgsave*" [s -2 slave0]]) + } else { + fail "full sync didn't start" + } + catch {$master2 shutdown nosave} + + test {Don't disconnect with replicas before loading transferred RDB when full sync} { + assert ![log_file_matches [srv -1 stdout] "*Connection with master lost*"] + # The replication id is not changed in entire replication chain + assert_equal [s master_replid] [s -3 master_replid] + assert_equal [s master_replid] [s -1 master_replid] + } + + test {Discard cache master before loading transferred RDB when full sync} { + set full_sync [s -3 sync_full] + set partial_sync [s -3 sync_partial_ok] + # Partial sync with master1 + r slaveof $master1_host $master1_port + wait_for_sync r + # master1 accepts partial sync instead of full sync + assert_equal $full_sync [s -3 sync_full] + assert_equal [expr $partial_sync+1] [s -3 sync_partial_ok] + + # Since master only partially sync replica, and repl id is not changed, + # the replica doesn't disconnect with its sub-replicas + assert_equal [s master_replid] [s -3 master_replid] + assert_equal [s master_replid] [s -1 master_replid] + assert ![log_file_matches [srv -1 stdout] "*Connection with master lost*"] + # Sub replica just has one full sync, no partial resync. + assert_equal 1 [s sync_full] + assert_equal 0 [s sync_partial_ok] + } + } + } + } +}