Fixes around AOF failed rewrite rate limiting (#10582)

Changes:
1. Check the failed-rewrite time threshold only when we are actually about to trigger a rewrite,
  i.e. it should be the last condition tested, because the check has a side effect (it increases the time threshold).
  Previously, in some rare scenarios, the check could run and raise the threshold even though no rewrite was going to be triggered.
2. Do not apply the limit in the startup state (e.g. after restarting a Redis instance whose rewrites previously failed and that has many incr files).
3. The “triggered the limit” log message is now recorded only when the limited status is actually returned.
4. Remove the failure count from the log message (it could be misleading in some cases).

Co-authored-by: chenyang8094 <chenyang8094@users.noreply.github.com>
Co-authored-by: Oran Agra <oran@redislabs.com>
This commit is contained in:
judeng 2022-04-19 17:06:39 +08:00 committed by GitHub
parent 1a93804645
commit d4cbd8140b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 29 deletions

View File

@ -815,38 +815,39 @@ int openNewIncrAofForAppend(void) {
#define AOF_REWRITE_LIMITE_THRESHOLD 3
#define AOF_REWRITE_LIMITE_MAX_MINUTES 60 /* 1 hour */
/* Decides whether an automatic AOF rewrite must be postponed because earlier
 * rewrites kept failing. Returns 1 while the rewrite is rate limited and 0
 * when it may proceed.
 *
 * NOTE(review): this listing is a flattened diff hunk, so lines of the old
 * implementation (limit / limit_delay_minutes) and of the new one
 * (next_delay_minutes) appear interleaved; it is not compilable as shown. */
int aofRewriteLimited(void) {
int limit = 0;
static int limit_delay_minutes = 0;
static int next_delay_minutes = 0;
static time_t next_rewrite_time = 0;
/* If the number of incr AOFs exceeds the threshold but server.aof_lastbgrewrite_status is OK, it
* means that redis may have just loaded a dataset containing many incr AOFs. At this time, we
* will not limit the AOFRW. */
unsigned long incr_aof_num = listLength(server.aof_manifest->incr_aof_list);
if (incr_aof_num >= AOF_REWRITE_LIMITE_THRESHOLD) {
if (server.unixtime < next_rewrite_time) {
limit = 1;
} else {
if (limit_delay_minutes == 0) {
limit = 1;
limit_delay_minutes = 1;
} else {
limit_delay_minutes *= 2;
}
if (limit_delay_minutes > AOF_REWRITE_LIMITE_MAX_MINUTES) {
limit_delay_minutes = AOF_REWRITE_LIMITE_MAX_MINUTES;
}
next_rewrite_time = server.unixtime + limit_delay_minutes * 60;
serverLog(LL_WARNING,
"Background AOF rewrite has repeatedly failed %ld times and triggered the limit, will retry in %d minutes",
incr_aof_num, limit_delay_minutes);
}
} else {
limit_delay_minutes = 0;
if (incr_aof_num < AOF_REWRITE_LIMITE_THRESHOLD || server.aof_lastbgrewrite_status == C_OK) {
/* We may be recovering from limited state, so reset all states. */
next_delay_minutes = 0;
next_rewrite_time = 0;
return 0;
}
return limit;
/* if it is in the limiting state, then check if the next_rewrite_time is reached */
if (next_rewrite_time != 0) {
if (server.unixtime < next_rewrite_time) {
return 1;
} else {
/* The waiting period is over: clear the deadline and report "not limited"
 * so the caller can attempt a rewrite. */
next_rewrite_time = 0;
return 0;
}
}
/* No deadline is pending: start at one minute and double the delay on every
 * new limiting round, capped at AOF_REWRITE_LIMITE_MAX_MINUTES. */
next_delay_minutes = (next_delay_minutes == 0) ? 1 : (next_delay_minutes * 2);
if (next_delay_minutes > AOF_REWRITE_LIMITE_MAX_MINUTES) {
next_delay_minutes = AOF_REWRITE_LIMITE_MAX_MINUTES;
}
/* The delay is kept in minutes; the deadline is compared against server.unixtime. */
next_rewrite_time = server.unixtime + next_delay_minutes * 60;
serverLog(LL_WARNING,
"Background AOF rewrite has repeatedly failed and triggered the limit, will retry in %d minutes", next_delay_minutes);
return 1;
}
/* ----------------------------------------------------------------------------

View File

@ -1296,13 +1296,12 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
if (server.aof_state == AOF_ON &&
!hasActiveChildProcess() &&
server.aof_rewrite_perc &&
server.aof_current_size > server.aof_rewrite_min_size &&
!aofRewriteLimited())
server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
if (growth >= server.aof_rewrite_perc && !aofRewriteLimited()) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
rewriteAppendOnlyFileBackground();
}