This commit is contained in:
Stav-Levi 2025-07-15 12:07:28 +08:00 committed by GitHub
commit 33bc30b92c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 182 additions and 8 deletions

View File

@ -1658,7 +1658,7 @@ int loadSingleAppendOnlyFile(char *filename) {
/* Clean up. Command code may have changed argv/argc so we use the
* argv/argc of the client instead of the local variables. */
freeClientArgv(fakeClient);
if (server.aof_load_truncated) valid_up_to = ftello(fp);
if (server.aof_load_truncated || server.aof_load_broken) valid_up_to = ftello(fp);
if (server.key_load_delay)
debugDelay(server.key_load_delay);
}
@ -1685,7 +1685,7 @@ readerr: /* Read error. If feof(fp) is true, fall through to unexpected EOF. */
ret = AOF_FAILED;
goto cleanup;
}
uxeof: /* Unexpected AOF end of file. */
if (server.aof_load_truncated) {
serverLog(LL_WARNING,"!!! Warning: short read while loading the AOF file %s!!!", filename);
@ -1719,8 +1719,41 @@ uxeof: /* Unexpected AOF end of file. */
goto cleanup;
fmterr: /* Format error. */
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
/* fmterr may be caused by an accidental machine shutdown, so if the broken tail
* is smaller than the configured maximum size, try to recover it automatically. */
if (server.aof_load_broken) {
if (valid_up_to == -1) {
serverLog(LL_WARNING,"Last valid command offset is invalid");
} else if (sb.st_size - valid_up_to < server.aof_load_broken_max_size) {
if (truncate(aof_filepath,valid_up_to) == -1) {
serverLog(LL_WARNING,"Error truncating the AOF file: %s",
strerror(errno));
} else {
/* Make sure the AOF file descriptor points to the end of the
* file after the truncate call. */
if (server.aof_fd != -1 && lseek(server.aof_fd,0,SEEK_END) == -1) {
serverLog(LL_WARNING,"Can't seek the end of the AOF file: %s",
strerror(errno));
} else {
serverLog(LL_WARNING,
"AOF loaded anyway because aof-load-broken is enabled and "
"broken size '%lld' is less than aof-load-broken-max-size '%lld'",
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
ret = AOF_BROKEN_RECOVERED;
goto loaded_ok;
}
}
} else { /* The size of the corrupted portion exceeds the configured limit. */
serverLog(LL_WARNING,
"AOF was not loaded because the size of the corrupted portion "
"exceeds the configured limit. aof-load-broken is enabled and broken size '%lld' "
"is bigger than aof-load-broken-max-size '%lld'",
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
}
} else {
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
}
ret = AOF_FAILED;
/* fall through to cleanup. */
@ -1794,13 +1827,13 @@ int loadAppendOnlyFiles(aofManifest *am) {
last_file = ++aof_num == total_num;
start = ustime();
ret = loadSingleAppendOnlyFile(aof_name);
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
serverLog(LL_NOTICE, "DB loaded from base file %s: %.3f seconds",
aof_name, (float)(ustime()-start)/1000000);
}
/* If a truncated (or broken and recovered) file is not the last file, we consider this to be a fatal error. */
if (ret == AOF_TRUNCATED && !last_file) {
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
ret = AOF_FAILED;
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
}
@ -1824,7 +1857,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
last_file = ++aof_num == total_num;
start = ustime();
ret = loadSingleAppendOnlyFile(aof_name);
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
serverLog(LL_NOTICE, "DB loaded from incr file %s: %.3f seconds",
aof_name, (float)(ustime()-start)/1000000);
}
@ -1834,7 +1867,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
if (ret == AOF_EMPTY) ret = AOF_OK;
/* If a truncated (or broken and recovered) file is not the last file, we consider this to be a fatal error. */
if (ret == AOF_TRUNCATED && !last_file) {
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
ret = AOF_FAILED;
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
}

View File

@ -3098,6 +3098,7 @@ standardConfig static_configs[] = {
createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL),
createBoolConfig("rdb-save-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.rdb_save_incremental_fsync, 1, NULL, NULL),
createBoolConfig("aof-load-truncated", NULL, MODIFIABLE_CONFIG, server.aof_load_truncated, 1, NULL, NULL),
createBoolConfig("aof-load-broken", NULL, MODIFIABLE_CONFIG, server.aof_load_broken, 0, NULL, NULL),
createBoolConfig("aof-use-rdb-preamble", NULL, MODIFIABLE_CONFIG, server.aof_use_rdb_preamble, 1, NULL, NULL),
createBoolConfig("aof-timestamp-enabled", NULL, MODIFIABLE_CONFIG, server.aof_timestamp_enabled, 0, NULL, NULL),
createBoolConfig("cluster-replica-no-failover", "cluster-slave-no-failover", MODIFIABLE_CONFIG, server.cluster_slave_no_failover, 0, NULL, updateClusterFlags), /* Failover by default. */
@ -3264,6 +3265,7 @@ standardConfig static_configs[] = {
createTimeTConfig("repl-backlog-ttl", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.repl_backlog_time_limit, 60*60, INTEGER_CONFIG, NULL, NULL), /* Default: 1 hour */
createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL),
createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL),
createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),
createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.tls_port, 0, INTEGER_CONFIG, NULL, applyTLSPort), /* TCP port. */
createIntConfig("tls-session-cache-size", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tls_ctx_config.session_cache_size, 20*1024, INTEGER_CONFIG, NULL, applyTlsCfg),

View File

@ -346,6 +346,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
#define AOF_OPEN_ERR 3
#define AOF_FAILED 4
#define AOF_TRUNCATED 5
#define AOF_BROKEN_RECOVERED 6
/* RDB return values for rdbLoad. */
#define RDB_OK 0
@ -2018,6 +2019,8 @@ struct redisServer {
int aof_last_write_status; /* C_OK or C_ERR */
int aof_last_write_errno; /* Valid if aof write/fsync status is ERR */
int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */
int aof_load_broken; /* Don't stop on a format error while loading the AOF (aof-load-broken). */
off_t aof_load_broken_max_size; /* The max size of a broken AOF tail that can be ignored. */
int aof_use_rdb_preamble; /* Specify base AOF to use RDB encoding on AOF rewrites. */
redisAtomic int aof_bio_fsync_status; /* Status of AOF fsync in bio job. */
redisAtomic int aof_bio_fsync_errno; /* Errno of AOF fsync in bio job. */

View File

@ -701,4 +701,140 @@ tags {"aof external:skip"} {
assert_equal {1} [r get t]
}
}
# Check the aof-load-broken behavior.
# Scenario: the base AOF holds one valid command followed by garbage bytes,
# and a second AOF file ($aof_file) with a valid command exists as well.
create_aof $aof_dirpath $aof_base_file {
append_to_aof [formatCommand set param ok]
append_to_aof "corruption"
}
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo hello]
}
# The server is started with wait_ready false and the test only inspects the
# log: it waits for both the recovery notice of the broken file and the fatal
# error reported because that file is not the last one.
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
test "Log should mention truncated file is not last" {
wait_for_log_messages 0 {
{*AOF loaded anyway because aof-load-broken is enabled*}
{*Fatal error: the truncated file is not the last file*}
} 0 10 1000
}
}
# Remove all incr AOF files so that the base file becomes the last file.
# NOTE(review): Tcl's exec performs no glob expansion, so rm presumably
# receives the literal pattern and -f hides the "no such file" error, i.e.
# nothing may actually be removed here. If the pattern were expanded it would
# also match names such as appendonly.aof.1.base.aof (used further below), not
# only incr files -- TODO confirm the intended set of files (e.g. via [glob]).
exec rm -f $aof_dirpath/appendonly.aof.*
# NOTE(review): the previous start already waited for the "AOF loaded anyway"
# log line, so the base file has presumably been truncated to its last valid
# command by then -- confirm that this start still exercises a corrupted base.
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Corrupted base AOF (last file): should recover" {
assert_equal 1 [is_alive [srv pid]]
}
# The valid command written to the base file before the garbage must have
# been loaded.
test "param should be 'ok'" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get param] eq "ok"}
}
}
# The server should also start with a broken incr AOF: five valid commands
# (set foo 1 plus four incr) followed by garbage bytes.
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo 1]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof "corruption"
}
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Short read: Server should start if aof-load-broken is yes" {
assert_equal 1 [is_alive [srv pid]]
}
# The corrupted tail is smaller than the default aof-load-broken-max-size
# (4096 bytes), so it is dropped and the valid commands that precede it are
# loaded.
test "Broken AOF loaded: we expect foo to be equal to 5" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get foo] eq "5"}
}
# Reuses $client created by the previous test. The command written here is
# checked after the next server start (foo is expected to be 6).
test "Append a new command after loading an incomplete AOF" {
$client incr foo
}
}
# Start again on the same directory: the data loaded from the recovered AOF
# plus the incr appended by the previous server must both be present.
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Short read + command: Server should start" {
assert_equal 1 [is_alive [srv pid]]
}
test "Broken AOF loaded: we expect foo to be equal to 6 now" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get foo] eq "6"}
}
}
# Test that the AOF is rejected when the broken tail is larger than
# aof-load-broken-max-size. The file holds one valid command, then an incr
# command with its last 3 characters cut off, then garbage bytes.
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo hello]
append_to_aof [string range [formatCommand incr foo] 0 end-3]
append_to_aof "corruption"
}
# We set the maximum allowed corrupted size to 2 bytes, but the actual corrupted portion is larger,
# so the AOF file will not be loaded. The server is started with wait_ready
# false and only the log message is checked.
start_server_aof_ex [list dir $server_path aof-load-broken yes aof-load-broken-max-size 2] [list wait_ready false] {
test "Bad format: Server should have logged an error" {
wait_for_log_messages 0 {"*AOF was not loaded because the size*"} 0 10 1000
}
}
# Manifest listing one base file followed by two incr files (seq 1 and seq 2).
create_aof_manifest $aof_dirpath $aof_manifest_file {
append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n"
append_to_manifest "file appendonly.aof.2.incr.aof seq 2 type i\n"
}
# Create base AOF file
set base_aof_file "$aof_dirpath/appendonly.aof.1.base.aof"
create_aof $aof_dirpath $base_aof_file {
append_to_aof [formatCommand set fo base]
}
# Create middle incr AOF file with corruption
set mid_aof_file "$aof_dirpath/appendonly.aof.1.incr.aof"
create_aof $aof_dirpath $mid_aof_file {
append_to_aof [formatCommand set fo mid]
append_to_aof "CORRUPTION"
}
# Create last incr AOF file (valid)
set last_aof_file "$aof_dirpath/appendonly.aof.2.incr.aof"
create_aof $aof_dirpath $last_aof_file {
append_to_aof [formatCommand set fo last]
}
# Check that Redis fails to load because the corruption is in the middle file.
# Only the log is inspected (wait_ready false).
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
test "Intermediate AOF is broken: should log fatal and not start" {
wait_for_log_messages 0 {
{*Fatal error: the truncated file is not the last file*}
} 0 10 1000
}
}
# Swap the mid and last incr files by renaming them through a temporary file,
# so that the file created with the corruption becomes the last one.
# NOTE(review): the failed start above may already have truncated that file
# to its last valid command -- confirm it still contains the corruption here.
set tmp_file "$aof_dirpath/temp.aof"
file rename -force $mid_aof_file $tmp_file
file rename -force $last_aof_file $mid_aof_file
file rename -force $tmp_file $last_aof_file
# Should now start successfully since corruption is in last AOF file.
# The file holding "set fo mid" is now loaded last, hence the expected value.
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Corrupted last AOF file: Server should still start and recover" {
assert_equal 1 [is_alive [srv pid]]
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get fo] eq "mid"}
}
}
}