Improve lost-increment message in repo analysis (#131200)

Today repository analysis may fail with a message like the following:

    [test-repo] register [test-register-contended-F_NNXHrSSDGveoeyj1skwg]
    should have value [10] but instead had value
    [OptionalBytesReference[00 00 00 00 00 00 00 09]]

This is confusing because one might interpret `should have value [10]`
as an indication that Elasticsearch definitely wrote this value to the
register, leaving you trying to work out how that particular write was
lost. In fact it can be more subtle than that, we only believe the
register blob should have this value because we know we completed 10
supposedly-atomic increment operations, and the failure could instead be
that these operations are not as atomic as they need to be and that one
or more of the increments was lost.

This commit makes the message more verbose, clarifying that this failure
could be an atomicity problem rather than a simple lost write:

    [test-repo] Successfully completed all [10] atomic increments of
    register [test-register-contended-F_NNXHrSSDGveoeyj1skwg] so its
    expected value is [OptionalBytesReference[00 00 00 00 00 00 00 0a]],
    but reading its value with [getRegister] unexpectedly yielded
    [OptionalBytesReference[00 00 00 00 00 00 00 09]]. This anomaly may
    indicate an atomicity failure amongst concurrent
    compare-and-exchange operations on registers in this repository.
This commit is contained in:
David Turner 2025-07-15 08:32:49 +01:00 committed by GitHub
parent 70176c8356
commit 6f5579656c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 156 additions and 57 deletions

View File

@ -0,0 +1,5 @@
pr: 131200
summary: Improve lost-increment message in repo analysis
area: Snapshot/Restore
type: enhancement
issues: []

View File

@ -76,6 +76,7 @@ import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.matchesPattern;
import static org.hamcrest.Matchers.nullValue;
public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
@ -385,6 +386,55 @@ public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
assertAnalysisFailureMessage(analyseRepositoryExpectFailure(request).getMessage());
}
public void testFailsOnLostIncrement() {
final RepositoryAnalyzeAction.Request request = new RepositoryAnalyzeAction.Request("test-repo");
final AtomicBoolean registerWasCorrupted = new AtomicBoolean();
blobStore.setDisruption(new Disruption() {
@Override
public BytesReference onContendedCompareAndExchange(BytesRegister register, BytesReference expected, BytesReference updated) {
if (expected.equals(updated) == false // not the initial read
&& updated.length() == Long.BYTES // not the final write
&& randomBoolean()
&& register.get().equals(expected) // would have succeeded
&& registerWasCorrupted.compareAndSet(false, true)) {
// indicate success without actually applying the update
return expected;
}
return register.compareAndExchange(expected, updated);
}
});
safeAwait((ActionListener<RepositoryAnalyzeAction.Response> l) -> analyseRepository(request, l.delegateResponse((ll, e) -> {
if (ExceptionsHelper.unwrapCause(e) instanceof RepositoryVerificationException repositoryVerificationException) {
assertAnalysisFailureMessage(repositoryVerificationException.getMessage());
assertTrue(
"did not lose increment, so why did the verification fail?",
// clear flag for final assertion
registerWasCorrupted.compareAndSet(true, false)
);
assertThat(
asInstanceOf(
RepositoryVerificationException.class,
ExceptionsHelper.unwrapCause(repositoryVerificationException.getCause())
).getMessage(),
matchesPattern("""
\\[test-repo] Successfully completed all \\[.*] atomic increments of register \\[test-register-contended-.*] \
so its expected value is \\[OptionalBytesReference\\[.*]], but reading its value with \\[.*] unexpectedly \
yielded \\[OptionalBytesReference\\[.*]]\\. This anomaly may indicate an atomicity failure amongst concurrent \
compare-and-exchange operations on registers in this repository\\.""")
);
ll.onResponse(null);
} else {
ll.onFailure(e);
}
})));
assertFalse(registerWasCorrupted.get());
}
public void testFailsIfRegisterHoldsSpuriousValue() {
final RepositoryAnalyzeAction.Request request = new RepositoryAnalyzeAction.Request("test-repo");

View File

@ -42,6 +42,7 @@ import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.ThrottledIterator;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
@ -647,12 +648,73 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
}
private Runnable finalRegisterValueVerifier(String registerName, Random random, Releasable ref) {
return () -> {
return new Runnable() {
final CheckedConsumer<ActionListener<OptionalBytesReference>, Exception> finalValueReader = switch (random.nextInt(3)) {
case 0 -> new CheckedConsumer<ActionListener<OptionalBytesReference>, Exception>() {
@Override
public void accept(ActionListener<OptionalBytesReference> listener) {
getBlobContainer().getRegister(OperationPurpose.REPOSITORY_ANALYSIS, registerName, listener);
}
@Override
public String toString() {
return "getRegister";
}
};
case 1 -> new CheckedConsumer<ActionListener<OptionalBytesReference>, Exception>() {
@Override
public void accept(ActionListener<OptionalBytesReference> listener) {
getBlobContainer().compareAndExchangeRegister(
OperationPurpose.REPOSITORY_ANALYSIS,
registerName,
bytesFromLong(expectedFinalRegisterValue),
new BytesArray(new byte[] { (byte) 0xff }),
listener
);
}
@Override
public String toString() {
return "compareAndExchangeRegister";
}
};
case 2 -> new CheckedConsumer<ActionListener<OptionalBytesReference>, Exception>() {
@Override
public void accept(ActionListener<OptionalBytesReference> listener) {
getBlobContainer().compareAndSetRegister(
OperationPurpose.REPOSITORY_ANALYSIS,
registerName,
bytesFromLong(expectedFinalRegisterValue),
new BytesArray(new byte[] { (byte) 0xff }),
listener.map(
b -> b
? OptionalBytesReference.of(bytesFromLong(expectedFinalRegisterValue))
: OptionalBytesReference.MISSING
)
);
}
@Override
public String toString() {
return "compareAndSetRegister";
}
};
default -> {
assert false;
throw new IllegalStateException();
}
};
long expectedFinalRegisterValue = Long.MIN_VALUE;
@Override
public void run() {
if (isRunning()) {
final var expectedFinalRegisterValue = expectedRegisterValue.get();
expectedFinalRegisterValue = expectedRegisterValue.get();
transportService.getThreadPool()
.executor(ThreadPool.Names.SNAPSHOT)
.execute(ActionRunnable.wrap(ActionListener.releaseAfter(new ActionListener<OptionalBytesReference>() {
.execute(ActionRunnable.wrap(ActionListener.releaseAfter(new ActionListener<>() {
@Override
public void onResponse(OptionalBytesReference actualFinalRegisterValue) {
if (actualFinalRegisterValue.isPresent() == false
@ -661,9 +723,15 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
new RepositoryVerificationException(
request.getRepositoryName(),
Strings.format(
"register [%s] should have value [%d] but instead had value [%s]",
registerName,
"""
Successfully completed all [%d] atomic increments of register [%s] so its expected \
value is [%s], but reading its value with [%s] unexpectedly yielded [%s]. This \
anomaly may indicate an atomicity failure amongst concurrent compare-and-exchange \
operations on registers in this repository.""",
expectedFinalRegisterValue,
registerName,
OptionalBytesReference.of(bytesFromLong(expectedFinalRegisterValue)),
finalValueReader.toString(),
actualFinalRegisterValue
)
)
@ -678,36 +746,12 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
fail(exp);
}
}
}, ref), listener -> {
switch (random.nextInt(3)) {
case 0 -> getBlobContainer().getRegister(OperationPurpose.REPOSITORY_ANALYSIS, registerName, listener);
case 1 -> getBlobContainer().compareAndExchangeRegister(
OperationPurpose.REPOSITORY_ANALYSIS,
registerName,
bytesFromLong(expectedFinalRegisterValue),
new BytesArray(new byte[] { (byte) 0xff }),
listener
);
case 2 -> getBlobContainer().compareAndSetRegister(
OperationPurpose.REPOSITORY_ANALYSIS,
registerName,
bytesFromLong(expectedFinalRegisterValue),
new BytesArray(new byte[] { (byte) 0xff }),
listener.map(
b -> b
? OptionalBytesReference.of(bytesFromLong(expectedFinalRegisterValue))
: OptionalBytesReference.MISSING
)
);
default -> {
assert false;
throw new IllegalStateException();
}
}
}));
}, ref), finalValueReader));
} else {
ref.close();
}
}
};
}