Clean up update state when snapshots are interrupted or cancelled.
This patch addresses two edge cases with interrupted updates:

(1) If the device reverts to the old slot *before* merging, the snapshots must be removed.
(2) If, during a merge, a snapshot is detected to be invalid (due to flashing), that snapshot must be removed.

To encapsulate this logic, WaitForMerge has been renamed to ProcessUpdateState. It is still intended to be called after /data is mounted, or after a merge is initiated.

Bug: 139154795
Test: libsnapshot_test gtest
Change-Id: I37a25ca722f30ae9548894dcfbd70cb64330e416
This commit is contained in:
parent
c058df7b12
commit
a6e288a178
|
@ -66,7 +66,11 @@ enum class UpdateState : unsigned int {
|
|||
MergeCompleted,
|
||||
|
||||
// Merging failed due to an unrecoverable error.
|
||||
MergeFailed
|
||||
MergeFailed,
|
||||
|
||||
// The update was implicitly cancelled, either by a rollback or a flash
|
||||
// operation via fastboot. This state can only be returned by ProcessUpdateState.
|
||||
Cancelled
|
||||
};
|
||||
|
||||
class SnapshotManager final {
|
||||
|
@ -82,6 +86,7 @@ class SnapshotManager final {
|
|||
virtual std::string GetGsidDir() const = 0;
|
||||
virtual std::string GetMetadataDir() const = 0;
|
||||
virtual std::string GetSlotSuffix() const = 0;
|
||||
virtual std::string GetSuperDevice(uint32_t slot) const = 0;
|
||||
virtual const IPartitionOpener& GetPartitionOpener() const = 0;
|
||||
};
|
||||
|
||||
|
@ -117,12 +122,15 @@ class SnapshotManager final {
|
|||
// update has been marked successful after booting.
|
||||
bool InitiateMerge();
|
||||
|
||||
// Wait for the current merge to finish, then perform cleanup when it
|
||||
// completes. It is necessary to call this after InitiateMerge(), or when
|
||||
// a merge state is detected during boot.
|
||||
// Perform any necessary post-boot actions. This should be run soon after
|
||||
// /data is mounted.
|
||||
//
|
||||
// Note that after calling WaitForMerge(), GetUpdateState() may still return
|
||||
// that a merge is in progress:
|
||||
// If a merge is in progress, this function will block until the merge is
|
||||
// completed. If a merge or update was cancelled, this will clean up any
|
||||
// update artifacts and return.
|
||||
//
|
||||
// Note that after calling this, GetUpdateState() may still return that a
|
||||
// merge is in progress:
|
||||
// MergeFailed indicates that a fatal error occurred. ProcessUpdateState() may be
|
||||
// called any number of times again to attempt to make more progress, but
|
||||
// we do not expect it to succeed if a catastrophic error occurred.
|
||||
|
@ -135,7 +143,7 @@ class SnapshotManager final {
|
|||
//
|
||||
// MergeCompleted indicates that the update has fully completed.
|
||||
// GetUpdateState will return None, and a new update can begin.
|
||||
UpdateState WaitForMerge();
|
||||
UpdateState ProcessUpdateState();
|
||||
|
||||
// Find the status of the current update, if any.
|
||||
//
|
||||
|
@ -158,6 +166,7 @@ class SnapshotManager final {
|
|||
FRIEND_TEST(SnapshotTest, CreateSnapshot);
|
||||
FRIEND_TEST(SnapshotTest, FirstStageMountAfterRollback);
|
||||
FRIEND_TEST(SnapshotTest, FirstStageMountAndMerge);
|
||||
FRIEND_TEST(SnapshotTest, FlashSuperDuringMerge);
|
||||
FRIEND_TEST(SnapshotTest, FlashSuperDuringUpdate);
|
||||
FRIEND_TEST(SnapshotTest, MapPartialSnapshot);
|
||||
FRIEND_TEST(SnapshotTest, MapSnapshot);
|
||||
|
@ -245,6 +254,14 @@ class SnapshotManager final {
|
|||
// List the known snapshot names.
|
||||
bool ListSnapshots(LockedFile* lock, std::vector<std::string>* snapshots);
|
||||
|
||||
// Check for a cancelled or rolled back merge, returning true if such a
|
||||
// condition was detected and handled.
|
||||
bool HandleCancelledUpdate(LockedFile* lock);
|
||||
|
||||
// Remove artifacts created by the update process, such as snapshots, and
|
||||
// set the update state to None.
|
||||
bool RemoveAllUpdateState(LockedFile* lock);
|
||||
|
||||
// Interact with /metadata/ota/state.
|
||||
std::unique_ptr<LockedFile> OpenStateFile(int open_flags, int lock_flags);
|
||||
std::unique_ptr<LockedFile> LockShared();
|
||||
|
@ -272,6 +289,7 @@ class SnapshotManager final {
|
|||
bool MarkSnapshotMergeCompleted(LockedFile* snapshot_lock, const std::string& snapshot_name);
|
||||
void AcknowledgeMergeSuccess(LockedFile* lock);
|
||||
void AcknowledgeMergeFailure();
|
||||
bool IsCancelledSnapshot(const std::string& snapshot_name);
|
||||
|
||||
// Note that these require the name of the device containing the snapshot,
|
||||
// which may be the "inner" device. Use GetSnapshotDeviceName().
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include <android-base/strings.h>
|
||||
#include <android-base/unique_fd.h>
|
||||
#include <ext4_utils/ext4_utils.h>
|
||||
#include <fs_mgr.h>
|
||||
#include <fs_mgr_dm_linear.h>
|
||||
#include <fstab/fstab.h>
|
||||
#include <libdm/dm.h>
|
||||
|
@ -64,6 +65,9 @@ class DeviceInfo final : public SnapshotManager::IDeviceInfo {
|
|||
std::string GetMetadataDir() const override { return "/metadata/ota"s; }
|
||||
std::string GetSlotSuffix() const override { return fs_mgr_get_slot_suffix(); }
|
||||
// Returns the partition opener held in opener_. Marked `override` so the
// compiler verifies it matches IDeviceInfo::GetPartitionOpener(), consistent
// with the other IDeviceInfo overrides in this class.
const android::fs_mgr::IPartitionOpener& GetPartitionOpener() const override { return opener_; }
|
||||
std::string GetSuperDevice(uint32_t slot) const override {
|
||||
return fs_mgr_get_super_partition_name(slot);
|
||||
}
|
||||
|
||||
private:
|
||||
android::fs_mgr::PartitionOpener opener_;
|
||||
|
@ -123,17 +127,20 @@ bool SnapshotManager::CancelUpdate() {
|
|||
LOG(ERROR) << "Cannot cancel update after it has completed or started merging";
|
||||
return false;
|
||||
}
|
||||
return RemoveAllUpdateState(file.get());
|
||||
}
|
||||
|
||||
if (!RemoveAllSnapshots(file.get())) {
|
||||
bool SnapshotManager::RemoveAllUpdateState(LockedFile* lock) {
|
||||
if (!RemoveAllSnapshots(lock)) {
|
||||
LOG(ERROR) << "Could not remove all snapshots";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!WriteUpdateState(file.get(), UpdateState::None)) {
|
||||
LOG(ERROR) << "Could not write new update state";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
RemoveSnapshotBootIndicator();
|
||||
|
||||
// If this fails, we'll keep trying to remove the update state (as the
|
||||
// device reboots or starts a new update) until it finally succeeds.
|
||||
return WriteUpdateState(lock, UpdateState::None);
|
||||
}
|
||||
|
||||
bool SnapshotManager::FinishedSnapshotWrites() {
|
||||
|
@ -362,14 +369,13 @@ bool SnapshotManager::DeleteSnapshot(LockedFile* lock, const std::string& name)
|
|||
if (!EnsureImageManager()) return false;
|
||||
|
||||
auto cow_name = GetCowName(name);
|
||||
if (!images_->BackingImageExists(cow_name)) {
|
||||
return true;
|
||||
}
|
||||
if (images_->IsImageMapped(cow_name) && !images_->UnmapImageDevice(cow_name)) {
|
||||
return false;
|
||||
}
|
||||
if (!images_->DeleteBackingImage(cow_name)) {
|
||||
return false;
|
||||
if (images_->BackingImageExists(cow_name)) {
|
||||
if (images_->IsImageMapped(cow_name) && !images_->UnmapImageDevice(cow_name)) {
|
||||
return false;
|
||||
}
|
||||
if (!images_->DeleteBackingImage(cow_name)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::string error;
|
||||
|
@ -575,9 +581,12 @@ bool SnapshotManager::QuerySnapshotStatus(const std::string& dm_name, std::strin
|
|||
// Note that when a merge fails, we will *always* try again to complete the
|
||||
// merge each time the device boots. There is no harm in doing so, and if
|
||||
// the problem was transient, we might manage to get a new outcome.
|
||||
UpdateState SnapshotManager::WaitForMerge() {
|
||||
UpdateState SnapshotManager::ProcessUpdateState() {
|
||||
while (true) {
|
||||
UpdateState state = CheckMergeState();
|
||||
if (state == UpdateState::MergeFailed) {
|
||||
AcknowledgeMergeFailure();
|
||||
}
|
||||
if (state != UpdateState::Merging) {
|
||||
// Either there is no merge, or the merge was finished, so no need
|
||||
// to keep waiting.
|
||||
|
@ -593,15 +602,16 @@ UpdateState SnapshotManager::WaitForMerge() {
|
|||
UpdateState SnapshotManager::CheckMergeState() {
|
||||
auto lock = LockExclusive();
|
||||
if (!lock) {
|
||||
AcknowledgeMergeFailure();
|
||||
return UpdateState::MergeFailed;
|
||||
}
|
||||
|
||||
auto state = CheckMergeState(lock.get());
|
||||
UpdateState state = CheckMergeState(lock.get());
|
||||
if (state == UpdateState::MergeCompleted) {
|
||||
// Do this inside the same lock. Failures get acknowledged without the
|
||||
// lock, because flock() might have failed.
|
||||
AcknowledgeMergeSuccess(lock.get());
|
||||
} else if (state == UpdateState::MergeFailed) {
|
||||
AcknowledgeMergeFailure();
|
||||
} else if (state == UpdateState::Cancelled) {
|
||||
RemoveAllUpdateState(lock.get());
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
@ -623,10 +633,17 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock) {
|
|||
// run.
|
||||
break;
|
||||
|
||||
case UpdateState::Unverified:
|
||||
// This is an edge case. Normally cancelled updates are detected
|
||||
// via the merge poll below, but if we never started a merge, we
|
||||
// need to also check here.
|
||||
if (HandleCancelledUpdate(lock)) {
|
||||
return UpdateState::Cancelled;
|
||||
}
|
||||
return state;
|
||||
|
||||
default:
|
||||
LOG(ERROR) << "No merge exists, cannot wait. Update state: "
|
||||
<< static_cast<uint32_t>(state);
|
||||
return UpdateState::None;
|
||||
return state;
|
||||
}
|
||||
|
||||
std::vector<std::string> snapshots;
|
||||
|
@ -634,6 +651,7 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock) {
|
|||
return UpdateState::MergeFailed;
|
||||
}
|
||||
|
||||
bool cancelled = false;
|
||||
bool failed = false;
|
||||
bool merging = false;
|
||||
bool needs_reboot = false;
|
||||
|
@ -651,6 +669,9 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock) {
|
|||
break;
|
||||
case UpdateState::MergeCompleted:
|
||||
break;
|
||||
case UpdateState::Cancelled:
|
||||
cancelled = true;
|
||||
break;
|
||||
default:
|
||||
LOG(ERROR) << "Unknown merge status: " << static_cast<uint32_t>(snapshot_state);
|
||||
failed = true;
|
||||
|
@ -673,6 +694,14 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock) {
|
|||
WriteUpdateState(lock, UpdateState::MergeNeedsReboot);
|
||||
return UpdateState::MergeNeedsReboot;
|
||||
}
|
||||
if (cancelled) {
|
||||
// This is an edge case, that we handle as correctly as we sensibly can.
|
||||
// The underlying partition has changed behind update_engine, and we've
|
||||
// removed the snapshot as a result. The exact state of the update is
|
||||
// undefined now, but this can only happen on an unlocked device where
|
||||
// partitions can be flashed without wiping userdata.
|
||||
return UpdateState::Cancelled;
|
||||
}
|
||||
return UpdateState::MergeCompleted;
|
||||
}
|
||||
|
||||
|
@ -684,17 +713,30 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::
|
|||
|
||||
std::string dm_name = GetSnapshotDeviceName(name, snapshot_status);
|
||||
|
||||
// During a check, we decided the merge was complete, but we were unable to
|
||||
// collapse the device-mapper stack and perform COW cleanup. If we haven't
|
||||
// rebooted after this check, the device will still be a snapshot-merge
|
||||
// target. If we have rebooted, the device will now be a linear target,
|
||||
// and we can try cleanup again.
|
||||
if (snapshot_status.state == SnapshotState::MergeCompleted && !IsSnapshotDevice(dm_name)) {
|
||||
// NB: It's okay if this fails now, we gave cleanup our best effort.
|
||||
OnSnapshotMergeComplete(lock, name, snapshot_status);
|
||||
return UpdateState::MergeCompleted;
|
||||
if (!IsSnapshotDevice(dm_name)) {
|
||||
if (IsCancelledSnapshot(name)) {
|
||||
DeleteSnapshot(lock, name);
|
||||
return UpdateState::Cancelled;
|
||||
}
|
||||
|
||||
// During a check, we decided the merge was complete, but we were unable to
|
||||
// collapse the device-mapper stack and perform COW cleanup. If we haven't
|
||||
// rebooted after this check, the device will still be a snapshot-merge
|
||||
// target. If we have rebooted, the device will now be a linear target,
|
||||
// and we can try cleanup again.
|
||||
if (snapshot_status.state == SnapshotState::MergeCompleted) {
|
||||
// NB: It's okay if this fails now, we gave cleanup our best effort.
|
||||
OnSnapshotMergeComplete(lock, name, snapshot_status);
|
||||
return UpdateState::MergeCompleted;
|
||||
}
|
||||
|
||||
LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << dm_name;
|
||||
return UpdateState::MergeFailed;
|
||||
}
|
||||
|
||||
// This check is expensive so it is only enabled for debugging.
|
||||
DCHECK(!IsCancelledSnapshot(name));
|
||||
|
||||
std::string target_type;
|
||||
DmTargetSnapshot::Status status;
|
||||
if (!QuerySnapshotStatus(dm_name, &target_type, &status)) {
|
||||
|
@ -750,12 +792,7 @@ void SnapshotManager::RemoveSnapshotBootIndicator() {
|
|||
}
|
||||
|
||||
void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) {
|
||||
RemoveSnapshotBootIndicator();
|
||||
|
||||
if (!WriteUpdateState(lock, UpdateState::None)) {
|
||||
// We'll try again next reboot, ad infinitum.
|
||||
return;
|
||||
}
|
||||
RemoveAllUpdateState(lock);
|
||||
}
|
||||
|
||||
void SnapshotManager::AcknowledgeMergeFailure() {
|
||||
|
@ -906,6 +943,44 @@ bool SnapshotManager::CollapseSnapshotDevice(const std::string& name,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool SnapshotManager::HandleCancelledUpdate(LockedFile* lock) {
|
||||
std::string old_slot;
|
||||
auto boot_file = GetSnapshotBootIndicatorPath();
|
||||
if (!android::base::ReadFileToString(boot_file, &old_slot)) {
|
||||
PLOG(ERROR) << "Unable to read the snapshot indicator file: " << boot_file;
|
||||
return false;
|
||||
}
|
||||
if (device_->GetSlotSuffix() != old_slot) {
|
||||
// We're booted into the target slot, which means we just rebooted
|
||||
// after applying the update.
|
||||
return false;
|
||||
}
|
||||
|
||||
// The only way we can get here is if:
|
||||
// (1) The device rolled back to the previous slot.
|
||||
// (2) This function was called prematurely before rebooting the device.
|
||||
// (3) fastboot set_active was used.
|
||||
//
|
||||
// In any case, delete the snapshots. It may be worth using the boot_control
|
||||
// HAL to differentiate case (2).
|
||||
RemoveAllUpdateState(lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SnapshotManager::IsCancelledSnapshot(const std::string& snapshot_name) {
|
||||
const auto& opener = device_->GetPartitionOpener();
|
||||
uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
|
||||
auto super_device = device_->GetSuperDevice(slot);
|
||||
auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot);
|
||||
if (!metadata) {
|
||||
LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device;
|
||||
return false;
|
||||
}
|
||||
auto partition = android::fs_mgr::FindPartition(*metadata.get(), snapshot_name);
|
||||
if (!partition) return false;
|
||||
return (partition->attributes & LP_PARTITION_ATTR_UPDATED) == 0;
|
||||
}
|
||||
|
||||
bool SnapshotManager::RemoveAllSnapshots(LockedFile* lock) {
|
||||
std::vector<std::string> snapshots;
|
||||
if (!ListSnapshots(lock, &snapshots)) {
|
||||
|
|
|
@ -347,7 +347,7 @@ TEST_F(SnapshotTest, Merge) {
|
|||
// We should not be able to cancel an update now.
|
||||
ASSERT_FALSE(sm->CancelUpdate());
|
||||
|
||||
ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeCompleted);
|
||||
ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeCompleted);
|
||||
ASSERT_EQ(sm->GetUpdateState(), UpdateState::None);
|
||||
|
||||
// The device should no longer be a snapshot or snapshot-merge.
|
||||
|
@ -388,7 +388,7 @@ TEST_F(SnapshotTest, MergeCannotRemoveCow) {
|
|||
ASSERT_TRUE(sm->InitiateMerge());
|
||||
|
||||
// COW cannot be removed due to open fd, so expect a soft failure.
|
||||
ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeNeedsReboot);
|
||||
ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeNeedsReboot);
|
||||
|
||||
// Forcefully delete the snapshot device, so it looks like we just rebooted.
|
||||
DeleteSnapshotDevice("test-snapshot");
|
||||
|
@ -401,7 +401,7 @@ TEST_F(SnapshotTest, MergeCannotRemoveCow) {
|
|||
fd = {};
|
||||
lock_ = nullptr;
|
||||
|
||||
ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeCompleted);
|
||||
ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeCompleted);
|
||||
}
|
||||
|
||||
TEST_F(SnapshotTest, FirstStageMountAndMerge) {
|
||||
|
@ -482,6 +482,52 @@ TEST_F(SnapshotTest, FlashSuperDuringUpdate) {
|
|||
DeviceMapper::TargetInfo target;
|
||||
auto dm_name = init->GetSnapshotDeviceName("test_partition_b", status);
|
||||
ASSERT_FALSE(init->IsSnapshotDevice(dm_name, &target));
|
||||
|
||||
// We should see a cancelled update as well.
|
||||
lock_ = nullptr;
|
||||
ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::Cancelled);
|
||||
}
|
||||
|
||||
TEST_F(SnapshotTest, FlashSuperDuringMerge) {
|
||||
ON_CALL(*GetMockedPropertyFetcher(), GetBoolProperty("ro.virtual_ab.enabled", _))
|
||||
.WillByDefault(Return(true));
|
||||
|
||||
ASSERT_TRUE(AcquireLock());
|
||||
|
||||
static const uint64_t kDeviceSize = 1024 * 1024;
|
||||
|
||||
ASSERT_TRUE(CreatePartition("test_partition_a", kDeviceSize));
|
||||
ASSERT_TRUE(MapUpdatePartitions());
|
||||
ASSERT_TRUE(sm->CreateSnapshot(lock_.get(), "test_partition_b", kDeviceSize, kDeviceSize,
|
||||
kDeviceSize));
|
||||
|
||||
// Simulate a reboot into the new slot.
|
||||
lock_ = nullptr;
|
||||
ASSERT_TRUE(sm->FinishedSnapshotWrites());
|
||||
ASSERT_TRUE(DestroyLogicalPartition("test_partition_b"));
|
||||
|
||||
auto rebooted = new TestDeviceInfo(fake_super);
|
||||
rebooted->set_slot_suffix("_b");
|
||||
|
||||
auto init = SnapshotManager::NewForFirstStageMount(rebooted);
|
||||
ASSERT_NE(init, nullptr);
|
||||
ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
|
||||
ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super"));
|
||||
ASSERT_TRUE(init->InitiateMerge());
|
||||
|
||||
// Now, reflash super. Note that we haven't called ProcessUpdateState, so the
|
||||
// status is still Merging.
|
||||
DeleteSnapshotDevice("test_partition_b");
|
||||
ASSERT_TRUE(init->image_manager()->UnmapImageDevice("test_partition_b-cow"));
|
||||
FormatFakeSuper();
|
||||
ASSERT_TRUE(CreatePartition("test_partition_b", kDeviceSize));
|
||||
ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
|
||||
ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super"));
|
||||
|
||||
// Because the status is Merging, we must call ProcessUpdateState, which should
|
||||
// detect a cancelled update.
|
||||
ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::Cancelled);
|
||||
ASSERT_EQ(sm->GetUpdateState(), UpdateState::None);
|
||||
}
|
||||
|
||||
} // namespace snapshot
|
||||
|
|
|
@ -47,6 +47,7 @@ class TestDeviceInfo : public SnapshotManager::IDeviceInfo {
|
|||
std::string GetGsidDir() const override { return "ota/test"s; }
|
||||
std::string GetMetadataDir() const override { return "/metadata/ota/test"s; }
|
||||
std::string GetSlotSuffix() const override { return slot_suffix_; }
|
||||
std::string GetSuperDevice([[maybe_unused]] uint32_t slot) const override { return "super"; }
|
||||
const android::fs_mgr::IPartitionOpener& GetPartitionOpener() const override {
|
||||
return *opener_.get();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue