dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device), preventing dm-snapshot from scaling as the number of
threads doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a ("dm snapshot: use mutex
instead of rw_semaphore"). Together with the next two patches, we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly-read fields of the
snapshot structure (e.g., valid and active), and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
This commit is contained in:
parent
65fc7c3704
commit
4ad8d880b6
|
@ -48,7 +48,7 @@ struct dm_exception_table {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct dm_snapshot {
|
struct dm_snapshot {
|
||||||
struct mutex lock;
|
struct rw_semaphore lock;
|
||||||
|
|
||||||
struct dm_dev *origin;
|
struct dm_dev *origin;
|
||||||
struct dm_dev *cow;
|
struct dm_dev *cow;
|
||||||
|
@ -457,9 +457,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
|
||||||
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
|
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_read(&s->lock);
|
||||||
active = s->active;
|
active = s->active;
|
||||||
mutex_unlock(&s->lock);
|
up_read(&s->lock);
|
||||||
|
|
||||||
if (active) {
|
if (active) {
|
||||||
if (snap_src)
|
if (snap_src)
|
||||||
|
@ -927,7 +927,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
|
||||||
int r;
|
int r;
|
||||||
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
|
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Process chunks (and associated exceptions) in reverse order
|
* Process chunks (and associated exceptions) in reverse order
|
||||||
|
@ -942,7 +942,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
|
||||||
b = __release_queued_bios_after_merge(s);
|
b = __release_queued_bios_after_merge(s);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
if (b)
|
if (b)
|
||||||
flush_bios(b);
|
flush_bios(b);
|
||||||
|
|
||||||
|
@ -1001,9 +1001,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
|
||||||
if (linear_chunks < 0) {
|
if (linear_chunks < 0) {
|
||||||
DMERR("Read error in exception store: "
|
DMERR("Read error in exception store: "
|
||||||
"shutting down merge");
|
"shutting down merge");
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
s->merge_failed = 1;
|
s->merge_failed = 1;
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
}
|
}
|
||||||
goto shut;
|
goto shut;
|
||||||
}
|
}
|
||||||
|
@ -1044,10 +1044,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
|
||||||
previous_count = read_pending_exceptions_done_count();
|
previous_count = read_pending_exceptions_done_count();
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
s->first_merging_chunk = old_chunk;
|
s->first_merging_chunk = old_chunk;
|
||||||
s->num_merging_chunks = linear_chunks;
|
s->num_merging_chunks = linear_chunks;
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
|
|
||||||
/* Wait until writes to all 'linear_chunks' drain */
|
/* Wait until writes to all 'linear_chunks' drain */
|
||||||
for (i = 0; i < linear_chunks; i++)
|
for (i = 0; i < linear_chunks; i++)
|
||||||
|
@ -1089,10 +1089,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
shut:
|
shut:
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
s->merge_failed = 1;
|
s->merge_failed = 1;
|
||||||
b = __release_queued_bios_after_merge(s);
|
b = __release_queued_bios_after_merge(s);
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
error_bios(b);
|
error_bios(b);
|
||||||
|
|
||||||
merge_shutdown(s);
|
merge_shutdown(s);
|
||||||
|
@ -1191,7 +1191,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||||
s->exception_start_sequence = 0;
|
s->exception_start_sequence = 0;
|
||||||
s->exception_complete_sequence = 0;
|
s->exception_complete_sequence = 0;
|
||||||
s->out_of_order_tree = RB_ROOT;
|
s->out_of_order_tree = RB_ROOT;
|
||||||
mutex_init(&s->lock);
|
init_rwsem(&s->lock);
|
||||||
INIT_LIST_HEAD(&s->list);
|
INIT_LIST_HEAD(&s->list);
|
||||||
spin_lock_init(&s->pe_lock);
|
spin_lock_init(&s->pe_lock);
|
||||||
s->state_bits = 0;
|
s->state_bits = 0;
|
||||||
|
@ -1357,9 +1357,9 @@ static void snapshot_dtr(struct dm_target *ti)
|
||||||
/* Check whether exception handover must be cancelled */
|
/* Check whether exception handover must be cancelled */
|
||||||
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
||||||
if (snap_src && snap_dest && (s == snap_src)) {
|
if (snap_src && snap_dest && (s == snap_src)) {
|
||||||
mutex_lock(&snap_dest->lock);
|
down_write(&snap_dest->lock);
|
||||||
snap_dest->valid = 0;
|
snap_dest->valid = 0;
|
||||||
mutex_unlock(&snap_dest->lock);
|
up_write(&snap_dest->lock);
|
||||||
DMERR("Cancelling snapshot handover.");
|
DMERR("Cancelling snapshot handover.");
|
||||||
}
|
}
|
||||||
up_read(&_origins_lock);
|
up_read(&_origins_lock);
|
||||||
|
@ -1390,8 +1390,6 @@ static void snapshot_dtr(struct dm_target *ti)
|
||||||
|
|
||||||
dm_exception_store_destroy(s->store);
|
dm_exception_store_destroy(s->store);
|
||||||
|
|
||||||
mutex_destroy(&s->lock);
|
|
||||||
|
|
||||||
dm_put_device(ti, s->cow);
|
dm_put_device(ti, s->cow);
|
||||||
|
|
||||||
dm_put_device(ti, s->origin);
|
dm_put_device(ti, s->origin);
|
||||||
|
@ -1479,7 +1477,7 @@ static void pending_complete(void *context, int success)
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
/* Read/write error - snapshot is unusable */
|
/* Read/write error - snapshot is unusable */
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
__invalidate_snapshot(s, -EIO);
|
__invalidate_snapshot(s, -EIO);
|
||||||
error = 1;
|
error = 1;
|
||||||
goto out;
|
goto out;
|
||||||
|
@ -1487,14 +1485,14 @@ static void pending_complete(void *context, int success)
|
||||||
|
|
||||||
e = alloc_completed_exception(GFP_NOIO);
|
e = alloc_completed_exception(GFP_NOIO);
|
||||||
if (!e) {
|
if (!e) {
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
__invalidate_snapshot(s, -ENOMEM);
|
__invalidate_snapshot(s, -ENOMEM);
|
||||||
error = 1;
|
error = 1;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
*e = pe->e;
|
*e = pe->e;
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
if (!s->valid) {
|
if (!s->valid) {
|
||||||
free_completed_exception(e);
|
free_completed_exception(e);
|
||||||
error = 1;
|
error = 1;
|
||||||
|
@ -1512,9 +1510,9 @@ static void pending_complete(void *context, int success)
|
||||||
|
|
||||||
/* Wait for conflicting reads to drain */
|
/* Wait for conflicting reads to drain */
|
||||||
if (__chunk_is_tracked(s, pe->e.old_chunk)) {
|
if (__chunk_is_tracked(s, pe->e.old_chunk)) {
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
__check_for_conflicting_io(s, pe->e.old_chunk);
|
__check_for_conflicting_io(s, pe->e.old_chunk);
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
@ -1527,7 +1525,7 @@ static void pending_complete(void *context, int success)
|
||||||
full_bio->bi_end_io = pe->full_bio_end_io;
|
full_bio->bi_end_io = pe->full_bio_end_io;
|
||||||
increment_pending_exceptions_done_count();
|
increment_pending_exceptions_done_count();
|
||||||
|
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
|
|
||||||
/* Submit any pending write bios */
|
/* Submit any pending write bios */
|
||||||
if (error) {
|
if (error) {
|
||||||
|
@ -1750,7 +1748,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
||||||
if (!s->valid)
|
if (!s->valid)
|
||||||
return DM_MAPIO_KILL;
|
return DM_MAPIO_KILL;
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
|
|
||||||
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
|
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
|
||||||
bio_data_dir(bio) == WRITE)) {
|
bio_data_dir(bio) == WRITE)) {
|
||||||
|
@ -1773,9 +1771,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
||||||
if (bio_data_dir(bio) == WRITE) {
|
if (bio_data_dir(bio) == WRITE) {
|
||||||
pe = __lookup_pending_exception(s, chunk);
|
pe = __lookup_pending_exception(s, chunk);
|
||||||
if (!pe) {
|
if (!pe) {
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
pe = alloc_pending_exception(s);
|
pe = alloc_pending_exception(s);
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
|
|
||||||
if (!s->valid || s->snapshot_overflowed) {
|
if (!s->valid || s->snapshot_overflowed) {
|
||||||
free_pending_exception(pe);
|
free_pending_exception(pe);
|
||||||
|
@ -1810,7 +1808,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
||||||
bio->bi_iter.bi_size ==
|
bio->bi_iter.bi_size ==
|
||||||
(s->store->chunk_size << SECTOR_SHIFT)) {
|
(s->store->chunk_size << SECTOR_SHIFT)) {
|
||||||
pe->started = 1;
|
pe->started = 1;
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
start_full_bio(pe, bio);
|
start_full_bio(pe, bio);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
@ -1820,7 +1818,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
||||||
if (!pe->started) {
|
if (!pe->started) {
|
||||||
/* this is protected by snap->lock */
|
/* this is protected by snap->lock */
|
||||||
pe->started = 1;
|
pe->started = 1;
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
start_copy(pe);
|
start_copy(pe);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
@ -1830,7 +1828,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
||||||
}
|
}
|
||||||
|
|
||||||
out_unlock:
|
out_unlock:
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
out:
|
out:
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
@ -1866,7 +1864,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
|
||||||
|
|
||||||
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
|
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
|
|
||||||
/* Full merging snapshots are redirected to the origin */
|
/* Full merging snapshots are redirected to the origin */
|
||||||
if (!s->valid)
|
if (!s->valid)
|
||||||
|
@ -1897,12 +1895,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
|
||||||
bio_set_dev(bio, s->origin->bdev);
|
bio_set_dev(bio, s->origin->bdev);
|
||||||
|
|
||||||
if (bio_data_dir(bio) == WRITE) {
|
if (bio_data_dir(bio) == WRITE) {
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
return do_origin(s->origin, bio);
|
return do_origin(s->origin, bio);
|
||||||
}
|
}
|
||||||
|
|
||||||
out_unlock:
|
out_unlock:
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
@ -1934,7 +1932,7 @@ static int snapshot_preresume(struct dm_target *ti)
|
||||||
down_read(&_origins_lock);
|
down_read(&_origins_lock);
|
||||||
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
||||||
if (snap_src && snap_dest) {
|
if (snap_src && snap_dest) {
|
||||||
mutex_lock(&snap_src->lock);
|
down_read(&snap_src->lock);
|
||||||
if (s == snap_src) {
|
if (s == snap_src) {
|
||||||
DMERR("Unable to resume snapshot source until "
|
DMERR("Unable to resume snapshot source until "
|
||||||
"handover completes.");
|
"handover completes.");
|
||||||
|
@ -1944,7 +1942,7 @@ static int snapshot_preresume(struct dm_target *ti)
|
||||||
"source is suspended.");
|
"source is suspended.");
|
||||||
r = -EINVAL;
|
r = -EINVAL;
|
||||||
}
|
}
|
||||||
mutex_unlock(&snap_src->lock);
|
up_read(&snap_src->lock);
|
||||||
}
|
}
|
||||||
up_read(&_origins_lock);
|
up_read(&_origins_lock);
|
||||||
|
|
||||||
|
@ -1990,11 +1988,11 @@ static void snapshot_resume(struct dm_target *ti)
|
||||||
|
|
||||||
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
||||||
if (snap_src && snap_dest) {
|
if (snap_src && snap_dest) {
|
||||||
mutex_lock(&snap_src->lock);
|
down_write(&snap_src->lock);
|
||||||
mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
|
down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
|
||||||
__handover_exceptions(snap_src, snap_dest);
|
__handover_exceptions(snap_src, snap_dest);
|
||||||
mutex_unlock(&snap_dest->lock);
|
up_write(&snap_dest->lock);
|
||||||
mutex_unlock(&snap_src->lock);
|
up_write(&snap_src->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
up_read(&_origins_lock);
|
up_read(&_origins_lock);
|
||||||
|
@ -2009,9 +2007,9 @@ static void snapshot_resume(struct dm_target *ti)
|
||||||
/* Now we have correct chunk size, reregister */
|
/* Now we have correct chunk size, reregister */
|
||||||
reregister_snapshot(s);
|
reregister_snapshot(s);
|
||||||
|
|
||||||
mutex_lock(&s->lock);
|
down_write(&s->lock);
|
||||||
s->active = 1;
|
s->active = 1;
|
||||||
mutex_unlock(&s->lock);
|
up_write(&s->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
|
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
|
||||||
|
@ -2051,7 +2049,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case STATUSTYPE_INFO:
|
case STATUSTYPE_INFO:
|
||||||
|
|
||||||
mutex_lock(&snap->lock);
|
down_write(&snap->lock);
|
||||||
|
|
||||||
if (!snap->valid)
|
if (!snap->valid)
|
||||||
DMEMIT("Invalid");
|
DMEMIT("Invalid");
|
||||||
|
@ -2076,7 +2074,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
|
||||||
DMEMIT("Unknown");
|
DMEMIT("Unknown");
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_unlock(&snap->lock);
|
up_write(&snap->lock);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -2142,7 +2140,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
|
||||||
if (dm_target_is_snapshot_merge(snap->ti))
|
if (dm_target_is_snapshot_merge(snap->ti))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
mutex_lock(&snap->lock);
|
down_write(&snap->lock);
|
||||||
|
|
||||||
/* Only deal with valid and active snapshots */
|
/* Only deal with valid and active snapshots */
|
||||||
if (!snap->valid || !snap->active)
|
if (!snap->valid || !snap->active)
|
||||||
|
@ -2169,9 +2167,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
|
||||||
if (e)
|
if (e)
|
||||||
goto next_snapshot;
|
goto next_snapshot;
|
||||||
|
|
||||||
mutex_unlock(&snap->lock);
|
up_write(&snap->lock);
|
||||||
pe = alloc_pending_exception(snap);
|
pe = alloc_pending_exception(snap);
|
||||||
mutex_lock(&snap->lock);
|
down_write(&snap->lock);
|
||||||
|
|
||||||
if (!snap->valid) {
|
if (!snap->valid) {
|
||||||
free_pending_exception(pe);
|
free_pending_exception(pe);
|
||||||
|
@ -2221,7 +2219,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
|
||||||
}
|
}
|
||||||
|
|
||||||
next_snapshot:
|
next_snapshot:
|
||||||
mutex_unlock(&snap->lock);
|
up_write(&snap->lock);
|
||||||
|
|
||||||
if (pe_to_start_now) {
|
if (pe_to_start_now) {
|
||||||
start_copy(pe_to_start_now);
|
start_copy(pe_to_start_now);
|
||||||
|
|
Loading…
Reference in New Issue