From 503785306d182ab624a2232856ef8ab503ee85f9 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Fri, 18 Dec 2015 21:33:05 +0800 Subject: [PATCH 01/17] btrfs: reada: Fix in-segment calculation for reada reada_zone->end is end pos of segment: end = start + cache->key.offset - 1; So we need to use "<=" in condition to judge is a pos in the segment. The problem happened rearly, because logical pos rarely pointed to last 4k of a blockgroup, but we need to fix it to make code right in logic. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 619f92963e27..49b3fb73ffbf 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -265,7 +265,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); if (ret == 1) { - if (logical >= zone->start && logical < zone->end) + if (logical >= zone->start && logical <= zone->end) return zone; spin_lock(&fs_info->reada_lock); kref_put(&zone->refcnt, reada_zone_release); @@ -679,7 +679,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, dev->reada_next >> PAGE_CACHE_SHIFT, 1); - if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { spin_unlock(&fs_info->reada_lock); From c37f49c7ef317fb8043fd28594d1e5d728a1ef89 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Fri, 18 Dec 2015 21:48:48 +0800 Subject: [PATCH 02/17] btrfs: reada: reduce additional fs_info->reada_lock in reada_find_zone We can avoid additional locking-acquirment and one pair of kref_get/put by combine two condition. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 49b3fb73ffbf..74480809be76 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -260,18 +260,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (ret == 1) { - if (logical >= zone->start && logical <= zone->end) - return zone; - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); + return zone; } + spin_unlock(&fs_info->reada_lock); + cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; From 8e9aa51f5405b2a01a44818120116b65a2ba4d3a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Fri, 18 Dec 2015 21:56:08 +0800 Subject: [PATCH 03/17] btrfs: reada: Add missed segment checking in reada_find_zone In rechecking zone-in-tree, we still need to check zone include our logical address. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 74480809be76..7e5d4ac800d9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -303,8 +303,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); + else + zone = NULL; } spin_unlock(&fs_info->reada_lock); From 97d5f0e63d78a48cef35349261f90d0d4266dc24 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 16:57:21 +0800 Subject: [PATCH 04/17] btrfs: reada: Avoid many times of empty loop We can see following loop(10000 times) in trace_log: [ 75.416137] ZL_DEBUG: reada_start_machine_dev:730: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 75.417413] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 [ 75.418611] ZL_DEBUG: __readahead_hook:129: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 75.419793] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 [ 75.421016] ZL_DEBUG: reada_start_machine_dev:730: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 75.422324] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 [ 75.423661] ZL_DEBUG: __readahead_hook:129: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 75.424882] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 ...(10000 times) [ 124.101672] ZL_DEBUG: reada_start_machine_dev:730: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 124.102850] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 [ 124.104008] ZL_DEBUG: __readahead_hook:129: pid=771 comm=kworker/u2:3 re->ref_cnt ffff88003741e0c0 1 -> 2 [ 124.105121] ZL_DEBUG: reada_extent_put:524: pid=771 comm=kworker/u2:3 re = ffff88003741e0c0, refcnt = 2 -> 1 Reason: If more than one user trigger reada in same extent, the first task finished setting of reada data struct and call reada_start_machine() to start, and the second task only add a ref_count but have not add reada_extctl struct completely, the reada_extent can not finished all jobs, and will be selected in __reada_start_machine() for 10000 times(total times in __reada_start_machine()). Fix: For a reada_extent without job, we don't need to run it, just return 0 to let caller break. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 7e5d4ac800d9..f0cf5f3b865a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -708,7 +708,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, logical = re->logical; spin_lock(&re->lock); - if (re->scheduled_for == NULL) { + if (!re->scheduled_for && !list_empty(&re->extctl)) { re->scheduled_for = dev; need_kick = 1; } From a3f7fde24350a17a589c470265ac2d5a0e6f119f Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 22:57:52 +0800 Subject: [PATCH 05/17] btrfs: reada: Move is_need_to_readahead contition earlier Move is_need_to_readahead contition earlier to avoid useless loop to get relative data for readahead. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index f0cf5f3b865a..b2f768a3174a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -660,7 +660,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, u64 logical; int ret; int i; - int need_kick = 0; spin_lock(&fs_info->reada_lock); if (dev->reada_curr_zone == NULL) { @@ -696,6 +695,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); + spin_lock(&re->lock); + if (re->scheduled_for || list_empty(&re->extctl)) { + spin_unlock(&re->lock); + reada_extent_put(fs_info, re); + return 0; + } + re->scheduled_for = dev; + spin_unlock(&re->lock); + /* * find mirror num */ @@ -707,18 +715,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } logical = re->logical; - spin_lock(&re->lock); - if (!re->scheduled_for && !list_empty(&re->extctl)) { - re->scheduled_for = dev; - need_kick = 1; - } - spin_unlock(&re->lock); - reada_extent_put(fs_info, re); - if (!need_kick) - return 0; - atomic_inc(&dev->reada_in_flight); ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); From 6a159d2ae488a835a8ca5f1f658db72b7e13d064 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 18:15:47 +0800 Subject: [PATCH 06/17] btrfs: reada: add all reachable mirrors into reada device list If some device is not reachable, we should bypass and continus addingb next, instead of break on bad device. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b2f768a3174a..27a2fe914c67 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -328,7 +328,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 length; int real_stripes; int nzones = 0; - int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; int dev_replace_is_ongoing; @@ -375,9 +374,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, dev = bbio->stripes[nzones].dev; zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) - break; + continue; - re->zones[nzones] = zone; + re->zones[re->nzones++] = zone; spin_lock(&zone->lock); if (!zone->elems) kref_get(&zone->refcnt); @@ -387,8 +386,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - re->nzones = nzones; - if (nzones == 0) { + if (re->nzones == 0) { /* not a single zone found, error and out */ goto error; } @@ -413,8 +411,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); - for (i = 0; i < nzones; ++i) { - dev = bbio->stripes[i].dev; + for (nzones = 0; nzones < re->nzones; ++nzones) { + dev = re->zones[nzones]->device; + if (dev == prev_dev) { /* * in case of DUP, just add the first zone. As both @@ -445,8 +444,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { - while (--i >= 0) { - dev = bbio->stripes[i].dev; + while (--nzones >= 0) { + dev = re->zones[nzones]->device; BUG_ON(dev == NULL); /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); @@ -465,10 +464,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, return re; error: - while (nzones) { + for (nzones = 0; nzones < re->nzones; ++nzones) { struct reada_zone *zone; - --nzones; zone = re->zones[nzones]; kref_get(&zone->refcnt); spin_lock(&zone->lock); From 319450211842ba92d0604af6e4ddf15f445efbcf Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 18:48:54 +0800 Subject: [PATCH 07/17] btrfs: reada: bypass adding extent when all zone failed When failed adding all dev_zones for a reada_extent, the extent will have no chance to be selected to run, and keep in memory for ever. We should bypass this extent to avoid above case. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 27a2fe914c67..1d00de7369a8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -330,6 +330,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int nzones = 0; unsigned long index = logical >> PAGE_CACHE_SHIFT; int dev_replace_is_ongoing; + int have_zone = 0; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -456,10 +457,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } + have_zone = 1; } spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace); + if (!have_zone) + goto error; + btrfs_put_bbio(bbio); return re; From 1e7970c0f31d6916e0ab523e5f05c9741f71955f Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 20:30:00 +0800 Subject: [PATCH 08/17] btrfs: reada: Remove level argument in severial functions level is not used in severial functions, remove them from arguments, and remove relative code for get its value. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 1d00de7369a8..b40dbf717737 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -101,7 +101,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info); static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation); + struct btrfs_key *top, u64 generation); /* recurses */ /* in case of err, eb might be NULL */ @@ -197,8 +197,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, if (rec->generation == generation && btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - level - 1, n_gen); + reada_add_block(rc, bytenr, &next_key, n_gen); } } /* @@ -315,7 +314,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 logical, - struct btrfs_key *top, int level) + struct btrfs_key *top) { int ret; struct reada_extent *re = NULL; @@ -557,13 +556,13 @@ static void reada_control_release(struct kref *kref) } static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation) + struct btrfs_key *top, u64 generation) { struct btrfs_root *root = rc->root; struct reada_extent *re; struct reada_extctl *rec; - re = reada_find_extent(root, logical, top, level); /* takes one ref */ + re = reada_find_extent(root, logical, top); /* takes one ref */ if (!re) return -1; @@ -916,7 +915,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct reada_control *rc; u64 start; u64 generation; - int level; int ret; struct extent_buffer *node; static struct btrfs_key max_key = { @@ -939,11 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, node = btrfs_root_node(root); start = node->start; - level = btrfs_header_level(node); generation = btrfs_header_generation(node); free_extent_buffer(node); - ret = reada_add_block(rc, start, &max_key, level, generation); + ret = reada_add_block(rc, start, &max_key, generation); if (ret) { kfree(rc); return ERR_PTR(ret); From b257cf50060cada63a3fe11a932909b9badf4595 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 21:07:17 +0800 Subject: [PATCH 09/17] btrfs: reada: move reada_extent_put to place after __readahead_hook() We can't release reada_extent earlier than __readahead_hook(), because __readahead_hook() still need to use it, it is necessary to hode a refcnt to avoid it be freed. Actually it is not a problem after my patch named: Avoid many times of empty loop It make reada_extent in above line include at least one reada_extctl, which keeps additional one refcnt for reada_extent. But we still need this patch to make the code in pretty logic. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b40dbf717737..f7642e569bd9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -717,8 +717,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } logical = re->logical; - reada_extent_put(fs_info, re); - atomic_inc(&dev->reada_in_flight); ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); @@ -730,6 +728,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, if (eb) free_extent_buffer(eb); + reada_extent_put(fs_info, re); + return 1; } From 6e39dbe8b9e55280c396a19fb92e82f1b56f83d7 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 22:09:05 +0800 Subject: [PATCH 10/17] btrfs: reada: Pass reada_extent into __readahead_hook directly reada_start_machine_dev() already have reada_extent pointer, pass it into __readahead_hook() directly instead of search radix_tree will make code run faster. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index f7642e569bd9..203bec4823e1 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -105,33 +105,21 @@ static int reada_add_block(struct reada_control *rc, u64 logical, /* recurses */ /* in case of err, eb might be NULL */ -static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +static void __readahead_hook(struct btrfs_root *root, struct reada_extent *re, + struct extent_buffer *eb, u64 start, int err) { int level = 0; int nritems; int i; u64 bytenr; u64 generation; - struct reada_extent *re; struct btrfs_fs_info *fs_info = root->fs_info; struct list_head list; - unsigned long index = start >> PAGE_CACHE_SHIFT; struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (!re) - return -1; - spin_lock(&re->lock); /* * just take the full list from the extent. afterwards we @@ -221,11 +209,11 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, reada_extent_put(fs_info, re); /* one ref for each entry */ } - reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) atomic_dec(&for_dev->reada_in_flight); - return 0; + return; } /* @@ -235,12 +223,27 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err) { - int ret; + int ret = 0; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; - ret = __readahead_hook(root, eb, start, err); + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, + start >> PAGE_CACHE_SHIFT); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + if (!re) { + ret = -1; + goto start_machine; + } - reada_start_machine(root->fs_info); + __readahead_hook(fs_info, re, eb, start, err); + reada_extent_put(fs_info, re); /* our ref */ +start_machine: + reada_start_machine(fs_info); return ret; } @@ -721,9 +724,9 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info->extent_root, NULL, logical, ret); + __readahead_hook(fs_info->extent_root, re, NULL, logical, ret); else if (eb) - __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + __readahead_hook(fs_info->extent_root, re, eb, eb->start, ret); if (eb) free_extent_buffer(eb); From 02873e432518f84ad8f15d8911e79659ea38085f Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 22:46:45 +0800 Subject: [PATCH 11/17] btrfs: reada: Use fs_info instead of root in __readahead_hook's argument What __readahead_hook() need exactly is fs_info, no need to convert fs_info to root in caller and convert back in __readahead_hook() Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 22 +++++++++++----------- fs/btrfs/reada.c | 23 +++++++++++------------ 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bfe4a337fb4d..e557e05d2318 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4525,8 +4525,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err); +int btree_readahead_hook(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, u64 start, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4545e2e2ad45..498156ede32c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -612,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, int found_level; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int reads_done; @@ -637,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", - found_start, eb->start); + btrfs_err_rl(fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } - if (check_tree_block_fsid(root->fs_info, eb)) { - btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", - eb->start); + if (check_tree_block_fsid(fs_info, eb)) { + btrfs_err_rl(fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(root->fs_info, "bad tree block level %d", - (int)btrfs_header_level(eb)); + btrfs_err(fs_info, "bad tree block level %d", + (int)btrfs_header_level(eb)); ret = -EIO; goto err; } @@ -659,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root->fs_info, eb, 1); + ret = csum_tree_block(fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -680,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(root, eb, eb->start, ret); + btree_readahead_hook(fs_info, eb, eb->start, ret); if (ret) { /* @@ -699,14 +700,13 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, static int btree_io_failed_hook(struct page *page, int failed_mirror) { struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(root, eb, eb->start, -EIO); + btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 203bec4823e1..6d9069d01914 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -105,15 +105,15 @@ static int reada_add_block(struct reada_control *rc, u64 logical, /* recurses */ /* in case of err, eb might be NULL */ -static void __readahead_hook(struct btrfs_root *root, struct reada_extent *re, - struct extent_buffer *eb, u64 start, int err) +static void __readahead_hook(struct btrfs_fs_info *fs_info, + struct reada_extent *re, struct extent_buffer *eb, + u64 start, int err) { int level = 0; int nritems; int i; u64 bytenr; u64 generation; - struct btrfs_fs_info *fs_info = root->fs_info; struct list_head list; struct btrfs_device *for_dev; @@ -176,10 +176,10 @@ static void __readahead_hook(struct btrfs_root *root, struct reada_extent *re, */ #ifdef DEBUG if (rec->generation != generation) { - btrfs_debug(root->fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); + btrfs_debug(fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", + key.objectid, key.type, key.offset, + rec->generation, generation); } #endif if (rec->generation == generation && @@ -220,12 +220,11 @@ static void __readahead_hook(struct btrfs_root *root, struct reada_extent *re, * start is passed separately in case eb in NULL, which may be the case with * failed I/O */ -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +int btree_readahead_hook(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, u64 start, int err) { int ret = 0; struct reada_extent *re; - struct btrfs_fs_info *fs_info = root->fs_info; /* find extent */ spin_lock(&fs_info->reada_lock); @@ -724,9 +723,9 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info->extent_root, re, NULL, logical, ret); + __readahead_hook(fs_info, re, NULL, logical, ret); else if (eb) - __readahead_hook(fs_info->extent_root, re, eb, eb->start, ret); + __readahead_hook(fs_info, re, eb, eb->start, ret); if (eb) free_extent_buffer(eb); From 57f16e08269c6a91fb77508b2fe58130c6442d94 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 22:20:59 +0800 Subject: [PATCH 12/17] btrfs: reada: Jump into cleanup in direct way for __readahead_hook() Current code set nritems to 0 to make for_loop useless to bypass it, and set generation's value which is not necessary. Jump into cleanup directly is better choise. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 6d9069d01914..04d3e7c8ada0 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -130,26 +130,26 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, re->scheduled_for = NULL; spin_unlock(&re->lock); - if (err == 0) { - nritems = level ? btrfs_header_nritems(eb) : 0; - generation = btrfs_header_generation(eb); - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - } else { - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. Effectively this will - * cut the branch below this node from read ahead. - */ - nritems = 0; - generation = 0; - } + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + if (err) + goto cleanup; + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + if (!level) + goto cleanup; + + nritems = btrfs_header_nritems(eb); + generation = btrfs_header_generation(eb); for (i = 0; i < nritems; i++) { struct reada_extctl *rec; u64 n_gen; @@ -188,6 +188,8 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, reada_add_block(rc, bytenr, &next_key, n_gen); } } + +cleanup: /* * free extctl records */ From 8afd6841e13f8bbdf543c576bc1b919d331003ea Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 31 Dec 2015 22:28:51 +0800 Subject: [PATCH 13/17] btrfs: reada: Fix a debug code typo Remove one copy of loop to fix the typo of iterate zones. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 04d3e7c8ada0..5871306a5a3b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -893,14 +893,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, re->zones[i]->end); - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - printk(KERN_CONT " %lld", - re->zones[i]->devs[j]->devid); - } + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); } } printk(KERN_CONT "\n"); From 895a11b868347ca8e287f152f7816862ad4b179d Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 12 Jan 2016 14:58:39 +0800 Subject: [PATCH 14/17] btrfs: reada: simplify dev->reada_in_flight processing No need to decrease dev->reada_in_flight in __readahead_hook()'s internal and reada_extent_put(). reada_extent_put() have no chance to decrease dev->reada_in_flight in free operation, because reada_extent have additional refcnt when scheduled to a dev. We can put inc and dec operation for dev->reada_in_flight to one place instead to make logic simple and safe, and move useless reada_extent->scheduled_for to a bool flag instead. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 5871306a5a3b..9157d789ae86 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -72,7 +72,7 @@ struct reada_extent { spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; - struct btrfs_device *scheduled_for; + int scheduled; }; struct reada_zone { @@ -115,7 +115,6 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, u64 bytenr; u64 generation; struct list_head list; - struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); @@ -126,8 +125,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, * don't need the lock anymore */ list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; + re->scheduled = 0; spin_unlock(&re->lock); /* @@ -212,9 +210,6 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, reada_extent_put(fs_info, re); /* one ref for each entry */ } - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - return; } @@ -535,8 +530,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); kfree(re); } @@ -702,12 +695,12 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); spin_lock(&re->lock); - if (re->scheduled_for || list_empty(&re->extctl)) { + if (re->scheduled || list_empty(&re->extctl)) { spin_unlock(&re->lock); reada_extent_put(fs_info, re); return 0; } - re->scheduled_for = dev; + re->scheduled = 1; spin_unlock(&re->lock); /* @@ -732,6 +725,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, if (eb) free_extent_buffer(eb); + atomic_dec(&dev->reada_in_flight); reada_extent_put(fs_info, re); return 1; @@ -850,10 +844,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", + " re: logical %llu size %u empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), re->scheduled_for ? - re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", @@ -880,15 +873,14 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) index, 1); if (ret == 0) break; - if (!re->scheduled_for) { + if (!re->scheduled) { index = (re->logical >> PAGE_CACHE_SHIFT) + 1; continue; } printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", + "re: logical %llu size %u list empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), - re->scheduled_for ? re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, From 2fefd5583f8b86171c898f90cadac7c09ccf9d73 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 7 Jan 2016 18:38:48 +0800 Subject: [PATCH 15/17] btrfs: reada: limit max works count Reada creates 2 works for each level of tree recursively. In case of a tree having many levels, the number of created works is 2^level_of_tree. Actually we don't need so many works in parallel, this patch limits max works to BTRFS_MAX_MIRRORS * 2. The per-fs works_counter will be also used for btrfs_reada_wait() to check is there are background workers. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 1 + fs/btrfs/reada.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e557e05d2318..e43d987e1c99 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1822,6 +1822,9 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* readahead works cnt */ + atomic_t reada_works_cnt; + /* Extent buffer radix tree */ spinlock_t buffer_lock; struct radix_tree_root buffer_radix; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 498156ede32c..5e3ec1fc0ac3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2603,6 +2603,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); + atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9157d789ae86..e97bc8eb01e2 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -748,6 +748,8 @@ static void reada_start_machine_worker(struct btrfs_work *work) set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); set_task_ioprio(current, old_ioprio); + + atomic_dec(&fs_info->reada_works_cnt); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -779,8 +781,12 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) * enqueue to workers to finish it. This will distribute the load to * the cores. */ - for (i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { reada_start_machine(fs_info); + if (atomic_read(&fs_info->reada_works_cnt) > + BTRFS_MAX_MIRRORS * 2) + break; + } } static void reada_start_machine(struct btrfs_fs_info *fs_info) @@ -797,6 +803,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); + atomic_inc(&fs_info->reada_works_cnt); } #ifdef DEBUG From 4fe7a0e13864238fe5b4cc2640e963581f96429e Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 26 Jan 2016 18:42:40 +0800 Subject: [PATCH 16/17] btrfs: reada: avoid undone reada extents in btrfs_reada_wait Reada background works is not designed to finish all jobs completely, it will break in following case: 1: When a device reaches workload limit (MAX_IN_FLIGHT) 2: Total reads reach max limit (10000) 3: All devices don't have queued more jobs, often happened in DUP case And if all background works exit with remaining jobs, btrfs_reada_wait() will wait indefinetelly. Above problem is rarely happened in old code, because: 1: Every work queues 2x new works So many works reduced chances of undone jobs. 2: One work will continue 10000 times loop in case of no-jobs It reduced no-thread window time. But after we fixed above case, the "undone reada extents" frequently happened. Fix: Check to ensure we have at least one thread if there are undone jobs in btrfs_reada_wait(). Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e97bc8eb01e2..5bcd567f4827 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -953,8 +953,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); dump_devs(rc->root->fs_info, @@ -971,9 +974,13 @@ int btrfs_reada_wait(void *handle) int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { - wait_event(rc->wait, atomic_read(&rc->elems) == 0); + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + (HZ + 9) / 10); } kref_put(&rc->refcnt, reada_control_release); From 7aff8cf4a6d6190e64386f407a7f5cc5f24c60d2 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 14 Jan 2016 18:39:00 +0800 Subject: [PATCH 17/17] btrfs: reada: ignore creating reada_extent for a non-existent device For a non-existent device, old code bypasses adding it in dev's reada queue. And to solve problem of unfinished waitting in raid5/6, commit 5fbc7c59fd22 ("Btrfs: fix unfinished readahead thread for raid5/6 degraded mounting") adding an exception for the first stripe, in short, the first stripe will always be processed whether the device exists or not. Actually we have a better way for the above request: just bypass creation of the reada_extent for non-existent device, it will make code simple and effective. Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/reada.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 5bcd567f4827..dd5f361f1a8e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -371,6 +371,11 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_zone *zone; dev = bbio->stripes[nzones].dev; + + /* cannot read ahead on missing device. */ + if (!dev->bdev) + continue; + zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) continue; @@ -423,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } - if (!dev->bdev) { - /* - * cannot read ahead on missing device, but for RAID5/6, - * REQ_GET_READ_MIRRORS return 1. So don't skip missing - * device for such case. - */ - if (nzones > 1) - continue; - } + if (!dev->bdev) + continue; + if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { /*