Merge branch 'for-4.6/core' of git://git.kernel.dk/linux-block

Pull core block updates from Jens Axboe:
 "Here are the core block changes for this merge window.  Not a lot of
  exciting stuff going on in this round, most of the changes have been
  on the driver side of things.  That pull request is coming next.  This
  pull request contains:

   - A set of fixes for chained bio handling from Christoph.

   - A tag bounds check for blk-mq from Hannes, ensuring that we don't
     do something stupid if a device reports an invalid tag value.

   - A set of fixes/updates for the CFQ IO scheduler from Jan Kara.

   - A set of blk-mq fixes from Keith, adding support for dynamic
     hardware queues, and fixing init of max_dev_sectors for stacking
     devices.

   - A fix for the dynamic hw context from Ming.

   - Enabling of cgroup writeback support on a block device, from
     Shaohua"

* 'for-4.6/core' of git://git.kernel.dk/linux-block:
  blk-mq: add bounds check on tag-to-rq conversion
  block: bio_remaining_done() isn't unlikely
  block: cleanup bio_endio
  block: factor out chained bio completion
  block: don't unecessarily clobber bi_error for chained bios
  block-dev: enable writeback cgroup support
  blk-mq: Fix NULL pointer updating nr_requests
  blk-mq: mark request queue as mq asap
  block: Initialize max_dev_sectors to 0
  blk-mq: dynamic h/w context count
  cfq-iosched: Allow parent cgroup to preempt its child
  cfq-iosched: Allow sync noidle workloads to preempt each other
  cfq-iosched: Reorder checks in cfq_should_preempt()
  cfq-iosched: Don't group_idle if cfqq has big thinktime
This commit is contained in:
Linus Torvalds 2016-03-18 16:43:11 -07:00
commit 35d88d97be
7 changed files with 185 additions and 110 deletions

View File

@ -296,13 +296,19 @@ void bio_reset(struct bio *bio)
}
EXPORT_SYMBOL(bio_reset);
static void bio_chain_endio(struct bio *bio)
static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
if (!parent->bi_error)
parent->bi_error = bio->bi_error;
bio_endio(parent);
bio_put(bio);
return parent;
}
static void bio_chain_endio(struct bio *bio)
{
bio_endio(__bio_chain_endio(bio));
}
/*
@ -1742,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio)
**/
void bio_endio(struct bio *bio)
{
while (bio) {
if (unlikely(!bio_remaining_done(bio)))
break;
again:
if (!bio_remaining_done(bio))
return;
/*
* Need to have a real endio function for chained bios,
* otherwise various corner cases will break (like stacking
* block devices that save/restore bi_end_io) - however, we want
* to avoid unbounded recursion and blowing the stack. Tail call
* optimization would handle this, but compiling with frame
* pointers also disables gcc's sibling call optimization.
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
* save/restore bi_end_io) - however, we want to avoid unbounded
* recursion and blowing the stack. Tail call optimization would
* handle this, but compiling with frame pointers also disables
* gcc's sibling call optimization.
*/
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
bio = __bio_chain_endio(bio);
goto again;
}
if (bio->bi_end_io)
bio->bi_end_io(bio);
bio = NULL;
}
}
}
EXPORT_SYMBOL(bio_endio);

View File

@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk)
blk_mq_enable_hotplug();
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
}
static void blk_mq_sysfs_init(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
queue_for_each_hw_ctx(q, hctx, i)
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
queue_for_each_ctx(q, ctx, i)
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
}

View File

@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag < tags->nr_tags)
return tags->rqs[tag];
return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
@ -1744,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
return -1;
}
static int blk_mq_init_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
/*
* Initialize hardware queues
*/
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_init_hctx(q, set, hctx, i))
break;
}
if (i == q->nr_hw_queues)
return 0;
/*
* Init failed
*/
blk_mq_exit_hw_queues(q, set, i);
return 1;
}
static void blk_mq_init_cpu_queues(struct request_queue *q,
unsigned int nr_hw_queues)
{
@ -1826,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
continue;
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
@ -1974,56 +1953,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
struct blk_mq_hw_ctx **hctxs;
struct blk_mq_ctx __percpu *ctx;
unsigned int *map;
int i;
ctx = alloc_percpu(struct blk_mq_ctx);
if (!ctx)
return ERR_PTR(-ENOMEM);
hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
set->numa_node);
if (!hctxs)
goto err_percpu;
map = blk_mq_make_queue_map(set);
if (!map)
goto err_map;
int i, j;
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
blk_mq_sysfs_unregister(q);
for (i = 0; i < set->nr_hw_queues; i++) {
int node = blk_mq_hw_queue_to_node(map, i);
int node;
if (hctxs[i])
continue;
node = blk_mq_hw_queue_to_node(q->mq_map, i);
hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
GFP_KERNEL, node);
if (!hctxs[i])
goto err_hctxs;
break;
if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
node))
goto err_hctxs;
node)) {
kfree(hctxs[i]);
hctxs[i] = NULL;
break;
}
atomic_set(&hctxs[i]->nr_active, 0);
hctxs[i]->numa_node = node;
hctxs[i]->queue_num = i;
if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
free_cpumask_var(hctxs[i]->cpumask);
kfree(hctxs[i]);
hctxs[i] = NULL;
break;
}
blk_mq_hctx_kobj_init(hctxs[i]);
}
for (j = i; j < q->nr_hw_queues; j++) {
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
if (hctx->tags) {
blk_mq_free_rq_map(set, hctx->tags, j);
set->tags[j] = NULL;
}
blk_mq_exit_hctx(q, set, hctx, j);
free_cpumask_var(hctx->cpumask);
kobject_put(&hctx->kobj);
kfree(hctx->ctxs);
kfree(hctx);
hctxs[j] = NULL;
}
}
q->nr_hw_queues = i;
blk_mq_sysfs_register(q);
}
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
return ERR_PTR(-ENOMEM);
q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
GFP_KERNEL, set->numa_node);
if (!q->queue_hw_ctx)
goto err_percpu;
q->mq_map = blk_mq_make_queue_map(set);
if (!q->mq_map)
goto err_map;
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;
q->nr_hw_queues = set->nr_hw_queues;
q->mq_map = map;
q->queue_ctx = ctx;
q->queue_hw_ctx = hctxs;
q->mq_ops = set->ops;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (!(set->flags & BLK_MQ_F_SG_MERGE))
@ -2050,9 +2066,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
if (blk_mq_init_hw_queues(q, set))
goto err_hctxs;
get_online_cpus();
mutex_lock(&all_q_mutex);
@ -2066,17 +2079,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return q;
err_hctxs:
kfree(map);
for (i = 0; i < set->nr_hw_queues; i++) {
if (!hctxs[i])
break;
free_cpumask_var(hctxs[i]->cpumask);
kfree(hctxs[i]);
}
kfree(q->mq_map);
err_map:
kfree(hctxs);
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(ctx);
free_percpu(q->queue_ctx);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@ -2284,9 +2291,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus.
*/
if (set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
set->tags = kmalloc_node(set->nr_hw_queues *
sizeof(struct blk_mq_tags *),
set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
if (!set->tags)
return -ENOMEM;
@ -2309,7 +2320,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
for (i = 0; i < nr_cpu_ids; i++) {
if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
}
@ -2330,6 +2341,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
ret = blk_mq_tag_update_depth(hctx->tags, nr);
if (ret)
break;
@ -2341,6 +2354,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
struct request_queue *q;
if (nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids;
if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
return;
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
set->nr_hw_queues = nr_hw_queues;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues > 1)
blk_queue_make_request(q, blk_mq_make_request);
else
blk_queue_make_request(q, blk_sq_make_request);
blk_mq_queue_reinit(q, cpu_online_mask);
}
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
void blk_mq_disable_hotplug(void)
{
mutex_lock(&all_q_mutex);

View File

@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
*/
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);

View File

@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
struct cfq_group *ancestor)
{
return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
}
static inline void cfqg_get(struct cfq_group *cfqg)
{
return blkg_get(cfqg_to_blkg(cfqg));
@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
#else /* CONFIG_CFQ_GROUP_IOSCHED */
static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
struct cfq_group *ancestor)
{
return true;
}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }
@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
struct cfq_rb_root *st = cfqq->service_tree;
struct cfq_io_cq *cic;
unsigned long sl, group_idle = 0;
@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
return;
}
/* There are other queues in the group, don't do group idle */
if (group_idle && cfqq->cfqg->nr_cfqq > 1)
/*
* There are other queues in the group or this is the only group and
* it has too big thinktime, don't do group idle.
*/
if (group_idle &&
(cfqq->cfqg->nr_cfqq > 1 ||
cfq_io_thinktime_big(cfqd, &st->ttime, true)))
return;
cfq_mark_cfqq_wait_request(cfqq);
@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
return true;
if (new_cfqq->cfqg != cfqq->cfqg)
/*
* Treat ancestors of current cgroup the same way as current cgroup.
* For anybody else we disallow preemption to guarantee service
* fairness among cgroups.
*/
if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
return false;
if (cfq_slice_used(cfqq))
return true;
/*
* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
*/
if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
return true;
WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
/* Allow preemption only if we are idling on sync-noidle tree */
if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
new_cfqq->service_tree->count == 2 &&
RB_EMPTY_ROOT(&cfqq->sort_list))
return true;
@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
return true;
/*
* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
*/
if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
return true;
/* An idle queue should not be idle now for some reason */
if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
return true;

View File

@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
struct dentry *dent;
dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
if (dent)
dent->d_sb->s_iflags |= SB_I_CGROUPWB;
return dent;
}
static struct file_system_type bd_type = {

View File

@ -244,6 +244,8 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_mq_freeze_queue_start(struct request_queue *q);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
/*
* Driver command data is immediately after the request. So subtract request
* size to get back to the original request, add request size to get the PDU.