Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and removing the
     rrpc 1.2 support in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.

   - Support in blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.

   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's always been problematic on dm since
     it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it got reused at the wrong time.

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.

   - sgl_alloc/free helpers from Bart.

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
commit 0a4b6e2f80
Linus Torvalds, 2018-01-29 11:51:49 -08:00
124 changed files with 3878 additions and 4723 deletions
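Aside on the sgl_alloc/sgl_free helpers mentioned above (the shortlog also carries a fix for sgl_alloc_order()). The sketch below is illustrative only and not part of this merge; the signatures are recalled from lib/scatterlist and should be treated as an assumption. A driver that used to open-code scatterlist allocation might use the new helpers roughly like this:

    /* Illustrative only: assumed signatures of the new lib/scatterlist helpers. */
    #include <linux/scatterlist.h>
    #include <linux/gfp.h>

    static struct scatterlist *example_alloc_sgl(unsigned long long nbytes,
                                                 unsigned int *nents)
    {
            /* Allocate pages plus a scatterlist describing 'nbytes' bytes;
             * *nents is set to the number of entries on success. */
            return sgl_alloc(nbytes, GFP_KERNEL, nents);
    }

    static void example_free_sgl(struct scatterlist *sgl)
    {
            /* Frees both the pages and the scatterlist itself. */
            sgl_free(sgl);
    }

example_alloc_sgl() and example_free_sgl() are hypothetical wrapper names used only for this sketch.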

View File

@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 unsigned long flags;
 int i;
-if (!entity) /* root group */
-return;
 spin_lock_irqsave(&bfqd->lock, flags);
+if (!entity) /* root group */
+goto put_async_queues;
 /*
 * Empty all service_trees belonging to this group before
 * deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 }
 __bfq_deactivate_entity(entity, false);
+put_async_queues:
 bfq_put_async_queues(bfqd, bfqg);
 spin_unlock_irqrestore(&bfqd->lock, flags);

View File

@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
 /* Default timeout values, in jiffies, approximating CFQ defaults. */
 const int bfq_timeout = HZ / 8;
+/*
+ * Time limit for merging (see comments in bfq_setup_cooperator). Set
+ * to the slowest value that, in our tests, proved to be effective in
+ * removing false positives, while not causing true positives to miss
+ * queue merging.
+ *
+ * As can be deduced from the low time limit below, queue merging, if
+ * successful, happens at the very beggining of the I/O of the involved
+ * cooperating processes, as a consequence of the arrival of the very
+ * first requests from each cooperator. After that, there is very
+ * little chance to find cooperators.
+ */
+static const unsigned long bfq_merge_time_limit = HZ/10;
 static struct kmem_cache *bfq_pool;
 /* Below this threshold (in ns), we consider thinktime immediate. */
@@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;
 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
 /* Min number of samples required to perform peak-rate update */
 #define BFQ_RATE_MIN_SAMPLES 32
@@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
-* In particular, R is the peak rate of the reference device (see below),
-* and T is a reference time: given the systems that are likely to be
-* installed on the reference device according to its speed class, T is
-* about the maximum time needed, under BFQ and while reading two files in
-* parallel, to load typical large applications on these systems.
-* In practice, the slower/faster the device at hand is, the more/less it
-* takes to load applications with respect to the reference device.
-* Accordingly, the longer/shorter BFQ grants weight raising to interactive
-* applications.
+* In particular, R is the peak rate of the reference device (see
+* below), and T is a reference time: given the systems that are
+* likely to be installed on the reference device according to its
+* speed class, T is about the maximum time needed, under BFQ and
+* while reading two files in parallel, to load typical large
+* applications on these systems (see the comments on
+* max_service_from_wr below, for more details on how T is obtained).
+* In practice, the slower/faster the device at hand is, the more/less
+* it takes to load applications with respect to the reference device.
+* Accordingly, the longer/shorter BFQ grants weight raising to
+* interactive applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
@@ -240,6 +256,60 @@ static int T_slow[2];
 static int T_fast[2];
 static int device_speed_thresh[2];
+/*
+ * BFQ uses the above-detailed, time-based weight-raising mechanism to
+ * privilege interactive tasks. This mechanism is vulnerable to the
+ * following false positives: I/O-bound applications that will go on
+ * doing I/O for much longer than the duration of weight
+ * raising. These applications have basically no benefit from being
+ * weight-raised at the beginning of their I/O. On the opposite end,
+ * while being weight-raised, these applications
+ * a) unjustly steal throughput to applications that may actually need
+ * low latency;
+ * b) make BFQ uselessly perform device idling; device idling results
+ * in loss of device throughput with most flash-based storage, and may
+ * increase latencies when used purposelessly.
+ *
+ * BFQ tries to reduce these problems, by adopting the following
+ * countermeasure. To introduce this countermeasure, we need first to
+ * finish explaining how the duration of weight-raising for
+ * interactive tasks is computed.
+ *
+ * For a bfq_queue deemed as interactive, the duration of weight
+ * raising is dynamically adjusted, as a function of the estimated
+ * peak rate of the device, so as to be equal to the time needed to
+ * execute the 'largest' interactive task we benchmarked so far. By
+ * largest task, we mean the task for which each involved process has
+ * to do more I/O than for any of the other tasks we benchmarked. This
+ * reference interactive task is the start-up of LibreOffice Writer,
+ * and in this task each process/bfq_queue needs to have at most ~110K
+ * sectors transferred.
+ *
+ * This last piece of information enables BFQ to reduce the actual
+ * duration of weight-raising for at least one class of I/O-bound
+ * applications: those doing sequential or quasi-sequential I/O. An
+ * example is file copy. In fact, once started, the main I/O-bound
+ * processes of these applications usually consume the above 110K
+ * sectors in much less time than the processes of an application that
+ * is starting, because these I/O-bound processes will greedily devote
+ * almost all their CPU cycles only to their target,
+ * throughput-friendly I/O operations. This is even more true if BFQ
+ * happens to be underestimating the device peak rate, and thus
+ * overestimating the duration of weight raising. But, according to
+ * our measurements, once transferred 110K sectors, these processes
+ * have no right to be weight-raised any longer.
+ *
+ * Basing on the last consideration, BFQ ends weight-raising for a
+ * bfq_queue if the latter happens to have received an amount of
+ * service at least equal to the following constant. The constant is
+ * set to slightly more than 110K, to have a minimum safety margin.
+ *
+ * This early ending of weight-raising reduces the amount of time
+ * during which interactive false positives cause the two problems
+ * described at the beginning of these comments.
+ */
+static const unsigned long max_service_from_wr = 120000;
 #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
 #define RQ_BFQQ(rq) ((rq)->elv.priv[1])
@@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 }
 }
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function.
+ */
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+{
+bfqd->sb_shift = bt->sb.shift;
+/*
+ * In-word depths if no bfq_queue is being weight-raised:
+ * leaving 25% of tags only for sync reads.
+ *
+ * In next formulas, right-shift the value
+ * (1U<<bfqd->sb_shift), instead of computing directly
+ * (1U<<(bfqd->sb_shift - something)), to be robust against
+ * any possible value of bfqd->sb_shift, without having to
+ * limit 'something'.
+ */
+/* no more than 50% of tags for async I/O */
+bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
+/*
+ * no more than 75% of tags for sync writes (25% extra tags
+ * w.r.t. async I/O, to prevent async I/O from starving sync
+ * writes)
+ */
+bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
+/*
+ * In-word depths in case some bfq_queue is being weight-
+ * raised: leaving ~63% of tags for sync reads. This is the
+ * highest percentage for which, in our tests, application
+ * start-up times didn't suffer from any regression due to tag
+ * shortage.
+ */
+/* no more than ~18% of tags for async I/O */
+bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
+/* no more than ~37% of tags for sync writes (~20% extra tags) */
+bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
+}
+/*
+ * Async I/O can easily starve sync I/O (both sync reads and sync
+ * writes), by consuming all tags. Similarly, storms of sync writes,
+ * such as those that sync(2) may trigger, can starve sync reads.
+ * Limit depths of async I/O and sync writes so as to counter both
+ * problems.
+ */
+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+struct bfq_data *bfqd = data->q->elevator->elevator_data;
+struct sbitmap_queue *bt;
+if (op_is_sync(op) && !op_is_write(op))
+return;
+if (data->flags & BLK_MQ_REQ_RESERVED) {
+if (unlikely(!tags->nr_reserved_tags)) {
+WARN_ON_ONCE(1);
+return;
+}
+bt = &tags->breserved_tags;
+} else
+bt = &tags->bitmap_tags;
+if (unlikely(bfqd->sb_shift != bt->sb.shift))
+bfq_update_depths(bfqd, bt);
+data->shallow_depth =
+bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+__func__, bfqd->wr_busy_queues, op_is_sync(op),
+data->shallow_depth);
+}
 static struct bfq_queue *
 bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 sector_t sector, struct rb_node **ret_parent,
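Aside (not part of the diff): to make the tag-depth limits computed by bfq_update_depths() above concrete, here is a minimal stand-alone user-space sketch that evaluates the same word_depths[][] formulas for an assumed sbitmap shift of 6 (64 tags per word). The shift value is an assumption chosen only for illustration.

    /* Stand-alone illustration: evaluate the bfq_update_depths() formulas
     * for an assumed sbitmap shift of 6 (i.e. 64 tags per word). */
    #include <stdio.h>

    static unsigned int max_u(unsigned int a, unsigned int b)
    {
            return a > b ? a : b;
    }

    int main(void)
    {
            unsigned int shift = 6;   /* assumption: bt->sb.shift == 6 */
            unsigned int d[2][2];

            /* no queue is weight-raised: <=50% async, <=75% sync writes */
            d[0][0] = max_u((1U << shift) >> 1, 1U);
            d[0][1] = max_u(((1U << shift) * 3) >> 2, 1U);
            /* some queue is weight-raised: ~18% async, ~37% sync writes */
            d[1][0] = max_u(((1U << shift) * 3) >> 4, 1U);
            d[1][1] = max_u(((1U << shift) * 6) >> 4, 1U);

            /* prints 32/48 (no weight raising) and 12/24 (weight raising) */
            printf("no wr: async %u, sync writes %u\n", d[0][0], d[0][1]);
            printf("wr:    async %u, sync writes %u\n", d[1][0], d[1][1]);
            return 0;
    }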
@@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 return bfqq;
 }
+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
+{
+return bfqq->service_from_backlogged > 0 &&
+time_is_before_jiffies(bfqq->first_IO_time +
+bfq_merge_time_limit);
+}
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 struct rb_node **p, *parent;
@@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 bfqq->pos_root = NULL;
 }
+/*
+ * bfqq cannot be merged any longer (see comments in
+ * bfq_setup_cooperator): no point in adding bfqq into the
+ * position tree.
+ */
+if (bfq_too_late_for_merging(bfqq))
+return;
 if (bfq_class_idle(bfqq))
 return;
 if (!bfqq->next_rq)
@@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
 /* start a weight-raising period */
 if (interactive) {
+bfqq->service_from_wr = 0;
 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 } else {
@@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,
 rb_erase(&bfqq->pos_node, bfqq->pos_root);
 bfqq->pos_root = NULL;
 }
+} else {
+bfq_pos_tree_add_move(bfqd, bfqq);
 }
 if (rq->cmd_flags & REQ_META)
@@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
 static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 struct bfq_queue *new_bfqq)
 {
+if (bfq_too_late_for_merging(new_bfqq))
+return false;
 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
 (bfqq->ioprio_class != new_bfqq->ioprio_class))
 return false;
@@ -1956,20 +2123,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 return true;
 }
-/*
- * If this function returns true, then bfqq cannot be merged. The idea
- * is that true cooperation happens very early after processes start
- * to do I/O. Usually, late cooperations are just accidental false
- * positives. In case bfqq is weight-raised, such false positives
- * would evidently degrade latency guarantees for bfqq.
- */
-static bool wr_from_too_long(struct bfq_queue *bfqq)
-{
-return bfqq->wr_coeff > 1 &&
-time_is_before_jiffies(bfqq->last_wr_start_finish +
-msecs_to_jiffies(100));
-}
 /*
 * Attempt to schedule a merge of bfqq with the currently in-service
 * queue or with a close queue among the scheduled queues. Return
@@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
 * to maintain. Besides, in such a critical condition as an out of memory,
 * the benefits of queue merging may be little relevant, or even negligible.
 *
-* Weight-raised queues can be merged only if their weight-raising
-* period has just started. In fact cooperating processes are usually
-* started together. Thus, with this filter we avoid false positives
-* that would jeopardize low-latency guarantees.
 *
 * WARNING: queue merging may impair fairness among non-weight raised
 * queues, for at least two reasons: 1) the original weight of a
 * merged queue may change during the merged state, 2) even being the
@@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 {
 struct bfq_queue *in_service_bfqq, *new_bfqq;
+/*
+ * Prevent bfqq from being merged if it has been created too
+ * long ago. The idea is that true cooperating processes, and
+ * thus their associated bfq_queues, are supposed to be
+ * created shortly after each other. This is the case, e.g.,
+ * for KVM/QEMU and dump I/O threads. Basing on this
+ * assumption, the following filtering greatly reduces the
+ * probability that two non-cooperating processes, which just
+ * happen to do close I/O for some short time interval, have
+ * their queues merged by mistake.
+ */
+if (bfq_too_late_for_merging(bfqq))
+return NULL;
 if (bfqq->new_bfqq)
 return bfqq->new_bfqq;
-if (!io_struct ||
-wr_from_too_long(bfqq) ||
-unlikely(bfqq == &bfqd->oom_bfqq))
+if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
 return NULL;
 /* If there is only one backlogged queue, don't search. */
@@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 in_service_bfqq = bfqd->in_service_queue;
-if (!in_service_bfqq || in_service_bfqq == bfqq
-|| wr_from_too_long(in_service_bfqq) ||
-unlikely(in_service_bfqq == &bfqd->oom_bfqq))
-goto check_scheduled;
-if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+if (in_service_bfqq && in_service_bfqq != bfqq &&
+likely(in_service_bfqq != &bfqd->oom_bfqq) &&
+bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
 bfqq->entity.parent == in_service_bfqq->entity.parent &&
 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 * queues. The only thing we need is that the bio/request is not
 * NULL, as we need it to establish whether a cooperator exists.
 */
-check_scheduled:
 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
 bfq_io_struct_pos(io_struct, request));
-if (new_bfqq && !wr_from_too_long(new_bfqq) &&
-likely(new_bfqq != &bfqd->oom_bfqq) &&
+if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
 bfq_may_be_close_cooperator(bfqq, new_bfqq))
 return bfq_setup_merge(bfqq, new_bfqq);
@@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
 if (unlikely(bfq_bfqq_just_created(bfqq) &&
-!bfq_bfqq_in_large_burst(bfqq))) {
+!bfq_bfqq_in_large_burst(bfqq) &&
+bfqq->bfqd->low_latency)) {
 /*
 * bfqq being merged right after being created: bfqq
 * would have deserved interactive weight raising, but
@@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 * whereas soft_rt_next_start is set to infinity for applications that do
 * not.
 *
-* Unfortunately, even a greedy application may happen to behave in an
-* isochronous way if the CPU load is high. In fact, the application may
-* stop issuing requests while the CPUs are busy serving other processes,
-* then restart, then stop again for a while, and so on. In addition, if
-* the disk achieves a low enough throughput with the request pattern
-* issued by the application (e.g., because the request pattern is random
-* and/or the device is slow), then the application may meet the above
-* bandwidth requirement too. To prevent such a greedy application to be
-* deemed as soft real-time, a further rule is used in the computation of
-* soft_rt_next_start: soft_rt_next_start must be higher than the current
-* time plus the maximum time for which the arrival of a request is waited
-* for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
-* This filters out greedy applications, as the latter issue instead their
-* next request as soon as possible after the last one has been completed
-* (in contrast, when a batch of requests is completed, a soft real-time
-* application spends some time processing data).
+* Unfortunately, even a greedy (i.e., I/O-bound) application may
+* happen to meet, occasionally or systematically, both the above
+* bandwidth and isochrony requirements. This may happen at least in
+* the following circumstances. First, if the CPU load is high. The
+* application may stop issuing requests while the CPUs are busy
+* serving other processes, then restart, then stop again for a while,
+* and so on. The other circumstances are related to the storage
+* device: the storage device is highly loaded or reaches a low-enough
+* throughput with the I/O of the application (e.g., because the I/O
+* is random and/or the device is slow). In all these cases, the
+* I/O of the application may be simply slowed down enough to meet
+* the bandwidth and isochrony requirements. To reduce the probability
+* that greedy applications are deemed as soft real-time in these
+* corner cases, a further rule is used in the computation of
+* soft_rt_next_start: the return value of this function is forced to
+* be higher than the maximum between the following two quantities.
 *
-* Unfortunately, the last filter may easily generate false positives if
-* only bfqd->bfq_slice_idle is used as a reference time interval and one
-* or both the following cases occur:
-* 1) HZ is so low that the duration of a jiffy is comparable to or higher
-* than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
-* HZ=100.
+* (a) Current time plus: (1) the maximum time for which the arrival
+* of a request is waited for when a sync queue becomes idle,
+* namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
+* postpone for a moment the reason for adding a few extra
+* jiffies; we get back to it after next item (b). Lower-bounding
+* the return value of this function with the current time plus
+* bfqd->bfq_slice_idle tends to filter out greedy applications,
+* because the latter issue their next request as soon as possible
+* after the last one has been completed. In contrast, a soft
+* real-time application spends some time processing data, after a
+* batch of its requests has been completed.
+*
+* (b) Current value of bfqq->soft_rt_next_start. As pointed out
+* above, greedy applications may happen to meet both the
+* bandwidth and isochrony requirements under heavy CPU or
+* storage-device load. In more detail, in these scenarios, these
+* applications happen, only for limited time periods, to do I/O
+* slowly enough to meet all the requirements described so far,
+* including the filtering in above item (a). These slow-speed
+* time intervals are usually interspersed between other time
+* intervals during which these applications do I/O at a very high
+* speed. Fortunately, exactly because of the high speed of the
+* I/O in the high-speed intervals, the values returned by this
+* function happen to be so high, near the end of any such
+* high-speed interval, to be likely to fall *after* the end of
+* the low-speed time interval that follows. These high values are
+* stored in bfqq->soft_rt_next_start after each invocation of
+* this function. As a consequence, if the last value of
+* bfqq->soft_rt_next_start is constantly used to lower-bound the
+* next value that this function may return, then, from the very
+* beginning of a low-speed interval, bfqq->soft_rt_next_start is
+* likely to be constantly kept so high that any I/O request
+* issued during the low-speed interval is considered as arriving
+* to soon for the application to be deemed as soft
+* real-time. Then, in the high-speed interval that follows, the
+* application will not be deemed as soft real-time, just because
+* it will do I/O at a high speed. And so on.
+*
+* Getting back to the filtering in item (a), in the following two
+* cases this filtering might be easily passed by a greedy
+* application, if the reference quantity was just
+* bfqd->bfq_slice_idle:
+* 1) HZ is so low that the duration of a jiffy is comparable to or
+* higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
+* devices with HZ=100. The time granularity may be so coarse
+* that the approximation, in jiffies, of bfqd->bfq_slice_idle
+* is rather lower than the exact value.
 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
 * for a while, then suddenly 'jump' by several units to recover the lost
 * increments. This seems to happen, e.g., inside virtual machines.
-* To address this issue, we do not use as a reference time interval just
-* bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
-* particular we add the minimum number of jiffies for which the filter
-* seems to be quite precise also in embedded systems and KVM/QEMU virtual
-* machines.
+* To address this issue, in the filtering in (a) we do not use as a
+* reference time interval just bfqd->bfq_slice_idle, but
+* bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
+* minimum number of jiffies for which the filter seems to be quite
+* precise also in embedded systems and KVM/QEMU virtual machines.
 */
 static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
 struct bfq_queue *bfqq)
 {
-return max(bfqq->last_idle_bklogged +
-HZ * bfqq->service_from_backlogged /
-bfqd->bfq_wr_max_softrt_rate,
-jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+return max3(bfqq->soft_rt_next_start,
+bfqq->last_idle_bklogged +
+HZ * bfqq->service_from_backlogged /
+bfqd->bfq_wr_max_softrt_rate,
+jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 /**
@@ -2999,17 +3197,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 */
 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
-/*
- * Increase service_from_backlogged before next statement,
- * because the possible next invocation of
- * bfq_bfqq_charge_time would likely inflate
- * entity->service. In contrast, service_from_backlogged must
- * contain real service, to enable the soft real-time
- * heuristic to correctly compute the bandwidth consumed by
- * bfqq.
- */
-bfqq->service_from_backlogged += entity->service;
 /*
 * As above explained, charge slow (typically seeky) and
 * timed-out queues with the time and not the service
@@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 bfqq->entity.prio_changed = 1;
 }
 }
+if (bfqq->wr_coeff > 1 &&
+bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
+bfqq->service_from_wr > max_service_from_wr) {
+/* see comments on max_service_from_wr */
+bfq_bfqq_end_wr(bfqq);
+}
 }
 /*
 * To improve latency (for this or other queues), immediately
@@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 }
 /*
-* We exploit the put_rq_private hook to decrement
-* rq_in_driver, but put_rq_private will not be
+* We exploit the bfq_finish_request hook to decrement
+* rq_in_driver, but bfq_finish_request will not be
 * invoked on this request. So, to avoid unbalance,
 * just start this request, without incrementing
 * rq_in_driver. As a negative consequence,
@@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 * bfq_schedule_dispatch to be invoked uselessly.
 *
 * As for implementing an exact solution, the
-* put_request hook, if defined, is probably invoked
-* also on this request. So, by exploiting this hook,
-* we could 1) increment rq_in_driver here, and 2)
-* decrement it in put_request. Such a solution would
-* let the value of the counter be always accurate,
-* but it would entail using an extra interface
-* function. This cost seems higher than the benefit,
-* being the frequency of non-elevator-private
+* bfq_finish_request hook, if defined, is probably
+* invoked also on this request. So, by exploiting
+* this hook, we could 1) increment rq_in_driver here,
+* and 2) decrement it in bfq_finish_request. Such a
+* solution would let the value of the counter be
+* always accurate, but it would entail using an extra
+* interface function. This cost seems higher than the
+* benefit, being the frequency of non-elevator-private
 * requests very low.
 */
 goto start_rq;
@@ -3689,35 +3882,16 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 return rq;
 }
-static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+static void bfq_update_dispatch_stats(struct request_queue *q,
+struct request *rq,
+struct bfq_queue *in_serv_queue,
+bool idle_timer_disabled)
 {
-struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
-struct request *rq;
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-struct bfq_queue *in_serv_queue, *bfqq;
-bool waiting_rq, idle_timer_disabled;
-#endif
-spin_lock_irq(&bfqd->lock);
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-in_serv_queue = bfqd->in_service_queue;
-waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
-rq = __bfq_dispatch_request(hctx);
-idle_timer_disabled =
-waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
-#else
-rq = __bfq_dispatch_request(hctx);
-#endif
-spin_unlock_irq(&bfqd->lock);
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-bfqq = rq ? RQ_BFQQ(rq) : NULL;
+struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
 if (!idle_timer_disabled && !bfqq)
-return rq;
+return;
 /*
 * rq and bfqq are guaranteed to exist until this function
@@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 * In addition, the following queue lock guarantees that
 * bfqq_group(bfqq) exists as well.
 */
-spin_lock_irq(hctx->queue->queue_lock);
+spin_lock_irq(q->queue_lock);
 if (idle_timer_disabled)
 /*
 * Since the idle timer has been disabled,
@@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 bfqg_stats_set_start_empty_time(bfqg);
 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
 }
-spin_unlock_irq(hctx->queue->queue_lock);
+spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_dispatch_stats(struct request_queue *q,
+struct request *rq,
+struct bfq_queue *in_serv_queue,
+bool idle_timer_disabled) {}
 #endif
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+struct request *rq;
+struct bfq_queue *in_serv_queue;
+bool waiting_rq, idle_timer_disabled;
+spin_lock_irq(&bfqd->lock);
+in_serv_queue = bfqd->in_service_queue;
+waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
+rq = __bfq_dispatch_request(hctx);
+idle_timer_disabled =
+waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+spin_unlock_irq(&bfqd->lock);
+bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
+idle_timer_disabled);
 return rq;
 }
@@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 bfqq->split_time = bfq_smallest_from_now();
 /*
-* Set to the value for which bfqq will not be deemed as
-* soft rt when it becomes backlogged.
+* To not forget the possibly high bandwidth consumed by a
+* process/queue in the recent past,
+* bfq_bfqq_softrt_next_start() returns a value at least equal
+* to the current value of bfqq->soft_rt_next_start (see
+* comments on bfq_bfqq_softrt_next_start). Set
+* soft_rt_next_start to now, to mean that bfqq has consumed
+* no bandwidth so far.
 */
-bfqq->soft_rt_next_start = bfq_greatest_from_now();
+bfqq->soft_rt_next_start = jiffies;
 /* first request is almost certainly seeky */
 bfqq->seek_history = 1;
@@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 return idle_timer_disabled;
 }
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+static void bfq_update_insert_stats(struct request_queue *q,
+struct bfq_queue *bfqq,
+bool idle_timer_disabled,
+unsigned int cmd_flags)
+{
+if (!bfqq)
+return;
+/*
+ * bfqq still exists, because it can disappear only after
+ * either it is merged with another queue, or the process it
+ * is associated with exits. But both actions must be taken by
+ * the same process currently executing this flow of
+ * instructions.
+ *
+ * In addition, the following queue lock guarantees that
+ * bfqq_group(bfqq) exists as well.
+ */
+spin_lock_irq(q->queue_lock);
+bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
+if (idle_timer_disabled)
+bfqg_stats_update_idle_time(bfqq_group(bfqq));
+spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_insert_stats(struct request_queue *q,
+struct bfq_queue *bfqq,
+bool idle_timer_disabled,
+unsigned int cmd_flags) {}
+#endif
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 bool at_head)
 {
 struct request_queue *q = hctx->queue;
 struct bfq_data *bfqd = q->elevator->elevator_data;
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 struct bfq_queue *bfqq = RQ_BFQQ(rq);
 bool idle_timer_disabled = false;
 unsigned int cmd_flags;
-#endif
 spin_lock_irq(&bfqd->lock);
 if (blk_mq_sched_try_insert_merge(q, rq)) {
@@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 else
 list_add_tail(&rq->queuelist, &bfqd->dispatch);
 } else {
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 /*
 * Update bfqq, because, if a queue merge has occurred
@@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 * redirected into a new queue.
 */
 bfqq = RQ_BFQQ(rq);
-#else
-__bfq_insert_request(bfqd, rq);
-#endif
 if (rq_mergeable(rq)) {
 elv_rqhash_add(q, rq);
@@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 }
 }
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 /*
 * Cache cmd_flags before releasing scheduler lock, because rq
 * may disappear afterwards (for example, because of a request
 * merge).
 */
 cmd_flags = rq->cmd_flags;
-#endif
 spin_unlock_irq(&bfqd->lock);
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-if (!bfqq)
-return;
-/*
-* bfqq still exists, because it can disappear only after
-* either it is merged with another queue, or the process it
-* is associated with exits. But both actions must be taken by
-* the same process currently executing this flow of
-* instruction.
-*
-* In addition, the following queue lock guarantees that
-* bfqq_group(bfqq) exists as well.
-*/
-spin_lock_irq(q->queue_lock);
-bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
-if (idle_timer_disabled)
-bfqg_stats_update_idle_time(bfqq_group(bfqq));
-spin_unlock_irq(q->queue_lock);
-#endif
+bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
+cmd_flags);
 }
 static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 bfq_schedule_dispatch(bfqd);
 }
-static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+static void bfq_finish_request_body(struct bfq_queue *bfqq)
 {
 bfqq->allocated--;
@@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)
 spin_lock_irqsave(&bfqd->lock, flags);
 bfq_completed_request(bfqq, bfqd);
-bfq_put_rq_priv_body(bfqq);
+bfq_finish_request_body(bfqq);
 spin_unlock_irqrestore(&bfqd->lock, flags);
 } else {
@@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)
 bfqg_stats_update_io_remove(bfqq_group(bfqq),
 rq->cmd_flags);
 }
-bfq_put_rq_priv_body(bfqq);
+bfq_finish_request_body(bfqq);
 }
 rq->elv.priv[0] = NULL;
@@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
 hrtimer_cancel(&bfqd->idle_slice_timer);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* release oom-queue reference to root group */
+bfqg_and_blkg_put(bfqd->root_group);
 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
 #else
 spin_lock_irq(&bfqd->lock);
@@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
 static struct elevator_type iosched_bfq_mq = {
 .ops.mq = {
+.limit_depth = bfq_limit_depth,
 .prepare_request = bfq_prepare_request,
 .finish_request = bfq_finish_request,
 .exit_icq = bfq_exit_icq,

View File

@@ -337,6 +337,11 @@ struct bfq_queue {
 * last transition from idle to backlogged.
 */
 unsigned long service_from_backlogged;
+/*
+ * Cumulative service received from the @bfq_queue since its
+ * last transition to weight-raised state.
+ */
+unsigned long service_from_wr;
 /*
 * Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
 unsigned long wr_start_at_switch_to_srt;
 unsigned long split_time; /* time of last split */
+unsigned long first_IO_time; /* time of first I/O for this queue */
 };
 /**
@@ -627,6 +634,18 @@ struct bfq_data {
 struct bfq_io_cq *bio_bic;
 /* bfqq associated with the task issuing current bio for merging */
 struct bfq_queue *bio_bfqq;
+/*
+ * Cached sbitmap shift, used to compute depth limits in
+ * bfq_update_depths.
+ */
+unsigned int sb_shift;
+/*
+ * Depth limits used in bfq_limit_depth (see comments on the
+ * function)
+ */
+unsigned int word_depths[2][2];
 };
 enum bfqq_state_flags {

View File

@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 struct bfq_entity *entity = &bfqq->entity;
 struct bfq_service_tree *st;
+if (!bfqq->service_from_backlogged)
+bfqq->first_IO_time = jiffies;
+if (bfqq->wr_coeff > 1)
+bfqq->service_from_wr += served;
+bfqq->service_from_backlogged += served;
 for_each_entity(entity) {
 st = bfq_entity_service_tree(entity);

View File

@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 /**
 * __bio_integrity_endio - Integrity I/O completion function
 * @bio: Protected bio
-* @error: Pointer to errno
 *
 * Description: Completion for integrity I/O
 *

View File

@@ -970,34 +970,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(bio_advance);
-/**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
-int i;
-struct bio_vec *bv;
-bio_for_each_segment_all(bv, bio, i) {
-bv->bv_page = alloc_page(gfp_mask);
-if (!bv->bv_page) {
-while (--bv >= bio->bi_io_vec)
-__free_page(bv->bv_page);
-return -ENOMEM;
-}
-}
-return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
 /**
 * bio_copy_data - copy contents of data buffers from one chain of bios to
 * another
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
 bio_advance(bio, split->bi_iter.bi_size);
 if (bio_flagged(bio, BIO_TRACE_COMPLETION))
-bio_set_flag(bio, BIO_TRACE_COMPLETION);
+bio_set_flag(split, BIO_TRACE_COMPLETION);
 return split;
 }

View File

@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 rq->start_time = jiffies;
 set_start_time_ns(rq);
 rq->part = NULL;
+seqcount_init(&rq->gstate_seq);
+u64_stats_init(&rq->aborted_gstate_sync);
 }
 EXPORT_SYMBOL(blk_rq_init);
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
 queue_flag_set(QUEUE_FLAG_DEAD, q);
 spin_unlock_irq(lock);
+/*
+ * make sure all in-progress dispatch are completed because
+ * blk_freeze_queue() can only complete all requests, and
+ * dispatch may still be in-progress since we dispatch requests
+ * from more than one contexts
+ */
+if (q->mq_ops)
+blk_mq_quiesce_queue(q);
 /* for synchronous bio-based driver finish in-flight integrity i/o */
 blk_flush_integrity();
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 lockdep_assert_held(q->queue_lock);
+blk_req_zone_write_unlock(req);
 blk_pm_put_request(req);
 elv_completed_request(q, req);
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+{
+if (part->policy && op_is_write(bio_op(bio))) {
+char b[BDEVNAME_SIZE];
+printk(KERN_ERR
+"generic_make_request: Trying to write "
+"to read-only block-device %s (partno %d)\n",
+bio_devname(bio, b), part->partno);
+return true;
+}
+return false;
+}
 /*
 * Remap block n of partition p to block n+start(p) of the disk.
 */
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
 struct hd_struct *p;
 int ret = 0;
+rcu_read_lock();
+p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+bio_check_ro(bio, p))) {
+ret = -EIO;
+goto out;
+}
 /*
 * Zone reset does not include bi_size so bio_sectors() is always 0.
 * Include a test for the reset op code and perform the remap if needed.
 */
-if (!bio->bi_partno ||
-(!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
-return 0;
-rcu_read_lock();
-p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
-bio->bi_iter.bi_sector += p->start_sect;
-bio->bi_partno = 0;
-trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-bio->bi_iter.bi_sector - p->start_sect);
-} else {
-printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
-ret = -EIO;
-}
+if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+goto out;
+bio->bi_iter.bi_sector += p->start_sect;
+bio->bi_partno = 0;
+trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+bio->bi_iter.bi_sector - p->start_sect);
+out:
 rcu_read_unlock();
 return ret;
 }
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 * if queue is not a request based queue.
 */
 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 goto not_supported;
 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
 goto end_io;
-if (blk_partition_remap(bio))
-goto end_io;
+if (!bio->bi_partno) {
+if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+goto end_io;
+} else {
+if (blk_partition_remap(bio))
+goto end_io;
+}
 if (bio_check_eod(bio, nr_sectors))
 goto end_io;
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-blk_mq_request_bypass_insert(rq, true);
-return BLK_STS_OK;
+return blk_mq_request_issue_directly(rq);
 }
 spin_lock_irqsave(q->queue_lock, flags);
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
 wbt_issue(req->q->rq_wb, &req->issue_stat);
 }
-BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+BUG_ON(blk_rq_is_complete(req));
 blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
-int kblockd_schedule_delayed_work(struct delayed_work *dwork,
-unsigned long delay)
-{
-return queue_delayed_work(kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work);
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
-unsigned long delay)
-{
-return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
 /**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug: The &struct blk_plug that needs to be initialized

View File

@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 * be reused after dying flag is set
 */
 if (q->mq_ops) {
-blk_mq_sched_insert_request(rq, at_head, true, false, false);
+blk_mq_sched_insert_request(rq, at_head, true, false);
 return;
 }

View File

@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 if (!q)
 return -ENXIO;
+if (bdev_read_only(bdev))
+return -EPERM;
 if (flags & BLKDEV_DISCARD_SECURE) {
 if (!blk_queue_secure_erase(q))
 return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 if (!q)
 return -ENXIO;
+if (bdev_read_only(bdev))
+return -EPERM;
 bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
 if ((sector | nr_sects) & bs_mask)
 return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 if (!q)
 return -ENXIO;
+if (bdev_read_only(bdev))
+return -EPERM;
 /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
 max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 if (!q)
 return -ENXIO;
+if (bdev_read_only(bdev))
+return -EPERM;
 while (nr_sects != 0) {
 bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
 gfp_mask);

View File

@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
 struct bio *bio = NULL;
 struct iov_iter i;
-int ret;
+int ret = -EINVAL;
 if (!iter_is_iovec(iter))
 goto fail;
@@ -148,7 +148,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 __blk_rq_unmap_user(bio);
 fail:
 rq->bio = NULL;
-return -EINVAL;
+return ret;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);

View File

@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 nsegs++;
 sectors = max_sectors;
 }
-if (sectors)
-goto split;
-/* Make this single bvec as the 1st segment */
+goto split;
 }
 if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 bvprvp = &bvprv;
 sectors += bv.bv_len >> 9;
-if (nsegs == 1 && seg_size > front_seg_size)
-front_seg_size = seg_size;
 continue;
 }
 new_segment:
 if (nsegs == queue_max_segments(q))
 goto split;
+if (nsegs == 1 && seg_size > front_seg_size)
+front_seg_size = seg_size;
 nsegs++;
 bvprv = bv;
 bvprvp = &bvprv;
 seg_size = bv.bv_len;
 sectors += bv.bv_len >> 9;
-if (nsegs == 1 && seg_size > front_seg_size)
-front_seg_size = seg_size;
 }
 do_split = false;
@@ -174,6 +171,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 bio = new;
 }
+if (nsegs == 1 && seg_size > front_seg_size)
+front_seg_size = seg_size;
 bio->bi_seg_front_size = front_seg_size;
 if (seg_size > bio->bi_seg_back_size)
 bio->bi_seg_back_size = seg_size;

View File

@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED), RQF_NAME(HASHED),
RQF_NAME(STATS), RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_TIMEOUT_EXPIRED),
RQF_NAME(MQ_POLL_SLEPT),
}; };
#undef RQF_NAME #undef RQF_NAME
#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
static const char *const rqaf_name[] = {
RQAF_NAME(COMPLETE),
RQAF_NAME(STARTED),
RQAF_NAME(POLL_SLEPT),
};
#undef RQAF_NAME
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{ {
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_puts(m, ", .rq_flags="); seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name)); ARRAY_SIZE(rqf_name));
seq_puts(m, ", .atomic_flags="); seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag); rq->internal_tag);
if (mq_ops->show_rq) if (mq_ops->show_rq)
@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
const struct show_busy_params *params = data; const struct show_busy_params *params = data;
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) blk_mq_rq_state(rq) != MQ_RQ_IDLE)
__blk_mq_debugfs_rq_show(params->m, __blk_mq_debugfs_rq_show(params->m,
list_entry_rq(&rq->queuelist)); list_entry_rq(&rq->queuelist));
} }
@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
const struct blk_mq_debugfs_attr *attr = m->private; const struct blk_mq_debugfs_attr *attr = m->private;
void *data = d_inode(file->f_path.dentry->d_parent)->i_private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
if (!attr->write) /*
* Attributes that only implement .seq_ops are read-only and 'attr' is
* the same as 'data' in this case.
*/
if (attr == data || !attr->write)
return -EPERM; return -EPERM;
return attr->write(data, buf, count, ppos); return attr->write(data, buf, count, ppos);


@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
WRITE_ONCE(hctx->dispatch_from, ctx); WRITE_ONCE(hctx->dispatch_from, ctx);
} }
/* return true if hw queue need to be run again */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{ {
struct request_queue *q = hctx->queue; struct request_queue *q = hctx->queue;
@ -428,7 +427,7 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
} }
void blk_mq_sched_insert_request(struct request *rq, bool at_head, void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block) bool run_queue, bool async)
{ {
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;


@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head, void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block); bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct request_queue *q, void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx, struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async); struct list_head *list, bool run_queue_async);


@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret; return ret;
} }
static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
int i; int i;
@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
q->mq_sysfs_init_done = false; q->mq_sysfs_init_done = false;
} }
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
mutex_lock(&q->sysfs_lock);
__blk_mq_unregister_dev(dev, q);
mutex_unlock(&q->sysfs_lock);
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{ {
kobject_init(&hctx->kobj, &blk_mq_hw_ktype); kobject_init(&hctx->kobj, &blk_mq_hw_ktype);


@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx); ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL; drop_ctx = data->ctx == NULL;
do { do {
prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
/* /*
* We're out of tags on this hardware queue, kick any * We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for * pending IO submits before going to sleep waiting for
@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1) if (tag != -1)
break; break;
prepare_to_wait_exclusive(&ws->wait, &wait,
TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
if (data->ctx) if (data->ctx)
blk_mq_put_ctx(data->ctx); blk_mq_put_ctx(data->ctx);

File diff suppressed because it is too large.


@ -27,6 +27,20 @@ struct blk_mq_ctx {
struct kobject kobj; struct kobject kobj;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/*
* Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
* and the upper bits the generation number.
*/
enum mq_rq_state {
MQ_RQ_IDLE = 0,
MQ_RQ_IN_FLIGHT = 1,
MQ_RQ_COMPLETE = 2,
MQ_RQ_STATE_BITS = 2,
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
};
void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list); struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq);
/* /*
* CPU -> queue mappings * CPU -> queue mappings
*/ */
@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q); void blk_mq_release(struct request_queue *q);
/**
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
static inline int blk_mq_rq_state(struct request *rq)
{
return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
}
/**
* blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
* @rq: target request.
* @state: new state to set.
*
* Set @rq's state to @state. The caller is responsible for ensuring that
* there are no other updaters. A request can transition into IN_FLIGHT
* only from IDLE and doing so increments the generation number.
*/
static inline void blk_mq_rq_update_state(struct request *rq,
enum mq_rq_state state)
{
u64 old_val = READ_ONCE(rq->gstate);
u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
if (state == MQ_RQ_IN_FLIGHT) {
WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
new_val += MQ_RQ_GEN_INC;
}
/* avoid exposing interim values */
WRITE_ONCE(rq->gstate, new_val);
}
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu) unsigned int cpu)
{ {

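As a rough illustration of the gstate packing above (the MQ_RQ_* state in the two low bits, a generation counter in the bits above it, bumped on every IDLE to IN_FLIGHT transition), here is a minimal, self-contained userspace sketch. The struct and helper names (fake_rq, rq_update_state and so on) are made up for the demo; only the bit layout mirrors the patch.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical userspace mirror of the gstate packing; not kernel code. */
    enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2 };

    #define MQ_RQ_STATE_BITS 2
    #define MQ_RQ_STATE_MASK ((1ULL << MQ_RQ_STATE_BITS) - 1)
    #define MQ_RQ_GEN_INC    (1ULL << MQ_RQ_STATE_BITS)

    struct fake_rq { uint64_t gstate; };

    static int rq_state(const struct fake_rq *rq)
    {
        return rq->gstate & MQ_RQ_STATE_MASK;
    }

    static uint64_t rq_generation(const struct fake_rq *rq)
    {
        return rq->gstate >> MQ_RQ_STATE_BITS;
    }

    static void rq_update_state(struct fake_rq *rq, enum mq_rq_state state)
    {
        uint64_t new_val = (rq->gstate & ~MQ_RQ_STATE_MASK) | state;

        /* entering IN_FLIGHT starts a new "life" of the request */
        if (state == MQ_RQ_IN_FLIGHT)
            new_val += MQ_RQ_GEN_INC;
        rq->gstate = new_val;
    }

    int main(void)
    {
        struct fake_rq rq = { 0 };

        rq_update_state(&rq, MQ_RQ_IN_FLIGHT);
        rq_update_state(&rq, MQ_RQ_COMPLETE);
        rq_update_state(&rq, MQ_RQ_IDLE);
        rq_update_state(&rq, MQ_RQ_IN_FLIGHT);
        /* prints state=1 generation=2: same request, second life */
        printf("state=%d generation=%llu\n", rq_state(&rq),
               (unsigned long long)rq_generation(&rq));
        return 0;
    }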

@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue, .release = blk_release_queue,
}; };
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
*/
int blk_register_queue(struct gendisk *disk) int blk_register_queue(struct gendisk *disk)
{ {
int ret; int ret;
@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
if (q->request_fn || (q->mq_ops && q->elevator)) { if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q); ret = elv_register_queue(q);
if (ret) { if (ret) {
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj); kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev); blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj); kobject_put(&dev->kobj);
goto unlock; return ret;
} }
} }
ret = 0; ret = 0;
@ -921,7 +926,15 @@ int blk_register_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(blk_register_queue);
/**
* blk_unregister_queue - counterpart of blk_register_queue()
* @disk: Disk of which the request queue should be unregistered from sysfs.
*
* Note: the caller is responsible for guaranteeing that this function is called
* after blk_register_queue() has finished.
*/
void blk_unregister_queue(struct gendisk *disk) void blk_unregister_queue(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue; struct request_queue *q = disk->queue;
@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q)) if (WARN_ON(!q))
return; return;
/* Return early if disk->queue was never registered. */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
return;
/*
* Since sysfs_remove_dir() prevents adding new directory entries
* before removal of existing entries starts, protect against
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
wbt_exit(q);
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
spin_unlock_irq(q->queue_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (q->mq_ops) if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q); blk_mq_unregister_dev(disk_to_dev(disk), q);
mutex_unlock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj); kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk)); blk_trace_remove_sysfs(disk_to_dev(disk));
wbt_exit(q);
mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
kobject_put(&disk_to_dev(disk)->kobj); kobject_put(&disk_to_dev(disk)->kobj);
} }


@ -216,9 +216,9 @@ struct throtl_data
unsigned int scale; unsigned int scale;
struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets; struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time; unsigned long last_calculate_time;
unsigned long filtered_latency; unsigned long filtered_latency;
@ -1510,11 +1510,21 @@ static struct cftype throtl_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_throtl, .private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes, .seq_show = blkg_print_stat_bytes,
}, },
{
.name = "throttle.io_service_bytes_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes_recursive,
},
{ {
.name = "throttle.io_serviced", .name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl, .private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios, .seq_show = blkg_print_stat_ios,
}, },
{
.name = "throttle.io_serviced_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios_recursive,
},
{ } /* terminate */ { } /* terminate */
}; };
@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td) static void throtl_update_latency_buckets(struct throtl_data *td)
{ {
struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
int i, cpu; int i, cpu, rw;
unsigned long last_latency = 0; unsigned long last_latency[2] = { 0 };
unsigned long latency; unsigned long latency[2];
if (!blk_queue_nonrot(td->queue)) if (!blk_queue_nonrot(td->queue))
return; return;
@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 	td->last_calculate_time = jiffies;
 	memset(avg_latency, 0, sizeof(avg_latency));
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		struct latency_bucket *tmp = &td->tmp_buckets[i];
-		for_each_possible_cpu(cpu) {
-			struct latency_bucket *bucket;
-			/* this isn't race free, but ok in practice */
-			bucket = per_cpu_ptr(td->latency_buckets, cpu);
-			tmp->total_latency += bucket[i].total_latency;
-			tmp->samples += bucket[i].samples;
-			bucket[i].total_latency = 0;
-			bucket[i].samples = 0;
-		}
-		if (tmp->samples >= 32) {
-			int samples = tmp->samples;
-			latency = tmp->total_latency;
-			tmp->total_latency = 0;
-			tmp->samples = 0;
-			latency /= samples;
-			if (latency == 0)
-				continue;
-			avg_latency[i].latency = latency;
-		}
-	}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+			for_each_possible_cpu(cpu) {
+				struct latency_bucket *bucket;
+				/* this isn't race free, but ok in practice */
+				bucket = per_cpu_ptr(td->latency_buckets[rw],
+					cpu);
+				tmp->total_latency += bucket[i].total_latency;
+				tmp->samples += bucket[i].samples;
+				bucket[i].total_latency = 0;
+				bucket[i].samples = 0;
+			}
+			if (tmp->samples >= 32) {
+				int samples = tmp->samples;
+				latency[rw] = tmp->total_latency;
+				tmp->total_latency = 0;
+				tmp->samples = 0;
+				latency[rw] /= samples;
+				if (latency[rw] == 0)
+					continue;
+				avg_latency[rw][i].latency = latency[rw];
+			}
+		}
+	}
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		if (!avg_latency[i].latency) {
-			if (td->avg_buckets[i].latency < last_latency)
-				td->avg_buckets[i].latency = last_latency;
-			continue;
-		}
-		if (!td->avg_buckets[i].valid)
-			latency = avg_latency[i].latency;
-		else
-			latency = (td->avg_buckets[i].latency * 7 +
-				avg_latency[i].latency) >> 3;
-		td->avg_buckets[i].latency = max(latency, last_latency);
-		td->avg_buckets[i].valid = true;
-		last_latency = td->avg_buckets[i].latency;
-	}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			if (!avg_latency[rw][i].latency) {
+				if (td->avg_buckets[rw][i].latency < last_latency[rw])
+					td->avg_buckets[rw][i].latency =
+						last_latency[rw];
+				continue;
+			}
+			if (!td->avg_buckets[rw][i].valid)
+				latency[rw] = avg_latency[rw][i].latency;
+			else
+				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+					avg_latency[rw][i].latency) >> 3;
+			td->avg_buckets[rw][i].latency = max(latency[rw],
+				last_latency[rw]);
+			td->avg_buckets[rw][i].valid = true;
+			last_latency[rw] = td->avg_buckets[rw][i].latency;
+		}
+	}
 	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
 		throtl_log(&td->service_queue,
-			"Latency bucket %d: latency=%ld, valid=%d", i,
-			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+			"Latency bucket %d: read latency=%ld, read valid=%d, "
+			"write latency=%ld, write valid=%d", i,
+			td->avg_buckets[READ][i].latency,
+			td->avg_buckets[READ][i].valid,
+			td->avg_buckets[WRITE][i].latency,
+			td->avg_buckets[WRITE][i].valid);
} }
#else #else
static inline void throtl_update_latency_buckets(struct throtl_data *td) static inline void throtl_update_latency_buckets(struct throtl_data *td)
@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
struct latency_bucket *latency; struct latency_bucket *latency;
int index; int index;
if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || if (!td || td->limit_index != LIMIT_LOW ||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue)) !blk_queue_nonrot(td->queue))
return; return;
index = request_bucket_index(size); index = request_bucket_index(size);
latency = get_cpu_ptr(td->latency_buckets); latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time; latency[index].total_latency += time;
latency[index].samples++; latency[index].samples++;
put_cpu_ptr(td->latency_buckets); put_cpu_ptr(td->latency_buckets[op]);
} }
void blk_throtl_stat_add(struct request *rq, u64 time_ns) void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long finish_time; unsigned long finish_time;
unsigned long start_time; unsigned long start_time;
unsigned long lat; unsigned long lat;
int rw = bio_data_dir(bio);
tg = bio->bi_cg_private; tg = bio->bi_cg_private;
if (!tg) if (!tg)
@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
bucket = request_bucket_index( bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat)); blk_stat_size(&bio->bi_issue_stat));
threshold = tg->td->avg_buckets[bucket].latency + threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target; tg->latency_target;
if (lat > threshold) if (lat > threshold)
tg->bad_bio_cnt++; tg->bad_bio_cnt++;
@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td) if (!td)
return -ENOMEM; return -ENOMEM;
td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64)); LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets) { if (!td->latency_buckets[READ]) {
kfree(td);
return -ENOMEM;
}
td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets[WRITE]) {
free_percpu(td->latency_buckets[READ]);
kfree(td); kfree(td);
return -ENOMEM; return -ENOMEM;
} }
@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
/* activate policy */ /* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl); ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) { if (ret) {
free_percpu(td->latency_buckets); free_percpu(td->latency_buckets[READ]);
free_percpu(td->latency_buckets[WRITE]);
kfree(td); kfree(td);
} }
return ret; return ret;
@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td); BUG_ON(!q->td);
throtl_shutdown_wq(q); throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl); blkcg_deactivate_policy(q, &blkcg_policy_throtl);
free_percpu(q->td->latency_buckets); free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td); kfree(q->td);
} }
@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
} else { } else {
td->throtl_slice = DFL_THROTL_SLICE_HD; td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD; td->filtered_latency = LATENCY_FILTERED_HD;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
}
} }
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */ /* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD; td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif #endif
td->track_bio_latency = !q->mq_ops && !q->request_fn; td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency) if (!td->track_bio_latency)
blk_stat_enable_accounting(q); blk_stat_enable_accounting(q);
} }

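The per-direction average above is a simple 7/8 exponential moving average: a new bucket sample is folded in as (old * 7 + sample) >> 3, and the first sample seeds the average via the valid flag. Below is a small standalone sketch of just that update rule; the struct and function names are invented for the demo and the sample values are arbitrary.

    #include <stdio.h>

    enum { R = 0, W = 1 };

    struct avg_bucket {
        unsigned long latency;
        int valid;
    };

    /* minimal sketch of the bucket smoothing, kept per direction */
    static void update_bucket(struct avg_bucket *b, unsigned long sample)
    {
        if (!b->valid)
            b->latency = sample;            /* first sample seeds the average */
        else
            b->latency = (b->latency * 7 + sample) >> 3;
        b->valid = 1;
    }

    int main(void)
    {
        struct avg_bucket bucket[2] = { { 0, 0 }, { 0, 0 } };
        unsigned long read_samples[] = { 800, 1200, 1000 };
        unsigned long write_samples[] = { 4000, 6000, 5000 };

        for (int i = 0; i < 3; i++) {
            update_bucket(&bucket[R], read_samples[i]);
            update_bucket(&bucket[W], write_samples[i]);
        }
        printf("read avg=%lu write avg=%lu\n",
               bucket[R].latency, bucket[W].latency);
        return 0;
    }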

@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set) unsigned int *next_set)
{ {
if (time_after_eq(jiffies, rq->deadline)) { const unsigned long deadline = blk_rq_deadline(rq);
if (time_after_eq(jiffies, deadline)) {
list_del_init(&rq->timeout_list); list_del_init(&rq->timeout_list);
/* /*
@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
*/ */
if (!blk_mark_rq_complete(rq)) if (!blk_mark_rq_complete(rq))
blk_rq_timed_out(rq); blk_rq_timed_out(rq);
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) { } else if (!*next_set || time_after(*next_timeout, deadline)) {
*next_timeout = rq->deadline; *next_timeout = deadline;
*next_set = 1; *next_set = 1;
} }
} }
@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
*/ */
void blk_abort_request(struct request *req) void blk_abort_request(struct request *req)
{ {
-	if (blk_mark_rq_complete(req))
-		return;
 	if (req->q->mq_ops) {
-		blk_mq_rq_timed_out(req, false);
+		/*
+		 * All we need to ensure is that timeout scan takes place
+		 * immediately and that scan sees the new timeout value.
+		 * No need for fancy synchronizations.
+		 */
+		blk_rq_set_deadline(req, jiffies);
+		mod_timer(&req->q->timeout, 0);
 	} else {
+		if (blk_mark_rq_complete(req))
+			return;
blk_delete_timer(req); blk_delete_timer(req);
blk_rq_timed_out(req); blk_rq_timed_out(req);
} }
@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
if (!req->timeout) if (!req->timeout)
req->timeout = q->rq_timeout; req->timeout = q->rq_timeout;
WRITE_ONCE(req->deadline, jiffies + req->timeout); blk_rq_set_deadline(req, jiffies + req->timeout);
req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
/* /*
* Only the non-mq case needs to add the request to a protected list. * Only the non-mq case needs to add the request to a protected list.
@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest * than an existing one, modify the timer. Round up to next nearest
* second. * second.
*/ */
expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
if (!timer_pending(&q->timeout) || if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) { time_before(expiry, q->timeout.expires)) {

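A toy model of the new blk-mq abort path above: rather than completing the request in place, the deadline is pulled forward to "now" and the timeout timer is kicked, so the normal expiry scan does the rest. Everything below (toy_rq, abort_rq, the HZ value) is invented for illustration; only the wraparound-safe comparison and the "deadline = now" trick mirror the patch.

    #include <stdio.h>
    #include <stdbool.h>

    /* jiffies-style comparison that tolerates counter wraparound */
    static bool time_after_eq(unsigned long a, unsigned long b)
    {
        return (long)(a - b) >= 0;
    }

    struct toy_rq { unsigned long deadline; };

    static void abort_rq(struct toy_rq *rq, unsigned long now)
    {
        rq->deadline = now;    /* next timeout scan sees it as expired */
    }

    int main(void)
    {
        unsigned long now = 1000;
        struct toy_rq rq = { .deadline = now + 30 * 250 };  /* ~30s at HZ=250 */

        printf("expired before abort: %d\n", time_after_eq(now, rq.deadline));
        abort_rq(&rq, now);
        printf("expired after abort: %d\n", time_after_eq(now, rq.deadline));
        return 0;
    }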

@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
return sector & ~zone_mask; return sector & ~zone_mask;
} }
/*
* Return true if a request is a write request that needs zone write locking.
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
if (!rq->q->seq_zones_wlock)
return false;
if (blk_rq_is_passthrough(rq))
return false;
switch (req_op(rq)) {
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_WRITE:
return blk_rq_zone_is_seq(rq);
default:
return false;
}
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
rq->q->seq_zones_wlock)))
return;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
if (rq->q->seq_zones_wlock)
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
/* /*
* Check that a zone report belongs to the partition. * Check that a zone report belongs to the partition.
* If yes, fix its start sector and write pointer, copy it in the * If yes, fix its start sector and write pointer, copy it in the

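The zone write lock above is, at its core, one bit per sequential zone taken with a test-and-set before a write is dispatched and cleared on completion. Here is a self-contained userspace sketch of that idea using C11 atomics; the zone count, the trylock/unlock names and the bitmap layout are invented for the demo (the kernel works on the queue's seq_zones_wlock bitmap instead).

    #include <stdio.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    #define NR_ZONES 128
    #define BITS_PER_WORD 64

    static _Atomic unsigned long long zone_wlock[NR_ZONES / BITS_PER_WORD];

    /* returns true if we took the bit, false if the zone was already locked */
    static bool zone_trylock(unsigned int zno)
    {
        unsigned long long mask = 1ULL << (zno % BITS_PER_WORD);
        unsigned long long old;

        old = atomic_fetch_or(&zone_wlock[zno / BITS_PER_WORD], mask);
        return !(old & mask);
    }

    static void zone_unlock(unsigned int zno)
    {
        unsigned long long mask = 1ULL << (zno % BITS_PER_WORD);

        atomic_fetch_and(&zone_wlock[zno / BITS_PER_WORD], ~mask);
    }

    int main(void)
    {
        printf("first lock of zone 5: %d\n", zone_trylock(5));   /* 1 */
        printf("second lock of zone 5: %d\n", zone_trylock(5));  /* 0 */
        zone_unlock(5);
        printf("after unlock: %d\n", zone_trylock(5));           /* 1 */
        return 0;
    }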

@ -119,34 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req); void blk_account_io_done(struct request *req);
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
/*
* Keep these two bits first - not because we depend on the
* value of them, but we do depend on them being in the same
* byte of storage to ensure ordering on writes. Keeping them
* first will achieve that nicely.
*/
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
REQ_ATOM_POLL_SLEPT,
};
/* /*
* EH timer and IO completion will both attempt to 'grab' the request, make * EH timer and IO completion will both attempt to 'grab' the request, make
* sure that only one of them succeeds * sure that only one of them succeeds. Steal the bottom bit of the
* __deadline field for this.
*/ */
static inline int blk_mark_rq_complete(struct request *rq) static inline int blk_mark_rq_complete(struct request *rq)
{ {
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); return test_and_set_bit(0, &rq->__deadline);
} }
static inline void blk_clear_rq_complete(struct request *rq) static inline void blk_clear_rq_complete(struct request *rq)
{ {
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); clear_bit(0, &rq->__deadline);
}
static inline bool blk_rq_is_complete(struct request *rq)
{
return test_bit(0, &rq->__deadline);
} }
/* /*
@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
e->type->ops.sq.elevator_deactivate_req_fn(q, rq); e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
} }
int elv_register_queue(struct request_queue *q);
void elv_unregister_queue(struct request_queue *q);
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT #ifdef CONFIG_FAIL_IO_TIMEOUT
@ -245,6 +238,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
q->last_merge = NULL; q->last_merge = NULL;
} }
/*
* Steal a bit from this field for legacy IO path atomic IO marking. Note that
* setting the deadline clears the bottom bit, potentially clearing the
* completed bit. The user has to be OK with this (current ones are fine).
*/
static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
{
rq->__deadline = time & ~0x1UL;
}
static inline unsigned long blk_rq_deadline(struct request *rq)
{
return rq->__deadline & ~0x1UL;
}
/* /*
* Internal io_context interface * Internal io_context interface
*/ */

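A userspace sketch of the bit-stealing scheme above: bit 0 of the deadline word doubles as the "complete" marker, so storing a deadline clears it and an atomic fetch-or arbitrates between the timeout path and normal completion. The legacy_rq type and function names are invented; only the masking matches the helpers in the patch.

    #include <stdio.h>
    #include <stdatomic.h>

    struct legacy_rq {
        _Atomic unsigned long __deadline;
    };

    static void rq_set_deadline(struct legacy_rq *rq, unsigned long time)
    {
        /* setting a deadline drops bit 0, i.e. clears the complete marker */
        atomic_store(&rq->__deadline, time & ~0x1UL);
    }

    static unsigned long rq_deadline(struct legacy_rq *rq)
    {
        return atomic_load(&rq->__deadline) & ~0x1UL;
    }

    static int rq_mark_complete(struct legacy_rq *rq)
    {
        /* returns non-zero if someone else already grabbed the request */
        return atomic_fetch_or(&rq->__deadline, 0x1UL) & 0x1UL;
    }

    int main(void)
    {
        struct legacy_rq rq = { 0 };

        rq_set_deadline(&rq, 10001);                 /* odd value: bit 0 dropped */
        printf("deadline=%lu\n", rq_deadline(&rq));  /* 10000 */
        printf("first mark: %d\n", rq_mark_complete(&rq));   /* 0: we won */
        printf("second mark: %d\n", rq_mark_complete(&rq));  /* 1: already done */
        return 0;
    }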

@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
static void copy_to_high_bio_irq(struct bio *to, struct bio *from) static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{ {
unsigned char *vfrom; unsigned char *vfrom;
struct bio_vec tovec, *fromvec = from->bi_io_vec; struct bio_vec tovec, fromvec;
struct bvec_iter iter; struct bvec_iter iter;
/*
* The bio of @from is created by bounce, so we can iterate
* its bvec from start to end, but the @from->bi_iter can't be
* trusted because it might be changed by splitting.
*/
struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
bio_for_each_segment(tovec, to, iter) { bio_for_each_segment(tovec, to, iter) {
-		if (tovec.bv_page != fromvec->bv_page) {
+		fromvec = bio_iter_iovec(from, from_iter);
+		if (tovec.bv_page != fromvec.bv_page) {
/* /*
* fromvec->bv_offset and fromvec->bv_len might have * fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original * been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len * copy, bounce_copy_vec already uses tovec->bv_len
*/ */
vfrom = page_address(fromvec->bv_page) + vfrom = page_address(fromvec.bv_page) +
tovec.bv_offset; tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom); bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page); flush_dcache_page(tovec.bv_page);
} }
bio_advance_iter(from, &from_iter, tovec.bv_len);
fromvec++;
} }
} }
static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io(struct bio *bio, mempool_t *pool)
{ {
struct bio *bio_orig = bio->bi_private; struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec; struct bio_vec *bvec, orig_vec;
int i; int i;
int start = bio_orig->bi_iter.bi_idx; struct bvec_iter orig_iter = bio_orig->bi_iter;
/* /*
* free up bounce indirect pages used * free up bounce indirect pages used
*/ */
bio_for_each_segment_all(bvec, bio, i) { bio_for_each_segment_all(bvec, bio, i) {
org_vec = bio_orig->bi_io_vec + i + start; orig_vec = bio_iter_iovec(bio_orig, orig_iter);
-		if (bvec->bv_page == org_vec->bv_page)
-			continue;
-		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-		mempool_free(bvec->bv_page, pool);
+		if (bvec->bv_page != orig_vec.bv_page) {
+			dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+			mempool_free(bvec->bv_page, pool);
+		}
+		bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
} }
bio_orig->bi_status = bio->bi_status; bio_orig->bi_status = bio->bi_status;


@ -30,7 +30,7 @@
/** /**
* bsg_teardown_job - routine to teardown a bsg job * bsg_teardown_job - routine to teardown a bsg job
* @job: bsg_job that is to be torn down * @kref: kref inside bsg_job that is to be torn down
*/ */
static void bsg_teardown_job(struct kref *kref) static void bsg_teardown_job(struct kref *kref)
{ {
@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
* @name: device to give bsg device * @name: device to give bsg device
* @job_fn: bsg job handler * @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job * @dd_job_size: size of LLD data needed for each job
* @release: @dev release function
*/ */
struct request_queue *bsg_setup_queue(struct device *dev, const char *name, struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bsg_job_fn *job_fn, int dd_job_size, bsg_job_fn *job_fn, int dd_job_size,


@ -32,6 +32,9 @@
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
#define BSG_VERSION "0.4" #define BSG_VERSION "0.4"
#define bsg_dbg(bd, fmt, ...) \
pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
struct bsg_device { struct bsg_device {
struct request_queue *queue; struct request_queue *queue;
spinlock_t lock; spinlock_t lock;
@ -55,14 +58,6 @@ enum {
#define BSG_DEFAULT_CMDS 64 #define BSG_DEFAULT_CMDS 64
#define BSG_MAX_DEVS 32768 #define BSG_MAX_DEVS 32768
#undef BSG_DEBUG
#ifdef BSG_DEBUG
#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
#else
#define dprintk(fmt, args...)
#endif
static DEFINE_MUTEX(bsg_mutex); static DEFINE_MUTEX(bsg_mutex);
static DEFINE_IDR(bsg_minor_idr); static DEFINE_IDR(bsg_minor_idr);
@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
bc->bd = bd; bc->bd = bd;
INIT_LIST_HEAD(&bc->list); INIT_LIST_HEAD(&bc->list);
dprintk("%s: returning free cmd %p\n", bd->name, bc); bsg_dbg(bd, "returning free cmd %p\n", bc);
return bc; return bc;
out: out:
spin_unlock_irq(&bd->lock); spin_unlock_irq(&bd->lock);
@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
if (!bcd->class_dev) if (!bcd->class_dev)
return ERR_PTR(-ENXIO); return ERR_PTR(-ENXIO);
dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
(unsigned long long) hdr->dout_xferp,
hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
hdr->din_xfer_len); hdr->din_xfer_len);
@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
struct bsg_device *bd = bc->bd; struct bsg_device *bd = bc->bd;
unsigned long flags; unsigned long flags;
dprintk("%s: finished rq %p bc %p, bio %p\n", bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
bd->name, rq, bc, bc->bio); rq, bc, bc->bio);
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
list_add_tail(&bc->list, &bd->busy_list); list_add_tail(&bc->list, &bd->busy_list);
spin_unlock_irq(&bd->lock); spin_unlock_irq(&bd->lock);
dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
rq->end_io_data = bc; rq->end_io_data = bc;
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
} }
} while (1); } while (1);
dprintk("%s: returning done %p\n", bd->name, bc); bsg_dbg(bd, "returning done %p\n", bc);
return bc; return bc;
} }
@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq); struct scsi_request *req = scsi_req(rq);
int ret = 0; int ret = 0;
dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
/* /*
* fill in all the output members * fill in all the output members
*/ */
@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
struct bsg_command *bc; struct bsg_command *bc;
int ret, tret; int ret, tret;
dprintk("%s: entered\n", bd->name); bsg_dbg(bd, "entered\n");
/* /*
* wait for all commands to complete * wait for all commands to complete
@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
int ret; int ret;
ssize_t bytes_read; ssize_t bytes_read;
dprintk("%s: read %zd bytes\n", bd->name, count); bsg_dbg(bd, "read %zd bytes\n", count);
bsg_set_block(bd, file); bsg_set_block(bd, file);
@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
ssize_t bytes_written; ssize_t bytes_written;
int ret; int ret;
dprintk("%s: write %zd bytes\n", bd->name, count); bsg_dbg(bd, "write %zd bytes\n", count);
if (unlikely(uaccess_kernel())) if (unlikely(uaccess_kernel()))
return -EINVAL; return -EINVAL;
@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
if (!bytes_written || err_block_err(ret)) if (!bytes_written || err_block_err(ret))
bytes_written = ret; bytes_written = ret;
dprintk("%s: returning %zd\n", bd->name, bytes_written); bsg_dbg(bd, "returning %zd\n", bytes_written);
return bytes_written; return bytes_written;
} }
@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
hlist_del(&bd->dev_list); hlist_del(&bd->dev_list);
mutex_unlock(&bsg_mutex); mutex_unlock(&bsg_mutex);
dprintk("%s: tearing down\n", bd->name); bsg_dbg(bd, "tearing down\n");
/* /*
* close can always block * close can always block
@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
struct file *file) struct file *file)
{ {
struct bsg_device *bd; struct bsg_device *bd;
#ifdef BSG_DEBUG
unsigned char buf[32]; unsigned char buf[32];
#endif
if (!blk_queue_scsi_passthrough(rq)) { if (!blk_queue_scsi_passthrough(rq)) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
dprintk("bound to <%s>, max queue %d\n", bsg_dbg(bd, "bound to <%s>, max queue %d\n",
format_dev_t(buf, inode->i_rdev), bd->max_queue); format_dev_t(buf, inode->i_rdev), bd->max_queue);
mutex_unlock(&bsg_mutex); mutex_unlock(&bsg_mutex);


@ -50,8 +50,6 @@ struct deadline_data {
int front_merges; int front_merges;
}; };
static void deadline_move_request(struct deadline_data *, struct request *);
static inline struct rb_root * static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq) deadline_rb_root(struct deadline_data *dd, struct request *rq)
{ {
@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq); const int data_dir = rq_data_dir(rq);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
blk_req_zone_write_unlock(rq);
deadline_add_rq_rb(dd, rq); deadline_add_rq_rb(dd, rq);
/* /*
@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{ {
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
/*
* For a zoned block device, write requests must write lock their
* target zone.
*/
blk_req_zone_write_lock(rq);
deadline_remove_request(q, rq); deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq); elv_dispatch_add_tail(q, rq);
} }
@ -230,6 +240,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
return 0; return 0;
} }
/*
* For the specified data direction, return the next request to dispatch using
* arrival ordered lists.
*/
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
return rq;
}
return NULL;
}
/*
* For the specified data direction, return the next request to dispatch using
* sector position sorted lists.
*/
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
rq = dd->next_rq[data_dir];
if (!rq)
return NULL;
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
return rq;
rq = deadline_latter_request(rq);
}
return NULL;
}
/* /*
* deadline_dispatch_requests selects the best request according to * deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc * read/write expire, fifo_batch, etc
@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]); const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]); const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq; struct request *rq, *next_rq;
int data_dir; int data_dir;
/* /*
* batches are currently reads XOR writes * batches are currently reads XOR writes
*/ */
if (dd->next_rq[WRITE]) rq = deadline_next_request(dd, WRITE);
rq = dd->next_rq[WRITE]; if (!rq)
else rq = deadline_next_request(dd, READ);
rq = dd->next_rq[READ];
if (rq && dd->batching < dd->fifo_batch) if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */ /* we have a next request are still entitled to batch */
@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
if (reads) { if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved)) if (deadline_fifo_request(dd, WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes; goto dispatch_writes;
data_dir = READ; data_dir = READ;
@ -291,21 +364,29 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
/* /*
* we are not running a batch, find best request for selected data_dir * we are not running a batch, find best request for selected data_dir
*/ */
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { next_rq = deadline_next_request(dd, data_dir);
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/* /*
* A deadline has expired, the last request was in the other * A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests. * direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time. * Start again from the request with the earliest expiry time.
*/ */
rq = rq_entry_fifo(dd->fifo_list[data_dir].next); rq = deadline_fifo_request(dd, data_dir);
} else { } else {
/* /*
* The last req was the same dir and we have a next request in * The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here. * sort order. No expired requests so continue on from here.
*/ */
rq = dd->next_rq[data_dir]; rq = next_rq;
} }
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return 0;
dd->batching = 0; dd->batching = 0;
dispatch_request: dispatch_request:
@ -318,6 +399,16 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
return 1; return 1;
} }
/*
* For zoned block devices, write unlock the target zone of completed
* write requests.
*/
static void
deadline_completed_request(struct request_queue *q, struct request *rq)
{
blk_req_zone_write_unlock(rq);
}
static void deadline_exit_queue(struct elevator_queue *e) static void deadline_exit_queue(struct elevator_queue *e)
{ {
struct deadline_data *dd = e->elevator_data; struct deadline_data *dd = e->elevator_data;
@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
.elevator_merged_fn = deadline_merged_request, .elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests, .elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests, .elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_completed_req_fn = deadline_completed_request,
.elevator_add_req_fn = deadline_add_request, .elevator_add_req_fn = deadline_add_request,
.elevator_former_req_fn = elv_rb_former_request, .elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request, .elevator_latter_req_fn = elv_rb_latter_request,

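A toy model of the dispatch rule added above: when the next candidate is a write, skip requests whose target zone is already write-locked and report nothing dispatchable if every queued write is blocked. Arrays stand in for the FIFO and the zone bitmap, and all names below are invented for the demo.

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_rq {
        int id;
        unsigned int zone;
    };

    #define NR_ZONES 8

    static bool zone_locked[NR_ZONES];

    /* pick the oldest write whose target zone is not write-locked */
    static struct toy_rq *pick_write(struct toy_rq *fifo, int nr)
    {
        for (int i = 0; i < nr; i++) {
            if (!zone_locked[fifo[i].zone])
                return &fifo[i];
        }
        return NULL;    /* all target zones are locked: dispatch nothing */
    }

    int main(void)
    {
        struct toy_rq fifo[] = { { 1, 3 }, { 2, 3 }, { 3, 5 } };

        zone_locked[3] = true;    /* zone 3 already has a write in flight */
        struct toy_rq *rq = pick_write(fifo, 3);

        if (rq)
            printf("dispatch rq %d (zone %u)\n", rq->id, rq->zone);
        return 0;    /* prints: dispatch rq 3 (zone 5) */
    }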

@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;
int error; int error;
lockdep_assert_held(&q->sysfs_lock);
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) { if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs; struct elv_fs_entry *attr = e->type->elevator_attrs;
@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
} }
return error; return error;
} }
EXPORT_SYMBOL(elv_register_queue);
void elv_unregister_queue(struct request_queue *q) void elv_unregister_queue(struct request_queue *q)
{ {
lockdep_assert_held(&q->sysfs_lock);
if (q) { if (q) {
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;
@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
wbt_enable_default(q); wbt_enable_default(q);
} }
} }
EXPORT_SYMBOL(elv_unregister_queue);
int elv_register(struct elevator_type *e) int elv_register(struct elevator_type *e)
{ {
@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
{ {
int ret; int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q); blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) { if (q->elevator) {
if (q->elevator->registered) if (q->elevator->registered)
@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none"); blk_add_trace_msg(q, "elv switch: none");
out: out:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q); blk_mq_unfreeze_queue(q);
return ret; return ret;
} }
@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
bool old_registered = false; bool old_registered = false;
int err; int err;
lockdep_assert_held(&q->sysfs_lock);
if (q->mq_ops) if (q->mq_ops)
return elevator_switch_mq(q, new_e); return elevator_switch_mq(q, new_e);


@ -629,16 +629,18 @@ static void register_disk(struct device *parent, struct gendisk *disk)
} }
/** /**
* device_add_disk - add partitioning information to kernel list * __device_add_disk - add disk information to kernel list
* @parent: parent device for the disk * @parent: parent device for the disk
* @disk: per-device partitioning information * @disk: per-device partitioning information
* @register_queue: register the queue if set to true
* *
* This function registers the partitioning information in @disk * This function registers the partitioning information in @disk
* with the kernel. * with the kernel.
* *
* FIXME: error handling * FIXME: error handling
*/ */
void device_add_disk(struct device *parent, struct gendisk *disk) static void __device_add_disk(struct device *parent, struct gendisk *disk,
bool register_queue)
{ {
dev_t devt; dev_t devt;
int retval; int retval;
@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
exact_match, exact_lock, disk); exact_match, exact_lock, disk);
} }
register_disk(parent, disk); register_disk(parent, disk);
blk_register_queue(disk); if (register_queue)
blk_register_queue(disk);
/* /*
* Take an extra ref on queue which will be put on disk_release() * Take an extra ref on queue which will be put on disk_release()
@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
disk_add_events(disk); disk_add_events(disk);
blk_integrity_add(disk); blk_integrity_add(disk);
} }
void device_add_disk(struct device *parent, struct gendisk *disk)
{
__device_add_disk(parent, disk, true);
}
EXPORT_SYMBOL(device_add_disk); EXPORT_SYMBOL(device_add_disk);
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
__device_add_disk(parent, disk, false);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
void del_gendisk(struct gendisk *disk) void del_gendisk(struct gendisk *disk)
{ {
struct disk_part_iter piter; struct disk_part_iter piter;
@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
* Unregister bdi before releasing device numbers (as they can * Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs). * get reused and we'd get clashes in sysfs).
*/ */
bdi_unregister(disk->queue->backing_dev_info); if (!(disk->flags & GENHD_FL_HIDDEN))
bdi_unregister(disk->queue->backing_dev_info);
blk_unregister_queue(disk); blk_unregister_queue(disk);
} else { } else {
WARN_ON(1); WARN_ON(1);


@ -59,6 +59,7 @@ struct deadline_data {
int front_merges; int front_merges;
spinlock_t lock; spinlock_t lock;
spinlock_t zone_lock;
struct list_head dispatch; struct list_head dispatch;
}; };
@ -191,14 +192,84 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
return 0; return 0;
} }
/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
goto out;
}
rq = NULL;
out:
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
/*
* For the specified data direction, return the next request to
* dispatch using sector position sorted lists.
*/
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
rq = dd->next_rq[data_dir];
if (!rq)
return NULL;
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
rq = deadline_latter_request(rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
/* /*
* deadline_dispatch_requests selects the best request according to * deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc * read/write expire, fifo_batch, etc
*/ */
static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) static struct request *__dd_dispatch_request(struct deadline_data *dd)
{ {
struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq, *next_rq;
struct request *rq;
bool reads, writes; bool reads, writes;
int data_dir; int data_dir;
@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/* /*
* batches are currently reads XOR writes * batches are currently reads XOR writes
*/ */
if (dd->next_rq[WRITE]) rq = deadline_next_request(dd, WRITE);
rq = dd->next_rq[WRITE]; if (!rq)
else rq = deadline_next_request(dd, READ);
rq = dd->next_rq[READ];
if (rq && dd->batching < dd->fifo_batch) if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */ /* we have a next request are still entitled to batch */
@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (reads) { if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved)) if (deadline_fifo_request(dd, WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes; goto dispatch_writes;
data_dir = READ; data_dir = READ;
@ -260,21 +331,29 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/* /*
* we are not running a batch, find best request for selected data_dir * we are not running a batch, find best request for selected data_dir
*/ */
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { next_rq = deadline_next_request(dd, data_dir);
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/* /*
* A deadline has expired, the last request was in the other * A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests. * direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time. * Start again from the request with the earliest expiry time.
*/ */
rq = rq_entry_fifo(dd->fifo_list[data_dir].next); rq = deadline_fifo_request(dd, data_dir);
} else { } else {
/* /*
* The last req was the same dir and we have a next request in * The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here. * sort order. No expired requests so continue on from here.
*/ */
rq = dd->next_rq[data_dir]; rq = next_rq;
} }
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return NULL;
dd->batching = 0; dd->batching = 0;
dispatch_request: dispatch_request:
@ -284,17 +363,27 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
dd->batching++; dd->batching++;
deadline_move_request(dd, rq); deadline_move_request(dd, rq);
done: done:
/*
* If the request needs its target zone locked, do it.
*/
blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED; rq->rq_flags |= RQF_STARTED;
return rq; return rq;
} }
/*
* One confusing aspect here is that we get called for a specific
* hardware queue, but we return a request that may not be for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{ {
struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct deadline_data *dd = hctx->queue->elevator->elevator_data;
struct request *rq; struct request *rq;
spin_lock(&dd->lock); spin_lock(&dd->lock);
rq = __dd_dispatch_request(hctx); rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock); spin_unlock(&dd->lock);
return rq; return rq;
@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1; dd->front_merges = 1;
dd->fifo_batch = fifo_batch; dd->fifo_batch = fifo_batch;
spin_lock_init(&dd->lock); spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch); INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq; q->elevator = eq;
@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq); const int data_dir = rq_data_dir(rq);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq)) if (blk_mq_sched_try_insert_merge(q, rq))
return; return;
@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(&dd->lock); spin_unlock(&dd->lock);
} }
/*
* For zoned block devices, write unlock the target zone of
* completed write requests. Do this while holding the zone lock
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing.
*/
static void dd_completed_request(struct request *rq)
{
struct request_queue *q = rq->q;
if (blk_queue_is_zoned(q)) {
struct deadline_data *dd = q->elevator->elevator_data;
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
static bool dd_has_work(struct blk_mq_hw_ctx *hctx) static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{ {
struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
.ops.mq = { .ops.mq = {
.insert_requests = dd_insert_requests, .insert_requests = dd_insert_requests,
.dispatch_request = dd_dispatch_request, .dispatch_request = dd_dispatch_request,
.completed_request = dd_completed_request,
.next_request = elv_rb_latter_request, .next_request = elv_rb_latter_request,
.former_request = elv_rb_former_request, .former_request = elv_rb_former_request,
.bio_merge = dd_bio_merge, .bio_merge = dd_bio_merge,


@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
continue; continue;
bsd_start = le32_to_cpu(p->p_offset); bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size); bsd_size = le32_to_cpu(p->p_size);
if (memcmp(flavour, "bsd\0", 4) == 0) /* FreeBSD has relative offset if C partition offset is zero */
if (memcmp(flavour, "bsd\0", 4) == 0 &&
le32_to_cpu(l->d_partitions[2].p_offset) == 0)
bsd_start += offset; bsd_start += offset;
if (offset == bsd_start && size == bsd_size) if (offset == bsd_start && size == bsd_size)
/* full parent partition, we have it already */ /* full parent partition, we have it already */

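The partition fix above hinges on how FreeBSD disklabels encode slice offsets: when the whole-slice "c" entry (index 2) starts at 0, the per-slice p_offset values are relative to the enclosing DOS partition and the parent offset must be added; otherwise they are already absolute. A tiny sketch of that rule follows; the flavour check and endianness handling are omitted, and the numbers are made up.

    #include <stdio.h>
    #include <stdint.h>

    /* relative labels ("c" starts at 0) need the parent offset added */
    static uint32_t bsd_start_sector(uint32_t parent_offset,
                                     uint32_t c_partition_offset,
                                     uint32_t p_offset)
    {
        if (c_partition_offset == 0)
            return parent_offset + p_offset;
        return p_offset;    /* absolute labels: use p_offset as-is */
    }

    int main(void)
    {
        printf("relative: %u\n",
               (unsigned)bsd_start_sector(2048, 0, 64));       /* 2112 */
        printf("absolute: %u\n",
               (unsigned)bsd_start_sector(2048, 2048, 4096));  /* 4096 */
        return 0;
    }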

@ -384,9 +384,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
/** /**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
* @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down * @q: request queue to send scsi commands down
* @disk: gendisk to operate on (option) * @disk: gendisk to operate on (option)
* @mode: mode used to open the file through which the ioctl has been
* submitted
* @sic: userspace structure describing the command to perform * @sic: userspace structure describing the command to perform
* *
* Send down the scsi command described by @sic to the device below * Send down the scsi command described by @sic to the device below
@ -415,10 +416,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
* Positive numbers returned are the compacted SCSI error codes (4 * Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status. * bytes in one int) where the lowest byte is the SCSI status.
*/ */
#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
struct scsi_ioctl_command __user *sic) struct scsi_ioctl_command __user *sic)
{ {
enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
struct request *rq; struct request *rq;
struct scsi_request *req; struct scsi_request *req;
int err; int err;
@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
if (bd && bd == bd->bd_contains) if (bd && bd == bd->bd_contains)
return 0; return 0;
/* Actually none of these is particularly useful on a partition,
* but they are safe.
*/
switch (cmd) {
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
case SCSI_IOCTL_GET_PCI:
case SCSI_IOCTL_PROBE_HOST:
case SG_GET_VERSION_NUM:
case SG_SET_TIMEOUT:
case SG_GET_TIMEOUT:
case SG_GET_RESERVED_SIZE:
case SG_SET_RESERVED_SIZE:
case SG_EMULATED_HOST:
return 0;
case CDROM_GET_CAPABILITY:
/* Keep this until we remove the printk below. udev sends it
* and we do not want to spam dmesg about it. CD-ROMs do
* not have partitions, so we get here only for disks.
*/
return -ENOIOCTLCMD;
default:
break;
}
if (capable(CAP_SYS_RAWIO)) if (capable(CAP_SYS_RAWIO))
return 0; return 0;
/* In particular, rule out all resets and host-specific ioctls. */
printk_ratelimited(KERN_WARNING
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
EXPORT_SYMBOL(scsi_verify_blk_ioctl); EXPORT_SYMBOL(scsi_verify_blk_ioctl);


@ -106,6 +106,7 @@ config CRYPTO_KPP
config CRYPTO_ACOMP2 config CRYPTO_ACOMP2
tristate tristate
select CRYPTO_ALGAPI2 select CRYPTO_ALGAPI2
select SGL_ALLOC
config CRYPTO_ACOMP config CRYPTO_ACOMP
tristate tristate


@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
return ret; return ret;
} }
static void crypto_scomp_sg_free(struct scatterlist *sgl)
{
int i, n;
struct page *page;
if (!sgl)
return;
n = sg_nents(sgl);
for_each_sg(sgl, sgl, n, i) {
page = sg_page(sgl);
if (page)
__free_page(page);
}
kfree(sgl);
}
static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp)
{
struct scatterlist *sgl;
struct page *page;
int i, n;
n = ((size - 1) >> PAGE_SHIFT) + 1;
sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp);
if (!sgl)
return NULL;
sg_init_table(sgl, n);
for (i = 0; i < n; i++) {
page = alloc_page(gfp);
if (!page)
goto err;
sg_set_page(sgl + i, page, PAGE_SIZE, 0);
}
return sgl;
err:
sg_mark_end(sgl + i);
crypto_scomp_sg_free(sgl);
return NULL;
}
static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
{ {
struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
scratch_dst, &req->dlen, *ctx); scratch_dst, &req->dlen, *ctx);
if (!ret) { if (!ret) {
if (!req->dst) { if (!req->dst) {
req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC); req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
if (!req->dst) if (!req->dst)
goto out; goto out;
} }
@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm)
crt->compress = scomp_acomp_compress; crt->compress = scomp_acomp_compress;
crt->decompress = scomp_acomp_decompress; crt->decompress = scomp_acomp_decompress;
crt->dst_free = crypto_scomp_sg_free; crt->dst_free = sgl_free;
crt->reqsize = sizeof(void *); crt->reqsize = sizeof(void *);
return 0; return 0;
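The scompress conversion above replaces the driver-local scatterlist allocator with the generic helpers selected through SGL_ALLOC. Below is a minimal usage sketch of those helpers, based only on the calls visible in this diff; the caller is hypothetical and not part of the patch.

#include <linux/scatterlist.h>
#include <linux/gfp.h>

/*
 * Allocate a scatterlist backed by freshly allocated pages large enough
 * to hold @len bytes, hand it to a consumer, then release both the pages
 * and the table. The third argument may be NULL when the caller does not
 * need the entry count, as in the scompress hunk above.
 */
static int example_sgl_roundtrip(size_t len)
{
	unsigned int nents;
	struct scatterlist *sgl;

	sgl = sgl_alloc(len, GFP_KERNEL, &nents);
	if (!sgl)
		return -ENOMEM;

	/* ... pass (sgl, nents) to DMA or crypto code here ... */

	sgl_free(sgl);
	return 0;
}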


@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
#ifdef DAC960_GAM_MINOR #ifdef DAC960_GAM_MINOR
/* static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo)
* DAC960_gam_ioctl is the ioctl function for performing RAID operations.
*/
static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
unsigned long Argument)
{ {
long ErrorCode = 0;
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
mutex_lock(&DAC960_mutex);
switch (Request)
{
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
ErrorCode = DAC960_ControllerCount;
break;
case DAC960_IOCTL_GET_CONTROLLER_INFO:
{
DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
(DAC960_ControllerInfo_T __user *) Argument;
DAC960_ControllerInfo_T ControllerInfo; DAC960_ControllerInfo_T ControllerInfo;
DAC960_Controller_T *Controller; DAC960_Controller_T *Controller;
int ControllerNumber; int ControllerNumber;
long ErrorCode;
if (UserSpaceControllerInfo == NULL) if (UserSpaceControllerInfo == NULL)
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
else ErrorCode = get_user(ControllerNumber, else ErrorCode = get_user(ControllerNumber,
&UserSpaceControllerInfo->ControllerNumber); &UserSpaceControllerInfo->ControllerNumber);
if (ErrorCode != 0) if (ErrorCode != 0)
break; goto out;
ErrorCode = -ENXIO; ErrorCode = -ENXIO;
if (ControllerNumber < 0 || if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1) { ControllerNumber > DAC960_ControllerCount - 1) {
break; goto out;
} }
Controller = DAC960_Controllers[ControllerNumber]; Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL) if (Controller == NULL)
break; goto out;
memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
ControllerInfo.ControllerNumber = ControllerNumber; ControllerInfo.ControllerNumber = ControllerNumber;
ControllerInfo.FirmwareType = Controller->FirmwareType; ControllerInfo.FirmwareType = Controller->FirmwareType;
@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
break; out:
} return ErrorCode;
case DAC960_IOCTL_V1_EXECUTE_COMMAND: }
{
DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand)
(DAC960_V1_UserCommand_T __user *) Argument; {
DAC960_V1_UserCommand_T UserCommand; DAC960_V1_UserCommand_T UserCommand;
DAC960_Controller_T *Controller; DAC960_Controller_T *Controller;
DAC960_Command_T *Command = NULL; DAC960_Command_T *Command = NULL;
@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
int ControllerNumber, DataTransferLength; int ControllerNumber, DataTransferLength;
unsigned char *DataTransferBuffer = NULL; unsigned char *DataTransferBuffer = NULL;
dma_addr_t DataTransferBufferDMA; dma_addr_t DataTransferBufferDMA;
long ErrorCode;
if (UserSpaceUserCommand == NULL) { if (UserSpaceUserCommand == NULL) {
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
break; goto out;
} }
if (copy_from_user(&UserCommand, UserSpaceUserCommand, if (copy_from_user(&UserCommand, UserSpaceUserCommand,
sizeof(DAC960_V1_UserCommand_T))) { sizeof(DAC960_V1_UserCommand_T))) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
ControllerNumber = UserCommand.ControllerNumber; ControllerNumber = UserCommand.ControllerNumber;
ErrorCode = -ENXIO; ErrorCode = -ENXIO;
if (ControllerNumber < 0 || if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1) ControllerNumber > DAC960_ControllerCount - 1)
break; goto out;
Controller = DAC960_Controllers[ControllerNumber]; Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL) if (Controller == NULL)
break; goto out;
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
if (Controller->FirmwareType != DAC960_V1_Controller) if (Controller->FirmwareType != DAC960_V1_Controller)
break; goto out;
CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
DataTransferLength = UserCommand.DataTransferLength; DataTransferLength = UserCommand.DataTransferLength;
if (CommandOpcode & 0x80) if (CommandOpcode & 0x80)
break; goto out;
if (CommandOpcode == DAC960_V1_DCDB) if (CommandOpcode == DAC960_V1_DCDB)
{ {
if (copy_from_user(&DCDB, UserCommand.DCDB, if (copy_from_user(&DCDB, UserCommand.DCDB,
sizeof(DAC960_V1_DCDB_T))) { sizeof(DAC960_V1_DCDB_T))) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
if (DCDB.Channel >= DAC960_V1_MaxChannels) if (DCDB.Channel >= DAC960_V1_MaxChannels)
break; goto out;
if (!((DataTransferLength == 0 && if (!((DataTransferLength == 0 &&
DCDB.Direction DCDB.Direction
== DAC960_V1_DCDB_NoDataTransfer) || == DAC960_V1_DCDB_NoDataTransfer) ||
@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
(DataTransferLength < 0 && (DataTransferLength < 0 &&
DCDB.Direction DCDB.Direction
== DAC960_V1_DCDB_DataTransferSystemToDevice))) == DAC960_V1_DCDB_DataTransferSystemToDevice)))
break; goto out;
if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
!= abs(DataTransferLength)) != abs(DataTransferLength))
break; goto out;
DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
if (DCDB_IOBUF == NULL) { if (DCDB_IOBUF == NULL) {
ErrorCode = -ENOMEM; ErrorCode = -ENOMEM;
break; goto out;
} }
} }
ErrorCode = -ENOMEM; ErrorCode = -ENOMEM;
@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DataTransferLength, DataTransferLength,
&DataTransferBufferDMA); &DataTransferBufferDMA);
if (DataTransferBuffer == NULL) if (DataTransferBuffer == NULL)
break; goto out;
} }
else if (DataTransferLength < 0) else if (DataTransferLength < 0)
{ {
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
-DataTransferLength, &DataTransferBufferDMA); -DataTransferLength, &DataTransferBufferDMA);
if (DataTransferBuffer == NULL) if (DataTransferBuffer == NULL)
break; goto out;
if (copy_from_user(DataTransferBuffer, if (copy_from_user(DataTransferBuffer,
UserCommand.DataTransferBuffer, UserCommand.DataTransferBuffer,
-DataTransferLength)) { -DataTransferLength)) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
} }
if (CommandOpcode == DAC960_V1_DCDB) if (CommandOpcode == DAC960_V1_DCDB)
@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
if (DCDB_IOBUF != NULL) if (DCDB_IOBUF != NULL)
pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
DCDB_IOBUF, DCDB_IOBUFDMA); DCDB_IOBUF, DCDB_IOBUFDMA);
break; out:
} return ErrorCode;
case DAC960_IOCTL_V2_EXECUTE_COMMAND: }
{
DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand)
(DAC960_V2_UserCommand_T __user *) Argument; {
DAC960_V2_UserCommand_T UserCommand; DAC960_V2_UserCommand_T UserCommand;
DAC960_Controller_T *Controller; DAC960_Controller_T *Controller;
DAC960_Command_T *Command = NULL; DAC960_Command_T *Command = NULL;
@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
dma_addr_t DataTransferBufferDMA; dma_addr_t DataTransferBufferDMA;
unsigned char *RequestSenseBuffer = NULL; unsigned char *RequestSenseBuffer = NULL;
dma_addr_t RequestSenseBufferDMA; dma_addr_t RequestSenseBufferDMA;
long ErrorCode = -EINVAL;
ErrorCode = -EINVAL;
if (UserSpaceUserCommand == NULL) if (UserSpaceUserCommand == NULL)
break; goto out;
if (copy_from_user(&UserCommand, UserSpaceUserCommand, if (copy_from_user(&UserCommand, UserSpaceUserCommand,
sizeof(DAC960_V2_UserCommand_T))) { sizeof(DAC960_V2_UserCommand_T))) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
ErrorCode = -ENXIO; ErrorCode = -ENXIO;
ControllerNumber = UserCommand.ControllerNumber; ControllerNumber = UserCommand.ControllerNumber;
if (ControllerNumber < 0 || if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1) ControllerNumber > DAC960_ControllerCount - 1)
break; goto out;
Controller = DAC960_Controllers[ControllerNumber]; Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL) if (Controller == NULL)
break; goto out;
if (Controller->FirmwareType != DAC960_V2_Controller){ if (Controller->FirmwareType != DAC960_V2_Controller){
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
break; goto out;
} }
DataTransferLength = UserCommand.DataTransferLength; DataTransferLength = UserCommand.DataTransferLength;
ErrorCode = -ENOMEM; ErrorCode = -ENOMEM;
@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DataTransferLength, DataTransferLength,
&DataTransferBufferDMA); &DataTransferBufferDMA);
if (DataTransferBuffer == NULL) if (DataTransferBuffer == NULL)
break; goto out;
} }
else if (DataTransferLength < 0) else if (DataTransferLength < 0)
{ {
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
-DataTransferLength, &DataTransferBufferDMA); -DataTransferLength, &DataTransferBufferDMA);
if (DataTransferBuffer == NULL) if (DataTransferBuffer == NULL)
break; goto out;
if (copy_from_user(DataTransferBuffer, if (copy_from_user(DataTransferBuffer,
UserCommand.DataTransferBuffer, UserCommand.DataTransferBuffer,
-DataTransferLength)) { -DataTransferLength)) {
@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
if (RequestSenseBuffer != NULL) if (RequestSenseBuffer != NULL)
pci_free_consistent(Controller->PCIDevice, RequestSenseLength, pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
RequestSenseBuffer, RequestSenseBufferDMA); RequestSenseBuffer, RequestSenseBufferDMA);
break; out:
} return ErrorCode;
case DAC960_IOCTL_V2_GET_HEALTH_STATUS: }
{
DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus)
(DAC960_V2_GetHealthStatus_T __user *) Argument; {
DAC960_V2_GetHealthStatus_T GetHealthStatus; DAC960_V2_GetHealthStatus_T GetHealthStatus;
DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
DAC960_Controller_T *Controller; DAC960_Controller_T *Controller;
int ControllerNumber; int ControllerNumber;
long ErrorCode;
if (UserSpaceGetHealthStatus == NULL) { if (UserSpaceGetHealthStatus == NULL) {
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
break; goto out;
} }
if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
sizeof(DAC960_V2_GetHealthStatus_T))) { sizeof(DAC960_V2_GetHealthStatus_T))) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
ErrorCode = -ENXIO; ErrorCode = -ENXIO;
ControllerNumber = GetHealthStatus.ControllerNumber; ControllerNumber = GetHealthStatus.ControllerNumber;
if (ControllerNumber < 0 || if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1) ControllerNumber > DAC960_ControllerCount - 1)
break; goto out;
Controller = DAC960_Controllers[ControllerNumber]; Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL) if (Controller == NULL)
break; goto out;
if (Controller->FirmwareType != DAC960_V2_Controller) { if (Controller->FirmwareType != DAC960_V2_Controller) {
ErrorCode = -EINVAL; ErrorCode = -EINVAL;
break; goto out;
} }
if (copy_from_user(&HealthStatusBuffer, if (copy_from_user(&HealthStatusBuffer,
GetHealthStatus.HealthStatusBuffer, GetHealthStatus.HealthStatusBuffer,
sizeof(DAC960_V2_HealthStatusBuffer_T))) { sizeof(DAC960_V2_HealthStatusBuffer_T))) {
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
break; goto out;
} }
ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
!(Controller->V2.HealthStatusBuffer->StatusChangeCounter !(Controller->V2.HealthStatusBuffer->StatusChangeCounter
@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DAC960_MonitoringTimerInterval); DAC960_MonitoringTimerInterval);
if (ErrorCode == -ERESTARTSYS) { if (ErrorCode == -ERESTARTSYS) {
ErrorCode = -EINTR; ErrorCode = -EINTR;
break; goto out;
} }
if (copy_to_user(GetHealthStatus.HealthStatusBuffer, if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
Controller->V2.HealthStatusBuffer, Controller->V2.HealthStatusBuffer,
@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
ErrorCode = -EFAULT; ErrorCode = -EFAULT;
else else
ErrorCode = 0; ErrorCode = 0;
}
out:
return ErrorCode;
}
/*
* DAC960_gam_ioctl is the ioctl function for performing RAID operations.
*/
static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
unsigned long Argument)
{
long ErrorCode = 0;
void __user *argp = (void __user *)Argument;
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
mutex_lock(&DAC960_mutex);
switch (Request)
{
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
ErrorCode = DAC960_ControllerCount;
break;
case DAC960_IOCTL_GET_CONTROLLER_INFO:
ErrorCode = DAC960_gam_get_controller_info(argp);
break;
case DAC960_IOCTL_V1_EXECUTE_COMMAND:
ErrorCode = DAC960_gam_v1_execute_command(argp);
break;
case DAC960_IOCTL_V2_EXECUTE_COMMAND:
ErrorCode = DAC960_gam_v2_execute_command(argp);
break;
case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
ErrorCode = DAC960_gam_v2_get_health_status(argp);
break; break;
default: default:
ErrorCode = -ENOTTY; ErrorCode = -ENOTTY;


@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
tristate "Null test block driver" tristate "Null test block driver"
select CONFIGFS_FS select CONFIGFS_FS
config BLK_DEV_NULL_BLK_FAULT_INJECTION
bool "Support fault injection for Null test block driver"
depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
config BLK_DEV_FD config BLK_DEV_FD
tristate "Normal floppy disk support" tristate "Normal floppy disk support"
depends on ARCH_MAY_HAVE_PC_FDC depends on ARCH_MAY_HAVE_PC_FDC


@ -112,8 +112,7 @@ enum frame_flags {
struct frame { struct frame {
struct list_head head; struct list_head head;
u32 tag; u32 tag;
struct timeval sent; /* high-res time packet was sent */ ktime_t sent; /* high-res time packet was sent */
u32 sent_jiffs; /* low-res jiffies-based sent time */
ulong waited; ulong waited;
ulong waited_total; ulong waited_total;
struct aoetgt *t; /* parent target I belong to */ struct aoetgt *t; /* parent target I belong to */


@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
skb = skb_clone(f->skb, GFP_ATOMIC); skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) { if (skb) {
do_gettimeofday(&f->sent); f->sent = ktime_get();
f->sent_jiffs = (u32) jiffies;
__skb_queue_head_init(&queue); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb); __skb_queue_tail(&queue, skb);
aoenet_xmit(&queue); aoenet_xmit(&queue);
@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC); skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL) if (skb == NULL)
return; return;
do_gettimeofday(&f->sent); f->sent = ktime_get();
f->sent_jiffs = (u32) jiffies;
__skb_queue_head_init(&queue); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb); __skb_queue_tail(&queue, skb);
aoenet_xmit(&queue); aoenet_xmit(&queue);
@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
static int static int
tsince_hr(struct frame *f) tsince_hr(struct frame *f)
{ {
struct timeval now; u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
int n;
do_gettimeofday(&now); /* delta is normally under 4.2 seconds, avoid 64-bit division */
n = now.tv_usec - f->sent.tv_usec; if (likely(delta <= UINT_MAX))
n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; return (u32)delta / NSEC_PER_USEC;
if (n < 0) /* avoid overflow after 71 minutes */
n = -n; if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
return INT_MAX;
/* For relatively long periods, use jiffies to avoid return div_u64(delta, NSEC_PER_USEC);
* discrepancies caused by updates to the system time.
*
* On system with HZ of 1000, 32-bits is over 49 days
* worth of jiffies, or over 71 minutes worth of usecs.
*
* Jiffies overflow is handled by subtraction of unsigned ints:
* (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
* $3 = 4
* (gdb)
*/
if (n > USEC_PER_SEC / 4) {
n = ((u32) jiffies) - f->sent_jiffs;
n *= USEC_PER_SEC / HZ;
}
return n;
} }
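The aoe change above replaces the timeval/jiffies pair with a single ktime_t taken at transmit time. As a rough, self-contained illustration of the same pattern (the names below are hypothetical, not from the driver): convert the nanosecond delta to microseconds with a cheap 32-bit division while it fits in a u32 (about 4.29 s), and saturate at INT_MAX microseconds (about 35 minutes) instead of overflowing.

#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/kernel.h>

struct example_frame {
	ktime_t sent;			/* timestamp taken when the packet went out */
};

static void example_mark_sent(struct example_frame *f)
{
	f->sent = ktime_get();
}

/* Microseconds elapsed since example_mark_sent(), capped at INT_MAX. */
static int example_usecs_since(struct example_frame *f)
{
	u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));

	if (delta <= UINT_MAX)		/* under ~4.29 s: 32-bit divide is enough */
		return (u32)delta / NSEC_PER_USEC;
	if (delta > (u64)INT_MAX * NSEC_PER_USEC)
		return INT_MAX;		/* saturate rather than overflow */
	return div_u64(delta, NSEC_PER_USEC);
}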
static int static int
@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
nf->waited = 0; nf->waited = 0;
nf->waited_total = f->waited_total; nf->waited_total = f->waited_total;
nf->sent = f->sent; nf->sent = f->sent;
nf->sent_jiffs = f->sent_jiffs;
f->skb = skb; f->skb = skb;
return nf; return nf;
@ -633,8 +614,7 @@ probe(struct aoetgt *t)
skb = skb_clone(f->skb, GFP_ATOMIC); skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) { if (skb) {
do_gettimeofday(&f->sent); f->sent = ktime_get();
f->sent_jiffs = (u32) jiffies;
__skb_queue_head_init(&queue); __skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb); __skb_queue_tail(&queue, skb);
aoenet_xmit(&queue); aoenet_xmit(&queue);
@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
d->timer.function = rexmit_timer; d->timer.function = rexmit_timer;
skb = skb_clone(skb, GFP_ATOMIC); skb = skb_clone(skb, GFP_ATOMIC);
if (skb) { if (skb)
do_gettimeofday(&f->sent); f->sent = ktime_get();
f->sent_jiffs = (u32) jiffies;
}
return skb; return skb;
} }


@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
struct drbd_bm_aio_ctx *ctx = bio->bi_private; struct drbd_bm_aio_ctx *ctx = bio->bi_private;
struct drbd_device *device = ctx->device; struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap; struct drbd_bitmap *b = device->bitmap;
unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
!bm_test_page_unchanged(b->bm_pages[idx])) !bm_test_page_unchanged(b->bm_pages[idx]))


@ -12,9 +12,9 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
#include <linux/lightnvm.h>
#include <linux/configfs.h> #include <linux/configfs.h>
#include <linux/badblocks.h> #include <linux/badblocks.h>
#include <linux/fault-inject.h>
#define SECTOR_SHIFT 9 #define SECTOR_SHIFT 9
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
@ -27,6 +27,10 @@
#define TICKS_PER_SEC 50ULL #define TICKS_PER_SEC 50ULL
#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static DECLARE_FAULT_ATTR(null_timeout_attr);
#endif
static inline u64 mb_per_tick(int mbps) static inline u64 mb_per_tick(int mbps)
{ {
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
@ -107,7 +111,6 @@ struct nullb_device {
unsigned int hw_queue_depth; /* queue depth */ unsigned int hw_queue_depth; /* queue depth */
unsigned int index; /* index of the disk, only valid with a disk */ unsigned int index; /* index of the disk, only valid with a disk */
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
bool use_lightnvm; /* register as a LightNVM device */
bool blocking; /* blocking blk-mq device */ bool blocking; /* blocking blk-mq device */
bool use_per_node_hctx; /* use per-node allocation for hardware context */ bool use_per_node_hctx; /* use per-node allocation for hardware context */
bool power; /* power on/off the device */ bool power; /* power on/off the device */
@ -121,7 +124,6 @@ struct nullb {
unsigned int index; unsigned int index;
struct request_queue *q; struct request_queue *q;
struct gendisk *disk; struct gendisk *disk;
struct nvm_dev *ndev;
struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set; struct blk_mq_tag_set __tag_set;
unsigned int queue_depth; unsigned int queue_depth;
@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list);
static struct mutex lock; static struct mutex lock;
static int null_major; static int null_major;
static DEFINE_IDA(nullb_indexes); static DEFINE_IDA(nullb_indexes);
static struct kmem_cache *ppa_cache;
static struct blk_mq_tag_set tag_set; static struct blk_mq_tag_set tag_set;
enum { enum {
@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, S_IRUGO); module_param_named(home_node, g_home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device"); MODULE_PARM_DESC(home_node, "Home node for the device");
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static char g_timeout_str[80];
module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
#endif
static int g_queue_mode = NULL_Q_MQ; static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max) static int null_param_store_val(const char *str, int *val, int min, int max)
@ -208,10 +214,6 @@ static int nr_devices = 1;
module_param(nr_devices, int, S_IRUGO); module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register"); MODULE_PARM_DESC(nr_devices, "Number of devices to register");
static bool g_use_lightnvm;
module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
static bool g_blocking; static bool g_blocking;
module_param_named(blocking, g_blocking, bool, S_IRUGO); module_param_named(blocking, g_blocking, bool, S_IRUGO);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint);
NULLB_DEVICE_ATTR(irqmode, uint); NULLB_DEVICE_ATTR(irqmode, uint);
NULLB_DEVICE_ATTR(hw_queue_depth, uint); NULLB_DEVICE_ATTR(hw_queue_depth, uint);
NULLB_DEVICE_ATTR(index, uint); NULLB_DEVICE_ATTR(index, uint);
NULLB_DEVICE_ATTR(use_lightnvm, bool);
NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(blocking, bool);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
NULLB_DEVICE_ATTR(memory_backed, bool); NULLB_DEVICE_ATTR(memory_backed, bool);
@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_irqmode, &nullb_device_attr_irqmode,
&nullb_device_attr_hw_queue_depth, &nullb_device_attr_hw_queue_depth,
&nullb_device_attr_index, &nullb_device_attr_index,
&nullb_device_attr_use_lightnvm,
&nullb_device_attr_blocking, &nullb_device_attr_blocking,
&nullb_device_attr_use_per_node_hctx, &nullb_device_attr_use_per_node_hctx,
&nullb_device_attr_power, &nullb_device_attr_power,
@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void)
dev->blocksize = g_bs; dev->blocksize = g_bs;
dev->irqmode = g_irqmode; dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth; dev->hw_queue_depth = g_hw_queue_depth;
dev->use_lightnvm = g_use_lightnvm;
dev->blocking = g_blocking; dev->blocking = g_blocking;
dev->use_per_node_hctx = g_use_per_node_hctx; dev->use_per_node_hctx = g_use_per_node_hctx;
return dev; return dev;
@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE; return BLK_QC_T_NONE;
} }
static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
{
pr_info("null: rq %p timed out\n", rq);
return BLK_EH_HANDLED;
}
static int null_rq_prep_fn(struct request_queue *q, struct request *req) static int null_rq_prep_fn(struct request_queue *q, struct request *req)
{ {
struct nullb *nullb = q->queuedata; struct nullb *nullb = q->queuedata;
@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
return BLKPREP_DEFER; return BLKPREP_DEFER;
} }
static bool should_timeout_request(struct request *rq)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
if (g_timeout_str[0])
return should_fail(&null_timeout_attr, 1);
#endif
return false;
}
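The gate above is the standard fault-injection pattern. Here is a small self-contained sketch of the same plumbing, with hypothetical names and assuming CONFIG_FAULT_INJECTION: declare a fault_attr, configure it from the usual "<interval>,<probability>,<space>,<times>" string (which is what the new timeout= module parameter carries here), and let should_fail() decide per request.

#include <linux/fault-inject.h>

static DECLARE_FAULT_ATTR(example_fail_attr);

/*
 * Parse an "<interval>,<probability>,<space>,<times>" specification,
 * e.g. "1,50,0,-1" to fail roughly half of all attempts with no limit
 * on the number of failures. setup_fault_attr() returns 1 on success
 * and 0 on a parse error.
 */
static int example_setup(char *spec)
{
	return setup_fault_attr(&example_fail_attr, spec) ? 0 : -EINVAL;
}

/* Call on each operation; true means "inject a failure here". */
static bool example_should_fail(void)
{
	return should_fail(&example_fail_attr, 1);
}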
static void null_request_fn(struct request_queue *q) static void null_request_fn(struct request_queue *q)
{ {
struct request *rq; struct request *rq;
@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q)
while ((rq = blk_fetch_request(q)) != NULL) { while ((rq = blk_fetch_request(q)) != NULL) {
struct nullb_cmd *cmd = rq->special; struct nullb_cmd *cmd = rq->special;
spin_unlock_irq(q->queue_lock); if (!should_timeout_request(rq)) {
null_handle_cmd(cmd); spin_unlock_irq(q->queue_lock);
spin_lock_irq(q->queue_lock); null_handle_cmd(cmd);
spin_lock_irq(q->queue_lock);
}
} }
} }
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
{
pr_info("null: rq %p timed out\n", rq);
return BLK_EH_HANDLED;
}
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd) const struct blk_mq_queue_data *bd)
{ {
@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(bd->rq); blk_mq_start_request(bd->rq);
return null_handle_cmd(cmd); if (!should_timeout_request(bd->rq))
return null_handle_cmd(cmd);
return BLK_STS_OK;
} }
static const struct blk_mq_ops null_mq_ops = { static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq, .queue_rq = null_queue_rq,
.complete = null_softirq_done_fn, .complete = null_softirq_done_fn,
.timeout = null_timeout_rq,
}; };
static void cleanup_queue(struct nullb_queue *nq) static void cleanup_queue(struct nullb_queue *nq)
@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb)
kfree(nullb->queues); kfree(nullb->queues);
} }
#ifdef CONFIG_NVM
static void null_lnvm_end_io(struct request *rq, blk_status_t status)
{
struct nvm_rq *rqd = rq->end_io_data;
/* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
rqd->error = status ? -EIO : 0;
nvm_end_io(rqd);
blk_put_request(rq);
}
static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
{
struct request_queue *q = dev->q;
struct request *rq;
struct bio *bio = rqd->bio;
rq = blk_mq_alloc_request(q,
op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(rq))
return -ENOMEM;
blk_init_request_from_bio(rq, bio);
rq->end_io_data = rqd;
blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
return 0;
}
static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
{
struct nullb *nullb = dev->q->queuedata;
sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
sector_t blksize;
struct nvm_id_group *grp;
id->ver_id = 0x1;
id->vmnt = 0;
id->cap = 0x2;
id->dom = 0x1;
id->ppaf.blk_offset = 0;
id->ppaf.blk_len = 16;
id->ppaf.pg_offset = 16;
id->ppaf.pg_len = 16;
id->ppaf.sect_offset = 32;
id->ppaf.sect_len = 8;
id->ppaf.pln_offset = 40;
id->ppaf.pln_len = 8;
id->ppaf.lun_offset = 48;
id->ppaf.lun_len = 8;
id->ppaf.ch_offset = 56;
id->ppaf.ch_len = 8;
sector_div(size, nullb->dev->blocksize); /* convert size to pages */
size >>= 8; /* concert size to pgs pr blk */
grp = &id->grp;
grp->mtype = 0;
grp->fmtype = 0;
grp->num_ch = 1;
grp->num_pg = 256;
blksize = size;
size >>= 16;
grp->num_lun = size + 1;
sector_div(blksize, grp->num_lun);
grp->num_blk = blksize;
grp->num_pln = 1;
grp->fpg_sz = nullb->dev->blocksize;
grp->csecs = nullb->dev->blocksize;
grp->trdt = 25000;
grp->trdm = 25000;
grp->tprt = 500000;
grp->tprm = 500000;
grp->tbet = 1500000;
grp->tbem = 1500000;
grp->mpos = 0x010101; /* single plane rwe */
grp->cpar = nullb->dev->hw_queue_depth;
return 0;
}
static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
{
mempool_t *virtmem_pool;
virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
if (!virtmem_pool) {
pr_err("null_blk: Unable to create virtual memory pool\n");
return NULL;
}
return virtmem_pool;
}
static void null_lnvm_destroy_dma_pool(void *pool)
{
mempool_destroy(pool);
}
static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
gfp_t mem_flags, dma_addr_t *dma_handler)
{
return mempool_alloc(pool, mem_flags);
}
static void null_lnvm_dev_dma_free(void *pool, void *entry,
dma_addr_t dma_handler)
{
mempool_free(entry, pool);
}
static struct nvm_dev_ops null_lnvm_dev_ops = {
.identity = null_lnvm_id,
.submit_io = null_lnvm_submit_io,
.create_dma_pool = null_lnvm_create_dma_pool,
.destroy_dma_pool = null_lnvm_destroy_dma_pool,
.dev_dma_alloc = null_lnvm_dev_dma_alloc,
.dev_dma_free = null_lnvm_dev_dma_free,
/* Simulate nvme protocol restriction */
.max_phys_sect = 64,
};
static int null_nvm_register(struct nullb *nullb)
{
struct nvm_dev *dev;
int rv;
dev = nvm_alloc_dev(0);
if (!dev)
return -ENOMEM;
dev->q = nullb->q;
memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
dev->ops = &null_lnvm_dev_ops;
rv = nvm_register(dev);
if (rv) {
kfree(dev);
return rv;
}
nullb->ndev = dev;
return 0;
}
static void null_nvm_unregister(struct nullb *nullb)
{
nvm_unregister(nullb->ndev);
}
#else
static int null_nvm_register(struct nullb *nullb)
{
pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
return -EINVAL;
}
static void null_nvm_unregister(struct nullb *nullb) {}
#endif /* CONFIG_NVM */
static void null_del_dev(struct nullb *nullb) static void null_del_dev(struct nullb *nullb)
{ {
struct nullb_device *dev = nullb->dev; struct nullb_device *dev = nullb->dev;
@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb)
list_del_init(&nullb->list); list_del_init(&nullb->list);
if (dev->use_lightnvm) del_gendisk(nullb->disk);
null_nvm_unregister(nullb);
else
del_gendisk(nullb->disk);
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer); hrtimer_cancel(&nullb->bw_timer);
@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb)
if (dev->queue_mode == NULL_Q_MQ && if (dev->queue_mode == NULL_Q_MQ &&
nullb->tag_set == &nullb->__tag_set) nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set); blk_mq_free_tag_set(nullb->tag_set);
if (!dev->use_lightnvm) put_disk(nullb->disk);
put_disk(nullb->disk);
cleanup_queues(nullb); cleanup_queues(nullb);
if (null_cache_active(nullb)) if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true); null_free_device_storage(nullb->dev, true);
@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev)
{ {
dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
if (dev->use_lightnvm && dev->blocksize != 4096)
dev->blocksize = 4096;
if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
dev->queue_mode = NULL_Q_MQ;
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes) if (dev->submit_queues != nr_online_nodes)
@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev)
dev->mbps = 0; dev->mbps = 0;
} }
static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
if (!g_timeout_str[0])
return true;
if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
return false;
null_timeout_attr.verbose = 0;
#endif
return true;
}
static int null_add_dev(struct nullb_device *dev) static int null_add_dev(struct nullb_device *dev)
{ {
struct nullb *nullb; struct nullb *nullb;
@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev)
if (rv) if (rv)
goto out_cleanup_queues; goto out_cleanup_queues;
if (!null_setup_fault())
goto out_cleanup_queues;
nullb->tag_set->timeout = 5 * HZ;
nullb->q = blk_mq_init_queue(nullb->tag_set); nullb->q = blk_mq_init_queue(nullb->tag_set);
if (IS_ERR(nullb->q)) { if (IS_ERR(nullb->q)) {
rv = -ENOMEM; rv = -ENOMEM;
@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev)
rv = -ENOMEM; rv = -ENOMEM;
goto out_cleanup_queues; goto out_cleanup_queues;
} }
if (!null_setup_fault())
goto out_cleanup_blk_queue;
blk_queue_prep_rq(nullb->q, null_rq_prep_fn); blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
blk_queue_softirq_done(nullb->q, null_softirq_done_fn); blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
nullb->q->rq_timeout = 5 * HZ;
rv = init_driver_queues(nullb); rv = init_driver_queues(nullb);
if (rv) if (rv)
goto out_cleanup_blk_queue; goto out_cleanup_blk_queue;
@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev)
sprintf(nullb->disk_name, "nullb%d", nullb->index); sprintf(nullb->disk_name, "nullb%d", nullb->index);
if (dev->use_lightnvm) rv = null_gendisk_register(nullb);
rv = null_nvm_register(nullb);
else
rv = null_gendisk_register(nullb);
if (rv) if (rv)
goto out_cleanup_blk_queue; goto out_cleanup_blk_queue;
@ -1938,18 +1812,6 @@ static int __init null_init(void)
g_bs = PAGE_SIZE; g_bs = PAGE_SIZE;
} }
if (g_use_lightnvm && g_bs != 4096) {
pr_warn("null_blk: LightNVM only supports 4k block size\n");
pr_warn("null_blk: defaults block size to 4k\n");
g_bs = 4096;
}
if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
pr_warn("null_blk: LightNVM only supported for blk-mq\n");
pr_warn("null_blk: defaults queue mode to blk-mq\n");
g_queue_mode = NULL_Q_MQ;
}
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) { if (g_submit_queues != nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.\n", pr_warn("null_blk: submit_queues param is set to %u.\n",
@ -1982,16 +1844,6 @@ static int __init null_init(void)
goto err_conf; goto err_conf;
} }
if (g_use_lightnvm) {
ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
0, 0, NULL);
if (!ppa_cache) {
pr_err("null_blk: unable to create ppa cache\n");
ret = -ENOMEM;
goto err_ppa;
}
}
for (i = 0; i < nr_devices; i++) { for (i = 0; i < nr_devices; i++) {
dev = null_alloc_dev(); dev = null_alloc_dev();
if (!dev) { if (!dev) {
@ -2015,8 +1867,6 @@ static int __init null_init(void)
null_del_dev(nullb); null_del_dev(nullb);
null_free_dev(dev); null_free_dev(dev);
} }
kmem_cache_destroy(ppa_cache);
err_ppa:
unregister_blkdev(null_major, "nullb"); unregister_blkdev(null_major, "nullb");
err_conf: err_conf:
configfs_unregister_subsystem(&nullb_subsys); configfs_unregister_subsystem(&nullb_subsys);
@ -2047,8 +1897,6 @@ static void __exit null_exit(void)
if (g_queue_mode == NULL_Q_MQ && shared_tags) if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set); blk_mq_free_tag_set(&tag_set);
kmem_cache_destroy(ppa_cache);
} }
module_init(null_init); module_init(null_init);

View File

@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
bdev = bdget(dev); bdev = bdget(dev);
if (!bdev) if (!bdev)
return -ENOMEM; return -ENOMEM;
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
bdput(bdev);
return -EINVAL;
}
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
if (ret) if (ret)
return ret; return ret;
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
return -EINVAL;
}
/* This is safe, since we have a reference from open(). */ /* This is safe, since we have a reference from open(). */
__module_get(THIS_MODULE); __module_get(THIS_MODULE);
@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->pkt_dev = MKDEV(pktdev_major, idx); pd->pkt_dev = MKDEV(pktdev_major, idx);
ret = pkt_new_dev(pd, dev); ret = pkt_new_dev(pd, dev);
if (ret) if (ret)
goto out_new_dev; goto out_mem2;
/* inherit events of the host device */ /* inherit events of the host device */
disk->events = pd->bdev->bd_disk->events; disk->events = pd->bdev->bd_disk->events;
@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
mutex_unlock(&ctl_mutex); mutex_unlock(&ctl_mutex);
return 0; return 0;
out_new_dev:
blk_cleanup_queue(disk->queue);
out_mem2: out_mem2:
put_disk(disk); put_disk(disk);
out_mem: out_mem:

View File

@ -1,278 +0,0 @@
/*
* Disk Array driver for Compaq SMART2 Controllers
* Copyright 1998 Compaq Computer Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Questions/Comments/Bugfixes to iss_storagedev@hp.com
*
* If you want to make changes, improve or add functionality to this
* driver, you'll probably need the Compaq Array Controller Interface
* Specificiation (Document number ECG086/1198)
*/
/*
* This file contains the controller communication implementation for
* Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
* this should support:
*
* PCI:
* SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
* Integerated SMART Array Controller, SMART-4200, SMART-4250ES
*
* EISA:
* SMART-2/E, SMART, IAES, IDA-2, IDA
*/
/*
* Memory mapped FIFO interface (SMART 42xx cards)
*/
static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
{
writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
}
/*
* This card is the opposite of the other cards.
* 0 turns interrupts on...
* 0x08 turns them off...
*/
static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
{
if (val)
{ /* Turn interrupts on */
writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
} else /* Turn them off */
{
writel( S42XX_INTR_OFF,
h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
}
}
/*
* For older cards FIFO Full = 0.
* On this card 0 means there is room, anything else FIFO Full.
*
*/
static unsigned long smart4_fifo_full(ctlr_info_t *h)
{
return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
}
/* This type of controller returns -1 if the fifo is empty,
* Not 0 like the others.
* And we need to let it know we read a value out
*/
static unsigned long smart4_completed(ctlr_info_t *h)
{
long register_value
= readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
/* Fifo is empty */
if( register_value == 0xffffffff)
return 0;
/* Need to let it know we got the reply */
/* We do this by writing a 0 to the port we just read from */
writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
return ((unsigned long) register_value);
}
/*
* This hardware returns interrupt pending at a different place and
* it does not tell us if the fifo is empty, we will have check
* that by getting a 0 back from the command_completed call.
*/
static unsigned long smart4_intr_pending(ctlr_info_t *h)
{
unsigned long register_value =
readl(h->vaddr + S42XX_INTR_STATUS);
if( register_value & S42XX_INTR_PENDING)
return FIFO_NOT_EMPTY;
return 0 ;
}
static struct access_method smart4_access = {
smart4_submit_command,
smart4_intr_mask,
smart4_fifo_full,
smart4_intr_pending,
smart4_completed,
};
/*
* Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
*/
static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
{
writel(c->busaddr, h->vaddr + COMMAND_FIFO);
}
static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
{
writel(val, h->vaddr + INTR_MASK);
}
static unsigned long smart2_fifo_full(ctlr_info_t *h)
{
return readl(h->vaddr + COMMAND_FIFO);
}
static unsigned long smart2_completed(ctlr_info_t *h)
{
return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
}
static unsigned long smart2_intr_pending(ctlr_info_t *h)
{
return readl(h->vaddr + INTR_PENDING);
}
static struct access_method smart2_access = {
smart2_submit_command,
smart2_intr_mask,
smart2_fifo_full,
smart2_intr_pending,
smart2_completed,
};
/*
* IO access for SMART-2/E cards
*/
static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
{
outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
}
static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
{
outl(val, h->io_mem_addr + INTR_MASK);
}
static unsigned long smart2e_fifo_full(ctlr_info_t *h)
{
return inl(h->io_mem_addr + COMMAND_FIFO);
}
static unsigned long smart2e_completed(ctlr_info_t *h)
{
return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
}
static unsigned long smart2e_intr_pending(ctlr_info_t *h)
{
return inl(h->io_mem_addr + INTR_PENDING);
}
static struct access_method smart2e_access = {
smart2e_submit_command,
smart2e_intr_mask,
smart2e_fifo_full,
smart2e_intr_pending,
smart2e_completed,
};
/*
* IO access for older SMART-1 type cards
*/
#define SMART1_SYSTEM_MASK 0xC8E
#define SMART1_SYSTEM_DOORBELL 0xC8F
#define SMART1_LOCAL_MASK 0xC8C
#define SMART1_LOCAL_DOORBELL 0xC8D
#define SMART1_INTR_MASK 0xC89
#define SMART1_LISTADDR 0xC90
#define SMART1_LISTLEN 0xC94
#define SMART1_TAG 0xC97
#define SMART1_COMPLETE_ADDR 0xC98
#define SMART1_LISTSTATUS 0xC9E
#define CHANNEL_BUSY 0x01
#define CHANNEL_CLEAR 0x02
static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
{
/*
* This __u16 is actually a bunch of control flags on SMART
* and below. We want them all to be zero.
*/
c->hdr.size = 0;
outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
}
static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
{
if (val == 1) {
outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
} else {
outb(0, h->io_mem_addr + 0xC8E);
}
}
static unsigned long smart1_fifo_full(ctlr_info_t *h)
{
unsigned char chan;
chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
return chan;
}
static unsigned long smart1_completed(ctlr_info_t *h)
{
unsigned char status;
unsigned long cmd;
if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
/*
* this is x86 (actually compaq x86) only, so it's ok
*/
if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
} else {
cmd = 0;
}
return cmd;
}
static unsigned long smart1_intr_pending(ctlr_info_t *h)
{
unsigned char chan;
chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
return chan;
}
static struct access_method smart1_access = {
smart1_submit_command,
smart1_intr_mask,
smart1_fifo_full,
smart1_intr_pending,
smart1_completed,
};


@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
static void zram_page_end_io(struct bio *bio) static void zram_page_end_io(struct bio *bio)
{ {
struct page *page = bio->bi_io_vec[0].bv_page; struct page *page = bio_first_page_all(bio);
page_endio(page, op_is_write(bio_op(bio)), page_endio(page, op_is_write(bio_op(bio)),
blk_status_to_errno(bio->bi_status)); blk_status_to_errno(bio->bi_status));


@ -27,13 +27,6 @@ config NVM_DEBUG
It is required to create/remove targets without IOCTLs. It is required to create/remove targets without IOCTLs.
config NVM_RRPC
tristate "Round-robin Hybrid Open-Channel SSD target"
---help---
Allows an open-channel SSD to be exposed as a block device to the
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
config NVM_PBLK config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target" tristate "Physical Block Device Open-Channel SSD target"
---help--- ---help---


@ -4,7 +4,6 @@
# #
obj-$(CONFIG_NVM) := core.o obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
obj-$(CONFIG_NVM_PBLK) += pblk.o obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \ pblk-write.o pblk-cache.o pblk-read.o \


@ -45,12 +45,6 @@ struct nvm_dev_map {
int nr_chnls; int nr_chnls;
}; };
struct nvm_area {
struct list_head list;
sector_t begin;
sector_t end; /* end is excluded */
};
static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
{ {
struct nvm_target *tgt; struct nvm_target *tgt;
@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
return NULL; return NULL;
} }
static bool nvm_target_exists(const char *name)
{
struct nvm_dev *dev;
struct nvm_target *tgt;
bool ret = false;
down_write(&nvm_lock);
list_for_each_entry(dev, &nvm_devices, devices) {
mutex_lock(&dev->mlock);
list_for_each_entry(tgt, &dev->targets, list) {
if (!strcmp(name, tgt->disk->disk_name)) {
ret = true;
mutex_unlock(&dev->mlock);
goto out;
}
}
mutex_unlock(&dev->mlock);
}
out:
up_write(&nvm_lock);
return ret;
}
static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
{ {
int i; int i;
@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
if (clear) { if (clear) {
for (j = 0; j < ch_map->nr_luns; j++) { for (j = 0; j < ch_map->nr_luns; j++) {
int lun = j + lun_offs[j]; int lun = j + lun_offs[j];
int lunid = (ch * dev->geo.luns_per_chnl) + lun; int lunid = (ch * dev->geo.nr_luns) + lun;
WARN_ON(!test_and_clear_bit(lunid, WARN_ON(!test_and_clear_bit(lunid,
dev->lun_map)); dev->lun_map));
@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
} }
static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
int lun_begin, int lun_end) u16 lun_begin, u16 lun_end,
u16 op)
{ {
struct nvm_tgt_dev *tgt_dev = NULL; struct nvm_tgt_dev *tgt_dev = NULL;
struct nvm_dev_map *dev_rmap = dev->rmap; struct nvm_dev_map *dev_rmap = dev->rmap;
@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
struct ppa_addr *luns; struct ppa_addr *luns;
int nr_luns = lun_end - lun_begin + 1; int nr_luns = lun_end - lun_begin + 1;
int luns_left = nr_luns; int luns_left = nr_luns;
int nr_chnls = nr_luns / dev->geo.luns_per_chnl; int nr_chnls = nr_luns / dev->geo.nr_luns;
int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
int bch = lun_begin / dev->geo.luns_per_chnl; int bch = lun_begin / dev->geo.nr_luns;
int blun = lun_begin % dev->geo.luns_per_chnl; int blun = lun_begin % dev->geo.nr_luns;
int lunid = 0; int lunid = 0;
int lun_balanced = 1; int lun_balanced = 1;
int prev_nr_luns; int prev_nr_luns;
@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
if (!luns) if (!luns)
goto err_luns; goto err_luns;
prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
dev->geo.luns_per_chnl : luns_left; dev->geo.nr_luns : luns_left;
for (i = 0; i < nr_chnls; i++) { for (i = 0; i < nr_chnls; i++) {
struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
int *lun_roffs = ch_rmap->lun_offs; int *lun_roffs = ch_rmap->lun_offs;
struct nvm_ch_map *ch_map = &dev_map->chnls[i]; struct nvm_ch_map *ch_map = &dev_map->chnls[i];
int *lun_offs; int *lun_offs;
int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
dev->geo.luns_per_chnl : luns_left; dev->geo.nr_luns : luns_left;
if (lun_balanced && prev_nr_luns != luns_in_chnl) if (lun_balanced && prev_nr_luns != luns_in_chnl)
lun_balanced = 0; lun_balanced = 0;
@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
/* Target device only owns a portion of the physical device */ /* Target device only owns a portion of the physical device */
tgt_dev->geo.nr_chnls = nr_chnls; tgt_dev->geo.nr_chnls = nr_chnls;
tgt_dev->geo.nr_luns = nr_luns; tgt_dev->geo.all_luns = nr_luns;
tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
tgt_dev->geo.op = op;
tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
tgt_dev->q = dev->q; tgt_dev->q = dev->q;
tgt_dev->map = dev_map; tgt_dev->map = dev_map;
@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
}; };
static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
{ {
struct nvm_tgt_type *tmp, *tt = NULL; struct nvm_tgt_type *tt;
if (lock) list_for_each_entry(tt, &nvm_tgt_types, list)
down_write(&nvm_tgtt_lock); if (!strcmp(name, tt->name))
return tt;
list_for_each_entry(tmp, &nvm_tgt_types, list) return NULL;
if (!strcmp(name, tmp->name)) { }
tt = tmp;
break; static struct nvm_tgt_type *nvm_find_target_type(const char *name)
} {
struct nvm_tgt_type *tt;
down_write(&nvm_tgtt_lock);
tt = __nvm_find_target_type(name);
up_write(&nvm_tgtt_lock);
if (lock)
up_write(&nvm_tgtt_lock);
return tt; return tt;
} }
static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
int lun_end)
{
if (lun_begin > lun_end || lun_end >= geo->all_luns) {
pr_err("nvm: lun out of bound (%u:%u > %u)\n",
lun_begin, lun_end, geo->all_luns - 1);
return -EINVAL;
}
return 0;
}
static int __nvm_config_simple(struct nvm_dev *dev,
struct nvm_ioctl_create_simple *s)
{
struct nvm_geo *geo = &dev->geo;
if (s->lun_begin == -1 && s->lun_end == -1) {
s->lun_begin = 0;
s->lun_end = geo->all_luns - 1;
}
return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
}
static int __nvm_config_extended(struct nvm_dev *dev,
struct nvm_ioctl_create_extended *e)
{
struct nvm_geo *geo = &dev->geo;
if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
e->lun_begin = 0;
e->lun_end = dev->geo.all_luns - 1;
}
/* op not set falls into target's default */
if (e->op == 0xFFFF)
e->op = NVM_TARGET_DEFAULT_OP;
if (e->op < NVM_TARGET_MIN_OP ||
e->op > NVM_TARGET_MAX_OP) {
pr_err("nvm: invalid over provisioning value\n");
return -EINVAL;
}
return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
}
static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
{ {
struct nvm_ioctl_create_simple *s = &create->conf.s; struct nvm_ioctl_create_extended e;
struct request_queue *tqueue; struct request_queue *tqueue;
struct gendisk *tdisk; struct gendisk *tdisk;
struct nvm_tgt_type *tt; struct nvm_tgt_type *tt;
@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
void *targetdata; void *targetdata;
int ret; int ret;
tt = nvm_find_target_type(create->tgttype, 1); switch (create->conf.type) {
case NVM_CONFIG_TYPE_SIMPLE:
ret = __nvm_config_simple(dev, &create->conf.s);
if (ret)
return ret;
e.lun_begin = create->conf.s.lun_begin;
e.lun_end = create->conf.s.lun_end;
e.op = NVM_TARGET_DEFAULT_OP;
break;
case NVM_CONFIG_TYPE_EXTENDED:
ret = __nvm_config_extended(dev, &create->conf.e);
if (ret)
return ret;
e = create->conf.e;
break;
default:
pr_err("nvm: config type not valid\n");
return -EINVAL;
}
tt = nvm_find_target_type(create->tgttype);
if (!tt) { if (!tt) {
pr_err("nvm: target type %s not found\n", create->tgttype); pr_err("nvm: target type %s not found\n", create->tgttype);
return -EINVAL; return -EINVAL;
} }
mutex_lock(&dev->mlock); if (nvm_target_exists(create->tgtname)) {
t = nvm_find_target(dev, create->tgtname); pr_err("nvm: target name already exists (%s)\n",
if (t) { create->tgtname);
pr_err("nvm: target name already exists.\n");
mutex_unlock(&dev->mlock);
return -EINVAL; return -EINVAL;
} }
mutex_unlock(&dev->mlock);
ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
if (ret) if (ret)
return ret; return ret;
@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
goto err_reserve; goto err_reserve;
} }
tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
if (!tgt_dev) { if (!tgt_dev) {
pr_err("nvm: could not create target device\n"); pr_err("nvm: could not create target device\n");
ret = -ENOMEM; ret = -ENOMEM;
@ -350,7 +441,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
err_t: err_t:
kfree(t); kfree(t);
err_reserve: err_reserve:
nvm_release_luns_err(dev, s->lun_begin, s->lun_end); nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
return ret; return ret;
} }
@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev)
for (i = 0; i < dev->geo.nr_chnls; i++) { for (i = 0; i < dev->geo.nr_chnls; i++) {
struct nvm_ch_map *ch_rmap; struct nvm_ch_map *ch_rmap;
int *lun_roffs; int *lun_roffs;
int luns_in_chnl = dev->geo.luns_per_chnl; int luns_in_chnl = dev->geo.nr_luns;
ch_rmap = &rmap->chnls[i]; ch_rmap = &rmap->chnls[i];
@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
} }
void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
int len)
{
struct nvm_geo *geo = &dev->geo;
struct nvm_dev_map *dev_rmap = dev->rmap;
u64 i;
for (i = 0; i < len; i++) {
struct nvm_ch_map *ch_rmap;
int *lun_roffs;
struct ppa_addr gaddr;
u64 pba = le64_to_cpu(entries[i]);
u64 diff;
if (!pba)
continue;
gaddr = linear_to_generic_addr(geo, pba);
ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
lun_roffs = ch_rmap->lun_offs;
diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
entries[i] -= cpu_to_le64(diff);
}
}
EXPORT_SYMBOL(nvm_part_to_tgt);
int nvm_register_tgt_type(struct nvm_tgt_type *tt) int nvm_register_tgt_type(struct nvm_tgt_type *tt)
{ {
int ret = 0; int ret = 0;
down_write(&nvm_tgtt_lock); down_write(&nvm_tgtt_lock);
if (nvm_find_target_type(tt->name, 0)) if (__nvm_find_target_type(tt->name))
ret = -EEXIST; ret = -EEXIST;
else else
list_add(&tt->list, &nvm_tgt_types); list_add(&tt->list, &nvm_tgt_types);
@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
} }
EXPORT_SYMBOL(nvm_submit_io_sync); EXPORT_SYMBOL(nvm_submit_io_sync);
int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
int nr_ppas)
{
struct nvm_geo *geo = &tgt_dev->geo;
struct nvm_rq rqd;
int ret;
memset(&rqd, 0, sizeof(struct nvm_rq));
rqd.opcode = NVM_OP_ERASE;
rqd.flags = geo->plane_mode >> 1;
ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
if (ret)
return ret;
ret = nvm_submit_io_sync(tgt_dev, &rqd);
if (ret) {
pr_err("rrpr: erase I/O submission failed: %d\n", ret);
goto free_ppa_list;
}
free_ppa_list:
nvm_free_rqd_ppalist(tgt_dev, &rqd);
return ret;
}
EXPORT_SYMBOL(nvm_erase_sync);
int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
nvm_l2p_update_fn *update_l2p, void *priv)
{
struct nvm_dev *dev = tgt_dev->parent;
if (!dev->ops->get_l2p_tbl)
return 0;
return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
}
EXPORT_SYMBOL(nvm_get_l2p_tbl);
int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
{
struct nvm_dev *dev = tgt_dev->parent;
struct nvm_geo *geo = &dev->geo;
struct nvm_area *area, *prev, *next;
sector_t begin = 0;
sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
if (len > max_sectors)
return -EINVAL;
area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
if (!area)
return -ENOMEM;
prev = NULL;
spin_lock(&dev->lock);
list_for_each_entry(next, &dev->area_list, list) {
if (begin + len > next->begin) {
begin = next->end;
prev = next;
continue;
}
break;
}
if ((begin + len) > max_sectors) {
spin_unlock(&dev->lock);
kfree(area);
return -EINVAL;
}
area->begin = *lba = begin;
area->end = begin + len;
if (prev) /* insert into sorted order */
list_add(&area->list, &prev->list);
else
list_add(&area->list, &dev->area_list);
spin_unlock(&dev->lock);
return 0;
}
EXPORT_SYMBOL(nvm_get_area);
void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
{
struct nvm_dev *dev = tgt_dev->parent;
struct nvm_area *area;
spin_lock(&dev->lock);
list_for_each_entry(area, &dev->area_list, list) {
if (area->begin != begin)
continue;
list_del(&area->list);
spin_unlock(&dev->lock);
kfree(area);
return;
}
spin_unlock(&dev->lock);
}
EXPORT_SYMBOL(nvm_put_area);
void nvm_end_io(struct nvm_rq *rqd) void nvm_end_io(struct nvm_rq *rqd)
{ {
struct nvm_tgt_dev *tgt_dev = rqd->dev; struct nvm_tgt_dev *tgt_dev = rqd->dev;
@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
int blk, offset, pl, blktype; int blk, offset, pl, blktype;
if (nr_blks != geo->blks_per_lun * geo->plane_mode) if (nr_blks != geo->nr_chks * geo->plane_mode)
return -EINVAL; return -EINVAL;
for (blk = 0; blk < geo->blks_per_lun; blk++) { for (blk = 0; blk < geo->nr_chks; blk++) {
offset = blk * geo->plane_mode; offset = blk * geo->plane_mode;
blktype = blks[offset]; blktype = blks[offset];
@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
blks[blk] = blktype; blks[blk] = blktype;
} }
return geo->blks_per_lun; return geo->nr_chks;
} }
EXPORT_SYMBOL(nvm_bb_tbl_fold); EXPORT_SYMBOL(nvm_bb_tbl_fold);
@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
} }
EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
{
struct nvm_geo *geo = &dev->geo;
int i;
dev->lps_per_blk = geo->pgs_per_blk;
dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
if (!dev->lptbl)
return -ENOMEM;
/* Just a linear array */
for (i = 0; i < dev->lps_per_blk; i++)
dev->lptbl[i] = i;
return 0;
}
static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
{
int i, p;
struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
if (!mlc->num_pairs)
return 0;
dev->lps_per_blk = mlc->num_pairs;
dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
if (!dev->lptbl)
return -ENOMEM;
/* The lower page table encoding consists of a list of bytes, where each
* has a lower and an upper half. The first half byte maintains the
* increment value and every value after is an offset added to the
* previous incrementation value
*/
dev->lptbl[0] = mlc->pairs[0] & 0xF;
for (i = 1; i < dev->lps_per_blk; i++) {
p = mlc->pairs[i >> 1];
if (i & 0x1) /* upper */
dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
else /* lower */
dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
}
return 0;
}
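
Note: the removed nvm_init_mlc_tbl() decoded the 1.2-spec MLC lower-page pair table described in the comment above: each pair byte carries two nibble-sized increments, the lower nibble for even indices and the upper nibble for odd ones. A minimal standalone sketch of that decoding, using made-up pair bytes rather than data from a real device:

#include <stdio.h>

int main(void)
{
	unsigned char pairs[] = { 0x21, 0x43 };	/* two nibbles per byte */
	int lptbl[4];
	int i;

	lptbl[0] = pairs[0] & 0xF;		/* first increment */
	for (i = 1; i < 4; i++) {
		unsigned char p = pairs[i >> 1];

		if (i & 0x1)			/* odd index: upper nibble */
			lptbl[i] = lptbl[i - 1] + ((p & 0xF0) >> 4);
		else				/* even index: lower nibble */
			lptbl[i] = lptbl[i - 1] + (p & 0xF);
	}

	for (i = 0; i < 4; i++)
		printf("lptbl[%d] = %d\n", i, lptbl[i]);	/* 1 3 6 10 */
	return 0;
}
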
static int nvm_core_init(struct nvm_dev *dev) static int nvm_core_init(struct nvm_dev *dev)
{ {
struct nvm_id *id = &dev->identity; struct nvm_id *id = &dev->identity;
@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev)
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
int ret; int ret;
/* Whole device values */
geo->nr_chnls = grp->num_ch;
geo->luns_per_chnl = grp->num_lun;
/* Generic device values */
geo->pgs_per_blk = grp->num_pg;
geo->blks_per_lun = grp->num_blk;
geo->nr_planes = grp->num_pln;
geo->fpg_size = grp->fpg_sz;
geo->pfpg_size = grp->fpg_sz * grp->num_pln;
geo->sec_size = grp->csecs;
geo->oob_size = grp->sos;
geo->sec_per_pg = grp->fpg_sz / grp->csecs;
geo->mccap = grp->mccap;
memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
geo->plane_mode = NVM_PLANE_SINGLE;
geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
if (grp->mpos & 0x020202)
geo->plane_mode = NVM_PLANE_DOUBLE;
if (grp->mpos & 0x040404)
geo->plane_mode = NVM_PLANE_QUAD;
if (grp->mtype != 0) { if (grp->mtype != 0) {
pr_err("nvm: memory type not supported\n"); pr_err("nvm: memory type not supported\n");
return -EINVAL; return -EINVAL;
} }
/* calculated values */ /* Whole device values */
geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; geo->nr_chnls = grp->num_ch;
geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk; geo->nr_luns = grp->num_lun;
geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
dev->total_secs = geo->nr_luns * geo->sec_per_lun; /* Generic device geometry values */
dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), geo->ws_min = grp->ws_min;
geo->ws_opt = grp->ws_opt;
geo->ws_seq = grp->ws_seq;
geo->ws_per_chk = grp->ws_per_chk;
geo->nr_chks = grp->num_chk;
geo->sec_size = grp->csecs;
geo->oob_size = grp->sos;
geo->mccap = grp->mccap;
geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
geo->sec_per_chk = grp->clba;
geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
geo->all_luns = geo->nr_luns * geo->nr_chnls;
/* 1.2 spec device geometry values */
geo->plane_mode = 1 << geo->ws_seq;
geo->nr_planes = geo->ws_opt / geo->ws_min;
geo->sec_per_pg = geo->ws_min;
geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
dev->total_secs = geo->all_luns * geo->sec_per_lun;
dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
sizeof(unsigned long), GFP_KERNEL); sizeof(unsigned long), GFP_KERNEL);
if (!dev->lun_map) if (!dev->lun_map)
return -ENOMEM; return -ENOMEM;
switch (grp->fmtype) {
case NVM_ID_FMTYPE_SLC:
if (nvm_init_slc_tbl(dev, grp)) {
ret = -ENOMEM;
goto err_fmtype;
}
break;
case NVM_ID_FMTYPE_MLC:
if (nvm_init_mlc_tbl(dev, grp)) {
ret = -ENOMEM;
goto err_fmtype;
}
break;
default:
pr_err("nvm: flash type not supported\n");
ret = -EINVAL;
goto err_fmtype;
}
INIT_LIST_HEAD(&dev->area_list); INIT_LIST_HEAD(&dev->area_list);
INIT_LIST_HEAD(&dev->targets); INIT_LIST_HEAD(&dev->targets);
mutex_init(&dev->mlock); mutex_init(&dev->mlock);
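
Note: the rewritten nvm_core_init() derives the 1.2-spec plane geometry (plane_mode, nr_planes, sec_per_pg, sec_per_pl) from the generic write-size fields ws_min/ws_opt/ws_seq instead of taking it from the old per-group identify fields. A small standalone sketch of that derivation with hypothetical identify values (ws_min=4, ws_opt=16, ws_seq=2 -- not from a real device):

#include <stdio.h>

int main(void)
{
	/* Hypothetical identify values; a real device reports its own. */
	int ws_min = 4, ws_opt = 16, ws_seq = 2;

	int plane_mode = 1 << ws_seq;			/* 4 (quad plane)  */
	int nr_planes  = ws_opt / ws_min;		/* 4               */
	int sec_per_pg = ws_min;			/* 4               */
	int sec_per_pl = sec_per_pg * nr_planes;	/* 16              */

	printf("plane_mode=%d nr_planes=%d sec_per_pg=%d sec_per_pl=%d\n",
	       plane_mode, nr_planes, sec_per_pg, sec_per_pl);
	return 0;
}
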
@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev)
dev->ops->destroy_dma_pool(dev->dma_pool); dev->ops->destroy_dma_pool(dev->dma_pool);
nvm_unregister_map(dev); nvm_unregister_map(dev);
kfree(dev->lptbl);
kfree(dev->lun_map); kfree(dev->lun_map);
kfree(dev); kfree(dev);
} }
@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev)
pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
dev->name, geo->sec_per_pg, geo->nr_planes, dev->name, geo->sec_per_pg, geo->nr_planes,
geo->pgs_per_blk, geo->blks_per_lun, geo->ws_per_chk, geo->nr_chks,
geo->nr_luns, geo->nr_chnls); geo->all_luns, geo->nr_chnls);
return 0; return 0;
err: err:
pr_err("nvm: failed to initialize nvm\n"); pr_err("nvm: failed to initialize nvm\n");
@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
static int __nvm_configure_create(struct nvm_ioctl_create *create) static int __nvm_configure_create(struct nvm_ioctl_create *create)
{ {
struct nvm_dev *dev; struct nvm_dev *dev;
struct nvm_ioctl_create_simple *s;
down_write(&nvm_lock); down_write(&nvm_lock);
dev = nvm_find_nvm_dev(create->dev); dev = nvm_find_nvm_dev(create->dev);
@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
return -EINVAL; return -EINVAL;
} }
if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
pr_err("nvm: config type not valid\n");
return -EINVAL;
}
s = &create->conf.s;
if (s->lun_begin == -1 && s->lun_end == -1) {
s->lun_begin = 0;
s->lun_end = dev->geo.nr_luns - 1;
}
if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
pr_err("nvm: lun out of bound (%u:%u > %u)\n",
s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
return -EINVAL;
}
return nvm_create_tgt(dev, create); return nvm_create_tgt(dev, create);
} }
@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
return -EFAULT; return -EFAULT;
if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
create.conf.e.rsv != 0) {
pr_err("nvm: reserved config field in use\n");
return -EINVAL;
}
create.dev[DISK_NAME_LEN - 1] = '\0'; create.dev[DISK_NAME_LEN - 1] = '\0';
create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
create.tgtname[DISK_NAME_LEN - 1] = '\0'; create.tgtname[DISK_NAME_LEN - 1] = '\0';


@ -19,12 +19,16 @@
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
{ {
struct request_queue *q = pblk->dev->q;
struct pblk_w_ctx w_ctx; struct pblk_w_ctx w_ctx;
sector_t lba = pblk_get_lba(bio); sector_t lba = pblk_get_lba(bio);
unsigned long start_time = jiffies;
unsigned int bpos, pos; unsigned int bpos, pos;
int nr_entries = pblk_get_secs(bio); int nr_entries = pblk_get_secs(bio);
int i, ret; int i, ret;
generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
/* Update the write buffer head (mem) with the entries that we can /* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to * write. The write in itself cannot fail, so there is no need to
* rollback from here on. * rollback from here on.
@ -67,6 +71,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
pblk_rl_inserted(&pblk->rl, nr_entries); pblk_rl_inserted(&pblk->rl, nr_entries);
out: out:
generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
pblk_write_should_kick(pblk); pblk_write_should_kick(pblk);
return ret; return ret;
} }


@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
struct pblk_line *line; struct pblk_line *line;
int pos; int pos;
line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; line = &pblk->lines[pblk_ppa_to_line(*ppa)];
pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); pos = pblk_ppa_to_pos(&dev->geo, *ppa);
pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
line->id, pos); line->id, pos);
@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
{ {
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
int pos = pblk_dev_ppa_to_pos(geo, *ppa); int pos = pblk_ppa_to_pos(geo, *ppa);
pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
atomic_long_inc(&pblk->erase_failed); atomic_long_inc(&pblk->erase_failed);
@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
{ {
struct pblk_line *line; struct pblk_line *line;
line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
atomic_dec(&line->left_seblks); atomic_dec(&line->left_seblks);
if (rqd->error) { if (rqd->error) {
@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
BUG_ON(pblk_ppa_empty(ppa)); BUG_ON(pblk_ppa_empty(ppa));
#endif #endif
line_id = pblk_tgt_ppa_to_line(ppa); line_id = pblk_ppa_to_line(ppa);
line = &pblk->lines[line_id]; line = &pblk->lines[line_id];
paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
@ -650,7 +650,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
} else { } else {
for (i = 0; i < rqd.nr_ppas; ) { for (i = 0; i < rqd.nr_ppas; ) {
struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
int pos = pblk_dev_ppa_to_pos(geo, ppa); int pos = pblk_ppa_to_pos(geo, ppa);
int read_type = PBLK_READ_RANDOM; int read_type = PBLK_READ_RANDOM;
if (pblk_io_aligned(pblk, rq_ppas)) if (pblk_io_aligned(pblk, rq_ppas))
@ -668,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
} }
ppa = addr_to_gen_ppa(pblk, paddr, id); ppa = addr_to_gen_ppa(pblk, paddr, id);
pos = pblk_dev_ppa_to_pos(geo, ppa); pos = pblk_ppa_to_pos(geo, ppa);
} }
if (pblk_boundary_paddr_checks(pblk, paddr + min)) { if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
cmd_op = NVM_OP_PWRITE; cmd_op = NVM_OP_PWRITE;
flags = pblk_set_progr_mode(pblk, PBLK_WRITE); flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
lba_list = emeta_to_lbas(pblk, line->emeta->buf); lba_list = emeta_to_lbas(pblk, line->emeta->buf);
} else if (dir == PBLK_READ) { } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
bio_op = REQ_OP_READ; bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD; cmd_op = NVM_OP_PREAD;
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
if (rqd.error) { if (rqd.error) {
if (dir == PBLK_WRITE) if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, &rqd); pblk_log_write_err(pblk, &rqd);
else else if (dir == PBLK_READ)
pblk_log_read_err(pblk, &rqd); pblk_log_read_err(pblk, &rqd);
} }
@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
{ {
u64 bpaddr = pblk_line_smeta_start(pblk, line); u64 bpaddr = pblk_line_smeta_start(pblk, line);
return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
} }
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not sync erase line:%d,blk:%d\n", pr_err("pblk: could not sync erase line:%d,blk:%d\n",
pblk_dev_ppa_to_line(ppa), pblk_ppa_to_line(ppa),
pblk_dev_ppa_to_pos(geo, ppa)); pblk_ppa_to_pos(geo, ppa));
rqd.error = ret; rqd.error = ret;
goto out; goto out;
@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
/* Start metadata */ /* Start metadata */
smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
/* Fill metadata among lines */ /* Fill metadata among lines */
if (cur) { if (cur) {
@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
lm->sec_per_line); lm->sec_per_line);
bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
lm->sec_per_line); lm->sec_per_line);
line->sec_in_line -= geo->sec_per_blk; line->sec_in_line -= geo->sec_per_chk;
if (bit >= lm->emeta_bb) if (bit >= lm->emeta_bb)
nr_bb++; nr_bb++;
} }
@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
} }
spin_unlock(&l_mg->free_lock); spin_unlock(&l_mg->free_lock);
pblk_rl_free_lines_dec(&pblk->rl, line); pblk_rl_free_lines_dec(&pblk->rl, line, true);
if (!pblk_line_init_bb(pblk, line, 0)) { if (!pblk_line_init_bb(pblk, line, 0)) {
list_add(&line->list, &l_mg->free_list); list_add(&line->list, &l_mg->free_list);
@ -1233,7 +1233,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk,
l_mg->data_line = retry_line; l_mg->data_line = retry_line;
spin_unlock(&l_mg->free_lock); spin_unlock(&l_mg->free_lock);
pblk_rl_free_lines_dec(&pblk->rl, retry_line); pblk_rl_free_lines_dec(&pblk->rl, line, false);
if (pblk_line_erase(pblk, retry_line)) if (pblk_line_erase(pblk, retry_line))
goto retry; goto retry;
@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
{ {
struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line; struct pblk_line *line;
int is_next = 0;
spin_lock(&l_mg->free_lock); spin_lock(&l_mg->free_lock);
line = pblk_line_get(pblk); line = pblk_line_get(pblk);
@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
} else { } else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA; l_mg->data_next->type = PBLK_LINETYPE_DATA;
is_next = 1;
} }
spin_unlock(&l_mg->free_lock); spin_unlock(&l_mg->free_lock);
@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
return NULL; return NULL;
} }
pblk_rl_free_lines_dec(&pblk->rl, line);
if (is_next)
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
retry_setup: retry_setup:
if (!pblk_line_init_metadata(pblk, line, NULL)) { if (!pblk_line_init_metadata(pblk, line, NULL)) {
line = pblk_line_retry(pblk, line); line = pblk_line_retry(pblk, line);
@ -1311,6 +1305,8 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
goto retry_setup; goto retry_setup;
} }
pblk_rl_free_lines_dec(&pblk->rl, line, true);
return line; return line;
} }
@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *cur, *new = NULL; struct pblk_line *cur, *new = NULL;
unsigned int left_seblks; unsigned int left_seblks;
int is_next = 0;
cur = l_mg->data_line; cur = l_mg->data_line;
new = l_mg->data_next; new = l_mg->data_next;
@ -1444,6 +1439,8 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
goto retry_setup; goto retry_setup;
} }
pblk_rl_free_lines_dec(&pblk->rl, new, true);
/* Allocate next line for preparation */ /* Allocate next line for preparation */
spin_lock(&l_mg->free_lock); spin_lock(&l_mg->free_lock);
l_mg->data_next = pblk_line_get(pblk); l_mg->data_next = pblk_line_get(pblk);
@ -1457,13 +1454,9 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
} else { } else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA; l_mg->data_next->type = PBLK_LINETYPE_DATA;
is_next = 1;
} }
spin_unlock(&l_mg->free_lock); spin_unlock(&l_mg->free_lock);
if (is_next)
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
out: out:
return new; return new;
} }
@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not async erase line:%d,blk:%d\n", pr_err("pblk: could not async erase line:%d,blk:%d\n",
pblk_dev_ppa_to_line(ppa), pblk_ppa_to_line(ppa),
pblk_dev_ppa_to_pos(geo, ppa)); pblk_ppa_to_pos(geo, ppa));
} }
return err; return err;
@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun; struct pblk_lun *rlun;
int nr_luns = geo->nr_luns; int nr_luns = geo->all_luns;
int bit = -1; int bit = -1;
while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
/* If the L2P entry maps to a line, the reference is valid */ /* If the L2P entry maps to a line, the reference is valid */
if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
int line_id = pblk_dev_ppa_to_line(ppa); int line_id = pblk_ppa_to_line(ppa);
struct pblk_line *line = &pblk->lines[line_id]; struct pblk_line *line = &pblk->lines[line_id];
kref_get(&line->ref); kref_get(&line->ref);


@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
* the line untouched. TODO: Implement a recovery routine that scans and * the line untouched. TODO: Implement a recovery routine that scans and
* moves all sectors on the line. * moves all sectors on the line.
*/ */
lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
ret = pblk_recov_check_emeta(pblk, emeta_buf);
if (ret) {
pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
goto fail_free_emeta;
}
lba_list = emeta_to_lbas(pblk, emeta_buf);
if (!lba_list) { if (!lba_list) {
pr_err("pblk: could not interpret emeta (line %d)\n", line->id); pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
goto fail_free_emeta; goto fail_free_emeta;
@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk)
} }
} }
/*
* If flush_wq == 1 then no lock should be held by the caller since
* flush_workqueue can sleep
*/
static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
{
pblk->gc.gc_active = 0;
pr_debug("pblk: gc stop\n");
}
void pblk_gc_should_stop(struct pblk *pblk) void pblk_gc_should_stop(struct pblk *pblk)
{ {
struct pblk_gc *gc = &pblk->gc; struct pblk_gc *gc = &pblk->gc;
if (gc->gc_active && !gc->gc_forced) if (gc->gc_active && !gc->gc_forced)
pblk_gc_stop(pblk, 0); gc->gc_active = 0;
} }
void pblk_gc_should_kick(struct pblk *pblk) void pblk_gc_should_kick(struct pblk *pblk)
@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk)
gc->gc_enabled = 0; gc->gc_enabled = 0;
del_timer_sync(&gc->gc_timer); del_timer_sync(&gc->gc_timer);
pblk_gc_stop(pblk, 1); gc->gc_active = 0;
if (gc->gc_ts) if (gc->gc_ts)
kthread_stop(gc->gc_ts); kthread_stop(gc->gc_ts);


@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk)
} }
ppaf.ch_len = power_len; ppaf.ch_len = power_len;
power_len = get_count_order(geo->luns_per_chnl); power_len = get_count_order(geo->nr_luns);
if (1 << power_len != geo->luns_per_chnl) { if (1 << power_len != geo->nr_luns) {
pr_err("pblk: supports only power-of-two LUN config.\n"); pr_err("pblk: supports only power-of-two LUN config.\n");
return -EINVAL; return -EINVAL;
} }
@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk)
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->nr_luns; geo->nr_planes * geo->all_luns;
if (pblk_init_global_caches(pblk)) if (pblk_init_global_caches(pblk))
return -ENOMEM; return -ENOMEM;
@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk)
if (!pblk->gen_ws_pool) if (!pblk->gen_ws_pool)
goto free_page_bio_pool; goto free_page_bio_pool;
pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
pblk_rec_cache);
if (!pblk->rec_pool) if (!pblk->rec_pool)
goto free_gen_ws_pool; goto free_gen_ws_pool;
pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_g_rq_cache); pblk_g_rq_cache);
if (!pblk->r_rq_pool) if (!pblk->r_rq_pool)
goto free_rec_pool; goto free_rec_pool;
pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_g_rq_cache); pblk_g_rq_cache);
if (!pblk->e_rq_pool) if (!pblk->e_rq_pool)
goto free_r_rq_pool; goto free_r_rq_pool;
pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_w_rq_cache); pblk_w_rq_cache);
if (!pblk->w_rq_pool) if (!pblk->w_rq_pool)
goto free_e_rq_pool; goto free_e_rq_pool;
@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk)
mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->e_rq_pool);
mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->w_rq_pool);
pblk_rwb_free(pblk);
pblk_free_global_caches(pblk); pblk_free_global_caches(pblk);
} }
@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
u8 *blks; u8 *blks;
int nr_blks, ret; int nr_blks, ret;
nr_blks = geo->blks_per_lun * geo->plane_mode; nr_blks = geo->nr_chks * geo->plane_mode;
blks = kmalloc(nr_blks, GFP_KERNEL); blks = kmalloc(nr_blks, GFP_KERNEL);
if (!blks) if (!blks)
return -ENOMEM; return -ENOMEM;
@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
int i, ret; int i, ret;
/* TODO: Implement unbalanced LUN support */ /* TODO: Implement unbalanced LUN support */
if (geo->luns_per_chnl < 0) { if (geo->nr_luns < 0) {
pr_err("pblk: unbalanced LUN config.\n"); pr_err("pblk: unbalanced LUN config.\n");
return -EINVAL; return -EINVAL;
} }
pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
GFP_KERNEL);
if (!pblk->luns) if (!pblk->luns)
return -ENOMEM; return -ENOMEM;
for (i = 0; i < geo->nr_luns; i++) { for (i = 0; i < geo->all_luns; i++) {
/* Stripe across channels */ /* Stripe across channels */
int ch = i % geo->nr_chnls; int ch = i % geo->nr_chnls;
int lun_raw = i / geo->nr_chnls; int lun_raw = i / geo->nr_chnls;
int lunid = lun_raw + ch * geo->luns_per_chnl; int lunid = lun_raw + ch * geo->nr_luns;
rlun = &pblk->luns[i]; rlun = &pblk->luns[i];
rlun->bppa = luns[lunid]; rlun->bppa = luns[lunid];
@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
{ {
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_geo *geo = &dev->geo; struct nvm_geo *geo = &dev->geo;
sector_t provisioned; sector_t provisioned;
int sec_meta, blk_meta;
pblk->over_pct = 20; if (geo->op == NVM_TARGET_DEFAULT_OP)
pblk->op = PBLK_DEFAULT_OP;
else
pblk->op = geo->op;
provisioned = nr_free_blks; provisioned = nr_free_blks;
provisioned *= (100 - pblk->over_pct); provisioned *= (100 - pblk->op);
sector_div(provisioned, 100); sector_div(provisioned, 100);
pblk->op_blks = nr_free_blks - provisioned;
/* Internally pblk manages all free blocks, but all calculations based /* Internally pblk manages all free blocks, but all calculations based
* on user capacity consider only provisioned blocks * on user capacity consider only provisioned blocks
*/ */
pblk->rl.total_blocks = nr_free_blks; pblk->rl.total_blocks = nr_free_blks;
pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
pblk->capacity = provisioned * geo->sec_per_blk;
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
atomic_set(&pblk->rl.free_blocks, nr_free_blks); atomic_set(&pblk->rl.free_blocks, nr_free_blks);
atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
} }
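
Note: pblk_set_provision() now takes the over-provisioning ratio from the target geometry (falling back to PBLK_DEFAULT_OP when the target was created with the default) and subtracts the blocks consumed by line metadata from the exported capacity. A standalone rework of the arithmetic with made-up numbers (11% OP, 1000 free blocks, 72 metadata sectors per line, 100 free lines, 256 sectors per chunk); the kernel uses sector_div() where this sketch uses plain division:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Made-up example numbers, not from a real device. */
	long nr_free_blks = 1000, sec_per_chk = 256;
	long smeta_sec = 8, emeta_sec0 = 64, nr_free_lines = 100;
	int op = 11;					/* over-provisioning, percent */

	long provisioned = nr_free_blks * (100 - op) / 100;	/* 890    */
	long op_blks = nr_free_blks - provisioned;		/* 110    */

	long sec_meta = (smeta_sec + emeta_sec0) * nr_free_lines;	/* 7200 */
	long blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);		/* 29   */
	long capacity = (provisioned - blk_meta) * sec_per_chk;		/* 220416 */

	printf("provisioned=%ld op_blks=%ld capacity=%ld sectors\n",
	       provisioned, op_blks, capacity);
	return 0;
}
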
static int pblk_lines_alloc_metadata(struct pblk *pblk) static int pblk_lines_alloc_metadata(struct pblk *pblk)
@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk)
int i, ret; int i, ret;
pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
max_write_ppas = pblk->min_write_pgs * geo->nr_luns; max_write_ppas = pblk->min_write_pgs * geo->all_luns;
pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
max_write_ppas : nvm_max_phys_sects(dev); max_write_ppas : nvm_max_phys_sects(dev);
pblk_set_sec_per_write(pblk, pblk->min_write_pgs); pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk)
return -EINVAL; return -EINVAL;
} }
div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
if (mod) { if (mod) {
pr_err("pblk: bad configuration of sectors/pages\n"); pr_err("pblk: bad configuration of sectors/pages\n");
return -EINVAL; return -EINVAL;
} }
l_mg->nr_lines = geo->blks_per_lun; l_mg->nr_lines = geo->nr_chks;
l_mg->log_line = l_mg->data_line = NULL; l_mg->log_line = l_mg->data_line = NULL;
l_mg->l_seq_nr = l_mg->d_seq_nr = 0; l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
l_mg->nr_free_lines = 0; l_mg->nr_free_lines = 0;
bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
lm->blk_per_line = geo->nr_luns; lm->blk_per_line = geo->all_luns;
lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
lm->mid_thrs = lm->sec_per_line / 2; lm->mid_thrs = lm->sec_per_line / 2;
lm->high_thrs = lm->sec_per_line / 4; lm->high_thrs = lm->sec_per_line / 4;
lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
/* Calculate necessary pages for smeta. See comment over struct /* Calculate necessary pages for smeta. See comment over struct
* line_smeta definition * line_smeta definition
@ -742,12 +761,12 @@ static int pblk_lines_init(struct pblk *pblk)
goto add_emeta_page; goto add_emeta_page;
} }
lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
lm->min_blk_line = 1; lm->min_blk_line = 1;
if (geo->nr_luns > 1) if (geo->all_luns > 1)
lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
lm->emeta_sec[0], geo->sec_per_blk); lm->emeta_sec[0], geo->sec_per_chk);
if (lm->min_blk_line > lm->blk_per_line) { if (lm->min_blk_line > lm->blk_per_line) {
pr_err("pblk: config. not supported. Min. LUN in line:%d\n", pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
@ -772,7 +791,7 @@ static int pblk_lines_init(struct pblk *pblk)
goto fail_free_bb_template; goto fail_free_bb_template;
} }
bb_distance = (geo->nr_luns) * geo->sec_per_pl; bb_distance = (geo->all_luns) * geo->sec_per_pl;
for (i = 0; i < lm->sec_per_line; i += bb_distance) for (i = 0; i < lm->sec_per_line; i += bb_distance)
bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
@ -844,7 +863,7 @@ static int pblk_lines_init(struct pblk *pblk)
pblk_set_provision(pblk, nr_free_blks); pblk_set_provision(pblk, nr_free_blks);
/* Cleanup per-LUN bad block lists - managed within lines on run-time */ /* Cleanup per-LUN bad block lists - managed within lines on run-time */
for (i = 0; i < geo->nr_luns; i++) for (i = 0; i < geo->all_luns; i++)
kfree(pblk->luns[i].bb_list); kfree(pblk->luns[i].bb_list);
return 0; return 0;
@ -858,7 +877,7 @@ static int pblk_lines_init(struct pblk *pblk)
fail_free_meta: fail_free_meta:
pblk_line_meta_free(pblk); pblk_line_meta_free(pblk);
fail: fail:
for (i = 0; i < geo->nr_luns; i++) for (i = 0; i < geo->all_luns; i++)
kfree(pblk->luns[i].bb_list); kfree(pblk->luns[i].bb_list);
return ret; return ret;
@ -866,15 +885,19 @@ static int pblk_lines_init(struct pblk *pblk)
static int pblk_writer_init(struct pblk *pblk) static int pblk_writer_init(struct pblk *pblk)
{ {
timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
if (IS_ERR(pblk->writer_ts)) { if (IS_ERR(pblk->writer_ts)) {
pr_err("pblk: could not allocate writer kthread\n"); int err = PTR_ERR(pblk->writer_ts);
return PTR_ERR(pblk->writer_ts);
if (err != -EINTR)
pr_err("pblk: could not allocate writer kthread (%d)\n",
err);
return err;
} }
timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
return 0; return 0;
} }
@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk)
pblk_pipeline_stop(pblk); pblk_pipeline_stop(pblk);
pblk_writer_stop(pblk); pblk_writer_stop(pblk);
pblk_rb_sync_l2p(&pblk->rwb); pblk_rb_sync_l2p(&pblk->rwb);
pblk_rwb_free(pblk);
pblk_rl_free(&pblk->rl); pblk_rl_free(&pblk->rl);
pr_debug("pblk: consistent tear down\n"); pr_debug("pblk: consistent tear down\n");
@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
ret = pblk_writer_init(pblk); ret = pblk_writer_init(pblk);
if (ret) { if (ret) {
pr_err("pblk: could not initialize write thread\n"); if (ret != -EINTR)
pr_err("pblk: could not initialize write thread\n");
goto fail_free_lines; goto fail_free_lines;
} }
@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
blk_queue_write_cache(tqueue, true, false); blk_queue_write_cache(tqueue, true, false);
tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
tqueue->limits.discard_alignment = 0; tqueue->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
geo->nr_luns, pblk->l_mg.nr_lines, tdisk->disk_name,
geo->all_luns, pblk->l_mg.nr_lines,
(unsigned long long)pblk->rl.nr_secs, (unsigned long long)pblk->rl.nr_secs,
pblk->rwb.nr_entries); pblk->rwb.nr_entries);

View File

@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
return; return;
/* Erase blocks that are bad in this line but might not be in next */ /* Erase blocks that are bad in this line but might not be in next */
if (unlikely(ppa_empty(*erase_ppa)) && if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
int bit = -1; int bit = -1;


@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
rb->seg_size = (1 << power_seg_sz); rb->seg_size = (1 << power_seg_sz);
rb->nr_entries = (1 << power_size); rb->nr_entries = (1 << power_size);
rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
rb->sync_point = EMPTY_ENTRY; rb->flush_point = EMPTY_ENTRY;
spin_lock_init(&rb->w_lock); spin_lock_init(&rb->w_lock);
spin_lock_init(&rb->s_lock); spin_lock_init(&rb->s_lock);
@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
up_write(&pblk_rb_lock); up_write(&pblk_rb_lock);
#ifdef CONFIG_NVM_DEBUG #ifdef CONFIG_NVM_DEBUG
atomic_set(&rb->inflight_sync_point, 0); atomic_set(&rb->inflight_flush_point, 0);
#endif #endif
/* /*
@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
entry->cacheline); entry->cacheline);
line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
kref_put(&line->ref, pblk_line_put); kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx); clean_wctx(w_ctx);
rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
smp_store_release(&entry->w_ctx.flags, flags); smp_store_release(&entry->w_ctx.flags, flags);
} }
static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
unsigned int pos) unsigned int pos)
{ {
struct pblk_rb_entry *entry; struct pblk_rb_entry *entry;
unsigned int subm, sync_point; unsigned int sync, flush_point;
subm = READ_ONCE(rb->subm); sync = READ_ONCE(rb->sync);
if (pos == sync)
return 0;
#ifdef CONFIG_NVM_DEBUG #ifdef CONFIG_NVM_DEBUG
atomic_inc(&rb->inflight_sync_point); atomic_inc(&rb->inflight_flush_point);
#endif #endif
if (pos == subm) flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
return 0; entry = &rb->entries[flush_point];
sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); pblk_rb_sync_init(rb, NULL);
entry = &rb->entries[sync_point];
/* Protect syncs */ /* Protect flush points */
smp_store_release(&rb->sync_point, sync_point); smp_store_release(&rb->flush_point, flush_point);
if (!bio) if (bio)
return 0; bio_list_add(&entry->w_ctx.bios, bio);
spin_lock_irq(&rb->s_lock); pblk_rb_sync_end(rb, NULL);
bio_list_add(&entry->w_ctx.bios, bio);
spin_unlock_irq(&rb->s_lock);
return 1; return bio ? 1 : 0;
} }
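
Note: the flush point is published with smp_store_release() here and read back with smp_load_acquire() in pblk_rb_flush_point_count() below, so a reader that observes a flush point also observes the ring-buffer state written before it was published. A rough userspace analogy of that release/acquire pairing using C11 atomics; the names and values are illustrative only, not kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;				/* plain data written before publish */
static atomic_int flush_point = ATOMIC_VAR_INIT(-1);

static void *producer(void *arg)
{
	payload = 42;				/* "ring buffer" state */
	/* Publish: everything written above is visible to an acquiring reader. */
	atomic_store_explicit(&flush_point, 7, memory_order_release);
	return NULL;
}

static void *consumer(void *arg)
{
	int fp;

	/* Spin until a flush point is published. */
	while ((fp = atomic_load_explicit(&flush_point,
					  memory_order_acquire)) < 0)
		;
	printf("flush_point=%d payload=%d\n", fp, payload);	/* 7 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&c, NULL, consumer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}
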
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk *pblk = container_of(rb, struct pblk, rwb);
unsigned int mem = READ_ONCE(rb->mem); unsigned int mem = READ_ONCE(rb->mem);
if (pblk_rb_sync_point_set(rb, NULL, mem)) if (pblk_rb_flush_point_set(rb, NULL, mem))
return; return;
pblk_write_should_kick(pblk); pblk_write_should_kick(pblk);
@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
#ifdef CONFIG_NVM_DEBUG #ifdef CONFIG_NVM_DEBUG
atomic_long_inc(&pblk->nr_flush); atomic_long_inc(&pblk->nr_flush);
#endif #endif
if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
*io_ret = NVM_IO_OK; *io_ret = NVM_IO_OK;
} }
@ -606,21 +606,6 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
return NVM_IO_ERR; return NVM_IO_ERR;
} }
if (flags & PBLK_FLUSH_ENTRY) {
unsigned int sync_point;
sync_point = READ_ONCE(rb->sync_point);
if (sync_point == pos) {
/* Protect syncs */
smp_store_release(&rb->sync_point, EMPTY_ENTRY);
}
flags &= ~PBLK_FLUSH_ENTRY;
#ifdef CONFIG_NVM_DEBUG
atomic_dec(&rb->inflight_sync_point);
#endif
}
flags &= ~PBLK_WRITTEN_DATA; flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY; flags |= PBLK_SUBMITTED_ENTRY;
@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
{ {
unsigned int sync; unsigned int sync, flush_point;
unsigned int i;
lockdep_assert_held(&rb->s_lock); lockdep_assert_held(&rb->s_lock);
sync = READ_ONCE(rb->sync); sync = READ_ONCE(rb->sync);
flush_point = READ_ONCE(rb->flush_point);
for (i = 0; i < nr_entries; i++) if (flush_point != EMPTY_ENTRY) {
sync = (sync + 1) & (rb->nr_entries - 1); unsigned int secs_to_flush;
secs_to_flush = pblk_rb_ring_count(flush_point, sync,
rb->nr_entries);
if (secs_to_flush < nr_entries) {
/* Protect flush points */
smp_store_release(&rb->flush_point, EMPTY_ENTRY);
}
}
sync = (sync + nr_entries) & (rb->nr_entries - 1);
/* Protect from counts */ /* Protect from counts */
smp_store_release(&rb->sync, sync); smp_store_release(&rb->sync, sync);
@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
return sync; return sync;
} }
unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) /* Calculate how many sectors to submit up to the current flush point. */
unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
{ {
unsigned int subm, sync_point; unsigned int subm, sync, flush_point;
unsigned int count; unsigned int submitted, to_flush;
/* Protect syncs */ /* Protect flush points */
sync_point = smp_load_acquire(&rb->sync_point); flush_point = smp_load_acquire(&rb->flush_point);
if (sync_point == EMPTY_ENTRY) if (flush_point == EMPTY_ENTRY)
return 0; return 0;
/* Protect syncs */
sync = smp_load_acquire(&rb->sync);
subm = READ_ONCE(rb->subm); subm = READ_ONCE(rb->subm);
submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
/* The sync point itself counts as a sector to sync */ /* The sync point itself counts as a sector to sync */
count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
return count; return (submitted < to_flush) ? (to_flush - submitted) : 0;
} }
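
Note: pblk_rb_flush_point_count() now reports only the sectors up to the flush point that have not yet been submitted, rather than everything between submission and the flush point. Assuming pblk_rb_ring_count(head, tail, size) is the usual power-of-two ring occupancy, (head - tail) & (size - 1), a standalone sketch of the calculation with made-up pointer positions:

#include <stdio.h>

/* Occupancy of a power-of-two ring: entries from tail up to (not incl.) head. */
static unsigned int ring_count(unsigned int head, unsigned int tail,
			       unsigned int size)
{
	return (head - tail) & (size - 1);
}

int main(void)
{
	/* Made-up ring state: 1024 entries, pointers chosen for illustration. */
	unsigned int nr_entries = 1024;
	unsigned int sync = 100, subm = 110, flush_point = 119;

	unsigned int submitted = ring_count(subm, sync, nr_entries);		/* 10 */
	unsigned int to_flush = ring_count(flush_point, sync, nr_entries) + 1;	/* 20 */
	unsigned int left = (submitted < to_flush) ? to_flush - submitted : 0;

	printf("sectors still to submit before the flush completes: %u\n", left); /* 10 */
	return 0;
}
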
/* /*
@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb)
if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
(rb->sync == rb->l2p_update) && (rb->sync == rb->l2p_update) &&
(rb->sync_point == EMPTY_ENTRY)) { (rb->flush_point == EMPTY_ENTRY)) {
goto out; goto out;
} }
@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
queued_entries++; queued_entries++;
spin_unlock_irq(&rb->s_lock); spin_unlock_irq(&rb->s_lock);
if (rb->sync_point != EMPTY_ENTRY) if (rb->flush_point != EMPTY_ENTRY)
offset = scnprintf(buf, PAGE_SIZE, offset = scnprintf(buf, PAGE_SIZE,
"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
rb->nr_entries, rb->nr_entries,
@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->sync, rb->sync,
rb->l2p_update, rb->l2p_update,
#ifdef CONFIG_NVM_DEBUG #ifdef CONFIG_NVM_DEBUG
atomic_read(&rb->inflight_sync_point), atomic_read(&rb->inflight_flush_point),
#else #else
0, 0,
#endif #endif
rb->sync_point, rb->flush_point,
pblk_rb_read_count(rb), pblk_rb_read_count(rb),
pblk_rb_space(rb), pblk_rb_space(rb),
pblk_rb_sync_point_count(rb), pblk_rb_flush_point_count(rb),
queued_entries); queued_entries);
else else
offset = scnprintf(buf, PAGE_SIZE, offset = scnprintf(buf, PAGE_SIZE,
@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->sync, rb->sync,
rb->l2p_update, rb->l2p_update,
#ifdef CONFIG_NVM_DEBUG #ifdef CONFIG_NVM_DEBUG
atomic_read(&rb->inflight_sync_point), atomic_read(&rb->inflight_flush_point),
#else #else
0, 0,
#endif #endif
pblk_rb_read_count(rb), pblk_rb_read_count(rb),
pblk_rb_space(rb), pblk_rb_space(rb),
pblk_rb_sync_point_count(rb), pblk_rb_flush_point_count(rb),
queued_entries); queued_entries);
return offset; return offset;


@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
struct ppa_addr ppa = ppa_list[i]; struct ppa_addr ppa = ppa_list[i];
struct pblk_line *line; struct pblk_line *line;
line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; line = &pblk->lines[pblk_ppa_to_line(ppa)];
kref_put(&line->ref, pblk_line_put_wq); kref_put(&line->ref, pblk_line_put_wq);
} }
} }
@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio)
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
bool put_line) bool put_line)
{ {
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio; struct bio *bio = rqd->bio;
unsigned long start_time = r_ctx->start_time;
generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
if (rqd->error) if (rqd->error)
pblk_log_read_err(pblk, rqd); pblk_log_read_err(pblk, rqd);
@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
__pblk_end_io_read(pblk, rqd, true); __pblk_end_io_read(pblk, rqd, true);
} }
static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int bio_init_idx, unsigned int bio_init_idx,
unsigned long *read_bitmap) unsigned long *read_bitmap)
{ {
struct bio *new_bio, *bio = rqd->bio; struct bio *new_bio, *bio = rqd->bio;
struct pblk_sec_meta *meta_list = rqd->meta_list; struct pblk_sec_meta *meta_list = rqd->meta_list;
@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
i = 0; i = 0;
hole = find_first_zero_bit(read_bitmap, nr_secs); hole = find_first_zero_bit(read_bitmap, nr_secs);
do { do {
int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
struct pblk_line *line = &pblk->lines[line_id]; struct pblk_line *line = &pblk->lines[line_id];
kref_put(&line->ref, pblk_line_put); kref_put(&line->ref, pblk_line_put);
@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
return NVM_IO_OK; return NVM_IO_OK;
err: err:
pr_err("pblk: failed to perform partial read\n");
/* Free allocated pages in new bio */ /* Free allocated pages in new bio */
pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
__pblk_end_io_read(pblk, rqd, false); __pblk_end_io_read(pblk, rqd, false);
@ -357,6 +363,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
int pblk_submit_read(struct pblk *pblk, struct bio *bio) int pblk_submit_read(struct pblk *pblk, struct bio *bio)
{ {
struct nvm_tgt_dev *dev = pblk->dev; struct nvm_tgt_dev *dev = pblk->dev;
struct request_queue *q = dev->q;
sector_t blba = pblk_get_lba(bio); sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio); unsigned int nr_secs = pblk_get_secs(bio);
struct pblk_g_ctx *r_ctx; struct pblk_g_ctx *r_ctx;
@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
return NVM_IO_ERR; return NVM_IO_ERR;
} }
generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
bitmap_zero(&read_bitmap, nr_secs); bitmap_zero(&read_bitmap, nr_secs);
rqd = pblk_alloc_rqd(pblk, PBLK_READ); rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->end_io = pblk_end_io_read; rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd); r_ctx = nvm_rq_to_pdu(rqd);
r_ctx->start_time = jiffies;
r_ctx->lba = blba; r_ctx->lba = blba;
/* Save the index for this bio's start. This is needed in case /* Save the index for this bio's start. This is needed in case
@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
if (!int_bio) { if (!int_bio) {
pr_err("pblk: could not clone read bio\n"); pr_err("pblk: could not clone read bio\n");
return NVM_IO_ERR; goto fail_end_io;
} }
rqd->bio = int_bio; rqd->bio = int_bio;
@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
pr_err("pblk: read IO submission failed\n"); pr_err("pblk: read IO submission failed\n");
if (int_bio) if (int_bio)
bio_put(int_bio); bio_put(int_bio);
return ret; goto fail_end_io;
} }
return NVM_IO_OK; return NVM_IO_OK;
@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
/* The read bio request could be partially filled by the write buffer, /* The read bio request could be partially filled by the write buffer,
* but there are some holes that need to be read from the drive. * but there are some holes that need to be read from the drive.
*/ */
ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
if (ret) {
pr_err("pblk: failed to perform partial read\n");
return ret;
}
return NVM_IO_OK;
fail_rqd_free: fail_rqd_free:
pblk_free_rqd(pblk, rqd, PBLK_READ); pblk_free_rqd(pblk, rqd, PBLK_READ);
return ret; return ret;
fail_end_io:
__pblk_end_io_read(pblk, rqd, false);
return ret;
} }
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,


@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
return 0; return 0;
} }
__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
{ {
u32 crc; u32 crc;
crc = pblk_calc_emeta_crc(pblk, emeta_buf); crc = pblk_calc_emeta_crc(pblk, emeta_buf);
if (le32_to_cpu(emeta_buf->crc) != crc) if (le32_to_cpu(emeta_buf->crc) != crc)
return NULL; return 1;
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
-return NULL;
+return 1;
-return emeta_to_lbas(pblk, emeta_buf);
+return 0;
}
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
@@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
u64 nr_valid_lbas, nr_lbas = 0;
u64 i;
-lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
+lba_list = emeta_to_lbas(pblk, emeta_buf);
if (!lba_list)
return 1;
@@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
struct ppa_addr ppa;
int pos;
-ppa = addr_to_pblk_ppa(pblk, i, line->id);
+ppa = addr_to_gen_ppa(pblk, i, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
/* Do not update bad blocks */
@@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
-nr_bb * geo->sec_per_blk;
+nr_bb * geo->sec_per_chk;
}
struct pblk_recov_alloc {
@@ -263,12 +263,12 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
int pos;
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
r_ptr_int += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
@@ -288,7 +288,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
/* At this point, the read should not fail. If it does, it is a problem
* we cannot recover from here. Need FTL log.
*/
-if (rqd->error) {
+if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
return -EINTR;
}
@@ -411,12 +411,12 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
int pos;
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
-ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
-ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
}
@@ -541,12 +541,12 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
@@ -672,12 +672,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
paddr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
-pos = pblk_dev_ppa_to_pos(geo, ppa);
+pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
@@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
while (emeta_secs) {
emeta_start--;
-ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id);
+ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
if (!test_bit(pos, line->blk_bitmap))
emeta_secs--;
@@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
goto next;
}
+if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
+pblk_recov_l2p_from_oob(pblk, line);
+goto next;
+}
+
if (pblk_recov_l2p_from_emeta(pblk, line))
pblk_recov_l2p_from_oob(pblk, line);
@@ -984,10 +989,8 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
}
spin_unlock(&l_mg->free_lock);
-if (is_next) {
+if (is_next)
pblk_line_erase(pblk, l_mg->data_next);
-pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
-}
out:
if (found_lines != recovered_lines)
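Aside for readers following the recovery changes above: the emeta sanity check now runs up front, and L2P reconstruction falls back to the slower out-of-band scan whenever that check or the emeta-based rebuild fails. The standalone sketch below only illustrates that fallback order; its stub functions are invented for the example and are not pblk symbols.

#include <stdio.h>

static int emeta_is_valid(int line)         { return line % 2; }      /* stub check */
static int rebuild_l2p_from_emeta(int line) { return line == 3; }     /* stub: nonzero = failed */
static void rebuild_l2p_from_oob(int line)  { printf("line %d: scan OOB\n", line); }

static void recover_line(int line)
{
	if (!emeta_is_valid(line)) {        /* bad magic/CRC: emeta unusable */
		rebuild_l2p_from_oob(line);
		return;
	}
	if (rebuild_l2p_from_emeta(line))   /* emeta readable but inconsistent */
		rebuild_l2p_from_oob(line);
}

int main(void)
{
	for (int line = 0; line < 4; line++)
		recover_line(line);
	return 0;
}

Built as plain C this just prints which fake lines would take the OOB path.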


@@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
return atomic_read(&rl->free_blocks);
}
-/*
-* We check for (i) the number of free blocks in the current LUN and (ii) the
-* total number of free blocks in the pblk instance. This is to even out the
-* number of free blocks on each LUN when GC kicks in.
-*
-* Only the total number of free blocks is used to configure the rate limiter.
-*/
-void pblk_rl_update_rates(struct pblk_rl *rl)
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
+{
+return atomic_read(&rl->free_user_blocks);
+}
+
+static void __pblk_rl_update_rates(struct pblk_rl *rl,
+unsigned long free_blocks)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
-unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
int max = rl->rb_budget;
if (free_blocks >= rl->high) {
@@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl)
pblk_gc_should_stop(pblk);
}
+void pblk_rl_update_rates(struct pblk_rl *rl)
+{
+__pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
+}
+
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
int blk_in_line = atomic_read(&line->blk_in_line);
+int free_blocks;
atomic_add(blk_in_line, &rl->free_blocks);
-pblk_rl_update_rates(rl);
+free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
+
+__pblk_rl_update_rates(rl, free_blocks);
}
-void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+bool used)
{
int blk_in_line = atomic_read(&line->blk_in_line);
+int free_blocks;
atomic_sub(blk_in_line, &rl->free_blocks);
-pblk_rl_update_rates(rl);
+
+if (used)
+free_blocks = atomic_sub_return(blk_in_line,
+&rl->free_user_blocks);
+else
+free_blocks = atomic_read(&rl->free_user_blocks);
+
+__pblk_rl_update_rates(rl, free_blocks);
}
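The rate-limiter changes above split the accounting in two: free_blocks still counts every free block including the over-provisioned ones, while free_user_blocks is what feeds the limiter and only shrinks when a freed line actually held user data. A minimal userspace model of that bookkeeping, with C11 atomics and invented names, could look like this:

#include <stdatomic.h>
#include <stdio.h>

static atomic_long free_blocks;       /* all free blocks, OP included */
static atomic_long free_user_blocks;  /* blocks the user rate limiter may count */

static void update_rates(long user_free) { printf("rate input: %ld\n", user_free); }

static void lines_inc(long blks)
{
	atomic_fetch_add(&free_blocks, blks);
	update_rates(atomic_fetch_add(&free_user_blocks, blks) + blks);
}

static void lines_dec(long blks, int used_for_user_data)
{
	long user_free;

	atomic_fetch_sub(&free_blocks, blks);
	if (used_for_user_data)
		user_free = atomic_fetch_sub(&free_user_blocks, blks) - blks;
	else
		user_free = atomic_load(&free_user_blocks);
	update_rates(user_free);
}

int main(void)
{
	lines_inc(64);
	lines_dec(64, 1);   /* line held user data: shrink the user pool */
	lines_dec(0, 0);    /* metadata-only line: user pool unchanged */
	return 0;
}

The bool used argument of pblk_rl_free_lines_dec() plays the role of used_for_user_data here.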
int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl)
void pblk_rl_init(struct pblk_rl *rl, int budget)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
+struct nvm_tgt_dev *dev = pblk->dev;
+struct nvm_geo *geo = &dev->geo;
+struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
+int sec_meta, blk_meta;
unsigned int rb_windows;
-rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
-rl->high_pw = get_count_order(rl->high);
-rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
-if (rl->low < min_blocks)
-rl->low = min_blocks;
+/* Consider sectors used for metadata */
+sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
+blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
+
+rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
+rl->high_pw = get_count_order(rl->high);
rl->rsv_blocks = min_blocks;
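pblk_rl_init() above now derives the high watermark from the over-provisioning budget minus an estimate of the blocks consumed by per-line metadata, instead of a fixed fraction of all blocks. A back-of-the-envelope version of that arithmetic, with every number invented, is:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	long smeta_sec = 8, emeta_sec0 = 64;   /* metadata sectors per line */
	long nr_free_lines = 100, sec_per_chk = 4096;
	long op_blks = 1300, blk_per_line = 128;

	long sec_meta = (smeta_sec + emeta_sec0) * nr_free_lines;
	long blk_meta = DIV_ROUND_UP(sec_meta, sec_per_chk);
	long high = op_blks - blk_meta - blk_per_line;

	printf("metadata blocks reserved: %ld, rate-limiter high mark: %ld\n",
	       blk_meta, high);
	return 0;
}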


@@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
ssize_t sz = 0;
int i;
-for (i = 0; i < geo->nr_luns; i++) {
+for (i = 0; i < geo->all_luns; i++) {
int active = 1;
rlun = &pblk->luns[i];
@@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
{
-int free_blocks, total_blocks;
+int free_blocks, free_user_blocks, total_blocks;
int rb_user_max, rb_user_cnt;
int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
-free_blocks = atomic_read(&pblk->rl.free_blocks);
+free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
+free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
rb_user_max = pblk->rl.rb_user_max;
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
rb_gc_max = pblk->rl.rb_gc_max;
@@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
total_blocks = pblk->rl.total_blocks;
return snprintf(page, PAGE_SIZE,
-"u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+"u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
rb_user_cnt,
rb_user_max,
rb_gc_cnt,
rb_gc_max,
rb_state,
rb_budget,
-pblk->rl.low,
pblk->rl.high,
free_blocks,
+free_user_blocks,
total_blocks,
READ_ONCE(pblk->rl.rb_user_active));
}
@@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
sz = snprintf(page, PAGE_SIZE - sz,
"line: nluns:%d, nblks:%d, nsecs:%d\n",
-geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+geo->all_luns, lm->blk_per_line, lm->sec_per_line);
sz += snprintf(page + sz, PAGE_SIZE - sz,
"lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
@@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
"blk_line:%d, sec_line:%d, sec_blk:%d\n",
lm->blk_per_line,
lm->sec_per_line,
-geo->sec_per_blk);
+geo->sec_per_chk);
return sz;
}


@@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct bio *original_bio;
+struct pblk_rb *rwb = &pblk->rwb;
unsigned long ret;
int i;
for (i = 0; i < c_ctx->nr_valid; i++) {
struct pblk_w_ctx *w_ctx;
-w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
+int pos = c_ctx->sentry + i;
+int flags;
+
+w_ctx = pblk_rb_w_ctx(rwb, pos);
+flags = READ_ONCE(w_ctx->flags);
+
+if (flags & PBLK_FLUSH_ENTRY) {
+flags &= ~PBLK_FLUSH_ENTRY;
+/* Release flags on context. Protect from writes */
+smp_store_release(&w_ctx->flags, flags);
+#ifdef CONFIG_NVM_DEBUG
+atomic_dec(&rwb->inflight_flush_point);
+#endif
+}
+
while ((original_bio = bio_list_pop(&w_ctx->bios)))
bio_endio(original_bio);
}
@@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
struct pblk_line *meta_line;
int err;
-ppa_set_empty(&erase_ppa);
+pblk_ppa_set_empty(&erase_ppa);
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
@@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
return NVM_IO_ERR;
}
-if (!ppa_empty(erase_ppa)) {
+if (!pblk_ppa_empty(erase_ppa)) {
/* Submit erase for next data line */
if (pblk_blk_erase_async(pblk, erase_ppa)) {
struct pblk_line *e_line = pblk_line_get_erase(pblk);
@@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk)
if (!secs_avail)
return 1;
-secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;


@@ -51,17 +51,16 @@
#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
-#define pblk_for_each_lun(pblk, rlun, i) \
-for ((i) = 0, rlun = &(pblk)->luns[0]; \
-(i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
/* Static pool sizes */
#define PBLK_GEN_WS_POOL_SIZE (2)
+#define PBLK_DEFAULT_OP (11)
enum {
PBLK_READ = READ,
PBLK_WRITE = WRITE,/* Write from write buffer */
PBLK_WRITE_INT, /* Internal write - no write buffer */
+PBLK_READ_RECOV, /* Recovery read - errors allowed */
PBLK_ERASE,
};
@@ -114,6 +113,7 @@ struct pblk_c_ctx {
/* read context */
struct pblk_g_ctx {
void *private;
+unsigned long start_time;
u64 lba;
};
@@ -170,7 +170,7 @@ struct pblk_rb {
* the last submitted entry that has
* been successfully persisted to media
*/
-unsigned int sync_point; /* Sync point - last entry that must be
+unsigned int flush_point; /* Sync point - last entry that must be
* flushed to the media. Used with
* REQ_FLUSH and REQ_FUA
*/
@@ -193,7 +193,7 @@ struct pblk_rb {
spinlock_t s_lock; /* Sync lock */
#ifdef CONFIG_NVM_DEBUG
-atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
+atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
#endif
};
@@ -256,9 +256,6 @@ struct pblk_rl {
unsigned int high; /* Upper threshold for rate limiter (free run -
* user I/O rate limiter
*/
-unsigned int low; /* Lower threshold for rate limiter (user I/O
-* rate limiter - stall)
-*/
unsigned int high_pw; /* High rounded up as a power of 2 */
#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
@@ -292,7 +289,9 @@ struct pblk_rl {
unsigned long long nr_secs;
unsigned long total_blocks;
-atomic_t free_blocks;
+
+atomic_t free_blocks; /* Total number of free blocks (+ OP) */
+atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
};
#define PBLK_LINE_EMPTY (~0U)
@@ -583,7 +582,9 @@ struct pblk {
*/
sector_t capacity; /* Device capacity when bad blocks are subtracted */
-int over_pct; /* Percentage of device used for over-provisioning */
+
+int op; /* Percentage of device used for over-provisioning */
+int op_blks; /* Number of blocks used for over-provisioning */
/* pblk provisioning values. Used by rate limiter */
struct pblk_rl rl;
@@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
struct ppa_addr *ppa);
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
-unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
unsigned int pblk_rb_read_count(struct pblk_rb *rb);
unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
@@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
void pblk_submit_rec(struct work_struct *work);
struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
int pblk_recov_pad(struct pblk *pblk);
-__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
struct pblk_rec_ctx *recovery, u64 *comp_bits,
unsigned int comp);
@@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl);
void pblk_rl_update_rates(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl);
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
@@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
int pblk_rl_max_io(struct pblk_rl *rl);
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
-void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+bool used);
int pblk_rl_is_limit(struct pblk_rl *rl);
/*
@@ -907,15 +910,10 @@ static inline int pblk_pad_distance(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
-return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
}
-static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
-{
-return p.g.blk;
-}
-
-static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_line(struct ppa_addr p)
{
return p.g.blk;
}
@@ -925,10 +923,34 @@ static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
return p.g.lun * geo->nr_chnls + p.g.ch;
}
-/* A block within a line corresponds to the lun */
-static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+u64 line_id)
{
-return p.g.lun * geo->nr_chnls + p.g.ch;
+struct ppa_addr ppa;
+
+ppa.ppa = 0;
+ppa.g.blk = line_id;
+ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+return ppa;
+}
+
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+struct ppa_addr p)
+{
+u64 paddr;
+
+paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
+paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+return paddr;
}
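addr_to_gen_ppa() and pblk_dev_ppa_to_line_addr() above are inverse mask-and-shift packings between a line-relative address and the generic ppa fields. The sketch below shows the same round-trip idea with a made-up three-field layout; the field names and widths are assumptions for the example, not the real ppaf format.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fmt { int sec_off, sec_bits, pl_off, pl_bits, ch_off, ch_bits; };

static void paddr_to_fields(const struct fmt *f, uint64_t paddr,
			    unsigned *sec, unsigned *pl, unsigned *ch)
{
	*sec = (paddr >> f->sec_off) & ((1u << f->sec_bits) - 1);
	*pl  = (paddr >> f->pl_off)  & ((1u << f->pl_bits)  - 1);
	*ch  = (paddr >> f->ch_off)  & ((1u << f->ch_bits)  - 1);
}

static uint64_t fields_to_paddr(const struct fmt *f, unsigned sec, unsigned pl,
				unsigned ch)
{
	return ((uint64_t)sec << f->sec_off) |
	       ((uint64_t)pl  << f->pl_off)  |
	       ((uint64_t)ch  << f->ch_off);
}

int main(void)
{
	/* invented layout: 2 sector bits, 1 plane bit, 4 channel bits */
	struct fmt f = { 0, 2, 2, 1, 3, 4 };
	unsigned sec, pl, ch;
	uint64_t paddr = 0x5b;

	paddr_to_fields(&f, paddr, &sec, &pl, &ch);
	assert(fields_to_paddr(&f, sec, pl, ch) == paddr);
	printf("paddr 0x%llx <-> sec %u pl %u ch %u\n",
	       (unsigned long long)paddr, sec, pl, ch);
	return 0;
}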
static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
@@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
return ppa64;
}
-static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
-sector_t lba)
-{
-struct ppa_addr ppa;
-
-if (pblk->ppaf_bitsize < 32) {
-u32 *map = (u32 *)pblk->trans_map;
-
-ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
-} else {
-struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
-
-ppa = map[lba];
-}
-
-return ppa;
-}
-
static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
{
u32 ppa32 = 0;
@@ -999,6 +1003,24 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
return ppa32;
}
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+sector_t lba)
+{
+struct ppa_addr ppa;
+
+if (pblk->ppaf_bitsize < 32) {
+u32 *map = (u32 *)pblk->trans_map;
+
+ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
+} else {
+struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
+
+ppa = map[lba];
+}
+
+return ppa;
+}
+
static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
struct ppa_addr ppa)
{
@@ -1013,21 +1035,6 @@ static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
}
}
-static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
-struct ppa_addr p)
-{
-u64 paddr;
-
-paddr = 0;
-paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
-paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
-paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
-paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
-paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
-
-return paddr;
-}
-
static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
{
return (ppa_addr.ppa == ADDR_EMPTY);
@@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
{
-if (lppa.ppa == rppa.ppa)
-return true;
-
-return false;
+return (lppa.ppa == rppa.ppa);
}
static inline int pblk_addr_in_cache(struct ppa_addr ppa)
@@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
return p;
}
-static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
-u64 line_id)
-{
-struct ppa_addr ppa;
-
-ppa.ppa = 0;
-ppa.g.blk = line_id;
-ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
-ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
-ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
-ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
-ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
-
-return ppa;
-}
-
-static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
-u64 line_id)
-{
-struct ppa_addr ppa;
-
-ppa = addr_to_gen_ppa(pblk, paddr, line_id);
-
-return ppa;
-}
-
static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
struct line_header *header)
{
@@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
if (!ppa->c.is_cached &&
ppa->g.ch < geo->nr_chnls &&
-ppa->g.lun < geo->luns_per_chnl &&
+ppa->g.lun < geo->nr_luns &&
ppa->g.pl < geo->nr_planes &&
-ppa->g.blk < geo->blks_per_lun &&
-ppa->g.pg < geo->pgs_per_blk &&
+ppa->g.blk < geo->nr_chks &&
+ppa->g.pg < geo->ws_per_chk &&
ppa->g.sec < geo->sec_per_pg)
continue;
@@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
for (i = 0; i < rqd->nr_ppas; i++) {
ppa = ppa_list[i];
-line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+line = &pblk->lines[pblk_ppa_to_line(ppa)];
spin_lock(&line->lock);
if (line->state != PBLK_LINESTATE_OPEN) {
@@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
}
-static inline sector_t pblk_get_sector(sector_t lba)
-{
-return lba * NR_PHY_IN_LOG;
-}
-
static inline void pblk_setup_uuid(struct pblk *pblk)
{
uuid_le uuid;

File diff suppressed because it is too large


@@ -1,290 +0,0 @@
/*
* Copyright (C) 2015 IT University of Copenhagen
* Initial release: Matias Bjorling <m@bjorling.me>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
*/
#ifndef RRPC_H_
#define RRPC_H_
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/vmalloc.h>
#include <linux/lightnvm.h>
/* Run only GC if less than 1/X blocks are free */
#define GC_LIMIT_INVERSE 10
#define GC_TIME_SECS 100
#define RRPC_SECTOR (512)
#define RRPC_EXPOSED_PAGE_SIZE (4096)
#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR)
struct rrpc_inflight {
struct list_head reqs;
spinlock_t lock;
};
struct rrpc_inflight_rq {
struct list_head list;
sector_t l_start;
sector_t l_end;
};
struct rrpc_rq {
struct rrpc_inflight_rq inflight_rq;
unsigned long flags;
};
struct rrpc_block {
int id; /* id inside of LUN */
struct rrpc_lun *rlun;
struct list_head prio; /* LUN CG list */
struct list_head list; /* LUN free, used, bb list */
#define MAX_INVALID_PAGES_STORAGE 8
/* Bitmap for invalid page intries */
unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
/* points to the next writable page within a block */
unsigned int next_page;
/* number of pages that are invalid, wrt host page size */
unsigned int nr_invalid_pages;
int state;
spinlock_t lock;
atomic_t data_cmnt_size; /* data pages committed to stable storage */
};
struct rrpc_lun {
struct rrpc *rrpc;
int id;
struct ppa_addr bppa;
struct rrpc_block *cur, *gc_cur;
struct rrpc_block *blocks; /* Reference to block allocation */
struct list_head prio_list; /* Blocks that may be GC'ed */
struct list_head wblk_list; /* Queued blocks to be written to */
/* lun block lists */
struct list_head used_list; /* In-use blocks */
struct list_head free_list; /* Not used blocks i.e. released
* and ready for use
*/
struct list_head bb_list; /* Bad blocks. Mutually exclusive with
* free_list and used_list
*/
unsigned int nr_free_blocks; /* Number of unused blocks */
struct work_struct ws_gc;
int reserved_blocks;
spinlock_t lock;
};
struct rrpc {
struct nvm_tgt_dev *dev;
struct gendisk *disk;
sector_t soffset; /* logical sector offset */
int nr_luns;
struct rrpc_lun *luns;
/* calculated values */
unsigned long long nr_sects;
/* Write strategy variables. Move these into each for structure for each
* strategy
*/
atomic_t next_lun; /* Whenever a page is written, this is updated
* to point to the next write lun
*/
spinlock_t bio_lock;
struct bio_list requeue_bios;
struct work_struct ws_requeue;
/* Simple translation map of logical addresses to physical addresses.
* The logical addresses is known by the host system, while the physical
* addresses are used when writing to the disk block device.
*/
struct rrpc_addr *trans_map;
/* also store a reverse map for garbage collection */
struct rrpc_rev_addr *rev_trans_map;
spinlock_t rev_lock;
struct rrpc_inflight inflights;
mempool_t *addr_pool;
mempool_t *page_pool;
mempool_t *gcb_pool;
mempool_t *rq_pool;
struct timer_list gc_timer;
struct workqueue_struct *krqd_wq;
struct workqueue_struct *kgc_wq;
};
struct rrpc_block_gc {
struct rrpc *rrpc;
struct rrpc_block *rblk;
struct work_struct ws_gc;
};
/* Logical to physical mapping */
struct rrpc_addr {
u64 addr;
struct rrpc_block *rblk;
};
/* Physical to logical mapping */
struct rrpc_rev_addr {
u64 addr;
};
static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
struct ppa_addr r)
{
struct ppa_addr l;
int secs, pgs;
sector_t ppa = r.ppa;
l.ppa = 0;
div_u64_rem(ppa, geo->sec_per_pg, &secs);
l.g.sec = secs;
sector_div(ppa, geo->sec_per_pg);
div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
l.g.pg = pgs;
return l;
}
static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
{
return linear_to_generic_addr(&dev->geo, pba);
}
static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
{
struct nvm_tgt_dev *dev = rrpc->dev;
struct nvm_geo *geo = &dev->geo;
struct rrpc_lun *rlun = rblk->rlun;
return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
}
static inline sector_t rrpc_get_laddr(struct bio *bio)
{
return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
}
static inline unsigned int rrpc_get_pages(struct bio *bio)
{
return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
}
static inline sector_t rrpc_get_sector(sector_t laddr)
{
return laddr * NR_PHY_IN_LOG;
}
static inline int request_intersects(struct rrpc_inflight_rq *r,
sector_t laddr_start, sector_t laddr_end)
{
return (laddr_end >= r->l_start) && (laddr_start <= r->l_end);
}
static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
unsigned int pages, struct rrpc_inflight_rq *r)
{
sector_t laddr_end = laddr + pages - 1;
struct rrpc_inflight_rq *rtmp;
WARN_ON(irqs_disabled());
spin_lock_irq(&rrpc->inflights.lock);
list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) {
if (unlikely(request_intersects(rtmp, laddr, laddr_end))) {
/* existing, overlapping request, come back later */
spin_unlock_irq(&rrpc->inflights.lock);
return 1;
}
}
r->l_start = laddr;
r->l_end = laddr_end;
list_add_tail(&r->list, &rrpc->inflights.reqs);
spin_unlock_irq(&rrpc->inflights.lock);
return 0;
}
static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
unsigned int pages,
struct rrpc_inflight_rq *r)
{
BUG_ON((laddr + pages) > rrpc->nr_sects);
return __rrpc_lock_laddr(rrpc, laddr, pages, r);
}
static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd)
{
struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
return &rrqd->inflight_rq;
}
static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
struct nvm_rq *rqd)
{
sector_t laddr = rrpc_get_laddr(bio);
unsigned int pages = rrpc_get_pages(bio);
struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
return rrpc_lock_laddr(rrpc, laddr, pages, r);
}
static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
struct rrpc_inflight_rq *r)
{
unsigned long flags;
spin_lock_irqsave(&rrpc->inflights.lock, flags);
list_del_init(&r->list);
spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
}
static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
{
struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
uint8_t pages = rqd->nr_ppas;
BUG_ON((r->l_start + pages) > rrpc->nr_sects);
rrpc_unlock_laddr(rrpc, r);
}
#endif /* RRPC_H_ */


@@ -525,15 +525,21 @@ struct open_bucket {
/*
* We keep multiple buckets open for writes, and try to segregate different
-* write streams for better cache utilization: first we look for a bucket where
-* the last write to it was sequential with the current write, and failing that
-* we look for a bucket that was last used by the same task.
+* write streams for better cache utilization: first we try to segregate flash
+* only volume write streams from cached devices, secondly we look for a bucket
+* where the last write to it was sequential with the current write, and
+* failing that we look for a bucket that was last used by the same task.
*
* The ideas is if you've got multiple tasks pulling data into the cache at the
* same time, you'll get better cache utilization if you try to segregate their
* data and preserve locality.
*
-* For example, say you've starting Firefox at the same time you're copying a
+* For example, dirty sectors of flash only volume is not reclaimable, if their
+* dirty sectors mixed with dirty sectors of cached device, such buckets will
+* be marked as dirty and won't be reclaimed, though the dirty data of cached
+* device have been written back to backend device.
+*
+* And say you've starting Firefox at the same time you're copying a
* bunch of files. Firefox will likely end up being fairly hot and stay in the
* cache awhile, but the data you copied might not be; if you wrote all that
* data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket *ret, *ret_task = NULL;
list_for_each_entry_reverse(ret, &c->data_buckets, list)
-if (!bkey_cmp(&ret->key, search))
+if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
+UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
+continue;
+else if (!bkey_cmp(&ret->key, search))
goto found;
else if (ret->last_write_point == write_point)
ret_task = ret;
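The new pick_data_bucket() check above keeps open buckets for flash-only volumes and for cached devices apart before the older same-stream / same-task preferences apply. A rough standalone model of that selection order, with invented structs and string keys standing in for bkeys:

#include <stdio.h>
#include <string.h>

struct bucket { int flash_only; const char *stream; unsigned long task; };

static struct bucket *pick(struct bucket *b, int n, int flash_only,
			   const char *stream, unsigned long task)
{
	struct bucket *same_task = NULL;

	for (int i = n - 1; i >= 0; i--) {           /* newest first, like the LRU walk */
		if (b[i].flash_only != flash_only)   /* never mix the two classes */
			continue;
		if (!strcmp(b[i].stream, stream))    /* sequential with a previous write */
			return &b[i];
		if (b[i].task == task)               /* fall back to same submitter */
			same_task = &b[i];
	}
	return same_task;                            /* may be NULL: allocate a new bucket */
}

int main(void)
{
	struct bucket open[] = {
		{ 1, "volA", 10 }, { 0, "fileB", 11 }, { 0, "fileC", 10 },
	};
	struct bucket *ret = pick(open, 3, 0, "fileB", 10);

	printf("picked stream %s\n", ret ? ret->stream : "(new bucket)");
	return 0;
}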


@@ -320,15 +320,16 @@ struct cached_dev {
*/
atomic_t has_dirty;
+/*
+* Set to zero by things that touch the backing volume-- except
+* writeback. Incremented by writeback. Used to determine when to
+* accelerate idle writeback.
+*/
+atomic_t backing_idle;
+
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
-/*
-* Internal to the writeback code, so read_dirty() can keep track of
-* where it's at.
-*/
-sector_t last_read;
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
struct task_struct *writeback_thread;
@@ -336,6 +337,14 @@ struct cached_dev {
struct keybuf writeback_keys;
+/*
+* Order the write-half of writeback operations strongly in dispatch
+* order. (Maintain LBA order; don't allow reads completing out of
+* order to re-order the writes...)
+*/
+struct closure_waitlist writeback_ordering_wait;
+atomic_t writeback_sequence_next;
+
/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
int caches_loaded;
struct bcache_device **devices;
+unsigned devices_max_used;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
struct closure caching;
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
/* Forward declarations */
-void bch_count_io_errors(struct cache *, blk_status_t, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
blk_status_t, const char *);
void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,


@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
-if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
continue_at(cl, btree_node_write_done, NULL);
} else {
+/* No problem for multipage bvec since the bio is just allocated */
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
/* don't reclaim buckets to which writeback keys point */
rcu_read_lock();
-for (i = 0; i < c->nr_uuids; i++) {
+for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
struct cached_dev *dc;
struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
int bch_gc_thread_start(struct cache_set *c)
{
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
-if (IS_ERR(c->gc_thread))
-return PTR_ERR(c->gc_thread);
-
-return 0;
+return PTR_ERR_OR_ZERO(c->gc_thread);
}
/* Initial partial gc */


@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
#include "closure.h"
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
BUG_ON(flags & CLOSURE_GUARD_MASK);
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
-/* Must deliver precisely one wakeup */
-if (r == 1 && (flags & CLOSURE_SLEEPING))
-wake_up_process(cl->task);
-
if (!r) {
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
}
EXPORT_SYMBOL(closure_wait);
-/**
-* closure_sync - sleep until a closure has nothing left to wait on
-*
-* Sleeps until the refcount hits 1 - the thread that's running the closure owns
-* the last refcount.
-*/
-void closure_sync(struct closure *cl)
+struct closure_syncer {
+struct task_struct *task;
+int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
{
+cl->s->done = 1;
+wake_up_process(cl->s->task);
+}
+
+void __sched __closure_sync(struct closure *cl)
+{
+struct closure_syncer s = { .task = current };
+
+cl->s = &s;
+continue_at(cl, closure_sync_fn, NULL);
+
while (1) {
-__closure_start_sleep(cl);
-closure_set_ret_ip(cl);
-
-if ((atomic_read(&cl->remaining) &
-CLOSURE_REMAINING_MASK) == 1)
+set_current_state(TASK_UNINTERRUPTIBLE);
+if (s.done)
break;
schedule();
}
-__closure_end_sleep(cl);
+__set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(closure_sync);
+EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
cl, (void *) cl->ip, cl->fn, cl->parent,
r & CLOSURE_REMAINING_MASK);
-seq_printf(f, "%s%s%s%s\n",
+seq_printf(f, "%s%s\n",
test_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(&cl->work)) ? "Q" : "",
-r & CLOSURE_RUNNING ? "R" : "",
-r & CLOSURE_STACK ? "S" : "",
-r & CLOSURE_SLEEPING ? "Sl" : "");
+r & CLOSURE_RUNNING ? "R" : "");
if (r & CLOSURE_WAITING)
seq_printf(f, " W %pF\n",


@@ -103,6 +103,7 @@
*/
struct closure;
+struct closure_syncer;
typedef void (closure_fn) (struct closure *);
struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
* the thread that owns the closure, and cleared by the thread that's
* waking up the closure.
*
-* CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
-* - indicates that cl->task is valid and closure_put() may wake it up.
-* Only set or cleared by the thread that owns the closure.
-*
* The rest are for debugging and don't affect behaviour:
*
* CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
* continue_at() and closure_return() clear it for you, if you're doing
* something unusual you can use closure_set_dead() which also helps
* annotate where references are being transferred.
-*
-* CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
-* closure with this flag set
*/
-CLOSURE_BITS_START = (1 << 23),
-CLOSURE_DESTRUCTOR = (1 << 23),
-CLOSURE_WAITING = (1 << 25),
-CLOSURE_SLEEPING = (1 << 27),
-CLOSURE_RUNNING = (1 << 29),
-CLOSURE_STACK = (1 << 31),
+CLOSURE_BITS_START = (1U << 26),
+CLOSURE_DESTRUCTOR = (1U << 26),
+CLOSURE_WAITING = (1U << 28),
+CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
-((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
-CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
union {
struct {
struct workqueue_struct *wq;
-struct task_struct *task;
+struct closure_syncer *s;
struct llist_node list;
closure_fn *fn;
};
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void closure_sync(struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+* closure_sync - sleep until a closure has nothing left to wait on
+*
+* Sleeps until the refcount hits 1 - the thread that's running the closure owns
+* the last refcount.
+*/
+static inline void closure_sync(struct closure *cl)
+{
+if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+__closure_sync(cl);
+}
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
#endif
}
-static inline void __closure_end_sleep(struct closure *cl)
-{
-__set_current_state(TASK_RUNNING);
-
-if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
-atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
-}
-
-static inline void __closure_start_sleep(struct closure *cl)
-{
-closure_set_ip(cl);
-cl->task = current;
-set_current_state(TASK_UNINTERRUPTIBLE);
-
-if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
-atomic_add(CLOSURE_SLEEPING, &cl->remaining);
-}
-
static inline void closure_set_stopped(struct closure *cl)
{
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
-BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
static inline void closure_init_stack(struct closure *cl)
{
memset(cl, 0, sizeof(struct closure));
-atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
}
/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
+*
+* Note you are expected to immediately return after using this macro.
*/
#define continue_at(_cl, _fn, _wq) \
do { \


@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
return;
check->bi_opf = REQ_OP_READ;
-if (bio_alloc_pages(check, GFP_NOIO))
+if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
-int ret = 0;
-
debug = debugfs_create_dir("bcache", NULL);
-return ret;
+
+return IS_ERR_OR_NULL(debug);
}


@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
-void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
+void bch_count_io_errors(struct cache *ca,
+blk_status_t error,
+int is_read,
+const char *m)
{
/*
* The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
-pr_err("%s: IO error on %s, recovering",
-bdevname(ca->bdev, buf), m);
+pr_err("%s: IO error on %s%s",
+bdevname(ca->bdev, buf), m,
+is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
"%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
+int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
unsigned threshold = op_is_write(bio_op(bio))
? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
atomic_inc(&c->congested);
}
-bch_count_io_errors(ca, error, m);
+bch_count_io_errors(ca, error, is_read, m);
}
void bch_bbio_endio(struct cache_set *c, struct bio *bio,


@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_end_io = read_moving_endio;
-if (bio_alloc_pages(bio, GFP_KERNEL))
+if (bch_bio_alloc_pages(bio, GFP_KERNEL))
goto err;
trace_bcache_gc_copy(&w->key);


@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
{
struct search *s = container_of(cl, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
+struct cached_dev *dc;
int ret;
bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
return;
}
+/*
+* We might meet err when searching the btree, If that happens, we will
+* get negative ret, in this scenario we should not recover data from
+* backing device (when cache device is dirty) because we don't know
+* whether bkeys the read request covered are all clean.
+*
+* And after that happened, s->iop.status is still its initial value
+* before we submit s->bio.bio
+*/
+if (ret < 0) {
+BUG_ON(ret == -EINTR);
+if (s->d && s->d->c &&
+!UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
+dc = container_of(s->d, struct cached_dev, disk);
+if (dc && atomic_read(&dc->has_dirty))
+s->recoverable = false;
+}
+if (!s->iop.status)
+s->iop.status = BLK_STS_IOERR;
+}
+
closure_return(cl);
}
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
-struct request_queue *q = s->orig_bio->bi_disk->queue;
-generic_end_io_acct(q, bio_data_dir(s->orig_bio),
+generic_end_io_acct(s->d->disk->queue,
+bio_data_dir(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
cache_bio->bi_private = &s->cl;
bch_bio_map(cache_bio, NULL);
-if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);


@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
-struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned i;
bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
-bch_count_io_errors(ca, bio->bi_status, "writing superblock");
+/* is_read = 0 */
+bch_count_io_errors(ca, bio->bi_status, 0,
+"writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
d->c = c;
c->devices[id] = d;
+if (id >= c->devices_max_used)
+c->devices_max_used = id + 1;
+
closure_get(&c->caching);
}
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
+cancel_delayed_work_sync(&dc->writeback_rate_update);
+if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+kthread_stop(dc->writeback_thread);
+dc->writeback_thread = NULL;
+}
+
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
dc->bdev->bd_holder = dc;
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
-dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (cached_dev_init(dc, sb->block_size << 9))
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
struct uuid_entry *u;
for (u = c->uuids;
-u < c->uuids + c->nr_uuids && !ret;
+u < c->uuids + c->devices_max_used && !ret;
u++)
if (UUID_FLASH_ONLY(u))
ret = flash_dev_run(c, u);
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
mutex_lock(&bch_register_lock);
-for (i = 0; i < c->nr_uuids; i++)
+for (i = 0; i < c->devices_max_used; i++)
if (c->devices[i]) {
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
+c->devices_max_used = 0;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
-put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+put_page(bio_first_page_all(&ca->sb_bio));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ca->bdev->bd_holder = ca;
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
-ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (blk_queue_discard(bdev_get_queue(ca->bdev)))


@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
: 0;
}
+/*
+* Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
+* the preferred way is bio_add_page, but in this case, bch_bio_map()
+* supposes that the bvec table is empty, so it is safe to access
+* .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
+* supported.
+*/
void bch_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
+/**
+* bch_bio_alloc_pages - allocates a single page for each bvec in a bio
+* @bio: bio to allocate pages for
+* @gfp_mask: flags for allocation
+*
+* Allocates pages up to @bio->bi_vcnt.
+*
+* Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+* freed.
+*/
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+int i;
+struct bio_vec *bv;
+
+bio_for_each_segment_all(bv, bio, i) {
+bv->bv_page = alloc_page(gfp_mask);
+if (!bv->bv_page) {
+while (--bv >= bio->bi_io_vec)
+__free_page(bv->bv_page);
+return -ENOMEM;
+}
+}
+
+return 0;
+}
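bch_bio_alloc_pages() above commits to an all-or-nothing allocation: one page per bvec, with everything handed out so far released on the first failure. The same pattern in plain userspace C, as a hypothetical helper rather than the bcache function:

#include <stdio.h>
#include <stdlib.h>

static int alloc_all(void **slots, int n)
{
	for (int i = 0; i < n; i++) {
		slots[i] = malloc(4096);
		if (!slots[i]) {
			while (--i >= 0)     /* free everything handed out so far */
				free(slots[i]);
			return -1;           /* caller sees either all slots or none */
		}
	}
	return 0;
}

int main(void)
{
	void *pages[8];

	if (alloc_all(pages, 8) == 0) {
		printf("allocated all slots\n");
		for (int i = 0; i < 8; i++)
			free(pages[i]);
	}
	return 0;
}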
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)


@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch_bio_map(struct bio *bio, void *base);
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{

@ -18,17 +18,39 @@
#include <trace/events/bcache.h> #include <trace/events/bcache.h>
/* Rate limiting */ /* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
/*
* This is the size of the cache, minus the amount used for
* flash-only devices
*/
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
bcache_flash_devs_sectors_dirty(c);
/*
* Unfortunately there is no control of global dirty data. If the
* user states that they want 10% dirty data in the cache, and has,
* e.g., 5 backing volumes of equal size, we try and ensure each
* backing volume uses about 2% of the cache for dirty data.
*/
uint32_t bdev_share =
div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
c->cached_dev_sectors);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
/* Ensure each backing dev gets at least one dirty share */
if (bdev_share < 1)
bdev_share = 1;
return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
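
To make the fixed-point arithmetic above concrete: the per-device share is expressed in 1/16384ths (WRITEBACK_SHARE_SHIFT = 14), so multiplying the cache-wide dirty target by the share and shifting back down gives this backing device's slice of it. A standalone sketch with made-up sizes, not taken from any real configuration:

#include <stdint.h>
#include <stdio.h>

#define WRITEBACK_SHARE_SHIFT 14

int main(void)
{
	/* Hypothetical numbers, 512-byte sectors throughout. */
	uint64_t cache_sectors      = 1ULL << 31;  /* 1 TiB cache          */
	uint64_t cached_dev_sectors = 5ULL << 31;  /* 5 TiB of backing dev */
	uint64_t bdev_sectors       = 1ULL << 31;  /* this backing: 1 TiB  */
	unsigned writeback_percent  = 10;

	uint64_t cache_dirty_target = cache_sectors * writeback_percent / 100;
	uint64_t bdev_share = (bdev_sectors << WRITEBACK_SHARE_SHIFT) /
			      cached_dev_sectors;

	if (bdev_share < 1)
		bdev_share = 1;

	uint64_t target = (cache_dirty_target * bdev_share) >>
			  WRITEBACK_SHARE_SHIFT;

	/* ~10% of the cache overall, ~2% of it for this 1-of-5 device. */
	printf("dirty target for this device: %llu sectors (~%llu GiB)\n",
	       (unsigned long long)target, (unsigned long long)(target >> 21));
	return 0;
}
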
static void __update_writeback_rate(struct cached_dev *dc) static void __update_writeback_rate(struct cached_dev *dc)
{ {
struct cache_set *c = dc->disk.c;
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
c->cached_dev_sectors);
/* /*
* PI controller: * PI controller:
* Figures out the amount that should be written per second. * Figures out the amount that should be written per second.
@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
* This acts as a slow, long-term average that is not subject to * This acts as a slow, long-term average that is not subject to
* variations in usage like the p term. * variations in usage like the p term.
*/ */
int64_t target = __calc_target_rate(dc);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t error = dirty - target; int64_t error = dirty - target;
int64_t proportional_scaled = int64_t proportional_scaled =
@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
struct dirty_io { struct dirty_io {
struct closure cl; struct closure cl;
struct cached_dev *dc; struct cached_dev *dc;
uint16_t sequence;
struct bio bio; struct bio bio;
}; };
@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
{ {
struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private; struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
uint16_t next_sequence;
if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
/* Not our turn to write; wait for a write to complete */
closure_wait(&dc->writeback_ordering_wait, cl);
if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
/*
* Edge case-- it happened in indeterminate order
* relative to when we were added to wait list..
*/
closure_wake_up(&dc->writeback_ordering_wait);
}
continue_at(cl, write_dirty, io->dc->writeback_write_wq);
return;
}
next_sequence = io->sequence + 1;
/* /*
* IO errors are signalled using the dirty bit on the key. * IO errors are signalled using the dirty bit on the key.
@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl); closure_bio_submit(&io->bio, cl);
} }
atomic_set(&dc->writeback_sequence_next, next_sequence);
closure_wake_up(&dc->writeback_ordering_wait);
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
} }
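
The sequence/wait dance added above keeps dirty writes hitting the backing device in dispatch order: each dirty_io carries a ticket, and only the holder of writeback_sequence_next may issue its write before bumping the counter and waking the rest. The kernel implements this with closures and a wait list; the following is only a userspace model of the ordering rule, using a mutex and condition variable:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static unsigned next_sequence;

/* Block until it is this ticket's turn, "issue" the write, then pass
 * the turn to the next ticket and wake any waiters. */
static void ordered_issue(unsigned sequence)
{
	pthread_mutex_lock(&lock);
	while (next_sequence != sequence)
		pthread_cond_wait(&cond, &lock);
	printf("issuing writeback %u\n", sequence);
	next_sequence = sequence + 1;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	ordered_issue((unsigned)(unsigned long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	long i;

	/* Start in reverse; output order is still 0, 1, 2, 3. */
	for (i = 3; i >= 0; i--)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}
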
@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private; struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private; struct dirty_io *io = w->private;
/* is_read = 1 */
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
bio->bi_status, "reading dirty data from cache"); bio->bi_status, 1,
"reading dirty data from cache");
dirty_endio(bio); dirty_endio(bio);
} }
@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
static void read_dirty(struct cached_dev *dc) static void read_dirty(struct cached_dev *dc)
{ {
unsigned delay = 0; unsigned delay = 0;
struct keybuf_key *w; struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
size_t size;
int nk, i;
struct dirty_io *io; struct dirty_io *io;
struct closure cl; struct closure cl;
uint16_t sequence = 0;
BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
atomic_set(&dc->writeback_sequence_next, sequence);
closure_init_stack(&cl); closure_init_stack(&cl);
/* /*
@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
* mempools. * mempools.
*/ */
while (!kthread_should_stop()) { next = bch_keybuf_next(&dc->writeback_keys);
w = bch_keybuf_next(&dc->writeback_keys); while (!kthread_should_stop() && next) {
if (!w) size = 0;
break; nk = 0;
BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); do {
BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
if (KEY_START(&w->key) != dc->last_read || /*
jiffies_to_msecs(delay) > 50) * Don't combine too many operations, even if they
while (!kthread_should_stop() && delay) * are all small.
delay = schedule_timeout_interruptible(delay); */
if (nk >= MAX_WRITEBACKS_IN_PASS)
break;
dc->last_read = KEY_OFFSET(&w->key); /*
* If the current operation is very large, don't
* further combine operations.
*/
if (size >= MAX_WRITESIZE_IN_PASS)
break;
io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) /*
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), * Operations are only eligible to be combined
GFP_KERNEL); * if they are contiguous.
if (!io) *
goto err; * TODO: add a heuristic willing to fire a
* certain amount of non-contiguous IO per pass,
* so that we can benefit from backing device
* command queueing.
*/
if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
&START_KEY(&next->key)))
break;
w->private = io; size += KEY_SIZE(&next->key);
io->dc = dc; keys[nk++] = next;
} while ((next = bch_keybuf_next(&dc->writeback_keys)));
dirty_init(w); /* Now we have gathered a set of 1..5 keys to write back. */
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); for (i = 0; i < nk; i++) {
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); w = keys[i];
bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
io->bio.bi_end_io = read_dirty_endio;
if (bio_alloc_pages(&io->bio, GFP_KERNEL)) io = kzalloc(sizeof(struct dirty_io) +
goto err_free; sizeof(struct bio_vec) *
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
GFP_KERNEL);
if (!io)
goto err;
trace_bcache_writeback(&w->key); w->private = io;
io->dc = dc;
io->sequence = sequence++;
down(&dc->in_flight); dirty_init(w);
closure_call(&io->cl, read_dirty_submit, NULL, &cl); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
bio_set_dev(&io->bio,
PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
io->bio.bi_end_io = read_dirty_endio;
delay = writeback_delay(dc, KEY_SIZE(&w->key)); if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
goto err_free;
trace_bcache_writeback(&w->key);
down(&dc->in_flight);
/* We've acquired a semaphore for the maximum
* simultaneous number of writebacks; from here
* everything happens asynchronously.
*/
closure_call(&io->cl, read_dirty_submit, NULL, &cl);
}
delay = writeback_delay(dc, size);
/* If the control system would wait for at least half a
* second, and there's been no reqs hitting the backing disk
* for awhile: use an alternate mode where we have at most
* one contiguous set of writebacks in flight at a time. If
* someone wants to do IO it will be quick, as it will only
* have to contend with one operation in flight, and we'll
* be round-tripping data to the backing disk as quickly as
* it can accept it.
*/
if (delay >= HZ / 2) {
/* 3 means at least 1.5 seconds, up to 7.5 if we
* have slowed way down.
*/
if (atomic_inc_return(&dc->backing_idle) >= 3) {
/* Wait for current I/Os to finish */
closure_sync(&cl);
/* And immediately launch a new set. */
delay = 0;
}
}
while (!kthread_should_stop() && delay) {
schedule_timeout_interruptible(delay);
delay = writeback_delay(dc, 0);
}
} }
if (0) { if (0) {

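
The gathering loop in read_dirty() above batches up to MAX_WRITEBACKS_IN_PASS keys per pass, stops early once MAX_WRITESIZE_IN_PASS sectors have been collected, and only combines keys that are contiguous on the backing device. A standalone sketch of that batching rule over a plain array of extents (made-up data, not bcache keys):

#include <stdio.h>

#define MAX_ITEMS_IN_PASS 5
#define MAX_SIZE_IN_PASS  5000

struct extent { unsigned long long start; unsigned len; };

/* Gather a batch starting at *idx: stop at the count cap, the size cap,
 * or the first extent that is not contiguous with the previous one.
 * Returns the number of extents batched and advances *idx past them. */
static int gather_batch(const struct extent *e, int n, int *idx)
{
	unsigned size = 0;
	int nk = 0;

	while (*idx < n) {
		const struct extent *cur = &e[*idx];

		if (nk >= MAX_ITEMS_IN_PASS || size >= MAX_SIZE_IN_PASS)
			break;
		if (nk && e[*idx - 1].start + e[*idx - 1].len != cur->start)
			break;
		size += cur->len;
		nk++;
		(*idx)++;
	}
	return nk;
}

int main(void)
{
	struct extent e[] = {
		{ 0, 8 }, { 8, 8 }, { 16, 8 },	/* one contiguous run */
		{ 100, 8 }, { 108, 8 },		/* second run         */
	};
	int n = sizeof(e) / sizeof(e[0]), idx = 0;

	while (idx < n)
		printf("batched %d contiguous extents\n",
		       gather_batch(e, n, &idx));
	return 0;
}
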
@ -5,6 +5,16 @@
#define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70 #define CUTOFF_WRITEBACK_SYNC 70
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
* until individual backing devices are a petabyte.
*/
#define WRITEBACK_SHARE_SHIFT 14
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{ {
uint64_t i, ret = 0; uint64_t i, ret = 0;
@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
mutex_lock(&bch_register_lock); mutex_lock(&bch_register_lock);
for (i = 0; i < c->nr_uuids; i++) { for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i]; struct bcache_device *d = c->devices[i];
if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))

@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
bio_for_each_segment_all(bv, clone, i) { bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page); BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool); mempool_free(bv->bv_page, cc->page_pool);
bv->bv_page = NULL;
} }
} }

@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work)
activate_or_offline_path(pgpath); activate_or_offline_path(pgpath);
} }
static int noretry_error(blk_status_t error)
{
switch (error) {
case BLK_STS_NOTSUPP:
case BLK_STS_NOSPC:
case BLK_STS_TARGET:
case BLK_STS_NEXUS:
case BLK_STS_MEDIUM:
return 1;
}
/* Anything else could be a path failure, so should be retried */
return 0;
}
static int multipath_end_io(struct dm_target *ti, struct request *clone, static int multipath_end_io(struct dm_target *ti, struct request *clone,
blk_status_t error, union map_info *map_context) blk_status_t error, union map_info *map_context)
{ {
@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
* request into dm core, which will remake a clone request and * request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later. * clone bios for it and resubmit it later.
*/ */
if (error && !noretry_error(error)) { if (error && blk_path_error(error)) {
struct multipath *m = ti->private; struct multipath *m = ti->private;
r = DM_ENDIO_REQUEUE; r = DM_ENDIO_REQUEUE;
@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
unsigned long flags; unsigned long flags;
int r = DM_ENDIO_DONE; int r = DM_ENDIO_DONE;
if (!*error || noretry_error(*error)) if (!*error || !blk_path_error(*error))
goto done; goto done;
if (pgpath) if (pgpath)

@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
dm_complete_request(tio->orig, error); dm_complete_request(tio->orig, error);
} }
static void dm_dispatch_clone_request(struct request *clone, struct request *rq) static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
{ {
blk_status_t r; blk_status_t r;
@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
clone->start_time = jiffies; clone->start_time = jiffies;
r = blk_insert_cloned_request(clone->q, clone); r = blk_insert_cloned_request(clone->q, clone);
if (r) if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
/* must complete clone in terms of original request */ /* must complete clone in terms of original request */
dm_complete_request(rq, r); dm_complete_request(rq, r);
return r;
} }
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
struct mapped_device *md = tio->md; struct mapped_device *md = tio->md;
struct request *rq = tio->orig; struct request *rq = tio->orig;
struct request *clone = NULL; struct request *clone = NULL;
blk_status_t ret;
r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
check_again:
switch (r) { switch (r) {
case DM_MAPIO_SUBMITTED: case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */ /* The target has taken the I/O to submit by itself later */
@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
/* The target has remapped the I/O so dispatch it */ /* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq)); blk_rq_pos(rq));
dm_dispatch_clone_request(clone, rq); ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE) {
blk_rq_unprep_clone(clone);
tio->ti->type->release_clone_rq(clone);
tio->clone = NULL;
if (!rq->q->mq_ops)
r = DM_MAPIO_DELAY_REQUEUE;
else
r = DM_MAPIO_REQUEUE;
goto check_again;
}
break; break;
case DM_MAPIO_REQUEUE: case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */ /* The target wants to requeue the I/O */
@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
return error; return error;
} }
elv_register_queue(md->queue);
return 0; return 0;
} }
@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
} }
dm_init_md_queue(md); dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
err = blk_mq_register_dev(disk_to_dev(md->disk), q);
if (err)
goto out_cleanup_queue;
return 0; return 0;
out_cleanup_queue:
blk_cleanup_queue(q);
out_tag_set: out_tag_set:
blk_mq_free_tag_set(md->tag_set); blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set: out_kfree_tag_set:

@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
return -EINVAL; return -EINVAL;
} }
ti->max_io_len = (uint32_t) len; /*
* BIO based queue uses its own splitting. When multipage bvecs
* is switched on, size of the incoming bio may be too big to
* be handled in some targets, such as crypt.
*
* When these targets are ready for the big bio, we can remove
* the limit.
*/
ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
return 0; return 0;
} }
@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
goto bad; goto bad;
md->dax_dev = dax_dev; md->dax_dev = dax_dev;
add_disk(md->disk); add_disk_no_queue_reg(md->disk);
format_dev_t(md->name, MKDEV(_major, minor)); format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{ {
int r; int r;
struct queue_limits limits;
enum dm_queue_mode type = dm_get_md_type(md); enum dm_queue_mode type = dm_get_md_type(md);
switch (type) { switch (type) {
@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
break; break;
} }
r = dm_calculate_queue_limits(t, &limits);
if (r) {
DMERR("Cannot calculate initial queue limits");
return r;
}
dm_table_set_restrictions(t, md->queue, &limits);
blk_register_queue(md->disk);
return 0; return 0;
} }

@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
ccflags-y += -I$(src)
obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_NVME_CORE) += nvme-core.o
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_FC) += nvme-fc.o
nvme-core-y := core.o nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-core-$(CONFIG_NVM) += lightnvm.o

@ -29,6 +29,9 @@
#include <linux/pm_qos.h> #include <linux/pm_qos.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#include "nvme.h" #include "nvme.h"
#include "fabrics.h" #include "fabrics.h"
@ -65,9 +68,26 @@ static bool streams;
module_param(streams, bool, 0644); module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
/*
* nvme_wq - hosts nvme related works that are not reset or delete
* nvme_reset_wq - hosts nvme reset works
* nvme_delete_wq - hosts nvme delete works
*
* nvme_wq will host works such as scan, aen handling, fw activation,
* keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
* runs reset works which also flush works hosted on nvme_wq for
* serialization purposes. nvme_delete_wq host controller deletion
* works which flush reset works for serialization.
*/
struct workqueue_struct *nvme_wq; struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq); EXPORT_SYMBOL_GPL(nvme_wq);
struct workqueue_struct *nvme_reset_wq;
EXPORT_SYMBOL_GPL(nvme_reset_wq);
struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
static DEFINE_IDA(nvme_subsystems_ida); static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems); static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_MUTEX(nvme_subsystems_lock);
@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{ {
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return -EBUSY; return -EBUSY;
if (!queue_work(nvme_wq, &ctrl->reset_work)) if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY; return -EBUSY;
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(nvme_reset_ctrl); EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{ {
int ret; int ret;
@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
flush_work(&ctrl->reset_work); flush_work(&ctrl->reset_work);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
static void nvme_delete_ctrl_work(struct work_struct *work) static void nvme_delete_ctrl_work(struct work_struct *work)
{ {
@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{ {
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY; return -EBUSY;
if (!queue_work(nvme_wq, &ctrl->delete_work)) if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY; return -EBUSY;
return 0; return 0;
} }
@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req)
return BLK_STS_OK; return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED: case NVME_SC_CAP_EXCEEDED:
return BLK_STS_NOSPC; return BLK_STS_NOSPC;
case NVME_SC_LBA_RANGE:
return BLK_STS_TARGET;
case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_ONCS_NOT_SUPPORTED: case NVME_SC_ONCS_NOT_SUPPORTED:
case NVME_SC_INVALID_OPCODE:
case NVME_SC_INVALID_FIELD:
case NVME_SC_INVALID_NS:
return BLK_STS_NOTSUPP; return BLK_STS_NOTSUPP;
case NVME_SC_WRITE_FAULT: case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR: case NVME_SC_READ_ERROR:
case NVME_SC_UNWRITTEN_BLOCK: case NVME_SC_UNWRITTEN_BLOCK:
case NVME_SC_ACCESS_DENIED: case NVME_SC_ACCESS_DENIED:
case NVME_SC_READ_ONLY: case NVME_SC_READ_ONLY:
case NVME_SC_COMPARE_FAILED:
return BLK_STS_MEDIUM; return BLK_STS_MEDIUM;
case NVME_SC_GUARD_CHECK: case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK: case NVME_SC_APPTAG_CHECK:
@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req)
void nvme_complete_rq(struct request *req) void nvme_complete_rq(struct request *req)
{ {
if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { blk_status_t status = nvme_error_status(req);
if (nvme_req_needs_failover(req)) {
trace_nvme_complete_rq(req);
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
if (nvme_req_needs_failover(req, status)) {
nvme_failover_req(req); nvme_failover_req(req);
return; return;
} }
@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req)
return; return;
} }
} }
blk_mq_end_request(req, status);
blk_mq_end_request(req, nvme_error_status(req));
} }
EXPORT_SYMBOL_GPL(nvme_complete_rq); EXPORT_SYMBOL_GPL(nvme_complete_rq);
@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
old_state = ctrl->state; old_state = ctrl->state;
switch (new_state) { switch (new_state) {
case NVME_CTRL_ADMIN_ONLY:
switch (old_state) {
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_LIVE: case NVME_CTRL_LIVE:
switch (old_state) { switch (old_state) {
case NVME_CTRL_NEW: case NVME_CTRL_NEW:
@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) { switch (old_state) {
case NVME_CTRL_NEW: case NVME_CTRL_NEW:
case NVME_CTRL_LIVE: case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
changed = true; changed = true;
/* FALLTHRU */ /* FALLTHRU */
default: default:
@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING: case NVME_CTRL_DELETING:
switch (old_state) { switch (old_state) {
case NVME_CTRL_LIVE: case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING: case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING: case NVME_CTRL_RECONNECTING:
changed = true; changed = true;
@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
} }
cmd->common.command_id = req->tag; cmd->common.command_id = req->tag;
if (ns)
trace_nvme_setup_nvm_cmd(req->q->id, cmd);
else
trace_nvme_setup_admin_cmd(cmd);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(nvme_setup_cmd); EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
#ifdef CONFIG_NVME_MULTIPATH #ifdef CONFIG_NVME_MULTIPATH
/* should never be called due to GENHD_FL_HIDDEN */ /* should never be called due to GENHD_FL_HIDDEN */
if (WARN_ON_ONCE(ns->head->disk)) if (WARN_ON_ONCE(ns->head->disk))
return -ENXIO; goto fail;
#endif #endif
if (!kref_get_unless_zero(&ns->kref)) if (!kref_get_unless_zero(&ns->kref))
return -ENXIO; goto fail;
if (!try_module_get(ns->ctrl->ops->module))
goto fail_put_ns;
return 0; return 0;
fail_put_ns:
nvme_put_ns(ns);
fail:
return -ENXIO;
} }
static void nvme_release(struct gendisk *disk, fmode_t mode) static void nvme_release(struct gendisk *disk, fmode_t mode)
{ {
nvme_put_ns(disk->private_data); struct nvme_ns *ns = disk->private_data;
module_put(ns->ctrl->ops->module);
nvme_put_ns(ns);
} }
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
NULL, NULL,
}; };
static int nvme_active_ctrls(struct nvme_subsystem *subsys)
{
int count = 0;
struct nvme_ctrl *ctrl;
mutex_lock(&subsys->lock);
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
if (ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DEAD)
count++;
}
mutex_unlock(&subsys->lock);
return count;
}
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{ {
struct nvme_subsystem *subsys, *found; struct nvme_subsystem *subsys, *found;
@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
* Verify that the subsystem actually supports multiple * Verify that the subsystem actually supports multiple
* controllers, else bail out. * controllers, else bail out.
*/ */
if (!(id->cmic & (1 << 1))) { if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
dev_err(ctrl->device, dev_err(ctrl->device,
"ignoring ctrl due to duplicate subnqn (%s).\n", "ignoring ctrl due to duplicate subnqn (%s).\n",
found->subnqn); found->subnqn);
@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
shutdown_timeout, 60); shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout) if (ctrl->shutdown_timeout != shutdown_timeout)
dev_warn(ctrl->device, dev_info(ctrl->device,
"Shutdown timeout set to %u seconds\n", "Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout); ctrl->shutdown_timeout);
} else } else
@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
struct nvme_ctrl *ctrl = struct nvme_ctrl *ctrl =
container_of(inode->i_cdev, struct nvme_ctrl, cdev); container_of(inode->i_cdev, struct nvme_ctrl, cdev);
if (ctrl->state != NVME_CTRL_LIVE) switch (ctrl->state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
break;
default:
return -EWOULDBLOCK; return -EWOULDBLOCK;
}
file->private_data = ctrl; file->private_data = ctrl;
return 0; return 0;
} }
@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
static const char *const state_name[] = { static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new", [NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live", [NVME_CTRL_LIVE] = "live",
[NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_RECONNECTING]= "reconnecting", [NVME_CTRL_RECONNECTING]= "reconnecting",
[NVME_CTRL_DELETING] = "deleting", [NVME_CTRL_DELETING] = "deleting",
@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work)
if (ctrl->state != NVME_CTRL_LIVE) if (ctrl->state != NVME_CTRL_LIVE)
return; return;
WARN_ON_ONCE(!ctrl->tagset);
if (nvme_identify_ctrl(ctrl, &id)) if (nvme_identify_ctrl(ctrl, &id))
return; return;
@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work)
void nvme_queue_scan(struct nvme_ctrl *ctrl) void nvme_queue_scan(struct nvme_ctrl *ctrl)
{ {
/* /*
* Do not queue new scan work when a controller is reset during * Only new queue scan work when admin and IO queues are both alive
* removal.
*/ */
if (ctrl->state == NVME_CTRL_LIVE) if (ctrl->state == NVME_CTRL_LIVE)
queue_work(nvme_wq, &ctrl->scan_work); queue_work(nvme_wq, &ctrl->scan_work);
@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
int __init nvme_core_init(void) int __init nvme_core_init(void)
{ {
int result; int result = -ENOMEM;
nvme_wq = alloc_workqueue("nvme-wq", nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq) if (!nvme_wq)
return -ENOMEM; goto out;
nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_reset_wq)
goto destroy_wq;
nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_delete_wq)
goto destroy_reset_wq;
result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
if (result < 0) if (result < 0)
goto destroy_wq; goto destroy_delete_wq;
nvme_class = class_create(THIS_MODULE, "nvme"); nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) { if (IS_ERR(nvme_class)) {
@ -3505,8 +3596,13 @@ int __init nvme_core_init(void)
class_destroy(nvme_class); class_destroy(nvme_class);
unregister_chrdev: unregister_chrdev:
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
destroy_delete_wq:
destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
destroy_workqueue(nvme_reset_wq);
destroy_wq: destroy_wq:
destroy_workqueue(nvme_wq); destroy_workqueue(nvme_wq);
out:
return result; return result;
} }
@ -3516,6 +3612,8 @@ void nvme_core_exit(void)
class_destroy(nvme_subsys_class); class_destroy(nvme_subsys_class);
class_destroy(nvme_class); class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);
destroy_workqueue(nvme_reset_wq);
destroy_workqueue(nvme_wq); destroy_workqueue(nvme_wq);
} }

@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
*/ */
int nvmf_register_transport(struct nvmf_transport_ops *ops) int nvmf_register_transport(struct nvmf_transport_ops *ops)
{ {
if (!ops->create_ctrl) if (!ops->create_ctrl || !ops->module)
return -EINVAL; return -EINVAL;
down_write(&nvmf_transports_rwsem); down_write(&nvmf_transports_rwsem);
@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
if (uuid_parse(p, &hostid)) { ret = uuid_parse(p, &hostid);
if (ret) {
pr_err("Invalid hostid %s\n", p); pr_err("Invalid hostid %s\n", p);
ret = -EINVAL; ret = -EINVAL;
kfree(p);
goto out; goto out;
} }
kfree(p);
break; break;
case NVMF_OPT_DUP_CONNECT: case NVMF_OPT_DUP_CONNECT:
opts->duplicate_connect = true; opts->duplicate_connect = true;
@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
goto out_unlock; goto out_unlock;
} }
if (!try_module_get(ops->module)) {
ret = -EBUSY;
goto out_unlock;
}
ret = nvmf_check_required_opts(opts, ops->required_opts); ret = nvmf_check_required_opts(opts, ops->required_opts);
if (ret) if (ret)
goto out_unlock; goto out_module_put;
ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
ops->allowed_opts | ops->required_opts); ops->allowed_opts | ops->required_opts);
if (ret) if (ret)
goto out_unlock; goto out_module_put;
ctrl = ops->create_ctrl(dev, opts); ctrl = ops->create_ctrl(dev, opts);
if (IS_ERR(ctrl)) { if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl); ret = PTR_ERR(ctrl);
goto out_unlock; goto out_module_put;
} }
if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
dev_warn(ctrl->device, dev_warn(ctrl->device,
"controller returned incorrect NQN: \"%s\".\n", "controller returned incorrect NQN: \"%s\".\n",
ctrl->subsys->subnqn); ctrl->subsys->subnqn);
module_put(ops->module);
up_read(&nvmf_transports_rwsem); up_read(&nvmf_transports_rwsem);
nvme_delete_ctrl_sync(ctrl); nvme_delete_ctrl_sync(ctrl);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
module_put(ops->module);
up_read(&nvmf_transports_rwsem); up_read(&nvmf_transports_rwsem);
return ctrl; return ctrl;
out_module_put:
module_put(ops->module);
out_unlock: out_unlock:
up_read(&nvmf_transports_rwsem); up_read(&nvmf_transports_rwsem);
out_free_opts: out_free_opts:

@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
* fabric implementation of NVMe fabrics. * fabric implementation of NVMe fabrics.
* @entry: Used by the fabrics library to add the new * @entry: Used by the fabrics library to add the new
* registration entry to its linked-list internal tree. * registration entry to its linked-list internal tree.
* @module: Transport module reference
* @name: Name of the NVMe fabric driver implementation. * @name: Name of the NVMe fabric driver implementation.
* @required_opts: sysfs command-line options that must be specified * @required_opts: sysfs command-line options that must be specified
* when adding a new NVMe controller. * when adding a new NVMe controller.
@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
*/ */
struct nvmf_transport_ops { struct nvmf_transport_ops {
struct list_head entry; struct list_head entry;
struct module *module;
const char *name; const char *name;
int required_opts; int required_opts;
int allowed_opts; int allowed_opts;

@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
nvme_fc_free_queue(&ctrl->queues[0]); nvme_fc_free_queue(&ctrl->queues[0]);
/* re-enable the admin_q so anything new can fast fail */
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_fc_ctlr_inactive_on_rport(ctrl); nvme_fc_ctlr_inactive_on_rport(ctrl);
} }
@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
* waiting for io to terminate * waiting for io to terminate
*/ */
nvme_fc_delete_association(ctrl); nvme_fc_delete_association(ctrl);
/* resume the io queues so that things will fast fail */
nvme_start_queues(nctrl);
} }
static void static void
@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
static struct nvmf_transport_ops nvme_fc_transport = { static struct nvmf_transport_ops nvme_fc_transport = {
.name = "fc", .name = "fc",
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
.allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_fc_create_ctrl, .create_ctrl = nvme_fc_create_ctrl,

@ -31,27 +31,10 @@
enum nvme_nvm_admin_opcode { enum nvme_nvm_admin_opcode {
nvme_nvm_admin_identity = 0xe2, nvme_nvm_admin_identity = 0xe2,
nvme_nvm_admin_get_l2p_tbl = 0xea,
nvme_nvm_admin_get_bb_tbl = 0xf2, nvme_nvm_admin_get_bb_tbl = 0xf2,
nvme_nvm_admin_set_bb_tbl = 0xf1, nvme_nvm_admin_set_bb_tbl = 0xf1,
}; };
struct nvme_nvm_hb_rw {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__u64 rsvd2;
__le64 metadata;
__le64 prp1;
__le64 prp2;
__le64 spba;
__le16 length;
__le16 control;
__le32 dsmgmt;
__le64 slba;
};
struct nvme_nvm_ph_rw { struct nvme_nvm_ph_rw {
__u8 opcode; __u8 opcode;
__u8 flags; __u8 flags;
@ -80,19 +63,6 @@ struct nvme_nvm_identity {
__u32 rsvd11[5]; __u32 rsvd11[5];
}; };
struct nvme_nvm_l2ptbl {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__le32 cdw2[4];
__le64 prp1;
__le64 prp2;
__le64 slba;
__le32 nlb;
__le16 cdw14[6];
};
struct nvme_nvm_getbbtbl { struct nvme_nvm_getbbtbl {
__u8 opcode; __u8 opcode;
__u8 flags; __u8 flags;
@ -139,9 +109,7 @@ struct nvme_nvm_command {
union { union {
struct nvme_common_command common; struct nvme_common_command common;
struct nvme_nvm_identity identity; struct nvme_nvm_identity identity;
struct nvme_nvm_hb_rw hb_rw;
struct nvme_nvm_ph_rw ph_rw; struct nvme_nvm_ph_rw ph_rw;
struct nvme_nvm_l2ptbl l2p;
struct nvme_nvm_getbbtbl get_bb; struct nvme_nvm_getbbtbl get_bb;
struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_setbbtbl set_bb;
struct nvme_nvm_erase_blk erase; struct nvme_nvm_erase_blk erase;
@ -167,7 +135,7 @@ struct nvme_nvm_id_group {
__u8 num_lun; __u8 num_lun;
__u8 num_pln; __u8 num_pln;
__u8 rsvd1; __u8 rsvd1;
__le16 num_blk; __le16 num_chk;
__le16 num_pg; __le16 num_pg;
__le16 fpg_sz; __le16 fpg_sz;
__le16 csecs; __le16 csecs;
@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl {
static inline void _nvme_nvm_check_size(void) static inline void _nvme_nvm_check_size(void)
{ {
BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void)
static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
{ {
struct nvme_nvm_id_group *src; struct nvme_nvm_id_group *src;
struct nvm_id_group *dst; struct nvm_id_group *grp;
int sec_per_pg, sec_per_pl, pg_per_blk;
if (nvme_nvm_id->cgrps != 1) if (nvme_nvm_id->cgrps != 1)
return -EINVAL; return -EINVAL;
src = &nvme_nvm_id->groups[0]; src = &nvme_nvm_id->groups[0];
dst = &nvm_id->grp; grp = &nvm_id->grp;
dst->mtype = src->mtype; grp->mtype = src->mtype;
dst->fmtype = src->fmtype; grp->fmtype = src->fmtype;
dst->num_ch = src->num_ch;
dst->num_lun = src->num_lun;
dst->num_pln = src->num_pln;
dst->num_pg = le16_to_cpu(src->num_pg); grp->num_ch = src->num_ch;
dst->num_blk = le16_to_cpu(src->num_blk); grp->num_lun = src->num_lun;
dst->fpg_sz = le16_to_cpu(src->fpg_sz);
dst->csecs = le16_to_cpu(src->csecs);
dst->sos = le16_to_cpu(src->sos);
dst->trdt = le32_to_cpu(src->trdt); grp->num_chk = le16_to_cpu(src->num_chk);
dst->trdm = le32_to_cpu(src->trdm); grp->csecs = le16_to_cpu(src->csecs);
dst->tprt = le32_to_cpu(src->tprt); grp->sos = le16_to_cpu(src->sos);
dst->tprm = le32_to_cpu(src->tprm);
dst->tbet = le32_to_cpu(src->tbet);
dst->tbem = le32_to_cpu(src->tbem);
dst->mpos = le32_to_cpu(src->mpos);
dst->mccap = le32_to_cpu(src->mccap);
dst->cpar = le16_to_cpu(src->cpar); pg_per_blk = le16_to_cpu(src->num_pg);
sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
sec_per_pl = sec_per_pg * src->num_pln;
grp->clba = sec_per_pl * pg_per_blk;
grp->ws_per_chk = pg_per_blk;
if (dst->fmtype == NVM_ID_FMTYPE_MLC) { grp->mpos = le32_to_cpu(src->mpos);
memcpy(dst->lptbl.id, src->lptbl.id, 8); grp->cpar = le16_to_cpu(src->cpar);
dst->lptbl.mlc.num_pairs = grp->mccap = le32_to_cpu(src->mccap);
le16_to_cpu(src->lptbl.mlc.num_pairs);
if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { grp->ws_opt = grp->ws_min = sec_per_pg;
pr_err("nvm: number of MLC pairs not supported\n"); grp->ws_seq = NVM_IO_SNGL_ACCESS;
return -EINVAL;
}
memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, if (grp->mpos & 0x020202) {
dst->lptbl.mlc.num_pairs); grp->ws_seq = NVM_IO_DUAL_ACCESS;
grp->ws_opt <<= 1;
} else if (grp->mpos & 0x040404) {
grp->ws_seq = NVM_IO_QUAD_ACCESS;
grp->ws_opt <<= 2;
} }
grp->trdt = le32_to_cpu(src->trdt);
grp->trdm = le32_to_cpu(src->trdm);
grp->tprt = le32_to_cpu(src->tprt);
grp->tprm = le32_to_cpu(src->tprm);
grp->tbet = le32_to_cpu(src->tbet);
grp->tbem = le32_to_cpu(src->tbem);
/* 1.2 compatibility */
grp->num_pln = src->num_pln;
grp->num_pg = le16_to_cpu(src->num_pg);
grp->fpg_sz = le16_to_cpu(src->fpg_sz);
return 0; return 0;
} }
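
The rewritten init_grps() above folds the 1.2 page/plane geometry into the 2.0-style chunk view: sectors per page come from the flash page size over the sector size, a plane-stripe worth of sectors times pages-per-block gives the chunk length (clba), and the optimal write size grows with the plane mode. A worked example with invented 1.2 geometry values:

#include <stdio.h>

int main(void)
{
	/* Hypothetical OCSSD 1.2 geometry, not from any real device. */
	unsigned csecs   = 4096;	/* sector size, bytes     */
	unsigned fpg_sz  = 16384;	/* flash page size, bytes */
	unsigned num_pln = 4;		/* planes per LUN         */
	unsigned num_pg  = 512;		/* pages per block        */
	unsigned mpos    = 0x040404;	/* quad-plane access      */

	unsigned sec_per_pg = fpg_sz / csecs;		/* 4    */
	unsigned sec_per_pl = sec_per_pg * num_pln;	/* 16   */
	unsigned clba       = sec_per_pl * num_pg;	/* 8192 */
	unsigned ws_min     = sec_per_pg;
	unsigned ws_opt     = sec_per_pg;

	if (mpos & 0x020202)		/* dual-plane programming */
		ws_opt <<= 1;
	else if (mpos & 0x040404)	/* quad-plane programming */
		ws_opt <<= 2;

	printf("clba=%u sectors, ws_min=%u, ws_opt=%u, ws_per_chk=%u\n",
	       clba, ws_min, ws_opt, num_pg);
	return 0;
}
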
@ -332,62 +305,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
return ret; return ret;
} }
static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
nvm_l2p_update_fn *update_l2p, void *priv)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
struct nvme_nvm_command c = {};
u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
u32 nlb_pr_rq = len / sizeof(u64);
u64 cmd_slba = slba;
void *entries;
int ret = 0;
c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
entries = kmalloc(len, GFP_KERNEL);
if (!entries)
return -ENOMEM;
while (nlb) {
u32 cmd_nlb = min(nlb_pr_rq, nlb);
u64 elba = slba + cmd_nlb;
c.l2p.slba = cpu_to_le64(cmd_slba);
c.l2p.nlb = cpu_to_le32(cmd_nlb);
ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
(struct nvme_command *)&c, entries, len);
if (ret) {
dev_err(ns->ctrl->device,
"L2P table transfer failed (%d)\n", ret);
ret = -EIO;
goto out;
}
if (unlikely(elba > nvmdev->total_secs)) {
pr_err("nvm: L2P data from device is out of bounds!\n");
ret = -EINVAL;
goto out;
}
/* Transform physical address to target address space */
nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
ret = -EINTR;
goto out;
}
cmd_slba += cmd_nlb;
nlb -= cmd_nlb;
}
out:
kfree(entries);
return ret;
}
static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
u8 *blks) u8 *blks)
{ {
@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_nvm_command c = {}; struct nvme_nvm_command c = {};
struct nvme_nvm_bb_tbl *bb_tbl; struct nvme_nvm_bb_tbl *bb_tbl;
int nr_blks = geo->blks_per_lun * geo->plane_mode; int nr_blks = geo->nr_chks * geo->plane_mode;
int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
int ret = 0; int ret = 0;
@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
goto out; goto out;
} }
memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
out: out:
kfree(bb_tbl); kfree(bb_tbl);
return ret; return ret;
@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.control = cpu_to_le16(rqd->flags);
c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
rqd->bio->bi_iter.bi_sector));
} }
static void nvme_nvm_end_io(struct request *rq, blk_status_t status) static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr,
static struct nvm_dev_ops nvme_nvm_dev_ops = { static struct nvm_dev_ops nvme_nvm_dev_ops = {
.identity = nvme_nvm_identity, .identity = nvme_nvm_identity,
.get_l2p_tbl = nvme_nvm_get_l2p_tbl,
.get_bb_tbl = nvme_nvm_get_bb_tbl, .get_bb_tbl = nvme_nvm_get_bb_tbl,
.set_bb_tbl = nvme_nvm_set_bb_tbl, .set_bb_tbl = nvme_nvm_set_bb_tbl,
@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
} else if (strcmp(attr->name, "num_planes") == 0) { } else if (strcmp(attr->name, "num_planes") == 0) {
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
} else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
} else if (strcmp(attr->name, "num_pages") == 0) { } else if (strcmp(attr->name, "num_pages") == 0) {
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
} else if (strcmp(attr->name, "page_size") == 0) { } else if (strcmp(attr->name, "page_size") == 0) {

@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work); kblockd_schedule_work(&ns->head->requeue_work);
} }
bool nvme_req_needs_failover(struct request *req) bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{ {
if (!(req->cmd_flags & REQ_NVME_MPATH)) if (!(req->cmd_flags & REQ_NVME_MPATH))
return false; return false;
return blk_path_error(error);
switch (nvme_req(req)->status & 0x7ff) {
/*
* Generic command status:
*/
case NVME_SC_INVALID_OPCODE:
case NVME_SC_INVALID_FIELD:
case NVME_SC_INVALID_NS:
case NVME_SC_LBA_RANGE:
case NVME_SC_CAP_EXCEEDED:
case NVME_SC_RESERVATION_CONFLICT:
return false;
/*
* I/O command set specific error. Unfortunately these values are
* reused for fabrics commands, but those should never get here.
*/
case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_INVALID_PI:
case NVME_SC_READ_ONLY:
case NVME_SC_ONCS_NOT_SUPPORTED:
WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
nvme_fabrics_command);
return false;
/*
* Media and Data Integrity Errors:
*/
case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR:
case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK:
case NVME_SC_REFTAG_CHECK:
case NVME_SC_COMPARE_FAILED:
case NVME_SC_ACCESS_DENIED:
case NVME_SC_UNWRITTEN_BLOCK:
return false;
}
/* Everything else could be a path failure, so should be retried */
return true;
} }
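
The switch removed above enumerated every status that must not be retried on another path; that classification now lives in the shared blk_path_error() helper so dm-mpath and nvme multipath agree on it. A hedged sketch of the classification logic, using stand-in enum values rather than the real blk_status_t constants:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for a few blk_status_t values. */
enum blk_status {
	STS_OK, STS_NOTSUPP, STS_NOSPC, STS_TARGET,
	STS_NEXUS, STS_MEDIUM, STS_IOERR, STS_TRANSPORT,
};

/* Statuses that describe a definitive failure of the target itself are
 * not path errors and must not be failed over; anything else might be
 * the path's fault and is worth retrying on another path. */
static bool path_error(enum blk_status error)
{
	switch (error) {
	case STS_NOTSUPP:
	case STS_NOSPC:
	case STS_TARGET:
	case STS_NEXUS:
	case STS_MEDIUM:
		return false;
	default:
		return true;
	}
}

int main(void)
{
	printf("fail over on medium error?    %s\n",
	       path_error(STS_MEDIUM) ? "yes" : "no");
	printf("fail over on transport error? %s\n",
	       path_error(STS_TRANSPORT) ? "yes" : "no");
	return 0;
}
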
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)

@ -32,6 +32,8 @@ extern unsigned int admin_timeout;
#define NVME_KATO_GRACE 10 #define NVME_KATO_GRACE 10
extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
enum { enum {
NVME_NS_LBA = 0, NVME_NS_LBA = 0,
@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
enum nvme_ctrl_state { enum nvme_ctrl_state {
NVME_CTRL_NEW, NVME_CTRL_NEW,
NVME_CTRL_LIVE, NVME_CTRL_LIVE,
NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING, NVME_CTRL_RESETTING,
NVME_CTRL_RECONNECTING, NVME_CTRL_RECONNECTING,
NVME_CTRL_DELETING, NVME_CTRL_DELETING,
@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH #ifdef CONFIG_NVME_MULTIPATH
void nvme_failover_req(struct request *req); void nvme_failover_req(struct request *req);
bool nvme_req_needs_failover(struct request *req); bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head);
@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
static inline void nvme_failover_req(struct request *req) static inline void nvme_failover_req(struct request *req)
{ {
} }
static inline bool nvme_req_needs_failover(struct request *req) static inline bool nvme_req_needs_failover(struct request *req,
blk_status_t error)
{ {
return false; return false;
} }

@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
* Represents an NVM Express device. Each nvme_dev is a PCI function. * Represents an NVM Express device. Each nvme_dev is a PCI function.
*/ */
struct nvme_dev { struct nvme_dev {
struct nvme_queue **queues; struct nvme_queue *queues;
struct blk_mq_tag_set tagset; struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset; struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs; u32 __iomem *dbs;
@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_dev *dev = data; struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0); WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_dev *dev = data; struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
if (!nvmeq->tags) if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx]; nvmeq->tags = &dev->tagset.tags[hctx_idx];
@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
struct nvme_dev *dev = set->driver_data; struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
struct nvme_queue *nvmeq = dev->queues[queue_idx]; struct nvme_queue *nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq); BUG_ON(!nvmeq);
iod->nvmeq = nvmeq; iod->nvmeq = nvmeq;
@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{ {
struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_dev *dev = to_nvme_dev(ctrl);
struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c; struct nvme_command c;
memset(&c, 0, sizeof(c)); memset(&c, 0, sizeof(c));
@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
*/ */
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
/* If there is a reset ongoing, we shouldn't reset again. */ /* If there is a reset/reinit ongoing, we shouldn't reset again. */
if (dev->ctrl.state == NVME_CTRL_RESETTING) switch (dev->ctrl.state) {
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
return false; return false;
default:
break;
}
/* We shouldn't reset unless the controller is on fatal error state /* We shouldn't reset unless the controller is on fatal error state
* _or_ if we lost the communication with it. * _or_ if we lost the communication with it.
@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
if (nvmeq->sq_cmds) if (nvmeq->sq_cmds)
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr); nvmeq->sq_cmds, nvmeq->sq_dma_addr);
kfree(nvmeq);
} }
static void nvme_free_queues(struct nvme_dev *dev, int lowest) static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
int i; int i;
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--; dev->ctrl.queue_count--;
dev->queues[i] = NULL; nvme_free_queue(&dev->queues[i]);
nvme_free_queue(nvmeq);
} }
} }
@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{ {
struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_queue *nvmeq = &dev->queues[0];
if (!nvmeq)
return;
if (nvme_suspend_queue(nvmeq))
return;
if (shutdown) if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl); nvme_shutdown_ctrl(&dev->ctrl);
@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth) int qid, int depth)
{ {
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
dev->ctrl.page_size); dev->ctrl.page_size);
nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0; return 0;
} }
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth, int node) int depth, int node)
{ {
struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, struct nvme_queue *nvmeq = &dev->queues[qid];
node);
if (!nvmeq) if (dev->ctrl.queue_count > qid)
return NULL; return 0;
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL); &nvmeq->cq_dma_addr, GFP_KERNEL);
@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
nvmeq->q_depth = depth; nvmeq->q_depth = depth;
nvmeq->qid = qid; nvmeq->qid = qid;
nvmeq->cq_vector = -1; nvmeq->cq_vector = -1;
dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++; dev->ctrl.queue_count++;
return nvmeq; return 0;
free_cqdma: free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr); nvmeq->cq_dma_addr);
free_nvmeq: free_nvmeq:
kfree(nvmeq); return -ENOMEM;
return NULL;
} }
static int queue_request_irq(struct nvme_queue *nvmeq) static int queue_request_irq(struct nvme_queue *nvmeq)
@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
if (result < 0) if (result < 0)
return result; return result;
nvmeq = dev->queues[0]; result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
if (!nvmeq) { dev_to_node(dev->dev));
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, if (result)
dev_to_node(dev->dev)); return result;
if (!nvmeq)
return -ENOMEM;
}
nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1; aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16; aqa |= aqa << 16;
@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
/* vector == qid - 1, match nvme_create_queue */ /* vector == qid - 1, match nvme_create_queue */
if (!nvme_alloc_queue(dev, i, dev->q_depth, if (nvme_alloc_queue(dev, i, dev->q_depth,
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
ret = -ENOMEM; ret = -ENOMEM;
break; break;
@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
max = min(dev->max_qid, dev->ctrl.queue_count - 1); max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) { for (i = dev->online_queues; i <= max; i++) {
ret = nvme_create_queue(dev->queues[i], i); ret = nvme_create_queue(&dev->queues[i], i);
if (ret) if (ret)
break; break;
} }
/* /*
* Ignore failing Create SQ/CQ commands, we can continue with less * Ignore failing Create SQ/CQ commands, we can continue with less
* than the desired aount of queues, and even a controller without * than the desired amount of queues, and even a controller without
* I/O queues an still be used to issue admin commands. This might * I/O queues can still be used to issue admin commands. This might
* be useful to upgrade a buggy firmware for example. * be useful to upgrade a buggy firmware for example.
*/ */
return ret >= 0 ? 0 : ret; return ret >= 0 ? 0 : ret;
@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev,
} }
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
static void __iomem *nvme_map_cmb(struct nvme_dev *dev) static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{ {
u64 szu, size, offset; u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
return 1ULL << (12 + 4 * szu);
}
static u32 nvme_cmb_size(struct nvme_dev *dev)
{
return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}
static void nvme_map_cmb(struct nvme_dev *dev)
{
u64 size, offset;
resource_size_t bar_size; resource_size_t bar_size;
struct pci_dev *pdev = to_pci_dev(dev->dev); struct pci_dev *pdev = to_pci_dev(dev->dev);
void __iomem *cmb;
int bar; int bar;
dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
if (!(NVME_CMB_SZ(dev->cmbsz))) if (!dev->cmbsz)
return NULL; return;
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
if (!use_cmb_sqes) if (!use_cmb_sqes)
return NULL; return;
szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
size = szu * NVME_CMB_SZ(dev->cmbsz); offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
offset = szu * NVME_CMB_OFST(dev->cmbloc);
bar = NVME_CMB_BIR(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc);
bar_size = pci_resource_len(pdev, bar); bar_size = pci_resource_len(pdev, bar);
if (offset > bar_size) if (offset > bar_size)
return NULL; return;
/* /*
* Controllers may support a CMB size larger than their BAR, * Controllers may support a CMB size larger than their BAR,
@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
if (size > bar_size - offset) if (size > bar_size - offset)
size = bar_size - offset; size = bar_size - offset;
cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
if (!cmb) if (!dev->cmb)
return NULL; return;
dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
dev->cmb_size = size; dev->cmb_size = size;
return cmb;
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL))
dev_warn(dev->ctrl.device,
"failed to add sysfs attribute for CMB\n");
} }
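As a quick sanity check of the new CMBSZ decoding above, here is a minimal user-space sketch (not part of the patch; the SZU/SZ field positions are assumptions that mirror the NVME_CMBSZ_* shift/mask macros used by the driver): the size unit is 4 KiB scaled by 16^SZU, and the CMB size is SZ of those units.

/* Hypothetical stand-alone illustration of nvme_cmb_size_unit()/nvme_cmb_size(). */
#include <stdint.h>
#include <stdio.h>

#define CMBSZ_SZU_SHIFT	8	/* assumed, mirrors NVME_CMBSZ_SZU_SHIFT */
#define CMBSZ_SZU_MASK	0xf
#define CMBSZ_SZ_SHIFT	12	/* assumed, mirrors NVME_CMBSZ_SZ_SHIFT */
#define CMBSZ_SZ_MASK	0xfffff

int main(void)
{
	uint32_t cmbsz = 0x00800001;	/* made-up value: SQS set, SZU = 0, SZ = 2048 */
	uint64_t unit = 1ULL << (12 + 4 * ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK));
	uint64_t size = unit * ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK);

	printf("CMB size: %llu bytes\n", (unsigned long long)size);	/* 8388608 */
	return 0;
}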
static inline void nvme_release_cmb(struct nvme_dev *dev) static inline void nvme_release_cmb(struct nvme_dev *dev)
@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dma_addr_t descs_dma; dma_addr_t descs_dma;
int i = 0; int i = 0;
void **bufs; void **bufs;
u64 size = 0, tmp; u64 size, tmp;
tmp = (preferred + chunk_size - 1); tmp = (preferred + chunk_size - 1);
do_div(tmp, chunk_size); do_div(tmp, chunk_size);
@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u64 preferred = (u64)dev->ctrl.hmpre * 4096; u64 preferred = (u64)dev->ctrl.hmpre * 4096;
u64 min = (u64)dev->ctrl.hmmin * 4096; u64 min = (u64)dev->ctrl.hmmin * 4096;
u32 enable_bits = NVME_HOST_MEM_ENABLE; u32 enable_bits = NVME_HOST_MEM_ENABLE;
int ret = 0; int ret;
preferred = min(preferred, max); preferred = min(preferred, max);
if (min > max) { if (min > max) {
@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
static int nvme_setup_io_queues(struct nvme_dev *dev) static int nvme_setup_io_queues(struct nvme_dev *dev)
{ {
struct nvme_queue *adminq = dev->queues[0]; struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev); struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, nr_io_queues; int result, nr_io_queues;
unsigned long size; unsigned long size;
@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (nr_io_queues == 0) if (nr_io_queues == 0)
return 0; return 0;
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
result = nvme_cmb_qdepth(dev, nr_io_queues, result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command)); sizeof(struct nvme_command));
if (result > 0) if (result > 0)
@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0; return 0;
} }
static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) static void nvme_disable_io_queues(struct nvme_dev *dev)
{ {
int pass; int pass, queues = dev->online_queues - 1;
unsigned long timeout; unsigned long timeout;
u8 opcode = nvme_admin_delete_sq; u8 opcode = nvme_admin_delete_sq;
@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
retry: retry:
timeout = ADMIN_TIMEOUT; timeout = ADMIN_TIMEOUT;
for (; i > 0; i--, sent++) for (; i > 0; i--, sent++)
if (nvme_delete_queue(dev->queues[i], opcode)) if (nvme_delete_queue(&dev->queues[i], opcode))
break; break;
while (sent--) { while (sent--) {
@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
} }
/* /*
* Return: error value if an error occurred setting up the queues or calling * return error value only when tagset allocation failed
* Identify Device. 0 if these succeeded, even if adding some of the
* namespaces failed. At the moment, these failures are silent. TBD which
* failures should be reported.
*/ */
static int nvme_dev_add(struct nvme_dev *dev) static int nvme_dev_add(struct nvme_dev *dev)
{ {
int ret;
if (!dev->ctrl.tagset) { if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops; dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.nr_hw_queues = dev->online_queues - 1;
@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev; dev->tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->tagset)) ret = blk_mq_alloc_tag_set(&dev->tagset);
return 0; if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
return ret;
}
dev->ctrl.tagset = &dev->tagset; dev->ctrl.tagset = &dev->tagset;
nvme_dbbuf_set(dev); nvme_dbbuf_set(dev);
@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth); "set queue depth=%u\n", dev->q_depth);
} }
/* nvme_map_cmb(dev);
* CMBs can currently only exist on >=1.2 PCIe devices. We only
* populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
* has no name we can pass NULL as final argument to
* sysfs_add_file_to_group.
*/
if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
dev->cmb = nvme_map_cmb(dev);
if (dev->cmb) {
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL))
dev_warn(dev->ctrl.device,
"failed to add sysfs attribute for CMB\n");
}
}
pci_enable_pcie_error_reporting(pdev); pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev); pci_save_state(pdev);
@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{ {
int i, queues; int i;
bool dead = true; bool dead = true;
struct pci_dev *pdev = to_pci_dev(dev->dev); struct pci_dev *pdev = to_pci_dev(dev->dev);
@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
} }
nvme_stop_queues(&dev->ctrl); nvme_stop_queues(&dev->ctrl);
queues = dev->online_queues - 1; if (!dead) {
for (i = dev->ctrl.queue_count - 1; i > 0; i--) nvme_disable_io_queues(dev);
nvme_suspend_queue(dev->queues[i]);
if (dead) {
/* A device might become IO incapable very soon during
* probe, before the admin queue is configured. Thus,
* queue_count can be 0 here.
*/
if (dev->ctrl.queue_count)
nvme_suspend_queue(dev->queues[0]);
} else {
nvme_disable_io_queues(dev, queues);
nvme_disable_admin_queue(dev, shutdown); nvme_disable_admin_queue(dev, shutdown);
} }
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
nvme_suspend_queue(&dev->queues[i]);
nvme_pci_disable(dev); nvme_pci_disable(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work)
container_of(work, struct nvme_dev, ctrl.reset_work); container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result = -ENODEV; int result = -ENODEV;
enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
goto out; goto out;
@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false); nvme_dev_disable(dev, false);
/*
* Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
* initializing procedure here.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller RECONNECTING\n");
goto out;
}
result = nvme_pci_enable(dev); result = nvme_pci_enable(dev);
if (result) if (result)
goto out; goto out;
@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work)
dev_warn(dev->ctrl.device, "IO queues not created\n"); dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl); nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl);
new_state = NVME_CTRL_ADMIN_ONLY;
} else { } else {
nvme_start_queues(&dev->ctrl); nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl); nvme_wait_freeze(&dev->ctrl);
nvme_dev_add(dev); /* hit this only when allocate tagset fails */
if (nvme_dev_add(dev))
new_state = NVME_CTRL_ADMIN_ONLY;
nvme_unfreeze(&dev->ctrl); nvme_unfreeze(&dev->ctrl);
} }
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { /*
dev_warn(dev->ctrl.device, "failed to mark controller live\n"); * If only admin queue live, keep it to do further investigation or
* recovery.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
dev_warn(dev->ctrl.device,
"failed to mark controller state %d\n", new_state);
goto out; goto out;
} }
@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev) if (!dev)
return -ENOMEM; return -ENOMEM;
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
GFP_KERNEL, node); dev->queues = kcalloc_node(num_possible_cpus() + 1,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues) if (!dev->queues)
goto free; goto free;
@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result) if (result)
goto release_pools; goto release_pools;
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
queue_work(nvme_wq, &dev->ctrl.reset_work); nvme_reset_ctrl(&dev->ctrl);
return 0; return 0;
release_pools: release_pools:
@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev)
static void nvme_reset_done(struct pci_dev *pdev) static void nvme_reset_done(struct pci_dev *pdev)
{ {
struct nvme_dev *dev = pci_get_drvdata(pdev); struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_reset_ctrl(&dev->ctrl); nvme_reset_ctrl_sync(&dev->ctrl);
} }
static void nvme_shutdown(struct pci_dev *pdev) static void nvme_shutdown(struct pci_dev *pdev)


@ -66,7 +66,6 @@ struct nvme_rdma_request {
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
u32 num_sge; u32 num_sge;
int nents; int nents;
bool inline_data;
struct ib_reg_wr reg_wr; struct ib_reg_wr reg_wr;
struct ib_cqe reg_cqe; struct ib_cqe reg_cqe;
struct nvme_rdma_queue *queue; struct nvme_rdma_queue *queue;
@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
req->inline_data = true;
req->num_sge++; req->num_sge++;
return 0; return 0;
} }
@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
int count, ret; int count, ret;
req->num_sge = 1; req->num_sge = 1;
req->inline_data = false;
refcount_set(&req->ref, 2); /* send and recv completions */ refcount_set(&req->ref, 2); /* send and recv completions */
c->common.flags |= NVME_CMD_SGL_METABUF; c->common.flags |= NVME_CMD_SGL_METABUF;
@ -2018,6 +2015,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
static struct nvmf_transport_ops nvme_rdma_transport = { static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma", .name = "rdma",
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR, .required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
} }
mutex_unlock(&nvme_rdma_ctrl_mutex); mutex_unlock(&nvme_rdma_ctrl_mutex);
flush_workqueue(nvme_wq); flush_workqueue(nvme_delete_wq);
} }
static struct ib_client nvme_rdma_ib_client = { static struct ib_client nvme_rdma_ib_client = {

drivers/nvme/host/trace.c

@ -0,0 +1,130 @@
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/unaligned.h>
#include "trace.h"
static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u16 sqid = get_unaligned_le16(cdw10);
u16 qsize = get_unaligned_le16(cdw10 + 2);
u16 sq_flags = get_unaligned_le16(cdw10 + 4);
u16 cqid = get_unaligned_le16(cdw10 + 6);
trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u",
sqid, qsize, sq_flags, cqid);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u16 cqid = get_unaligned_le16(cdw10);
u16 qsize = get_unaligned_le16(cdw10 + 2);
u16 cq_flags = get_unaligned_le16(cdw10 + 4);
u16 irq_vector = get_unaligned_le16(cdw10 + 6);
trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u",
cqid, qsize, cq_flags, irq_vector);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u8 cns = cdw10[0];
u16 ctrlid = get_unaligned_le16(cdw10 + 2);
trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u64 slba = get_unaligned_le64(cdw10);
u16 length = get_unaligned_le16(cdw10 + 8);
u16 control = get_unaligned_le16(cdw10 + 10);
u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
u32 reftag = get_unaligned_le32(cdw10 + 16);
trace_seq_printf(p,
"slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
slba, length, control, dsmgmt, reftag);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
trace_seq_printf(p, "nr=%u, attributes=%u",
get_unaligned_le32(cdw10),
get_unaligned_le32(cdw10 + 4));
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
trace_seq_printf(p, "cdw10=%*ph", 24, cdw10);
trace_seq_putc(p, 0);
return ret;
}
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
u8 opcode, u8 *cdw10)
{
switch (opcode) {
case nvme_admin_create_sq:
return nvme_trace_create_sq(p, cdw10);
case nvme_admin_create_cq:
return nvme_trace_create_cq(p, cdw10);
case nvme_admin_identify:
return nvme_trace_admin_identify(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
}
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
u8 opcode, u8 *cdw10)
{
switch (opcode) {
case nvme_cmd_read:
case nvme_cmd_write:
case nvme_cmd_write_zeroes:
return nvme_trace_read_write(p, cdw10);
case nvme_cmd_dsm:
return nvme_trace_dsm(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
}

drivers/nvme/host/trace.h

@ -0,0 +1,165 @@
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM nvme
#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NVME_H
#include <linux/nvme.h>
#include <linux/tracepoint.h>
#include <linux/trace_seq.h>
#include "nvme.h"
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
#define show_admin_opcode_name(val) \
__print_symbolic(val, \
nvme_admin_opcode_name(nvme_admin_delete_sq), \
nvme_admin_opcode_name(nvme_admin_create_sq), \
nvme_admin_opcode_name(nvme_admin_get_log_page), \
nvme_admin_opcode_name(nvme_admin_delete_cq), \
nvme_admin_opcode_name(nvme_admin_create_cq), \
nvme_admin_opcode_name(nvme_admin_identify), \
nvme_admin_opcode_name(nvme_admin_abort_cmd), \
nvme_admin_opcode_name(nvme_admin_set_features), \
nvme_admin_opcode_name(nvme_admin_get_features), \
nvme_admin_opcode_name(nvme_admin_async_event), \
nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
nvme_admin_opcode_name(nvme_admin_activate_fw), \
nvme_admin_opcode_name(nvme_admin_download_fw), \
nvme_admin_opcode_name(nvme_admin_ns_attach), \
nvme_admin_opcode_name(nvme_admin_keep_alive), \
nvme_admin_opcode_name(nvme_admin_directive_send), \
nvme_admin_opcode_name(nvme_admin_directive_recv), \
nvme_admin_opcode_name(nvme_admin_dbbuf), \
nvme_admin_opcode_name(nvme_admin_format_nvm), \
nvme_admin_opcode_name(nvme_admin_security_send), \
nvme_admin_opcode_name(nvme_admin_security_recv), \
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
#define __parse_nvme_admin_cmd(opcode, cdw10) \
nvme_trace_parse_admin_cmd(p, opcode, cdw10)
#define nvme_opcode_name(opcode) { opcode, #opcode }
#define show_opcode_name(val) \
__print_symbolic(val, \
nvme_opcode_name(nvme_cmd_flush), \
nvme_opcode_name(nvme_cmd_write), \
nvme_opcode_name(nvme_cmd_read), \
nvme_opcode_name(nvme_cmd_write_uncor), \
nvme_opcode_name(nvme_cmd_compare), \
nvme_opcode_name(nvme_cmd_write_zeroes), \
nvme_opcode_name(nvme_cmd_dsm), \
nvme_opcode_name(nvme_cmd_resv_register), \
nvme_opcode_name(nvme_cmd_resv_report), \
nvme_opcode_name(nvme_cmd_resv_acquire), \
nvme_opcode_name(nvme_cmd_resv_release))
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
#define __parse_nvme_cmd(opcode, cdw10) \
nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
TRACE_EVENT(nvme_setup_admin_cmd,
TP_PROTO(struct nvme_command *cmd),
TP_ARGS(cmd),
TP_STRUCT__entry(
__field(u8, opcode)
__field(u8, flags)
__field(u16, cid)
__field(u64, metadata)
__array(u8, cdw10, 24)
),
TP_fast_assign(
__entry->opcode = cmd->common.opcode;
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->metadata = le64_to_cpu(cmd->common.metadata);
memcpy(__entry->cdw10, cmd->common.cdw10,
sizeof(__entry->cdw10));
),
TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->cid, __entry->flags, __entry->metadata,
show_admin_opcode_name(__entry->opcode),
__parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
);
TRACE_EVENT(nvme_setup_nvm_cmd,
TP_PROTO(int qid, struct nvme_command *cmd),
TP_ARGS(qid, cmd),
TP_STRUCT__entry(
__field(int, qid)
__field(u8, opcode)
__field(u8, flags)
__field(u16, cid)
__field(u32, nsid)
__field(u64, metadata)
__array(u8, cdw10, 24)
),
TP_fast_assign(
__entry->qid = qid;
__entry->opcode = cmd->common.opcode;
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->nsid = le32_to_cpu(cmd->common.nsid);
__entry->metadata = le64_to_cpu(cmd->common.metadata);
memcpy(__entry->cdw10, cmd->common.cdw10,
sizeof(__entry->cdw10));
),
TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->qid, __entry->nsid, __entry->cid,
__entry->flags, __entry->metadata,
show_opcode_name(__entry->opcode),
__parse_nvme_cmd(__entry->opcode, __entry->cdw10))
);
TRACE_EVENT(nvme_complete_rq,
TP_PROTO(struct request *req),
TP_ARGS(req),
TP_STRUCT__entry(
__field(int, qid)
__field(int, cid)
__field(u64, result)
__field(u8, retries)
__field(u8, flags)
__field(u16, status)
),
TP_fast_assign(
__entry->qid = req->q->id;
__entry->cid = req->tag;
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
__entry->retries = nvme_req(req)->retries;
__entry->flags = nvme_req(req)->flags;
__entry->status = nvme_req(req)->status;
),
TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
__entry->cid, __entry->qid, __entry->result,
__entry->retries, __entry->flags, __entry->status)
);
#endif /* _TRACE_NVME_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
/* This part must be outside protection */
#include <trace/define_trace.h>
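For reference, TRACE_EVENT() above generates trace_nvme_setup_admin_cmd(), trace_nvme_setup_nvm_cmd() and trace_nvme_complete_rq() with the prototypes given in TP_PROTO(), and with TRACE_SYSTEM set to nvme the events appear under events/nvme/ in tracefs. The snippet below is only an illustration of how such call sites look; the hooks actually added by this series live in the core submit/complete paths, and the admin-vs-I/O decision there is made from the request's namespace rather than from a flag passed in.

#include "trace.h"	/* exactly one .c file also defines CREATE_TRACE_POINTS before this include */

static void nvme_trace_example(struct request *req, struct nvme_command *cmd,
			       bool is_admin)
{
	if (is_admin)
		trace_nvme_setup_admin_cmd(cmd);
	else
		trace_nvme_setup_nvm_cmd(req->q->id, cmd);

	/* ... command is issued; on completion the driver would emit: */
	trace_nvme_complete_rq(req);
}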


@ -29,6 +29,7 @@ config NVME_TARGET_RDMA
tristate "NVMe over Fabrics RDMA target support" tristate "NVMe over Fabrics RDMA target support"
depends on INFINIBAND depends on INFINIBAND
depends on NVME_TARGET depends on NVME_TARGET
select SGL_ALLOC
help help
This enables the NVMe RDMA target support, which allows exporting NVMe This enables the NVMe RDMA target support, which allows exporting NVMe
devices over RDMA. devices over RDMA.
@ -39,6 +40,7 @@ config NVME_TARGET_FC
tristate "NVMe over Fabrics FC target driver" tristate "NVMe over Fabrics FC target driver"
depends on NVME_TARGET depends on NVME_TARGET
depends on HAS_DMA depends on HAS_DMA
select SGL_ALLOC
help help
This enables the NVMe FC target support, which allows exporting NVMe This enables the NVMe FC target support, which allows exporting NVMe
devices over FC. devices over FC.


@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
req->sg_cnt = 0; req->sg_cnt = 0;
req->transfer_len = 0; req->transfer_len = 0;
req->rsp->status = 0; req->rsp->status = 0;
req->ns = NULL;
/* no support for fused commands yet */ /* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
void nvmet_req_uninit(struct nvmet_req *req) void nvmet_req_uninit(struct nvmet_req *req)
{ {
percpu_ref_put(&req->sq->ref); percpu_ref_put(&req->sq->ref);
if (req->ns)
nvmet_put_namespace(req->ns);
} }
EXPORT_SYMBOL_GPL(nvmet_req_uninit); EXPORT_SYMBOL_GPL(nvmet_req_uninit);
@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
/* Don't accept keep-alive timeout for discovery controllers */ /* Don't accept keep-alive timeout for discovery controllers */
if (kato) { if (kato) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto out_free_sqs; goto out_remove_ida;
} }
/* /*
@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
*ctrlp = ctrl; *ctrlp = ctrl;
return 0; return 0;
out_remove_ida:
ida_simple_remove(&cntlid_ida, ctrl->cntlid);
out_free_sqs: out_free_sqs:
kfree(ctrl->sqs); kfree(ctrl->sqs);
out_free_cqs: out_free_cqs:
@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref)
struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
struct nvmet_subsys *subsys = ctrl->subsys; struct nvmet_subsys *subsys = ctrl->subsys;
nvmet_stop_keep_alive_timer(ctrl);
mutex_lock(&subsys->lock); mutex_lock(&subsys->lock);
list_del(&ctrl->subsys_entry); list_del(&ctrl->subsys_entry);
mutex_unlock(&subsys->lock); mutex_unlock(&subsys->lock);
nvmet_stop_keep_alive_timer(ctrl);
flush_work(&ctrl->async_event_work); flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fatal_err_work); cancel_work_sync(&ctrl->fatal_err_work);
ida_simple_remove(&cntlid_ida, ctrl->cntlid); ida_simple_remove(&cntlid_ida, ctrl->cntlid);
nvmet_subsys_put(subsys);
kfree(ctrl->sqs); kfree(ctrl->sqs);
kfree(ctrl->cqs); kfree(ctrl->cqs);
kfree(ctrl); kfree(ctrl);
nvmet_subsys_put(subsys);
} }
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)


@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
goto out_ctrl_put; goto out_ctrl_put;
} }
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
out: out:
kfree(d); kfree(d);


@ -1697,31 +1697,12 @@ static int
nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
{ {
struct scatterlist *sg; struct scatterlist *sg;
struct page *page;
unsigned int nent; unsigned int nent;
u32 page_len, length;
int i = 0;
length = fod->req.transfer_len; sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent);
nent = DIV_ROUND_UP(length, PAGE_SIZE);
sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
if (!sg) if (!sg)
goto out; goto out;
sg_init_table(sg, nent);
while (length) {
page_len = min_t(u32, length, PAGE_SIZE);
page = alloc_page(GFP_KERNEL);
if (!page)
goto out_free_pages;
sg_set_page(&sg[i], page, page_len, 0);
length -= page_len;
i++;
}
fod->data_sg = sg; fod->data_sg = sg;
fod->data_sg_cnt = nent; fod->data_sg_cnt = nent;
fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
return 0; return 0;
out_free_pages:
while (i > 0) {
i--;
__free_page(sg_page(&sg[i]));
}
kfree(sg);
fod->data_sg = NULL;
fod->data_sg_cnt = 0;
out: out:
return NVME_SC_INTERNAL; return NVME_SC_INTERNAL;
} }
@ -1746,18 +1719,13 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
static void static void
nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
{ {
struct scatterlist *sg;
int count;
if (!fod->data_sg || !fod->data_sg_cnt) if (!fod->data_sg || !fod->data_sg_cnt)
return; return;
fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
((fod->io_dir == NVMET_FCP_WRITE) ? ((fod->io_dir == NVMET_FCP_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE)); DMA_FROM_DEVICE : DMA_TO_DEVICE));
for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) sgl_free(fod->data_sg);
__free_page(sg_page(sg));
kfree(fod->data_sg);
fod->data_sg = NULL; fod->data_sg = NULL;
fod->data_sg_cnt = 0; fod->data_sg_cnt = 0;
} }
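The nvmet-fc conversion above leans on the new lib/scatterlist helpers. A minimal sketch of the pattern (assuming CONFIG_SGL_ALLOC is selected, as the Kconfig hunks above do): sgl_alloc() builds a page-backed table covering 'length' bytes and returns the entry count through its last argument, and sgl_free() releases both the pages and the table.

#include <linux/scatterlist.h>

static struct scatterlist *example_sgl_alloc(u32 length, unsigned int *nent)
{
	/* replaces the removed kmalloc_array()/alloc_page() loop */
	return sgl_alloc(length, GFP_KERNEL, nent);
}

static void example_sgl_free(struct scatterlist *sg)
{
	/* replaces the removed for_each_sg()/__free_page()/kfree() loop */
	sgl_free(sg);
}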
@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
if ((tgtport->fc_target_port.node_name == traddr.nn) && if ((tgtport->fc_target_port.node_name == traddr.nn) &&
(tgtport->fc_target_port.port_name == traddr.pn)) { (tgtport->fc_target_port.port_name == traddr.pn)) {
/* a FC port can only be 1 nvmet port id */ tgtport->port = port;
if (!tgtport->port) { ret = 0;
tgtport->port = port;
port->priv = tgtport;
nvmet_fc_tgtport_get(tgtport);
ret = 0;
} else
ret = -EALREADY;
break; break;
} }
} }
@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
static void static void
nvmet_fc_remove_port(struct nvmet_port *port) nvmet_fc_remove_port(struct nvmet_port *port)
{ {
struct nvmet_fc_tgtport *tgtport = port->priv; /* nothing to do */
unsigned long flags;
bool matched = false;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
if (tgtport->port == port) {
matched = true;
tgtport->port = NULL;
}
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
if (matched)
nvmet_fc_tgtport_put(tgtport);
} }
static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {


@ -204,6 +204,10 @@ struct fcloop_lport {
struct completion unreg_done; struct completion unreg_done;
}; };
struct fcloop_lport_priv {
struct fcloop_lport *lport;
};
struct fcloop_rport { struct fcloop_rport {
struct nvme_fc_remote_port *remoteport; struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport; struct nvmet_fc_target_port *targetport;
@ -238,21 +242,32 @@ struct fcloop_lsreq {
int status; int status;
}; };
enum {
INI_IO_START = 0,
INI_IO_ACTIVE = 1,
INI_IO_ABORTED = 2,
INI_IO_COMPLETED = 3,
};
struct fcloop_fcpreq { struct fcloop_fcpreq {
struct fcloop_tport *tport; struct fcloop_tport *tport;
struct nvmefc_fcp_req *fcpreq; struct nvmefc_fcp_req *fcpreq;
spinlock_t reqlock; spinlock_t reqlock;
u16 status; u16 status;
u32 inistate;
bool active; bool active;
bool aborted; bool aborted;
struct work_struct work; struct kref ref;
struct work_struct fcp_rcv_work;
struct work_struct abort_rcv_work;
struct work_struct tio_done_work;
struct nvmefc_tgt_fcp_req tgt_fcp_req; struct nvmefc_tgt_fcp_req tgt_fcp_req;
}; };
struct fcloop_ini_fcpreq { struct fcloop_ini_fcpreq {
struct nvmefc_fcp_req *fcpreq; struct nvmefc_fcp_req *fcpreq;
struct fcloop_fcpreq *tfcp_req; struct fcloop_fcpreq *tfcp_req;
struct work_struct iniwork; spinlock_t inilock;
}; };
static inline struct fcloop_lsreq * static inline struct fcloop_lsreq *
@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
return 0; return 0;
} }
/*
* FCP IO operation done by initiator abort.
* call back up initiator "done" flows.
*/
static void static void
fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) fcloop_tfcp_req_free(struct kref *ref)
{ {
struct fcloop_ini_fcpreq *inireq = struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_ini_fcpreq, iniwork); container_of(ref, struct fcloop_fcpreq, ref);
inireq->fcpreq->done(inireq->fcpreq); kfree(tfcp_req);
}
static void
fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
{
kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
}
static int
fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
{
return kref_get_unless_zero(&tfcp_req->ref);
}
static void
fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
struct fcloop_fcpreq *tfcp_req, int status)
{
struct fcloop_ini_fcpreq *inireq = NULL;
if (fcpreq) {
inireq = fcpreq->private;
spin_lock(&inireq->inilock);
inireq->tfcp_req = NULL;
spin_unlock(&inireq->inilock);
fcpreq->status = status;
fcpreq->done(fcpreq);
}
/* release original io reference on tgt struct */
fcloop_tfcp_req_put(tfcp_req);
}
static void
fcloop_fcp_recv_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
int ret = 0;
bool aborted = false;
spin_lock(&tfcp_req->reqlock);
switch (tfcp_req->inistate) {
case INI_IO_START:
tfcp_req->inistate = INI_IO_ACTIVE;
break;
case INI_IO_ABORTED:
aborted = true;
break;
default:
spin_unlock(&tfcp_req->reqlock);
WARN_ON(1);
return;
}
spin_unlock(&tfcp_req->reqlock);
if (unlikely(aborted))
ret = -ECANCELED;
else
ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req,
fcpreq->cmdaddr, fcpreq->cmdlen);
if (ret)
fcloop_call_host_done(fcpreq, tfcp_req, ret);
return;
}
static void
fcloop_fcp_abort_recv_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, abort_rcv_work);
struct nvmefc_fcp_req *fcpreq;
bool completed = false;
spin_lock(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_ABORTED:
break;
case INI_IO_COMPLETED:
completed = true;
break;
default:
spin_unlock(&tfcp_req->reqlock);
WARN_ON(1);
return;
}
spin_unlock(&tfcp_req->reqlock);
if (unlikely(completed)) {
/* remove reference taken in original abort downcall */
fcloop_tfcp_req_put(tfcp_req);
return;
}
if (tfcp_req->tport->targetport)
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req);
spin_lock(&tfcp_req->reqlock);
tfcp_req->fcpreq = NULL;
spin_unlock(&tfcp_req->reqlock);
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
/* call_host_done releases reference for abort downcall */
} }
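A small, hypothetical debug helper (not part of the patch) to make the new per-I/O state machine easier to follow: the transitions visible in the hunks above are START -> ACTIVE in fcloop_fcp_recv_work(), START/ACTIVE -> ABORTED in fcloop_fcp_abort(), and -> COMPLETED in fcloop_tgt_fcprqst_done_work(), all made under tfcp_req->reqlock, with the kref keeping tfcp_req alive until the last user drops it.

static const char * const ini_io_state_names[] = {
	[INI_IO_START]	   = "START",
	[INI_IO_ACTIVE]	   = "ACTIVE",
	[INI_IO_ABORTED]   = "ABORTED",
	[INI_IO_COMPLETED] = "COMPLETED",
};
/* e.g.: pr_debug("fcloop io state %s\n", ini_io_state_names[tfcp_req->inistate]); */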
/* /*
@ -364,20 +484,15 @@ static void
fcloop_tgt_fcprqst_done_work(struct work_struct *work) fcloop_tgt_fcprqst_done_work(struct work_struct *work)
{ {
struct fcloop_fcpreq *tfcp_req = struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, work); container_of(work, struct fcloop_fcpreq, tio_done_work);
struct fcloop_tport *tport = tfcp_req->tport;
struct nvmefc_fcp_req *fcpreq; struct nvmefc_fcp_req *fcpreq;
spin_lock(&tfcp_req->reqlock); spin_lock(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq; fcpreq = tfcp_req->fcpreq;
tfcp_req->inistate = INI_IO_COMPLETED;
spin_unlock(&tfcp_req->reqlock); spin_unlock(&tfcp_req->reqlock);
if (tport->remoteport && fcpreq) { fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
fcpreq->status = tfcp_req->status;
fcpreq->done(fcpreq);
}
kfree(tfcp_req);
} }
@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
struct fcloop_rport *rport = remoteport->private; struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private; struct fcloop_ini_fcpreq *inireq = fcpreq->private;
struct fcloop_fcpreq *tfcp_req; struct fcloop_fcpreq *tfcp_req;
int ret = 0;
if (!rport->targetport) if (!rport->targetport)
return -ECONNREFUSED; return -ECONNREFUSED;
@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
inireq->fcpreq = fcpreq; inireq->fcpreq = fcpreq;
inireq->tfcp_req = tfcp_req; inireq->tfcp_req = tfcp_req;
INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); spin_lock_init(&inireq->inilock);
tfcp_req->fcpreq = fcpreq; tfcp_req->fcpreq = fcpreq;
tfcp_req->tport = rport->targetport->private; tfcp_req->tport = rport->targetport->private;
tfcp_req->inistate = INI_IO_START;
spin_lock_init(&tfcp_req->reqlock); spin_lock_init(&tfcp_req->reqlock);
INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
kref_init(&tfcp_req->ref);
ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, schedule_work(&tfcp_req->fcp_rcv_work);
fcpreq->cmdaddr, fcpreq->cmdlen);
return ret; return 0;
} }
static void static void
@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
{ {
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
schedule_work(&tfcp_req->work); schedule_work(&tfcp_req->tio_done_work);
} }
static void static void
@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
void *hw_queue_handle, void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq) struct nvmefc_fcp_req *fcpreq)
{ {
struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private; struct fcloop_ini_fcpreq *inireq = fcpreq->private;
struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; struct fcloop_fcpreq *tfcp_req;
bool abortio = true;
spin_lock(&inireq->inilock);
tfcp_req = inireq->tfcp_req;
if (tfcp_req)
fcloop_tfcp_req_get(tfcp_req);
spin_unlock(&inireq->inilock);
if (!tfcp_req) if (!tfcp_req)
/* abort has already been called */ /* abort has already been called */
return; return;
if (rport->targetport)
nvmet_fc_rcv_fcp_abort(rport->targetport,
&tfcp_req->tgt_fcp_req);
/* break initiator/target relationship for io */ /* break initiator/target relationship for io */
spin_lock(&tfcp_req->reqlock); spin_lock(&tfcp_req->reqlock);
inireq->tfcp_req = NULL; switch (tfcp_req->inistate) {
tfcp_req->fcpreq = NULL; case INI_IO_START:
case INI_IO_ACTIVE:
tfcp_req->inistate = INI_IO_ABORTED;
break;
case INI_IO_COMPLETED:
abortio = false;
break;
default:
spin_unlock(&tfcp_req->reqlock);
WARN_ON(1);
return;
}
spin_unlock(&tfcp_req->reqlock); spin_unlock(&tfcp_req->reqlock);
/* post the aborted io completion */ if (abortio)
fcpreq->status = -ECANCELED; /* leave the reference while the work item is scheduled */
schedule_work(&inireq->iniwork); WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
else {
/*
* as the io has already had the done callback made,
* nothing more to do. So release the reference taken above
*/
fcloop_tfcp_req_put(tfcp_req);
}
} }
static void static void
@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
static void static void
fcloop_localport_delete(struct nvme_fc_local_port *localport) fcloop_localport_delete(struct nvme_fc_local_port *localport)
{ {
struct fcloop_lport *lport = localport->private; struct fcloop_lport_priv *lport_priv = localport->private;
struct fcloop_lport *lport = lport_priv->lport;
/* release any threads waiting for the unreg to complete */ /* release any threads waiting for the unreg to complete */
complete(&lport->unreg_done); complete(&lport->unreg_done);
@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = {
.max_dif_sgl_segments = FCLOOP_SGL_SEGS, .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G, .dma_boundary = FCLOOP_DMABOUND_4G,
/* sizes of additional private data for data structures */ /* sizes of additional private data for data structures */
.local_priv_sz = sizeof(struct fcloop_lport), .local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport), .remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq), .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = {
.max_dif_sgl_segments = FCLOOP_SGL_SEGS, .max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G, .dma_boundary = FCLOOP_DMABOUND_4G,
/* optional features */ /* optional features */
.target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | .target_features = 0,
NVMET_FCTGTFEAT_OPDONE_IN_ISR,
/* sizes of additional private data for data structures */ /* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport), .target_priv_sz = sizeof(struct fcloop_tport),
}; };
@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
struct fcloop_ctrl_options *opts; struct fcloop_ctrl_options *opts;
struct nvme_fc_local_port *localport; struct nvme_fc_local_port *localport;
struct fcloop_lport *lport; struct fcloop_lport *lport;
int ret; struct fcloop_lport_priv *lport_priv;
unsigned long flags;
int ret = -ENOMEM;
lport = kzalloc(sizeof(*lport), GFP_KERNEL);
if (!lport)
return -ENOMEM;
opts = kzalloc(sizeof(*opts), GFP_KERNEL); opts = kzalloc(sizeof(*opts), GFP_KERNEL);
if (!opts) if (!opts)
return -ENOMEM; goto out_free_lport;
ret = fcloop_parse_options(opts, buf); ret = fcloop_parse_options(opts, buf);
if (ret) if (ret)
@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
if (!ret) { if (!ret) {
unsigned long flags;
/* success */ /* success */
lport = localport->private; lport_priv = localport->private;
lport_priv->lport = lport;
lport->localport = localport; lport->localport = localport;
INIT_LIST_HEAD(&lport->lport_list); INIT_LIST_HEAD(&lport->lport_list);
spin_lock_irqsave(&fcloop_lock, flags); spin_lock_irqsave(&fcloop_lock, flags);
list_add_tail(&lport->lport_list, &fcloop_lports); list_add_tail(&lport->lport_list, &fcloop_lports);
spin_unlock_irqrestore(&fcloop_lock, flags); spin_unlock_irqrestore(&fcloop_lock, flags);
/* mark all of the input buffer consumed */
ret = count;
} }
out_free_opts: out_free_opts:
kfree(opts); kfree(opts);
out_free_lport:
/* free only if we're going to fail */
if (ret)
kfree(lport);
return ret ? ret : count; return ret ? ret : count;
} }
@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
wait_for_completion(&lport->unreg_done); wait_for_completion(&lport->unreg_done);
kfree(lport);
return ret; return ret;
} }


@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
static struct nvmf_transport_ops nvme_loop_transport = { static struct nvmf_transport_ops nvme_loop_transport = {
.name = "loop", .name = "loop",
.module = THIS_MODULE,
.create_ctrl = nvme_loop_create_ctrl, .create_ctrl = nvme_loop_create_ctrl,
}; };
@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void)
nvme_delete_ctrl(&ctrl->ctrl); nvme_delete_ctrl(&ctrl->ctrl);
mutex_unlock(&nvme_loop_ctrl_mutex); mutex_unlock(&nvme_loop_ctrl_mutex);
flush_workqueue(nvme_wq); flush_workqueue(nvme_delete_wq);
} }
module_init(nvme_loop_init_module); module_init(nvme_loop_init_module);


@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
} }
static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
{
struct scatterlist *sg;
int count;
if (!sgl || !nents)
return;
for_each_sg(sgl, sg, nents, count)
__free_page(sg_page(sg));
kfree(sgl);
}
static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
u32 length)
{
struct scatterlist *sg;
struct page *page;
unsigned int nent;
int i = 0;
nent = DIV_ROUND_UP(length, PAGE_SIZE);
sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
if (!sg)
goto out;
sg_init_table(sg, nent);
while (length) {
u32 page_len = min_t(u32, length, PAGE_SIZE);
page = alloc_page(GFP_KERNEL);
if (!page)
goto out_free_pages;
sg_set_page(&sg[i], page, page_len, 0);
length -= page_len;
i++;
}
*sgl = sg;
*nents = nent;
return 0;
out_free_pages:
while (i > 0) {
i--;
__free_page(sg_page(&sg[i]));
}
kfree(sg);
out:
return NVME_SC_INTERNAL;
}
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *c, bool admin) struct nvmet_rdma_cmd *c, bool admin)
{ {
@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
} }
if (rsp->req.sg != &rsp->cmd->inline_sg) if (rsp->req.sg != &rsp->cmd->inline_sg)
nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); sgl_free(rsp->req.sg);
if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
nvmet_rdma_process_wr_wait_list(queue); nvmet_rdma_process_wr_wait_list(queue);
@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
u32 len = get_unaligned_le24(sgl->length); u32 len = get_unaligned_le24(sgl->length);
u32 key = get_unaligned_le32(sgl->key); u32 key = get_unaligned_le32(sgl->key);
int ret; int ret;
u16 status;
/* no data command? */ /* no data command? */
if (!len) if (!len)
return 0; return 0;
status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
len); if (!rsp->req.sg)
if (status) return NVME_SC_INTERNAL;
return status;
ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{ {
pr_info("freeing queue %d\n", queue->idx); pr_debug("freeing queue %d\n", queue->idx);
nvmet_sq_destroy(&queue->nvme_sq); nvmet_sq_destroy(&queue->nvme_sq);
@ -1558,25 +1503,9 @@ static int __init nvmet_rdma_init(void)
static void __exit nvmet_rdma_exit(void) static void __exit nvmet_rdma_exit(void)
{ {
struct nvmet_rdma_queue *queue;
nvmet_unregister_transport(&nvmet_rdma_ops); nvmet_unregister_transport(&nvmet_rdma_ops);
flush_scheduled_work();
mutex_lock(&nvmet_rdma_queue_mutex);
while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
struct nvmet_rdma_queue, queue_list))) {
list_del_init(&queue->queue_list);
mutex_unlock(&nvmet_rdma_queue_mutex);
__nvmet_rdma_queue_disconnect(queue);
mutex_lock(&nvmet_rdma_queue_mutex);
}
mutex_unlock(&nvmet_rdma_queue_mutex);
flush_scheduled_work();
ib_unregister_client(&nvmet_rdma_ib_client); ib_unregister_client(&nvmet_rdma_ib_client);
WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
ida_destroy(&nvmet_rdma_queue_ida); ida_destroy(&nvmet_rdma_queue_ida);
} }


@ -5,6 +5,7 @@ menuconfig TARGET_CORE
select CONFIGFS_FS select CONFIGFS_FS
select CRC_T10DIF select CRC_T10DIF
select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
select SGL_ALLOC
default n default n
help help
Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled


@ -2300,13 +2300,7 @@ static void target_complete_ok_work(struct work_struct *work)
void target_free_sgl(struct scatterlist *sgl, int nents) void target_free_sgl(struct scatterlist *sgl, int nents)
{ {
struct scatterlist *sg; sgl_free_n_order(sgl, nents, 0);
int count;
for_each_sg(sgl, sg, nents, count)
__free_page(sg_page(sg));
kfree(sgl);
} }
EXPORT_SYMBOL(target_free_sgl); EXPORT_SYMBOL(target_free_sgl);
@ -2414,42 +2408,10 @@ int
target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
bool zero_page, bool chainable) bool zero_page, bool chainable)
{ {
struct scatterlist *sg; gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0);
struct page *page;
gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
unsigned int nalloc, nent;
int i = 0;
nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE); *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents);
if (chainable) return *sgl ? 0 : -ENOMEM;
nalloc++;
sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
if (!sg)
return -ENOMEM;
sg_init_table(sg, nalloc);
while (length) {
u32 page_len = min_t(u32, length, PAGE_SIZE);
page = alloc_page(GFP_KERNEL | zero_flag);
if (!page)
goto out;
sg_set_page(&sg[i], page, page_len, 0);
length -= page_len;
i++;
}
*sgl = sg;
*nents = nent;
return 0;
out:
while (i > 0) {
i--;
__free_page(sg_page(&sg[i]));
}
kfree(sg);
return -ENOMEM;
} }
EXPORT_SYMBOL(target_alloc_sgl); EXPORT_SYMBOL(target_alloc_sgl);
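target_alloc_sgl()/target_free_sgl() now simply wrap the order-aware variants. A hypothetical caller sketch (signatures as used above, with order 0 meaning one page per PAGE_SIZE chunk, and 'chainable' reserving an extra table entry for chaining, mirroring the removed nalloc++):

#include <linux/scatterlist.h>

static struct scatterlist *example_get(u32 length, unsigned int *nents,
				       bool zero, bool chainable)
{
	gfp_t gfp = GFP_KERNEL | (zero ? __GFP_ZERO : 0);

	return sgl_alloc_order(length, 0, chainable, gfp, nents);
}

static void example_put(struct scatterlist *sgl, unsigned int nents)
{
	/* order must match the allocation so every backing page is freed */
	sgl_free_n_order(sgl, nents, 0);
}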


@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
static u64 bio_end_offset(struct bio *bio) static u64 bio_end_offset(struct bio *bio)
{ {
struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; struct bio_vec *last = bio_last_bvec_all(bio);
return page_offset(last->bv_page) + last->bv_len + last->bv_offset; return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
} }
@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* we need the actual starting offset of this extent in the file */ /* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock); read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page), page_offset(bio_first_page_all(bio)),
PAGE_SIZE); PAGE_SIZE);
read_unlock(&em_tree->lock); read_unlock(&em_tree->lock);
if (!em) if (!em)

Some files were not shown because too many files have changed in this diff.