mirror of https://gitee.com/openkylin/linux.git
for-5.4/block-2019-09-16
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl1/no0QHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpmo9EACFXMbdNmEEUMyRSdOkVLlr7ZlTyQi1tLpB
YESDPxdBfybzpi0qa8JSaysGIfvSkSjmSAqBqrWPmASOSOL6CK4bbA4fTYbgPplk
XeHUdgGiG34oCQUn8Xil5reYaTm7I6LQWnWTpVa5fIhAyUYaGJL+987ykoGmpQmB
Dvf3YSc+8H0RTp9PCMVd6UCGPkZbVlLImGad3PF5ULvTEaE4RCXC2aiAgh0p1l5A
J2CkRZ+/mio3zN2O4YN7VdPGfr1Wo1iZ834xbIGLegv1miHXagFk7jwTcC7zIt5t
oSnJnqIg3iCe7SpWt4Bkzw/zy/2UqaspifbCMgw8vychlViVRUHFO5h85Yboo7kQ
OMLEQPcwjm6dTHv5h1iXF9LW1O7NoiYmmgvApU9uOo1HUrl1X7PZ3JEfUsVHxkOO
T4D5igf0Krsl1eAbiwEUQzy7vFZ8PlRHqrHgK+fkyotzHu1BJR7OQkYygEfGFOB/
EfMxplGDpmibYGuWCwDX2bPAmLV3SPUQENReHrfPJRDt5TD1UkFpVGv/PLLhbr0p
cLYI78DKpDSigBpVMmwq5nTYpnex33eyDTTA8C0sakcsdzdmU5qv30y3wm4nTiep
f6gZo6IMXwRg/rCgVVrd9SKQAr/8wEzVlsDW3qyi2pVT8sHIgm0tFv7paihXGdDV
xsKgmTrQQQ==
=Qt+h
-----END PGP SIGNATURE-----

Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:

 - Two NVMe pull requests:
     - ana log parse fix from Anton
     - nvme quirks support for Apple devices from Ben
     - fix missing bio completion tracing for multipath stack devices
       from Hannes and Mikhail
     - IP TOS settings for nvme rdma and tcp transports from Israel
     - rq_dma_dir cleanups from Israel
     - tracing for Get LBA Status command from Minwoo
     - Some nvme-tcp cleanups from Minwoo, Potnuri and Myself
     - Some consolidation between the fabrics transports for handling
       the CAP register
     - reset race with ns scanning fix for fabrics (move fabrics
       commands to a dedicated request queue with a different lifetime
       from the admin request queue)."
     - controller reset and namespace scan races fixes
     - nvme discovery log change uevent support
     - naming improvements from Keith
     - multiple discovery controllers reject fix from James
     - some regular cleanups from various people

 - Series fixing (and re-fixing) null_blk debug printing and nr_devices
   checks (André)

 - A few pull requests from Song, with fixes from Andy, Guoqing,
   Guilherme, Neil, Nigel, and Yufen.

 - REQ_OP_ZONE_RESET_ALL support (Chaitanya)

 - Bio merge handling unification (Christoph)

 - Pick default elevator correctly for devices with special needs
   (Damien)

 - Block stats fixes (Hou)

 - Timeout and support devices nbd fixes (Mike)

 - Series fixing races around elevator switching and device add/remove
   (Ming)

 - sed-opal cleanups (Revanth)

 - Per device weight support for BFQ (Fam)

 - Support for blk-iocost, a new model that can properly account cost
   of IO workloads. (Tejun)

 - blk-cgroup writeback fixes (Tejun)

 - paride queue init fixes (zhengbin)

 - blk_set_runtime_active() cleanup (Stanley)

 - Block segment mapping optimizations (Bart)

 - lightnvm fixes (Hans/Minwoo/YueHaibing)

 - Various little fixes and cleanups

* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
  null_blk: format pr_* logs with pr_fmt
  null_blk: match the type of parameter nr_devices
  null_blk: do not fail the module load with zero devices
  block: also check RQF_STATS in blk_mq_need_time_stamp()
  block: make rq sector size accessible for block stats
  bfq: Fix bfq linkage error
  raid5: use bio_end_sector in r5_next_bio
  raid5: remove STRIPE_OPS_REQ_PENDING
  md: add feature flag MD_FEATURE_RAID0_LAYOUT
  md/raid0: avoid RAID0 data corruption due to layout confusion.
  raid5: don't set STRIPE_HANDLE to stripe which is in batch list
  raid5: don't increment read_errors on EILSEQ return
  nvmet: fix a wrong error status returned in error log page
  nvme: send discovery log page change events to userspace
  nvme: add uevent variables for controller devices
  nvme: enable aen regardless of the presence of I/O queues
  nvme-fabrics: allow discovery subsystems accept a kato
  nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
  nvme: Remove redundant assignment of cq vector
  nvme: Assign subsys instance from first ctrl
  ...
commit 7ad67ca553
@@ -1469,6 +1469,103 @@ IO Interface Files

	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021

  io.cost.qos
	A read-write nested-keyed file which exists only on the root
	cgroup.

	This file configures the Quality of Service of the IO cost
	model based controller (CONFIG_BLK_CGROUP_IOCOST) which
	currently implements "io.weight" proportional control.  Lines
	are keyed by $MAJ:$MIN device numbers and not ordered.  The
	line for a given device is populated on the first write for
	the device on "io.cost.qos" or "io.cost.model".  The following
	nested keys are defined.

	  ======	=====================================
	  enable	Weight-based control enable
	  ctrl		"auto" or "user"
	  rpct		Read latency percentile [0, 100]
	  rlat		Read latency threshold
	  wpct		Write latency percentile [0, 100]
	  wlat		Write latency threshold
	  min		Minimum scaling percentage [1, 10000]
	  max		Maximum scaling percentage [1, 10000]
	  ======	=====================================

	The controller is disabled by default and can be enabled by
	setting "enable" to 1.  "rpct" and "wpct" parameters default
	to zero and the controller uses internal device saturation
	state to adjust the overall IO rate between "min" and "max".

	When a better control quality is needed, latency QoS
	parameters can be configured.  For example::

	  8:16 enable=1 ctrl=auto rpct=95.00 rlat=75000 wpct=95.00 wlat=150000 min=50.00 max=150.00

	shows that on sdb, the controller is enabled, will consider
	the device saturated if the 95th percentile of read completion
	latencies is above 75ms or write 150ms, and adjust the overall
	IO issue rate between 50% and 150% accordingly.

	The lower the saturation point, the better the latency QoS at
	the cost of aggregate bandwidth.  The narrower the allowed
	adjustment range between "min" and "max", the more conformant
	to the cost model the IO behavior.  Note that the IO issue
	base rate may be far off from 100% and setting "min" and "max"
	blindly can lead to a significant loss of device capacity or
	control quality.  "min" and "max" are useful for regulating
	devices which show wide temporary behavior changes - e.g. an
	SSD which accepts writes at line speed for a while and then
	completely stalls for multiple seconds.

	When "ctrl" is "auto", the parameters are controlled by the
	kernel and may change automatically.  Setting "ctrl" to "user"
	or setting any of the percentile and latency parameters puts
	it into "user" mode and disables the automatic changes.  The
	automatic mode can be restored by setting "ctrl" to "auto".
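	As a minimal illustration of the interface above, the following
	user-space sketch writes one QoS line to the root cgroup's
	"io.cost.qos" file.  The cgroup2 mount point, the 8:16 device
	number and the parameter values are assumptions made for the
	example, not part of this patch::

	  #include <fcntl.h>
	  #include <stdio.h>
	  #include <string.h>
	  #include <unistd.h>

	  int main(void)
	  {
		  /* Assumes cgroup2 is mounted at /sys/fs/cgroup. */
		  int fd = open("/sys/fs/cgroup/io.cost.qos", O_WRONLY);
		  if (fd < 0) {
			  perror("open io.cost.qos");
			  return 1;
		  }

		  /*
		   * Hypothetical device 8:16: enable the controller, pin it
		   * to "user" mode and declare the device saturated when the
		   * p95 read latency exceeds 75ms or the p95 write latency
		   * exceeds 150ms.
		   */
		  const char *cfg =
			  "8:16 enable=1 ctrl=user rpct=95.00 rlat=75000 "
			  "wpct=95.00 wlat=150000 min=50.00 max=150.00\n";

		  if (write(fd, cfg, strlen(cfg)) < 0)
			  perror("write io.cost.qos");

		  close(fd);
		  return 0;
	  }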
  io.cost.model
	A read-write nested-keyed file which exists only on the root
	cgroup.

	This file configures the cost model of the IO cost model based
	controller (CONFIG_BLK_CGROUP_IOCOST) which currently
	implements "io.weight" proportional control.  Lines are keyed
	by $MAJ:$MIN device numbers and not ordered.  The line for a
	given device is populated on the first write for the device on
	"io.cost.qos" or "io.cost.model".  The following nested keys
	are defined.

	  =====		================================
	  ctrl		"auto" or "user"
	  model		The cost model in use - "linear"
	  =====		================================

	When "ctrl" is "auto", the kernel may change all parameters
	dynamically.  When "ctrl" is set to "user" or any other
	parameters are written to, "ctrl" becomes "user" and the
	automatic changes are disabled.

	When "model" is "linear", the following model parameters are
	defined.

	  =============	========================================
	  [r|w]bps	The maximum sequential IO throughput
	  [r|w]seqiops	The maximum 4k sequential IOs per second
	  [r|w]randiops	The maximum 4k random IOs per second
	  =============	========================================

	From the above, the builtin linear model determines the base
	costs of a sequential and random IO and the cost coefficient
	for the IO size.  While simple, this model can cover most
	common device classes acceptably.

	The IO cost model isn't expected to be accurate in an absolute
	sense and is scaled to the device behavior dynamically.

	If needed, tools/cgroup/iocost_coef_gen.py can be used to
	generate device-specific coefficients.
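	The shape of such a linear model can be sketched in a few lines
	of user-space C: a fixed per-IO cost derived from the sequential
	or random IOPS limit, plus a size-proportional part derived from
	the throughput limit.  The structure, field names, scale factor
	and numbers below are illustrative assumptions, not the
	blk-iocost implementation::

	  #include <stdbool.h>
	  #include <stdint.h>
	  #include <stdio.h>

	  /* Illustrative linear-model parameters for one direction. */
	  struct linear_params {
		  uint64_t bps;		/* max sequential throughput, bytes/s */
		  uint64_t seqiops;	/* max 4k sequential IOs per second   */
		  uint64_t randiops;	/* max 4k random IOs per second       */
	  };

	  /* 1.0 == one second of device time, in made-up fixed point. */
	  #define COST_SCALE 1000000ULL

	  /*
	   * Cost of one IO: a fixed per-IO part (sequential or random)
	   * plus a size-proportional part.  Mirrors the shape of the
	   * model described above, not the kernel's actual arithmetic.
	   */
	  static uint64_t linear_io_cost(const struct linear_params *p,
					 uint64_t bytes, bool random)
	  {
		  uint64_t iops = random ? p->randiops : p->seqiops;
		  uint64_t per_io = COST_SCALE / iops;
		  uint64_t per_byte = COST_SCALE * bytes / p->bps;

		  return per_io + per_byte;
	  }

	  int main(void)
	  {
		  /* Made-up device parameters: 500 MB/s, 10k/5k IOPS. */
		  struct linear_params rd = {
			  .bps = 500000000, .seqiops = 10000, .randiops = 5000,
		  };

		  /* Cost of one random 8 KiB read under this model. */
		  printf("%llu\n",
			 (unsigned long long)linear_io_cost(&rd, 8192, true));
		  return 0;
	  }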
  io.weight
	A read-write flat-keyed file which exists on non-root cgroups.
	The default is "default 100".

@@ -1201,12 +1201,6 @@

			See comment before function elanfreq_setup() in
			arch/x86/kernel/cpu/cpufreq/elanfreq.c.

	elevator=	[IOSCHED]
			Format: { "mq-deadline" | "kyber" | "bfq" }
			See Documentation/block/deadline-iosched.rst,
			Documentation/block/kyber-iosched.rst and
			Documentation/block/bfq-iosched.rst for details.

	elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
			Specifies physical address of start of kernel core
			image elf header and optionally the size. Generally

@@ -274,9 +274,7 @@ To reduce its OS jitter, do any of the following:
		(based on an earlier one from Gilad Ben-Yossef) that
		reduces or even eliminates vmstat overhead for some
		workloads at https://lkml.org/lkml/2013/9/4/379.
	e.	Boot with "elevator=noop" to avoid workqueue use by
		the block layer.
	f.	If running on high-end powerpc servers, build with
	e.	If running on high-end powerpc servers, build with
		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
		daemon from running on each CPU every second or so.
		(This will require editing Kconfig files and will defeat
@@ -284,12 +282,12 @@ To reduce its OS jitter, do any of the following:
		due to the rtas_event_scan() function.
		WARNING:  Please check your CPU specifications to
		make sure that this is safe on your particular system.
	g.	If running on Cell Processor, build your kernel with
	f.	If running on Cell Processor, build your kernel with
		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
		spu_gov_work().
		WARNING:  Please check your CPU specifications to
		make sure that this is safe on your particular system.
	h.	If running on PowerMAC, build your kernel with
	g.	If running on PowerMAC, build your kernel with
		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
		avoiding OS jitter from rackmeter_do_timer().

@@ -1,19 +1,16 @@
.. SPDX-License-Identifier: GPL-2.0

========================
Null block device driver
========================

1. Overview
===========
Overview
========

The null block device (/dev/nullb*) is used for benchmarking the various
The null block device (``/dev/nullb*``) is used for benchmarking the various
block-layer implementations. It emulates a block device of X gigabytes in size.
The following instances are possible:

Single-queue block-layer

- Request-based.
- Single submission queue per device.
- Implements IO scheduling algorithms (CFQ, Deadline, noop).
It does not execute any read/write operation, just marks them as complete in
the request queue. The following instances are possible:

Multi-queue block-layer

@@ -27,15 +24,15 @@ The following instances are possible:

All of them have a completion queue for each core in the system.

2. Module parameters applicable for all instances
=================================================
Module parameters
=================

queue_mode=[0-2]: Default: 2-Multi-queue
  Selects which block-layer the module should instantiate with.

  =  ============
  0  Bio-based
  1  Single-queue
  1  Single-queue (deprecated)
  2  Multi-queue
  =  ============

@ -67,7 +64,7 @@ irqmode=[0-2]: Default: 1-Soft-irq
|
|||
completion_nsec=[ns]: Default: 10,000ns
|
||||
Combined with irqmode=2 (timer). The time each completion event must wait.
|
||||
|
||||
submit_queues=[1..nr_cpus]:
|
||||
submit_queues=[1..nr_cpus]: Default: 1
|
||||
The number of submission queues attached to the device driver. If unset, it
|
||||
defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
|
||||
parameter is 1.
|
||||
|
@ -75,9 +72,11 @@ submit_queues=[1..nr_cpus]:
|
|||
hw_queue_depth=[0..qdepth]: Default: 64
|
||||
The hardware queue depth of the device.
|
||||
|
||||
III: Multi-queue specific parameters
|
||||
Multi-queue specific parameters
|
||||
-------------------------------
|
||||
|
||||
use_per_node_hctx=[0/1]: Default: 0
|
||||
Number of hardware context queues.
|
||||
|
||||
= =====================================================================
|
||||
0 The number of submit queues are set to the value of the submit_queues
|
||||
|
@ -87,6 +86,7 @@ use_per_node_hctx=[0/1]: Default: 0
|
|||
= =====================================================================
|
||||
|
||||
no_sched=[0/1]: Default: 0
|
||||
Enable/disable the io scheduler.
|
||||
|
||||
= ======================================
|
||||
0 nullb* use default blk-mq io scheduler
|
||||
|
@ -94,6 +94,7 @@ no_sched=[0/1]: Default: 0
|
|||
= ======================================
|
||||
|
||||
blocking=[0/1]: Default: 0
|
||||
Blocking behavior of the request queue.
|
||||
|
||||
= ===============================================================
|
||||
0 Register as a non-blocking blk-mq driver device.
|
||||
|
@ -103,6 +104,7 @@ blocking=[0/1]: Default: 0
|
|||
= ===============================================================
|
||||
|
||||
shared_tags=[0/1]: Default: 0
|
||||
Sharing tags between devices.
|
||||
|
||||
= ================================================================
|
||||
0 Tag set is not shared.
|
||||
|
@ -111,6 +113,7 @@ shared_tags=[0/1]: Default: 0
|
|||
= ================================================================
|
||||
|
||||
zoned=[0/1]: Default: 0
|
||||
Device is a random-access or a zoned block device.
|
||||
|
||||
= ======================================================================
|
||||
0 Block device is exposed as a random-access block device.
|
||||
|
|
|
@@ -2,10 +2,6 @@
Switching Scheduler
===================

To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
'noop' and 'cfq' (the default) are also available. IO schedulers are assigned
globally at boot time only presently.

Each io queue has a set of io scheduler tunables associated with it. These
tunables control how the io scheduler works. You can find these entries
in::

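At runtime, the active scheduler for a device is selected through the
per-device "scheduler" queue attribute in sysfs.  A minimal user-space
sketch follows; the device name "sda" and the choice of "bfq" are
assumptions made for this example::

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
	  /* "sda" is just an example device; pick the disk you care about. */
	  int fd = open("/sys/block/sda/queue/scheduler", O_WRONLY);
	  if (fd < 0) {
		  perror("open scheduler attribute");
		  return 1;
	  }

	  /* Select BFQ for this device; reading the file back lists the
	   * available schedulers with the active one in brackets. */
	  if (write(fd, "bfq", strlen("bfq")) < 0)
		  perror("write scheduler attribute");

	  close(fd);
	  return 0;
  }
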
@@ -26,6 +26,9 @@ menuconfig BLOCK

if BLOCK

config BLK_RQ_ALLOC_TIME
	bool

config BLK_SCSI_REQUEST
	bool

@@ -132,6 +135,16 @@ config BLK_CGROUP_IOLATENCY

	  Note, this is an experimental interface and could be changed someday.

config BLK_CGROUP_IOCOST
	bool "Enable support for cost model based cgroup IO controller"
	depends on BLK_CGROUP=y
	select BLK_RQ_ALLOC_TIME
	---help---
	Enabling this option enables the .weight interface for cost
	model based proportional IO control.  The IO controller
	distributes IO capacity between different groups based on
	their share of the overall weight distribution.
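To illustrate what "share of the overall weight distribution" means in
practice, here is a hedged user-space sketch (not kernel code) that
computes the fraction of device capacity each sibling cgroup would
receive from its weight; the cgroup names and weight values are made up
for the example.

  #include <stdio.h>

  /*
   * Illustrative only: with weight-based proportional control, each
   * active sibling receives capacity proportional to
   * weight / sum(sibling weights).
   */
  int main(void)
  {
	  const char *name[] = { "A", "B", "C" };
	  unsigned int weight[] = { 100, 300, 600 };	/* io.weight values */
	  unsigned int i, sum = 0;

	  for (i = 0; i < 3; i++)
		  sum += weight[i];

	  for (i = 0; i < 3; i++)
		  printf("cgroup %s gets %.1f%% of the device\n",
			 name[i], 100.0 * weight[i] / sum);

	  return 0;
  }
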
config BLK_WBT_MQ
	bool "Multiqueue writeback throttling"
	default y

@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o

@ -501,11 +501,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd)
|
|||
kfree(cpd_to_bfqgd(cpd));
|
||||
}
|
||||
|
||||
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
|
||||
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
|
||||
struct blkcg *blkcg)
|
||||
{
|
||||
struct bfq_group *bfqg;
|
||||
|
||||
bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
|
||||
bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node);
|
||||
if (!bfqg)
|
||||
return NULL;
|
||||
|
||||
|
@ -904,7 +905,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
|
|||
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
|
||||
}
|
||||
|
||||
static int bfq_io_show_weight(struct seq_file *sf, void *v)
|
||||
static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
|
||||
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
|
||||
|
@ -918,6 +919,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static u64 bfqg_prfill_weight_device(struct seq_file *sf,
|
||||
struct blkg_policy_data *pd, int off)
|
||||
{
|
||||
struct bfq_group *bfqg = pd_to_bfqg(pd);
|
||||
|
||||
if (!bfqg->entity.dev_weight)
|
||||
return 0;
|
||||
return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight);
|
||||
}
|
||||
|
||||
static int bfq_io_show_weight(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
|
||||
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
|
||||
|
||||
seq_printf(sf, "default %u\n", bfqgd->weight);
|
||||
blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device,
|
||||
&blkcg_policy_bfq, 0, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight)
|
||||
{
|
||||
weight = dev_weight ?: weight;
|
||||
|
||||
bfqg->entity.dev_weight = dev_weight;
|
||||
/*
|
||||
* Setting the prio_changed flag of the entity
|
||||
* to 1 with new_weight == weight would re-set
|
||||
* the value of the weight to its ioprio mapping.
|
||||
* Set the flag only if necessary.
|
||||
*/
|
||||
if ((unsigned short)weight != bfqg->entity.new_weight) {
|
||||
bfqg->entity.new_weight = (unsigned short)weight;
|
||||
/*
|
||||
* Make sure that the above new value has been
|
||||
* stored in bfqg->entity.new_weight before
|
||||
* setting the prio_changed flag. In fact,
|
||||
* this flag may be read asynchronously (in
|
||||
* critical sections protected by a different
|
||||
* lock than that held here), and finding this
|
||||
* flag set may cause the execution of the code
|
||||
* for updating parameters whose value may
|
||||
* depend also on bfqg->entity.new_weight (in
|
||||
* __bfq_entity_update_weight_prio).
|
||||
* This barrier makes sure that the new value
|
||||
* of bfqg->entity.new_weight is correctly
|
||||
* seen in that code.
|
||||
*/
|
||||
smp_wmb();
|
||||
bfqg->entity.prio_changed = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
|
||||
struct cftype *cftype,
|
||||
u64 val)
|
||||
|
@ -936,53 +991,70 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
|
|||
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
|
||||
struct bfq_group *bfqg = blkg_to_bfqg(blkg);
|
||||
|
||||
if (!bfqg)
|
||||
continue;
|
||||
/*
|
||||
* Setting the prio_changed flag of the entity
|
||||
* to 1 with new_weight == weight would re-set
|
||||
* the value of the weight to its ioprio mapping.
|
||||
* Set the flag only if necessary.
|
||||
*/
|
||||
if ((unsigned short)val != bfqg->entity.new_weight) {
|
||||
bfqg->entity.new_weight = (unsigned short)val;
|
||||
/*
|
||||
* Make sure that the above new value has been
|
||||
* stored in bfqg->entity.new_weight before
|
||||
* setting the prio_changed flag. In fact,
|
||||
* this flag may be read asynchronously (in
|
||||
* critical sections protected by a different
|
||||
* lock than that held here), and finding this
|
||||
* flag set may cause the execution of the code
|
||||
* for updating parameters whose value may
|
||||
* depend also on bfqg->entity.new_weight (in
|
||||
* __bfq_entity_update_weight_prio).
|
||||
* This barrier makes sure that the new value
|
||||
* of bfqg->entity.new_weight is correctly
|
||||
* seen in that code.
|
||||
*/
|
||||
smp_wmb();
|
||||
bfqg->entity.prio_changed = 1;
|
||||
}
|
||||
if (bfqg)
|
||||
bfq_group_set_weight(bfqg, val, 0);
|
||||
}
|
||||
spin_unlock_irq(&blkcg->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
int ret;
|
||||
struct blkg_conf_ctx ctx;
|
||||
struct blkcg *blkcg = css_to_blkcg(of_css(of));
|
||||
struct bfq_group *bfqg;
|
||||
u64 v;
|
||||
|
||||
ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (sscanf(ctx.body, "%llu", &v) == 1) {
|
||||
/* require "default" on dfl */
|
||||
ret = -ERANGE;
|
||||
if (!v)
|
||||
goto out;
|
||||
} else if (!strcmp(strim(ctx.body), "default")) {
|
||||
v = 0;
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
bfqg = blkg_to_bfqg(ctx.blkg);
|
||||
|
||||
ret = -ERANGE;
|
||||
if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) {
|
||||
bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
|
||||
ret = 0;
|
||||
}
|
||||
out:
|
||||
blkg_conf_finish(&ctx);
|
||||
return ret ?: nbytes;
|
||||
}
|
||||
|
||||
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
u64 weight;
|
||||
/* First unsigned long found in the file is used */
|
||||
int ret = kstrtoull(strim(buf), 0, &weight);
|
||||
char *endp;
|
||||
int ret;
|
||||
u64 v;
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
buf = strim(buf);
|
||||
|
||||
ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
|
||||
return ret ?: nbytes;
|
||||
/* "WEIGHT" or "default WEIGHT" sets the default weight */
|
||||
v = simple_strtoull(buf, &endp, 0);
|
||||
if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
|
||||
ret = bfq_io_set_weight_legacy(of_css(of), NULL, v);
|
||||
return ret ?: nbytes;
|
||||
}
|
||||
|
||||
return bfq_io_set_device_weight(of, buf, nbytes, off);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BFQ_CGROUP_DEBUG
|
||||
|
@ -1141,9 +1213,15 @@ struct cftype bfq_blkcg_legacy_files[] = {
|
|||
{
|
||||
.name = "bfq.weight",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.seq_show = bfq_io_show_weight,
|
||||
.seq_show = bfq_io_show_weight_legacy,
|
||||
.write_u64 = bfq_io_set_weight_legacy,
|
||||
},
|
||||
{
|
||||
.name = "bfq.weight_device",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.seq_show = bfq_io_show_weight,
|
||||
.write = bfq_io_set_weight,
|
||||
},
|
||||
|
||||
/* statistics, covers only the tasks in the bfqg */
|
||||
{
|
||||
|
|
|
@ -168,6 +168,9 @@ struct bfq_entity {
|
|||
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
|
||||
int budget;
|
||||
|
||||
/* device weight, if non-zero, it overrides the default weight of
|
||||
* bfq_group_data */
|
||||
int dev_weight;
|
||||
/* weight of the queue */
|
||||
int weight;
|
||||
/* next weight if a change is in progress */
|
||||
|
|
|
@ -744,6 +744,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Matches the smp_wmb() in bfq_group_set_weight. */
|
||||
smp_rmb();
|
||||
old_st->wsum -= entity->weight;
|
||||
|
||||
if (entity->new_weight != entity->orig_weight) {
|
||||
|
|
60
block/bio.c
60
block/bio.c
|
@ -646,25 +646,20 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the @page can be added to the current segment(@bv), and make
|
||||
* sure to call it only if page_is_mergeable(@bv, @page) is true
|
||||
*/
|
||||
static bool can_add_page_to_seg(struct request_queue *q,
|
||||
struct bio_vec *bv, struct page *page, unsigned len,
|
||||
unsigned offset)
|
||||
static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned len, unsigned offset,
|
||||
bool *same_page)
|
||||
{
|
||||
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
unsigned long mask = queue_segment_boundary(q);
|
||||
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
|
||||
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
|
||||
|
||||
if ((addr1 | mask) != (addr2 | mask))
|
||||
return false;
|
||||
|
||||
if (bv->bv_len + len > queue_max_segment_size(q))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
return __bio_try_merge_page(bio, page, len, offset, same_page);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -674,7 +669,7 @@ static bool can_add_page_to_seg(struct request_queue *q,
|
|||
* @page: page to add
|
||||
* @len: vec entry length
|
||||
* @offset: vec entry offset
|
||||
* @put_same_page: put the page if it is same with last added page
|
||||
* @same_page: return if the merge happen inside the same page
|
||||
*
|
||||
* Attempt to add a page to the bio_vec maplist. This can fail for a
|
||||
* number of reasons, such as the bio being full or target block device
|
||||
|
@ -685,10 +680,9 @@ static bool can_add_page_to_seg(struct request_queue *q,
|
|||
*/
|
||||
static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
bool put_same_page)
|
||||
bool *same_page)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
bool same_page = false;
|
||||
|
||||
/*
|
||||
* cloned bio must not modify vec list
|
||||
|
@ -700,28 +694,16 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
|||
return 0;
|
||||
|
||||
if (bio->bi_vcnt > 0) {
|
||||
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
|
||||
if (page == bvec->bv_page &&
|
||||
offset == bvec->bv_offset + bvec->bv_len) {
|
||||
if (put_same_page)
|
||||
put_page(page);
|
||||
bvec->bv_len += len;
|
||||
goto done;
|
||||
}
|
||||
if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page))
|
||||
return len;
|
||||
|
||||
/*
|
||||
* If the queue doesn't support SG gaps and adding this
|
||||
* offset would create a gap, disallow it.
|
||||
* If the queue doesn't support SG gaps and adding this segment
|
||||
* would create a gap, disallow it.
|
||||
*/
|
||||
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
if (bvec_gap_to_prev(q, bvec, offset))
|
||||
return 0;
|
||||
|
||||
if (page_is_mergeable(bvec, page, len, offset, &same_page) &&
|
||||
can_add_page_to_seg(q, bvec, page, len, offset)) {
|
||||
bvec->bv_len += len;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
if (bio_full(bio, len))
|
||||
|
@ -735,7 +717,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
|||
bvec->bv_len = len;
|
||||
bvec->bv_offset = offset;
|
||||
bio->bi_vcnt++;
|
||||
done:
|
||||
bio->bi_iter.bi_size += len;
|
||||
return len;
|
||||
}
|
||||
|
@ -743,7 +724,8 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
|||
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset)
|
||||
{
|
||||
return __bio_add_pc_page(q, bio, page, len, offset, false);
|
||||
bool same_page = false;
|
||||
return __bio_add_pc_page(q, bio, page, len, offset, &same_page);
|
||||
}
|
||||
EXPORT_SYMBOL(bio_add_pc_page);
|
||||
|
||||
|
@ -806,6 +788,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
|
|||
|
||||
bio->bi_iter.bi_size += len;
|
||||
bio->bi_vcnt++;
|
||||
|
||||
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
|
||||
bio_set_flag(bio, BIO_WORKINGSET);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__bio_add_page);
|
||||
|
||||
|
@ -1384,13 +1369,17 @@ struct bio *bio_map_user_iov(struct request_queue *q,
|
|||
for (j = 0; j < npages; j++) {
|
||||
struct page *page = pages[j];
|
||||
unsigned int n = PAGE_SIZE - offs;
|
||||
bool same_page = false;
|
||||
|
||||
if (n > bytes)
|
||||
n = bytes;
|
||||
|
||||
if (!__bio_add_pc_page(q, bio, page, n, offs,
|
||||
true))
|
||||
&same_page)) {
|
||||
if (same_page)
|
||||
put_page(page);
|
||||
break;
|
||||
}
|
||||
|
||||
added += n;
|
||||
bytes -= n;
|
||||
|
@ -1521,7 +1510,6 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
|
|||
bio->bi_end_io = bio_map_kern_endio;
|
||||
return bio;
|
||||
}
|
||||
EXPORT_SYMBOL(bio_map_kern);
|
||||
|
||||
static void bio_copy_kern_endio(struct bio *bio)
|
||||
{
|
||||
|
@ -1842,8 +1830,8 @@ EXPORT_SYMBOL(bio_endio);
|
|||
* @bio, and updates @bio to represent the remaining sectors.
|
||||
*
|
||||
* Unless this is a discard request the newly allocated bio will point
|
||||
* to @bio's bi_io_vec; it is the caller's responsibility to ensure that
|
||||
* @bio is not freed before the split.
|
||||
* to @bio's bi_io_vec. It is the caller's responsibility to ensure that
|
||||
* neither @bio nor @bs are freed before the split bio.
|
||||
*/
|
||||
struct bio *bio_split(struct bio *bio, int sectors,
|
||||
gfp_t gfp, struct bio_set *bs)
|
||||
|
|
|
@ -175,7 +175,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
|
|||
continue;
|
||||
|
||||
/* alloc per-policy data and attach it to blkg */
|
||||
pd = pol->pd_alloc_fn(gfp_mask, q->node);
|
||||
pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
|
||||
if (!pd)
|
||||
goto err_free;
|
||||
|
||||
|
@ -753,6 +753,44 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
|
|||
return __blkg_lookup(blkcg, q, true /* update_hint */);
|
||||
}
|
||||
|
||||
/**
|
||||
* blkcg_conf_get_disk - parse MAJ:MIN and return the matching gendisk
|
||||
* @inputp: input string pointer
|
||||
*
|
||||
* Parse the device node prefix part, MAJ:MIN, of per-blkg config update
|
||||
* from @input and get and return the matching gendisk. *@inputp is
|
||||
* updated to point past the device node prefix. Returns an ERR_PTR()
|
||||
* value on error.
|
||||
*
|
||||
* Use this function iff blkg_conf_prep() can't be used for some reason.
|
||||
*/
|
||||
struct gendisk *blkcg_conf_get_disk(char **inputp)
|
||||
{
|
||||
char *input = *inputp;
|
||||
unsigned int major, minor;
|
||||
struct gendisk *disk;
|
||||
int key_len, part;
|
||||
|
||||
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
input += key_len;
|
||||
if (!isspace(*input))
|
||||
return ERR_PTR(-EINVAL);
|
||||
input = skip_spaces(input);
|
||||
|
||||
disk = get_gendisk(MKDEV(major, minor), &part);
|
||||
if (!disk)
|
||||
return ERR_PTR(-ENODEV);
|
||||
if (part) {
|
||||
put_disk_and_module(disk);
|
||||
return ERR_PTR(-ENODEV);
|
||||
}
|
||||
|
||||
*inputp = input;
|
||||
return disk;
|
||||
}
|
||||
|
||||
/**
|
||||
* blkg_conf_prep - parse and prepare for per-blkg config update
|
||||
* @blkcg: target block cgroup
|
||||
|
@ -772,25 +810,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
|||
struct gendisk *disk;
|
||||
struct request_queue *q;
|
||||
struct blkcg_gq *blkg;
|
||||
unsigned int major, minor;
|
||||
int key_len, part, ret;
|
||||
char *body;
|
||||
int ret;
|
||||
|
||||
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
|
||||
return -EINVAL;
|
||||
|
||||
body = input + key_len;
|
||||
if (!isspace(*body))
|
||||
return -EINVAL;
|
||||
body = skip_spaces(body);
|
||||
|
||||
disk = get_gendisk(MKDEV(major, minor), &part);
|
||||
if (!disk)
|
||||
return -ENODEV;
|
||||
if (part) {
|
||||
ret = -ENODEV;
|
||||
goto fail;
|
||||
}
|
||||
disk = blkcg_conf_get_disk(&input);
|
||||
if (IS_ERR(disk))
|
||||
return PTR_ERR(disk);
|
||||
|
||||
q = disk->queue;
|
||||
|
||||
|
@ -856,7 +880,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
|||
success:
|
||||
ctx->disk = disk;
|
||||
ctx->blkg = blkg;
|
||||
ctx->body = body;
|
||||
ctx->body = input;
|
||||
return 0;
|
||||
|
||||
fail_unlock:
|
||||
|
@ -876,6 +900,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
|||
}
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkg_conf_prep);
|
||||
|
||||
/**
|
||||
* blkg_conf_finish - finish up per-blkg config update
|
||||
|
@ -891,6 +916,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
|
|||
rcu_read_unlock();
|
||||
put_disk_and_module(ctx->disk);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkg_conf_finish);
|
||||
|
||||
static int blkcg_print_stat(struct seq_file *sf, void *v)
|
||||
{
|
||||
|
@ -1346,7 +1372,7 @@ int blkcg_activate_policy(struct request_queue *q,
|
|||
blk_mq_freeze_queue(q);
|
||||
pd_prealloc:
|
||||
if (!pd_prealloc) {
|
||||
pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
|
||||
pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root);
|
||||
if (!pd_prealloc) {
|
||||
ret = -ENOMEM;
|
||||
goto out_bypass_end;
|
||||
|
@ -1362,7 +1388,7 @@ int blkcg_activate_policy(struct request_queue *q,
|
|||
if (blkg->pd[pol->plid])
|
||||
continue;
|
||||
|
||||
pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
|
||||
pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root);
|
||||
if (!pd)
|
||||
swap(pd, pd_prealloc);
|
||||
if (!pd) {
|
||||
|
@ -1475,7 +1501,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
|
|||
blkcg->cpd[pol->plid] = cpd;
|
||||
cpd->blkcg = blkcg;
|
||||
cpd->plid = pol->plid;
|
||||
pol->cpd_init_fn(cpd);
|
||||
if (pol->cpd_init_fn)
|
||||
pol->cpd_init_fn(cpd);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/psi.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/block.h>
|
||||
|
@ -129,6 +130,7 @@ static const char *const blk_op_name[] = {
|
|||
REQ_OP_NAME(DISCARD),
|
||||
REQ_OP_NAME(SECURE_ERASE),
|
||||
REQ_OP_NAME(ZONE_RESET),
|
||||
REQ_OP_NAME(ZONE_RESET_ALL),
|
||||
REQ_OP_NAME(WRITE_SAME),
|
||||
REQ_OP_NAME(WRITE_ZEROES),
|
||||
REQ_OP_NAME(SCSI_IN),
|
||||
|
@ -344,7 +346,8 @@ void blk_cleanup_queue(struct request_queue *q)
|
|||
|
||||
/*
|
||||
* Drain all requests queued before DYING marking. Set DEAD flag to
|
||||
* prevent that q->request_fn() gets invoked after draining finished.
|
||||
* prevent that blk_mq_run_hw_queues() accesses the hardware queues
|
||||
* after draining finished.
|
||||
*/
|
||||
blk_freeze_queue(q);
|
||||
|
||||
|
@ -479,7 +482,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
|
|||
if (!q)
|
||||
return NULL;
|
||||
|
||||
INIT_LIST_HEAD(&q->queue_head);
|
||||
q->last_merge = NULL;
|
||||
|
||||
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
|
||||
|
@ -518,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
|
|||
mutex_init(&q->blk_trace_mutex);
|
||||
#endif
|
||||
mutex_init(&q->sysfs_lock);
|
||||
mutex_init(&q->sysfs_dir_lock);
|
||||
spin_lock_init(&q->queue_lock);
|
||||
|
||||
init_waitqueue_head(&q->mq_freeze_wq);
|
||||
|
@ -601,6 +604,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio,
|
|||
return false;
|
||||
|
||||
trace_block_bio_backmerge(req->q, req, bio);
|
||||
rq_qos_merge(req->q, req, bio);
|
||||
|
||||
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
|
||||
blk_rq_set_mixed_merge(req);
|
||||
|
@ -622,6 +626,7 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio,
|
|||
return false;
|
||||
|
||||
trace_block_bio_frontmerge(req->q, req, bio);
|
||||
rq_qos_merge(req->q, req, bio);
|
||||
|
||||
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
|
||||
blk_rq_set_mixed_merge(req);
|
||||
|
@ -647,6 +652,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
|
|||
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
|
||||
goto no_merge;
|
||||
|
||||
rq_qos_merge(q, req, bio);
|
||||
|
||||
req->biotail->bi_next = bio;
|
||||
req->biotail = bio;
|
||||
req->__data_len += bio->bi_iter.bi_size;
|
||||
|
@ -931,6 +938,10 @@ generic_make_request_checks(struct bio *bio)
|
|||
if (!blk_queue_is_zoned(q))
|
||||
goto not_supported;
|
||||
break;
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
|
||||
goto not_supported;
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
if (!q->limits.max_write_zeroes_sectors)
|
||||
goto not_supported;
|
||||
|
@ -1128,6 +1139,10 @@ EXPORT_SYMBOL_GPL(direct_make_request);
|
|||
*/
|
||||
blk_qc_t submit_bio(struct bio *bio)
|
||||
{
|
||||
bool workingset_read = false;
|
||||
unsigned long pflags;
|
||||
blk_qc_t ret;
|
||||
|
||||
if (blkcg_punt_bio_submit(bio))
|
||||
return BLK_QC_T_NONE;
|
||||
|
||||
|
@ -1146,6 +1161,8 @@ blk_qc_t submit_bio(struct bio *bio)
|
|||
if (op_is_write(bio_op(bio))) {
|
||||
count_vm_events(PGPGOUT, count);
|
||||
} else {
|
||||
if (bio_flagged(bio, BIO_WORKINGSET))
|
||||
workingset_read = true;
|
||||
task_io_account_read(bio->bi_iter.bi_size);
|
||||
count_vm_events(PGPGIN, count);
|
||||
}
|
||||
|
@ -1160,7 +1177,21 @@ blk_qc_t submit_bio(struct bio *bio)
|
|||
}
|
||||
}
|
||||
|
||||
return generic_make_request(bio);
|
||||
/*
|
||||
* If we're reading data that is part of the userspace
|
||||
* workingset, count submission time as memory stall. When the
|
||||
* device is congested, or the submitting cgroup IO-throttled,
|
||||
* submission can be a significant part of overall IO time.
|
||||
*/
|
||||
if (workingset_read)
|
||||
psi_memstall_enter(&pflags);
|
||||
|
||||
ret = generic_make_request(bio);
|
||||
|
||||
if (workingset_read)
|
||||
psi_memstall_leave(&pflags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(submit_bio);
|
||||
|
||||
|
|
File diff suppressed because it is too large
@ -725,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q)
|
|||
return -ENOMEM;
|
||||
|
||||
rqos = &blkiolat->rqos;
|
||||
rqos->id = RQ_QOS_CGROUP;
|
||||
rqos->id = RQ_QOS_LATENCY;
|
||||
rqos->ops = &blkcg_iolatency_ops;
|
||||
rqos->q = q;
|
||||
|
||||
|
@ -934,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
|
|||
}
|
||||
|
||||
|
||||
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
|
||||
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
|
||||
struct request_queue *q,
|
||||
struct blkcg *blkcg)
|
||||
{
|
||||
struct iolatency_grp *iolat;
|
||||
|
||||
iolat = kzalloc_node(sizeof(*iolat), gfp, node);
|
||||
iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
|
||||
if (!iolat)
|
||||
return NULL;
|
||||
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
|
||||
|
|
|
@ -132,19 +132,32 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
|
|||
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the maximum number of sectors from the start of a bio that may be
|
||||
* submitted as a single request to a block device. If enough sectors remain,
|
||||
* align the end to the physical block size. Otherwise align the end to the
|
||||
* logical block size. This approach minimizes the number of non-aligned
|
||||
* requests that are submitted to a block device if the start of a bio is not
|
||||
* aligned to a physical block boundary.
|
||||
*/
|
||||
static inline unsigned get_max_io_size(struct request_queue *q,
|
||||
struct bio *bio)
|
||||
{
|
||||
unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
|
||||
unsigned mask = queue_logical_block_size(q) - 1;
|
||||
unsigned max_sectors = sectors;
|
||||
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
|
||||
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
|
||||
unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
|
||||
|
||||
/* aligned to logical block size */
|
||||
sectors &= ~(mask >> 9);
|
||||
max_sectors += start_offset;
|
||||
max_sectors &= ~(pbs - 1);
|
||||
if (max_sectors > start_offset)
|
||||
return max_sectors - start_offset;
|
||||
|
||||
return sectors;
|
||||
return sectors & ~(lbs - 1);
|
||||
}
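/*
 * Illustration only (not part of the patch): a standalone, hedged
 * re-implementation of the end-alignment rule documented above, with
 * made-up example numbers.  It mirrors the shape of get_max_io_size()
 * but is not the kernel function itself.
 */
#include <stdio.h>

static unsigned max_io_sectors(unsigned limit_sectors,
			       unsigned long long start_sector,
			       unsigned pbs_sectors, unsigned lbs_sectors)
{
	unsigned max_sectors = limit_sectors;
	unsigned start_offset = start_sector & (pbs_sectors - 1);

	max_sectors += start_offset;
	max_sectors &= ~(pbs_sectors - 1);	/* align the end to the physical block size */
	if (max_sectors > start_offset)
		return max_sectors - start_offset;

	return limit_sectors & ~(lbs_sectors - 1);	/* otherwise align to the logical block size */
}

int main(void)
{
	/*
	 * 4 KiB physical blocks (8 sectors), 512 B logical blocks (1 sector),
	 * a 255-sector queue limit, and a bio starting 3 sectors into a
	 * physical block: 253 sectors keeps the end 4 KiB aligned.
	 */
	printf("%u\n", max_io_sectors(255, 3, 8, 1));
	return 0;
}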
|
||||
|
||||
static unsigned get_max_segment_size(struct request_queue *q,
|
||||
static unsigned get_max_segment_size(const struct request_queue *q,
|
||||
unsigned offset)
|
||||
{
|
||||
unsigned long mask = queue_segment_boundary(q);
|
||||
|
@ -157,26 +170,41 @@ static unsigned get_max_segment_size(struct request_queue *q,
|
|||
queue_max_segment_size(q));
|
||||
}
|
||||
|
||||
/*
|
||||
* Split the bvec @bv into segments, and update all kinds of
|
||||
* variables.
|
||||
/**
|
||||
* bvec_split_segs - verify whether or not a bvec should be split in the middle
|
||||
* @q: [in] request queue associated with the bio associated with @bv
|
||||
* @bv: [in] bvec to examine
|
||||
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
|
||||
* by the number of segments from @bv that may be appended to that
|
||||
* bio without exceeding @max_segs
|
||||
* @sectors: [in,out] Number of sectors in the bio being built. Incremented
|
||||
* by the number of sectors from @bv that may be appended to that
|
||||
* bio without exceeding @max_sectors
|
||||
* @max_segs: [in] upper bound for *@nsegs
|
||||
* @max_sectors: [in] upper bound for *@sectors
|
||||
*
|
||||
* When splitting a bio, it can happen that a bvec is encountered that is too
|
||||
* big to fit in a single segment and hence that it has to be split in the
|
||||
* middle. This function verifies whether or not that should happen. The value
|
||||
* %true is returned if and only if appending the entire @bv to a bio with
|
||||
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
|
||||
* the block driver.
|
||||
*/
|
||||
static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
|
||||
unsigned *nsegs, unsigned *sectors, unsigned max_segs)
|
||||
static bool bvec_split_segs(const struct request_queue *q,
|
||||
const struct bio_vec *bv, unsigned *nsegs,
|
||||
unsigned *sectors, unsigned max_segs,
|
||||
unsigned max_sectors)
|
||||
{
|
||||
unsigned len = bv->bv_len;
|
||||
unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
|
||||
unsigned len = min(bv->bv_len, max_len);
|
||||
unsigned total_len = 0;
|
||||
unsigned new_nsegs = 0, seg_size = 0;
|
||||
unsigned seg_size = 0;
|
||||
|
||||
/*
|
||||
* Multi-page bvec may be too big to hold in one segment, so the
|
||||
* current bvec has to be split into multiple segments.
|
||||
*/
|
||||
while (len && new_nsegs + *nsegs < max_segs) {
|
||||
while (len && *nsegs < max_segs) {
|
||||
seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
|
||||
seg_size = min(seg_size, len);
|
||||
|
||||
new_nsegs++;
|
||||
(*nsegs)++;
|
||||
total_len += seg_size;
|
||||
len -= seg_size;
|
||||
|
||||
|
@ -184,16 +212,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
|
|||
break;
|
||||
}
|
||||
|
||||
if (new_nsegs) {
|
||||
*nsegs += new_nsegs;
|
||||
if (sectors)
|
||||
*sectors += total_len >> 9;
|
||||
}
|
||||
*sectors += total_len >> 9;
|
||||
|
||||
/* split in the middle of the bvec if len != 0 */
|
||||
return !!len;
|
||||
/* tell the caller to split the bvec if it is too big to fit */
|
||||
return len > 0 || bv->bv_len > max_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_bio_segment_split - split a bio in two bios
|
||||
* @q: [in] request queue pointer
|
||||
* @bio: [in] bio to be split
|
||||
* @bs: [in] bio set to allocate the clone from
|
||||
* @segs: [out] number of segments in the bio with the first half of the sectors
|
||||
*
|
||||
* Clone @bio, update the bi_iter of the clone to represent the first sectors
|
||||
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
|
||||
* following is guaranteed for the cloned bio:
|
||||
* - That it has at most get_max_io_size(@q, @bio) sectors.
|
||||
* - That it has at most queue_max_segments(@q) segments.
|
||||
*
|
||||
* Except for discard requests the cloned bio will point at the bi_io_vec of
|
||||
* the original bio. It is the responsibility of the caller to ensure that the
|
||||
* original bio is not freed before the cloned bio. The caller is also
|
||||
* responsible for ensuring that @bs is only destroyed after processing of the
|
||||
* split bio has finished.
|
||||
*/
|
||||
static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
struct bio_set *bs,
|
||||
|
@ -213,34 +256,18 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
|||
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
|
||||
goto split;
|
||||
|
||||
if (sectors + (bv.bv_len >> 9) > max_sectors) {
|
||||
/*
|
||||
* Consider this a new segment if we're splitting in
|
||||
* the middle of this vector.
|
||||
*/
|
||||
if (nsegs < max_segs &&
|
||||
sectors < max_sectors) {
|
||||
/* split in the middle of bvec */
|
||||
bv.bv_len = (max_sectors - sectors) << 9;
|
||||
bvec_split_segs(q, &bv, &nsegs,
|
||||
&sectors, max_segs);
|
||||
}
|
||||
if (nsegs < max_segs &&
|
||||
sectors + (bv.bv_len >> 9) <= max_sectors &&
|
||||
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
|
||||
nsegs++;
|
||||
sectors += bv.bv_len >> 9;
|
||||
} else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
|
||||
max_sectors)) {
|
||||
goto split;
|
||||
}
|
||||
|
||||
if (nsegs == max_segs)
|
||||
goto split;
|
||||
|
||||
bvprv = bv;
|
||||
bvprvp = &bvprv;
|
||||
|
||||
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
|
||||
nsegs++;
|
||||
sectors += bv.bv_len >> 9;
|
||||
} else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
|
||||
max_segs)) {
|
||||
goto split;
|
||||
}
|
||||
}
|
||||
|
||||
*segs = nsegs;
|
||||
|
@ -250,6 +277,19 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
|||
return bio_split(bio, sectors, GFP_NOIO, bs);
|
||||
}
|
||||
|
||||
/**
|
||||
* __blk_queue_split - split a bio and submit the second half
|
||||
* @q: [in] request queue pointer
|
||||
* @bio: [in, out] bio to be split
|
||||
* @nr_segs: [out] number of segments in the first bio
|
||||
*
|
||||
* Split a bio into two bios, chain the two bios, submit the second half and
|
||||
* store a pointer to the first half in *@bio. If the second bio is still too
|
||||
* big it will be split by a recursive call to this function. Since this
|
||||
* function may allocate a new bio from @q->bio_split, it is the responsibility
|
||||
* of the caller to ensure that @q is only released after processing of the
|
||||
* split bio has finished.
|
||||
*/
|
||||
void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
unsigned int *nr_segs)
|
||||
{
|
||||
|
@ -294,6 +334,17 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_queue_split - split a bio and submit the second half
|
||||
* @q: [in] request queue pointer
|
||||
* @bio: [in, out] bio to be split
|
||||
*
|
||||
* Split a bio into two bios, chains the two bios, submit the second half and
|
||||
* store a pointer to the first half in *@bio. Since this function may allocate
|
||||
* a new bio from @q->bio_split, it is the responsibility of the caller to
|
||||
* ensure that @q is only released after processing of the split bio has
|
||||
* finished.
|
||||
*/
|
||||
void blk_queue_split(struct request_queue *q, struct bio **bio)
|
||||
{
|
||||
unsigned int nr_segs;
|
||||
|
@ -305,6 +356,7 @@ EXPORT_SYMBOL(blk_queue_split);
|
|||
unsigned int blk_recalc_rq_segments(struct request *rq)
|
||||
{
|
||||
unsigned int nr_phys_segs = 0;
|
||||
unsigned int nr_sectors = 0;
|
||||
struct req_iterator iter;
|
||||
struct bio_vec bv;
|
||||
|
||||
|
@ -321,7 +373,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
|
|||
}
|
||||
|
||||
rq_for_each_bvec(bv, rq, iter)
|
||||
bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX);
|
||||
bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
|
||||
UINT_MAX, UINT_MAX);
|
||||
return nr_phys_segs;
|
||||
}
|
||||
|
||||
|
|
|
@ -15,10 +15,10 @@
|
|||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
|
||||
static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
|
||||
unsigned int nr_queues, const int cpu)
|
||||
static int queue_index(struct blk_mq_queue_map *qmap,
|
||||
unsigned int nr_queues, const int q)
|
||||
{
|
||||
return qmap->queue_offset + (cpu % nr_queues);
|
||||
return qmap->queue_offset + (q % nr_queues);
|
||||
}
|
||||
|
||||
static int get_first_sibling(unsigned int cpu)
|
||||
|
@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
|
|||
{
|
||||
unsigned int *map = qmap->mq_map;
|
||||
unsigned int nr_queues = qmap->nr_queues;
|
||||
unsigned int cpu, first_sibling;
|
||||
unsigned int cpu, first_sibling, q = 0;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
map[cpu] = -1;
|
||||
|
||||
/*
|
||||
* Spread queues among present CPUs first for minimizing
|
||||
* count of dead queues which are mapped by all un-present CPUs
|
||||
*/
|
||||
for_each_present_cpu(cpu) {
|
||||
if (q >= nr_queues)
|
||||
break;
|
||||
map[cpu] = queue_index(qmap, nr_queues, q++);
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
if (map[cpu] != -1)
|
||||
continue;
|
||||
/*
|
||||
* First do sequential mapping between CPUs and queues.
|
||||
* In case we still have CPUs to map, and we have some number of
|
||||
* threads per cores then map sibling threads to the same queue
|
||||
* for performance optimizations.
|
||||
*/
|
||||
if (cpu < nr_queues) {
|
||||
map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
|
||||
if (q < nr_queues) {
|
||||
map[cpu] = queue_index(qmap, nr_queues, q++);
|
||||
} else {
|
||||
first_sibling = get_first_sibling(cpu);
|
||||
if (first_sibling == cpu)
|
||||
map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
|
||||
map[cpu] = queue_index(qmap, nr_queues, q++);
|
||||
else
|
||||
map[cpu] = map[first_sibling];
|
||||
}
|
||||
|
|
|
@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
|||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
lockdep_assert_held(&q->sysfs_dir_lock);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
blk_mq_unregister_hctx(hctx);
|
||||
|
@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
|
|||
int ret, i;
|
||||
|
||||
WARN_ON_ONCE(!q->kobj.parent);
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
lockdep_assert_held(&q->sysfs_dir_lock);
|
||||
|
||||
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
|
||||
if (ret < 0)
|
||||
|
@ -349,23 +349,12 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
|
|||
return ret;
|
||||
}
|
||||
|
||||
int blk_mq_register_dev(struct device *dev, struct request_queue *q)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
ret = __blk_mq_register_dev(dev, q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void blk_mq_sysfs_unregister(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
mutex_lock(&q->sysfs_dir_lock);
|
||||
if (!q->mq_sysfs_init_done)
|
||||
goto unlock;
|
||||
|
||||
|
@ -373,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
|
|||
blk_mq_unregister_hctx(hctx);
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
}
|
||||
|
||||
int blk_mq_sysfs_register(struct request_queue *q)
|
||||
|
@ -381,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
|
|||
struct blk_mq_hw_ctx *hctx;
|
||||
int i, ret = 0;
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
mutex_lock(&q->sysfs_dir_lock);
|
||||
if (!q->mq_sysfs_init_done)
|
||||
goto unlock;
|
||||
|
||||
|
@ -392,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
|
|||
}
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include <linux/module.h>
|
||||
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/delay.h>
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-tag.h"
|
||||
|
@ -354,6 +355,37 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
|
|||
}
|
||||
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
|
||||
|
||||
static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
|
||||
void *data, bool reserved)
|
||||
{
|
||||
unsigned *count = data;
|
||||
|
||||
if (blk_mq_request_completed(rq))
|
||||
(*count)++;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_mq_tagset_wait_completed_request - wait until all completed req's
|
||||
* completion function has run
|
||||
* @tagset: Tag set to drain completed request
|
||||
*
|
||||
* Note: This function has to be run after all IO queues are shutdown
|
||||
*/
|
||||
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
|
||||
{
|
||||
while (true) {
|
||||
unsigned count = 0;
|
||||
|
||||
blk_mq_tagset_busy_iter(tagset,
|
||||
blk_mq_tagset_count_completed_rqs, &count);
|
||||
if (!count)
|
||||
break;
|
||||
msleep(5);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
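/*
 * Illustration only (not part of the patch): a minimal sketch of how a
 * blk-mq driver's teardown path might use the new helper.  struct my_ctrl,
 * my_cancel_request() and my_driver_teardown() are made-up names; the
 * driver-specific way of failing each request is left as a comment.
 */
#include <linux/blk-mq.h>

struct my_ctrl {
	struct request_queue *queue;
	struct blk_mq_tag_set tagset;
};

static bool my_cancel_request(struct request *rq, void *data, bool reserved)
{
	/* Driver-specific: record an error for @rq and complete it, e.g.
	 * via blk_mq_complete_request(rq), so it reaches MQ_RQ_COMPLETE. */
	return true;
}

static void my_driver_teardown(struct my_ctrl *ctrl)
{
	blk_mq_quiesce_queue(ctrl->queue);

	/* Fail every request still held by the (now dead) hardware ... */
	blk_mq_tagset_busy_iter(&ctrl->tagset, my_cancel_request, ctrl);

	/* ... and wait until all of their completion handlers have run
	 * before tearing down tags, DMA buffers, etc. */
	blk_mq_tagset_wait_completed_request(&ctrl->tagset);
}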
|
||||
|
||||
/**
|
||||
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
|
||||
* @q: Request queue to examine.
|
||||
|
|
|
@ -44,12 +44,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
|
|||
|
||||
static int blk_mq_poll_stats_bkt(const struct request *rq)
|
||||
{
|
||||
int ddir, bytes, bucket;
|
||||
int ddir, sectors, bucket;
|
||||
|
||||
ddir = rq_data_dir(rq);
|
||||
bytes = blk_rq_bytes(rq);
|
||||
sectors = blk_rq_stats_sectors(rq);
|
||||
|
||||
bucket = ddir + 2*(ilog2(bytes) - 9);
|
||||
bucket = ddir + 2 * ilog2(sectors);
|
||||
|
||||
if (bucket < 0)
|
||||
return -1;
|
||||
|
@ -282,16 +282,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
|
|||
EXPORT_SYMBOL(blk_mq_can_queue);
|
||||
|
||||
/*
|
||||
* Only need start/end time stamping if we have stats enabled, or using
|
||||
* an IO scheduler.
|
||||
* Only need start/end time stamping if we have iostat or
|
||||
* blk stats enabled, or using an IO scheduler.
|
||||
*/
|
||||
static inline bool blk_mq_need_time_stamp(struct request *rq)
|
||||
{
|
||||
return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
|
||||
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
|
||||
}
|
||||
|
||||
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
|
||||
unsigned int tag, unsigned int op)
|
||||
unsigned int tag, unsigned int op, u64 alloc_time_ns)
|
||||
{
|
||||
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
|
||||
struct request *rq = tags->static_rqs[tag];
|
||||
|
@ -325,11 +325,15 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
|
|||
RB_CLEAR_NODE(&rq->rb_node);
|
||||
rq->rq_disk = NULL;
|
||||
rq->part = NULL;
|
||||
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
|
||||
rq->alloc_time_ns = alloc_time_ns;
|
||||
#endif
|
||||
if (blk_mq_need_time_stamp(rq))
|
||||
rq->start_time_ns = ktime_get_ns();
|
||||
else
|
||||
rq->start_time_ns = 0;
|
||||
rq->io_start_time_ns = 0;
|
||||
rq->stats_sectors = 0;
|
||||
rq->nr_phys_segments = 0;
|
||||
#if defined(CONFIG_BLK_DEV_INTEGRITY)
|
||||
rq->nr_integrity_segments = 0;
|
||||
|
@ -356,8 +360,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
|
|||
struct request *rq;
|
||||
unsigned int tag;
|
||||
bool clear_ctx_on_error = false;
|
||||
u64 alloc_time_ns = 0;
|
||||
|
||||
blk_queue_enter_live(q);
|
||||
|
||||
/* alloc_time includes depth and tag waits */
|
||||
if (blk_queue_rq_alloc_time(q))
|
||||
alloc_time_ns = ktime_get_ns();
|
||||
|
||||
data->q = q;
|
||||
if (likely(!data->ctx)) {
|
||||
data->ctx = blk_mq_get_ctx(q);
|
||||
|
@ -393,7 +403,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
|
||||
rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
|
||||
if (!op_is_flush(data->cmd_flags)) {
|
||||
rq->elv.icq = NULL;
|
||||
if (e && e->type->ops.prepare_request) {
|
||||
|
@ -652,19 +662,18 @@ bool blk_mq_complete_request(struct request *rq)
|
|||
}
|
||||
EXPORT_SYMBOL(blk_mq_complete_request);
|
||||
|
||||
void blk_mq_complete_request_sync(struct request *rq)
|
||||
{
|
||||
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
|
||||
rq->q->mq_ops->complete(rq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
|
||||
|
||||
int blk_mq_request_started(struct request *rq)
|
||||
{
|
||||
return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_request_started);
|
||||
|
||||
int blk_mq_request_completed(struct request *rq)
|
||||
{
|
||||
return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_request_completed);
|
||||
|
||||
void blk_mq_start_request(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
@ -673,9 +682,7 @@ void blk_mq_start_request(struct request *rq)
|
|||
|
||||
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
|
||||
rq->io_start_time_ns = ktime_get_ns();
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
rq->throtl_size = blk_rq_sectors(rq);
|
||||
#endif
|
||||
rq->stats_sectors = blk_rq_sectors(rq);
|
||||
rq->rq_flags |= RQF_STATS;
|
||||
rq_qos_issue(q, rq);
|
||||
}
|
||||
|
@ -2453,11 +2460,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
|||
struct blk_mq_ctx *ctx;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
|
||||
	/*
	 * Avoid others reading incomplete hctx->cpumask through sysfs
	 */
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
cpumask_clear(hctx->cpumask);
|
||||
hctx->nr_ctx = 0;
|
||||
|
@ -2518,8 +2520,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
|||
HCTX_TYPE_DEFAULT, i);
|
||||
}
|
||||
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
/*
|
||||
* If no software queues are mapped to this hardware queue,
|
||||
|
@ -2688,7 +2688,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
|
|||
if (!uninit_q)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
q = blk_mq_init_allocated_queue(set, uninit_q);
|
||||
/*
|
||||
* Initialize the queue without an elevator. device_add_disk() will do
|
||||
* the initialization.
|
||||
*/
|
||||
q = blk_mq_init_allocated_queue(set, uninit_q, false);
|
||||
if (IS_ERR(q))
|
||||
blk_cleanup_queue(uninit_q);
|
||||
|
||||
|
@ -2839,7 +2843,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
|
|||
}
|
||||
|
||||
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q)
|
||||
struct request_queue *q,
|
||||
bool elevator_init)
|
||||
{
|
||||
/* mark the queue as mq asap */
|
||||
q->mq_ops = set->ops;
|
||||
|
@ -2901,18 +2906,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
|||
blk_mq_add_queue_tag_set(set, q);
|
||||
blk_mq_map_swqueue(q);
|
||||
|
||||
if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
|
||||
int ret;
|
||||
|
||||
ret = elevator_init_mq(q);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
if (elevator_init)
|
||||
elevator_init_mq(q);
|
||||
|
||||
return q;
|
||||
|
||||
err_hctxs:
|
||||
kfree(q->queue_hw_ctx);
|
||||
q->nr_hw_queues = 0;
|
||||
err_sys_init:
|
||||
blk_mq_sysfs_deinit(q);
|
||||
err_poll:
|
||||
|
|
|
@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
|
|||
*/
|
||||
void blk_set_runtime_active(struct request_queue *q)
|
||||
{
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
if (q->dev) {
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(blk_set_runtime_active);
|
||||
|
|
|
@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
|
|||
} while (rqos);
|
||||
}
|
||||
|
||||
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
|
||||
{
|
||||
do {
|
||||
if (rqos->ops->merge)
|
||||
rqos->ops->merge(rqos, rq, bio);
|
||||
rqos = rqos->next;
|
||||
} while (rqos);
|
||||
}
|
||||
|
||||
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
|
||||
{
|
||||
do {
|
||||
|
@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
|
|||
} while (rqos);
|
||||
}
|
||||
|
||||
void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
|
||||
{
|
||||
do {
|
||||
if (rqos->ops->queue_depth_changed)
|
||||
rqos->ops->queue_depth_changed(rqos);
|
||||
rqos = rqos->next;
|
||||
} while (rqos);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true, if we can't increase the depth further by scaling
|
||||
*/
@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr;
|
|||
|
||||
enum rq_qos_id {
|
||||
RQ_QOS_WBT,
|
||||
RQ_QOS_CGROUP,
|
||||
RQ_QOS_LATENCY,
|
||||
RQ_QOS_COST,
|
||||
};
|
||||
|
||||
struct rq_wait {
|
||||
|
@ -35,11 +36,13 @@ struct rq_qos {
|
|||
struct rq_qos_ops {
|
||||
void (*throttle)(struct rq_qos *, struct bio *);
|
||||
void (*track)(struct rq_qos *, struct request *, struct bio *);
|
||||
void (*merge)(struct rq_qos *, struct request *, struct bio *);
|
||||
void (*issue)(struct rq_qos *, struct request *);
|
||||
void (*requeue)(struct rq_qos *, struct request *);
|
||||
void (*done)(struct rq_qos *, struct request *);
|
||||
void (*done_bio)(struct rq_qos *, struct bio *);
|
||||
void (*cleanup)(struct rq_qos *, struct bio *);
|
||||
void (*queue_depth_changed)(struct rq_qos *);
|
||||
void (*exit)(struct rq_qos *);
|
||||
const struct blk_mq_debugfs_attr *debugfs_attrs;
|
||||
};
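To show where the two new callbacks plug in, a hedged sketch of a toy rq-qos policy; the my_qos_* names are hypothetical, and rq_qos_add() is the existing inline helper in blk-rq-qos.h that links a policy into q->rq_qos.

#include "blk-rq-qos.h"		/* block-layer internal header */

/* Invoked when a bio is merged into an already tracked request. */
static void my_qos_merge(struct rq_qos *rqos, struct request *rq,
			 struct bio *bio)
{
	/* charge the merged bytes against whatever budget was taken earlier */
}

/* Replaces the old push-style wbt_set_queue_depth(): the policy re-reads
 * blk_queue_depth(rqos->q) whenever blk_set_queue_depth() notifies it. */
static void my_qos_queue_depth_changed(struct rq_qos *rqos)
{
	/* recompute internal scaling limits from blk_queue_depth(rqos->q) */
}

static struct rq_qos_ops my_qos_ops = {
	.merge			= my_qos_merge,
	.queue_depth_changed	= my_qos_queue_depth_changed,
};

static void my_qos_attach(struct request_queue *q, struct rq_qos *rqos)
{
	rqos->q = q;
	rqos->ops = &my_qos_ops;
	rq_qos_add(q, rqos);
}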
|
||||
|
@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
|
|||
|
||||
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
|
||||
{
|
||||
return rq_qos_id(q, RQ_QOS_CGROUP);
|
||||
return rq_qos_id(q, RQ_QOS_LATENCY);
|
||||
}
|
||||
|
||||
static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
|
||||
|
@ -80,8 +83,10 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
|
|||
switch (id) {
|
||||
case RQ_QOS_WBT:
|
||||
return "wbt";
|
||||
case RQ_QOS_CGROUP:
|
||||
return "cgroup";
|
||||
case RQ_QOS_LATENCY:
|
||||
return "latency";
|
||||
case RQ_QOS_COST:
|
||||
return "cost";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
@ -135,7 +140,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
|
|||
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
|
||||
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
|
||||
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
|
||||
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
|
||||
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
|
||||
void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
|
||||
|
||||
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
|
@ -185,6 +192,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
|
|||
__rq_qos_track(q->rq_qos, rq, bio);
|
||||
}
|
||||
|
||||
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
|
||||
struct bio *bio)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
__rq_qos_merge(q->rq_qos, rq, bio);
|
||||
}
|
||||
|
||||
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
__rq_qos_queue_depth_changed(q->rq_qos);
|
||||
}
|
||||
|
||||
void rq_qos_exit(struct request_queue *);
|
||||
|
||||
#endif
@ -805,7 +805,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
|
|||
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
q->queue_depth = depth;
|
||||
wbt_set_queue_depth(q, depth);
|
||||
rq_qos_queue_depth_changed(q);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_set_queue_depth);
|
||||
|
||||
|
@ -832,6 +832,22 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
|
||||
|
||||
/**
 * blk_queue_required_elevator_features - Set a queue required elevator features
 * @q:		the request queue for the target device
 * @features:	Required elevator features OR'ed together
 *
 * Tell the block layer that for the device controlled through @q, only
 * elevators that implement at least the set of features specified by
 * @features can be used.
 */
|
||||
void blk_queue_required_elevator_features(struct request_queue *q,
|
||||
unsigned int features)
|
||||
{
|
||||
q->required_elevator_features = features;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
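On the driver side, a hedged sketch of how a host-managed zoned device's queue setup might use this hook; my_zoned_dev_init_queue() is hypothetical, while ELEVATOR_F_ZBD_SEQ_WRITE is the feature flag mq-deadline advertises later in this series.

#include <linux/blkdev.h>
#include <linux/elevator.h>

/*
 * Declare that only elevators providing sequential-write zone locking may
 * be used on this queue, so default-elevator selection and sysfs elevator
 * switching skip schedulers that lack ELEVATOR_F_ZBD_SEQ_WRITE.
 */
static void my_zoned_dev_init_queue(struct request_queue *q)
{
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
}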
|
||||
|
||||
static int __init blk_settings_init(void)
|
||||
{
|
||||
blk_max_low_pfn = max_low_pfn - 1;
@ -941,14 +941,14 @@ int blk_register_queue(struct gendisk *disk)
|
|||
int ret;
|
||||
struct device *dev = disk_to_dev(disk);
|
||||
struct request_queue *q = disk->queue;
|
||||
bool has_elevator = false;
|
||||
|
||||
if (WARN_ON(!q))
|
||||
return -ENXIO;
|
||||
|
||||
WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
|
||||
WARN_ONCE(blk_queue_registered(q),
|
||||
"%s is registering an already registered queue\n",
|
||||
kobject_name(&dev->kobj));
|
||||
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
|
||||
|
||||
/*
|
||||
* SCSI probing may synchronously create and destroy a lot of
|
||||
|
@ -968,8 +968,7 @@ int blk_register_queue(struct gendisk *disk)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Prevent changes through sysfs until registration is completed. */
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
mutex_lock(&q->sysfs_dir_lock);
|
||||
|
||||
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
|
||||
if (ret < 0) {
|
||||
|
@ -990,26 +989,36 @@ int blk_register_queue(struct gendisk *disk)
|
|||
blk_mq_debugfs_register(q);
|
||||
}
|
||||
|
||||
kobject_uevent(&q->kobj, KOBJ_ADD);
|
||||
|
||||
wbt_enable_default(q);
|
||||
|
||||
blk_throtl_register_queue(q);
|
||||
|
||||
	/*
	 * QUEUE_FLAG_REGISTERED isn't set yet, so an elevator
	 * switch can't happen at all.
	 */
|
||||
if (q->elevator) {
|
||||
ret = elv_register_queue(q);
|
||||
ret = elv_register_queue(q, false);
|
||||
if (ret) {
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
kobject_uevent(&q->kobj, KOBJ_REMOVE);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(dev);
|
||||
kobject_put(&dev->kobj);
|
||||
return ret;
|
||||
}
|
||||
has_elevator = true;
|
||||
}
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
|
||||
wbt_enable_default(q);
|
||||
blk_throtl_register_queue(q);
|
||||
|
||||
	/* Now everything is ready; send out the KOBJ_ADD uevent */
|
||||
kobject_uevent(&q->kobj, KOBJ_ADD);
|
||||
if (has_elevator)
|
||||
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
ret = 0;
|
||||
unlock:
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_register_queue);
|
||||
|
@ -1029,7 +1038,7 @@ void blk_unregister_queue(struct gendisk *disk)
|
|||
return;
|
||||
|
||||
/* Return early if disk->queue was never registered. */
|
||||
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
|
||||
if (!blk_queue_registered(q))
|
||||
return;
|
||||
|
||||
/*
|
||||
|
@ -1038,25 +1047,28 @@ void blk_unregister_queue(struct gendisk *disk)
|
|||
* concurrent elv_iosched_store() calls.
|
||||
*/
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
|
||||
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
mutex_lock(&q->sysfs_dir_lock);
|
||||
/*
|
||||
* Remove the sysfs attributes before unregistering the queue data
|
||||
* structures that can be modified through sysfs.
|
||||
*/
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_unregister_dev(disk_to_dev(disk), q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
kobject_uevent(&q->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(disk_to_dev(disk));
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
/*
|
||||
* q->kobj has been removed, so it is safe to check if elevator
|
||||
* exists without holding q->sysfs_lock.
|
||||
*/
|
||||
if (q->elevator)
|
||||
elv_unregister_queue(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
kobject_put(&disk_to_dev(disk)->kobj);
|
||||
}
@ -478,12 +478,14 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
|
|||
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
|
||||
}
|
||||
|
||||
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
|
||||
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
|
||||
struct request_queue *q,
|
||||
struct blkcg *blkcg)
|
||||
{
|
||||
struct throtl_grp *tg;
|
||||
int rw;
|
||||
|
||||
tg = kzalloc_node(sizeof(*tg), gfp, node);
|
||||
tg = kzalloc_node(sizeof(*tg), gfp, q->node);
|
||||
if (!tg)
|
||||
return NULL;
|
||||
|
||||
|
@ -2246,7 +2248,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
|
|||
struct request_queue *q = rq->q;
|
||||
struct throtl_data *td = q->td;
|
||||
|
||||
throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
|
||||
throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
|
||||
time_ns >> 10);
|
||||
}
|
||||
|
||||
void blk_throtl_bio_endio(struct bio *bio)
@ -629,15 +629,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
|
|||
}
|
||||
}
|
||||
|
||||
void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (rqos) {
|
||||
RQWB(rqos)->rq_depth.queue_depth = depth;
|
||||
__wbt_update_limits(RQWB(rqos));
|
||||
}
|
||||
}
|
||||
|
||||
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
|
@ -656,7 +647,7 @@ void wbt_enable_default(struct request_queue *q)
|
|||
return;
|
||||
|
||||
/* Queue not registered? Maybe shutting down... */
|
||||
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
|
||||
if (!blk_queue_registered(q))
|
||||
return;
|
||||
|
||||
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
|
||||
|
@ -689,6 +680,12 @@ static int wbt_data_dir(const struct request *rq)
|
|||
return -1;
|
||||
}
|
||||
|
||||
static void wbt_queue_depth_changed(struct rq_qos *rqos)
|
||||
{
|
||||
RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
|
||||
__wbt_update_limits(RQWB(rqos));
|
||||
}
|
||||
|
||||
static void wbt_exit(struct rq_qos *rqos)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
|
@ -811,6 +808,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
|
|||
.requeue = wbt_requeue,
|
||||
.done = wbt_done,
|
||||
.cleanup = wbt_cleanup,
|
||||
.queue_depth_changed = wbt_queue_depth_changed,
|
||||
.exit = wbt_exit,
|
||||
#ifdef CONFIG_BLK_DEBUG_FS
|
||||
.debugfs_attrs = wbt_debugfs_attrs,
|
||||
|
@ -853,7 +851,7 @@ int wbt_init(struct request_queue *q)
|
|||
|
||||
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
|
||||
|
||||
wbt_set_queue_depth(q, blk_queue_depth(q));
|
||||
wbt_queue_depth_changed(&rwb->rqos);
|
||||
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
|
||||
return 0;
|
|
|||
u64 wbt_get_min_lat(struct request_queue *q);
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val);
|
||||
|
||||
void wbt_set_queue_depth(struct request_queue *, unsigned int);
|
||||
void wbt_set_write_cache(struct request_queue *, bool);
|
||||
|
||||
u64 wbt_default_latency_nsec(struct request_queue *);
|
||||
|
@ -118,9 +117,6 @@ static inline void wbt_disable_default(struct request_queue *q)
|
|||
static inline void wbt_enable_default(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
}
|
||||
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -202,6 +202,42 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(blkdev_report_zones);
|
||||
|
||||
/*
|
||||
* Special case of zone reset operation to reset all zones in one command,
|
||||
* useful for applications like mkfs.
|
||||
*/
|
||||
static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask)
|
||||
{
|
||||
struct bio *bio = bio_alloc(gfp_mask, 0);
|
||||
int ret;
|
||||
|
||||
	/* this request applies to all zones, so no sector range is needed */
|
||||
bio_set_dev(bio, bdev);
|
||||
bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0);
|
||||
|
||||
ret = submit_bio_wait(bio);
|
||||
bio_put(bio);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
|
||||
sector_t nr_sectors)
|
||||
{
|
||||
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
|
||||
return false;
|
||||
|
||||
if (nr_sectors != part_nr_sects_read(bdev->bd_part))
|
||||
return false;
|
||||
	/*
	 * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is
	 * the entire disk, that is, if the block device's start offset is 0
	 * and its capacity is the same as the entire disk.
	 */
|
||||
return get_start_sect(bdev) == 0 &&
|
||||
part_nr_sects_read(bdev->bd_part) == get_capacity(bdev->bd_disk);
|
||||
}
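To make the new fast path concrete, a hedged userspace sketch (not part of this series) that resets every zone on a whole-disk node; because the range covers the entire device, blkdev_allow_reset_all_zones() lets blkdev_reset_zones() issue a single REQ_OP_ZONE_RESET_ALL bio instead of one reset per zone.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	unsigned long long bytes;
	struct blk_zone_range range;
	int fd;

	if (argc < 2)
		return 1;

	fd = open(argv[1], O_RDWR);	/* e.g. a zoned null_blk or SMR disk */
	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &bytes) < 0)
		return 1;

	range.sector = 0;
	range.nr_sectors = bytes >> 9;	/* whole device => reset-all eligible */

	if (ioctl(fd, BLKRESETZONE, &range) < 0)
		perror("BLKRESETZONE");

	close(fd);
	return 0;
}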
|
||||
|
||||
/**
|
||||
* blkdev_reset_zones - Reset zones write pointer
|
||||
* @bdev: Target block device
|
||||
|
@ -235,6 +271,9 @@ int blkdev_reset_zones(struct block_device *bdev,
|
|||
/* Out of range */
|
||||
return -EINVAL;
|
||||
|
||||
if (blkdev_allow_reset_all_zones(bdev, nr_sectors))
|
||||
return __blkdev_reset_all_zones(bdev, gfp_mask);
|
||||
|
||||
/* Check alignment (handle eventual smaller last zone) */
|
||||
zone_sectors = blk_queue_zone_sectors(q);
|
||||
if (sector & (zone_sectors - 1))
|
||||
|
|
|
@ -184,11 +184,11 @@ void blk_account_io_done(struct request *req, u64 now);
|
|||
|
||||
void blk_insert_flush(struct request *rq);
|
||||
|
||||
int elevator_init_mq(struct request_queue *q);
|
||||
void elevator_init_mq(struct request_queue *q);
|
||||
int elevator_switch_mq(struct request_queue *q,
|
||||
struct elevator_type *new_e);
|
||||
void __elevator_exit(struct request_queue *, struct elevator_queue *);
|
||||
int elv_register_queue(struct request_queue *q);
|
||||
int elv_register_queue(struct request_queue *q, bool uevent);
|
||||
void elv_unregister_queue(struct request_queue *q);
|
||||
|
||||
static inline void elevator_exit(struct request_queue *q,
|
||||
block/elevator.c
@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
|
|||
}
|
||||
EXPORT_SYMBOL(elv_bio_merge_ok);
|
||||
|
||||
static bool elevator_match(const struct elevator_type *e, const char *name)
|
||||
static inline bool elv_support_features(unsigned int elv_features,
|
||||
unsigned int required_features)
|
||||
{
|
||||
return (required_features & elv_features) == required_features;
|
||||
}
|
||||
|
||||
/**
 * elevator_match - Test an elevator name and features
 * @e: Scheduler to test
 * @name: Elevator name to test
 * @required_features: Features that the elevator must provide
 *
 * Return true if the elevator @e name matches @name and if @e provides all
 * the features specified by @required_features.
 */
|
||||
static bool elevator_match(const struct elevator_type *e, const char *name,
|
||||
unsigned int required_features)
|
||||
{
|
||||
if (!elv_support_features(e->elevator_features, required_features))
|
||||
return false;
|
||||
if (!strcmp(e->elevator_name, name))
|
||||
return true;
|
||||
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
|
||||
|
@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
 * Return scheduler with name 'name'
/**
 * elevator_find - Find an elevator
 * @name: Name of the elevator to find
 * @required_features: Features that the elevator must provide
 *
 * Return the first registered scheduler with name @name that supports the
 * features in @required_features, or NULL if none is found.
 */
|
||||
static struct elevator_type *elevator_find(const char *name)
|
||||
static struct elevator_type *elevator_find(const char *name,
|
||||
unsigned int required_features)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
|
||||
list_for_each_entry(e, &elv_list, list) {
|
||||
if (elevator_match(e, name))
|
||||
if (elevator_match(e, name, required_features))
|
||||
return e;
|
||||
}
|
||||
|
||||
|
@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
|
|||
|
||||
spin_lock(&elv_list_lock);
|
||||
|
||||
e = elevator_find(name);
|
||||
e = elevator_find(name, q->required_elevator_features);
|
||||
if (!e && try_loading) {
|
||||
spin_unlock(&elv_list_lock);
|
||||
request_module("%s-iosched", name);
|
||||
spin_lock(&elv_list_lock);
|
||||
e = elevator_find(name);
|
||||
e = elevator_find(name, q->required_elevator_features);
|
||||
}
|
||||
|
||||
if (e && !try_module_get(e->elevator_owner))
|
||||
|
@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q,
|
|||
return e;
|
||||
}
|
||||
|
||||
static char chosen_elevator[ELV_NAME_MAX];
|
||||
|
||||
static int __init elevator_setup(char *str)
|
||||
{
|
||||
/*
|
||||
* Be backwards-compatible with previous kernels, so users
|
||||
* won't get the wrong elevator.
|
||||
*/
|
||||
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
__setup("elevator=", elevator_setup);
|
||||
|
||||
static struct kobj_type elv_ktype;
|
||||
|
||||
struct elevator_queue *elevator_alloc(struct request_queue *q,
|
||||
|
@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = {
|
|||
.release = elevator_release,
|
||||
};
|
||||
|
||||
int elv_register_queue(struct request_queue *q)
|
||||
/*
 * elv_register_queue is called from either blk_register_queue or
 * elevator_switch, and elevator switch is prevented from happening in
 * either path, so it is safe to not hold q->sysfs_lock here.
 */
|
||||
int elv_register_queue(struct request_queue *q, bool uevent)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
int error;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
|
||||
if (!error) {
|
||||
struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
|
@ -487,24 +500,34 @@ int elv_register_queue(struct request_queue *q)
|
|||
attr++;
|
||||
}
|
||||
}
|
||||
kobject_uevent(&e->kobj, KOBJ_ADD);
|
||||
if (uevent)
|
||||
kobject_uevent(&e->kobj, KOBJ_ADD);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
e->registered = 1;
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * elv_unregister_queue is called from either blk_unregister_queue or
 * elevator_switch, and elevator switch is prevented from happening in
 * either path, so it is safe to not hold q->sysfs_lock here.
 */
|
||||
void elv_unregister_queue(struct request_queue *q)
|
||||
{
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (q) {
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
kobject_uevent(&e->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&e->kobj);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
e->registered = 0;
|
||||
/* Re-enable throttling in case elevator disabled it */
|
||||
wbt_enable_default(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -526,7 +549,7 @@ int elv_register(struct elevator_type *e)
|
|||
|
||||
/* register, don't allow duplicate names */
|
||||
spin_lock(&elv_list_lock);
|
||||
if (elevator_find(e->elevator_name)) {
|
||||
if (elevator_find(e->elevator_name, 0)) {
|
||||
spin_unlock(&elv_list_lock);
|
||||
kmem_cache_destroy(e->icq_cache);
|
||||
return -EBUSY;
|
||||
|
@ -567,10 +590,32 @@ int elevator_switch_mq(struct request_queue *q,
|
|||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (q->elevator) {
|
||||
if (q->elevator->registered)
|
||||
if (q->elevator->registered) {
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
			/*
			 * Concurrent elevator switch can't happen because
			 * sysfs writes are always exclusive on the same file.
			 *
			 * Also the elevator queue won't be freed after
			 * sysfs_lock is released because kobject_del() in
			 * blk_unregister_queue() waits for completion of
			 * .store & .show on its attributes.
			 */
|
||||
elv_unregister_queue(q);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
}
|
||||
ioc_clear_queue(q);
|
||||
elevator_exit(q, q->elevator);
|
||||
|
||||
		/*
		 * sysfs_lock may have been dropped, so re-check whether the
		 * queue is unregistered. If it is, don't switch to the new
		 * elevator any more.
		 */
|
||||
if (!blk_queue_registered(q))
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
|
@ -578,7 +623,11 @@ int elevator_switch_mq(struct request_queue *q,
|
|||
goto out;
|
||||
|
||||
if (new_e) {
|
||||
ret = elv_register_queue(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
ret = elv_register_queue(q, true);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
if (ret) {
|
||||
elevator_exit(q, q->elevator);
|
||||
goto out;
|
||||
|
@ -594,37 +643,89 @@ int elevator_switch_mq(struct request_queue *q,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static inline bool elv_support_iosched(struct request_queue *q)
|
||||
{
|
||||
if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* For blk-mq devices, we default to using mq-deadline, if available, for single
|
||||
* queue devices. If deadline isn't available OR we have multiple queues,
|
||||
* default to "none".
|
||||
* For single queue devices, default to using mq-deadline. If we have multiple
|
||||
* queues or mq-deadline is not available, default to "none".
|
||||
*/
|
||||
int elevator_init_mq(struct request_queue *q)
|
||||
static struct elevator_type *elevator_get_default(struct request_queue *q)
|
||||
{
|
||||
if (q->nr_hw_queues != 1)
|
||||
return NULL;
|
||||
|
||||
return elevator_get(q, "mq-deadline", false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the first elevator providing the features required by the request queue.
|
||||
* Default to "none" if no matching elevator is found.
|
||||
*/
|
||||
static struct elevator_type *elevator_get_by_features(struct request_queue *q)
|
||||
{
|
||||
struct elevator_type *e, *found = NULL;
|
||||
|
||||
spin_lock(&elv_list_lock);
|
||||
|
||||
list_for_each_entry(e, &elv_list, list) {
|
||||
if (elv_support_features(e->elevator_features,
|
||||
q->required_elevator_features)) {
|
||||
found = e;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found && !try_module_get(found->elevator_owner))
|
||||
found = NULL;
|
||||
|
||||
spin_unlock(&elv_list_lock);
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
 * For a device queue that has no required features, use the default elevator
 * settings. Otherwise, use the first elevator available matching the required
 * features. If no suitable elevator is found or if the chosen elevator
 * initialization fails, fall back to the "none" elevator (no elevator).
 */
|
||||
void elevator_init_mq(struct request_queue *q)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
int err = 0;
|
||||
int err;
|
||||
|
||||
if (q->nr_hw_queues != 1)
|
||||
return 0;
|
||||
if (!elv_support_iosched(q))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
|
||||
|
||||
/*
|
||||
* q->sysfs_lock must be held to provide mutual exclusion between
|
||||
* elevator_switch() and here.
|
||||
*/
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
if (unlikely(q->elevator))
|
||||
goto out_unlock;
|
||||
return;
|
||||
|
||||
e = elevator_get(q, "mq-deadline", false);
|
||||
if (!q->required_elevator_features)
|
||||
e = elevator_get_default(q);
|
||||
else
|
||||
e = elevator_get_by_features(q);
|
||||
if (!e)
|
||||
goto out_unlock;
|
||||
return;
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
err = blk_mq_init_sched(q, e);
|
||||
if (err)
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
if (err) {
|
||||
pr_warn("\"%s\" elevator initialization failed, "
|
||||
"falling back to \"none\"\n", e->elevator_name);
|
||||
elevator_put(e);
|
||||
out_unlock:
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@ -660,7 +761,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
|
|||
struct elevator_type *e;
|
||||
|
||||
/* Make sure queue is not in the middle of being removed */
|
||||
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
|
||||
if (!blk_queue_registered(q))
|
||||
return -ENOENT;
|
||||
|
||||
/*
|
||||
|
@ -677,7 +778,8 @@ static int __elevator_change(struct request_queue *q, const char *name)
|
|||
if (!e)
|
||||
return -EINVAL;
|
||||
|
||||
if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
|
||||
if (q->elevator &&
|
||||
elevator_match(q->elevator->type, elevator_name, 0)) {
|
||||
elevator_put(e);
|
||||
return 0;
|
||||
}
|
||||
|
@ -685,13 +787,6 @@ static int __elevator_change(struct request_queue *q, const char *name)
|
|||
return elevator_switch(q, e);
|
||||
}
|
||||
|
||||
static inline bool elv_support_iosched(struct request_queue *q)
|
||||
{
|
||||
if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
|
||||
size_t count)
|
||||
{
|
||||
|
@ -724,11 +819,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
|
|||
|
||||
spin_lock(&elv_list_lock);
|
||||
list_for_each_entry(__e, &elv_list, list) {
|
||||
if (elv && elevator_match(elv, __e->elevator_name)) {
|
||||
if (elv && elevator_match(elv, __e->elevator_name, 0)) {
|
||||
len += sprintf(name+len, "[%s] ", elv->elevator_name);
|
||||
continue;
|
||||
}
|
||||
if (elv_support_iosched(q))
|
||||
if (elv_support_iosched(q) &&
|
||||
elevator_match(__e, __e->elevator_name,
|
||||
q->required_elevator_features))
|
||||
len += sprintf(name+len, "%s ", __e->elevator_name);
|
||||
}
|
||||
spin_unlock(&elv_list_lock);
|
|
|||
dev_t devt;
|
||||
int retval;
|
||||
|
||||
/*
|
||||
* The disk queue should now be all set with enough information about
|
||||
* the device for the elevator code to pick an adequate default
|
||||
* elevator if one is needed, that is, for devices requesting queue
|
||||
* registration.
|
||||
*/
|
||||
if (register_queue)
|
||||
elevator_init_mq(disk->queue);
|
||||
|
||||
/* minors == 0 indicates to use ext devt from part0 and should
|
||||
* be accompanied with EXT_DEVT flag. Make sure all
|
||||
* parameters make sense.
|
||||
|
|
|
@ -377,13 +377,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
|
|||
* hardware queue, but we may return a request that is for a
|
||||
* different hardware queue. This is because mq-deadline has shared
|
||||
* state for all hardware queues, in terms of sorting, FIFOs, etc.
|
||||
*
|
||||
* For a zoned block device, __dd_dispatch_request() may return NULL
|
||||
* if all the queued write requests are directed at zones that are already
|
||||
* locked due to on-going write requests. In this case, make sure to mark
|
||||
* the queue as needing a restart to ensure that the queue is run again
|
||||
* and the pending writes dispatched once the target zones for the ongoing
|
||||
* write requests are unlocked in dd_finish_request().
|
||||
*/
|
||||
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
|
@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
|
|||
|
||||
spin_lock(&dd->lock);
|
||||
rq = __dd_dispatch_request(dd);
|
||||
if (!rq && blk_queue_is_zoned(hctx->queue) &&
|
||||
!list_empty(&dd->fifo_list[WRITE]))
|
||||
blk_mq_sched_mark_restart_hctx(hctx);
|
||||
spin_unlock(&dd->lock);
|
||||
|
||||
return rq;
|
||||
|
@ -561,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio)
|
|||
* spinlock so that the zone is never unlocked while deadline_fifo_request()
|
||||
* or deadline_next_request() are executing. This function is called for
|
||||
* all requests, whether or not these requests complete successfully.
|
||||
*
|
||||
* For a zoned block device, __dd_dispatch_request() may have stopped
|
||||
* dispatching requests if all the queued requests are write requests directed
|
||||
* at zones that are already locked due to on-going write requests. To ensure
|
||||
* write request dispatch progress in this case, mark the queue as needing a
|
||||
* restart to ensure that the queue is run again after completion of the
|
||||
* request and zones being unlocked.
|
||||
*/
|
||||
static void dd_finish_request(struct request *rq)
|
||||
{
|
||||
|
@ -572,6 +569,8 @@ static void dd_finish_request(struct request *rq)
|
|||
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
blk_req_zone_write_unlock(rq);
|
||||
if (!list_empty(&dd->fifo_list[WRITE]))
|
||||
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
}
|
||||
}
|
||||
|
@ -795,6 +794,7 @@ static struct elevator_type mq_deadline = {
|
|||
.elevator_attrs = deadline_attrs,
|
||||
.elevator_name = "mq-deadline",
|
||||
.elevator_alias = "deadline",
|
||||
.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
|
||||
.elevator_owner = THIS_MODULE,
|
||||
};
|
||||
MODULE_ALIAS("mq-deadline-iosched");
@ -119,8 +119,6 @@ enum opal_uid {
|
|||
OPAL_UID_HEXFF,
|
||||
};
|
||||
|
||||
#define OPAL_METHOD_LENGTH 8
|
||||
|
||||
/* Enum for indexing the OPALMETHOD array */
|
||||
enum opal_method {
|
||||
OPAL_PROPERTIES,
|
||||
|
@ -167,7 +165,6 @@ enum opal_token {
|
|||
OPAL_TABLE_LASTID = 0x0A,
|
||||
OPAL_TABLE_MIN = 0x0B,
|
||||
OPAL_TABLE_MAX = 0x0C,
|
||||
|
||||
/* authority table */
|
||||
OPAL_PIN = 0x03,
|
||||
/* locking tokens */
|
||||
|
@ -182,7 +179,7 @@ enum opal_token {
|
|||
OPAL_LIFECYCLE = 0x06,
|
||||
/* locking info table */
|
||||
OPAL_MAXRANGES = 0x04,
|
||||
/* mbr control */
|
||||
/* mbr control */
|
||||
OPAL_MBRENABLE = 0x01,
|
||||
OPAL_MBRDONE = 0x02,
|
||||
/* properties */
|
||||
|
|
|
@ -129,7 +129,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
|
|||
{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
|
||||
|
||||
/* tables */
|
||||
|
||||
[OPAL_TABLE_TABLE]
|
||||
{ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
|
||||
[OPAL_LOCKINGRANGE_GLOBAL] =
|
||||
|
@ -152,7 +151,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
|
|||
{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
|
||||
|
||||
/* C_PIN_TABLE object ID's */
|
||||
|
||||
[OPAL_C_PIN_MSID] =
|
||||
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
|
||||
[OPAL_C_PIN_SID] =
|
||||
|
@ -161,7 +159,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
|
|||
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
|
||||
|
||||
/* half UID's (only first 4 bytes used) */
|
||||
|
||||
[OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
|
||||
{ 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
|
||||
[OPAL_HALF_UID_BOOLEAN_ACE] =
|
||||
|
@ -517,6 +514,7 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
|
|||
ret = opal_recv_cmd(dev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return opal_discovery0_end(dev);
|
||||
}
|
||||
|
||||
|
@ -525,6 +523,7 @@ static int opal_discovery0_step(struct opal_dev *dev)
|
|||
const struct opal_step discovery0_step = {
|
||||
opal_discovery0,
|
||||
};
|
||||
|
||||
return execute_step(dev, &discovery0_step, 0);
|
||||
}
|
||||
|
||||
|
@ -551,6 +550,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
|
|||
{
|
||||
if (!can_add(err, cmd, 1))
|
||||
return;
|
||||
|
||||
cmd->cmd[cmd->pos++] = tok;
|
||||
}
|
||||
|
||||
|
@ -577,6 +577,7 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
|
|||
header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
|
||||
header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
|
||||
header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
|
||||
|
||||
cmd->cmd[cmd->pos++] = header0;
|
||||
cmd->cmd[cmd->pos++] = len;
|
||||
}
|
||||
|
@ -649,6 +650,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
|
|||
|
||||
if (lr == 0)
|
||||
return 0;
|
||||
|
||||
buffer[5] = LOCKING_RANGE_NON_GLOBAL;
|
||||
buffer[7] = lr;
|
||||
|
||||
|
@ -903,10 +905,6 @@ static int response_parse(const u8 *buf, size_t length,
|
|||
num_entries++;
|
||||
}
|
||||
|
||||
if (num_entries == 0) {
|
||||
pr_debug("Couldn't parse response.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
resp->num = num_entries;
|
||||
|
||||
return 0;
|
||||
|
@ -945,6 +943,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
|
|||
}
|
||||
|
||||
*store = tok->pos + skip;
|
||||
|
||||
return tok->len - skip;
|
||||
}
|
||||
|
||||
|
@ -1062,6 +1061,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
|
|||
|
||||
dev->hsn = hsn;
|
||||
dev->tsn = tsn;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1084,6 +1084,7 @@ static int end_session_cont(struct opal_dev *dev)
|
|||
{
|
||||
dev->hsn = 0;
|
||||
dev->tsn = 0;
|
||||
|
||||
return parse_and_check_status(dev);
|
||||
}
|
||||
|
||||
|
@ -1172,6 +1173,7 @@ static int gen_key(struct opal_dev *dev, void *data)
|
|||
return err;
|
||||
|
||||
}
|
||||
|
||||
return finalize_and_send(dev, parse_and_check_status);
|
||||
}
|
||||
|
||||
|
@ -1184,12 +1186,14 @@ static int get_active_key_cont(struct opal_dev *dev)
|
|||
error = parse_and_check_status(dev);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
keylen = response_get_string(&dev->parsed, 4, &activekey);
|
||||
if (!activekey) {
|
||||
pr_debug("%s: Couldn't extract the Activekey from the response\n",
|
||||
__func__);
|
||||
return OPAL_INVAL_PARAM;
|
||||
}
|
||||
|
||||
dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
|
||||
|
||||
if (!dev->prev_data)
|
||||
|
@ -1251,6 +1255,7 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
|
|||
|
||||
add_token_u8(&err, dev, OPAL_ENDLIST);
|
||||
add_token_u8(&err, dev, OPAL_ENDNAME);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -1263,6 +1268,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
|
|||
0, 0);
|
||||
if (err)
|
||||
pr_debug("Failed to create enable global lr command\n");
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -1313,7 +1319,6 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
|
|||
if (err) {
|
||||
pr_debug("Error building Setup Locking range command.\n");
|
||||
return err;
|
||||
|
||||
}
|
||||
|
||||
return finalize_and_send(dev, parse_and_check_status);
|
||||
|
@ -1393,6 +1398,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
|
|||
kfree(key);
|
||||
dev->prev_data = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -1518,6 +1524,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
|
|||
pr_debug("Error building Erase Locking Range Command.\n");
|
||||
return err;
|
||||
}
|
||||
|
||||
return finalize_and_send(dev, parse_and_check_status);
|
||||
}
|
||||
|
||||
|
@ -1636,6 +1643,7 @@ static int write_shadow_mbr(struct opal_dev *dev, void *data)
|
|||
|
||||
off += len;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -1816,6 +1824,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
|
|||
pr_debug("Error building SET command.\n");
|
||||
return err;
|
||||
}
|
||||
|
||||
return finalize_and_send(dev, parse_and_check_status);
|
||||
}
|
||||
|
||||
|
@ -1857,6 +1866,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
|
|||
pr_debug("Error building SET command.\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
return finalize_and_send(dev, parse_and_check_status);
|
||||
}
|
||||
|
||||
|
@ -1957,6 +1967,7 @@ static int end_opal_session(struct opal_dev *dev, void *data)
|
|||
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
return finalize_and_send(dev, end_session_cont);
|
||||
}
|
||||
|
||||
|
@ -1965,6 +1976,7 @@ static int end_opal_session_error(struct opal_dev *dev)
|
|||
const struct opal_step error_end_session = {
|
||||
end_opal_session,
|
||||
};
|
||||
|
||||
return execute_step(dev, &error_end_session, 0);
|
||||
}
|
||||
|
||||
|
@ -1984,6 +1996,7 @@ static int check_opal_support(struct opal_dev *dev)
|
|||
ret = opal_discovery0_step(dev);
|
||||
dev->supported = !ret;
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2004,6 +2017,7 @@ void free_opal_dev(struct opal_dev *dev)
|
|||
{
|
||||
if (!dev)
|
||||
return;
|
||||
|
||||
clean_opal_dev(dev);
|
||||
kfree(dev);
|
||||
}
|
||||
|
@ -2026,6 +2040,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
|
|||
kfree(dev);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return dev;
|
||||
}
|
||||
EXPORT_SYMBOL(init_opal_dev);
|
||||
|
@ -2045,6 +2060,7 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2062,6 +2078,7 @@ static int opal_erase_locking_range(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2089,6 +2106,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2113,6 +2131,7 @@ static int opal_set_mbr_done(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2133,6 +2152,7 @@ static int opal_write_shadow_mbr(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2151,6 +2171,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
|
|||
setup_opal_dev(dev);
|
||||
add_suspend_info(dev, suspend);
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2169,12 +2190,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
|
|||
pr_debug("Locking state was not RO or RW\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (lk_unlk->session.who < OPAL_USER1 ||
|
||||
lk_unlk->session.who > OPAL_USER9) {
|
||||
pr_debug("Authority was not within the range of users: %d\n",
|
||||
lk_unlk->session.who);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (lk_unlk->session.sum) {
|
||||
pr_debug("%s not supported in sum. Use setup locking range\n",
|
||||
__func__);
|
||||
|
@ -2185,6 +2208,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2267,6 +2291,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
|
|||
mutex_lock(&dev->dev_lock);
|
||||
ret = __opal_lock_unlock(dev, lk_unlk);
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2289,6 +2314,7 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2310,6 +2336,7 @@ static int opal_activate_lsp(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2327,6 +2354,7 @@ static int opal_setup_locking_range(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2347,6 +2375,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2371,6 +2400,7 @@ static int opal_activate_user(struct opal_dev *dev,
|
|||
setup_opal_dev(dev);
|
||||
ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps));
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2382,6 +2412,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
|
|||
|
||||
if (!dev)
|
||||
return false;
|
||||
|
||||
if (!dev->supported)
|
||||
return false;
|
||||
|
||||
|
@ -2399,6 +2430,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
|
|||
suspend->unlk.session.sum);
|
||||
was_failure = true;
|
||||
}
|
||||
|
||||
if (dev->mbr_enabled) {
|
||||
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
|
||||
if (ret)
|
||||
|
@ -2406,6 +2438,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
|
|||
}
|
||||
}
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
return was_failure;
|
||||
}
|
||||
EXPORT_SYMBOL(opal_unlock_from_suspend);
|
||||
|
|
|
@ -3780,7 +3780,7 @@ static int compat_getdrvprm(int drive,
|
|||
v.native_format = UDP->native_format;
|
||||
mutex_unlock(&floppy_mutex);
|
||||
|
||||
if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_params)))
|
||||
if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_params)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
@ -3816,7 +3816,7 @@ static int compat_getdrvstat(int drive, bool poll,
|
|||
v.bufblocks = UDRS->bufblocks;
|
||||
mutex_unlock(&floppy_mutex);
|
||||
|
||||
if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_struct)))
|
||||
if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_struct)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
Eintr:
@ -1755,6 +1755,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
|
|||
case LOOP_SET_FD:
|
||||
case LOOP_CHANGE_FD:
|
||||
case LOOP_SET_BLOCK_SIZE:
|
||||
case LOOP_SET_DIRECT_IO:
|
||||
err = lo_ioctl(bdev, mode, cmd, arg);
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -108,6 +108,7 @@ struct nbd_device {
|
|||
struct nbd_config *config;
|
||||
struct mutex config_lock;
|
||||
struct gendisk *disk;
|
||||
struct workqueue_struct *recv_workq;
|
||||
|
||||
struct list_head list;
|
||||
struct task_struct *task_recv;
|
||||
|
@ -121,6 +122,7 @@ struct nbd_cmd {
|
|||
struct mutex lock;
|
||||
int index;
|
||||
int cookie;
|
||||
int retries;
|
||||
blk_status_t status;
|
||||
unsigned long flags;
|
||||
u32 cmd_cookie;
|
||||
|
@ -138,7 +140,6 @@ static struct dentry *nbd_dbg_dir;
|
|||
|
||||
static unsigned int nbds_max = 16;
|
||||
static int max_part = 16;
|
||||
static struct workqueue_struct *recv_workqueue;
|
||||
static int part_shift;
|
||||
|
||||
static int nbd_dev_dbg_init(struct nbd_device *nbd);
|
||||
|
@ -344,6 +345,22 @@ static void sock_shutdown(struct nbd_device *nbd)
|
|||
dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
|
||||
}
|
||||
|
||||
static u32 req_to_nbd_cmd_type(struct request *req)
|
||||
{
|
||||
switch (req_op(req)) {
|
||||
case REQ_OP_DISCARD:
|
||||
return NBD_CMD_TRIM;
|
||||
case REQ_OP_FLUSH:
|
||||
return NBD_CMD_FLUSH;
|
||||
case REQ_OP_WRITE:
|
||||
return NBD_CMD_WRITE;
|
||||
case REQ_OP_READ:
|
||||
return NBD_CMD_READ;
|
||||
default:
|
||||
return U32_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
|
||||
bool reserved)
|
||||
{
|
||||
|
@ -357,8 +374,10 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
|
|||
}
|
||||
config = nbd->config;
|
||||
|
||||
if (!mutex_trylock(&cmd->lock))
|
||||
if (!mutex_trylock(&cmd->lock)) {
|
||||
nbd_config_put(nbd);
|
||||
return BLK_EH_RESET_TIMER;
|
||||
}
|
||||
|
||||
if (config->num_connections > 1) {
|
||||
dev_err_ratelimited(nbd_to_dev(nbd),
|
||||
|
@ -389,10 +408,25 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
|
|||
nbd_config_put(nbd);
|
||||
return BLK_EH_DONE;
|
||||
}
|
||||
} else {
|
||||
dev_err_ratelimited(nbd_to_dev(nbd),
|
||||
"Connection timed out\n");
|
||||
}
|
||||
|
||||
if (!nbd->tag_set.timeout) {
|
||||
/*
|
||||
* Userspace sets timeout=0 to disable socket disconnection,
|
||||
* so just warn and reset the timer.
|
||||
*/
|
||||
cmd->retries++;
|
||||
dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
|
||||
req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
|
||||
(unsigned long long)blk_rq_pos(req) << 9,
|
||||
blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
|
||||
|
||||
mutex_unlock(&cmd->lock);
|
||||
nbd_config_put(nbd);
|
||||
return BLK_EH_RESET_TIMER;
|
||||
}
|
||||
|
||||
dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
|
||||
set_bit(NBD_TIMEDOUT, &config->runtime_flags);
|
||||
cmd->status = BLK_STS_IOERR;
|
||||
mutex_unlock(&cmd->lock);
|
||||
|
@ -480,22 +514,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
|
|||
|
||||
iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
|
||||
|
||||
switch (req_op(req)) {
|
||||
case REQ_OP_DISCARD:
|
||||
type = NBD_CMD_TRIM;
|
||||
break;
|
||||
case REQ_OP_FLUSH:
|
||||
type = NBD_CMD_FLUSH;
|
||||
break;
|
||||
case REQ_OP_WRITE:
|
||||
type = NBD_CMD_WRITE;
|
||||
break;
|
||||
case REQ_OP_READ:
|
||||
type = NBD_CMD_READ;
|
||||
break;
|
||||
default:
|
||||
type = req_to_nbd_cmd_type(req);
|
||||
if (type == U32_MAX)
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (rq_data_dir(req) == WRITE &&
|
||||
(config->flags & NBD_FLAG_READ_ONLY)) {
|
||||
|
@ -526,6 +547,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
|
|||
}
|
||||
cmd->index = index;
|
||||
cmd->cookie = nsock->cookie;
|
||||
cmd->retries = 0;
|
||||
request.type = htonl(type | nbd_cmd_flags);
|
||||
if (type != NBD_CMD_FLUSH) {
|
||||
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
|
||||
|
@ -1036,7 +1058,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
|
|||
/* We take the tx_mutex in an error path in the recv_work, so we
|
||||
* need to queue_work outside of the tx_mutex.
|
||||
*/
|
||||
queue_work(recv_workqueue, &args->work);
|
||||
queue_work(nbd->recv_workq, &args->work);
|
||||
|
||||
atomic_inc(&config->live_connections);
|
||||
wake_up(&config->conn_wait);
|
||||
|
@ -1137,6 +1159,10 @@ static void nbd_config_put(struct nbd_device *nbd)
|
|||
kfree(nbd->config);
|
||||
nbd->config = NULL;
|
||||
|
||||
if (nbd->recv_workq)
|
||||
destroy_workqueue(nbd->recv_workq);
|
||||
nbd->recv_workq = NULL;
|
||||
|
||||
nbd->tag_set.timeout = 0;
|
||||
nbd->disk->queue->limits.discard_granularity = 0;
|
||||
nbd->disk->queue->limits.discard_alignment = 0;
|
||||
|
@ -1165,6 +1191,14 @@ static int nbd_start_device(struct nbd_device *nbd)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
nbd->recv_workq = alloc_workqueue("knbd%d-recv",
|
||||
WQ_MEM_RECLAIM | WQ_HIGHPRI |
|
||||
WQ_UNBOUND, 0, nbd->index);
|
||||
if (!nbd->recv_workq) {
|
||||
dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
|
||||
nbd->task_recv = current;
|
||||
|
||||
|
@ -1195,7 +1229,7 @@ static int nbd_start_device(struct nbd_device *nbd)
|
|||
INIT_WORK(&args->work, recv_work);
|
||||
args->nbd = nbd;
|
||||
args->index = i;
|
||||
queue_work(recv_workqueue, &args->work);
|
||||
		queue_work(nbd->recv_workq, &args->work);
	}
	nbd_size_update(nbd);
	return error;

@@ -1215,8 +1249,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
	mutex_unlock(&nbd->config_lock);
	ret = wait_event_interruptible(config->recv_wq,
					 atomic_read(&config->recv_threads) == 0);
	if (ret)
	if (ret) {
		sock_shutdown(nbd);
		flush_workqueue(nbd->recv_workq);
	}
	mutex_lock(&nbd->config_lock);
	nbd_bdev_reset(bdev);
	/* user requested, ignore socket errors */

@@ -1246,6 +1282,13 @@ static bool nbd_is_valid_blksize(unsigned long blksize)
	return true;
}

static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
{
	nbd->tag_set.timeout = timeout * HZ;
	if (timeout)
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
}

/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		       unsigned int cmd, unsigned long arg)

@@ -1276,10 +1319,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		nbd_size_set(nbd, config->blksize, arg);
		return 0;
	case NBD_SET_TIMEOUT:
		if (arg) {
			nbd->tag_set.timeout = arg * HZ;
			blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
		}
		nbd_set_cmd_timeout(nbd, arg);
		return 0;

	case NBD_SET_FLAGS:

@@ -1799,11 +1839,9 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
	if (ret)
		goto out;

	if (info->attrs[NBD_ATTR_TIMEOUT]) {
		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
	if (info->attrs[NBD_ATTR_TIMEOUT])
		nbd_set_cmd_timeout(nbd,
				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
		config->dead_conn_timeout =
			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);

@@ -1875,6 +1913,12 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
	nbd_disconnect(nbd);
	nbd_clear_sock(nbd);
	mutex_unlock(&nbd->config_lock);
	/*
	 * Make sure recv thread has finished, so it does not drop the last
	 * config ref and try to destroy the workqueue from inside the work
	 * queue.
	 */
	flush_workqueue(nbd->recv_workq);
	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
			       &nbd->config->runtime_flags))
		nbd_config_put(nbd);

@@ -1971,11 +2015,9 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
	if (ret)
		goto out;

	if (info->attrs[NBD_ATTR_TIMEOUT]) {
		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
	if (info->attrs[NBD_ATTR_TIMEOUT])
		nbd_set_cmd_timeout(nbd,
				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
		config->dead_conn_timeout =
			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);

@@ -2261,20 +2303,12 @@ static int __init nbd_init(void)

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;
	recv_workqueue = alloc_workqueue("knbd-recv",
					 WQ_MEM_RECLAIM | WQ_HIGHPRI |
					 WQ_UNBOUND, 0);
	if (!recv_workqueue)
		return -ENOMEM;

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		destroy_workqueue(recv_workqueue);
	if (register_blkdev(NBD_MAJOR, "nbd"))
		return -EIO;
	}

	if (genl_register_family(&nbd_genl_family)) {
		unregister_blkdev(NBD_MAJOR, "nbd");
		destroy_workqueue(recv_workqueue);
		return -EINVAL;
	}
	nbd_dbg_init();

@@ -2316,7 +2350,6 @@ static void __exit nbd_cleanup(void)

	idr_destroy(&nbd_index_idr);
	genl_unregister_family(&nbd_genl_family);
	destroy_workqueue(recv_workqueue);
	unregister_blkdev(NBD_MAJOR, "nbd");
}
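
The nbd hunks above consolidate the duplicated timeout handling: both the legacy NBD_SET_TIMEOUT ioctl and the netlink NBD_ATTR_TIMEOUT attribute now go through nbd_set_cmd_timeout(), which always records the value in tag_set.timeout but only re-arms blk_queue_rq_timeout() for a non-zero timeout. A minimal user-space sketch of the legacy path, assuming /dev/nbd0 and a 30-second timeout purely for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nbd.h>

int main(void)
{
	/* device node and timeout value are placeholders */
	int fd = open("/dev/nbd0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/nbd0");
		return 1;
	}
	/* 30s per-command timeout; with this series a value of 0 still clears
	 * tag_set.timeout but no longer arms a zero queue timeout */
	if (ioctl(fd, NBD_SET_TIMEOUT, 30UL))
		perror("NBD_SET_TIMEOUT");
	close(fd);
	return 0;
}
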
@@ -2,6 +2,9 @@
#ifndef __BLK_NULL_BLK_H
#define __BLK_NULL_BLK_H

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>

@@ -90,13 +93,13 @@ int null_zone_init(struct nullb_device *dev);
void null_zone_exit(struct nullb_device *dev);
int null_zone_report(struct gendisk *disk, sector_t sector,
		     struct blk_zone *zones, unsigned int *nr_zones);
void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
		     unsigned int nr_sectors);
void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
blk_status_t null_handle_zoned(struct nullb_cmd *cmd,
			       enum req_opf op, sector_t sector,
			       sector_t nr_sectors);
#else
static inline int null_zone_init(struct nullb_device *dev)
{
	pr_err("null_blk: CONFIG_BLK_DEV_ZONED not enabled\n");
	pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
	return -EINVAL;
}
static inline void null_zone_exit(struct nullb_device *dev) {}

@@ -106,10 +109,11 @@ static inline int null_zone_report(struct gendisk *disk, sector_t sector,
{
	return -EOPNOTSUPP;
}
static inline void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				   unsigned int nr_sectors)
static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd,
					     enum req_opf op, sector_t sector,
					     sector_t nr_sectors)
{
	return BLK_STS_NOTSUPP;
}
static inline void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) {}
#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */
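
The pr_fmt define added at the top of this header is what lets the later null_blk hunks drop their hand-written "null_blk: " prefixes: every pr_err/pr_warn/pr_info in a file that includes it gets the module name prepended automatically. A small sketch of the convention, with "mydrv" standing in for a hypothetical module:

/* must come before any printk/pr_* header is pulled in */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/printk.h>

static int __init mydrv_init(void)
{
	pr_info("module loaded\n");	/* logs "mydrv: module loaded" */
	return 0;
}

static void __exit mydrv_exit(void)
{
	pr_info("module unloaded\n");
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
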
@ -141,8 +141,8 @@ static int g_bs = 512;
|
|||
module_param_named(bs, g_bs, int, 0444);
|
||||
MODULE_PARM_DESC(bs, "Block size (in bytes)");
|
||||
|
||||
static int nr_devices = 1;
|
||||
module_param(nr_devices, int, 0444);
|
||||
static unsigned int nr_devices = 1;
|
||||
module_param(nr_devices, uint, 0444);
|
||||
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
|
||||
|
||||
static bool g_blocking;
|
||||
|
@ -1133,93 +1133,61 @@ static void null_restart_queue_async(struct nullb *nullb)
|
|||
blk_mq_start_stopped_hw_queues(q, true);
|
||||
}
|
||||
|
||||
static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
|
||||
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
struct nullb *nullb = dev->nullb;
|
||||
int err = 0;
|
||||
blk_status_t sts = BLK_STS_OK;
|
||||
struct request *rq = cmd->rq;
|
||||
|
||||
if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
|
||||
struct request *rq = cmd->rq;
|
||||
if (!hrtimer_active(&nullb->bw_timer))
|
||||
hrtimer_restart(&nullb->bw_timer);
|
||||
|
||||
if (!hrtimer_active(&nullb->bw_timer))
|
||||
hrtimer_restart(&nullb->bw_timer);
|
||||
|
||||
if (atomic_long_sub_return(blk_rq_bytes(rq),
|
||||
&nullb->cur_bytes) < 0) {
|
||||
null_stop_queue(nullb);
|
||||
/* race with timer */
|
||||
if (atomic_long_read(&nullb->cur_bytes) > 0)
|
||||
null_restart_queue_async(nullb);
|
||||
/* requeue request */
|
||||
return BLK_STS_DEV_RESOURCE;
|
||||
}
|
||||
if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
|
||||
null_stop_queue(nullb);
|
||||
/* race with timer */
|
||||
if (atomic_long_read(&nullb->cur_bytes) > 0)
|
||||
null_restart_queue_async(nullb);
|
||||
/* requeue request */
|
||||
sts = BLK_STS_DEV_RESOURCE;
|
||||
}
|
||||
return sts;
|
||||
}
|
||||
|
||||
if (nullb->dev->badblocks.shift != -1) {
|
||||
int bad_sectors;
|
||||
sector_t sector, size, first_bad;
|
||||
bool is_flush = true;
|
||||
static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
|
||||
sector_t sector,
|
||||
sector_t nr_sectors)
|
||||
{
|
||||
struct badblocks *bb = &cmd->nq->dev->badblocks;
|
||||
sector_t first_bad;
|
||||
int bad_sectors;
|
||||
|
||||
if (dev->queue_mode == NULL_Q_BIO &&
|
||||
bio_op(cmd->bio) != REQ_OP_FLUSH) {
|
||||
is_flush = false;
|
||||
sector = cmd->bio->bi_iter.bi_sector;
|
||||
size = bio_sectors(cmd->bio);
|
||||
}
|
||||
if (dev->queue_mode != NULL_Q_BIO &&
|
||||
req_op(cmd->rq) != REQ_OP_FLUSH) {
|
||||
is_flush = false;
|
||||
sector = blk_rq_pos(cmd->rq);
|
||||
size = blk_rq_sectors(cmd->rq);
|
||||
}
|
||||
if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
|
||||
size, &first_bad, &bad_sectors)) {
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
if (dev->memory_backed) {
|
||||
if (dev->queue_mode == NULL_Q_BIO) {
|
||||
if (bio_op(cmd->bio) == REQ_OP_FLUSH)
|
||||
err = null_handle_flush(nullb);
|
||||
else
|
||||
err = null_handle_bio(cmd);
|
||||
} else {
|
||||
if (req_op(cmd->rq) == REQ_OP_FLUSH)
|
||||
err = null_handle_flush(nullb);
|
||||
else
|
||||
err = null_handle_rq(cmd);
|
||||
}
|
||||
}
|
||||
cmd->error = errno_to_blk_status(err);
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
if (!cmd->error && dev->zoned) {
|
||||
sector_t sector;
|
||||
unsigned int nr_sectors;
|
||||
enum req_opf op;
|
||||
static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
|
||||
enum req_opf op)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
int err;
|
||||
|
||||
if (dev->queue_mode == NULL_Q_BIO) {
|
||||
op = bio_op(cmd->bio);
|
||||
sector = cmd->bio->bi_iter.bi_sector;
|
||||
nr_sectors = cmd->bio->bi_iter.bi_size >> 9;
|
||||
} else {
|
||||
op = req_op(cmd->rq);
|
||||
sector = blk_rq_pos(cmd->rq);
|
||||
nr_sectors = blk_rq_sectors(cmd->rq);
|
||||
}
|
||||
if (dev->queue_mode == NULL_Q_BIO)
|
||||
err = null_handle_bio(cmd);
|
||||
else
|
||||
err = null_handle_rq(cmd);
|
||||
|
||||
if (op == REQ_OP_WRITE)
|
||||
null_zone_write(cmd, sector, nr_sectors);
|
||||
else if (op == REQ_OP_ZONE_RESET)
|
||||
null_zone_reset(cmd, sector);
|
||||
}
|
||||
out:
|
||||
return errno_to_blk_status(err);
|
||||
}
|
||||
|
||||
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
|
||||
{
|
||||
/* Complete IO by inline, softirq or timer */
|
||||
switch (dev->irqmode) {
|
||||
switch (cmd->nq->dev->irqmode) {
|
||||
case NULL_IRQ_SOFTIRQ:
|
||||
switch (dev->queue_mode) {
|
||||
switch (cmd->nq->dev->queue_mode) {
|
||||
case NULL_Q_MQ:
|
||||
blk_mq_complete_request(cmd->rq);
|
||||
break;
|
||||
|
@ -1238,6 +1206,40 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
|
|||
null_cmd_end_timer(cmd);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
|
||||
sector_t nr_sectors, enum req_opf op)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
struct nullb *nullb = dev->nullb;
|
||||
blk_status_t sts;
|
||||
|
||||
if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
|
||||
sts = null_handle_throttled(cmd);
|
||||
if (sts != BLK_STS_OK)
|
||||
return sts;
|
||||
}
|
||||
|
||||
if (op == REQ_OP_FLUSH) {
|
||||
cmd->error = errno_to_blk_status(null_handle_flush(nullb));
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (nullb->dev->badblocks.shift != -1) {
|
||||
cmd->error = null_handle_badblocks(cmd, sector, nr_sectors);
|
||||
if (cmd->error != BLK_STS_OK)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (dev->memory_backed)
|
||||
cmd->error = null_handle_memory_backed(cmd, op);
|
||||
|
||||
if (!cmd->error && dev->zoned)
|
||||
cmd->error = null_handle_zoned(cmd, op, sector, nr_sectors);
|
||||
|
||||
out:
|
||||
nullb_complete_cmd(cmd);
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
|
@ -1280,6 +1282,8 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
|
|||
|
||||
static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
sector_t sector = bio->bi_iter.bi_sector;
|
||||
sector_t nr_sectors = bio_sectors(bio);
|
||||
struct nullb *nullb = q->queuedata;
|
||||
struct nullb_queue *nq = nullb_to_queue(nullb);
|
||||
struct nullb_cmd *cmd;
|
||||
|
@ -1287,7 +1291,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
|
|||
cmd = alloc_cmd(nq, 1);
|
||||
cmd->bio = bio;
|
||||
|
||||
null_handle_cmd(cmd);
|
||||
null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
|
@ -1311,7 +1315,7 @@ static bool should_requeue_request(struct request *rq)
|
|||
|
||||
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
|
||||
{
|
||||
pr_info("null: rq %p timed out\n", rq);
|
||||
pr_info("rq %p timed out\n", rq);
|
||||
blk_mq_complete_request(rq);
|
||||
return BLK_EH_DONE;
|
||||
}
|
||||
|
@ -1321,6 +1325,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||
{
|
||||
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
|
||||
struct nullb_queue *nq = hctx->driver_data;
|
||||
sector_t nr_sectors = blk_rq_sectors(bd->rq);
|
||||
sector_t sector = blk_rq_pos(bd->rq);
|
||||
|
||||
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
|
||||
|
||||
|
@ -1349,7 +1355,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||
if (should_timeout_request(bd->rq))
|
||||
return BLK_STS_OK;
|
||||
|
||||
return null_handle_cmd(cmd);
|
||||
return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
|
||||
}
|
||||
|
||||
static const struct blk_mq_ops null_mq_ops = {
|
||||
|
@ -1688,6 +1694,9 @@ static int null_add_dev(struct nullb_device *dev)
|
|||
|
||||
blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
|
||||
nullb->q->limits.zoned = BLK_ZONED_HM;
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
|
||||
blk_queue_required_elevator_features(nullb->q,
|
||||
ELEVATOR_F_ZBD_SEQ_WRITE);
|
||||
}
|
||||
|
||||
nullb->q->queuedata = nullb;
|
||||
|
@ -1739,28 +1748,28 @@ static int __init null_init(void)
|
|||
struct nullb_device *dev;
|
||||
|
||||
if (g_bs > PAGE_SIZE) {
|
||||
pr_warn("null_blk: invalid block size\n");
|
||||
pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
|
||||
pr_warn("invalid block size\n");
|
||||
pr_warn("defaults block size to %lu\n", PAGE_SIZE);
|
||||
g_bs = PAGE_SIZE;
|
||||
}
|
||||
|
||||
if (!is_power_of_2(g_zone_size)) {
|
||||
pr_err("null_blk: zone_size must be power-of-two\n");
|
||||
pr_err("zone_size must be power-of-two\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
|
||||
pr_err("null_blk: invalid home_node value\n");
|
||||
pr_err("invalid home_node value\n");
|
||||
g_home_node = NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
if (g_queue_mode == NULL_Q_RQ) {
|
||||
pr_err("null_blk: legacy IO path no longer available\n");
|
||||
pr_err("legacy IO path no longer available\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
|
||||
if (g_submit_queues != nr_online_nodes) {
|
||||
pr_warn("null_blk: submit_queues param is set to %u.\n",
|
||||
pr_warn("submit_queues param is set to %u.\n",
|
||||
nr_online_nodes);
|
||||
g_submit_queues = nr_online_nodes;
|
||||
}
|
||||
|
@ -1803,7 +1812,7 @@ static int __init null_init(void)
|
|||
}
|
||||
}
|
||||
|
||||
pr_info("null: module loaded\n");
|
||||
pr_info("module loaded\n");
|
||||
return 0;
|
||||
|
||||
err_dev:
|
||||
|
|
|
@ -17,7 +17,7 @@ int null_zone_init(struct nullb_device *dev)
|
|||
unsigned int i;
|
||||
|
||||
if (!is_power_of_2(dev->zone_size)) {
|
||||
pr_err("null_blk: zone_size must be power-of-two\n");
|
||||
pr_err("zone_size must be power-of-two\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -31,7 +31,7 @@ int null_zone_init(struct nullb_device *dev)
|
|||
|
||||
if (dev->zone_nr_conv >= dev->nr_zones) {
|
||||
dev->zone_nr_conv = dev->nr_zones - 1;
|
||||
pr_info("null_blk: changed the number of conventional zones to %u",
|
||||
pr_info("changed the number of conventional zones to %u",
|
||||
dev->zone_nr_conv);
|
||||
}
|
||||
|
||||
|
@ -84,7 +84,7 @@ int null_zone_report(struct gendisk *disk, sector_t sector,
|
|||
return 0;
|
||||
}
|
||||
|
||||
void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
||||
static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
||||
unsigned int nr_sectors)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
|
@ -95,14 +95,12 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
|||
case BLK_ZONE_COND_FULL:
|
||||
/* Cannot write to a full zone */
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
return BLK_STS_IOERR;
|
||||
case BLK_ZONE_COND_EMPTY:
|
||||
case BLK_ZONE_COND_IMP_OPEN:
|
||||
/* Writes must be at the write pointer position */
|
||||
if (sector != zone->wp) {
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
}
|
||||
if (sector != zone->wp)
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
if (zone->cond == BLK_ZONE_COND_EMPTY)
|
||||
zone->cond = BLK_ZONE_COND_IMP_OPEN;
|
||||
|
@ -115,22 +113,51 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
|||
break;
|
||||
default:
|
||||
/* Invalid zone condition */
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
return BLK_STS_IOERR;
|
||||
}
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
void null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
|
||||
static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
unsigned int zno = null_zone_no(dev, sector);
|
||||
struct blk_zone *zone = &dev->zones[zno];
|
||||
size_t i;
|
||||
|
||||
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
return;
|
||||
switch (req_op(cmd->rq)) {
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
for (i = 0; i < dev->nr_zones; i++) {
|
||||
if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
|
||||
continue;
|
||||
zone[i].cond = BLK_ZONE_COND_EMPTY;
|
||||
zone[i].wp = zone[i].start;
|
||||
}
|
||||
break;
|
||||
case REQ_OP_ZONE_RESET:
|
||||
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
zone->wp = zone->start;
|
||||
break;
|
||||
default:
|
||||
cmd->error = BLK_STS_NOTSUPP;
|
||||
break;
|
||||
}
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op,
|
||||
sector_t sector, sector_t nr_sectors)
|
||||
{
|
||||
switch (op) {
|
||||
case REQ_OP_WRITE:
|
||||
return null_zone_write(cmd, sector, nr_sectors);
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
return null_zone_reset(cmd, sector);
|
||||
default:
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
zone->wp = zone->start;
|
||||
}
|
||||
|
|
|
@@ -314,8 +314,8 @@ static void pcd_init_units(void)
		disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops,
						   1, BLK_MQ_F_SHOULD_MERGE);
		if (IS_ERR(disk->queue)) {
			put_disk(disk);
			disk->queue = NULL;
			put_disk(disk);
			continue;
		}

@@ -723,9 +723,9 @@ static int pcd_detect(void)
	k = 0;
	if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
		cd = pcd;
		if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer,
			    PI_PCD, verbose, cd->name)) {
			if (!pcd_probe(cd, -1, id) && cd->disk) {
		if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1,
					pcd_buffer, PI_PCD, verbose, cd->name)) {
			if (!pcd_probe(cd, -1, id)) {
				cd->present = 1;
				k++;
			} else

@@ -736,11 +736,13 @@ static int pcd_detect(void)
			int *conf = *drives[unit];
			if (!conf[D_PRT])
				continue;
			if (!cd->disk)
				continue;
			if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD],
				     conf[D_UNI], conf[D_PRO], conf[D_DLY],
				     pcd_buffer, PI_PCD, verbose, cd->name))
				continue;
			if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) {
			if (!pcd_probe(cd, conf[D_SLV], id)) {
				cd->present = 1;
				k++;
			} else
@@ -300,8 +300,8 @@ static void __init pf_init_units(void)
		disk->queue = blk_mq_init_sq_queue(&pf->tag_set, &pf_mq_ops,
						   1, BLK_MQ_F_SHOULD_MERGE);
		if (IS_ERR(disk->queue)) {
			put_disk(disk);
			disk->queue = NULL;
			put_disk(disk);
			continue;
		}
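
Both paride hunks make the same kind of fix: the error path now clears disk->queue before calling put_disk(), so the final gendisk release does not try to drop a queue reference through the ERR_PTR left behind by blk_mq_init_sq_queue(), and the field is no longer written after the reference is gone; pcd_detect() additionally skips units whose gendisk was never allocated. A fragment restating the intended error-path ordering (identifiers taken from the hunks, shown only to make the ordering explicit):

	disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops, 1,
					   BLK_MQ_F_SHOULD_MERGE);
	if (IS_ERR(disk->queue)) {
		disk->queue = NULL;	/* never leave an ERR_PTR behind... */
		put_disk(disk);		/* ...and only then drop the gendisk */
		continue;
	}
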
@ -4,6 +4,8 @@
|
|||
* Initial release: Matias Bjorling <m@bjorling.me>
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "nvm: " fmt
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/sem.h>
|
||||
|
@ -74,7 +76,7 @@ static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
|
|||
|
||||
for (i = lun_begin; i <= lun_end; i++) {
|
||||
if (test_and_set_bit(i, dev->lun_map)) {
|
||||
pr_err("nvm: lun %d already allocated\n", i);
|
||||
pr_err("lun %d already allocated\n", i);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
@ -264,7 +266,7 @@ static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
|
|||
int lun_end)
|
||||
{
|
||||
if (lun_begin > lun_end || lun_end >= geo->all_luns) {
|
||||
pr_err("nvm: lun out of bound (%u:%u > %u)\n",
|
||||
pr_err("lun out of bound (%u:%u > %u)\n",
|
||||
lun_begin, lun_end, geo->all_luns - 1);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -297,7 +299,7 @@ static int __nvm_config_extended(struct nvm_dev *dev,
|
|||
if (e->op == 0xFFFF) {
|
||||
e->op = NVM_TARGET_DEFAULT_OP;
|
||||
} else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) {
|
||||
pr_err("nvm: invalid over provisioning value\n");
|
||||
pr_err("invalid over provisioning value\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -334,23 +336,23 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
|
|||
e = create->conf.e;
|
||||
break;
|
||||
default:
|
||||
pr_err("nvm: config type not valid\n");
|
||||
pr_err("config type not valid\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
tt = nvm_find_target_type(create->tgttype);
|
||||
if (!tt) {
|
||||
pr_err("nvm: target type %s not found\n", create->tgttype);
|
||||
pr_err("target type %s not found\n", create->tgttype);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) {
|
||||
pr_err("nvm: device is incompatible with target L2P type.\n");
|
||||
pr_err("device is incompatible with target L2P type.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (nvm_target_exists(create->tgtname)) {
|
||||
pr_err("nvm: target name already exists (%s)\n",
|
||||
pr_err("target name already exists (%s)\n",
|
||||
create->tgtname);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -367,7 +369,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
|
|||
|
||||
tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
|
||||
if (!tgt_dev) {
|
||||
pr_err("nvm: could not create target device\n");
|
||||
pr_err("could not create target device\n");
|
||||
ret = -ENOMEM;
|
||||
goto err_t;
|
||||
}
|
||||
|
@ -493,8 +495,11 @@ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove)
|
|||
}
|
||||
up_read(&nvm_lock);
|
||||
|
||||
if (!t)
|
||||
if (!t) {
|
||||
pr_err("failed to remove target %s\n",
|
||||
remove->tgtname);
|
||||
return 1;
|
||||
}
|
||||
|
||||
__nvm_remove_target(t, true);
|
||||
kref_put(&dev->ref, nvm_free);
|
||||
|
@ -686,7 +691,7 @@ static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
|
|||
rqd->nr_ppas = nr_ppas;
|
||||
rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
|
||||
if (!rqd->ppa_list) {
|
||||
pr_err("nvm: failed to allocate dma memory\n");
|
||||
pr_err("failed to allocate dma memory\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
|
@ -731,7 +736,7 @@ static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd)
|
|||
return flags;
|
||||
}
|
||||
|
||||
int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
||||
int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf)
|
||||
{
|
||||
struct nvm_dev *dev = tgt_dev->parent;
|
||||
int ret;
|
||||
|
@ -745,19 +750,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
|||
rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd);
|
||||
|
||||
/* In case of error, fail with right address format */
|
||||
ret = dev->ops->submit_io(dev, rqd);
|
||||
ret = dev->ops->submit_io(dev, rqd, buf);
|
||||
if (ret)
|
||||
nvm_rq_dev_to_tgt(tgt_dev, rqd);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(nvm_submit_io);
|
||||
|
||||
int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
||||
static void nvm_sync_end_io(struct nvm_rq *rqd)
|
||||
{
|
||||
struct completion *waiting = rqd->private;
|
||||
|
||||
complete(waiting);
|
||||
}
|
||||
|
||||
static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd,
|
||||
void *buf)
|
||||
{
|
||||
DECLARE_COMPLETION_ONSTACK(wait);
|
||||
int ret = 0;
|
||||
|
||||
rqd->end_io = nvm_sync_end_io;
|
||||
rqd->private = &wait;
|
||||
|
||||
ret = dev->ops->submit_io(dev, rqd, buf);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
wait_for_completion_io(&wait);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
|
||||
void *buf)
|
||||
{
|
||||
struct nvm_dev *dev = tgt_dev->parent;
|
||||
int ret;
|
||||
|
||||
if (!dev->ops->submit_io_sync)
|
||||
if (!dev->ops->submit_io)
|
||||
return -ENODEV;
|
||||
|
||||
nvm_rq_tgt_to_dev(tgt_dev, rqd);
|
||||
|
@ -765,9 +796,7 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
|
|||
rqd->dev = tgt_dev;
|
||||
rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd);
|
||||
|
||||
/* In case of error, fail with right address format */
|
||||
ret = dev->ops->submit_io_sync(dev, rqd);
|
||||
nvm_rq_dev_to_tgt(tgt_dev, rqd);
|
||||
ret = nvm_submit_io_wait(dev, rqd, buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -788,12 +817,13 @@ EXPORT_SYMBOL(nvm_end_io);
|
|||
|
||||
static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd)
|
||||
{
|
||||
if (!dev->ops->submit_io_sync)
|
||||
if (!dev->ops->submit_io)
|
||||
return -ENODEV;
|
||||
|
||||
rqd->dev = NULL;
|
||||
rqd->flags = nvm_set_flags(&dev->geo, rqd);
|
||||
|
||||
return dev->ops->submit_io_sync(dev, rqd);
|
||||
return nvm_submit_io_wait(dev, rqd, NULL);
|
||||
}
|
||||
|
||||
static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa)
|
||||
|
@ -1048,7 +1078,7 @@ int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
|
|||
return 0;
|
||||
|
||||
if (nr_ppas > NVM_MAX_VLBA) {
|
||||
pr_err("nvm: unable to update all blocks atomically\n");
|
||||
pr_err("unable to update all blocks atomically\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -1111,27 +1141,26 @@ static int nvm_init(struct nvm_dev *dev)
|
|||
int ret = -EINVAL;
|
||||
|
||||
if (dev->ops->identity(dev)) {
|
||||
pr_err("nvm: device could not be identified\n");
|
||||
pr_err("device could not be identified\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n",
|
||||
geo->major_ver_id, geo->minor_ver_id,
|
||||
geo->vmnt);
|
||||
pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id,
|
||||
geo->minor_ver_id, geo->vmnt);
|
||||
|
||||
ret = nvm_core_init(dev);
|
||||
if (ret) {
|
||||
pr_err("nvm: could not initialize core structures.\n");
|
||||
pr_err("could not initialize core structures.\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n",
|
||||
pr_info("registered %s [%u/%u/%u/%u/%u]\n",
|
||||
dev->name, dev->geo.ws_min, dev->geo.ws_opt,
|
||||
dev->geo.num_chk, dev->geo.all_luns,
|
||||
dev->geo.num_ch);
|
||||
return 0;
|
||||
err:
|
||||
pr_err("nvm: failed to initialize nvm\n");
|
||||
pr_err("failed to initialize nvm\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -1169,7 +1198,7 @@ int nvm_register(struct nvm_dev *dev)
|
|||
dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist",
|
||||
exp_pool_size);
|
||||
if (!dev->dma_pool) {
|
||||
pr_err("nvm: could not create dma pool\n");
|
||||
pr_err("could not create dma pool\n");
|
||||
kref_put(&dev->ref, nvm_free);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -1214,7 +1243,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
|
|||
up_write(&nvm_lock);
|
||||
|
||||
if (!dev) {
|
||||
pr_err("nvm: device not found\n");
|
||||
pr_err("device not found\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -1288,7 +1317,7 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
|
|||
i++;
|
||||
|
||||
if (i > 31) {
|
||||
pr_err("nvm: max 31 devices can be reported.\n");
|
||||
pr_err("max 31 devices can be reported.\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1315,7 +1344,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
|
|||
|
||||
if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
|
||||
create.conf.e.rsv != 0) {
|
||||
pr_err("nvm: reserved config field in use\n");
|
||||
pr_err("reserved config field in use\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -1331,7 +1360,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
|
|||
flags &= ~NVM_TARGET_FACTORY;
|
||||
|
||||
if (flags) {
|
||||
pr_err("nvm: flag not supported\n");
|
||||
pr_err("flag not supported\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
@ -1349,7 +1378,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
|
|||
remove.tgtname[DISK_NAME_LEN - 1] = '\0';
|
||||
|
||||
if (remove.flags != 0) {
|
||||
pr_err("nvm: no flags supported\n");
|
||||
pr_err("no flags supported\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -1365,7 +1394,7 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
|
|||
return -EFAULT;
|
||||
|
||||
if (init.flags != 0) {
|
||||
pr_err("nvm: no flags supported\n");
|
||||
pr_err("no flags supported\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
|
|
@ -507,7 +507,7 @@ void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write)
|
|||
pblk->sec_per_write = sec_per_write;
|
||||
}
|
||||
|
||||
int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
|
||||
|
@ -518,7 +518,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
return NVM_IO_ERR;
|
||||
#endif
|
||||
|
||||
return nvm_submit_io(dev, rqd);
|
||||
return nvm_submit_io(dev, rqd, buf);
|
||||
}
|
||||
|
||||
void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
@ -541,7 +541,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
}
|
||||
}
|
||||
|
||||
int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
int ret;
|
||||
|
@ -553,7 +553,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
return NVM_IO_ERR;
|
||||
#endif
|
||||
|
||||
ret = nvm_submit_io_sync(dev, rqd);
|
||||
ret = nvm_submit_io_sync(dev, rqd, buf);
|
||||
|
||||
if (trace_pblk_chunk_state_enabled() && !ret &&
|
||||
rqd->opcode == NVM_OP_PWRITE)
|
||||
|
@ -562,65 +562,19 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
return ret;
|
||||
}
|
||||
|
||||
int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
void *buf)
|
||||
{
|
||||
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
|
||||
int ret;
|
||||
|
||||
pblk_down_chunk(pblk, ppa_list[0]);
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
ret = pblk_submit_io_sync(pblk, rqd, buf);
|
||||
pblk_up_chunk(pblk, ppa_list[0]);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void pblk_bio_map_addr_endio(struct bio *bio)
|
||||
{
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
|
||||
unsigned int nr_secs, unsigned int len,
|
||||
int alloc_type, gfp_t gfp_mask)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
void *kaddr = data;
|
||||
struct page *page;
|
||||
struct bio *bio;
|
||||
int i, ret;
|
||||
|
||||
if (alloc_type == PBLK_KMALLOC_META)
|
||||
return bio_map_kern(dev->q, kaddr, len, gfp_mask);
|
||||
|
||||
bio = bio_kmalloc(gfp_mask, nr_secs);
|
||||
if (!bio)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
for (i = 0; i < nr_secs; i++) {
|
||||
page = vmalloc_to_page(kaddr);
|
||||
if (!page) {
|
||||
pblk_err(pblk, "could not map vmalloc bio\n");
|
||||
bio_put(bio);
|
||||
bio = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
|
||||
if (ret != PAGE_SIZE) {
|
||||
pblk_err(pblk, "could not add page to bio\n");
|
||||
bio_put(bio);
|
||||
bio = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
kaddr += PAGE_SIZE;
|
||||
}
|
||||
|
||||
bio->bi_end_io = pblk_bio_map_addr_endio;
|
||||
out:
|
||||
return bio;
|
||||
}
|
||||
|
||||
int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
|
||||
unsigned long secs_to_flush, bool skip_meta)
|
||||
{
|
||||
|
@ -722,9 +676,7 @@ u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
|
|||
|
||||
int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct bio *bio;
|
||||
struct ppa_addr *ppa_list;
|
||||
struct nvm_rq rqd;
|
||||
u64 paddr = pblk_line_smeta_start(pblk, line);
|
||||
|
@ -736,16 +688,6 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
ret = PTR_ERR(bio);
|
||||
goto clear_rqd;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
|
||||
rqd.bio = bio;
|
||||
rqd.opcode = NVM_OP_PREAD;
|
||||
rqd.nr_ppas = lm->smeta_sec;
|
||||
rqd.is_seq = 1;
|
||||
|
@ -754,10 +696,9 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
|
|||
for (i = 0; i < lm->smeta_sec; i++, paddr++)
|
||||
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, &rqd);
|
||||
ret = pblk_submit_io_sync(pblk, &rqd, line->smeta);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
goto clear_rqd;
|
||||
}
|
||||
|
||||
|
@ -776,9 +717,7 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
|
|||
static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
|
||||
u64 paddr)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct bio *bio;
|
||||
struct ppa_addr *ppa_list;
|
||||
struct nvm_rq rqd;
|
||||
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
|
||||
|
@ -791,16 +730,6 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
ret = PTR_ERR(bio);
|
||||
goto clear_rqd;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
|
||||
rqd.bio = bio;
|
||||
rqd.opcode = NVM_OP_PWRITE;
|
||||
rqd.nr_ppas = lm->smeta_sec;
|
||||
rqd.is_seq = 1;
|
||||
|
@ -814,10 +743,9 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
|
|||
meta->lba = lba_list[paddr] = addr_empty;
|
||||
}
|
||||
|
||||
ret = pblk_submit_io_sync_sem(pblk, &rqd);
|
||||
ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
goto clear_rqd;
|
||||
}
|
||||
|
||||
|
@ -838,10 +766,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
|
|||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
void *ppa_list_buf, *meta_list;
|
||||
struct bio *bio;
|
||||
struct ppa_addr *ppa_list;
|
||||
struct nvm_rq rqd;
|
||||
u64 paddr = line->emeta_ssec;
|
||||
|
@ -867,17 +793,6 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
|
|||
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
|
||||
rq_len = rq_ppas * geo->csecs;
|
||||
|
||||
bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
ret = PTR_ERR(bio);
|
||||
goto free_rqd_dma;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
|
||||
rqd.bio = bio;
|
||||
rqd.meta_list = meta_list;
|
||||
rqd.ppa_list = ppa_list_buf;
|
||||
rqd.dma_meta_list = dma_meta_list;
|
||||
|
@ -896,7 +811,6 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
|
|||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
paddr += min;
|
||||
if (pblk_boundary_paddr_checks(pblk, paddr)) {
|
||||
bio_put(bio);
|
||||
ret = -EINTR;
|
||||
goto free_rqd_dma;
|
||||
}
|
||||
|
@ -906,7 +820,6 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
|
|||
}
|
||||
|
||||
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
|
||||
bio_put(bio);
|
||||
ret = -EINTR;
|
||||
goto free_rqd_dma;
|
||||
}
|
||||
|
@ -915,10 +828,9 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
|
|||
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id);
|
||||
}
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, &rqd);
|
||||
ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
goto free_rqd_dma;
|
||||
}
|
||||
|
||||
|
@ -963,7 +875,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
|
|||
/* The write thread schedules erases so that it minimizes disturbances
|
||||
* with writes. Thus, there is no need to take the LUN semaphore.
|
||||
*/
|
||||
ret = pblk_submit_io_sync(pblk, &rqd);
|
||||
ret = pblk_submit_io_sync(pblk, &rqd, NULL);
|
||||
rqd.private = pblk;
|
||||
__pblk_end_io_erase(pblk, &rqd);
|
||||
|
||||
|
@ -1792,7 +1704,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
|
|||
/* The write thread schedules erases so that it minimizes disturbances
|
||||
* with writes. Thus, there is no need to take the LUN semaphore.
|
||||
*/
|
||||
err = pblk_submit_io(pblk, rqd);
|
||||
err = pblk_submit_io(pblk, rqd, NULL);
|
||||
if (err) {
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
@ -1923,13 +1835,11 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
|
|||
static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
|
||||
{
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
unsigned int lba_list_size = lm->emeta_len[2];
|
||||
struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
|
||||
struct pblk_emeta *emeta = line->emeta;
|
||||
|
||||
w_err_gc->lba_list = pblk_malloc(lba_list_size,
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL);
|
||||
memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
|
||||
lba_list_size);
|
||||
}
|
||||
|
|
|
@ -132,14 +132,12 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
|
|||
struct pblk_line *line)
|
||||
{
|
||||
struct line_emeta *emeta_buf;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
unsigned int lba_list_size = lm->emeta_len[2];
|
||||
__le64 *lba_list;
|
||||
int ret;
|
||||
|
||||
emeta_buf = pblk_malloc(lm->emeta_len[0],
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
|
||||
if (!emeta_buf)
|
||||
return NULL;
|
||||
|
||||
|
@ -147,7 +145,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
|
|||
if (ret) {
|
||||
pblk_err(pblk, "line %d read emeta failed (%d)\n",
|
||||
line->id, ret);
|
||||
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
|
||||
kvfree(emeta_buf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -161,16 +159,16 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
|
|||
if (ret) {
|
||||
pblk_err(pblk, "inconsistent emeta (line %d)\n",
|
||||
line->id);
|
||||
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
|
||||
kvfree(emeta_buf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lba_list = pblk_malloc(lba_list_size,
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
lba_list = kvmalloc(lba_list_size, GFP_KERNEL);
|
||||
|
||||
if (lba_list)
|
||||
memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
|
||||
|
||||
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
|
||||
kvfree(emeta_buf);
|
||||
|
||||
return lba_list;
|
||||
}
|
||||
|
@ -181,7 +179,6 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
|||
ws);
|
||||
struct pblk *pblk = line_ws->pblk;
|
||||
struct pblk_line *line = line_ws->line;
|
||||
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
|
||||
struct pblk_line_meta *lm = &pblk->lm;
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
@ -272,7 +269,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
|||
goto next_rq;
|
||||
|
||||
out:
|
||||
pblk_mfree(lba_list, l_mg->emeta_alloc_type);
|
||||
kvfree(lba_list);
|
||||
kfree(line_ws);
|
||||
kfree(invalid_bitmap);
|
||||
|
||||
|
@ -286,7 +283,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
|||
fail_free_gc_rq:
|
||||
kfree(gc_rq);
|
||||
fail_free_lba_list:
|
||||
pblk_mfree(lba_list, l_mg->emeta_alloc_type);
|
||||
kvfree(lba_list);
|
||||
fail_free_invalid_bitmap:
|
||||
kfree(invalid_bitmap);
|
||||
fail_free_ws:
|
||||
|
|
|
@ -543,7 +543,7 @@ static void pblk_line_mg_free(struct pblk *pblk)
|
|||
|
||||
for (i = 0; i < PBLK_DATA_LINES; i++) {
|
||||
kfree(l_mg->sline_meta[i]);
|
||||
pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
|
||||
kvfree(l_mg->eline_meta[i]->buf);
|
||||
kfree(l_mg->eline_meta[i]);
|
||||
}
|
||||
|
||||
|
@ -560,7 +560,7 @@ static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
|
|||
kfree(line->erase_bitmap);
|
||||
kfree(line->chks);
|
||||
|
||||
pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
|
||||
kvfree(w_err_gc->lba_list);
|
||||
kfree(w_err_gc);
|
||||
}
|
||||
|
||||
|
@ -890,29 +890,14 @@ static int pblk_line_mg_init(struct pblk *pblk)
|
|||
if (!emeta)
|
||||
goto fail_free_emeta;
|
||||
|
||||
if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
|
||||
l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
|
||||
|
||||
emeta->buf = vmalloc(lm->emeta_len[0]);
|
||||
if (!emeta->buf) {
|
||||
kfree(emeta);
|
||||
goto fail_free_emeta;
|
||||
}
|
||||
|
||||
emeta->nr_entries = lm->emeta_sec[0];
|
||||
l_mg->eline_meta[i] = emeta;
|
||||
} else {
|
||||
l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
|
||||
|
||||
emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
|
||||
if (!emeta->buf) {
|
||||
kfree(emeta);
|
||||
goto fail_free_emeta;
|
||||
}
|
||||
|
||||
emeta->nr_entries = lm->emeta_sec[0];
|
||||
l_mg->eline_meta[i] = emeta;
|
||||
emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
|
||||
if (!emeta->buf) {
|
||||
kfree(emeta);
|
||||
goto fail_free_emeta;
|
||||
}
|
||||
|
||||
emeta->nr_entries = lm->emeta_sec[0];
|
||||
l_mg->eline_meta[i] = emeta;
|
||||
}
|
||||
|
||||
for (i = 0; i < l_mg->nr_lines; i++)
|
||||
|
@ -926,10 +911,7 @@ static int pblk_line_mg_init(struct pblk *pblk)
|
|||
|
||||
fail_free_emeta:
|
||||
while (--i >= 0) {
|
||||
if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
|
||||
vfree(l_mg->eline_meta[i]->buf);
|
||||
else
|
||||
kfree(l_mg->eline_meta[i]->buf);
|
||||
kvfree(l_mg->eline_meta[i]->buf);
|
||||
kfree(l_mg->eline_meta[i]);
|
||||
}
|
||||
|
||||
|
|
|
@ -342,7 +342,7 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
|||
bio_put(int_bio);
|
||||
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
|
||||
goto split_retry;
|
||||
} else if (pblk_submit_io(pblk, rqd)) {
|
||||
} else if (pblk_submit_io(pblk, rqd, NULL)) {
|
||||
/* Submitting IO to drive failed, let's report an error */
|
||||
rqd->error = -ENODEV;
|
||||
pblk_end_io_read(rqd);
|
||||
|
@ -417,11 +417,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
|
|||
|
||||
int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
||||
{
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct bio *bio;
|
||||
struct nvm_rq rqd;
|
||||
int data_len;
|
||||
int ret = NVM_IO_OK;
|
||||
|
||||
memset(&rqd, 0, sizeof(struct nvm_rq));
|
||||
|
@ -446,26 +442,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
|||
if (!(gc_rq->secs_to_gc))
|
||||
goto out;
|
||||
|
||||
data_len = (gc_rq->secs_to_gc) * geo->csecs;
|
||||
bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
|
||||
PBLK_VMALLOC_META, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
pblk_err(pblk, "could not allocate GC bio (%lu)\n",
|
||||
PTR_ERR(bio));
|
||||
ret = PTR_ERR(bio);
|
||||
goto err_free_dma;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
|
||||
rqd.opcode = NVM_OP_PREAD;
|
||||
rqd.nr_ppas = gc_rq->secs_to_gc;
|
||||
rqd.bio = bio;
|
||||
|
||||
if (pblk_submit_io_sync(pblk, &rqd)) {
|
||||
if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) {
|
||||
ret = -EIO;
|
||||
goto err_free_bio;
|
||||
goto err_free_dma;
|
||||
}
|
||||
|
||||
pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
|
||||
|
@ -489,8 +471,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
|||
pblk_free_rqd_meta(pblk, &rqd);
|
||||
return ret;
|
||||
|
||||
err_free_bio:
|
||||
bio_put(bio);
|
||||
err_free_dma:
|
||||
pblk_free_rqd_meta(pblk, &rqd);
|
||||
return ret;
|
||||
|
|
|
@ -178,12 +178,11 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
|
|||
void *meta_list;
|
||||
struct pblk_pad_rq *pad_rq;
|
||||
struct nvm_rq *rqd;
|
||||
struct bio *bio;
|
||||
struct ppa_addr *ppa_list;
|
||||
void *data;
|
||||
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
|
||||
u64 w_ptr = line->cur_sec;
|
||||
int left_line_ppas, rq_ppas, rq_len;
|
||||
int left_line_ppas, rq_ppas;
|
||||
int i, j;
|
||||
int ret = 0;
|
||||
|
||||
|
@ -212,28 +211,15 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
|
|||
goto fail_complete;
|
||||
}
|
||||
|
||||
rq_len = rq_ppas * geo->csecs;
|
||||
|
||||
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
|
||||
PBLK_VMALLOC_META, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
ret = PTR_ERR(bio);
|
||||
goto fail_complete;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
|
||||
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
|
||||
|
||||
ret = pblk_alloc_rqd_meta(pblk, rqd);
|
||||
if (ret) {
|
||||
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
|
||||
bio_put(bio);
|
||||
goto fail_complete;
|
||||
}
|
||||
|
||||
rqd->bio = bio;
|
||||
rqd->bio = NULL;
|
||||
rqd->opcode = NVM_OP_PWRITE;
|
||||
rqd->is_seq = 1;
|
||||
rqd->nr_ppas = rq_ppas;
|
||||
|
@ -275,13 +261,12 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
|
|||
kref_get(&pad_rq->ref);
|
||||
pblk_down_chunk(pblk, ppa_list[0]);
|
||||
|
||||
ret = pblk_submit_io(pblk, rqd);
|
||||
ret = pblk_submit_io(pblk, rqd, data);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
pblk_up_chunk(pblk, ppa_list[0]);
|
||||
kref_put(&pad_rq->ref, pblk_recov_complete);
|
||||
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
|
||||
bio_put(bio);
|
||||
goto fail_complete;
|
||||
}
|
||||
|
||||
|
@ -375,13 +360,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
|||
struct ppa_addr *ppa_list;
|
||||
void *meta_list;
|
||||
struct nvm_rq *rqd;
|
||||
struct bio *bio;
|
||||
void *data;
|
||||
dma_addr_t dma_ppa_list, dma_meta_list;
|
||||
__le64 *lba_list;
|
||||
u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
|
||||
bool padded = false;
|
||||
int rq_ppas, rq_len;
|
||||
int rq_ppas;
|
||||
int i, j;
|
||||
int ret;
|
||||
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
|
||||
|
@ -404,18 +388,9 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
|||
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
|
||||
if (!rq_ppas)
|
||||
rq_ppas = pblk->min_write_pgs;
|
||||
rq_len = rq_ppas * geo->csecs;
|
||||
|
||||
retry_rq:
|
||||
bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
|
||||
if (IS_ERR(bio))
|
||||
return PTR_ERR(bio);
|
||||
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio_get(bio);
|
||||
|
||||
rqd->bio = bio;
|
||||
rqd->bio = NULL;
|
||||
rqd->opcode = NVM_OP_PREAD;
|
||||
rqd->meta_list = meta_list;
|
||||
rqd->nr_ppas = rq_ppas;
|
||||
|
@ -445,10 +420,9 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
|||
addr_to_gen_ppa(pblk, paddr + j, line->id);
|
||||
}
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
ret = pblk_submit_io_sync(pblk, rqd, data);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -460,24 +434,20 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
|||
|
||||
if (padded) {
|
||||
pblk_log_read_err(pblk, rqd);
|
||||
bio_put(bio);
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
pad_distance = pblk_pad_distance(pblk, line);
|
||||
ret = pblk_recov_pad_line(pblk, line, pad_distance);
|
||||
if (ret) {
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
padded = true;
|
||||
bio_put(bio);
|
||||
goto retry_rq;
|
||||
}
|
||||
|
||||
pblk_get_packed_meta(pblk, rqd);
|
||||
bio_put(bio);
|
||||
|
||||
for (i = 0; i < rqd->nr_ppas; i++) {
|
||||
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
|
||||
|
|
|
@ -373,7 +373,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
|||
struct pblk_emeta *emeta = meta_line->emeta;
|
||||
struct ppa_addr *ppa_list;
|
||||
struct pblk_g_ctx *m_ctx;
|
||||
struct bio *bio;
|
||||
struct nvm_rq *rqd;
|
||||
void *data;
|
||||
u64 paddr;
|
||||
|
@ -391,20 +390,9 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
|||
rq_len = rq_ppas * geo->csecs;
|
||||
data = ((void *)emeta->buf) + emeta->mem;
|
||||
|
||||
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
pblk_err(pblk, "failed to map emeta io");
|
||||
ret = PTR_ERR(bio);
|
||||
goto fail_free_rqd;
|
||||
}
|
||||
bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
rqd->bio = bio;
|
||||
|
||||
ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta);
|
||||
if (ret)
|
||||
goto fail_free_bio;
|
||||
goto fail_free_rqd;
|
||||
|
||||
ppa_list = nvm_rq_to_ppa_list(rqd);
|
||||
for (i = 0; i < rqd->nr_ppas; ) {
|
||||
|
@ -423,7 +411,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
|||
|
||||
pblk_down_chunk(pblk, ppa_list[0]);
|
||||
|
||||
ret = pblk_submit_io(pblk, rqd);
|
||||
ret = pblk_submit_io(pblk, rqd, data);
|
||||
if (ret) {
|
||||
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
|
||||
goto fail_rollback;
|
||||
|
@ -437,8 +425,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
|||
pblk_dealloc_page(pblk, meta_line, rq_ppas);
|
||||
list_add(&meta_line->list, &meta_line->list);
|
||||
spin_unlock(&l_mg->close_lock);
|
||||
fail_free_bio:
|
||||
bio_put(bio);
|
||||
fail_free_rqd:
|
||||
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
|
||||
return ret;
|
||||
|
@ -523,7 +509,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
|||
meta_line = pblk_should_submit_meta_io(pblk, rqd);
|
||||
|
||||
/* Submit data write for current data line */
|
||||
err = pblk_submit_io(pblk, rqd);
|
||||
err = pblk_submit_io(pblk, rqd, NULL);
|
||||
if (err) {
|
||||
pblk_err(pblk, "data I/O submission failed: %d\n", err);
|
||||
return NVM_IO_ERR;
|
||||
|
|
|
@ -481,11 +481,6 @@ struct pblk_line {
|
|||
|
||||
#define PBLK_DATA_LINES 4
|
||||
|
||||
enum {
|
||||
PBLK_KMALLOC_META = 1,
|
||||
PBLK_VMALLOC_META = 2,
|
||||
};
|
||||
|
||||
enum {
|
||||
PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */
|
||||
PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */
|
||||
|
@ -521,9 +516,6 @@ struct pblk_line_mgmt {
|
|||
|
||||
__le32 *vsc_list; /* Valid sector counts for all lines */
|
||||
|
||||
/* Metadata allocation type: VMALLOC | KMALLOC */
|
||||
int emeta_alloc_type;
|
||||
|
||||
/* Pre-allocated metadata for data lines */
|
||||
struct pblk_smeta *sline_meta[PBLK_DATA_LINES];
|
||||
struct pblk_emeta *eline_meta[PBLK_DATA_LINES];
|
||||
|
@ -783,14 +775,10 @@ struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
|
|||
struct ppa_addr ppa);
|
||||
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf);
|
||||
int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf);
|
||||
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
|
||||
void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd);
|
||||
struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
|
||||
unsigned int nr_secs, unsigned int len,
|
||||
int alloc_type, gfp_t gfp_mask);
|
||||
struct pblk_line *pblk_line_get(struct pblk *pblk);
|
||||
struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
|
||||
struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
|
||||
|
@ -938,21 +926,6 @@ void pblk_rl_werr_line_out(struct pblk_rl *rl);
|
|||
int pblk_sysfs_init(struct gendisk *tdisk);
|
||||
void pblk_sysfs_exit(struct gendisk *tdisk);
|
||||
|
||||
static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
|
||||
{
|
||||
if (type == PBLK_KMALLOC_META)
|
||||
return kmalloc(size, flags);
|
||||
return vmalloc(size);
|
||||
}
|
||||
|
||||
static inline void pblk_mfree(void *ptr, int type)
|
||||
{
|
||||
if (type == PBLK_KMALLOC_META)
|
||||
kfree(ptr);
|
||||
else
|
||||
vfree(ptr);
|
||||
}
|
||||
|
||||
static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
|
||||
{
|
||||
return c_ctx - sizeof(struct nvm_rq);
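
The pblk hunks in this stretch retire the pblk_malloc()/pblk_mfree() wrappers and the PBLK_KMALLOC_META/PBLK_VMALLOC_META bookkeeping: kvmalloc()/kvfree() already make the kmalloc-or-vmalloc decision internally, so emeta buffers no longer need to remember how they were allocated. A minimal sketch of the replacement pattern, with the helper and size names chosen only for illustration:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>

/* kvmalloc() tries kmalloc() first and transparently falls back to
 * vmalloc() for large or fragmented allocations.
 */
static __le64 *alloc_lba_list(size_t lba_list_size)
{
	return kvmalloc(lba_list_size, GFP_KERNEL);
}

/* kvfree() handles both allocation paths, so no type flag is needed. */
static void free_lba_list(__le64 *lba_list)
{
	kvfree(lba_list);
}
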
@@ -105,8 +105,14 @@ struct closure_syncer {

static void closure_sync_fn(struct closure *cl)
{
	cl->s->done = 1;
	wake_up_process(cl->s->task);
	struct closure_syncer *s = cl->s;
	struct task_struct *p;

	rcu_read_lock();
	p = READ_ONCE(s->task);
	s->done = 1;
	wake_up_process(p);
	rcu_read_unlock();
}

void __sched __closure_sync(struct closure *cl)
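
The closure_sync_fn() rewrite closes a small race: the moment ->done becomes visible, the waiter in __closure_sync() can return and its on-stack closure_syncer disappears, so the task pointer has to be read first and the wakeup done under rcu_read_lock() so the task_struct cannot be freed underneath wake_up_process(). A hedged sketch of the general pattern (the struct and function names are illustrative, not the bcache code):

#include <linux/rcupdate.h>
#include <linux/sched.h>

struct syncer {
	struct task_struct	*task;
	int			done;
};

static void wake_syncer(struct syncer *s)
{
	struct task_struct *p;

	rcu_read_lock();
	p = READ_ONCE(s->task);	/* grab the task before publishing done */
	s->done = 1;		/* after this store *s may already be gone */
	wake_up_process(p);	/* task_struct stays valid under RCU */
	rcu_read_unlock();
}
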
@@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
	while (size) {
		struct keybuf_key *w;
		unsigned int bytes = min(i->bytes, size);
		int err = copy_to_user(buf, i->buf, bytes);

		if (err)
			return err;
		if (copy_to_user(buf, i->buf, bytes))
			return -EFAULT;

		ret += bytes;
		buf += bytes;
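
The bch_dump_read() change fixes the return convention: copy_to_user() returns the number of bytes it failed to copy, not a negative errno, so the old code could hand a positive leftover count straight back to read(). The corrected pattern in isolation (helper name is illustrative):

#include <linux/types.h>
#include <linux/uaccess.h>

static ssize_t copy_chunk(char __user *buf, const void *src, size_t bytes)
{
	/* copy_to_user() returns 0 on success, else the uncopied byte count */
	if (copy_to_user(buf, src, bytes))
		return -EFAULT;
	return bytes;
}
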
@@ -964,6 +964,7 @@ KTYPE(bch_cache_set_internal);

static int __bch_cache_cmp(const void *l, const void *r)
{
	cond_resched();
	return *((uint16_t *)r) - *((uint16_t *)l);
}
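
Adding cond_resched() to __bch_cache_cmp() lets the bucket-priority sort yield the CPU; sort() itself never reschedules, so sorting every bucket of a large cache from sysfs could otherwise trigger soft-lockup warnings. A sketch of the same idea against a placeholder array:

#include <linux/sched.h>
#include <linux/sort.h>
#include <linux/types.h>

/* descending comparator that yields between compares */
static int u16_cmp_desc(const void *l, const void *r)
{
	cond_resched();
	return *(const u16 *)r - *(const u16 *)l;
}

static void sort_prios(u16 *prios, size_t nr)
{
	/* NULL swap_func: sort() falls back to its built-in swapper */
	sort(prios, nr, sizeof(u16), u16_cmp_desc, NULL);
}
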
@@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
	ret = dm_dispatch_clone_request(clone, rq);
	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
		blk_rq_unprep_clone(clone);
		blk_mq_cleanup_rq(clone);
		tio->ti->type->release_clone_rq(clone, &tio->info);
		tio->clone = NULL;
		return DM_MAPIO_REQUEUE;

@@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
	if (err)
		goto out_kfree_tag_set;

	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
	q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
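
blk_mq_cleanup_rq() is one of the helpers introduced in this series: before dm-rq requeues a clone it lets the underlying queue's driver undo whatever it did when the request was prepared (SCSI wires up the callback in the same series). As best I can tell it is a thin wrapper over an optional blk_mq_ops method, roughly as sketched here under a hypothetical name; see include/linux/blk-mq.h for the authoritative definition:

#include <linux/blk-mq.h>

static inline void my_blk_mq_cleanup_rq(struct request *rq)
{
	/* optional per-driver hook, invoked before a prepared rq is recycled */
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}
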
@@ -258,6 +258,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
		     bio_sector < start_sector))
		goto out_of_bounds;

	if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
		bio_io_error(bio);
		return true;
	}

	if (unlikely(bio_end_sector(bio) > end_sector)) {
		/* This bio crosses a device boundary, so we have to split it */
		struct bio *split = bio_split(bio, end_sector - bio_sector,
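
is_mddev_broken() is new in this series: RAID0 and linear arrays cannot tolerate a missing member, so the request path now checks whether the member device is still alive and, if not, fails the bio and latches the array into the new "broken" state surfaced by the md.c hunks below. A conceptual sketch of the check, under the assumption that it keys off the member gendisk being gone; the real helper lives in drivers/md/md.h and may differ in detail:

/* Illustrative only -- not the exact in-tree helper. */
static bool member_device_gone(struct md_rdev *rdev, const char *md_type)
{
	if (!(rdev->bdev->bd_disk->flags & GENHD_FL_UP)) {
		if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
			pr_warn("md: %s: %s array has a missing/failed member\n",
				mdname(rdev->mddev), md_type);
		return true;
	}
	return false;
}
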
@@ -376,6 +376,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
struct mddev *mddev = q->queuedata;
unsigned int sectors;

if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}

blk_queue_split(q, &bio);

if (mddev == NULL || mddev->pers == NULL) {

@@ -1232,6 +1237,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
if (mddev->level == 0)
mddev->layout = -1;

if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;

@@ -1647,6 +1654,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}

if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
sb->level != 0)
return -EINVAL;

if (!refdev) {
ret = 1;
} else {

@@ -1757,6 +1768,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_chunk_sectors = mddev->chunk_sectors;
}

if (mddev->level == 0 &&
!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
mddev->layout = -1;

if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);

@@ -1826,8 +1841,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
} else
set_bit(In_sync, &rdev->flags);
} else {
/*
* If the array is FROZEN, then the device can't
* be in_sync with rest of array.
*/
if (!test_bit(MD_RECOVERY_FROZEN,
&mddev->recovery))
set_bit(In_sync, &rdev->flags);
}
rdev->raid_disk = role;
break;
}

@@ -3664,11 +3686,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return -EINVAL;
if (decimals < 0)
decimals = 0;
while (decimals < scale) {
result *= 10;
decimals ++;
}
*res = result;
*res = result * int_pow(10, scale - decimals);
return 0;
}

@@ -4155,12 +4173,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
* broken
* RAID0/LINEAR-only: same as clean, but array is missing a member.
* It's useful because RAID0/LINEAR mounted-arrays aren't stopped
* when a member is gone, so this state will at least alert the
* user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
write_pending, active_idle, bad_word};
write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
"write-pending", "active-idle", NULL };
"write-pending", "active-idle", "broken", NULL };

static int match_word(const char *word, char **list)
{

@@ -4176,7 +4199,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;

if (mddev->pers)
if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;

@@ -4196,7 +4219,10 @@ array_state_show(struct mddev *mddev, char *page)
st = active;
spin_unlock(&mddev->lock);
}
else {

if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
st = broken;
} else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)

@@ -4310,6 +4336,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case write_pending:
case active_idle:
case broken:
/* these cannot be set */
break;
}

@@ -5182,6 +5209,34 @@ static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);

static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->fail_last_dev);
}

/*
* Setting fail_last_dev to true to allow last device to be forcibly removed
* from RAID1/RAID10.
*/
static ssize_t
fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
int ret;
bool value;

ret = kstrtobool(buf, &value);
if (ret)
return ret;

if (value != mddev->fail_last_dev)
mddev->fail_last_dev = value;

return len;
}
static struct md_sysfs_entry md_fail_last_dev =
__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
fail_last_dev_store);

static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,

@@ -5198,6 +5253,7 @@ static struct attribute *md_default_attrs[] = {
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
&md_fail_last_dev.attr,
NULL,
};

@@ -5744,9 +5800,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);

md_new_event(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
sysfs_notify_dirent_safe(mddev->sysfs_action);
sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;

bitmap_abort:

@@ -5767,6 +5820,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;

set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;

@@ -5787,9 +5841,14 @@ static int do_md_run(struct mddev *mddev)

set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
sysfs_notify_dirent_safe(mddev->sysfs_state);
sysfs_notify_dirent_safe(mddev->sysfs_action);
sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}

@@ -6849,6 +6908,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->external = 0;

mddev->layout = info->layout;
if (mddev->level == 0)
/* Cannot trust RAID0 layout info here */
mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;

if (mddev->persistent) {

@@ -8900,6 +8962,7 @@ void md_check_recovery(struct mddev *mddev)

if (mddev_trylock(mddev)) {
int spares = 0;
bool try_set_sync = mddev->safemode != 0;

if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;

@@ -8945,7 +9008,7 @@ void md_check_recovery(struct mddev *mddev)
}
}

if (!mddev->external && !mddev->in_sync) {
if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);

@@ -9043,7 +9106,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {

@@ -248,6 +248,12 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
MD_NOT_READY, /* do_md_run() is active, so 'array_state'
* must not report that array is ready yet
*/
MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
* I/O in case an array member is gone/failed.
*/
};

enum mddev_sb_flags {

@@ -487,6 +493,7 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */

bool has_superblocks:1;
bool fail_last_dev:1;
};

enum recovery_flags {

@@ -735,6 +742,19 @@ extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
{
int flags = rdev->bdev->bd_disk->flags;

if (!(flags & GENHD_FL_UP)) {
if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
pr_warn("md: %s: %s array has a missing/failed member\n",
mdname(rdev->mddev), md_type);
return true;
}
return false;
}

static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
int faulty = test_bit(Faulty, &rdev->flags);

@@ -19,6 +19,9 @@
#include "raid0.h"
#include "raid5.h"

static int default_layout = 0;
module_param(default_layout, int, 0644);

#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \

@@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);

if (conf->nr_strip_zones == 1) {
conf->layout = RAID0_ORIG_LAYOUT;
} else if (mddev->layout == RAID0_ORIG_LAYOUT ||
mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = mddev->layout;
} else if (default_layout == RAID0_ORIG_LAYOUT ||
default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = default_layout;
} else {
pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
mdname(mddev));
pr_err("md/raid0: please set raid.default_layout to 1 or 2\n");
err = -ENOTSUPP;
goto abort;
}
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size

@@ -547,10 +566,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)

static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;

@@ -584,8 +605,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}

orig_sector = sector;
zone = find_zone(mddev->private, &sector);
tmp_dev = map_sector(mddev, zone, sector, &sector);
switch (conf->layout) {
case RAID0_ORIG_LAYOUT:
tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
break;
case RAID0_ALT_MULTIZONE_LAYOUT:
tmp_dev = map_sector(mddev, zone, sector, &sector);
break;
default:
WARN("md/raid0:%s: Invalid layout\n", mdname(mddev));
bio_io_error(bio);
return true;
}

if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
bio_io_error(bio);
return true;
}

bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;

@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};

/* Linux 3.14 (20d0189b101) made an unintended change to
* the RAID0 layout for multi-zone arrays (where devices aren't all
* the same size.
* RAID0_ORIG_LAYOUT restores the original layout
* RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
* The layouts are identical when there is only one zone (all
* devices the same size).
*/

enum r0layout {
RAID0_ORIG_LAYOUT = 1,
RAID0_ALT_MULTIZONE_LAYOUT = 2,
};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
enum r0layout layout;
};

#endif

@ -447,19 +447,21 @@ static void raid1_end_write_request(struct bio *bio)
|
|||
/* We never try FailFast to WriteMostly devices */
|
||||
!test_bit(WriteMostly, &rdev->flags)) {
|
||||
md_error(r1_bio->mddev, rdev);
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
/* This is the only remaining device,
|
||||
* We need to retry the write without
|
||||
* FailFast
|
||||
*/
|
||||
set_bit(R1BIO_WriteError, &r1_bio->state);
|
||||
else {
|
||||
/* Finished with this branch */
|
||||
r1_bio->bios[mirror] = NULL;
|
||||
to_put = bio;
|
||||
}
|
||||
} else
|
||||
}
|
||||
|
||||
/*
|
||||
* When the device is faulty, it is not necessary to
|
||||
* handle write error.
|
||||
* For failfast, this is the only remaining device,
|
||||
* We need to retry the write without FailFast.
|
||||
*/
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
set_bit(R1BIO_WriteError, &r1_bio->state);
|
||||
else {
|
||||
/* Finished with this branch */
|
||||
r1_bio->bios[mirror] = NULL;
|
||||
to_put = bio;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Set R1BIO_Uptodate in our master bio, so that we
|
||||
|
@ -872,8 +874,11 @@ static void flush_pending_writes(struct r1conf *conf)
|
|||
* backgroup IO calls must call raise_barrier. Once that returns
|
||||
* there is no normal IO happeing. It must arrange to call
|
||||
* lower_barrier when the particular background IO completes.
|
||||
*
|
||||
* If resync/recovery is interrupted, returns -EINTR;
|
||||
* Otherwise, returns 0.
|
||||
*/
|
||||
static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
|
||||
static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
|
||||
{
|
||||
int idx = sector_to_idx(sector_nr);
|
||||
|
||||
|
@ -1612,12 +1617,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
|
|||
|
||||
/*
|
||||
* If it is not operational, then we have already marked it as dead
|
||||
* else if it is the last working disks, ignore the error, let the
|
||||
* next level up know.
|
||||
* else if it is the last working disks with "fail_last_dev == false",
|
||||
* ignore the error, let the next level up know.
|
||||
* else mark the drive as failed
|
||||
*/
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
if (test_bit(In_sync, &rdev->flags)
|
||||
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
|
||||
&& (conf->raid_disks - mddev->degraded) == 1) {
|
||||
/*
|
||||
* Don't fail the drive, act as though we were just a
|
||||
|
@ -1901,6 +1906,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
} while (sectors_to_go > 0);
|
||||
}
|
||||
|
||||
static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
|
||||
{
|
||||
if (atomic_dec_and_test(&r1_bio->remaining)) {
|
||||
struct mddev *mddev = r1_bio->mddev;
|
||||
int s = r1_bio->sectors;
|
||||
|
||||
if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
|
||||
test_bit(R1BIO_WriteError, &r1_bio->state))
|
||||
reschedule_retry(r1_bio);
|
||||
else {
|
||||
put_buf(r1_bio);
|
||||
md_done_sync(mddev, s, uptodate);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void end_sync_write(struct bio *bio)
|
||||
{
|
||||
int uptodate = !bio->bi_status;
|
||||
|
@ -1927,16 +1948,7 @@ static void end_sync_write(struct bio *bio)
|
|||
)
|
||||
set_bit(R1BIO_MadeGood, &r1_bio->state);
|
||||
|
||||
if (atomic_dec_and_test(&r1_bio->remaining)) {
|
||||
int s = r1_bio->sectors;
|
||||
if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
|
||||
test_bit(R1BIO_WriteError, &r1_bio->state))
|
||||
reschedule_retry(r1_bio);
|
||||
else {
|
||||
put_buf(r1_bio);
|
||||
md_done_sync(mddev, s, uptodate);
|
||||
}
|
||||
}
|
||||
put_sync_write_buf(r1_bio, uptodate);
|
||||
}
|
||||
|
||||
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
|
||||
|
@ -2219,17 +2231,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
generic_make_request(wbio);
|
||||
}
|
||||
|
||||
if (atomic_dec_and_test(&r1_bio->remaining)) {
|
||||
/* if we're here, all write(s) have completed, so clean up */
|
||||
int s = r1_bio->sectors;
|
||||
if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
|
||||
test_bit(R1BIO_WriteError, &r1_bio->state))
|
||||
reschedule_retry(r1_bio);
|
||||
else {
|
||||
put_buf(r1_bio);
|
||||
md_done_sync(mddev, s, 1);
|
||||
}
|
||||
}
|
||||
put_sync_write_buf(r1_bio, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -3127,6 +3129,13 @@ static int raid1_run(struct mddev *mddev)
|
|||
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
|
||||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
|
||||
mddev->degraded++;
|
||||
/*
|
||||
* RAID1 needs at least one disk in active
|
||||
*/
|
||||
if (conf->raid_disks - mddev->degraded < 1) {
|
||||
ret = -EINVAL;
|
||||
goto abort;
|
||||
}
|
||||
|
||||
if (conf->raid_disks - mddev->degraded == 1)
|
||||
mddev->recovery_cp = MaxSector;
|
||||
|
@ -3160,8 +3169,12 @@ static int raid1_run(struct mddev *mddev)
|
|||
ret = md_integrity_register(mddev);
|
||||
if (ret) {
|
||||
md_unregister_thread(&mddev->thread);
|
||||
raid1_free(mddev, conf);
|
||||
goto abort;
|
||||
}
|
||||
return 0;
|
||||
|
||||
abort:
|
||||
raid1_free(mddev, conf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio)
|
|||
if (test_bit(FailFast, &rdev->flags) &&
|
||||
(bio->bi_opf & MD_FAILFAST)) {
|
||||
md_error(rdev->mddev, rdev);
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
/* This is the only remaining device,
|
||||
* We need to retry the write without
|
||||
* FailFast
|
||||
*/
|
||||
set_bit(R10BIO_WriteError, &r10_bio->state);
|
||||
else {
|
||||
r10_bio->devs[slot].bio = NULL;
|
||||
to_put = bio;
|
||||
dec_rdev = 1;
|
||||
}
|
||||
} else
|
||||
}
|
||||
|
||||
/*
|
||||
* When the device is faulty, it is not necessary to
|
||||
* handle write error.
|
||||
* For failfast, this is the only remaining device,
|
||||
* We need to retry the write without FailFast.
|
||||
*/
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
set_bit(R10BIO_WriteError, &r10_bio->state);
|
||||
else {
|
||||
r10_bio->devs[slot].bio = NULL;
|
||||
to_put = bio;
|
||||
dec_rdev = 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
|
@ -1638,12 +1640,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
|
|||
|
||||
/*
|
||||
* If it is not operational, then we have already marked it as dead
|
||||
* else if it is the last working disks, ignore the error, let the
|
||||
* next level up know.
|
||||
* else if it is the last working disks with "fail_last_dev == false",
|
||||
* ignore the error, let the next level up know.
|
||||
* else mark the drive as failed
|
||||
*/
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
if (test_bit(In_sync, &rdev->flags)
|
||||
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
|
||||
&& !enough(conf, rdev->raid_disk)) {
|
||||
/*
|
||||
* Don't fail the drive, just return an IO error.
|
||||
|
|
|
@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi)
|
|||
int set_bad = 0;
|
||||
|
||||
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
|
||||
atomic_inc(&rdev->read_errors);
|
||||
if (!(bi->bi_status == BLK_STS_PROTECTION))
|
||||
atomic_inc(&rdev->read_errors);
|
||||
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
|
||||
pr_warn_ratelimited(
|
||||
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
|
||||
|
@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi)
|
|||
(unsigned long long)s,
|
||||
bdn);
|
||||
} else if (atomic_read(&rdev->read_errors)
|
||||
> conf->max_nr_stripes)
|
||||
pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
|
||||
mdname(conf->mddev), bdn);
|
||||
else
|
||||
> conf->max_nr_stripes) {
|
||||
if (!test_bit(Faulty, &rdev->flags)) {
|
||||
pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
|
||||
mdname(conf->mddev),
|
||||
atomic_read(&rdev->read_errors),
|
||||
conf->max_nr_stripes);
|
||||
pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
|
||||
mdname(conf->mddev), bdn);
|
||||
}
|
||||
} else
|
||||
retry = 1;
|
||||
if (set_bad && test_bit(In_sync, &rdev->flags)
|
||||
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
|
||||
retry = 1;
|
||||
if (retry)
|
||||
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
|
||||
if (sh->qd_idx >= 0 && sh->pd_idx == i)
|
||||
set_bit(R5_ReadError, &sh->dev[i].flags);
|
||||
else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
|
||||
set_bit(R5_ReadError, &sh->dev[i].flags);
|
||||
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
|
||||
} else
|
||||
|
@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
|
|||
(1 << STRIPE_FULL_WRITE) |
|
||||
(1 << STRIPE_BIOFILL_RUN) |
|
||||
(1 << STRIPE_COMPUTE_RUN) |
|
||||
(1 << STRIPE_OPS_REQ_PENDING) |
|
||||
(1 << STRIPE_DISCARD) |
|
||||
(1 << STRIPE_BATCH_READY) |
|
||||
(1 << STRIPE_BATCH_ERR) |
|
||||
|
@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
|||
return;
|
||||
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
|
||||
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
|
||||
last_sector = bio_end_sector(bi);
|
||||
|
||||
bi->bi_next = NULL;
|
||||
|
||||
|
@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
do_flush = false;
|
||||
}
|
||||
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
if (!sh->batch_head)
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
clear_bit(STRIPE_DELAYED, &sh->state);
|
||||
if ((!sh->batch_head || sh == sh->batch_head) &&
|
||||
(bi->bi_opf & REQ_SYNC) &&
|
||||
|
|
|
@ -357,7 +357,6 @@ enum {
|
|||
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
|
||||
STRIPE_BIOFILL_RUN,
|
||||
STRIPE_COMPUTE_RUN,
|
||||
STRIPE_OPS_REQ_PENDING,
|
||||
STRIPE_ON_UNPLUG_LIST,
|
||||
STRIPE_DISCARD,
|
||||
STRIPE_ON_RELEASE_LIST,
|
||||
|
@ -493,9 +492,7 @@ struct disk_info {
|
|||
*/
|
||||
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
|
||||
{
|
||||
int sectors = bio_sectors(bio);
|
||||
|
||||
if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
|
||||
if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
|
||||
return bio->bi_next;
|
||||
else
|
||||
return NULL;
|
||||
|
|
|
@@ -64,6 +64,7 @@ config NVME_TCP
depends on INET
depends on BLK_DEV_NVME
select NVME_FABRICS
select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
the TCP transport. This allows you to use remote block devices

@@ -22,12 +22,12 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include "nvme.h"
#include "fabrics.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

#define NVME_MINORS (1U << MINORBITS)

unsigned int admin_timeout = 60;

@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq);
|
|||
struct workqueue_struct *nvme_delete_wq;
|
||||
EXPORT_SYMBOL_GPL(nvme_delete_wq);
|
||||
|
||||
static DEFINE_IDA(nvme_subsystems_ida);
|
||||
static LIST_HEAD(nvme_subsystems);
|
||||
static DEFINE_MUTEX(nvme_subsystems_lock);
|
||||
|
||||
|
@ -197,9 +196,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
|
|||
return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
|
||||
}
|
||||
|
||||
static blk_status_t nvme_error_status(struct request *req)
|
||||
static blk_status_t nvme_error_status(u16 status)
|
||||
{
|
||||
switch (nvme_req(req)->status & 0x7ff) {
|
||||
switch (status & 0x7ff) {
|
||||
case NVME_SC_SUCCESS:
|
||||
return BLK_STS_OK;
|
||||
case NVME_SC_CAP_EXCEEDED:
|
||||
|
@ -226,6 +225,8 @@ static blk_status_t nvme_error_status(struct request *req)
|
|||
return BLK_STS_PROTECTION;
|
||||
case NVME_SC_RESERVATION_CONFLICT:
|
||||
return BLK_STS_NEXUS;
|
||||
case NVME_SC_HOST_PATH_ERROR:
|
||||
return BLK_STS_TRANSPORT;
|
||||
default:
|
||||
return BLK_STS_IOERR;
|
||||
}
|
||||
|
@ -260,7 +261,7 @@ static void nvme_retry_req(struct request *req)
|
|||
|
||||
void nvme_complete_rq(struct request *req)
|
||||
{
|
||||
blk_status_t status = nvme_error_status(req);
|
||||
blk_status_t status = nvme_error_status(nvme_req(req)->status);
|
||||
|
||||
trace_nvme_complete_rq(req);
|
||||
|
||||
|
@ -279,6 +280,8 @@ void nvme_complete_rq(struct request *req)
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
nvme_trace_bio_complete(req, status);
|
||||
blk_mq_end_request(req, status);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_complete_rq);
|
||||
|
@ -288,8 +291,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
|
|||
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
|
||||
"Cancelling I/O %d", req->tag);
|
||||
|
||||
nvme_req(req)->status = NVME_SC_ABORT_REQ;
|
||||
blk_mq_complete_request_sync(req);
|
||||
/* don't abort one completed request */
|
||||
if (blk_mq_request_completed(req))
|
||||
return true;
|
||||
|
||||
nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
|
||||
blk_mq_complete_request(req);
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_cancel_request);
|
||||
|
@ -1088,10 +1095,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
|
|||
NVME_IDENTIFY_DATA_SIZE);
|
||||
}
|
||||
|
||||
static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
|
||||
unsigned nsid)
|
||||
static int nvme_identify_ns(struct nvme_ctrl *ctrl,
|
||||
unsigned nsid, struct nvme_id_ns **id)
|
||||
{
|
||||
struct nvme_id_ns *id;
|
||||
struct nvme_command c = { };
|
||||
int error;
|
||||
|
||||
|
@ -1100,18 +1106,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
|
|||
c.identify.nsid = cpu_to_le32(nsid);
|
||||
c.identify.cns = NVME_ID_CNS_NS;
|
||||
|
||||
id = kmalloc(sizeof(*id), GFP_KERNEL);
|
||||
if (!id)
|
||||
return NULL;
|
||||
*id = kmalloc(sizeof(**id), GFP_KERNEL);
|
||||
if (!*id)
|
||||
return -ENOMEM;
|
||||
|
||||
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
|
||||
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
|
||||
if (error) {
|
||||
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
|
||||
kfree(id);
|
||||
return NULL;
|
||||
kfree(*id);
|
||||
}
|
||||
|
||||
return id;
|
||||
return error;
|
||||
}
|
||||
|
||||
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
|
||||
|
@ -1180,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
|
|||
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
|
||||
|
||||
#define NVME_AEN_SUPPORTED \
|
||||
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
|
||||
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
|
||||
NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
|
||||
|
||||
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
|
@ -1195,6 +1201,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
|
|||
if (status)
|
||||
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
|
||||
supported_aens);
|
||||
|
||||
queue_work(nvme_wq, &ctrl->async_event_work);
|
||||
}
|
||||
|
||||
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
|
||||
|
@ -1594,9 +1602,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
|
|||
blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
|
||||
}
|
||||
|
||||
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
|
||||
static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
|
||||
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
memset(ids, 0, sizeof(*ids));
|
||||
|
||||
if (ctrl->vs >= NVME_VS(1, 1, 0))
|
||||
|
@ -1607,10 +1617,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
|
|||
/* Don't treat error as fatal we potentially
|
||||
* already have a NGUID or EUI-64
|
||||
*/
|
||||
if (nvme_identify_ns_descs(ctrl, nsid, ids))
|
||||
ret = nvme_identify_ns_descs(ctrl, nsid, ids);
|
||||
if (ret)
|
||||
dev_warn(ctrl->device,
|
||||
"%s: Identify Descriptors failed\n", __func__);
|
||||
"Identify Descriptors failed (%d)\n", ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
|
||||
|
@ -1738,25 +1750,37 @@ static int nvme_revalidate_disk(struct gendisk *disk)
|
|||
return -ENODEV;
|
||||
}
|
||||
|
||||
id = nvme_identify_ns(ctrl, ns->head->ns_id);
|
||||
if (!id)
|
||||
return -ENODEV;
|
||||
ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (id->ncap == 0) {
|
||||
ret = -ENODEV;
|
||||
goto out;
|
||||
goto free_id;
|
||||
}
|
||||
|
||||
__nvme_revalidate_disk(disk, id);
|
||||
nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
|
||||
ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
|
||||
if (ret)
|
||||
goto free_id;
|
||||
|
||||
if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
|
||||
dev_err(ctrl->device,
|
||||
"identifiers changed for nsid %d\n", ns->head->ns_id);
|
||||
ret = -ENODEV;
|
||||
}
|
||||
|
||||
out:
|
||||
free_id:
|
||||
kfree(id);
|
||||
out:
|
||||
/*
|
||||
* Only fail the function if we got a fatal error back from the
|
||||
* device, otherwise ignore the error and just move on.
|
||||
*/
|
||||
if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
|
||||
ret = 0;
|
||||
else if (ret > 0)
|
||||
ret = blk_status_to_errno(nvme_error_status(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -1952,7 +1976,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
|
|||
* bits', but doing so may cause the device to complete commands to the
|
||||
* admin queue ... and we don't know what memory that might be pointing at!
|
||||
*/
|
||||
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
|
||||
int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
int ret;
|
||||
|
||||
|
@ -1966,20 +1990,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
|
|||
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
|
||||
msleep(NVME_QUIRK_DELAY_AMOUNT);
|
||||
|
||||
return nvme_wait_ready(ctrl, cap, false);
|
||||
return nvme_wait_ready(ctrl, ctrl->cap, false);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
|
||||
|
||||
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
|
||||
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
/*
|
||||
* Default to a 4K page size, with the intention to update this
|
||||
* path in the future to accomodate architectures with differing
|
||||
* kernel and IO page sizes.
|
||||
*/
|
||||
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
|
||||
unsigned dev_page_min, page_shift = 12;
|
||||
int ret;
|
||||
|
||||
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
|
||||
if (ret) {
|
||||
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
||||
|
||||
if (page_shift < dev_page_min) {
|
||||
dev_err(ctrl->device,
|
||||
"Minimum device page size %u too large for host (%u)\n",
|
||||
|
@ -1998,7 +2029,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
|
|||
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
|
||||
if (ret)
|
||||
return ret;
|
||||
return nvme_wait_ready(ctrl, cap, true);
|
||||
return nvme_wait_ready(ctrl, ctrl->cap, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
|
||||
|
||||
|
@ -2332,7 +2363,8 @@ static void nvme_release_subsystem(struct device *dev)
|
|||
struct nvme_subsystem *subsys =
|
||||
container_of(dev, struct nvme_subsystem, dev);
|
||||
|
||||
ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
|
||||
if (subsys->instance >= 0)
|
||||
ida_simple_remove(&nvme_instance_ida, subsys->instance);
|
||||
kfree(subsys);
|
||||
}
|
||||
|
||||
|
@ -2361,6 +2393,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
|
|||
|
||||
lockdep_assert_held(&nvme_subsystems_lock);
|
||||
|
||||
/*
|
||||
* Fail matches for discovery subsystems. This results
|
||||
* in each discovery controller bound to a unique subsystem.
|
||||
* This avoids issues with validating controller values
|
||||
* that can only be true when there is a single unique subsystem.
|
||||
* There may be multiple and completely independent entities
|
||||
* that provide discovery controllers.
|
||||
*/
|
||||
if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
|
||||
return NULL;
|
||||
|
||||
list_for_each_entry(subsys, &nvme_subsystems, entry) {
|
||||
if (strcmp(subsys->subnqn, subsysnqn))
|
||||
continue;
|
||||
|
@ -2461,12 +2504,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
|
||||
if (!subsys)
|
||||
return -ENOMEM;
|
||||
ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
|
||||
if (ret < 0) {
|
||||
kfree(subsys);
|
||||
return ret;
|
||||
}
|
||||
subsys->instance = ret;
|
||||
|
||||
subsys->instance = -1;
|
||||
mutex_init(&subsys->lock);
|
||||
kref_init(&subsys->ref);
|
||||
INIT_LIST_HEAD(&subsys->ctrls);
|
||||
|
@ -2485,7 +2524,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
subsys->dev.class = nvme_subsys_class;
|
||||
subsys->dev.release = nvme_release_subsystem;
|
||||
subsys->dev.groups = nvme_subsys_attrs_groups;
|
||||
dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
|
||||
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
|
||||
device_initialize(&subsys->dev);
|
||||
|
||||
mutex_lock(&nvme_subsystems_lock);
|
||||
|
@ -2517,6 +2556,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
|||
goto out_put_subsystem;
|
||||
}
|
||||
|
||||
if (!found)
|
||||
subsys->instance = ctrl->instance;
|
||||
ctrl->subsys = subsys;
|
||||
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
|
||||
mutex_unlock(&nvme_subsystems_lock);
|
||||
|
@ -2574,7 +2615,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
|
|||
int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_id_ctrl *id;
|
||||
u64 cap;
|
||||
int ret, page_shift;
|
||||
u32 max_hw_sectors;
|
||||
bool prev_apst_enabled;
|
||||
|
@ -2584,16 +2624,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
|||
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
|
||||
if (ret) {
|
||||
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
page_shift = NVME_CAP_MPSMIN(cap) + 12;
|
||||
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
||||
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
||||
|
||||
if (ctrl->vs >= NVME_VS(1, 1, 0))
|
||||
ctrl->subsystem = NVME_CAP_NSSRC(cap);
|
||||
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
|
||||
|
||||
ret = nvme_identify_ctrl(ctrl, &id);
|
||||
if (ret) {
|
||||
|
@ -3184,7 +3219,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
|||
head->ns_id = nsid;
|
||||
kref_init(&head->ref);
|
||||
|
||||
nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
|
||||
ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
|
||||
if (ret)
|
||||
goto out_cleanup_srcu;
|
||||
|
||||
ret = __nvme_check_ids(ctrl->subsys, head);
|
||||
if (ret) {
|
||||
|
@ -3209,6 +3246,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
|||
out_free_head:
|
||||
kfree(head);
|
||||
out:
|
||||
if (ret > 0)
|
||||
ret = blk_status_to_errno(nvme_error_status(ret));
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
|
@ -3232,7 +3271,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
|
|||
} else {
|
||||
struct nvme_ns_ids ids;
|
||||
|
||||
nvme_report_ns_ids(ctrl, nsid, id, &ids);
|
||||
ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
|
||||
dev_err(ctrl->device,
|
||||
"IDs don't match for shared namespace %d\n",
|
||||
|
@ -3247,6 +3289,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
|
|||
|
||||
out_unlock:
|
||||
mutex_unlock(&ctrl->subsys->lock);
|
||||
if (ret > 0)
|
||||
ret = blk_status_to_errno(nvme_error_status(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -3338,11 +3382,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
|||
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
|
||||
nvme_set_queue_limits(ctrl, ns->queue);
|
||||
|
||||
id = nvme_identify_ns(ctrl, nsid);
|
||||
if (!id) {
|
||||
ret = -EIO;
|
||||
ret = nvme_identify_ns(ctrl, nsid, &id);
|
||||
if (ret)
|
||||
goto out_free_queue;
|
||||
}
|
||||
|
||||
if (id->ncap == 0) {
|
||||
ret = -EINVAL;
|
||||
|
@ -3404,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
|||
blk_cleanup_queue(ns->queue);
|
||||
out_free_ns:
|
||||
kfree(ns);
|
||||
if (ret > 0)
|
||||
ret = blk_status_to_errno(nvme_error_status(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -3617,6 +3661,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
|
||||
|
||||
static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
|
||||
{
|
||||
struct nvme_ctrl *ctrl =
|
||||
container_of(dev, struct nvme_ctrl, ctrl_device);
|
||||
struct nvmf_ctrl_options *opts = ctrl->opts;
|
||||
int ret;
|
||||
|
||||
ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (opts) {
|
||||
ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = add_uevent_var(env, "NVME_TRSVCID=%s",
|
||||
opts->trsvcid ?: "none");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
|
||||
opts->host_traddr ?: "none");
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
char *envp[2] = { NULL, NULL };
|
||||
|
@ -3723,6 +3794,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
|
|||
queue_work(nvme_wq, &ctrl->ana_work);
|
||||
break;
|
||||
#endif
|
||||
case NVME_AER_NOTICE_DISC_CHANGED:
|
||||
ctrl->aen_result = result;
|
||||
break;
|
||||
default:
|
||||
dev_warn(ctrl->device, "async event result %08x\n", result);
|
||||
}
|
||||
|
@ -3769,10 +3843,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
|
|||
if (ctrl->kato)
|
||||
nvme_start_keep_alive(ctrl);
|
||||
|
||||
nvme_enable_aen(ctrl);
|
||||
|
||||
if (ctrl->queue_count > 1) {
|
||||
nvme_queue_scan(ctrl);
|
||||
nvme_enable_aen(ctrl);
|
||||
queue_work(nvme_wq, &ctrl->async_event_work);
|
||||
nvme_start_queues(ctrl);
|
||||
}
|
||||
}
|
||||
|
@ -3792,7 +3866,9 @@ static void nvme_free_ctrl(struct device *dev)
|
|||
container_of(dev, struct nvme_ctrl, ctrl_device);
|
||||
struct nvme_subsystem *subsys = ctrl->subsys;
|
||||
|
||||
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
|
||||
if (subsys && ctrl->instance != subsys->instance)
|
||||
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
|
||||
|
||||
kfree(ctrl->effects);
|
||||
nvme_mpath_uninit(ctrl);
|
||||
__free_page(ctrl->discard_page);
|
||||
|
@ -3992,6 +4068,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
|
|||
list_for_each_entry(ns, &ctrl->namespaces, list)
|
||||
blk_sync_queue(ns->queue);
|
||||
up_read(&ctrl->namespaces_rwsem);
|
||||
|
||||
if (ctrl->admin_q)
|
||||
blk_sync_queue(ctrl->admin_q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvme_sync_queues);
|
||||
|
||||
|
@ -4050,6 +4129,7 @@ static int __init nvme_core_init(void)
|
|||
result = PTR_ERR(nvme_class);
|
||||
goto unregister_chrdev;
|
||||
}
|
||||
nvme_class->dev_uevent = nvme_class_uevent;
|
||||
|
||||
nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
|
||||
if (IS_ERR(nvme_subsys_class)) {
|
||||
|
@ -4074,7 +4154,6 @@ static int __init nvme_core_init(void)
|
|||
|
||||
static void __exit nvme_core_exit(void)
|
||||
{
|
||||
ida_destroy(&nvme_subsystems_ida);
|
||||
class_destroy(nvme_subsys_class);
|
||||
class_destroy(nvme_class);
|
||||
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
|
||||
|
|
|
@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
|
|||
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
|
||||
cmd.prop_get.offset = cpu_to_le32(off);
|
||||
|
||||
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
|
||||
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
|
||||
NVME_QID_ANY, 0, 0, false);
|
||||
|
||||
if (ret >= 0)
|
||||
|
@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
|
|||
cmd.prop_get.attrib = 1;
|
||||
cmd.prop_get.offset = cpu_to_le32(off);
|
||||
|
||||
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
|
||||
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
|
||||
NVME_QID_ANY, 0, 0, false);
|
||||
|
||||
if (ret >= 0)
|
||||
|
@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
|
|||
cmd.prop_set.offset = cpu_to_le32(off);
|
||||
cmd.prop_set.value = cpu_to_le64(val);
|
||||
|
||||
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
|
||||
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
|
||||
NVME_QID_ANY, 0, 0, false);
|
||||
if (unlikely(ret))
|
||||
dev_err(ctrl->device,
|
||||
|
@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
|
|||
* Set keep-alive timeout in seconds granularity (ms * 1000)
|
||||
* and add a grace period for controller kato enforcement
|
||||
*/
|
||||
cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
|
||||
cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
|
||||
cmd.connect.kato = ctrl->kato ?
|
||||
cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0;
|
||||
|
||||
if (ctrl->opts->disable_sqflow)
|
||||
cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
|
||||
|
@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
|
|||
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
|
||||
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
|
||||
|
||||
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
|
||||
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
|
||||
data, sizeof(*data), 0, NVME_QID_ANY, 1,
|
||||
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
|
||||
if (ret) {
|
||||
|
@ -611,6 +611,7 @@ static const match_table_t opt_tokens = {
|
|||
{ NVMF_OPT_DATA_DIGEST, "data_digest" },
|
||||
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
|
||||
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
|
||||
{ NVMF_OPT_TOS, "tos=%d" },
|
||||
{ NVMF_OPT_ERR, NULL }
|
||||
};
|
||||
|
||||
|
@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
|
|||
opts->duplicate_connect = false;
|
||||
opts->hdr_digest = false;
|
||||
opts->data_digest = false;
|
||||
opts->tos = -1; /* < 0 == use transport default */
|
||||
|
||||
options = o = kstrdup(buf, GFP_KERNEL);
|
||||
if (!options)
|
||||
|
@ -738,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
|
|||
pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
|
||||
}
|
||||
opts->kato = token;
|
||||
|
||||
if (opts->discovery_nqn && opts->kato) {
|
||||
pr_err("Discovery controllers cannot accept KATO != 0\n");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
break;
|
||||
case NVMF_OPT_CTRL_LOSS_TMO:
|
||||
if (match_int(args, &token)) {
|
||||
|
@ -856,6 +851,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
|
|||
}
|
||||
opts->nr_poll_queues = token;
|
||||
break;
|
||||
case NVMF_OPT_TOS:
|
||||
if (match_int(args, &token)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
if (token < 0) {
|
||||
pr_err("Invalid type of service %d\n", token);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
if (token > 255) {
|
||||
pr_warn("Clamping type of service to 255\n");
|
||||
token = 255;
|
||||
}
|
||||
opts->tos = token;
|
||||
break;
|
||||
default:
|
||||
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
|
||||
p);
|
||||
|
@ -865,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
|
|||
}
|
||||
|
||||
if (opts->discovery_nqn) {
|
||||
opts->kato = 0;
|
||||
opts->nr_io_queues = 0;
|
||||
opts->nr_write_queues = 0;
|
||||
opts->nr_poll_queues = 0;
|
||||
|
|
|
@ -55,6 +55,7 @@ enum {
|
|||
NVMF_OPT_DATA_DIGEST = 1 << 16,
|
||||
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
|
||||
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
|
||||
NVMF_OPT_TOS = 1 << 19,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -87,6 +88,7 @@ enum {
|
|||
* @data_digest: generate/verify data digest (TCP)
|
||||
* @nr_write_queues: number of queues for write I/O
|
||||
* @nr_poll_queues: number of queues for polling I/O
|
||||
* @tos: type of service
|
||||
*/
|
||||
struct nvmf_ctrl_options {
|
||||
unsigned mask;
|
||||
|
@ -108,6 +110,7 @@ struct nvmf_ctrl_options {
|
|||
bool data_digest;
|
||||
unsigned int nr_write_queues;
|
||||
unsigned int nr_poll_queues;
|
||||
int tos;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
|
|||
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
|
||||
|
||||
if (opstate == FCPOP_STATE_ABORTED)
|
||||
status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
|
||||
else if (freq->status)
|
||||
status = cpu_to_le16(NVME_SC_INTERNAL << 1);
|
||||
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
|
||||
else if (freq->status) {
|
||||
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"NVME-FC{%d}: io failed due to lldd error %d\n",
|
||||
ctrl->cnum, freq->status);
|
||||
}
|
||||
|
||||
/*
|
||||
* For the linux implementation, if we have an unsuccesful
|
||||
|
@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
|
|||
* no payload in the CQE by the transport.
|
||||
*/
|
||||
if (freq->transferred_length !=
|
||||
be32_to_cpu(op->cmd_iu.data_len)) {
|
||||
status = cpu_to_le16(NVME_SC_INTERNAL << 1);
|
||||
be32_to_cpu(op->cmd_iu.data_len)) {
|
||||
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"NVME-FC{%d}: io failed due to bad transfer "
|
||||
"length: %d vs expected %d\n",
|
||||
ctrl->cnum, freq->transferred_length,
|
||||
be32_to_cpu(op->cmd_iu.data_len));
|
||||
goto done;
|
||||
}
|
||||
result.u64 = 0;
|
||||
|
@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
|
|||
freq->transferred_length ||
|
||||
op->rsp_iu.status_code ||
|
||||
sqe->common.command_id != cqe->command_id)) {
|
||||
status = cpu_to_le16(NVME_SC_INTERNAL << 1);
|
||||
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"NVME-FC{%d}: io failed due to bad NVMe_ERSP: "
|
||||
"iu len %d, xfr len %d vs %d, status code "
|
||||
"%d, cmdid %d vs %d\n",
|
||||
ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
|
||||
be32_to_cpu(op->rsp_iu.xfrd_len),
|
||||
freq->transferred_length,
|
||||
op->rsp_iu.status_code,
|
||||
sqe->common.command_id,
|
||||
cqe->command_id);
|
||||
goto done;
|
||||
}
|
||||
result = cqe->result;
|
||||
|
@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
|
|||
break;
|
||||
|
||||
default:
|
||||
status = cpu_to_le16(NVME_SC_INTERNAL << 1);
|
||||
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"NVME-FC{%d}: io failed due to odd NVMe_xRSP iu "
|
||||
"len %d\n",
|
||||
ctrl->cnum, freq->rcv_rsplen);
|
||||
goto done;
|
||||
}
|
||||
|
||||
|
@ -2006,6 +2029,7 @@ nvme_fc_ctrl_free(struct kref *ref)
|
|||
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
blk_cleanup_queue(ctrl->ctrl.admin_q);
|
||||
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
|
||||
blk_mq_free_tag_set(&ctrl->admin_tag_set);
|
||||
|
||||
kfree(ctrl->queues);
|
||||
|
@ -2107,7 +2131,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
|
|||
struct nvme_fc_fcp_op *op)
|
||||
{
|
||||
struct nvmefc_fcp_req *freq = &op->fcp_req;
|
||||
enum dma_data_direction dir;
|
||||
int ret;
|
||||
|
||||
freq->sg_cnt = 0;
|
||||
|
@ -2124,9 +2147,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
|
|||
|
||||
op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
|
||||
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
|
||||
dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
|
||||
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
|
||||
op->nents, dir);
|
||||
op->nents, rq_dma_dir(rq));
|
||||
if (unlikely(freq->sg_cnt <= 0)) {
|
||||
sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
|
||||
freq->sg_cnt = 0;
|
||||
|
@ -2149,8 +2171,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
|
|||
return;
|
||||
|
||||
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
|
||||
((rq_data_dir(rq) == WRITE) ?
|
||||
DMA_TO_DEVICE : DMA_FROM_DEVICE));
|
||||
rq_dma_dir(rq));
|
||||
|
||||
nvme_cleanup_cmd(rq);
|
||||
|
||||
|
@ -2633,8 +2654,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
|
|||
if (ret)
|
||||
goto out_delete_hw_queue;
|
||||
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
|
||||
if (ret)
|
||||
goto out_disconnect_admin_queue;
|
||||
|
@ -2648,23 +2667,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
|
|||
* prior connection values
|
||||
*/
|
||||
|
||||
ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
"prop_get NVME_REG_CAP failed\n");
|
||||
goto out_disconnect_admin_queue;
|
||||
}
|
||||
|
||||
ctrl->ctrl.sqsize =
|
||||
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
|
||||
|
||||
ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
|
||||
ret = nvme_enable_ctrl(&ctrl->ctrl);
|
||||
if (ret)
|
||||
goto out_disconnect_admin_queue;
|
||||
|
||||
ctrl->ctrl.max_hw_sectors =
|
||||
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
|
||||
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
ret = nvme_init_identify(&ctrl->ctrl);
|
||||
if (ret)
|
||||
goto out_disconnect_admin_queue;
|
||||
|
@ -2774,6 +2785,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
|
|||
nvme_stop_queues(&ctrl->ctrl);
|
||||
blk_mq_tagset_busy_iter(&ctrl->tag_set,
|
||||
nvme_fc_terminate_exchange, &ctrl->ctrl);
|
||||
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2796,6 +2808,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
|
|||
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
|
||||
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
|
||||
nvme_fc_terminate_exchange, &ctrl->ctrl);
|
||||
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
|
||||
|
||||
/* kill the aens as they are a separate path */
|
||||
nvme_fc_abort_aen_ops(ctrl);
|
||||
|
@ -3109,10 +3122,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
|
|||
goto out_free_queues;
|
||||
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
|
||||
|
||||
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
|
||||
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
|
||||
ret = PTR_ERR(ctrl->ctrl.fabrics_q);
|
||||
goto out_free_admin_tag_set;
|
||||
}
|
||||
|
||||
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
|
||||
if (IS_ERR(ctrl->ctrl.admin_q)) {
|
||||
ret = PTR_ERR(ctrl->ctrl.admin_q);
|
||||
goto out_free_admin_tag_set;
|
||||
goto out_cleanup_fabrics_q;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -3184,6 +3203,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
|
|||
|
||||
out_cleanup_admin_q:
|
||||
blk_cleanup_queue(ctrl->ctrl.admin_q);
|
||||
out_cleanup_fabrics_q:
|
||||
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
|
||||
out_free_admin_tag_set:
|
||||
blk_mq_free_tag_set(&ctrl->admin_tag_set);
|
||||
out_free_queues:
|
||||
|
|
|
@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
|
|||
return rq;
|
||||
}
|
||||
|
||||
static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
|
||||
static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
|
||||
void *buf)
|
||||
{
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
struct request_queue *q = dev->q;
|
||||
struct nvme_nvm_command *cmd;
|
||||
struct request *rq;
|
||||
int ret;
|
||||
|
||||
cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
|
||||
if (!cmd)
|
||||
|
@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
|
|||
|
||||
rq = nvme_nvm_alloc_request(q, rqd, cmd);
|
||||
if (IS_ERR(rq)) {
|
||||
kfree(cmd);
|
||||
return PTR_ERR(rq);
|
||||
ret = PTR_ERR(rq);
|
||||
goto err_free_cmd;
|
||||
}
|
||||
|
||||
if (buf) {
|
||||
ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas,
|
||||
GFP_KERNEL);
|
||||
if (ret)
|
||||
goto err_free_cmd;
|
||||
}
|
||||
|
||||
rq->end_io_data = rqd;
|
||||
|
@ -688,33 +698,9 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
|
|||
blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
|
||||
{
|
||||
struct request_queue *q = dev->q;
|
||||
struct request *rq;
|
||||
struct nvme_nvm_command cmd;
|
||||
int ret = 0;
|
||||
|
||||
memset(&cmd, 0, sizeof(struct nvme_nvm_command));
|
||||
|
||||
rq = nvme_nvm_alloc_request(q, rqd, &cmd);
|
||||
if (IS_ERR(rq))
|
||||
return PTR_ERR(rq);
|
||||
|
||||
/* I/Os can fail and the error is signaled through rqd. Callers must
|
||||
* handle the error accordingly.
|
||||
*/
|
||||
blk_execute_rq(q, NULL, rq, 0);
|
||||
if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
|
||||
ret = -EINTR;
|
||||
|
||||
rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
|
||||
rqd->error = nvme_req(rq)->status;
|
||||
|
||||
blk_mq_free_request(rq);
|
||||
|
||||
err_free_cmd:
|
||||
kfree(cmd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -754,7 +740,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
|
|||
.get_chk_meta = nvme_nvm_get_chk_meta,
|
||||
|
||||
.submit_io = nvme_nvm_submit_io,
|
||||
.submit_io_sync = nvme_nvm_submit_io_sync,
|
||||
|
||||
.create_dma_pool = nvme_nvm_create_dma_pool,
|
||||
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
|
||||
|
|
|
@ -509,14 +509,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
|
|||
|
||||
down_write(&ctrl->namespaces_rwsem);
|
||||
list_for_each_entry(ns, &ctrl->namespaces, list) {
|
||||
if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
|
||||
unsigned nsid = le32_to_cpu(desc->nsids[n]);
|
||||
|
||||
if (ns->head->ns_id < nsid)
|
||||
continue;
|
||||
nvme_update_ns_ana_state(desc, ns);
|
||||
if (ns->head->ns_id == nsid)
|
||||
nvme_update_ns_ana_state(desc, ns);
|
||||
if (++n == nr_nsids)
|
||||
break;
|
||||
}
|
||||
up_write(&ctrl->namespaces_rwsem);
|
||||
WARN_ON_ONCE(n < nr_nsids);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
#include <linux/fault-inject.h>
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
extern unsigned int nvme_io_timeout;
|
||||
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
|
||||
|
||||
|
@ -97,6 +99,21 @@ enum nvme_quirks {
|
|||
* Force simple suspend/resume path.
|
||||
*/
|
||||
NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10),
|
||||
|
||||
/*
|
||||
* Use only one interrupt vector for all queues
|
||||
*/
|
||||
NVME_QUIRK_SINGLE_VECTOR = (1 << 11),
|
||||
|
||||
/*
|
||||
* Use non-standard 128 bytes SQEs.
|
||||
*/
|
||||
NVME_QUIRK_128_BYTES_SQES = (1 << 12),
|
||||
|
||||
/*
|
||||
* Prevent tag overlap between queues
|
||||
*/
|
||||
NVME_QUIRK_SHARED_TAGS = (1 << 13),
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -169,6 +186,7 @@ struct nvme_ctrl {
|
|||
const struct nvme_ctrl_ops *ops;
|
||||
struct request_queue *admin_q;
|
||||
struct request_queue *connect_q;
|
||||
struct request_queue *fabrics_q;
|
||||
struct device *dev;
|
||||
int instance;
|
||||
int numa_node;
|
||||
|
@ -431,8 +449,8 @@ void nvme_complete_rq(struct request *req);
|
|||
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
|
||||
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
||||
enum nvme_ctrl_state new_state);
|
||||
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
|
||||
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
|
||||
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
|
||||
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
|
||||
const struct nvme_ctrl_ops *ops, unsigned long quirks);
|
||||
|
@ -520,6 +538,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
|
|||
kblockd_schedule_work(&head->requeue_work);
|
||||
}
|
||||
|
||||
static inline void nvme_trace_bio_complete(struct request *req,
|
||||
blk_status_t status)
|
||||
{
|
||||
struct nvme_ns *ns = req->q->queuedata;
|
||||
|
||||
if (req->cmd_flags & REQ_NVME_MPATH)
|
||||
trace_block_bio_complete(ns->head->disk->queue,
|
||||
req->bio, status);
|
||||
}
|
||||
|
||||
extern struct device_attribute dev_attr_ana_grpid;
|
||||
extern struct device_attribute dev_attr_ana_state;
|
||||
extern struct device_attribute subsys_attr_iopolicy;
|
||||
|

@@ -567,6 +595,10 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
static inline void nvme_trace_bio_complete(struct request *req,
blk_status_t status)
{
}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{

@@ -28,8 +28,8 @@
#include "trace.h"
#include "nvme.h"

#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))

@@ -100,6 +100,7 @@ struct nvme_dev {
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;

@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;

@@ -178,6 +179,7 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
u8 sqes;
unsigned long flags;
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1

@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);

@@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
enum dma_data_direction dma_dir = rq_data_dir(req) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE;
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;

if (iod->dma_len) {
dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
rq_dma_dir(req));
return;
}

@@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
return;

if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
nvmeq->sq_cmds, SQ_SIZE(nvmeq));
} else {
dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}

@@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
nvme_disable_ctrl(&dev->ctrl);

nvme_poll_irqdisable(nvmeq, -1);
}

@@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
int qid)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);

if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
if (nvmeq->sq_cmds) {
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);

@@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0;
}

pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
}
}

nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;

@@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (dev->ctrl.queue_count > qid)
return 0;

nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
nvmeq->q_depth = depth;
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;

if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;

nvmeq->dev = dev;

@@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->qid = qid;
dev->ctrl.queue_count++;

return 0;

free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}

@@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */

@@ -1552,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
nvme_init_queue(nvmeq, qid);

if (!polled) {
nvmeq->cq_vector = vector;
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;

@@ -1679,7 +1681,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
result = nvme_disable_ctrl(&dev->ctrl);
if (result < 0)
return result;

@@ -1695,7 +1697,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;

@@ -2077,6 +2079,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;

/*
* Some Apple controllers require all queues to use the
* first vector.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
irq_queues = 1;

return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}

@@ -2095,6 +2104,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
unsigned long size;

nr_io_queues = max_io_queues();

/*
* If tags are shared with admin queue (Apple bug), then
* make sure we only use one IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;

result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;

@@ -2265,6 +2282,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;

/*
* Some Apple controllers requires tags to be unique
* across admin and IO queue, so reserve the first 32
* tags of the IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
dev->tagset.reserved_tags = NVME_AQ_DEPTH;

ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,

@@ -2314,9 +2339,20 @@ static int nvme_pci_enable(struct nvme_dev *dev)

dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;

/*
* Some Apple controllers require a non-standard SQE size.
* Interestingly they also seem to ignore the CC:IOSQES register
* so we don't bother updating it here.
*/
if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
dev->io_sqes = 7;
else
dev->io_sqes = NVME_NVM_IOSQES;

/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.

@@ -2334,6 +2370,18 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth);
}

/*
* Controllers with the shared tags quirk need the IO queue to be
* big enough so that we get 32 tags for the admin queue
*/
if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
dev->q_depth = NVME_AQ_DEPTH + 2;
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}

nvme_map_cmb(dev);

pci_enable_pcie_error_reporting(pdev);

@@ -2401,6 +2449,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)

blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_wait_completed_request(&dev->tagset);
blk_mq_tagset_wait_completed_request(&dev->admin_tagset);

/*
* The driver will not be starting up queues again if shutting down so

@@ -3041,6 +3091,10 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
NVME_QUIRK_128_BYTES_SQES |
NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

@@ -757,6 +757,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {

@@ -798,10 +799,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_async_qe;
}

ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
error = PTR_ERR(ctrl->ctrl.fabrics_q);
goto out_free_tagset;
}

ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_tagset;
goto out_cleanup_fabrics_q;
}
}

@@ -809,24 +816,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_cleanup_queue;

error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
&ctrl->ctrl.cap);
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
goto out_stop_queue;
}

ctrl->ctrl.sqsize =
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);

error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;

ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);

blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_stop_queue;

@@ -838,6 +836,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
if (new)
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);

@@ -907,10 +908,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
if (ctrl->ctrl.admin_tagset)
if (ctrl->ctrl.admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
}
if (remove)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}

@@ -920,9 +924,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset)
if (ctrl->ctrl.tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
}
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);

@@ -1059,6 +1065,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */

@@ -1145,9 +1152,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
req->mr = NULL;
}

ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
req->nents, rq_data_dir(rq) ==
WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));

nvme_cleanup_cmd(rq);
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);

@@ -1273,7 +1278,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);

count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;

@@ -1302,9 +1307,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
return 0;

out_unmap_sg:
ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
req->nents, rq_data_dir(rq) ==
WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;

@@ -1547,16 +1550,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,

static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;

ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;

if (ctrl->opts->tos >= 0)
rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"rdma_resolve_route failed (%d).\n",
dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}

@@ -1869,10 +1874,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&ctrl->reconnect_work);

nvme_rdma_teardown_io_queues(ctrl, shutdown);
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}

@@ -2051,7 +2057,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
@ -13,6 +13,7 @@
|
|||
#include <net/tcp.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <net/busy_poll.h>
|
||||
|
||||
#include "nvme.h"
|
||||
#include "fabrics.h"
|
||||
|
@ -72,6 +73,7 @@ struct nvme_tcp_queue {
|
|||
int pdu_offset;
|
||||
size_t data_remaining;
|
||||
size_t ddgst_remaining;
|
||||
unsigned int nr_cqe;
|
||||
|
||||
/* send state */
|
||||
struct nvme_tcp_request *request;
|
||||
|
@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
|
|||
}
|
||||
|
||||
nvme_end_request(rq, cqe->status, cqe->result);
|
||||
queue->nr_cqe++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
|
|||
|
||||
switch (hdr->type) {
|
||||
case nvme_tcp_c2h_data:
|
||||
ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
|
||||
break;
|
||||
return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
|
||||
case nvme_tcp_rsp:
|
||||
nvme_tcp_init_recv_ctx(queue);
|
||||
ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
|
||||
break;
|
||||
return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
|
||||
case nvme_tcp_r2t:
|
||||
nvme_tcp_init_recv_ctx(queue);
|
||||
ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
|
||||
break;
|
||||
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
|
||||
default:
|
||||
dev_err(queue->ctrl->ctrl.device,
|
||||
"unsupported pdu type (%d)\n", hdr->type);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
|
||||
|
@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
|
|||
nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
|
||||
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
|
||||
} else {
|
||||
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
|
||||
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
|
||||
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
|
||||
queue->nr_cqe++;
|
||||
}
|
||||
nvme_tcp_init_recv_ctx(queue);
|
||||
}
|
||||
}
|
||||
|
@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
|
|||
pdu->command_id);
|
||||
|
||||
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
|
||||
queue->nr_cqe++;
|
||||
}
|
||||
|
||||
nvme_tcp_init_recv_ctx(queue);
|
||||
|
@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
|
|||
|
||||
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
|
||||
{
|
||||
nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
|
||||
nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
|
||||
}
|
||||
|
||||
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
|
||||
|
@ -1023,14 +1024,16 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
|
|||
|
||||
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
|
||||
{
|
||||
struct sock *sk = queue->sock->sk;
|
||||
struct socket *sock = queue->sock;
|
||||
struct sock *sk = sock->sk;
|
||||
read_descriptor_t rd_desc;
|
||||
int consumed;
|
||||
|
||||
rd_desc.arg.data = queue;
|
||||
rd_desc.count = 1;
|
||||
lock_sock(sk);
|
||||
consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
|
||||
queue->nr_cqe = 0;
|
||||
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
|
||||
release_sock(sk);
|
||||
return consumed;
|
||||
}
|
||||
|
@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
queue->queue_size = queue_size;
|
||||
|
||||
if (qid > 0)
|
||||
queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
|
||||
queue->cmnd_capsule_len = nctrl->ioccsz * 16;
|
||||
else
|
||||
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
|
||||
NVME_TCP_ADMIN_CCSZ;
|
||||
|
@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
|
||||
IPPROTO_TCP, &queue->sock);
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to create socket: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
|
||||
(char *)&opt, sizeof(opt));
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to set TCP_SYNCNT sock opt %d\n", ret);
|
||||
goto err_sock;
|
||||
}
|
||||
|
@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
|
||||
TCP_NODELAY, (char *)&opt, sizeof(opt));
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to set TCP_NODELAY sock opt %d\n", ret);
|
||||
goto err_sock;
|
||||
}
|
||||
|
@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
|
||||
(char *)&sol, sizeof(sol));
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to set SO_LINGER sock opt %d\n", ret);
|
||||
goto err_sock;
|
||||
}
|
||||
|
||||
/* Set socket type of service */
|
||||
if (nctrl->opts->tos >= 0) {
|
||||
opt = nctrl->opts->tos;
|
||||
ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
|
||||
(char *)&opt, sizeof(opt));
|
||||
if (ret) {
|
||||
dev_err(nctrl->device,
|
||||
"failed to set IP_TOS sock opt %d\n", ret);
|
||||
goto err_sock;
|
||||
}
|
||||
}
|
||||
|
||||
queue->sock->sk->sk_allocation = GFP_ATOMIC;
|
||||
if (!qid)
|
||||
n = 0;
|
||||
|
@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
queue->pdu_offset = 0;
|
||||
sk_set_memalloc(queue->sock->sk);
|
||||
|
||||
if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
|
||||
if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
|
||||
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
|
||||
sizeof(ctrl->src_addr));
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to bind queue %d socket %d\n",
|
||||
qid, ret);
|
||||
goto err_sock;
|
||||
|
@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
if (queue->hdr_digest || queue->data_digest) {
|
||||
ret = nvme_tcp_alloc_crypto(queue);
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to allocate queue %d crypto\n", qid);
|
||||
goto err_sock;
|
||||
}
|
||||
|
@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
goto err_crypto;
|
||||
}
|
||||
|
||||
dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
|
||||
dev_dbg(nctrl->device, "connecting queue %d\n",
|
||||
nvme_tcp_queue_id(queue));
|
||||
|
||||
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
|
||||
sizeof(ctrl->addr), 0);
|
||||
if (ret) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
dev_err(nctrl->device,
|
||||
"failed to connect socket: %d\n", ret);
|
||||
goto err_rcv_pdu;
|
||||
}
|
||||
|
@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
|||
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
|
||||
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
|
||||
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
|
||||
queue->sock->sk->sk_ll_usec = 1;
|
||||
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
|
||||
|
||||
return 0;
|
||||
|
@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
|
|||
set->driver_data = ctrl;
|
||||
set->nr_hw_queues = nctrl->queue_count - 1;
|
||||
set->timeout = NVME_IO_TIMEOUT;
|
||||
set->nr_maps = 2 /* default + read */;
|
||||
set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
|
||||
}
|
||||
|
||||
ret = blk_mq_alloc_tag_set(set);
|
||||
|
@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
|
|||
|
||||
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
|
||||
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
|
||||
nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
|
||||
|
||||
return nr_io_queues;
|
||||
}
|
||||
|
@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
|
|||
min(opts->nr_io_queues, nr_io_queues);
|
||||
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
|
||||
}
|
||||
|
||||
if (opts->nr_poll_queues && nr_io_queues) {
|
||||
/* map dedicated poll queues only if we have queues left */
|
||||
ctrl->io_queues[HCTX_TYPE_POLL] =
|
||||
min(opts->nr_poll_queues, nr_io_queues);
|
||||
}
|
||||
}
|
||||
|
||||
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
|
||||
|
@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
|
|||
nvme_tcp_stop_queue(ctrl, 0);
|
||||
if (remove) {
|
||||
blk_cleanup_queue(ctrl->admin_q);
|
||||
blk_cleanup_queue(ctrl->fabrics_q);
|
||||
blk_mq_free_tag_set(ctrl->admin_tagset);
|
||||
}
|
||||
nvme_tcp_free_admin_queue(ctrl);
|
||||
|
@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
|
|||
goto out_free_queue;
|
||||
}
|
||||
|
||||
ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
|
||||
if (IS_ERR(ctrl->fabrics_q)) {
|
||||
error = PTR_ERR(ctrl->fabrics_q);
|
||||
goto out_free_tagset;
|
||||
}
|
||||
|
||||
ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
|
||||
if (IS_ERR(ctrl->admin_q)) {
|
||||
error = PTR_ERR(ctrl->admin_q);
|
||||
goto out_free_tagset;
|
||||
goto out_cleanup_fabrics_q;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
|
|||
if (error)
|
||||
goto out_cleanup_queue;
|
||||
|
||||
error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
|
||||
if (error) {
|
||||
dev_err(ctrl->device,
|
||||
"prop_get NVME_REG_CAP failed\n");
|
||||
goto out_stop_queue;
|
||||
}
|
||||
|
||||
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
||||
|
||||
error = nvme_enable_ctrl(ctrl, ctrl->cap);
|
||||
error = nvme_enable_ctrl(ctrl);
|
||||
if (error)
|
||||
goto out_stop_queue;
|
||||
|
||||
blk_mq_unquiesce_queue(ctrl->admin_q);
|
||||
|
||||
error = nvme_init_identify(ctrl);
|
||||
if (error)
|
||||
goto out_stop_queue;
|
||||
|
@ -1735,6 +1758,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
|
|||
out_cleanup_queue:
|
||||
if (new)
|
||||
blk_cleanup_queue(ctrl->admin_q);
|
||||
out_cleanup_fabrics_q:
|
||||
if (new)
|
||||
blk_cleanup_queue(ctrl->fabrics_q);
|
||||
out_free_tagset:
|
||||
if (new)
|
||||
blk_mq_free_tag_set(ctrl->admin_tagset);
|
||||
|
@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
|
|||
{
|
||||
blk_mq_quiesce_queue(ctrl->admin_q);
|
||||
nvme_tcp_stop_queue(ctrl, 0);
|
||||
if (ctrl->admin_tagset)
|
||||
if (ctrl->admin_tagset) {
|
||||
blk_mq_tagset_busy_iter(ctrl->admin_tagset,
|
||||
nvme_cancel_request, ctrl);
|
||||
blk_mq_unquiesce_queue(ctrl->admin_q);
|
||||
blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
|
||||
}
|
||||
if (remove)
|
||||
blk_mq_unquiesce_queue(ctrl->admin_q);
|
||||
nvme_tcp_destroy_admin_queue(ctrl, remove);
|
||||
}
|
||||
|
||||
|
@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
|
|||
return;
|
||||
nvme_stop_queues(ctrl);
|
||||
nvme_tcp_stop_io_queues(ctrl);
|
||||
if (ctrl->tagset)
|
||||
if (ctrl->tagset) {
|
||||
blk_mq_tagset_busy_iter(ctrl->tagset,
|
||||
nvme_cancel_request, ctrl);
|
||||
blk_mq_tagset_wait_completed_request(ctrl->tagset);
|
||||
}
|
||||
if (remove)
|
||||
nvme_start_queues(ctrl);
|
||||
nvme_tcp_destroy_io_queues(ctrl, remove);
|
||||
|
@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
|
|||
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
|
||||
{
|
||||
struct nvmf_ctrl_options *opts = ctrl->opts;
|
||||
int ret = -EINVAL;
|
||||
int ret;
|
||||
|
||||
ret = nvme_tcp_configure_admin_queue(ctrl, new);
|
||||
if (ret)
|
||||
|
@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
|
|||
/* unquiesce to fail fast pending requests */
|
||||
nvme_start_queues(ctrl);
|
||||
nvme_tcp_teardown_admin_queue(ctrl, false);
|
||||
blk_mq_unquiesce_queue(ctrl->admin_q);
|
||||
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
|
||||
/* state change failure is ok if we're in DELETING state */
|
||||
|
@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
|
|||
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
|
||||
|
||||
nvme_tcp_teardown_io_queues(ctrl, shutdown);
|
||||
blk_mq_quiesce_queue(ctrl->admin_q);
|
||||
if (shutdown)
|
||||
nvme_shutdown_ctrl(ctrl);
|
||||
else
|
||||
nvme_disable_ctrl(ctrl, ctrl->cap);
|
||||
nvme_disable_ctrl(ctrl);
|
||||
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
|
||||
}
|
||||
|
||||
|
@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
|
|||
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
|
||||
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
|
||||
|
||||
if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
|
||||
/* map dedicated poll queues only if we have queues left */
|
||||
set->map[HCTX_TYPE_POLL].nr_queues =
|
||||
ctrl->io_queues[HCTX_TYPE_POLL];
|
||||
set->map[HCTX_TYPE_POLL].queue_offset =
|
||||
ctrl->io_queues[HCTX_TYPE_DEFAULT] +
|
||||
ctrl->io_queues[HCTX_TYPE_READ];
|
||||
blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
|
||||
}
|
||||
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"mapped %d/%d default/read queues.\n",
|
||||
"mapped %d/%d/%d default/read/poll queues.\n",
|
||||
ctrl->io_queues[HCTX_TYPE_DEFAULT],
|
||||
ctrl->io_queues[HCTX_TYPE_READ]);
|
||||
ctrl->io_queues[HCTX_TYPE_READ],
|
||||
ctrl->io_queues[HCTX_TYPE_POLL]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct nvme_tcp_queue *queue = hctx->driver_data;
|
||||
struct sock *sk = queue->sock->sk;
|
||||
|
||||
if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
|
||||
sk_busy_loop(sk, true);
|
||||
nvme_tcp_try_recv(queue);
|
||||
return queue->nr_cqe;
|
||||
}
|
||||
|
||||
static struct blk_mq_ops nvme_tcp_mq_ops = {
|
||||
.queue_rq = nvme_tcp_queue_rq,
|
||||
.complete = nvme_complete_rq,
|
||||
|
@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
|
|||
.init_hctx = nvme_tcp_init_hctx,
|
||||
.timeout = nvme_tcp_timeout,
|
||||
.map_queues = nvme_tcp_map_queues,
|
||||
.poll = nvme_tcp_poll,
|
||||
};
|
||||
|
||||
static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
|
||||
|
@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
|
|||
|
||||
INIT_LIST_HEAD(&ctrl->list);
|
||||
ctrl->ctrl.opts = opts;
|
||||
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
|
||||
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
|
||||
opts->nr_poll_queues + 1;
|
||||
ctrl->ctrl.sqsize = opts->queue_size - 1;
|
||||
ctrl->ctrl.kato = opts->kato;
|
||||
|
||||
|
@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
|
|||
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
|
||||
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
|
||||
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
|
||||
NVMF_OPT_NR_WRITE_QUEUES,
|
||||
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
|
||||
NVMF_OPT_TOS,
|
||||
.create_ctrl = nvme_tcp_create_ctrl,
|
||||
};
|
||||
|
||||
|
|
|
@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_get_lba_status(struct trace_seq *p,
|
||||
u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u64 slba = get_unaligned_le64(cdw10);
|
||||
u32 mndw = get_unaligned_le32(cdw10 + 8);
|
||||
u16 rl = get_unaligned_le16(cdw10 + 12);
|
||||
u8 atype = cdw10[15];
|
||||
|
||||
trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
|
||||
slba, mndw, rl, atype);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
|
@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
|
|||
return nvme_trace_admin_identify(p, cdw10);
|
||||
case nvme_admin_get_features:
|
||||
return nvme_trace_admin_get_features(p, cdw10);
|
||||
case nvme_admin_get_lba_status:
|
||||
return nvme_trace_get_lba_status(p, cdw10);
|
||||
default:
|
||||
return nvme_trace_common(p, cdw10);
|
||||
}
|
||||
|
|
|
@ -37,7 +37,6 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
|
|||
static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u16 status = NVME_SC_SUCCESS;
|
||||
unsigned long flags;
|
||||
off_t offset = 0;
|
||||
u64 slot;
|
||||
|
@ -47,9 +46,8 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
|
|||
slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
|
||||
|
||||
for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
|
||||
status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
|
||||
sizeof(struct nvme_error_slot));
|
||||
if (status)
|
||||
if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
|
||||
sizeof(struct nvme_error_slot)))
|
||||
break;
|
||||
|
||||
if (slot == 0)
|
||||
|
@ -59,7 +57,7 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
|
|||
offset += sizeof(struct nvme_error_slot);
|
||||
}
|
||||
spin_unlock_irqrestore(&ctrl->error_lock, flags);
|
||||
nvmet_req_complete(req, status);
|
||||
nvmet_req_complete(req, 0);
|
||||
}
|
||||
|
||||
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
|
||||
|
@ -81,9 +79,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
|
|||
goto out;
|
||||
|
||||
host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
|
||||
data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
|
||||
data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
|
||||
sectors[READ]), 1000);
|
||||
host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
|
||||
data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
|
||||
data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
|
||||
sectors[WRITE]), 1000);
|
||||
|
||||
put_unaligned_le64(host_reads, &slog->host_reads[0]);
|
||||
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
|
||||
|
@ -111,11 +111,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
|
|||
if (!ns->bdev)
|
||||
continue;
|
||||
host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
|
||||
data_units_read +=
|
||||
part_stat_read(ns->bdev->bd_part, sectors[READ]);
|
||||
data_units_read += DIV_ROUND_UP(
|
||||
part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000);
|
||||
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
|
||||
data_units_written +=
|
||||
part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
|
||||
data_units_written += DIV_ROUND_UP(
|
||||
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
|
||||
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
|
|
@ -381,9 +381,7 @@ int __init nvmet_init_discovery(void)
|
|||
{
|
||||
nvmet_disc_subsys =
|
||||
nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
|
||||
if (IS_ERR(nvmet_disc_subsys))
|
||||
return PTR_ERR(nvmet_disc_subsys);
|
||||
return 0;
|
||||
return PTR_ERR_OR_ZERO(nvmet_disc_subsys);
|
||||
}
|
||||
|
||||
void nvmet_exit_discovery(void)
|
||||
|
|
|
@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
|
||||
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
|
||||
blk_cleanup_queue(ctrl->ctrl.admin_q);
|
||||
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
|
||||
blk_mq_free_tag_set(&ctrl->admin_tag_set);
|
||||
}
|
||||
|
||||
|
@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
goto out_free_sq;
|
||||
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
|
||||
|
||||
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
|
||||
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
|
||||
error = PTR_ERR(ctrl->ctrl.fabrics_q);
|
||||
goto out_free_tagset;
|
||||
}
|
||||
|
||||
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
|
||||
if (IS_ERR(ctrl->ctrl.admin_q)) {
|
||||
error = PTR_ERR(ctrl->ctrl.admin_q);
|
||||
goto out_free_tagset;
|
||||
goto out_cleanup_fabrics_q;
|
||||
}
|
||||
|
||||
error = nvmf_connect_admin_queue(&ctrl->ctrl);
|
||||
|
@ -369,23 +376,15 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
|
||||
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
|
||||
|
||||
error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
|
||||
if (error) {
|
||||
dev_err(ctrl->ctrl.device,
|
||||
"prop_get NVME_REG_CAP failed\n");
|
||||
goto out_cleanup_queue;
|
||||
}
|
||||
|
||||
ctrl->ctrl.sqsize =
|
||||
min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
|
||||
|
||||
error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
|
||||
error = nvme_enable_ctrl(&ctrl->ctrl);
|
||||
if (error)
|
||||
goto out_cleanup_queue;
|
||||
|
||||
ctrl->ctrl.max_hw_sectors =
|
||||
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
|
||||
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
error = nvme_init_identify(&ctrl->ctrl);
|
||||
if (error)
|
||||
goto out_cleanup_queue;
|
||||
|
@ -394,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
|
||||
out_cleanup_queue:
|
||||
blk_cleanup_queue(ctrl->ctrl.admin_q);
|
||||
out_cleanup_fabrics_q:
|
||||
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
|
||||
out_free_tagset:
|
||||
blk_mq_free_tag_set(&ctrl->admin_tag_set);
|
||||
out_free_sq:
|
||||
|
@ -407,16 +408,17 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
|
|||
nvme_stop_queues(&ctrl->ctrl);
|
||||
blk_mq_tagset_busy_iter(&ctrl->tag_set,
|
||||
nvme_cancel_request, &ctrl->ctrl);
|
||||
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
|
||||
nvme_loop_destroy_io_queues(ctrl);
|
||||
}
|
||||
|
||||
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
|
||||
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
|
||||
nvme_shutdown_ctrl(&ctrl->ctrl);
|
||||
|
||||
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
|
||||
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
|
||||
nvme_cancel_request, &ctrl->ctrl);
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
|
||||
nvme_loop_destroy_admin_queue(ctrl);
|
||||
}
|
||||
|
||||
|
|
|
@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
|
|||
|
||||
return 0;
|
||||
err:
|
||||
sgl_free(cmd->req.sg);
|
||||
if (cmd->req.sg_cnt)
|
||||
sgl_free(cmd->req.sg);
|
||||
return NVME_SC_INTERNAL;
|
||||
}
|
||||
|
||||
|
@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
|
|||
|
||||
if (queue->nvme_sq.sqhd_disabled) {
|
||||
kfree(cmd->iov);
|
||||
sgl_free(cmd->req.sg);
|
||||
if (cmd->req.sg_cnt)
|
||||
sgl_free(cmd->req.sg);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
|
|||
return -EAGAIN;
|
||||
|
||||
kfree(cmd->iov);
|
||||
sgl_free(cmd->req.sg);
|
||||
if (cmd->req.sg_cnt)
|
||||
sgl_free(cmd->req.sg);
|
||||
cmd->queue->snd_cmd = NULL;
|
||||
nvmet_tcp_put_cmd(cmd);
|
||||
return 1;
|
||||
|
@ -1306,7 +1309,9 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
|
|||
{
|
||||
nvmet_req_uninit(&cmd->req);
|
||||
nvmet_tcp_unmap_pdu_iovec(cmd);
|
||||
sgl_free(cmd->req.sg);
|
||||
kfree(cmd->iov);
|
||||
if (cmd->req.sg_cnt)
|
||||
sgl_free(cmd->req.sg);
|
||||
}
|
||||
|
||||
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
|
||||
|
@ -1410,6 +1415,7 @@ static void nvmet_tcp_state_change(struct sock *sk)
|
|||
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
|
||||
{
|
||||
struct socket *sock = queue->sock;
|
||||
struct inet_sock *inet = inet_sk(sock->sk);
|
||||
struct linger sol = { .l_onoff = 1, .l_linger = 0 };
|
||||
int ret;
|
||||
|
||||
|
@ -1433,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Set socket type of service */
|
||||
if (inet->rcv_tos > 0) {
|
||||
int tos = inet->rcv_tos;
|
||||
|
||||
ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
|
||||
(char *)&tos, sizeof(tos));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
write_lock_bh(&sock->sk->sk_callback_lock);
|
||||
sock->sk->sk_user_data = queue;
|
||||
queue->data_ready = sock->sk->sk_data_ready;
|
||||
|
|
|
@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvmet_trace_get_lba_status(struct trace_seq *p,
|
||||
u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
u64 slba = get_unaligned_le64(cdw10);
|
||||
u32 mndw = get_unaligned_le32(cdw10 + 8);
|
||||
u16 rl = get_unaligned_le16(cdw10 + 12);
|
||||
u8 atype = cdw10[15];
|
||||
|
||||
trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
|
||||
slba, mndw, rl, atype);
|
||||
trace_seq_putc(p, 0);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10)
|
||||
{
|
||||
const char *ret = trace_seq_buffer_ptr(p);
|
||||
|
@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p,
|
|||
return nvmet_trace_admin_identify(p, cdw10);
|
||||
case nvme_admin_get_features:
|
||||
return nvmet_trace_admin_get_features(p, cdw10);
|
||||
case nvme_admin_get_lba_status:
|
||||
return nvmet_trace_get_lba_status(p, cdw10);
|
||||
default:
|
||||
return nvmet_trace_common(p, cdw10);
|
||||
}
|
||||
|
|
|
@ -1089,6 +1089,18 @@ static void scsi_initialize_rq(struct request *rq)
|
|||
cmd->retries = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Only called when the request isn't completed by SCSI, and not freed by
|
||||
* SCSI
|
||||
*/
|
||||
static void scsi_cleanup_rq(struct request *rq)
|
||||
{
|
||||
if (rq->rq_flags & RQF_DONTPREP) {
|
||||
scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
|
||||
rq->rq_flags &= ~RQF_DONTPREP;
|
||||
}
|
||||
}
|
||||
|
||||
/* Add a command to the list used by the aacraid and dpt_i2o drivers */
|
||||
void scsi_add_cmd_to_list(struct scsi_cmnd *cmd)
|
||||
{
|
||||
|
@ -1821,6 +1833,7 @@ static const struct blk_mq_ops scsi_mq_ops = {
|
|||
.init_request = scsi_mq_init_request,
|
||||
.exit_request = scsi_mq_exit_request,
|
||||
.initialize_rq_fn = scsi_initialize_rq,
|
||||
.cleanup_rq = scsi_cleanup_rq,
|
||||
.busy = scsi_mq_lld_busy,
|
||||
.map_queues = scsi_map_queues,
|
||||
};
|
||||
|
|
|
@ -94,8 +94,7 @@ static int scsi_dev_type_resume(struct device *dev,
|
|||
if (!err && scsi_is_sdev_device(dev)) {
|
||||
struct scsi_device *sdev = to_scsi_device(dev);
|
||||
|
||||
if (sdev->request_queue->dev)
|
||||
blk_set_runtime_active(sdev->request_queue);
|
||||
blk_set_runtime_active(sdev->request_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1293,7 +1293,9 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
|
|||
case REQ_OP_WRITE:
|
||||
return sd_setup_read_write_cmnd(cmd);
|
||||
case REQ_OP_ZONE_RESET:
|
||||
return sd_zbc_setup_reset_cmnd(cmd);
|
||||
return sd_zbc_setup_reset_cmnd(cmd, false);
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
return sd_zbc_setup_reset_cmnd(cmd, true);
|
||||
default:
|
||||
WARN_ON_ONCE(1);
|
||||
return BLK_STS_NOTSUPP;
|
||||
|
@ -1959,6 +1961,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
|
|||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
if (!result) {
|
||||
good_bytes = blk_rq_bytes(req);
|
||||
scsi_set_resid(SCpnt, 0);
|
||||
|
|
|
@ -209,7 +209,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
|
|||
|
||||
extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
|
||||
extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
|
||||
extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
|
||||
extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all);
|
||||
extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
|
||||
struct scsi_sense_hdr *sshdr);
|
||||
extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
|
||||
|
@ -225,7 +225,8 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
|
|||
|
||||
static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
|
||||
|
||||
static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
|
||||
static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd,
|
||||
bool all)
|
||||
{
|
||||
return BLK_STS_TARGET;
|
||||
}
|
||||
|
|
|
@ -209,10 +209,11 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
|
|||
/**
|
||||
* sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command.
|
||||
* @cmd: the command to setup
|
||||
* @all: Reset all zones control.
|
||||
*
|
||||
* Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
|
||||
*/
|
||||
blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
|
||||
blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all)
|
||||
{
|
||||
struct request *rq = cmd->request;
|
||||
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
|
||||
|
@ -234,7 +235,10 @@ blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
|
|||
memset(cmd->cmnd, 0, cmd->cmd_len);
|
||||
cmd->cmnd[0] = ZBC_OUT;
|
||||
cmd->cmnd[1] = ZO_RESET_WRITE_POINTER;
|
||||
put_unaligned_be64(block, &cmd->cmnd[2]);
|
||||
if (all)
|
||||
cmd->cmnd[14] = 0x1;
|
||||
else
|
||||
put_unaligned_be64(block, &cmd->cmnd[2]);
|
||||
|
||||
rq->timeout = SD_TIMEOUT;
|
||||
cmd->sc_data_direction = DMA_NONE;
|
||||
|
@ -261,6 +265,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
|
|||
|
||||
switch (req_op(rq)) {
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
|
||||
if (result &&
|
||||
sshdr->sense_key == ILLEGAL_REQUEST &&
|
||||
|
@ -487,6 +492,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
|
|||
/* The drive satisfies the kernel restrictions: set it up */
|
||||
blk_queue_chunk_sectors(sdkp->disk->queue,
|
||||
logical_to_sectors(sdkp->device, zone_blocks));
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
|
||||
blk_queue_required_elevator_features(sdkp->disk->queue,
|
||||
ELEVATOR_F_ZBD_SEQ_WRITE);
|
||||
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
|
||||
|
||||
/* READ16/WRITE16 is mandatory for ZBC disks */
|
||||
|
|
|
@ -36,10 +36,6 @@
|
|||
*/
|
||||
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
|
||||
|
||||
struct wb_completion {
|
||||
atomic_t cnt;
|
||||
};
|
||||
|
||||
/*
|
||||
* Passed into wb_writeback(), essentially a subset of writeback_control
|
||||
*/
|
||||
|
@ -60,19 +56,6 @@ struct wb_writeback_work {
|
|||
struct wb_completion *done; /* set if the caller waits */
|
||||
};
|
||||
|
||||
/*
|
||||
* If one wants to wait for one or more wb_writeback_works, each work's
|
||||
* ->done should be set to a wb_completion defined using the following
|
||||
* macro. Once all work items are issued with wb_queue_work(), the caller
|
||||
* can wait for the completion of all using wb_wait_for_completion(). Work
|
||||
* items which are waited upon aren't freed automatically on completion.
|
||||
*/
|
||||
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
|
||||
struct wb_completion cmpl = { \
|
||||
.cnt = ATOMIC_INIT(1), \
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If an inode is constantly having its pages dirtied, but then the
|
||||
* updates stop dirtytime_expire_interval seconds in the past, it's
|
||||
|
@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb,
|
|||
if (work->auto_free)
|
||||
kfree(work);
|
||||
if (done && atomic_dec_and_test(&done->cnt))
|
||||
wake_up_all(&wb->bdi->wb_waitq);
|
||||
wake_up_all(done->waitq);
|
||||
}
|
||||
|
||||
static void wb_queue_work(struct bdi_writeback *wb,
|
||||
|
@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb,
|
|||
|
||||
/**
|
||||
* wb_wait_for_completion - wait for completion of bdi_writeback_works
|
||||
* @bdi: bdi work items were issued to
|
||||
* @done: target wb_completion
|
||||
*
|
||||
* Wait for one or more work items issued to @bdi with their ->done field
|
||||
* set to @done, which should have been defined with
|
||||
* DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
|
||||
* work items are completed. Work items which are waited upon aren't freed
|
||||
* set to @done, which should have been initialized with
|
||||
* DEFINE_WB_COMPLETION(). This function returns after all such work items
|
||||
* are completed. Work items which are waited upon aren't freed
|
||||
* automatically on completion.
|
||||
*/
|
||||
static void wb_wait_for_completion(struct backing_dev_info *bdi,
|
||||
struct wb_completion *done)
|
||||
void wb_wait_for_completion(struct wb_completion *done)
|
||||
{
|
||||
atomic_dec(&done->cnt); /* put down the initial count */
|
||||
wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
|
||||
wait_event(*done->waitq, !atomic_read(&done->cnt));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
|
||||
/* parameters for foreign inode detection, see wb_detach_inode() */
|
||||
/*
|
||||
* Parameters for foreign inode detection, see wbc_detach_inode() to see
|
||||
* how they're used.
|
||||
*
|
||||
* These paramters are inherently heuristical as the detection target
|
||||
* itself is fuzzy. All we want to do is detaching an inode from the
|
||||
* current owner if it's being written to by some other cgroups too much.
|
||||
*
|
||||
* The current cgroup writeback is built on the assumption that multiple
|
||||
* cgroups writing to the same inode concurrently is very rare and a mode
|
||||
* of operation which isn't well supported. As such, the goal is not
|
||||
* taking too long when a different cgroup takes over an inode while
|
||||
* avoiding too aggressive flip-flops from occasional foreign writes.
|
||||
*
|
||||
* We record, very roughly, 2s worth of IO time history and if more than
|
||||
* half of that is foreign, trigger the switch. The recording is quantized
|
||||
* to 16 slots. To avoid tiny writes from swinging the decision too much,
|
||||
* writes smaller than 1/8 of avg size are ignored.
|
||||
*/
|
||||
#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
|
||||
#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
|
||||
#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
|
||||
#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */
|
||||
#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
|
||||
|
||||
#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
|
||||
|
@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
|
|||
/* if foreign slots >= 8, switch */
|
||||
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
|
||||
/* one round can affect upto 5 slots */
|
||||
#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
|
||||
|
||||
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
|
||||
static struct workqueue_struct *isw_wq;
|
||||
|
@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
|
|||
if (unlikely(inode->i_state & I_FREEING))
|
||||
goto skip_switch;
|
||||
|
||||
trace_inode_switch_wbs(inode, old_wb, new_wb);
|
||||
|
||||
/*
|
||||
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
|
||||
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
|
||||
|
@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||
if (inode->i_state & I_WB_SWITCH)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Avoid starting new switches while sync_inodes_sb() is in
|
||||
* progress. Otherwise, if the down_write protected issue path
|
||||
* blocks heavily, we might end up starting a large number of
|
||||
* switches which will block on the rwsem.
|
||||
*/
|
||||
if (!down_read_trylock(&bdi->wb_switch_rwsem))
|
||||
/* avoid queueing a new switch if too many are already in flight */
|
||||
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
|
||||
return;
|
||||
|
||||
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
|
||||
if (!isw)
|
||||
goto out_unlock;
|
||||
return;
|
||||
|
||||
/* find and pin the new wb */
|
||||
rcu_read_lock();
|
||||
|
@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
|
||||
|
||||
atomic_inc(&isw_nr_in_flight);
|
||||
|
||||
goto out_unlock;
|
||||
return;
|
||||
|
||||
out_free:
|
||||
if (isw->new_wb)
|
||||
wb_put(isw->new_wb);
|
||||
kfree(isw);
|
||||
out_unlock:
|
||||
up_read(&bdi->wb_switch_rwsem);
|
||||
}

/**

@@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc)
        if (wbc->wb_id != max_id)
                history |= (1U << slots) - 1;

        if (history)
                trace_inode_foreign_history(inode, wbc, history);

        /*
         * Switch if the current wb isn't the consistent winner.
         * If there are multiple closely competing dirtiers, the

@@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;

@@ -890,7 +887,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                last_wb = wb;

                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
@@ -899,6 +896,89 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
                           enum wb_reason reason, struct wb_completion *done)
{
        struct backing_dev_info *bdi;
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;
        struct wb_writeback_work *work;
        int ret;

        /* lookup bdi and memcg */
        bdi = bdi_get_by_id(bdi_id);
        if (!bdi)
                return -ENOENT;

        rcu_read_lock();
        memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
        if (memcg_css && !css_tryget(memcg_css))
                memcg_css = NULL;
        rcu_read_unlock();
        if (!memcg_css) {
                ret = -ENOENT;
                goto out_bdi_put;
        }

        /*
         * And find the associated wb. If the wb isn't there already
         * there's nothing to flush, don't create one.
         */
        wb = wb_get_lookup(bdi, memcg_css);
        if (!wb) {
                ret = -ENOENT;
                goto out_css_put;
        }

        /*
         * If @nr is zero, the caller is attempting to write out most of
         * the currently dirty pages. Let's take the current dirty page
         * count and inflate it by 25% which should be large enough to
         * flush out most dirty pages while avoiding getting livelocked by
         * concurrent dirtiers.
         */
        if (!nr) {
                unsigned long filepages, headroom, dirty, writeback;

                mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
                                    &writeback);
                nr = dirty * 10 / 8;
        }

        /* issue the writeback work */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (work) {
                work->nr_pages = nr;
                work->sync_mode = WB_SYNC_NONE;
                work->range_cyclic = 1;
                work->reason = reason;
                work->done = done;
                work->auto_free = 1;
                wb_queue_work(wb, work);
                ret = 0;
        } else {
                ret = -ENOMEM;
        }

        wb_put(wb);
out_css_put:
        css_put(memcg_css);
out_bdi_put:
        bdi_put(bdi);
        return ret;
}
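A worked example of the nr == 0 branch above: with 4000 currently dirty pages, nr becomes 4000 * 10 / 8 = 5000, i.e. the dirty count inflated by 25%. The sketch below is a hypothetical caller (the IDs, function name and waitqueue are placeholders, not from this diff) showing how the new function combines with the wb_completion API exposed elsewhere in this series:

/* hypothetical caller: kick best-effort writeback for one bdi/memcg pair */
static DECLARE_WAIT_QUEUE_HEAD(example_frn_waitq);

static void example_flush_foreign(u64 bdi_id, int memcg_id)
{
        struct wb_completion done = __WB_COMPLETION_INIT(&example_frn_waitq);
        int ret;

        /* nr_pages == 0: flush roughly 125% of the currently dirty pages */
        ret = cgroup_writeback_by_id(bdi_id, memcg_id, 0,
                                     WB_REASON_FOREIGN_FLUSH, &done);
        if (ret)
                return;         /* -ENOENT or -ENOMEM: nothing was queued */

        wb_wait_for_completion(&done);  /* sleep until the queued work finishes */
}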

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *

@@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb)
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
{
        DEFINE_WB_COMPLETION_ONSTACK(done);
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb = sb,
                .sync_mode = WB_SYNC_NONE,

@@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                .nr_pages = nr,
                .reason = reason,
        };
        struct backing_dev_info *bdi = sb->s_bdi;

        if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
        wb_wait_for_completion(bdi, &done);
        wb_wait_for_completion(&done);
}

/**

@@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
 */
void sync_inodes_sb(struct super_block *sb)
{
        DEFINE_WB_COMPLETION_ONSTACK(done);
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb = sb,
                .sync_mode = WB_SYNC_ALL,

@@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb)
                .reason = WB_REASON_SYNC,
                .for_sync = 1,
        };
        struct backing_dev_info *bdi = sb->s_bdi;

        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty

@@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb)
        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(bdi, &done);
        wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);

        wait_sb_inodes(sb);

@@ -63,10 +63,31 @@ enum wb_reason {
         * so it has a mismatch name.
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,

        WB_REASON_MAX,
};

struct wb_completion {
        atomic_t cnt;
        wait_queue_head_t *waitq;
};

#define __WB_COMPLETION_INIT(_waitq) \
        (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro. Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion(). Work
 * items which are waited upon aren't freed automatically on completion.
 */
#define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq)

#define DEFINE_WB_COMPLETION(cmpl, bdi) \
        struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
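A condensed usage sketch of the pattern the comment above describes (the helper, the two wb's and the two caller-owned works are placeholders; only the macro and the wait call are the ones defined here):

/* hypothetical helper: issue two caller-owned works and wait for both */
static void example_flush_two(struct backing_dev_info *bdi,
                              struct bdi_writeback *wb1, struct wb_writeback_work *w1,
                              struct bdi_writeback *wb2, struct wb_writeback_work *w2)
{
        DEFINE_WB_COMPLETION(done, bdi);        /* cnt = 1, waitq = &bdi->wb_waitq */

        w1->done = &done;                       /* each queued work takes a count on done ... */
        w2->done = &done;

        wb_queue_work(wb1, w1);
        wb_queue_work(wb2, w2);

        wb_wait_for_completion(&done);          /* ... and the wait drops the initial count */
}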

/*
 * For cgroup writeback, multiple wb's may map to the same blkcg. Those
 * wb's can operate mostly independently but should share the congested
|
||||
|
@ -165,6 +186,8 @@ struct bdi_writeback {
|
|||
};
|
||||
|
||||
struct backing_dev_info {
|
||||
u64 id;
|
||||
struct rb_node rb_node; /* keyed by ->id */
|
||||
struct list_head bdi_list;
|
||||
unsigned long ra_pages; /* max readahead in PAGE_SIZE units */
|
||||
unsigned long io_pages; /* max allowed IO size */
|
||||
|
|
|
@ -24,6 +24,7 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
|
|||
return bdi;
|
||||
}
|
||||
|
||||
struct backing_dev_info *bdi_get_by_id(u64 id);
|
||||
void bdi_put(struct backing_dev_info *bdi);
|
||||
|
||||
__printf(2, 3)
|
||||
|
@ -44,6 +45,8 @@ void wb_start_background_writeback(struct bdi_writeback *wb);
|
|||
void wb_workfn(struct work_struct *work);
|
||||
void wb_wakeup_delayed(struct bdi_writeback *wb);
|
||||
|
||||
void wb_wait_for_completion(struct wb_completion *done);
|
||||
|
||||
extern spinlock_t bdi_lock;
|
||||
extern struct list_head bdi_list;
|
||||
|
||||
|
@ -227,6 +230,8 @@ static inline int bdi_sched_wait(void *word)
|
|||
struct bdi_writeback_congested *
|
||||
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
|
||||
void wb_congested_put(struct bdi_writeback_congested *congested);
|
||||
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
|
||||
struct cgroup_subsys_state *memcg_css);
|
||||
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
|
||||
struct cgroup_subsys_state *memcg_css,
|
||||
gfp_t gfp);
|
||||
|
|
|
@ -149,7 +149,8 @@ typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
|
|||
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
|
||||
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
|
||||
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
|
||||
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
|
||||
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
|
||||
struct request_queue *q, struct blkcg *blkcg);
|
||||
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
|
||||
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
|
||||
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
|
||||
|
@ -233,6 +234,7 @@ struct blkg_conf_ctx {
|
|||
char *body;
|
||||
};
|
||||
|
||||
struct gendisk *blkcg_conf_get_disk(char **inputp);
|
||||
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
||||
char *input, struct blkg_conf_ctx *ctx);
|
||||
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
|
||||
|
@ -375,7 +377,7 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
|
|||
* @q: request_queue of interest
|
||||
*
|
||||
* Lookup blkg for the @blkcg - @q pair. This function should be called
|
||||
* under RCU read loc.
|
||||
* under RCU read lock.
|
||||
*/
|
||||
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
|
||||
struct request_queue *q)
|
||||
|
|
|
@ -140,6 +140,7 @@ typedef int (poll_fn)(struct blk_mq_hw_ctx *);
|
|||
typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
|
||||
typedef bool (busy_fn)(struct request_queue *);
|
||||
typedef void (complete_fn)(struct request *);
|
||||
typedef void (cleanup_rq_fn)(struct request *);
|
||||
|
||||
|
||||
struct blk_mq_ops {
|
||||
|
@ -200,6 +201,12 @@ struct blk_mq_ops {
|
|||
/* Called from inside blk_get_request() */
|
||||
void (*initialize_rq_fn)(struct request *rq);
|
||||
|
||||
/*
|
||||
* Called before freeing one request which isn't completed yet,
|
||||
* and usually for freeing the driver private data
|
||||
*/
|
||||
cleanup_rq_fn *cleanup_rq;
|
||||
|
||||
/*
|
||||
* If set, returns whether or not this queue currently is busy
|
||||
*/
|
||||
|
@ -241,12 +248,12 @@ enum {
|
|||
|
||||
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
|
||||
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q);
|
||||
struct request_queue *q,
|
||||
bool elevator_init);
|
||||
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
|
||||
const struct blk_mq_ops *ops,
|
||||
unsigned int queue_depth,
|
||||
unsigned int set_flags);
|
||||
int blk_mq_register_dev(struct device *, struct request_queue *);
|
||||
void blk_mq_unregister_dev(struct device *, struct request_queue *);
|
||||
|
||||
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
|
||||
|
@ -296,6 +303,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
|
|||
|
||||
|
||||
int blk_mq_request_started(struct request *rq);
|
||||
int blk_mq_request_completed(struct request *rq);
|
||||
void blk_mq_start_request(struct request *rq);
|
||||
void blk_mq_end_request(struct request *rq, blk_status_t error);
|
||||
void __blk_mq_end_request(struct request *rq, blk_status_t error);
|
||||
|
@ -304,7 +312,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
|
|||
void blk_mq_kick_requeue_list(struct request_queue *q);
|
||||
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
|
||||
bool blk_mq_complete_request(struct request *rq);
|
||||
void blk_mq_complete_request_sync(struct request *rq);
|
||||
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
|
||||
struct bio *bio, unsigned int nr_segs);
|
||||
bool blk_mq_queue_stopped(struct request_queue *q);
|
||||
|
@ -321,6 +328,7 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
|
|||
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
|
||||
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
|
||||
busy_tag_iter_fn *fn, void *priv);
|
||||
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
|
||||
void blk_mq_freeze_queue(struct request_queue *q);
|
||||
void blk_mq_unfreeze_queue(struct request_queue *q);
|
||||
void blk_freeze_queue_start(struct request_queue *q);
|
||||
|
@ -366,4 +374,10 @@ static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
|
|||
BLK_QC_T_INTERNAL;
|
||||
}
|
||||
|
||||
static inline void blk_mq_cleanup_rq(struct request *rq)
|
||||
{
|
||||
if (rq->q->mq_ops->cleanup_rq)
|
||||
rq->q->mq_ops->cleanup_rq(rq);
|
||||
}
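The helper above only dispatches the hook; what a driver hangs off it is up to the driver. A hedged sketch of one possible user (the "exdrv" name and its per-request payload are invented; only the ->cleanup_rq wiring mirrors the interface added in this series):

struct exdrv_rq_data {
        void *payload;          /* hypothetical per-request driver data */
};

static void exdrv_cleanup_rq(struct request *rq)
{
        struct exdrv_rq_data *data = blk_mq_rq_to_pdu(rq);

        /* rq is being freed without ever completing; drop its private data */
        kfree(data->payload);
        data->payload = NULL;
}

static const struct blk_mq_ops exdrv_mq_ops = {
        /* .queue_rq and the other mandatory callbacks omitted */
        .cleanup_rq     = exdrv_cleanup_rq,
};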
|
||||
|
||||
#endif
|
||||
|
|
|
@ -169,6 +169,9 @@ struct bio {
|
|||
*/
|
||||
struct blkcg_gq *bi_blkg;
|
||||
struct bio_issue bi_issue;
|
||||
#ifdef CONFIG_BLK_CGROUP_IOCOST
|
||||
u64 bi_iocost_cost;
|
||||
#endif
|
||||
#endif
|
||||
union {
|
||||
#if defined(CONFIG_BLK_DEV_INTEGRITY)
|
||||
|
@ -209,6 +212,7 @@ enum {
|
|||
BIO_BOUNCED, /* bio is a bounce bio */
|
||||
BIO_USER_MAPPED, /* contains user pages */
|
||||
BIO_NULL_MAPPED, /* contains invalid user pages */
|
||||
BIO_WORKINGSET, /* contains userspace workingset pages */
|
||||
BIO_QUIET, /* Make BIO Quiet */
|
||||
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
|
||||
BIO_REFFED, /* bio has elevated ->bi_cnt */
|
||||
|
@ -282,6 +286,8 @@ enum req_opf {
|
|||
REQ_OP_ZONE_RESET = 6,
|
||||
/* write the same sector many times */
|
||||
REQ_OP_WRITE_SAME = 7,
|
||||
/* reset all the zone present on the device */
|
||||
REQ_OP_ZONE_RESET_ALL = 8,
|
||||
/* write the zero filled sector many times */
|
||||
REQ_OP_WRITE_ZEROES = 9,
|
||||
|
||||
|
|
|
@ -194,7 +194,11 @@ struct request {
|
|||
|
||||
struct gendisk *rq_disk;
|
||||
struct hd_struct *part;
|
||||
/* Time that I/O was submitted to the kernel. */
|
||||
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
|
||||
/* Time that the first bio started allocating this request. */
|
||||
u64 alloc_time_ns;
|
||||
#endif
|
||||
/* Time that this request was allocated for this IO. */
|
||||
u64 start_time_ns;
|
||||
/* Time that I/O was submitted to the device. */
|
||||
u64 io_start_time_ns;
|
||||
|
@ -202,9 +206,12 @@ struct request {
|
|||
#ifdef CONFIG_BLK_WBT
|
||||
unsigned short wbt_flags;
|
||||
#endif
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
unsigned short throtl_size;
|
||||
#endif
|
||||
/*
|
||||
* rq sectors used for blk stats. It has the same value
|
||||
* with blk_rq_sectors(rq), except that it never be zeroed
|
||||
* by completion.
|
||||
*/
|
||||
unsigned short stats_sectors;
|
||||
|
||||
/*
|
||||
* Number of scatter-gather DMA addr+len pairs after
|
||||
|
@ -391,10 +398,6 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
|
|||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
struct request_queue {
|
||||
/*
|
||||
* Together with queue_head for cacheline sharing
|
||||
*/
|
||||
struct list_head queue_head;
|
||||
struct request *last_merge;
|
||||
struct elevator_queue *elevator;
|
||||
|
||||
|
@ -496,6 +499,8 @@ struct request_queue {
|
|||
|
||||
struct queue_limits limits;
|
||||
|
||||
unsigned int required_elevator_features;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
/*
|
||||
* Zoned block device information for request dispatch control.
|
||||
|
@ -539,6 +544,7 @@ struct request_queue {
|
|||
struct delayed_work requeue_work;
|
||||
|
||||
struct mutex sysfs_lock;
|
||||
struct mutex sysfs_dir_lock;
|
||||
|
||||
/*
|
||||
* for reusing dead hctx instance in case of updating
|
||||
|
@ -611,6 +617,8 @@ struct request_queue {
|
|||
#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
|
||||
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
|
||||
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
|
||||
#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
|
||||
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
|
||||
|
||||
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
|
||||
(1 << QUEUE_FLAG_SAME_COMP))
|
||||
|
@ -630,6 +638,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
|
|||
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
|
||||
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
|
||||
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
|
||||
#define blk_queue_zone_resetall(q) \
|
||||
test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
|
||||
#define blk_queue_secure_erase(q) \
|
||||
(test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
|
||||
#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
|
||||
|
@ -637,6 +647,12 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
|
|||
test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
|
||||
#define blk_queue_pci_p2pdma(q) \
|
||||
test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
|
||||
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
|
||||
#define blk_queue_rq_alloc_time(q) \
|
||||
test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
|
||||
#else
|
||||
#define blk_queue_rq_alloc_time(q) false
|
||||
#endif
|
||||
|
||||
#define blk_noretry_request(rq) \
|
||||
((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
|
||||
|
@ -644,6 +660,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
|
|||
#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
|
||||
#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only)
|
||||
#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
|
||||
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
|
||||
|
||||
extern void blk_set_pm_only(struct request_queue *q);
|
||||
extern void blk_clear_pm_only(struct request_queue *q);
|
||||
|
@ -903,6 +920,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
|
|||
* blk_rq_err_bytes() : bytes left till the next error boundary
|
||||
* blk_rq_sectors() : sectors left in the entire request
|
||||
* blk_rq_cur_sectors() : sectors left in the current segment
|
||||
* blk_rq_stats_sectors() : sectors of the entire request used for stats
|
||||
*/
|
||||
static inline sector_t blk_rq_pos(const struct request *rq)
|
||||
{
|
||||
|
@ -931,6 +949,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
|
|||
return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
|
||||
}
|
||||
|
||||
static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
|
||||
{
|
||||
return rq->stats_sectors;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
static inline unsigned int blk_rq_zone_no(struct request *rq)
|
||||
{
|
||||
|
@ -1085,6 +1108,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
|
|||
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
|
||||
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
|
||||
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
|
||||
extern void blk_queue_required_elevator_features(struct request_queue *q,
|
||||
unsigned int features);
|
||||
|
||||
/*
|
||||
* Number of physical segments as sent to the device.
|
||||
|
@ -1232,42 +1257,42 @@ enum blk_default_limits {
|
|||
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
|
||||
};
|
||||
|
||||
static inline unsigned long queue_segment_boundary(struct request_queue *q)
|
||||
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.seg_boundary_mask;
|
||||
}
|
||||
|
||||
static inline unsigned long queue_virt_boundary(struct request_queue *q)
|
||||
static inline unsigned long queue_virt_boundary(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.virt_boundary_mask;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_max_sectors(struct request_queue *q)
|
||||
static inline unsigned int queue_max_sectors(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_sectors;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
|
||||
static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_hw_sectors;
|
||||
}
|
||||
|
||||
static inline unsigned short queue_max_segments(struct request_queue *q)
|
||||
static inline unsigned short queue_max_segments(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_segments;
|
||||
}
|
||||
|
||||
static inline unsigned short queue_max_discard_segments(struct request_queue *q)
|
||||
static inline unsigned short queue_max_discard_segments(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_discard_segments;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_max_segment_size(struct request_queue *q)
|
||||
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_segment_size;
|
||||
}
|
||||
|
||||
static inline unsigned short queue_logical_block_size(struct request_queue *q)
|
||||
static inline unsigned short queue_logical_block_size(const struct request_queue *q)
|
||||
{
|
||||
int retval = 512;
|
||||
|
||||
|
@ -1282,7 +1307,7 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
|
|||
return queue_logical_block_size(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline unsigned int queue_physical_block_size(struct request_queue *q)
|
||||
static inline unsigned int queue_physical_block_size(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.physical_block_size;
|
||||
}
|
||||
|
@ -1292,7 +1317,7 @@ static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
|
|||
return queue_physical_block_size(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline unsigned int queue_io_min(struct request_queue *q)
|
||||
static inline unsigned int queue_io_min(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.io_min;
|
||||
}
|
||||
|
@ -1302,7 +1327,7 @@ static inline int bdev_io_min(struct block_device *bdev)
|
|||
return queue_io_min(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline unsigned int queue_io_opt(struct request_queue *q)
|
||||
static inline unsigned int queue_io_opt(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.io_opt;
|
||||
}
|
||||
|
@ -1312,7 +1337,7 @@ static inline int bdev_io_opt(struct block_device *bdev)
|
|||
return queue_io_opt(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline int queue_alignment_offset(struct request_queue *q)
|
||||
static inline int queue_alignment_offset(const struct request_queue *q)
|
||||
{
|
||||
if (q->limits.misaligned)
|
||||
return -1;
|
||||
|
@ -1342,7 +1367,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
|
|||
return q->limits.alignment_offset;
|
||||
}
|
||||
|
||||
static inline int queue_discard_alignment(struct request_queue *q)
|
||||
static inline int queue_discard_alignment(const struct request_queue *q)
|
||||
{
|
||||
if (q->limits.discard_misaligned)
|
||||
return -1;
|
||||
|
@ -1432,7 +1457,7 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline int queue_dma_alignment(struct request_queue *q)
|
||||
static inline int queue_dma_alignment(const struct request_queue *q)
|
||||
{
|
||||
return q ? q->dma_alignment : 511;
|
||||
}
|
||||
|
@ -1543,7 +1568,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q,
|
|||
}
|
||||
|
||||
static inline unsigned short
|
||||
queue_max_integrity_segments(struct request_queue *q)
|
||||
queue_max_integrity_segments(const struct request_queue *q)
|
||||
{
|
||||
return q->limits.max_integrity_segments;
|
||||
}
|
||||
|
@ -1626,7 +1651,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q,
|
|||
unsigned int segs)
|
||||
{
|
||||
}
|
||||
static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
|
||||
static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -76,6 +76,7 @@ struct elevator_type
|
|||
struct elv_fs_entry *elevator_attrs;
|
||||
const char *elevator_name;
|
||||
const char *elevator_alias;
|
||||
const unsigned int elevator_features;
|
||||
struct module *elevator_owner;
|
||||
#ifdef CONFIG_BLK_DEBUG_FS
|
||||
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
|
||||
|
@ -165,5 +166,12 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
|
|||
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
|
||||
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
|
||||
|
||||
/*
|
||||
* Elevator features.
|
||||
*/
|
||||
|
||||
/* Supports zoned block devices sequential write constraint */
|
||||
#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
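The flag above is consumed from both sides: a driver states that its queue needs it via blk_queue_required_elevator_features() (declared in the blkdev.h hunk earlier in this diff), and a scheduler advertises it through the new elevator_features field of struct elevator_type. A hedged sketch with made-up names:

/* hypothetical zoned driver: only feature-matching elevators may be picked */
static void exzoned_init_queue(struct request_queue *q)
{
        blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
}

/* hypothetical zone-aware scheduler advertising the capability */
static struct elevator_type exsched_elevator = {
        /* .ops, .elevator_attrs and the rest omitted */
        .elevator_name          = "exsched",
        .elevator_features      = ELEVATOR_F_ZBD_SEQ_WRITE,
        .elevator_owner         = THIS_MODULE,
};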
|
||||
|
||||
#endif /* CONFIG_BLOCK */
|
||||
#endif
|
||||
|
|
|
@ -88,8 +88,7 @@ typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
|
|||
typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
|
||||
typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
|
||||
struct nvm_chk_meta *);
|
||||
typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
|
||||
typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
|
||||
typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *);
|
||||
typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
|
||||
typedef void (nvm_destroy_dma_pool_fn)(void *);
|
||||
typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
|
||||
|
@ -104,7 +103,6 @@ struct nvm_dev_ops {
|
|||
nvm_get_chk_meta_fn *get_chk_meta;
|
||||
|
||||
nvm_submit_io_fn *submit_io;
|
||||
nvm_submit_io_sync_fn *submit_io_sync;
|
||||
|
||||
nvm_create_dma_pool_fn *create_dma_pool;
|
||||
nvm_destroy_dma_pool_fn *destroy_dma_pool;
|
||||
|
@ -682,8 +680,8 @@ extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr,
|
|||
int, struct nvm_chk_meta *);
|
||||
extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *,
|
||||
int, int);
|
||||
extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
|
||||
extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
|
||||
extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *);
|
||||
extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *);
|
||||
extern void nvm_end_io(struct nvm_rq *);
|
||||
|
||||
#else /* CONFIG_NVM */
|
||||
|
|
|
@ -183,6 +183,23 @@ struct memcg_padding {
|
|||
#define MEMCG_PADDING(name)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Remember four most recent foreign writebacks with dirty pages in this
|
||||
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
|
||||
* one in a given round, we're likely to catch it later if it keeps
|
||||
* foreign-dirtying, so a fairly low count should be enough.
|
||||
*
|
||||
* See mem_cgroup_track_foreign_dirty_slowpath() for details.
|
||||
*/
|
||||
#define MEMCG_CGWB_FRN_CNT 4
|
||||
|
||||
struct memcg_cgwb_frn {
|
||||
u64 bdi_id; /* bdi->id of the foreign inode */
|
||||
int memcg_id; /* memcg->css.id of foreign inode */
|
||||
u64 at; /* jiffies_64 at the time of dirtying */
|
||||
struct wb_completion done; /* tracks in-flight foreign writebacks */
|
||||
};
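The array above bounds the bookkeeping to four foreign (bdi, memcg) pairs. One plausible slot-selection scheme, written out purely for illustration (the real policy lives in mem_cgroup_track_foreign_dirty_slowpath() and is not reproduced here; a real implementation would also use time_before64() to stay wrap-safe):

/* hypothetical helper: reuse a matching slot or evict the oldest one */
static struct memcg_cgwb_frn *
exmem_pick_frn_slot(struct memcg_cgwb_frn frn[MEMCG_CGWB_FRN_CNT],
                    u64 bdi_id, int memcg_id)
{
        struct memcg_cgwb_frn *oldest = &frn[0];
        int i;

        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
                if (frn[i].bdi_id == bdi_id && frn[i].memcg_id == memcg_id)
                        return &frn[i];         /* refresh an existing entry */
                if (frn[i].at < oldest->at)
                        oldest = &frn[i];       /* remember the stalest slot */
        }
        return oldest;                          /* evict the oldest entry */
}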
|
||||
|
||||
/*
|
||||
* The memory controller data structure. The memory controller controls both
|
||||
* page cache and RSS per cgroup. We would eventually like to provide
|
||||
|
@ -307,6 +324,7 @@ struct mem_cgroup {
|
|||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct list_head cgwb_list;
|
||||
struct wb_domain cgwb_domain;
|
||||
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
|
||||
#endif
|
||||
|
||||
/* List of events which userspace want to receive */
|
||||
|
@ -1237,6 +1255,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
|||
unsigned long *pheadroom, unsigned long *pdirty,
|
||||
unsigned long *pwriteback);
|
||||
|
||||
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
|
||||
struct bdi_writeback *wb);
|
||||
|
||||
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
|
||||
struct bdi_writeback *wb)
|
||||
{
|
||||
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
|
||||
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
|
||||
}
|
||||
|
||||
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
|
||||
|
||||
#else /* CONFIG_CGROUP_WRITEBACK */
|
||||
|
||||
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
|
||||
|
@ -1252,6 +1282,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
|
|||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
|
||||
struct bdi_writeback *wb)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_CGROUP_WRITEBACK */
|
||||
|
||||
struct sock;
|
||||
|
|
|
@ -140,6 +140,7 @@ enum {
|
|||
* Submission and Completion Queue Entry Sizes for the NVM command set.
|
||||
* (In bytes and specified as a power of two (2^n)).
|
||||
*/
|
||||
#define NVME_ADM_SQES 6
|
||||
#define NVME_NVM_IOSQES 6
|
||||
#define NVME_NVM_IOCQES 4
|
||||
|
||||
|
@ -814,6 +815,7 @@ enum nvme_admin_opcode {
|
|||
nvme_admin_security_send = 0x81,
|
||||
nvme_admin_security_recv = 0x82,
|
||||
nvme_admin_sanitize_nvm = 0x84,
|
||||
nvme_admin_get_lba_status = 0x86,
|
||||
};
|
||||
|
||||
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
|
||||
|
@ -840,7 +842,8 @@ enum nvme_admin_opcode {
|
|||
nvme_admin_opcode_name(nvme_admin_format_nvm), \
|
||||
nvme_admin_opcode_name(nvme_admin_security_send), \
|
||||
nvme_admin_opcode_name(nvme_admin_security_recv), \
|
||||
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
|
||||
nvme_admin_opcode_name(nvme_admin_sanitize_nvm), \
|
||||
nvme_admin_opcode_name(nvme_admin_get_lba_status))
|
||||
|
||||
enum {
|
||||
NVME_QUEUE_PHYS_CONTIG = (1 << 0),
|
||||
|
|
|
@ -217,6 +217,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
|
|||
void wbc_detach_inode(struct writeback_control *wbc);
|
||||
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
|
||||
size_t bytes);
|
||||
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
|
||||
enum wb_reason reason, struct wb_completion *done);
|
||||
void cgroup_writeback_umount(void);
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,178 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM iocost
|
||||
|
||||
struct ioc;
|
||||
struct ioc_now;
|
||||
struct ioc_gq;
|
||||
|
||||
#if !defined(_TRACE_BLK_IOCOST_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _TRACE_BLK_IOCOST_H
|
||||
|
||||
#include <linux/tracepoint.h>
|
||||
|
||||
TRACE_EVENT(iocost_iocg_activate,
|
||||
|
||||
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
|
||||
u64 last_period, u64 cur_period, u64 vtime),
|
||||
|
||||
TP_ARGS(iocg, path, now, last_period, cur_period, vtime),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__string(devname, ioc_name(iocg->ioc))
|
||||
__string(cgroup, path)
|
||||
__field(u64, now)
|
||||
__field(u64, vnow)
|
||||
__field(u64, vrate)
|
||||
__field(u64, last_period)
|
||||
__field(u64, cur_period)
|
||||
__field(u64, last_vtime)
|
||||
__field(u64, vtime)
|
||||
__field(u32, weight)
|
||||
__field(u32, inuse)
|
||||
__field(u64, hweight_active)
|
||||
__field(u64, hweight_inuse)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(devname, ioc_name(iocg->ioc));
|
||||
__assign_str(cgroup, path);
|
||||
__entry->now = now->now;
|
||||
__entry->vnow = now->vnow;
|
||||
__entry->vrate = now->vrate;
|
||||
__entry->last_period = last_period;
|
||||
__entry->cur_period = cur_period;
|
||||
__entry->last_vtime = iocg->last_vtime;
|
||||
__entry->vtime = vtime;
|
||||
__entry->weight = iocg->weight;
|
||||
__entry->inuse = iocg->inuse;
|
||||
__entry->hweight_active = iocg->hweight_active;
|
||||
__entry->hweight_inuse = iocg->hweight_inuse;
|
||||
),
|
||||
|
||||
TP_printk("[%s:%s] now=%llu:%llu vrate=%llu "
|
||||
"period=%llu->%llu vtime=%llu->%llu "
|
||||
"weight=%u/%u hweight=%llu/%llu",
|
||||
__get_str(devname), __get_str(cgroup),
|
||||
__entry->now, __entry->vnow, __entry->vrate,
|
||||
__entry->last_period, __entry->cur_period,
|
||||
__entry->last_vtime, __entry->vtime,
|
||||
__entry->inuse, __entry->weight,
|
||||
__entry->hweight_inuse, __entry->hweight_active
|
||||
)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(iocg_inuse_update,
|
||||
|
||||
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
|
||||
u32 old_inuse, u32 new_inuse,
|
||||
u64 old_hw_inuse, u64 new_hw_inuse),
|
||||
|
||||
TP_ARGS(iocg, path, now, old_inuse, new_inuse,
|
||||
old_hw_inuse, new_hw_inuse),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__string(devname, ioc_name(iocg->ioc))
|
||||
__string(cgroup, path)
|
||||
__field(u64, now)
|
||||
__field(u32, old_inuse)
|
||||
__field(u32, new_inuse)
|
||||
__field(u64, old_hweight_inuse)
|
||||
__field(u64, new_hweight_inuse)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(devname, ioc_name(iocg->ioc));
|
||||
__assign_str(cgroup, path);
|
||||
__entry->now = now->now;
|
||||
__entry->old_inuse = old_inuse;
|
||||
__entry->new_inuse = new_inuse;
|
||||
__entry->old_hweight_inuse = old_hw_inuse;
|
||||
__entry->new_hweight_inuse = new_hw_inuse;
|
||||
),
|
||||
|
||||
TP_printk("[%s:%s] now=%llu inuse=%u->%u hw_inuse=%llu->%llu",
|
||||
__get_str(devname), __get_str(cgroup), __entry->now,
|
||||
__entry->old_inuse, __entry->new_inuse,
|
||||
__entry->old_hweight_inuse, __entry->new_hweight_inuse
|
||||
)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
|
||||
|
||||
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
|
||||
u32 old_inuse, u32 new_inuse,
|
||||
u64 old_hw_inuse, u64 new_hw_inuse),
|
||||
|
||||
TP_ARGS(iocg, path, now, old_inuse, new_inuse,
|
||||
old_hw_inuse, new_hw_inuse)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
|
||||
|
||||
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
|
||||
u32 old_inuse, u32 new_inuse,
|
||||
u64 old_hw_inuse, u64 new_hw_inuse),
|
||||
|
||||
TP_ARGS(iocg, path, now, old_inuse, new_inuse,
|
||||
old_hw_inuse, new_hw_inuse)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
|
||||
|
||||
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
|
||||
u32 old_inuse, u32 new_inuse,
|
||||
u64 old_hw_inuse, u64 new_hw_inuse),
|
||||
|
||||
TP_ARGS(iocg, path, now, old_inuse, new_inuse,
|
||||
old_hw_inuse, new_hw_inuse)
|
||||
);
|
||||
|
||||
TRACE_EVENT(iocost_ioc_vrate_adj,
|
||||
|
||||
TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 (*missed_ppm)[2],
|
||||
u32 rq_wait_pct, int nr_lagging, int nr_shortages,
|
||||
int nr_surpluses),
|
||||
|
||||
TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages,
|
||||
nr_surpluses),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__string(devname, ioc_name(ioc))
|
||||
__field(u64, old_vrate)
|
||||
__field(u64, new_vrate)
|
||||
__field(int, busy_level)
|
||||
__field(u32, read_missed_ppm)
|
||||
__field(u32, write_missed_ppm)
|
||||
__field(u32, rq_wait_pct)
|
||||
__field(int, nr_lagging)
|
||||
__field(int, nr_shortages)
|
||||
__field(int, nr_surpluses)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(devname, ioc_name(ioc));
|
||||
__entry->old_vrate = atomic64_read(&ioc->vtime_rate);
|
||||
__entry->new_vrate = new_vrate;
|
||||
__entry->busy_level = ioc->busy_level;
|
||||
__entry->read_missed_ppm = (*missed_ppm)[READ];
|
||||
__entry->write_missed_ppm = (*missed_ppm)[WRITE];
|
||||
__entry->rq_wait_pct = rq_wait_pct;
|
||||
__entry->nr_lagging = nr_lagging;
|
||||
__entry->nr_shortages = nr_shortages;
|
||||
__entry->nr_surpluses = nr_surpluses;
|
||||
),
|
||||
|
||||
TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d",
|
||||
__get_str(devname), __entry->old_vrate, __entry->new_vrate,
|
||||
__entry->busy_level,
|
||||
__entry->read_missed_ppm, __entry->write_missed_ppm,
|
||||
__entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages,
|
||||
__entry->nr_surpluses
|
||||
)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_BLK_IOCOST_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
|
@ -176,6 +176,132 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
|
|||
#endif /* CONFIG_CGROUP_WRITEBACK */
|
||||
#endif /* CREATE_TRACE_POINTS */
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
TRACE_EVENT(inode_foreign_history,
|
||||
|
||||
TP_PROTO(struct inode *inode, struct writeback_control *wbc,
|
||||
unsigned int history),
|
||||
|
||||
TP_ARGS(inode, wbc, history),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, name, 32)
|
||||
__field(unsigned long, ino)
|
||||
__field(unsigned int, cgroup_ino)
|
||||
__field(unsigned int, history)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
strncpy(__entry->name, dev_name(inode_to_bdi(inode)->dev), 32);
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
|
||||
__entry->history = history;
|
||||
),
|
||||
|
||||
TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x",
|
||||
__entry->name,
|
||||
__entry->ino,
|
||||
__entry->cgroup_ino,
|
||||
__entry->history
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(inode_switch_wbs,
|
||||
|
||||
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
|
||||
struct bdi_writeback *new_wb),
|
||||
|
||||
TP_ARGS(inode, old_wb, new_wb),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, name, 32)
|
||||
__field(unsigned long, ino)
|
||||
__field(unsigned int, old_cgroup_ino)
|
||||
__field(unsigned int, new_cgroup_ino)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
strncpy(__entry->name, dev_name(old_wb->bdi->dev), 32);
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
|
||||
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
|
||||
),
|
||||
|
||||
TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u",
|
||||
__entry->name,
|
||||
__entry->ino,
|
||||
__entry->old_cgroup_ino,
|
||||
__entry->new_cgroup_ino
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(track_foreign_dirty,
|
||||
|
||||
TP_PROTO(struct page *page, struct bdi_writeback *wb),
|
||||
|
||||
TP_ARGS(page, wb),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, name, 32)
|
||||
__field(u64, bdi_id)
|
||||
__field(unsigned long, ino)
|
||||
__field(unsigned int, memcg_id)
|
||||
__field(unsigned int, cgroup_ino)
|
||||
__field(unsigned int, page_cgroup_ino)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
struct inode *inode = mapping ? mapping->host : NULL;
|
||||
|
||||
strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
|
||||
__entry->bdi_id = wb->bdi->id;
|
||||
__entry->ino = inode ? inode->i_ino : 0;
|
||||
__entry->memcg_id = wb->memcg_css->id;
|
||||
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
|
||||
__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
|
||||
),
|
||||
|
||||
TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u",
|
||||
__entry->name,
|
||||
__entry->bdi_id,
|
||||
__entry->ino,
|
||||
__entry->memcg_id,
|
||||
__entry->cgroup_ino,
|
||||
__entry->page_cgroup_ino
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(flush_foreign,
|
||||
|
||||
TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
|
||||
unsigned int frn_memcg_id),
|
||||
|
||||
TP_ARGS(wb, frn_bdi_id, frn_memcg_id),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, name, 32)
|
||||
__field(unsigned int, cgroup_ino)
|
||||
__field(unsigned int, frn_bdi_id)
|
||||
__field(unsigned int, frn_memcg_id)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
|
||||
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
|
||||
__entry->frn_bdi_id = frn_bdi_id;
|
||||
__entry->frn_memcg_id = frn_memcg_id;
|
||||
),
|
||||
|
||||
TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u",
|
||||
__entry->name,
|
||||
__entry->cgroup_ino,
|
||||
__entry->frn_bdi_id,
|
||||
__entry->frn_memcg_id
|
||||
)
|
||||
);
|
||||
#endif
|
||||
|
||||
DECLARE_EVENT_CLASS(writeback_write_inode_template,
|
||||
|
||||
TP_PROTO(struct inode *inode, struct writeback_control *wbc),
|
||||
|
|