From 33cb0917bbe241dd17a2b87ead63514c1b7e5615 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Apr 2022 21:07:09 +0200 Subject: [PATCH 01/72] drbd: fix duplicate array initializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two initializers for P_RETRY_WRITE: drivers/block/drbd/drbd_main.c:3676:22: warning: initialized field overwritten [-Woverride-init] Remove the first one since it was already ignored by the compiler and reorder the list to match the enum definition. As P_ZEROES had no entry, add that one instead. Fixes: 036b17eaab93 ("drbd: Receiving part for the PROTOCOL_UPDATE packet") Fixes: f31e583aa2c2 ("drbd: introduce P_ZEROES (REQ_OP_WRITE_ZEROES on the "wire")") Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-2-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 912560f611c3..2887350ae010 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3581,9 +3581,8 @@ const char *cmdname(enum drbd_packet cmd) * when we want to support more than * one PRO_VERSION */ static const char *cmdnames[] = { + [P_DATA] = "Data", - [P_WSAME] = "WriteSame", - [P_TRIM] = "Trim", [P_DATA_REPLY] = "DataReply", [P_RS_DATA_REPLY] = "RSDataReply", [P_BARRIER] = "Barrier", @@ -3594,7 +3593,6 @@ const char *cmdname(enum drbd_packet cmd) [P_DATA_REQUEST] = "DataRequest", [P_RS_DATA_REQUEST] = "RSDataRequest", [P_SYNC_PARAM] = "SyncParam", - [P_SYNC_PARAM89] = "SyncParam89", [P_PROTOCOL] = "ReportProtocol", [P_UUIDS] = "ReportUUIDs", [P_SIZES] = "ReportSizes", @@ -3602,6 +3600,7 @@ const char *cmdname(enum drbd_packet cmd) [P_SYNC_UUID] = "ReportSyncUUID", [P_AUTH_CHALLENGE] = "AuthChallenge", [P_AUTH_RESPONSE] = "AuthResponse", + [P_STATE_CHG_REQ] = "StateChgRequest", [P_PING] = "Ping", [P_PING_ACK] = "PingAck", [P_RECV_ACK] = "RecvAck", @@ -3612,23 +3611,25 @@ const char *cmdname(enum drbd_packet cmd) [P_NEG_DREPLY] = "NegDReply", [P_NEG_RS_DREPLY] = "NegRSDReply", [P_BARRIER_ACK] = "BarrierAck", - [P_STATE_CHG_REQ] = "StateChgRequest", [P_STATE_CHG_REPLY] = "StateChgReply", [P_OV_REQUEST] = "OVRequest", [P_OV_REPLY] = "OVReply", [P_OV_RESULT] = "OVResult", [P_CSUM_RS_REQUEST] = "CsumRSRequest", [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_SYNC_PARAM89] = "SyncParam89", [P_COMPRESSED_BITMAP] = "CBitmap", [P_DELAY_PROBE] = "DelayProbe", [P_OUT_OF_SYNC] = "OutOfSync", - [P_RETRY_WRITE] = "RetryWrite", [P_RS_CANCEL] = "RSCancel", [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_PROTOCOL_UPDATE] = "protocol_update", + [P_TRIM] = "Trim", [P_RS_THIN_REQ] = "rs_thin_req", [P_RS_DEALLOCATED] = "rs_deallocated", + [P_WSAME] = "WriteSame", + [P_ZEROES] = "Zeroes", /* enum drbd_packet, but not commands - obsoleted flags: * P_MAY_IGNORE From 4b28f3b448df42a202ebf2d8824280ee71e6d79a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Apr 2022 21:07:10 +0200 Subject: [PATCH 02/72] drbd: address enum mismatch warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc -Wextra warns about mixing drbd_state_rv with drbd_ret_code in a couple of places: drivers/block/drbd/drbd_nl.c: In function 'drbd_adm_set_role': drivers/block/drbd/drbd_nl.c:777:14: warning: comparison between 'enum drbd_state_rv' and 'enum drbd_ret_code' [-Wenum-compare] 777 | if (retcode != NO_ERROR) | ^~ drivers/block/drbd/drbd_nl.c:784:12: warning: implicit conversion from 'enum drbd_ret_code' to 'enum drbd_state_rv' [-Wenum-conversion] 784 | retcode = ERR_MANDATORY_TAG; | ^ drivers/block/drbd/drbd_nl.c: In function 'drbd_adm_attach': drivers/block/drbd/drbd_nl.c:1965:10: warning: implicit conversion from 'enum drbd_state_rv' to 'enum drbd_ret_code' [-Wenum-conversion] 1965 | retcode = rv; /* FIXME: Type mismatch. */ | ^ drivers/block/drbd/drbd_nl.c: In function 'drbd_adm_connect': drivers/block/drbd/drbd_nl.c:2690:10: warning: implicit conversion from 'enum drbd_state_rv' to 'enum drbd_ret_code' [-Wenum-conversion] 2690 | retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ^ drivers/block/drbd/drbd_nl.c: In function 'drbd_adm_disconnect': drivers/block/drbd/drbd_nl.c:2803:11: warning: implicit conversion from 'enum drbd_state_rv' to 'enum drbd_ret_code' [-Wenum-conversion] 2803 | retcode = rv; /* FIXME: Type mismatch. */ | ^ In each case, both are passed into drbd_adm_finish(), which just takes a 32-bit integer and is happy with either, presumably intentionally. Restructure the code to pass either type directly in there in most cases, avoiding the warnings. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-3-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index a6280dcb3767..99c58cdb8619 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -770,6 +770,7 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) struct set_role_parms parms; int err; enum drbd_ret_code retcode; + enum drbd_state_rv rv; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) @@ -790,14 +791,14 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_PRIMARY, parms.assume_uptodate); + rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_SECONDARY, 0); + rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); mutex_unlock(&adm_ctx.resource->adm_mutex); genl_lock(); + drbd_adm_finish(&adm_ctx, info, rv); + return 0; out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -2492,6 +2493,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; + enum drbd_state_rv rv; int i; int err; @@ -2611,12 +2613,11 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - retcode = (enum drbd_ret_code)conn_request_state(connection, - NS(conn, C_UNCONNECTED), CS_VERBOSE); + rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, retcode); + drbd_adm_finish(&adm_ctx, info, rv); return 0; fail: @@ -2724,11 +2725,12 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); - if (rv < SS_SUCCESS) - retcode = (enum drbd_ret_code)rv; - else - retcode = NO_ERROR; mutex_unlock(&adm_ctx.resource->adm_mutex); + if (rv < SS_SUCCESS) { + drbd_adm_finish(&adm_ctx, info, rv); + return 0; + } + retcode = NO_ERROR; fail: drbd_adm_finish(&adm_ctx, info, retcode); return 0; From e1838cf01b2dbc71ac579bd762158c62a3e63f6f Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 6 Apr 2022 21:07:11 +0200 Subject: [PATCH 03/72] block: drbd: drbd_receiver: Remove redundant assignment to err MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable err is set to '-EIO' but this value is never read as it is overwritten or not used later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: drivers/block/drbd/drbd_receiver.c:3955:5: warning: Value stored to 'err' is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-4-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2957b0b68d60..ef812ea3b4c5 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3902,7 +3902,6 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i drbd_err(device, "verify-alg of wrong size, " "peer wants %u, accepting only up to %u byte\n", data_size, SHARED_SECRET_MAX); - err = -EIO; goto reconnect; } From ba6bee98d0c539a1e78fe62949fe16032e8f68fb Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Wed, 6 Apr 2022 21:07:12 +0200 Subject: [PATCH 04/72] drbd: Make use of PFN_UP helper macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit it's a refactor to make use of PFN_UP helper macro Signed-off-by: Cai Huoqing Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-5-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/block/drbd/drbd_worker.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index df25eecf80af..9e060e49b3f8 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -683,7 +683,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi } } - want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + want = PFN_UP(words*sizeof(long)); have = b->bm_number_of_pages; if (want == have) { D_ASSERT(device, b->bm_pages != NULL); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ef812ea3b4c5..c66864324f25 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -364,7 +364,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned nr_pages = PFN_UP(payload_size); if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -1630,7 +1630,7 @@ int drbd_submit_peer_request(struct drbd_device *device, sector_t sector = peer_req->i.sector; unsigned data_size = peer_req->i.size; unsigned n_bios = 0; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned nr_pages = PFN_UP(data_size); /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0f9956f4e9c4..af3051dd8912 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1030,7 +1030,7 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_ { if (drbd_peer_req_has_active_page(peer_req)) { /* This might happen if sendpage() has not finished */ - int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + int i = PFN_UP(peer_req->i.size); atomic_add(i, &device->pp_in_use_by_net); atomic_sub(i, &device->pp_in_use); spin_lock_irq(&device->resource->req_lock); From e6be38a164ba2023af9c5c2c35eb728cb1825120 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Wed, 6 Apr 2022 21:07:13 +0200 Subject: [PATCH 05/72] drbd: Replace "unsigned" with "unsigned int" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when run checkpath.pl for the first patch, found that WARNING: Prefer 'unsigned int' to bare use of 'unsigned'. so fix it. BTW Signed-off-by: Cai Huoqing Acked-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-6-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c66864324f25..f1da0914c928 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -364,7 +364,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = PFN_UP(payload_size); + unsigned int nr_pages = PFN_UP(payload_size); if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -1628,9 +1628,9 @@ int drbd_submit_peer_request(struct drbd_device *device, struct bio *bio; struct page *page = peer_req->pages; sector_t sector = peer_req->i.sector; - unsigned data_size = peer_req->i.size; - unsigned n_bios = 0; - unsigned nr_pages = PFN_UP(data_size); + unsigned int data_size = peer_req->i.size; + unsigned int n_bios = 0; + unsigned int nr_pages = PFN_UP(data_size); /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). From 90c6c291453922362ae026f1049843368111bdfe Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 6 Apr 2022 21:07:14 +0200 Subject: [PATCH 06/72] drdb: Switch to kvfree_rcu() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of invoking a synchronize_rcu() to free a pointer after a grace period we can directly make use of new API that does the same but in more efficient way. TO: Jens Axboe TO: Philipp Reisner TO: Jason Gunthorpe TO: drbd-dev@lists.linbit.com TO: linux-block@vger.kernel.org Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-7-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 9 +++------ drivers/block/drbd/drbd_receiver.c | 6 ++---- drivers/block/drbd/drbd_state.c | 3 +-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 99c58cdb8619..013d355a2033 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1602,8 +1602,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) drbd_send_sync_param(peer_device); } - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); kfree(old_plan); mod_timer(&device->request_timer, jiffies + HZ); goto success; @@ -2434,8 +2433,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); if (connection->cstate >= C_WF_REPORT_PARAMS) { struct drbd_peer_device *peer_device; @@ -2849,8 +2847,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) new_disk_conf->disk_size = (sector_t)rs.resize_size; rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&device->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); new_disk_conf = NULL; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index f1da0914c928..6762be53f409 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3750,8 +3750,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? integrity_alg : "(none)"); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); return 0; disconnect_rcu_unlock: @@ -4119,8 +4118,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", (unsigned long)p_usize, (unsigned long)my_usize); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 4ee11aef6672..3f7bf9f2d874 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -2071,8 +2071,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_conf); + kvfree_rcu(old_conf); } if (ns_max.susp_fen) { From 8fd6533ef3f7729e4aa29ead83844c042688615a Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Wed, 6 Apr 2022 21:07:15 +0200 Subject: [PATCH 07/72] drbd: Return true/false (not 1/0) from bool functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return boolean values ("true" or "false") instead of 1 or 0 from bool functions. This fixes the following warnings from coccicheck: ./drivers/block/drbd/drbd_req.c:912:9-10: WARNING: return of 0/1 in function 'remote_due_to_read_balancing' with return type bool Signed-off-by: Haowen Bai Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220406190715.1938174-8-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 75be0e16770a..e64bcfba30ef 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -922,7 +922,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return 0; + return false; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); From 2a852a693f8839bb877fc731ffbc9ece3a9c16d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:03 +0200 Subject: [PATCH 08/72] nbd: use the correct block_device in nbd_bdev_reset The bdev parameter to ->ioctl contains the block device that the ioctl is called on, which can be the partition. But the openers check in nbd_bdev_reset really needs to check use the whole device, so switch to using that. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4729aef8c646..dab24c58d5cc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1217,11 +1217,11 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) return -ENOSPC; } -static void nbd_bdev_reset(struct block_device *bdev) +static void nbd_bdev_reset(struct nbd_device *nbd) { - if (bdev->bd_openers > 1) + if (nbd->disk->part0->bd_openers > 1) return; - set_capacity(bdev->bd_disk, 0); + set_capacity(nbd->disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) @@ -1386,7 +1386,7 @@ static int nbd_start_device(struct nbd_device *nbd) return nbd_set_size(nbd, config->bytesize, nbd_blksize(config)); } -static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) +static int nbd_start_device_ioctl(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int ret; @@ -1405,7 +1405,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; @@ -1419,7 +1419,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, { sock_shutdown(nbd); __invalidate_device(bdev, true); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); @@ -1465,7 +1465,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, config->flags = arg; return 0; case NBD_DO_IT: - return nbd_start_device_ioctl(nbd, bdev); + return nbd_start_device_ioctl(nbd); case NBD_CLEAR_QUE: /* * This is for compatibility only. The queue is always cleared From d666e20e2e7983d03bbf5e208b8485541ae616a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:04 +0200 Subject: [PATCH 09/72] zram: cleanup reset_store Use a local variable for the gendisk instead of the part0 block_device, as the gendisk is what this function actually operates on. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 59ff444bf6c7..387d8697d700 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1786,7 +1786,7 @@ static ssize_t reset_store(struct device *dev, int ret; unsigned short do_reset; struct zram *zram; - struct block_device *bdev; + struct gendisk *disk; ret = kstrtou16(buf, 10, &do_reset); if (ret) @@ -1796,26 +1796,26 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; zram = dev_to_zram(dev); - bdev = zram->disk->part0; + disk = zram->disk; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); /* Do not reset an active device or claimed device */ - if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_disk->open_mutex); + if (disk->part0->bd_openers || zram->claim) { + mutex_unlock(&disk->open_mutex); return -EBUSY; } /* From now on, anyone can't open /dev/zram[0-9] */ zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(disk->part0); zram_reset_device(zram); - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); zram->claim = false; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return len; } From 7a86d6dc1493326feb0d3ce5af2f34401dd3defa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:05 +0200 Subject: [PATCH 10/72] zram: cleanup zram_remove Remove the bdev variable and just use the gendisk pointed to by the zram_device directly. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 387d8697d700..70d4bd662ab6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1986,19 +1986,18 @@ static int zram_add(void) static int zram_remove(struct zram *zram) { - struct block_device *bdev = zram->disk->part0; bool claimed; - mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) { - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&zram->disk->open_mutex); + if (zram->disk->part0->bd_openers) { + mutex_unlock(&zram->disk->open_mutex); return -EBUSY; } claimed = zram->claim; if (!claimed) zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&zram->disk->open_mutex); zram_debugfs_unregister(zram); @@ -2010,7 +2009,7 @@ static int zram_remove(struct zram *zram) ; } else { /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(zram->disk->part0); zram_reset_device(zram); } From dbdc1be32591af023db2812706f01e6cd2f42bfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:06 +0200 Subject: [PATCH 11/72] block: add a disk_openers helper Add a helper that returns the openers for a given gendisk to avoid having drivers poke into disk->part0 to get at this information in a somewhat cumbersome way. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 4 ++-- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/blkdev.h | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index dab24c58d5cc..526389351784 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1219,7 +1219,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) static void nbd_bdev_reset(struct nbd_device *nbd) { - if (nbd->disk->part0->bd_openers > 1) + if (disk_openers(nbd->disk) > 1) return; set_capacity(nbd->disk, 0); } @@ -1576,7 +1576,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && - disk->part0->bd_openers == 0) + disk_openers(disk) == 0) nbd_disconnect_and_put(nbd); nbd_config_put(nbd); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 70d4bd662ab6..dc5ac5963377 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1800,7 +1800,7 @@ static ssize_t reset_store(struct device *dev, mutex_lock(&disk->open_mutex); /* Do not reset an active device or claimed device */ - if (disk->part0->bd_openers || zram->claim) { + if (disk_openers(disk) || zram->claim) { mutex_unlock(&disk->open_mutex); return -EBUSY; } @@ -1989,7 +1989,7 @@ static int zram_remove(struct zram *zram) bool claimed; mutex_lock(&zram->disk->open_mutex); - if (zram->disk->part0->bd_openers) { + if (disk_openers(zram->disk)) { mutex_unlock(&zram->disk->open_mutex); return -EBUSY; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c9b5925af5a3..436645bde13f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -176,6 +176,21 @@ static inline bool disk_live(struct gendisk *disk) return !inode_unhashed(disk->part0->bd_inode); } +/** + * disk_openers - returns how many openers are there for a disk + * @disk: disk to check + * + * This returns the number of openers for a disk. Note that this value is only + * stable if disk->open_mutex is held. + * + * Note: Due to a quirk in the block layer open code, each open partition is + * only counted once even if there are multiple openers. + */ +static inline unsigned int disk_openers(struct gendisk *disk) +{ + return disk->part0->bd_openers; +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. From 9acf381f3e8f715175c29f4b6d722f6b6797d139 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:07 +0200 Subject: [PATCH 12/72] block: turn bdev->bd_openers into an atomic_t All manipulation of bd_openers is under disk->open_mutex and will remain so for the foreseeable future. But at least one place reads it without the lock (blkdev_get) and there are more to be added. So make sure the compiler does not do turn the increments and decrements into non-atomic sequences by using an atomic_t. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 16 ++++++++-------- block/partitions/core.c | 2 +- include/linux/blk_types.h | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 13de871fa816..7bf88e591aaf 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -673,17 +673,17 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) + if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); - bdev->bd_openers++; + atomic_inc(&bdev->bd_openers); return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) { - if (!--bdev->bd_openers) + if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk, mode); @@ -694,7 +694,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) struct gendisk *disk = part->bd_disk; int ret; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto done; ret = blkdev_get_whole(bdev_whole(part), mode); @@ -708,7 +708,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); done: - part->bd_openers++; + atomic_inc(&part->bd_openers); return 0; out_blkdev_put: @@ -720,7 +720,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) { struct block_device *whole = bdev_whole(part); - if (--part->bd_openers) + if (!atomic_dec_and_test(&part->bd_openers)) return; blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; @@ -899,7 +899,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ - if (bdev->bd_openers == 1) + if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); @@ -1044,7 +1044,7 @@ void sync_bdevs(bool wait) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (!bdev->bd_openers) { + if (!atomic_read(&bdev->bd_openers)) { ; /* skip */ } else if (wait) { /* diff --git a/block/partitions/core.c b/block/partitions/core.c index 70dec1c78521..8a0ec929023b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -478,7 +478,7 @@ int bdev_del_partition(struct gendisk *disk, int partno) goto out_unlock; ret = -EBUSY; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto out_unlock; delete_partition(part); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4968cb17b13b..c62274466e72 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -44,7 +44,7 @@ struct block_device { unsigned long bd_stamp; bool bd_read_only; /* read-only policy */ dev_t bd_dev; - int bd_openers; + atomic_t bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; void * bd_claiming; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 436645bde13f..afad3d1d0dac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -188,7 +188,7 @@ static inline bool disk_live(struct gendisk *disk) */ static inline unsigned int disk_openers(struct gendisk *disk) { - return disk->part0->bd_openers; + return atomic_read(&disk->part0->bd_openers); } /* From 2cf429b53c1041a0e90943e1d2a5a7a7f89accb0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:08 +0200 Subject: [PATCH 13/72] loop: de-duplicate the idle worker freeing code Use a common helper for both timer based and uncoditional freeing of idle workers. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220330052917.2566582-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 73 +++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 976cf987b392..349e11e8b3e0 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -804,7 +804,6 @@ struct loop_worker { static void loop_workfn(struct work_struct *work); static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -888,6 +887,39 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) spin_unlock_irq(&lo->lo_work_lock); } +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_free_idle_workers_timer(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); +} + static void loop_update_rotational(struct loop_device *lo) { struct file *file = lo->lo_backing_file; @@ -1022,7 +1054,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, INIT_LIST_HEAD(&lo->rootcg_cmd_list); INIT_LIST_HEAD(&lo->idle_worker_list); lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; @@ -1086,7 +1118,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct loop_worker *pos, *worker; /* * Flush loop_configure() and loop_change_fd(). It is acceptable for @@ -1116,15 +1147,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) blk_mq_freeze_queue(lo->lo_queue); destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); + loop_free_idle_workers(lo, true); del_timer_sync(&lo->timer); spin_lock_irq(&lo->lo_lock); @@ -1883,11 +1906,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -1936,27 +1954,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, From b15ed54694fbba714931dd81790f86797cf8bed2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:09 +0200 Subject: [PATCH 14/72] loop: initialize the worker tracking fields once There is no need to reinitialize idle_worker_list, worker_tree and timer every time a loop device is configured. Just initialize them once at allocation time. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220330052917.2566582-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 349e11e8b3e0..98e12d25d10c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1052,10 +1052,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers_timer, - TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1969,6 +1965,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); From 98ded54a33839e7b8f8bed772e01a544f48e33a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:10 +0200 Subject: [PATCH 15/72] loop: remove the racy bd_inode->i_mapping->nrpages asserts Nothing prevents a file system or userspace opener of the block device from redirtying the page right afte sync_blockdev returned. Fortunately data in the page cache during a block device change is mostly harmless anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220330052917.2566582-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 98e12d25d10c..610942e9c891 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1271,15 +1271,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - prev_lo_flags = lo->lo_flags; err = loop_set_status_from_info(lo, info); @@ -1490,21 +1481,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - blk_queue_logical_block_size(lo->lo_queue, arg); blk_queue_physical_block_size(lo->lo_queue, arg); blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); -out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); return err; From 46dc967445bde5300eee7e567a67796de2217586 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:11 +0200 Subject: [PATCH 16/72] loop: don't freeze the queue in lo_release By the time the final ->release is called there can't be outstanding I/O. For non-final ->release there is no need for driver action at all. Thus remove the useless queue freeze. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220330052917.2566582-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 610942e9c891..d6315b0c413e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1749,13 +1749,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode) */ __loop_clr_fd(lo, true); return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); } out_unlock: From 1fe0b1acb14dd3113b7dc975a118cd7f08af8316 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:12 +0200 Subject: [PATCH 17/72] loop: only freeze the queue in __loop_clr_fd when needed ->release is only called after all outstanding I/O has completed, so only freeze the queue when clearing the backing file of a live loop device. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220330052917.2566582-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d6315b0c413e..d1a4af599359 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1139,8 +1139,13 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); + /* + * Freeze the request queue when unbinding on a live file descriptor and + * thus an open device. When called from ->release we are guaranteed + * that there is no I/O in progress already. + */ + if (!release) + blk_mq_freeze_queue(lo->lo_queue); destroy_workqueue(lo->workqueue); loop_free_idle_workers(lo, true); @@ -1165,7 +1170,8 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); From d2c7f56f8b5256d57f9e3fc7794c31361d43bdd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:13 +0200 Subject: [PATCH 18/72] loop: implement ->free_disk Ensure that the lo_device which is stored in the gendisk private data is valid until the gendisk is freed. Currently the loop driver uses a lot of effort to make sure a device is not freed when it is still in use, but to to fix a potential deadlock this will be relaxed a bit soon. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d1a4af599359..e0c8768bac85 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1761,6 +1761,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&lo->lo_mutex); } +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + mutex_destroy(&lo->lo_mutex); + kfree(lo); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, @@ -1769,6 +1777,7 @@ static const struct block_device_operations lo_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* @@ -2060,15 +2069,14 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); + blk_cleanup_queue(lo->lo_disk->queue); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); - /* There is no route which can find this loop device. */ - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + put_disk(lo->lo_disk); } static void loop_probe(dev_t dev) From 498ef5c777d9c89693b70cc453b40c392120ea1b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:14 +0200 Subject: [PATCH 19/72] loop: suppress uevents while reconfiguring the device Currently, udev change event is generated for a loop device before the device is ready for IO. Due to serialization on lo->lo_mutex in lo_open() this does not matter because anybody is able to open the device and do IO only after the configuration is finished. However this synchronization in lo_open() is going away so make sure userspace reacting to the change event will see the new device state by generating the event only when the device is setup. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e0c8768bac85..ef7692381032 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -569,6 +569,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!file) return -EBADF; + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) @@ -623,13 +627,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (partscan) loop_reread_partitions(lo); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); - return error; + goto done; } /* loop sysfs attributes */ @@ -994,6 +1003,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1096,7 +1108,12 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_unlock: loop_global_unlock(lo, is_loop); @@ -1107,7 +1124,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - return error; + goto done; } static void __loop_clr_fd(struct loop_device *lo, bool release) From 158eaeba4b8edf9940f64daa83cbd1ac7db7593c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 30 Mar 2022 07:29:15 +0200 Subject: [PATCH 20/72] loop: avoid loop_validate_mutex/lo_mutex in ->release Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from lo_release() is called via ->release when disk_openers() == 0, we are guaranteed that "struct file" which will be passed to loop_validate_file() via fget() cannot be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to hold loop_validate_mutex from __loop_clr_fd() if release == true. When I made commit 3ce6e1f662a91097 ("loop: reintroduce global lock for safe loop_validate_file() traversal"), I wrote "It is acceptable for loop_validate_file() to succeed, for actual clear operation has not started yet.". But now I came to feel why it is acceptable to succeed. It seems that the loop driver was added in Linux 1.3.68, and if (lo->lo_refcnt > 1) return -EBUSY; check in loop_clr_fd() was there from the beginning. The intent of this check was unclear. But now I think that current disk_openers(lo->lo_disk) > 1 form is there for three reasons. (1) Avoid I/O errors when some process which opens and reads from this loop device in response to uevent notification (e.g. systemd-udevd), as described in commit a1ecac3b0656a682 ("loop: Make explicit loop device destruction lazy"). This opener is short-lived because it is likely that the file descriptor used by that process is closed soon. (2) Avoid I/O errors caused by underlying layer of stacked loop devices (i.e. ioctl(some_loop_fd, LOOP_SET_FD, other_loop_fd)) being suddenly disappeared. This opener is long-lived because this reference is associated with not a file descriptor but lo->lo_backing_file. (3) Avoid I/O errors caused by underlying layer of mounted loop device (i.e. mount(some_loop_device, some_mount_point)) being suddenly disappeared. This opener is long-lived because this reference is associated with not a file descriptor but mount. While race in (1) might be acceptable, (2) and (3) should be checked racelessly. That is, make sure that __loop_clr_fd() will not run if loop_validate_file() succeeds, by doing refcount check with global lock held when explicit loop device destruction is requested. As a result of no longer waiting for lo->lo_mutex after setting Lo_rundown, we can remove pointless BUG_ON(lo->lo_state != Lo_rundown) check. Signed-off-by: Tetsuo Handa Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220330052917.2566582-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ef7692381032..029398ff6517 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1132,27 +1132,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) struct file *filp; gfp_t gfp = lo->old_gfp_mask; - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); @@ -1239,11 +1218,20 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1258,11 +1246,11 @@ static int loop_clr_fd(struct loop_device *lo) */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0; From a0e286b6a5b61d4da01bdf865071c4da417046d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:16 +0200 Subject: [PATCH 21/72] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release lo_refcount counts how many openers a loop device has, but that count is already provided by the block layer in the bd_openers field of the whole-disk block_device. Remove lo_refcount and allow opens to succeed even on devices beeing deleted - now that ->free_disk is implemented we can handle that race gracefull and all I/O on it will just fail. Similarly there is a small race window now where loop_control_remove does not synchronize the delete vs the remove due do bd_openers not being under lo_mutex protection, but we can handle that just as gracefully. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 37 +++++++------------------------------ drivers/block/loop.h | 1 - 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 029398ff6517..204558d7a81d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1244,7 +1244,7 @@ static int loop_clr_fd(struct loop_device *lo) * /do something like mkfs/losetup -d causing the losetup -d * command to fail with EBUSY. */ - if (atomic_read(&lo->lo_refcnt) > 1) { + if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; loop_global_unlock(lo, true); return 0; @@ -1725,33 +1725,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) -{ - struct loop_device *lo = bdev->bd_disk->private_data; - int err; - - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) - err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); - return err; -} - static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* @@ -1761,8 +1743,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode) __loop_clr_fd(lo, true); return; } - -out_unlock: mutex_unlock(&lo->lo_mutex); } @@ -1776,7 +1756,6 @@ static void lo_free_disk(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT @@ -2030,7 +2009,6 @@ static int loop_add(int i) */ if (!part_shift) disk->flags |= GENHD_FL_NO_PART; - atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); @@ -2120,13 +2098,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 082d4b6bfc6a..449d562738c5 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -28,7 +28,6 @@ struct loop_func_table; struct loop_device { int lo_number; - atomic_t lo_refcnt; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; From d292dc80686aeafc418d809b4f9598578a7f294f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:17 +0200 Subject: [PATCH 22/72] loop: don't destroy lo->workqueue in __loop_clr_fd There is no need to destroy the workqueue when clearing unbinding a loop device from a backing file. Not doing so on the other hand avoid creating a complex lock dependency chain involving the global system_transition_mutex. Based on a patch from Tetsuo Handa . Reported-by: syzbot+6479585dfd4dedd3f7e1@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Tested-by: syzbot+6479585dfd4dedd3f7e1@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20220330052917.2566582-16-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 204558d7a81d..0c7f0367200c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -812,7 +812,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -1050,20 +1049,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1143,10 +1141,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) if (!release) blk_mq_freeze_queue(lo->lo_queue); - destroy_workqueue(lo->workqueue); - loop_free_idle_workers(lo, true); - del_timer_sync(&lo->timer); - spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; @@ -1750,6 +1744,10 @@ static void lo_free_disk(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + del_timer_sync(&lo->timer); mutex_destroy(&lo->lo_mutex); kfree(lo); } @@ -2013,6 +2011,8 @@ static int loop_add(int i) lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; From 5ea7c1339e3ed094dd4df48d598f9018a2587283 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 13 Apr 2022 14:34:20 +0200 Subject: [PATCH 23/72] block/rnbd-clt: Avoid flush_workqueue(system_long_wq) usage Flushing system-wide workqueues is dangerous and will be forbidden. Replace system_long_wq with local rnbd_clt_wq. Link: https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp Cc: Tetsuo Handa Signed-off-by: Jack Wang Reviewed-by: Santosh Kumar Pradhan Link: https://lore.kernel.org/r/20220413123420.66470-1-jinpu.wang@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d178be175ad9..409c76b81aed 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -25,6 +25,7 @@ static int rnbd_client_major; static DEFINE_IDA(index_ida); static DEFINE_MUTEX(sess_lock); static LIST_HEAD(sess_list); +static struct workqueue_struct *rnbd_clt_wq; /* * Maximum number of partitions an instance can have. @@ -1759,12 +1760,12 @@ static void rnbd_destroy_sessions(void) * procedure takes minutes. */ INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work); - queue_work(system_long_wq, &dev->unmap_on_rmmod_work); + queue_work(rnbd_clt_wq, &dev->unmap_on_rmmod_work); } rnbd_clt_put_sess(sess); } /* Wait for all scheduled unmap works */ - flush_workqueue(system_long_wq); + flush_workqueue(rnbd_clt_wq); WARN_ON(!list_empty(&sess_list)); } @@ -1789,6 +1790,14 @@ static int __init rnbd_client_init(void) pr_err("Failed to load module, creating sysfs device files failed, err: %d\n", err); unregister_blkdev(rnbd_client_major, "rnbd"); + return err; + } + rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0); + if (!rnbd_clt_wq) { + pr_err("Failed to load module, alloc_workqueue failed.\n"); + rnbd_clt_destroy_sysfs_files(); + unregister_blkdev(rnbd_client_major, "rnbd"); + err = -ENOMEM; } return err; @@ -1799,6 +1808,7 @@ static void __exit rnbd_client_exit(void) rnbd_destroy_sessions(); unregister_blkdev(rnbd_client_major, "rnbd"); ida_destroy(&index_ida); + destroy_workqueue(rnbd_clt_wq); } module_init(rnbd_client_init); From 9631abdbf406c764f2a5d8305eac063bc3396a0a Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Tue, 22 Mar 2022 16:23:38 +0100 Subject: [PATCH 24/72] md: Set MD_BROKEN for RAID1 and RAID10 There is no direct mechanism to determine raid failure outside personality. It is done by checking rdev->flags after executing md_error(). If "faulty" flag is not set then -EBUSY is returned to userspace. -EBUSY means that array will be failed after drive removal. Mdadm has special routine to handle the array failure and it is executed if -EBUSY is returned by md. There are at least two known reasons to not consider this mechanism as correct: 1. drive can be removed even if array will be failed[1]. 2. -EBUSY seems to be wrong status. Array is not busy, but removal process cannot proceed safe. -EBUSY expectation cannot be removed without breaking compatibility with userspace. In this patch first issue is resolved by adding support for MD_BROKEN flag for RAID1 and RAID10. Support for RAID456 is added in next commit. The idea is to set the MD_BROKEN if we are sure that raid is in failed state now. This is done in each error_handler(). In md_error() MD_BROKEN flag is checked. If is set, then -EBUSY is returned to userspace. As in previous commit, it causes that #mdadm --set-faulty is able to fail array. Previously proposed workaround is valid if optional functionality[1] is disabled. [1] commit 9a567843f7ce("md: allow last device to be forcibly removed from RAID1/RAID10.") Reviewd-by: Xiao Ni Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu --- drivers/md/md.c | 27 +++++++++++--------- drivers/md/md.h | 62 +++++++++++++++++++++++++-------------------- drivers/md/raid1.c | 43 ++++++++++++++++++------------- drivers/md/raid10.c | 40 +++++++++++++++++------------ 4 files changed, 100 insertions(+), 72 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2587f872c088..9294f13e0c9d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2984,10 +2984,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - if (test_bit(Faulty, &rdev->flags)) - err = 0; - else + + if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; + else + err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); @@ -4353,10 +4354,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, * like active, but no writes have been seen for a while (100msec). * * broken - * RAID0/LINEAR-only: same as clean, but array is missing a member. - * It's useful because RAID0/LINEAR mounted-arrays aren't stopped - * when a member is gone, so this state will at least alert the - * user that something is wrong. +* Array is failed. It's useful because mounted-arrays aren't stopped +* when array is failed, so this state will at least alert the user that +* something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; @@ -7443,7 +7443,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) err = -ENODEV; else { md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) + if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); @@ -7984,13 +7984,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) if (!mddev->pers || !mddev->pers->error_handler) return; - mddev->pers->error_handler(mddev,rdev); - if (mddev->degraded) + mddev->pers->error_handler(mddev, rdev); + + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + if (!test_bit(MD_BROKEN, &mddev->flags)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); md_new_event(); diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ac283864533..cf2cbb17acbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; -/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ +/** + * enum mddev_flags - md device flags. + * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. + * @MD_CLOSING: If set, we are closing the array, do not open it then. + * @MD_JOURNAL_CLEAN: A raid with journal is already clean. + * @MD_HAS_JOURNAL: The raid array has journal feature set. + * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took + * resync lock, need to release the lock. + * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as + * calls to md_error() will never cause the array to + * become failed. + * @MD_HAS_PPL: The raid array has PPL feature set. + * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. + * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata + * without taking reconfig_mutex. + * @MD_UPDATING_SB: md_check_recovery is updating the metadata without + * explicitly holding reconfig_mutex. + * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that + * array is ready yet. + * @MD_BROKEN: This is used to stop writes and mark array as failed. + * + * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added + */ enum mddev_flags { - MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ - MD_CLOSING, /* If set, we are closing the array, do not open - * it then */ - MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ - MD_HAS_JOURNAL, /* The raid array has journal feature set */ - MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node - * already took resync lock, need to - * release the lock */ - MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is - * supported as calls to md_error() will - * never cause the array to become failed. - */ - MD_HAS_PPL, /* The raid array has PPL feature set */ - MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ - MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update - * the metadata without taking reconfig_mutex. - */ - MD_UPDATING_SB, /* md_check_recovery is updating the metadata - * without explicitly holding reconfig_mutex. - */ - MD_NOT_READY, /* do_md_run() is active, so 'array_state' - * must not report that array is ready yet - */ - MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop - * I/O in case an array member is gone/failed. - */ + MD_ARRAY_FIRST_USE, + MD_CLOSING, + MD_JOURNAL_CLEAN, + MD_HAS_JOURNAL, + MD_CLUSTER_RESYNC_LOCKED, + MD_FAILFAST_SUPPORTED, + MD_HAS_PPL, + MD_HAS_MULTIPLE_PPLS, + MD_ALLOW_SB_UPDATE, + MD_UPDATING_SB, + MD_NOT_READY, + MD_BROKEN, }; enum mddev_sb_flags { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5aed2c8b746e..99d5af1362d7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1641,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "]"); } +/** + * raid1_error() - RAID1 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * - recovery is disabled. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + * + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && (conf->raid_disks - mddev->degraded) == 1) { - /* - * Don't fail the drive, act as though we were just a - * normal single drive. - * However don't try a recovery from this drive as - * it is very likely to fail. - */ - conf->recovery_disabled = mddev->recovery_disabled; - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && + (conf->raid_disks - mddev->degraded) == 1) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 834eb3ba95a6..dfa576cdf11c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore) _enough(conf, 1, ignore); } +/** + * raid10_error() - RAID10 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && !enough(conf, rdev->raid_disk)) { - /* - * Don't fail the drive, just return an IO error. - */ - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - /* - * If recovery is running, make sure it aborts. - */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); From 57668f0a4cc4083a120cc8c517ca0055c4543b59 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Tue, 22 Mar 2022 16:23:39 +0100 Subject: [PATCH 25/72] raid5: introduce MD_BROKEN Raid456 module had allowed to achieve failed state. It was fixed by fb73b357fb9 ("raid5: block failing device if raid will be failed"). This fix introduces a bug, now if raid5 fails during IO, it may result with a hung task without completion. Faulty flag on the device is necessary to process all requests and is checked many times, mainly in analyze_stripe(). Allow to set faulty on drive again and set MD_BROKEN if raid is failed. As a result, this level is allowed to achieve failed state again, but communication with userspace (via -EBUSY status) will be preserved. This restores possibility to fail array via #mdadm --set-faulty command and will be fixed by additional verification on mdadm side. Reproduction steps: mdadm -CR imsm -e imsm -n 3 /dev/nvme[0-2]n1 mdadm -CR r5 -e imsm -l5 -n3 /dev/nvme[0-2]n1 --assume-clean mkfs.xfs /dev/md126 -f mount /dev/md126 /mnt/root/ fio --filename=/mnt/root/file --size=5GB --direct=1 --rw=randrw --bs=64k --ioengine=libaio --iodepth=64 --runtime=240 --numjobs=4 --time_based --group_reporting --name=throughput-test-job --eta-newline=1 & echo 1 > /sys/block/nvme2n1/device/device/remove echo 1 > /sys/block/nvme1n1/device/device/remove [ 1475.787779] Call Trace: [ 1475.793111] __schedule+0x2a6/0x700 [ 1475.799460] schedule+0x38/0xa0 [ 1475.805454] raid5_get_active_stripe+0x469/0x5f0 [raid456] [ 1475.813856] ? finish_wait+0x80/0x80 [ 1475.820332] raid5_make_request+0x180/0xb40 [raid456] [ 1475.828281] ? finish_wait+0x80/0x80 [ 1475.834727] ? finish_wait+0x80/0x80 [ 1475.841127] ? finish_wait+0x80/0x80 [ 1475.847480] md_handle_request+0x119/0x190 [ 1475.854390] md_make_request+0x8a/0x190 [ 1475.861041] generic_make_request+0xcf/0x310 [ 1475.868145] submit_bio+0x3c/0x160 [ 1475.874355] iomap_dio_submit_bio.isra.20+0x51/0x60 [ 1475.882070] iomap_dio_bio_actor+0x175/0x390 [ 1475.889149] iomap_apply+0xff/0x310 [ 1475.895447] ? iomap_dio_bio_actor+0x390/0x390 [ 1475.902736] ? iomap_dio_bio_actor+0x390/0x390 [ 1475.909974] iomap_dio_rw+0x2f2/0x490 [ 1475.916415] ? iomap_dio_bio_actor+0x390/0x390 [ 1475.923680] ? atime_needs_update+0x77/0xe0 [ 1475.930674] ? xfs_file_dio_aio_read+0x6b/0xe0 [xfs] [ 1475.938455] xfs_file_dio_aio_read+0x6b/0xe0 [xfs] [ 1475.946084] xfs_file_read_iter+0xba/0xd0 [xfs] [ 1475.953403] aio_read+0xd5/0x180 [ 1475.959395] ? _cond_resched+0x15/0x30 [ 1475.965907] io_submit_one+0x20b/0x3c0 [ 1475.972398] __x64_sys_io_submit+0xa2/0x180 [ 1475.979335] ? do_io_getevents+0x7c/0xc0 [ 1475.986009] do_syscall_64+0x5b/0x1a0 [ 1475.992419] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 1476.000255] RIP: 0033:0x7f11fc27978d [ 1476.006631] Code: Bad RIP value. [ 1476.073251] INFO: task fio:3877 blocked for more than 120 seconds. Cc: stable@vger.kernel.org Fixes: fb73b357fb9 ("raid5: block failing device if raid will be failed") Reviewd-by: Xiao Ni Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu --- drivers/md/raid5.c | 49 ++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59f91e392a2a..f22e0da01f13 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -686,17 +686,17 @@ int raid5_calc_degraded(struct r5conf *conf) return degraded; } -static int has_failed(struct r5conf *conf) +static bool has_failed(struct r5conf *conf) { - int degraded; + int degraded = conf->mddev->degraded; - if (conf->mddev->reshape_position == MaxSector) - return conf->mddev->degraded > conf->max_degraded; + if (test_bit(MD_BROKEN, &conf->mddev->flags)) + return true; - degraded = raid5_calc_degraded(conf); - if (degraded > conf->max_degraded) - return 1; - return 0; + if (conf->mddev->reshape_position != MaxSector) + degraded = raid5_calc_degraded(conf); + + return degraded > conf->max_degraded; } struct stripe_head * @@ -2863,34 +2863,31 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) unsigned long flags; pr_debug("raid456: error called\n"); + pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n", + mdname(mddev), bdevname(rdev->bdev, b)); + spin_lock_irqsave(&conf->device_lock, flags); - - if (test_bit(In_sync, &rdev->flags) && - mddev->degraded == conf->max_degraded) { - /* - * Don't allow to achieve failed state - * Don't try to recover this device - */ - conf->recovery_disabled = mddev->recovery_disabled; - spin_unlock_irqrestore(&conf->device_lock, flags); - return; - } - set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf); + + if (has_failed(conf)) { + set_bit(MD_BROKEN, &conf->mddev->flags); + conf->recovery_disabled = mddev->recovery_disabled; + + pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + } else { + pr_crit("md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), conf->raid_disks - mddev->degraded); + } + spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); - pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); r5c_update_on_rdev_error(mddev, rdev); } From fc8738343eefc4ea8afb6122826dea48eacde514 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Fri, 8 Apr 2022 16:37:28 +0800 Subject: [PATCH 26/72] md: fix an incorrect NULL check in does_sb_need_changing The bug is here: if (!rdev) The list iterator value 'rdev' will *always* be set and non-NULL by rdev_for_each(), so it is incorrect to assume that the iterator value will be NULL if the list is empty or no element found. Otherwise it will bypass the NULL check and lead to invalid memory access passing the check. To fix the bug, use a new variable 'iter' as the list iterator, while using the original variable 'rdev' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 2aa82191ac36 ("md-cluster: Perform a lazy update") Acked-by: Guoqing Jiang Signed-off-by: Xiaomeng Tong Acked-by: Goldwyn Rodrigues Signed-off-by: Song Liu --- drivers/md/md.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9294f13e0c9d..f07f007ecae4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2627,14 +2627,16 @@ static void sync_sbs(struct mddev *mddev, int nospares) static bool does_sb_need_changing(struct mddev *mddev) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ - rdev_for_each(rdev, mddev) - if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + rdev_for_each(iter, mddev) + if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { + rdev = iter; break; + } /* No good device found. */ if (!rdev) From 64c54d9244a4efe9bc6e9c98e13c4bbb8bb39083 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Fri, 8 Apr 2022 16:47:15 +0800 Subject: [PATCH 27/72] md: fix an incorrect NULL check in md_reload_sb The bug is here: if (!rdev || rdev->desc_nr != nr) { The list iterator value 'rdev' will *always* be set and non-NULL by rdev_for_each_rcu(), so it is incorrect to assume that the iterator value will be NULL if the list is empty or no element found (In fact, it will be a bogus pointer to an invalid struct object containing the HEAD). Otherwise it will bypass the check and lead to invalid memory access passing the check. To fix the bug, use a new variable 'iter' as the list iterator, while using the original variable 'pdev' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 70bcecdb1534 ("md-cluster: Improve md_reload_sb to be less error prone") Signed-off-by: Xiaomeng Tong Signed-off-by: Song Liu --- drivers/md/md.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f07f007ecae4..4e3c314b3862 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9795,16 +9795,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) void md_reload_sb(struct mddev *mddev, int nr) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; int err; /* Find the rdev */ - rdev_for_each_rcu(rdev, mddev) { - if (rdev->desc_nr == nr) + rdev_for_each_rcu(iter, mddev) { + if (iter->desc_nr == nr) { + rdev = iter; break; + } } - if (!rdev || rdev->desc_nr != nr) { + if (!rdev) { pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); return; } From e68cb83a57a458b01c9739e2ad9cb70b04d1e6d2 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Fri, 1 Apr 2022 10:13:16 +0800 Subject: [PATCH 28/72] md/bitmap: don't set sb values if can't pass sanity check If bitmap area contains invalid data, kernel will crash then mdadm triggers "Segmentation fault". This is cluster-md speical bug. In non-clustered env, mdadm will handle broken metadata case. In clustered array, only kernel space handles bitmap slot info. But even this bug only happened in clustered env, current sanity check is wrong, the code should be changed. How to trigger: (faulty injection) dd if=/dev/zero bs=1M count=1 oflag=direct of=/dev/sda dd if=/dev/zero bs=1M count=1 oflag=direct of=/dev/sdb mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sda /dev/sdb mdadm -Ss echo aaa > magic.txt == below modifying slot 2 bitmap data == dd if=magic.txt of=/dev/sda seek=16384 bs=1 count=3 <== destroy magic dd if=/dev/zero of=/dev/sda seek=16436 bs=1 count=4 <== ZERO chunksize mdadm -A /dev/md0 /dev/sda /dev/sdb == kernel crashes. mdadm outputs "Segmentation fault" == Reason of kernel crash: In md_bitmap_read_sb (called by md_bitmap_create), bad bitmap magic didn't block chunksize assignment, and zero value made DIV_ROUND_UP_SECTOR_T() trigger "divide error". Crash log: kernel: md: md0 stopped. kernel: md/raid1:md0: not clean -- starting background reconstruction kernel: md/raid1:md0: active with 2 out of 2 mirrors kernel: dlm: ... ... kernel: md-cluster: Joined cluster 44810aba-38bb-e6b8-daca-bc97a0b254aa slot 1 kernel: md0: invalid bitmap file superblock: bad magic kernel: md_bitmap_copy_from_slot can't get bitmap from slot 2 kernel: md-cluster: Could not gather bitmaps from slot 2 kernel: divide error: 0000 [#1] SMP NOPTI kernel: CPU: 0 PID: 1603 Comm: mdadm Not tainted 5.14.6-1-default kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) kernel: RIP: 0010:md_bitmap_create+0x1d1/0x850 [md_mod] kernel: RSP: 0018:ffffc22ac0843ba0 EFLAGS: 00010246 kernel: ... ... kernel: Call Trace: kernel: ? dlm_lock_sync+0xd0/0xd0 [md_cluster 77fe..7a0] kernel: md_bitmap_copy_from_slot+0x2c/0x290 [md_mod 24ea..d3a] kernel: load_bitmaps+0xec/0x210 [md_cluster 77fe..7a0] kernel: md_bitmap_load+0x81/0x1e0 [md_mod 24ea..d3a] kernel: do_md_run+0x30/0x100 [md_mod 24ea..d3a] kernel: md_ioctl+0x1290/0x15a0 [md_mod 24ea....d3a] kernel: ? mddev_unlock+0xaa/0x130 [md_mod 24ea..d3a] kernel: ? blkdev_ioctl+0xb1/0x2b0 kernel: block_ioctl+0x3b/0x40 kernel: __x64_sys_ioctl+0x7f/0xb0 kernel: do_syscall_64+0x59/0x80 kernel: ? exit_to_user_mode_prepare+0x1ab/0x230 kernel: ? syscall_exit_to_user_mode+0x18/0x40 kernel: ? do_syscall_64+0x69/0x80 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae kernel: RIP: 0033:0x7f4a15fa722b kernel: ... ... kernel: ---[ end trace 8afa7612f559c868 ]--- kernel: RIP: 0010:md_bitmap_create+0x1d1/0x850 [md_mod] Reported-by: kernel test robot Reported-by: Dan Carpenter Acked-by: Guoqing Jiang Signed-off-by: Heming Zhao Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 44 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bfd6026d7809..612460d2bdaf 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -639,14 +639,6 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* Setup nodes/clustername only if bitmap version is - * cluster-compatible - */ - if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, - sb->cluster_name, 64); - } /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -668,6 +660,16 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) goto out; } + /* + * Setup nodes/clustername only if bitmap version is + * cluster-compatible + */ + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } + /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); @@ -700,9 +702,9 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) out: kunmap_atomic(sb); - /* Assigning chunksize is required for "re_read" */ - bitmap->mddev->bitmap_info.chunksize = chunksize; if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { + /* Assigning chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_warn("%s: Could not setup cluster service (%d)\n", @@ -713,18 +715,18 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) goto re_read; } - out_no_sb: - if (test_bit(BITMAP_STALE, &bitmap->flags)) - bitmap->events_cleared = bitmap->mddev->events; - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - bitmap->mddev->bitmap_info.nodes = nodes; - if (bitmap->mddev->bitmap_info.space == 0 || - bitmap->mddev->bitmap_info.space > sectors_reserved) - bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) { + if (err == 0) { + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; + } else { md_bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); From 92d9aac92b7cc92c770e736c70c3acae7b803278 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Fri, 1 Apr 2022 10:13:17 +0800 Subject: [PATCH 29/72] md: replace deprecated strlcpy & remove duplicated line This commit includes two topics: 1> replace deprecated strlcpy change strlcpy to strscpy for strlcpy is marked as deprecated in Documentation/process/deprecated.rst 2> remove duplicated strlcpy line in md_bitmap_read_sb@md-bitmap.c there are two duplicated strlcpy(), the history: - commit cf921cc19cf7 ("Add node recovery callbacks") introduced the first usage of strlcpy(). - commit b97e92574c0b ("Use separate bitmaps for each nodes in the cluster") introduced the second strlcpy(). this time, the two strlcpy() are same, we can remove anyone safely. - commit d3b178adb3a3 ("md: Skip cluster setup for dm-raid") added dm-raid special handling. And the "nodes" value is the key of this patch. but from this patch, strlcpy() which was introduced by b97e92574c0bf become necessary. - commit 3c462c880b52 ("md: Increment version for clustered bitmaps") used clustered major version to only handle in clustered env. this patch could look a polishment for clustered code logic. So cf921cc19cf7 became useless after d3b178adb3a3a, we could remove it safely. Signed-off-by: Heming Zhao Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 3 +-- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 612460d2bdaf..d87f674ab762 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -666,7 +666,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) */ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, + strscpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); } @@ -697,7 +697,6 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); err = 0; out: diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1c8a06b77c85..37cbcce3cc66 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -201,7 +201,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); goto out_err; } - strlcpy(res->name, name, namelen + 1); + strscpy(res->name, name, namelen + 1); if (with_lvb) { res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); if (!res->lksb.sb_lvbptr) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e3c314b3862..e0336a563a2a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4031,7 +4031,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -5765,7 +5765,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp) len--; if (len >= DISK_NAME_LEN) return -E2BIG; - strlcpy(buf, val, len+1); + strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) return md_alloc(0, buf); if (strncmp(buf, "md", 2) == 0 && @@ -5898,7 +5898,7 @@ int md_run(struct mddev *mddev) mddev->level = pers->level; mddev->new_level = pers->level; } - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { From 8fbcba6b999beb9fd0b95cd2efe00a1215e36406 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:07 -0600 Subject: [PATCH 30/72] md/raid5: Cleanup setup_conf() error returns Be more careful about the error returns. Most errors in this function are actually ENOMEM, but it forcibly returns EIO if conf has been allocated. Instead return ret and ensure it is set appropriately before each goto abort. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f22e0da01f13..79b03c79c66f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7163,7 +7163,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int i; int group_cnt; struct r5worker_group *new_group; - int ret; + int ret = -ENOMEM; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -7222,6 +7222,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); mutex_init(&conf->cache_size_mutex); + init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -7299,11 +7300,13 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; - if (raid5_alloc_percpu(conf) != 0) + ret = raid5_alloc_percpu(conf); + if (ret) goto abort; pr_debug("raid456: run(%s) called.\n", mdname(mddev)); + ret = -EIO; rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks @@ -7367,6 +7370,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (grow_stripes(conf, conf->min_nr_stripes)) { pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); + ret = -ENOMEM; goto abort; } else pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); @@ -7380,7 +7384,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - if (register_shrinker(&conf->shrinker)) { + ret = register_shrinker(&conf->shrinker); + if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); goto abort; @@ -7391,17 +7396,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); + ret = -ENOMEM; goto abort; } return conf; abort: - if (conf) { + if (conf) free_conf(conf); - return ERR_PTR(-EIO); - } else - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) From 3d9a644cf45c26ad1d0ceff0af8c9e9860677729 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:08 -0600 Subject: [PATCH 31/72] md/raid5: Un-nest struct raid5_percpu definition Sparse reports many warnings of the form: drivers/md/raid5.c:1476:16: warning: dereference of noderef expression This is because all struct raid5_percpu definitions get marked as __percpu when really only the pointer in r5conf should have that annotation. Fix this by moving the defnition of raid5_precpu out of the definition of struct r5conf. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9e8486a9e445..61bc2e1f1b4e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -560,6 +560,16 @@ struct r5pending_data { struct bio_list bios; }; +struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + int scribble_obj_size; + local_lock_t lock; +}; + struct r5conf { struct hlist_head *stripe_hashtbl; /* only protect corresponding hash list and inactive_list */ @@ -635,15 +645,7 @@ struct r5conf { */ int recovery_disabled; /* per cpu variables */ - struct raid5_percpu { - struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer - * lists and performing address - * conversions - */ - int scribble_obj_size; - local_lock_t lock; - } __percpu *percpu; + struct raid5_percpu __percpu *percpu; int scribble_disks; int scribble_sectors; struct hlist_node node; From b0920ede081b3f1659872f80ce552305610675a6 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:09 -0600 Subject: [PATCH 32/72] md/raid5: Add __rcu annotation to struct disk_info rdev and replacement are protected in some circumstances with rcu_dereference and synchronize_rcu (in raid5_remove_disk()). However, they were not annotated with __rcu so a sparse warning is emitted for every rcu_dereference() call. Add the __rcu annotation and fix up the initialization with RCU_INIT_POINTER, all pointer modifications with rcu_assign_pointer(), a few cases where the pointer value is tested with rcu_access_pointer() and one case where READ_ONCE() is used instead of rcu_dereference(), a case in print_raid5_conf() that should have rcu_dereference() and rcu_read_[un]lock() calls. Additional sparse issues will be fixed up in further commits. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 46 ++++++++++++++++++++++++++-------------------- drivers/md/raid5.h | 3 ++- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 79b03c79c66f..c4051625d293 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6285,7 +6285,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; @@ -7317,11 +7317,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto abort; - disk->replacement = rdev; + RCU_INIT_POINTER(disk->replacement, rdev); } else { if (disk->rdev) goto abort; - disk->rdev = rdev; + RCU_INIT_POINTER(disk->rdev, rdev); } if (test_bit(In_sync, &rdev->flags)) { @@ -7628,11 +7628,11 @@ static int raid5_run(struct mddev *mddev) rdev = conf->disks[i].replacement; conf->disks[i].replacement = NULL; clear_bit(Replacement, &rdev->flags); - conf->disks[i].rdev = rdev; + rcu_assign_pointer(conf->disks[i].rdev, rdev); } if (!rdev) continue; - if (conf->disks[i].replacement && + if (rcu_access_pointer(conf->disks[i].replacement) && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. */ pr_warn("md: cannot handle concurrent replacement and reshape.\n"); @@ -7829,8 +7829,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) static void print_raid5_conf (struct r5conf *conf) { + struct md_rdev *rdev; int i; - struct disk_info *tmp; pr_debug("RAID conf printout:\n"); if (!conf) { @@ -7841,14 +7841,16 @@ static void print_raid5_conf (struct r5conf *conf) conf->raid_disks, conf->raid_disks - conf->mddev->degraded); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->disks + i; - if (tmp->rdev) + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev) pr_debug(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev, b)); + i, !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev, b)); } + rcu_read_unlock(); } static int raid5_spare_active(struct mddev *mddev) @@ -7899,8 +7901,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r5conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct md_rdev **rdevp; + struct md_rdev __rcu **rdevp; struct disk_info *p = conf->disks + number; + struct md_rdev *tmp; print_raid5_conf(conf); if (test_bit(Journal, &rdev->flags) && conf->log) { @@ -7918,9 +7921,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) log_exit(conf); return 0; } - if (rdev == p->rdev) + if (rdev == rcu_access_pointer(p->rdev)) rdevp = &p->rdev; - else if (rdev == p->replacement) + else if (rdev == rcu_access_pointer(p->replacement)) rdevp = &p->replacement; else return 0; @@ -7940,7 +7943,8 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && - (!p->replacement || p->replacement == rdev) && + (!rcu_access_pointer(p->replacement) || + rcu_access_pointer(p->replacement) == rdev) && number < conf->raid_disks) { err = -EBUSY; goto abort; @@ -7951,7 +7955,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; - *rdevp = rdev; + rcu_assign_pointer(*rdevp, rdev); } } if (!err) { @@ -7959,17 +7963,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (err) goto abort; } - if (p->replacement) { + + tmp = rcu_access_pointer(p->replacement); + if (tmp) { /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; - clear_bit(Replacement, &p->replacement->flags); + rcu_assign_pointer(p->rdev, tmp); + clear_bit(Replacement, &tmp->flags); smp_mb(); /* Make sure other CPUs may see both as identical * but will never see neither - if they are careful */ - p->replacement = NULL; + rcu_assign_pointer(p->replacement, NULL); if (!err) - err = log_modify(conf, p->rdev, true); + err = log_modify(conf, tmp, true); } clear_bit(WantReplacement, &rdev->flags); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 61bc2e1f1b4e..638d29863503 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -473,7 +473,8 @@ enum { */ struct disk_info { - struct md_rdev *rdev, *replacement; + struct md_rdev __rcu *rdev; + struct md_rdev __rcu *replacement; struct page *extra_page; /* extra page to use in prexor */ }; From e38b0432550507a78d63c8da094e5f50820bdf92 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:10 -0600 Subject: [PATCH 33/72] md/raid5: Annotate rdev/replacement accesses when nr_pending is elevated There are a number of accesses to __rcu variables that should be safe because nr_pending in the disk is known to be elevated. Create a wrapper around rcu_dereference_protected() to annotate these accesses and verify that nr_pending is non-zero. This fixes a number of sparse warnings. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c4051625d293..6dc9d7cfa095 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2648,6 +2648,16 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } +/* + * This helper wraps rcu_dereference_protected() and can be used when + * it is known that the nr_pending of the rdev is elevated. + */ +static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + atomic_read(&rcu_access_pointer(rdev)->nr_pending)); +} + static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; @@ -2674,9 +2684,9 @@ static void raid5_end_read_request(struct bio * bi) * In that case it moved down to 'rdev'. * rdev is not removed until all requests are finished. */ - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (!rdev) - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); if (use_new_offset(conf, sh)) s = sh->sector + rdev->new_data_offset; @@ -2790,11 +2800,11 @@ static void raid5_end_write_request(struct bio *bi) for (i = 0 ; i < disks; i++) { if (bi == &sh->dev[i].req) { - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); break; } if (bi == &sh->dev[i].rreq) { - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (rdev) replacement = 1; else @@ -2802,7 +2812,7 @@ static void raid5_end_write_request(struct bio *bi) * replaced it. rdev is not removed * until all requests are finished. */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); break; } } @@ -5210,23 +5220,23 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); if (!rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (!rdev) /* rdev have been moved down */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); From 9aeb7f99a134391e19ffad926cfb6a60d72139b4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:11 -0600 Subject: [PATCH 34/72] md/raid5: Annotate rdev/replacement access when mddev_lock is held The mddev_lock should be held during raid5_remove_disk() which is when the rdev/replacement pointers are modified. So any access to these pointers marked __rcu should be safe whenever the mddev_lock is held. There are numerous such access that currently produce sparse warnings. Add a helper function, rdev_mdlock_deref() that wraps rcu_dereference_protected() in all these instances. This annotation fixes a number of sparse warnings. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 65 ++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6dc9d7cfa095..7c4f94c392ea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2658,6 +2658,18 @@ static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) atomic_read(&rcu_access_pointer(rdev)->nr_pending)); } +/* + * This helper wraps rcu_dereference_protected() and should be used + * when it is known that the mddev_lock() is held. This is safe + * seeing raid5_remove_disk() has the same lock held. + */ +static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, + struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + lockdep_is_held(&mddev->reconfig_mutex)); +} + static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; @@ -7632,10 +7644,11 @@ static int raid5_run(struct mddev *mddev) for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; i++) { - rdev = conf->disks[i].rdev; + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); if (!rdev && conf->disks[i].replacement) { /* The replacement is all we have yet */ - rdev = conf->disks[i].replacement; + rdev = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); conf->disks[i].replacement = NULL; clear_bit(Replacement, &rdev->flags); rcu_assign_pointer(conf->disks[i].rdev, rdev); @@ -7867,36 +7880,38 @@ static int raid5_spare_active(struct mddev *mddev) { int i; struct r5conf *conf = mddev->private; - struct disk_info *tmp; + struct md_rdev *rdev, *replacement; int count = 0; unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->disks + i; - if (tmp->replacement - && tmp->replacement->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->replacement->flags) - && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); + replacement = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); + if (replacement + && replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &replacement->flags) + && !test_and_set_bit(In_sync, &replacement->flags)) { /* Replacement has just become active. */ - if (!tmp->rdev - || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + if (!rdev + || !test_and_clear_bit(In_sync, &rdev->flags)) count++; - if (tmp->rdev) { + if (rdev) { /* Replaced device not technically faulty, * but we need to be sure it gets removed * and never re-added. */ - set_bit(Faulty, &tmp->rdev->flags); + set_bit(Faulty, &rdev->flags); sysfs_notify_dirent_safe( - tmp->rdev->sysfs_state); + rdev->sysfs_state); } - sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); - } else if (tmp->rdev - && tmp->rdev->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + sysfs_notify_dirent_safe(replacement->sysfs_state); + } else if (rdev + && rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { count++; - sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -7961,6 +7976,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) } *rdevp = NULL; if (!test_bit(RemoveSynchronized, &rdev->flags)) { + lockdep_assert_held(&mddev->reconfig_mutex); synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ @@ -8001,6 +8017,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int ret, err = -EEXIST; int disk; struct disk_info *p; + struct md_rdev *tmp; int first = 0; int last = conf->raid_disks - 1; @@ -8058,7 +8075,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) } for (disk = first; disk <= last; disk++) { p = conf->disks + disk; - if (test_bit(WantReplacement, &p->rdev->flags) && + tmp = rdev_mdlock_deref(mddev, p->rdev); + if (test_bit(WantReplacement, &tmp->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); @@ -8349,6 +8367,7 @@ static void end_reshape(struct r5conf *conf) static void raid5_finish_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; + struct md_rdev *rdev; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8360,10 +8379,12 @@ static void raid5_finish_reshape(struct mddev *mddev) for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = conf->disks[d].rdev; + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].rdev); if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = conf->disks[d].replacement; + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].replacement); if (rdev) clear_bit(In_sync, &rdev->flags); } From 4f4ee2bf32860e4aa3b07be3fc9224fbe6cce4fe Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:12 -0600 Subject: [PATCH 35/72] md/raid5-ppl: Annotate with rcu_dereference_protected() To suppress the last remaining sparse warnings about accessing rdev, add rcu_dereference_protected calls to a couple places in raid5-ppl. All of these places are called under raid5_run and therefore are occurring before the array has started and is thus safe. There's no sensible check to do for the second argument of rcu_dereference_protected() so a comment is added instead. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-ppl.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d3962d92df18..55d065a87b89 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -883,7 +883,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)r_sector, dd_idx, (unsigned long long)sector); - rdev = conf->disks[dd_idx].rdev; + /* Array has not started so rcu dereference is safe */ + rdev = rcu_dereference_protected( + conf->disks[dd_idx].rdev, 1); if (!rdev || (!test_bit(In_sync, &rdev->flags) && sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", @@ -934,7 +936,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, parity_sector = raid5_compute_sector(conf, r_sector_first + i, 0, &disk, &sh); BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); - parity_rdev = conf->disks[sh.pd_idx].rdev; + + /* Array has not started so rcu dereference is safe */ + parity_rdev = rcu_dereference_protected( + conf->disks[sh.pd_idx].rdev, 1); BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); pr_debug("%s:%*s write parity at sector %llu, disk %s\n", @@ -1404,7 +1409,9 @@ int ppl_init_log(struct r5conf *conf) for (i = 0; i < ppl_conf->count; i++) { struct ppl_log *log = &ppl_conf->child_logs[i]; - struct md_rdev *rdev = conf->disks[i].rdev; + /* Array has not started so rcu dereference is safe */ + struct md_rdev *rdev = + rcu_dereference_protected(conf->disks[i].rdev, 1); mutex_init(&log->io_mutex); spin_lock_init(&log->io_list_lock); From 4631f39f058b98bfa3fd1d6ffb491fa9e70e3e81 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Apr 2022 10:57:13 -0600 Subject: [PATCH 36/72] md/raid5: Annotate functions that hold device_lock with __must_hold A handful of functions note the device_lock must be held with a comment but this is not comprehensive. Many other functions hold the lock when taken so add an __must_hold() to each call to annotate when the lock is held. This makes it a bit easier to analyse device_lock. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7c4f94c392ea..144ea077c2ed 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) } static inline void lock_device_hash_lock(struct r5conf *conf, int hash) + __acquires(&conf->device_lock) { spin_lock_irq(conf->hash_locks + hash); spin_lock(&conf->device_lock); } static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) + __releases(&conf->device_lock) { spin_unlock(&conf->device_lock); spin_unlock_irq(conf->hash_locks + hash); } static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) + __acquires(&conf->device_lock) { int i; spin_lock_irq(conf->hash_locks); @@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) } static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) + __releases(&conf->device_lock) { int i; spin_unlock(&conf->device_lock); @@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh) } static void raid5_wakeup_stripe_thread(struct stripe_head *sh) + __must_hold(&sh->raid_conf->device_lock) { struct r5conf *conf = sh->raid_conf; struct r5worker_group *group; @@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { int i; int injournal = 0; /* number of date pages with R5_InJournal */ @@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { if (atomic_dec_and_test(&sh->count)) do_release_stripe(conf, sh, temp_inactive_list); @@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf, } } -/* should hold conf->device_lock already */ static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { struct stripe_head *sh, *t; int count = 0; @@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * This is because some failed devices may only affect one * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. + * + * Most calls to this function hold &conf->device_lock. Calls + * in raid5_run() do not require the lock as no other threads + * have been started yet. */ int raid5_calc_degraded(struct r5conf *conf) { @@ -5275,6 +5286,7 @@ static void handle_stripe(struct stripe_head *sh) } static void raid5_activate_delayed(struct r5conf *conf) + __must_hold(&conf->device_lock) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { while (!list_empty(&conf->delayed_list)) { @@ -5292,9 +5304,9 @@ static void raid5_activate_delayed(struct r5conf *conf) } static void activate_bit_delay(struct r5conf *conf, - struct list_head *temp_inactive_list) + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { - /* device_lock is held */ struct list_head head; list_add(&head, &conf->bitmap_list); list_del_init(&conf->bitmap_list); @@ -5519,6 +5531,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) * handle_list. */ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) + __must_hold(&conf->device_lock) { struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; @@ -6390,8 +6403,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, static int handle_active_stripes(struct r5conf *conf, int group, struct r5worker *worker, struct list_head *temp_inactive_list) - __releases(&conf->device_lock) - __acquires(&conf->device_lock) + __must_hold(&conf->device_lock) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0, hash; From ea23994edc4169bd90d7a9b5908c6ccefd82fa40 Mon Sep 17 00:00:00 2001 From: Pascal Hambourg Date: Wed, 13 Apr 2022 08:53:56 +0200 Subject: [PATCH 37/72] md/raid0: Ignore RAID0 layout if the second zone has only one device The RAID0 layout is irrelevant if all members have the same size so the array has only one zone. It is *also* irrelevant if the array has two zones and the second zone has only one device, for example if the array has two members of different sizes. So in that case it makes sense to allow assembly even when the layout is undefined, like what is done when the array has only one zone. Reviewed-by: NeilBrown Signed-off-by: Pascal Hambourg Signed-off-by: Song Liu --- drivers/md/raid0.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7231f5e1eaa7..e11701e394ca 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -128,21 +128,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); - if (conf->nr_strip_zones == 1) { - conf->layout = RAID0_ORIG_LAYOUT; - } else if (mddev->layout == RAID0_ORIG_LAYOUT || - mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = mddev->layout; - } else if (default_layout == RAID0_ORIG_LAYOUT || - default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = default_layout; - } else { - pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", - mdname(mddev)); - pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); - err = -ENOTSUPP; - goto abort; - } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -273,6 +258,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) (unsigned long long)smallest->sectors); } + if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); + err = -EOPNOTSUPP; + goto abort; + } + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; From 9151ad5d8676a89cf1b6a4051037ab3ca077d938 Mon Sep 17 00:00:00 2001 From: David Sloan Date: Thu, 21 Apr 2022 13:45:58 -0600 Subject: [PATCH 38/72] md: Replace role magic numbers with defined constants There are several instances where magic numbers are used in md.c instead of the defined constants in md_p.h. This patch set improves code readability by replacing all occurrences of 0xffff, 0xfffe, and 0xfffd when relating to md roles with their equivalent defined constant. Signed-off-by: David Sloan Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e0336a563a2a..707e802d0082 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2647,11 +2647,11 @@ static bool does_sb_need_changing(struct mddev *mddev) rdev_for_each(rdev, mddev) { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ - if (role == 0xffff && rdev->raid_disk >=0 && + if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) return true; /* Device turned faulty? */ - if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) return true; } @@ -9675,7 +9675,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); if (test_bit(Candidate, &rdev2->flags)) { - if (role == 0xfffe) { + if (role == MD_DISK_ROLE_FAULTY) { pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); md_kick_rdev_from_array(rdev2); continue; @@ -9688,7 +9688,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) /* * got activated except reshape is happening. */ - if (rdev2->raid_disk == -1 && role != 0xffff && + if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; @@ -9705,7 +9705,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * as faulty. The recovery is performed by the * one who initiated the error. */ - if ((role == 0xfffe) || (role == 0xfffd)) { + if (role == MD_DISK_ROLE_FAULTY || + role == MD_DISK_ROLE_JOURNAL) { md_error(mddev, rdev2); clear_bit(Blocked, &rdev2->flags); } From 8ba816b23abd2a9a05705f3d00b8653f8be73015 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 26 Apr 2022 10:21:33 +0800 Subject: [PATCH 39/72] null-blk: save memory footprint for struct nullb_cmd Total 16 bytes can be saved in two ways: 1) The field 'bio' will only be used in bio based mode, and the field 'rq' will only be used in mq mode. Since they won't be used in the same time, declare a union for them. 2) The field 'bool fake_timeout' can be placed in the hole after the field 'error'. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20220426022133.3999006-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/null_blk.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 78eb56b0ca55..4525a65e1b23 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -16,13 +16,15 @@ #include struct nullb_cmd { - struct request *rq; - struct bio *bio; + union { + struct request *rq; + struct bio *bio; + }; unsigned int tag; blk_status_t error; + bool fake_timeout; struct nullb_queue *nq; struct hrtimer timer; - bool fake_timeout; }; struct nullb_queue { From 0b8d7622ab1859bec082bd01c5e11137195f3d52 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 19 Apr 2022 08:31:55 +0900 Subject: [PATCH 40/72] aoe: Avoid flush_scheduled_work() usage Flushing system-wide workqueues is dangerous and will be forbidden. Replace system_wq with local aoe_wq. Link: https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/abb37616-eec9-2794-e21e-7c623085d987@I-love.SAKURA.ne.jp Signed-off-by: Jens Axboe --- drivers/block/aoe/aoe.h | 2 ++ drivers/block/aoe/aoeblk.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/block/aoe/aoedev.c | 4 ++-- drivers/block/aoe/aoemain.c | 10 +++++++++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 84d0fcebd6af..749ae1246f4c 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -244,3 +244,5 @@ void aoenet_exit(void); void aoenet_xmit(struct sk_buff_head *); int is_aoe_netif(struct net_device *ifp); int set_aoe_iflist(const char __user *str, size_t size); + +extern struct workqueue_struct *aoe_wq; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 8a91fcac6f82..348adf335217 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -435,7 +435,7 @@ aoeblk_gdalloc(void *vp) err: spin_lock_irqsave(&d->lock, flags); d->flags &= ~DEVFL_GD_NOW; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); spin_unlock_irqrestore(&d->lock, flags); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 384073ef2323..d7317425be51 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -968,7 +968,7 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) d->flags |= DEVFL_NEWSIZE; else d->flags |= DEVFL_GDALLOC; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); } static void diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index c5753c6bfe80..b381d1c3ef32 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -321,7 +321,7 @@ flush(const char __user *str, size_t cnt, int exiting) specified = 1; } - flush_scheduled_work(); + flush_workqueue(aoe_wq); /* pass one: do aoedev_downdev, which might sleep */ restart1: spin_lock_irqsave(&devlist_lock, flags); @@ -520,7 +520,7 @@ freetgt(struct aoedev *d, struct aoetgt *t) void aoedev_exit(void) { - flush_scheduled_work(); + flush_workqueue(aoe_wq); flush(NULL, 0, EXITING); } diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 1e4e2971171c..6238c4c87cfc 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -16,6 +16,7 @@ MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); MODULE_VERSION(VERSION); static struct timer_list timer; +struct workqueue_struct *aoe_wq; static void discover_timer(struct timer_list *t) { @@ -35,6 +36,7 @@ aoe_exit(void) aoechr_exit(); aoedev_exit(); aoeblk_exit(); /* free cache after de-allocating bufs */ + destroy_workqueue(aoe_wq); } static int __init @@ -42,9 +44,13 @@ aoe_init(void) { int ret; + aoe_wq = alloc_workqueue("aoe_wq", 0, 0); + if (!aoe_wq) + return -ENOMEM; + ret = aoedev_init(); if (ret) - return ret; + goto dev_fail; ret = aoechr_init(); if (ret) goto chr_fail; @@ -77,6 +83,8 @@ aoe_init(void) aoechr_exit(); chr_fail: aoedev_exit(); + dev_fail: + destroy_workqueue(aoe_wq); printk(KERN_INFO "aoe: initialisation failure.\n"); return ret; From 07c6e92a8478770a7302f7dde72f03a5465901bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:04 +0200 Subject: [PATCH 41/72] ubd: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to the discard granularity as done by ubd is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 085ffdf98e57..c4344b67628d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -799,7 +799,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) } if (ubd_dev->no_trim == 0) { ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE; - ubd_dev->queue->limits.discard_alignment = SECTOR_SIZE; blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); } From 4a04d517c56e0616c6f69afc226ee2691e543712 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:05 +0200 Subject: [PATCH 42/72] nbd: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to the discard granularity as done by nbd is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 526389351784..fd1501f81b34 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -333,7 +333,6 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = blksize; - nbd->disk->queue->limits.discard_alignment = blksize; blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); } blk_queue_logical_block_size(nbd->disk->queue, blksize); @@ -1316,7 +1315,6 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->tag_set.timeout = 0; nbd->disk->queue->limits.discard_granularity = 0; - nbd->disk->queue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(nbd->disk->queue, 0); mutex_unlock(&nbd->config_lock); @@ -1781,7 +1779,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; - disk->queue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); blk_queue_max_segments(disk->queue, USHRT_MAX); From fb749a87f4536d2fa86ea135ae4eff1072903438 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:06 +0200 Subject: [PATCH 43/72] null_blk: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to the discard granularity as done by null_blk is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5cb4c92cdffe..a521e914a984 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1765,7 +1765,6 @@ static void null_config_discard(struct nullb *nullb) } nullb->q->limits.discard_granularity = nullb->dev->blocksize; - nullb->q->limits.discard_alignment = nullb->dev->blocksize; blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); } From 62952cc5bccd89b76d710de1d0b43244af0f2903 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:07 +0200 Subject: [PATCH 44/72] virtio_blk: fix the discard_granularity and discard_alignment queue limits The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. On the other hand the discard_sector_alignment from the virtio 1.1 looks similar to what Linux uses as discard granularity (even if not very well described): "discard_sector_alignment can be used by OS when splitting a request based on alignment. " And at least qemu does set it to the discard granularity. So stop setting the discard_alignment and use the virtio discard_sector_alignment to set the discard granularity. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ccf15253dee..d624cc8eddc3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -867,11 +867,12 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_io_opt(q, blk_size * opt_io_size); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { - q->limits.discard_granularity = blk_size; - virtio_cread(vdev, struct virtio_blk_config, discard_sector_alignment, &v); - q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0; + if (v) + q->limits.discard_granularity = v << SECTOR_SHIFT; + else + q->limits.discard_granularity = blk_size; virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); From 44d583702f4429763c558624fac763650a1f05bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:08 +0200 Subject: [PATCH 45/72] dm-zoned: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to the discard granularity as done by dm-zoned is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-zoned-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index cac295cc8840..0ec5d8b9b1a4 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1001,7 +1001,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, DMZ_BLOCK_SIZE); blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); - limits->discard_alignment = DMZ_BLOCK_SIZE; + limits->discard_alignment = 0; limits->discard_granularity = DMZ_BLOCK_SIZE; limits->max_discard_sectors = chunk_sectors; limits->max_hw_discard_sectors = chunk_sectors; From 3d50d368c92ade2f98a3d0d28b842a57c35284e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:09 +0200 Subject: [PATCH 46/72] raid5: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to the discard granularity as done by raid5 is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 144ea077c2ed..39038fa8b1c8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7785,7 +7785,6 @@ static int raid5_run(struct mddev *mddev) */ stripe = stripe * PAGE_SIZE; stripe = roundup_pow_of_two(stripe); - mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; blk_queue_max_write_zeroes_sectors(mddev->queue, 0); From c3f765299632727fa5ea5a0acf118665227a4f1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:10 +0200 Subject: [PATCH 47/72] dasd: don't set the discard_alignment queue limit The discard_alignment queue limit is named a bit misleading means the offset into the block device at which the discard granularity starts. Setting it to PAGE_SIZE while the discard granularity is the block size that is smaller or the same as PAGE_SIZE as done by dasd is mostly harmless but also useless. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_fba.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 8bd5665db919..60be7f7bf2d1 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -782,7 +782,6 @@ static void dasd_fba_setup_blk_queue(struct dasd_block *block) blk_queue_segment_boundary(q, PAGE_SIZE - 1); q->limits.discard_granularity = logical_block_size; - q->limits.discard_alignment = PAGE_SIZE; /* Calculate max_discard_sectors and make it PAGE aligned */ max_bytes = USHRT_MAX * logical_block_size; From 4418bfd8fb9602d9cd8747c3ad52fdbaa02e2ffd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:11 +0200 Subject: [PATCH 48/72] loop: remove a spurious clear of discard_alignment The loop driver never sets a discard_alignment, so it also doens't need to clear it to zero. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 0c7f0367200c..322d92f958ea 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -798,7 +798,6 @@ static void loop_config_discard(struct loop_device *lo) blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); } - q->limits.discard_alignment = 0; } struct loop_worker { From 4e7f0ece41e1be8f876f320a0972a715daec0a50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:12 +0200 Subject: [PATCH 49/72] nvme: remove a spurious clear of discard_alignment The nvme driver never sets a discard_alignment, so it also doens't need to clear it to zero. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b9b0fbde97c8..76a9ccd5d064 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1628,7 +1628,6 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - queue->limits.discard_alignment = 0; queue->limits.discard_granularity = size; /* If discard is already enabled, don't reset queue limits */ From 18292faa89d2bff3bdd33ab9c065f45fb6710e47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:13 +0200 Subject: [PATCH 50/72] rnbd-srv: use bdev_discard_alignment Use bdev_discard_alignment to calculate the correct discard alignment offset even for partitions instead of just looking at the queue limit. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index d080a0de5922..4309e5252469 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -59,7 +59,7 @@ static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) { - return bdev_get_queue(dev->bdev)->limits.discard_alignment; + return bdev_discard_alignment(dev->bdev); } #endif /* RNBD_SRV_DEV_H */ From 0000f2f7205d88e0d97f8b47b2c8a98e86137708 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2022 06:53:14 +0200 Subject: [PATCH 51/72] xen-blkback: use bdev_discard_alignment Use bdev_discard_alignment to calculate the correct discard alignment offset even for partitions instead of just looking at the queue limit. Also switch to use bdev_discard_granularity to get rid of the last direct queue reference in xen_blkbk_discard. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220418045314.360785-12-hch@lst.de [axboe: fold in 'q' removal as it's now unused] Signed-off-by: Jens Axboe --- drivers/block/xen-blkback/xenbus.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index b21bffc9c50b..97de13b14175 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -575,7 +575,6 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info int err; int state = 0; struct block_device *bdev = be->blkif->vbd.bdev; - struct request_queue *q = bdev_get_queue(bdev); if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1)) return; @@ -583,14 +582,14 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info if (bdev_max_discard_sectors(bdev)) { err = xenbus_printf(xbt, dev->nodename, "discard-granularity", "%u", - q->limits.discard_granularity); + bdev_discard_granularity(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-granularity (%d)", err); return; } err = xenbus_printf(xbt, dev->nodename, "discard-alignment", "%u", - q->limits.discard_alignment); + bdev_discard_alignment(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-alignment (%d)", err); return; From 525323d25e8756f9c661f405208a7d333a1470c3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 20 Apr 2022 09:57:15 +0900 Subject: [PATCH 52/72] block: null_blk: Fix code style issues Fix message grammar and code style issues (brackets and indentation) in null_init(). Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220420005718.3780004-2-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index a521e914a984..55285519911f 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2111,19 +2111,21 @@ static int __init null_init(void) } if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); + pr_err("legacy IO path is no longer available\n"); return -EINVAL; } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); + nr_online_nodes); g_submit_queues = nr_online_nodes; } - } else if (g_submit_queues > nr_cpu_ids) + } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) + } else if (g_submit_queues <= 0) { g_submit_queues = 1; + } if (g_queue_mode == NULL_Q_MQ && shared_tags) { ret = null_init_tag_set(NULL, &tag_set); From b3a0a73e8a79eab6ec74139b505f4c6d6781aae9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 20 Apr 2022 09:57:16 +0900 Subject: [PATCH 53/72] block: null_blk: Cleanup device creation and deletion Introduce the null_create_dev() and null_destroy_dev() helper functions to respectivel create nullb devices on modprobe and destroy them on rmmod. The null_destroy_dev() helper avoids duplicated code in the null_init() and null_exit() functions for deleting devices. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220420005718.3780004-3-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 48 ++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 55285519911f..b0841f1d0832 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2086,12 +2086,37 @@ static int null_add_dev(struct nullb_device *dev) return rv; } +static int null_create_dev(void) +{ + struct nullb_device *dev; + int ret; + + dev = null_alloc_dev(); + if (!dev) + return -ENOMEM; + + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + return ret; + } + + return 0; +} + +static void null_destroy_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + + null_del_dev(nullb); + null_free_dev(dev); +} + static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; - struct nullb_device *dev; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); @@ -2149,16 +2174,9 @@ static int __init null_init(void) } for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; + ret = null_create_dev(); + if (ret) goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); - goto err_dev; - } } pr_info("module loaded\n"); @@ -2167,9 +2185,7 @@ static int __init null_init(void) err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: @@ -2190,12 +2206,8 @@ static void __exit null_exit(void) mutex_lock(&lock); while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } mutex_unlock(&lock); From db060f54e0c53af66d72aacd13a2550f1e24c90b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 20 Apr 2022 09:57:17 +0900 Subject: [PATCH 54/72] block: null_blk: Cleanup messages Use the pr_fmt() macro to prefix all null_blk pr_xxx() messages with "null_blk:" to clarify which module is printing the messages. Also add a pr_info() message in null_add_dev() to print the name of a newly created disk. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220420005718.3780004-4-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 5 +++++ drivers/block/null_blk/zoned.c | 7 +++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index b0841f1d0832..5de83b62cf7d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,6 +11,9 @@ #include #include "null_blk.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL @@ -2069,6 +2072,8 @@ static int null_add_dev(struct nullb_device *dev) list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); + pr_info("disk %s created\n", nullb->disk_name); + return 0; out_cleanup_zone: null_free_zoned_dev(dev); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index dae54dd1aeac..ed158ea4fdd1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -6,6 +6,9 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -75,8 +78,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_capacity = dev->zone_size; if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); + pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); return -EINVAL; } From 49c3b9266a718dbd73c42e004288b4bb2ea0ac0b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 20 Apr 2022 09:57:18 +0900 Subject: [PATCH 55/72] block: null_blk: Improve device creation with configfs Currently, the directory name used to create a nullb device through sysfs is not used as the device name, potentially causing headaches for users if devices are already created through the modprobe operation withe the nr_device module parameter not set to 0. E.g. a user can do "mkdir /sys/kernel/config/nullb/nullb0" to create a nullb device even though /dev/nullb0 was already created by modprobe. In this case, the configfs nullb device will be named nullb1, causing confusion for the user. Simplify this by using the configfs directory name as the nullb device name, always, unless another nullb device is already using the same name. E.g. if modprobe created nullb0, then: $ mkdir /sys/kernel/config/nullb/nullb0 mkdir: cannot create directory '/sys/kernel/config/nullb/nullb0': File exists will be reported to the user. To implement this, the function null_find_dev_by_name() is added to check for the existence of a nullb device with the name used for a new configfs device directory. nullb_group_make_item() uses this new function to check if the directory name can be used as the disk name. Finally, null_add_dev() is modified to use the device config item name as the disk name for a new nullb device created using configfs. The naming of devices created though modprobe remains unchanged. Of note is that it is possible for a user to create through configfs a nullb device with the same name as an existing device. E.g. $ mkdir /sys/kernel/config/nullb/null will successfully create the nullb device named "null" but this block device will however not appear under /dev/ since /dev/null already exists. Suggested-by: Joseph Bacik Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220420005718.3780004-5-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5de83b62cf7d..539cfeac263d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -235,6 +235,7 @@ static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); +static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) @@ -563,6 +564,9 @@ config_item *nullb_group_make_item(struct config_group *group, const char *name) { struct nullb_device *dev; + if (null_find_dev_by_name(name)) + return ERR_PTR(-EEXIST); + dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); @@ -2062,7 +2066,13 @@ static int null_add_dev(struct nullb_device *dev) null_config_discard(nullb); - sprintf(nullb->disk_name, "nullb%d", nullb->index); + if (config_item_name(&dev->item)) { + /* Use configfs dir name as the device name */ + snprintf(nullb->disk_name, sizeof(nullb->disk_name), + "%s", config_item_name(&dev->item)); + } else { + sprintf(nullb->disk_name, "nullb%d", nullb->index); + } rv = null_gendisk_register(nullb); if (rv) @@ -2091,6 +2101,22 @@ static int null_add_dev(struct nullb_device *dev) return rv; } +static struct nullb *null_find_dev_by_name(const char *name) +{ + struct nullb *nullb = NULL, *nb; + + mutex_lock(&lock); + list_for_each_entry(nb, &nullb_list, list) { + if (strcmp(nb->disk_name, name) == 0) { + nullb = nb; + break; + } + } + mutex_unlock(&lock); + + return nullb; +} + static int null_create_dev(void) { struct nullb_device *dev; From 754d96798fab1316f4f14bb86cf3c0244cb2b20b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Apr 2022 08:33:00 +0200 Subject: [PATCH 56/72] loop: remove loop.h Merge loop.h into loop.c as all the content is only used there. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220419063303.583106-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 59 +++++++++++++++++++++++++++++++++--- drivers/block/loop.h | 71 -------------------------------------------- 2 files changed, 55 insertions(+), 75 deletions(-) delete mode 100644 drivers/block/loop.h diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 322d92f958ea..533294a047e0 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -80,10 +79,62 @@ #include #include #include - -#include "loop.h" - #include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file * lo_backing_file; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; + +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH (128) diff --git a/drivers/block/loop.h b/drivers/block/loop.h deleted file mode 100644 index 449d562738c5..000000000000 --- a/drivers/block/loop.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - */ -#ifndef _LINUX_LOOP_H -#define _LINUX_LOOP_H - -#include -#include -#include -#include -#include -#include - -/* Possible states of device */ -enum { - Lo_unbound, - Lo_bound, - Lo_rundown, - Lo_deleting, -}; - -struct loop_func_table; - -struct loop_device { - int lo_number; - loff_t lo_offset; - loff_t lo_sizelimit; - int lo_flags; - char lo_file_name[LO_NAME_SIZE]; - - struct file * lo_backing_file; - struct block_device *lo_device; - - gfp_t old_gfp_mask; - - spinlock_t lo_lock; - int lo_state; - spinlock_t lo_work_lock; - struct workqueue_struct *workqueue; - struct work_struct rootcg_work; - struct list_head rootcg_cmd_list; - struct list_head idle_worker_list; - struct rb_root worker_tree; - struct timer_list timer; - bool use_dio; - bool sysfs_inited; - - struct request_queue *lo_queue; - struct blk_mq_tag_set tag_set; - struct gendisk *lo_disk; - struct mutex lo_mutex; - bool idr_visible; -}; - -struct loop_cmd { - struct list_head list_entry; - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - long ret; - struct kiocb iocb; - struct bio_vec *bvec; - struct cgroup_subsys_state *blkcg_css; - struct cgroup_subsys_state *memcg_css; -}; - -#endif From f21e6e185a3a95dedc0d604b468d40ff1dc71fd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Apr 2022 08:33:01 +0200 Subject: [PATCH 57/72] loop: add a SPDX header The copyright statement says: "Redistribution of this file is permitted under the GNU General Public License." and was added by Ted in 1993, at which point GPLv2 only was the default Linux license. Replace it with the usual GPLv2 only SPDX header. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220419063303.583106-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 533294a047e0..695b26e1a0dd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,10 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/block/loop.c * * Written by Theodore Ts'o, 3/29/93 * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. + * Copyright 1993 by Theodore Ts'o. * * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 From eb04bb154b76a0633afc5d26c1de7619a6686e9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Apr 2022 08:33:02 +0200 Subject: [PATCH 58/72] loop: remove most the top-of-file boilerplate comment Remove the irrelevant changelogs and todo notes and just leave the SPDX marker and the copyright notice. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220419063303.583106-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 47 -------------------------------------------- 1 file changed, 47 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 695b26e1a0dd..01b4e257016a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * * Copyright 1993 by Theodore Ts'o. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, , Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe , Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen , Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * */ - #include #include #include From c23d47abee3a54e4991ed3993340596d04aabd6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Apr 2022 08:33:03 +0200 Subject: [PATCH 59/72] loop: remove most the top-of-file boilerplate comment from the UAPI header Just leave the SPDX marker and the copyright notice and remove the irrelevant rest. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220419063303.583106-5-hch@lst.de Signed-off-by: Jens Axboe --- include/uapi/linux/loop.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 98e60801195e..6f63527dd2ed 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ /* - * include/linux/loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. + * Copyright 1993 by Theodore Ts'o. */ #ifndef _UAPI_LINUX_LOOP_H #define _UAPI_LINUX_LOOP_H From 1a86924e4f464757546d7f7bdc469be237918395 Mon Sep 17 00:00:00 2001 From: Tom Yan Date: Fri, 29 Apr 2022 12:52:43 +0800 Subject: [PATCH 60/72] nvme: fix interpretation of DMRSL DMRSLl is in the unit of logical blocks, while max_discard_sectors is in the unit of "linux sector". Signed-off-by: Tom Yan Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++++-- drivers/nvme/host/nvme.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 76a9ccd5d064..f41b2b18fad9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1634,6 +1634,9 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) if (queue->limits.max_discard_sectors) return; + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX)) + ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl); + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); @@ -2893,8 +2896,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) if (id->dmrl) ctrl->max_discard_segments = id->dmrl; - if (id->dmrsl) - ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + ctrl->dmrsl = le32_to_cpu(id->dmrsl); if (id->wzsl) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2b53ca63335..81c4f5379c0c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -284,6 +284,7 @@ struct nvme_ctrl { #endif u16 crdt[3]; u16 oncs; + u32 dmrsl; u16 oacs; u16 sqsize; u32 max_namespaces; From 52fde2c07da606f3f120af4f734eadcfb52b04be Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 4 May 2022 11:43:25 -0700 Subject: [PATCH 61/72] nvme: set dma alignment to dword The nvme specification only requires qword alignment for segment descriptors, and the driver already guarantees that. The spec has always allowed user data to be dword aligned, which is what the queue's attribute is for, so relax the alignment requirement to that value. While we could allow byte alignment for some controllers when using SGLs, we still need to support PRP, and that only allows dword. Fixes: 3b2a1ebceba3 ("nvme: set dma alignment to qword") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f41b2b18fad9..9a6fb071d986 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1773,7 +1773,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 7); + blk_queue_dma_alignment(q, 3); blk_queue_write_cache(q, vwc, vwc); } From ca2d89925ae3f3d5c65182ff75e58bc9b484e69c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 Apr 2022 12:19:35 +0300 Subject: [PATCH 62/72] nvme: add missing status values to verbose logging Log a few more path related status codes. Signed-off-by: Max Gurtovoy Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/constants.c | 3 +++ include/linux/nvme.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 7d49eb34b348..465aee42fced 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -155,10 +155,13 @@ static const char * const nvme_statuses[] = { [NVME_SC_COMPARE_FAILED] = "Compare Failure", [NVME_SC_ACCESS_DENIED] = "Access Denied", [NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block", + [NVME_SC_INTERNAL_PATH_ERROR] = "Internal Pathing Error", [NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss", [NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible", [NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition", + [NVME_SC_CTRL_PATH_ERROR] = "Controller Pathing Error", [NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error", + [NVME_SC_HOST_ABORTED_CMD] = "Host Aborted Command", }; const unsigned char *nvme_get_error_status_str(u16 status) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f626a445d1a8..bbabdc7600da 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1679,9 +1679,11 @@ enum { /* * Path-related Errors: */ + NVME_SC_INTERNAL_PATH_ERROR = 0x300, NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_CTRL_PATH_ERROR = 0x360, NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_HOST_ABORTED_CMD = 0x371, From da3340e77eeb4ced79784eaadbcc529e1ecef673 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 Apr 2022 12:15:24 +0300 Subject: [PATCH 63/72] nvme: remove unneeded include from constants file No usage of blkdev.h elements. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/constants.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 465aee42fced..1dd1d78de295 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -4,7 +4,6 @@ * Copyright (c) 2022, Oracle and/or its affiliates */ -#include #include "nvme.h" #ifdef CONFIG_NVME_VERBOSE_ERRORS From 128126a7943622424350752a71be5bb95e7946db Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 19 Apr 2022 15:53:51 -0700 Subject: [PATCH 64/72] nvme: mark internal passthru request RQF_QUIET Most of the internal passthru commands use __nvme_submit_sync_cmd() interface. There are few places we open code the request submission :- 1. nvme_keep_alive_work(struct work_struct *work) 2. nvme_timeout(struct request *req, bool reserved) 3. nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) Mark the internal passthru request quiet so that we can skip the verbose error message from nvme_log_error() in nvme_end_req() completion path, this will be consistent with what we have in __nvme_submit_sync_cmd(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Alan Adamson Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/pci.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9a6fb071d986..42f9772abc4d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1207,6 +1207,7 @@ static void nvme_keep_alive_work(struct work_struct *work) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; + rq->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3aacf1c0d5a5..f326261456ac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1439,6 +1439,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_init_request(abort_req, &cmd); abort_req->end_io_data = NULL; + abort_req->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(abort_req, false, abort_endio); /* @@ -2486,6 +2487,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); + req->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; From da42761181627e9bdc37d18368b827948a583929 Mon Sep 17 00:00:00 2001 From: "Smith, Kyle Miller (Nimble Kernel)" Date: Fri, 22 Apr 2022 14:40:32 +0000 Subject: [PATCH 65/72] nvme-pci: fix a NULL pointer dereference in nvme_alloc_admin_tags In nvme_alloc_admin_tags, the admin_q can be set to an error (typically -ENOMEM) if the blk_mq_init_queue call fails to set up the queue, which is checked immediately after the call. However, when we return the error message up the stack, to nvme_reset_work the error takes us to nvme_remove_dead_ctrl() nvme_dev_disable() nvme_suspend_queue(&dev->queues[0]). Here, we only check that the admin_q is non-NULL, rather than not an error or NULL, and begin quiescing a queue that never existed, leading to bad / NULL pointer dereference. Signed-off-by: Kyle Smith Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f326261456ac..e951bd9b8159 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1776,6 +1776,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); + dev->ctrl.admin_q = NULL; return -ENOMEM; } if (!blk_get_queue(dev->ctrl.admin_q)) { From b98235d3a471e121376bfabce27380dde5add1d9 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Fri, 6 May 2022 12:15:34 +0200 Subject: [PATCH 66/72] nvme-pci: harden drive presence detect in nvme_dev_disable() On our ZynqMP system we observe, that a NVMe drive that resets itself while doing a firmware update causes a Kernel crash like this: [ 67.720772] pcieport 0000:02:02.0: pciehp: Slot(2): Link Down [ 67.720783] pcieport 0000:02:02.0: pciehp: Slot(2): Card not present [ 67.720795] nvme 0000:04:00.0: PME# disabled [ 67.720849] Internal error: synchronous external abort: 96000010 [#1] PREEMPT SMP [ 67.720853] nwl-pcie fd0e0000.pcie: Slave error Analysis: When nvme_dev_disable() is called because of this PCIe hotplug event, pci_is_enabled() is still true. And accessing the NVMe drive which is currently not available as it's in reboot process causes this "synchronous external abort" on this ARM64 platform. This patch adds the pci_device_is_present() check as well, which returns false in this "Card not present" hot-plug case. With this change, the NVMe driver does not try to access the NVMe registers any more and the FW update finishes without any problems. Signed-off-by: Stefan Roese Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e951bd9b8159..5a98a7de0964 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2678,7 +2678,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { + if (pci_device_is_present(pdev) && pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || From 93ba75c90524618ef2c20979b0e660b9d071f0e6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Mar 2022 02:40:32 -0700 Subject: [PATCH 67/72] nvme-fabrics: add a request timeout helper The RDAMA and TCP transport both complete the timed out request in the same manner and hence code is duplicated. Add and use the helper nvmf_complete_timed_out_request() to remove the duplicate code. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 8 ++++++++ drivers/nvme/host/rdma.c | 5 +---- drivers/nvme/host/tcp.c | 5 +---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 1e3a09cad961..46d6e194ac2b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -187,6 +187,14 @@ static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) return ctrl->subsys->subnqn; } +static inline void nvmf_complete_timed_out_request(struct request *rq) +{ + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d9f19d901313..b87c8ae41d9b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2010,10 +2010,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq) struct nvme_rdma_queue *queue = req->queue; nvme_rdma_stop_queue(queue); - if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { - nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_complete_request(rq); - } + nvmf_complete_timed_out_request(rq); } static enum blk_eh_timer_return diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index ad3a2bf2f1e9..bb67538d241b 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2318,10 +2318,7 @@ static void nvme_tcp_complete_timed_out(struct request *rq) struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); - if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { - nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_complete_request(rq); - } + nvmf_complete_timed_out_request(rq); } static enum blk_eh_timer_return From 491bf8f236fdeec698fa6744993f1ecf3fafd1a5 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 22 Mar 2022 16:06:39 +0800 Subject: [PATCH 68/72] nbd: Fix hung on disconnect request if socket is closed before When userspace closes the socket before sending a disconnect request, the following I/O requests will be blocked in wait_for_reconnect() until dead timeout. This will cause the following disconnect request also hung on blk_mq_quiesce_queue(). That means we have no way to disconnect a nbd device if there are some I/O requests waiting for reconnecting until dead timeout. It's not expected. So let's wake up the thread waiting for reconnecting directly when a disconnect request is sent. Reported-by: Xu Jianhai Signed-off-by: Xie Yongji Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220322080639.142-1-xieyongji@bytedance.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index fd1501f81b34..ac8b045c777c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -946,11 +946,15 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) + + if (!wait_event_timeout(config->conn_wait, + test_bit(NBD_RT_DISCONNECTED, + &config->runtime_flags) || + atomic_read(&config->live_connections) > 0, + config->dead_conn_timeout)) return 0; - return wait_event_timeout(config->conn_wait, - atomic_read(&config->live_connections) > 0, - config->dead_conn_timeout) > 0; + + return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); } static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) @@ -2076,6 +2080,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); sock_shutdown(nbd); + wake_up(&nbd->config->conn_wait); /* * Make sure recv thread has finished, we can safely call nbd_clear_que() * to cancel the inflight I/Os. From e626f37e657adbab2a7abe51480925891662a5f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 May 2022 14:29:43 +0200 Subject: [PATCH 69/72] nvme: split the enum used for various register constants Instead of having one big enum add one for each register or field. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- include/linux/nvme.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bbabdc7600da..5f6d432fa06a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -204,8 +204,9 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, - NVME_CAP_CSS_NVM = 1 << 0, - NVME_CAP_CSS_CSI = 1 << 6, +}; + +enum { NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -214,10 +215,18 @@ enum { NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, NVME_CSTS_SHST_MASK = 3 << 2, +}; + +enum { NVME_CMBMSC_CRE = 1 << 0, NVME_CMBMSC_CMSE = 1 << 1, }; +enum { + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, +}; + struct nvme_id_power_state { __le16 max_power; /* centiwatts */ __u8 rsvd2; From 354201c53e61e493017b15327294b0c8ab522d69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 May 2022 15:09:21 +0200 Subject: [PATCH 70/72] nvme: add support for TP4084 - Time-to-Ready Enhancements Add support for using longer timeouts during controller initialization and letting the controller come up with namespaces that are not ready for I/O yet. We skip these not ready namespaces during scanning and only bring them online once anoter scan is kicked off by the AEN that is set when the NRDY bit gets set in the I/O Command Set Independent Identify Namespace Data Structure. This asynchronous probing avoids blocking the kernel boot when controllers take a very long time to recover after unclean shutdowns (up to minutes). Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke --- drivers/nvme/host/constants.c | 1 + drivers/nvme/host/core.c | 76 ++++++++++++++++++++++++++++++++--- include/linux/nvme.h | 31 ++++++++++++++ 3 files changed, 102 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 1dd1d78de295..4910543f00ff 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -91,6 +91,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected", [NVME_SC_CMD_INTERRUPTED] = "Command Interrupted", [NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error", + [NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY] = "Admin Command Media Not Ready", [NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set", [NVME_SC_LBA_RANGE] = "LBA Out of Range", [NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 42f9772abc4d..faeb03271913 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1427,6 +1427,32 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, return error; } +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_id_ns_cs_indep **id) +{ + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.nsid = cpu_to_le32(nsid), + .identify.cns = NVME_ID_CNS_NS_CS_INDEP, + }; + int ret; + + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); + if (ret) { + dev_warn(ctrl->device, + "Identify namespace (CS independent) failed (%d)\n", + ret); + kfree(*id); + return ret; + } + + return 0; +} + static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, u32 *result) { @@ -2103,10 +2129,9 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) { - unsigned long timeout = - ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies; u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; int ret; @@ -2119,7 +2144,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) usleep_range(1000, 2000); if (fatal_signal_pending(current)) return -EINTR; - if (time_after(jiffies, timeout)) { + if (time_after(jiffies, timeout_jiffies)) { dev_err(ctrl->device, "Device not ready; aborting %s, CSTS=0x%x\n", enabled ? "initialisation" : "reset", csts); @@ -2150,13 +2175,14 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, ctrl->cap, false); + return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { unsigned dev_page_min; + u32 timeout; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2177,6 +2203,27 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->ctrl_config = NVME_CC_CSS_CSI; else ctrl->ctrl_config = NVME_CC_CSS_NVM; + + if (ctrl->cap & NVME_CAP_CRMS_CRWMS) { + u32 crto; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto); + if (ret) { + dev_err(ctrl->device, "Reading CRTO failed (%d)\n", + ret); + return ret; + } + + if (ctrl->cap & NVME_CAP_CRMS_CRIMS) { + ctrl->ctrl_config |= NVME_CC_CRIME; + timeout = NVME_CRTO_CRIMT(crto); + } else { + timeout = NVME_CRTO_CRWMT(crto); + } + } else { + timeout = NVME_CAP_TIMEOUT(ctrl->cap); + } + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -2185,7 +2232,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, ctrl->cap, true); + return nvme_wait_ready(ctrl, timeout, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -4092,11 +4139,26 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_ids ids = { }; + struct nvme_id_ns_cs_indep *id; struct nvme_ns *ns; + bool ready = true; if (nvme_identify_ns_descs(ctrl, nsid, &ids)) return; + /* + * Check if the namespace is ready. If not ignore it, we will get an + * AEN once it becomes ready and restart the scan. + */ + if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) && + !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) { + ready = id->nstat & NVME_NSTAT_NRDY; + kfree(id); + } + + if (!ready) + return; + ns = nvme_find_get_ns(ctrl, nsid); if (ns) { nvme_validate_ns(ns, &ids); @@ -4841,6 +4903,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) != + NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5f6d432fa06a..29ec3e3481ff 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -137,6 +137,7 @@ enum { NVME_REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory * Space Control */ + NVME_REG_CRTO = 0x0068, /* Controller Ready Timeouts */ NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ @@ -161,6 +162,9 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CRTO_CRIMT(crto) ((crto) >> 16) +#define NVME_CRTO_CRWMT(crto) ((crto) & 0xffff) + enum { NVME_CMBSZ_SQS = 1 << 0, NVME_CMBSZ_CQS = 1 << 1, @@ -204,6 +208,7 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CC_CRIME = 1 << 24, }; enum { @@ -227,6 +232,11 @@ enum { NVME_CAP_CSS_CSI = 1 << 6, }; +enum { + NVME_CAP_CRMS_CRIMS = 1ULL << 59, + NVME_CAP_CRMS_CRWMS = 1ULL << 60, +}; + struct nvme_id_power_state { __le16 max_power; /* centiwatts */ __u8 rsvd2; @@ -414,6 +424,21 @@ struct nvme_id_ns { __u8 vs[3712]; }; +/* I/O Command Set Independent Identify Namespace Data Structure */ +struct nvme_id_ns_cs_indep { + __u8 nsfeat; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __le32 anagrpid; + __u8 nsattr; + __u8 rsvd9; + __le16 nvmsetid; + __le16 endgid; + __u8 nstat; + __u8 rsvd15[4081]; +}; + struct nvme_zns_lbafe { __le64 zsze; __u8 zdes; @@ -478,6 +503,7 @@ enum { NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_CS_NS = 0x05, NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_NS_CS_INDEP = 0x08, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -531,6 +557,10 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +enum { + NVME_NSTAT_NRDY = 1 << 0, +}; + enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, @@ -1592,6 +1622,7 @@ enum { NVME_SC_NS_WRITE_PROTECTED = 0x20, NVME_SC_CMD_INTERRUPTED = 0x21, NVME_SC_TRANSIENT_TR_ERR = 0x22, + NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24, NVME_SC_INVALID_IO_CMD_SET = 0x2C, NVME_SC_LBA_RANGE = 0x80, From 78288665b5d0154978fed431985310cb4f166836 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 18 May 2022 08:51:38 -0700 Subject: [PATCH 71/72] nvme: set non-mdts limits in nvme_scan_work In current implementation we set the non-mdts limits by calling nvme_init_non_mdts_limits() from nvme_init_ctrl_finish(). This also tries to set the limits for the discovery controller which has no I/O queues resulting in the warning message reported by the nvme_log_error() when running blktest nvme/002: - [ 2005.155946] run blktests nvme/002 at 2022-04-09 16:57:47 [ 2005.192223] loop: module loaded [ 2005.196429] nvmet: adding nsid 1 to subsystem blktests-subsystem-0 [ 2005.200334] nvmet: adding nsid 1 to subsystem blktests-subsystem-1 <------------------------------SNIP----------------------------------> [ 2008.958108] nvmet: adding nsid 1 to subsystem blktests-subsystem-997 [ 2008.962082] nvmet: adding nsid 1 to subsystem blktests-subsystem-998 [ 2008.966102] nvmet: adding nsid 1 to subsystem blktests-subsystem-999 [ 2008.973132] nvmet: creating discovery controller 1 for subsystem nqn.2014-08.org.nvmexpress.discovery for NQN testhostnqn. *[ 2008.973196] nvme1: Identify(0x6), Invalid Field in Command (sct 0x0 / sc 0x2) MORE DNR* [ 2008.974595] nvme nvme1: new ctrl: "nqn.2014-08.org.nvmexpress.discovery" [ 2009.103248] nvme nvme1: Removing ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery" Move the call of nvme_init_non_mdts_limits() to nvme_scan_work() after we verify that I/O queues are created since that is a converging point for each transport where these limits are actually used. 1. FC : nvme_fc_create_association() ... nvme_fc_create_io_queues(ctrl); ... nvme_start_ctrl() nvme_scan_queue() nvme_scan_work() 2. PCIe:- nvme_reset_work() ... nvme_setup_io_queues() nvme_create_io_queues() nvme_alloc_queue() ... nvme_start_ctrl() nvme_scan_queue() nvme_scan_work() 3. RDMA :- nvme_rdma_setup_ctrl ... nvme_rdma_configure_io_queues ... nvme_start_ctrl() nvme_scan_queue() nvme_scan_work() 4. TCP :- nvme_tcp_setup_ctrl ... nvme_tcp_configure_io_queues ... nvme_start_ctrl() nvme_scan_queue() nvme_scan_work() * nvme_scan_work() ... nvme_validate_or_alloc_ns() nvme_alloc_ns() nvme_update_ns_info() nvme_update_disk_info() nvme_config_discard() <--- blk_queue_max_write_zeroes_sectors() <--- Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index faeb03271913..a7660e4be55b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3129,10 +3129,6 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret) return ret; - ret = nvme_init_non_mdts_limits(ctrl); - if (ret < 0) - return ret; - ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; @@ -4301,11 +4297,26 @@ static void nvme_scan_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, scan_work); + int ret; /* No tagset on a live ctrl means IO queues could not created */ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) return; + /* + * Identify controller limits can change at controller reset due to + * new firmware download, even though it is not common we cannot ignore + * such scenario. Controller's non-mdts limits are reported in the unit + * of logical blocks that is dependent on the format of attached + * namespace. Hence re-read the limits at the time of ns allocation. + */ + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) { + dev_warn(ctrl->device, + "reading non-mdts-limits failed: %d\n", ret); + return; + } + if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { dev_info(ctrl->device, "rescanning namespaces.\n"); nvme_clear_changed_ns_log(ctrl); From 537b9f2bf60f4bbd8ab89cea16aaab70f0c1560d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:38 +0200 Subject: [PATCH 72/72] mtip32xx: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20220521111145.81697-28-Julia.Lawall@inria.fr Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4fbaf0b4958b..27386a572ba4 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2729,7 +2729,7 @@ static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ + /* Allocate dma memory for RX Fis, Identify, and Sector Buffer */ port->block1 = dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, &port->block1_dma, GFP_KERNEL);