diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
new file mode 100644
index 000000000000..8dd3e84a8aad
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-block
@@ -0,0 +1,676 @@
+What:		/sys/block/<disk>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system). This parameter
+		indicates how many bytes the beginning of the device is
+		offset from the disk's natural alignment.
+
+
+What:		/sys/block/<disk>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		device is offset from the internal allocation unit's
+		natural alignment.
+
+
+What:		/sys/block/<disk>/diskseq
+Date:		February 2021
+Contact:	Matteo Croce
+Description:
+		The /sys/block/<disk>/diskseq file reports the disk
+		sequence number, which is a monotonically increasing
+		number assigned to every drive.
+		Some devices, like the loop device, refresh this number
+		every time the backing file is changed.
+		The value type is a 64-bit unsigned integer.
+
+
+What:		/sys/block/<disk>/inflight
+Date:		October 2009
+Contact:	Jens Axboe, Nikanth Karthikesan
+Description:
+		Reports the number of I/O requests currently in progress
+		(pending / in flight) in a device driver. This can be less
+		than the number of requests queued in the block device queue.
+		The report contains 2 fields: one for read requests
+		and one for write requests.
+		The value type is unsigned int.
+		Cf. Documentation/block/stat.rst, which contains a single
+		value for requests in flight.
+		This is related to /sys/block/<disk>/queue/nr_requests
+		and, for SCSI devices, also their queue_depth.
+
+
+What:		/sys/block/<disk>/integrity/device_is_integrity_capable
+Date:		July 2014
+Contact:	Martin K. Petersen
+Description:
+		Indicates whether a storage device is capable of storing
+		integrity metadata. Set if the device is T10 PI-capable.
+
+
+What:		/sys/block/<disk>/integrity/format
+Date:		June 2008
+Contact:	Martin K. Petersen
+Description:
+		Metadata format for integrity capable block device.
+		E.g. T10-DIF-TYPE1-CRC.
+
+
+What:		/sys/block/<disk>/integrity/protection_interval_bytes
+Date:		July 2015
+Contact:	Martin K. Petersen
+Description:
+		Describes the number of data bytes which are protected
+		by one integrity tuple. Typically the device's logical
+		block size.
+
+
+What:		/sys/block/<disk>/integrity/read_verify
+Date:		June 2008
+Contact:	Martin K. Petersen
+Description:
+		Indicates whether the block layer should verify the
+		integrity of read requests serviced by devices that
+		support sending integrity metadata.
+
+
+What:		/sys/block/<disk>/integrity/tag_size
+Date:		June 2008
+Contact:	Martin K. Petersen
+Description:
+		Number of bytes of integrity tag space available per
+		512 bytes of data.
+
+
+What:		/sys/block/<disk>/integrity/write_generate
+Date:		June 2008
+Contact:	Martin K. Petersen
+Description:
+		Indicates whether the block layer should automatically
+		generate checksums for write requests bound for
+		devices that support receiving integrity metadata.
+
+
+What:		/sys/block/<disk>/<partition>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system). This parameter
+		indicates how many bytes the beginning of the partition
+		is offset from the disk's natural alignment.
+
+
+What:		/sys/block/<disk>/<partition>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		partition is offset from the internal allocation unit's
+		natural alignment.
+
+
+What:		/sys/block/<disk>/<partition>/stat
+Date:		February 2008
+Contact:	Jerome Marchand
+Description:
+		The /sys/block/<disk>/<partition>/stat file displays the
+		I/O statistics of partition <partition>. The format is the
+		same as the format of /sys/block/<disk>/stat.
+
+
+What:		/sys/block/<disk>/queue/add_random
+Date:		June 2010
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This file allows one to turn off the disk entropy
+		contribution. The default value of this file is '1' (on).
+
+
+What:		/sys/block/<disk>/queue/chunk_sectors
+Date:		September 2016
+Contact:	Hannes Reinecke
+Description:
+		[RO] chunk_sectors has a different meaning depending on the
+		type of the disk. For a RAID device (dm-raid), chunk_sectors
+		indicates the size in 512B sectors of the RAID volume stripe
+		segment. For a zoned block device, either host-aware or
+		host-managed, chunk_sectors indicates the size in 512B sectors
+		of the zones of the device, with the possible exception of the
+		last zone of the device, which may be smaller.
+
+
+What:		/sys/block/<disk>/queue/dax
+Date:		June 2016
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This file indicates whether the device supports Direct
+		Access (DAX), used by CPU-addressable storage to bypass the
+		pagecache. It shows '1' if true, '0' if not.
+
+
+What:		/sys/block/<disk>/queue/discard_granularity
+Date:		May 2011
+Contact:	Martin K. Petersen
+Description:
+		[RO] Devices that support discard functionality may internally
+		allocate space using units that are bigger than the logical
+		block size. The discard_granularity parameter indicates the
+		size of the internal allocation unit in bytes if reported by
+		the device. Otherwise the discard_granularity will be set to
+		match the device's physical block size. A discard_granularity
+		of 0 means that the device does not support discard
+		functionality.
+
+
+What:		/sys/block/<disk>/queue/discard_max_bytes
+Date:		May 2011
+Contact:	Martin K. Petersen
+Description:
+		[RW] While discard_max_hw_bytes is the hardware limit for the
+		device, this setting is the software limit. Some devices
+		exhibit large latencies when large discards are issued;
+		setting this value lower will make Linux issue smaller
+		discards and potentially help reduce latencies induced by
+		large discard operations.
+
+
+What:		/sys/block/<disk>/queue/discard_max_hw_bytes
+Date:		July 2015
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Devices that support discard functionality may have
+		internal limits on the number of bytes that can be trimmed or
+		unmapped in a single operation. The `discard_max_hw_bytes`
+		parameter is set by the device driver to the maximum number of
+		bytes that can be discarded in a single operation. Discard
+		requests issued to the device must not exceed this limit.
+		A `discard_max_hw_bytes` value of 0 means that the device does
+		not support discard functionality.
+
+
+What:		/sys/block/<disk>/queue/discard_zeroes_data
+Date:		May 2011
+Contact:	Martin K. Petersen
+Description:
+		[RO] Will always return 0. Don't rely on any specific behavior
+		for discards, and don't read this file.
+
+
+What:		/sys/block/<disk>/queue/fua
+Date:		May 2018
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Whether or not the block driver supports the FUA flag for
+		write requests. FUA stands for Force Unit Access. If the FUA
+		flag is set, write requests must bypass the volatile cache of
+		the storage device.
+
+
+What:		/sys/block/<disk>/queue/hw_sector_size
+Date:		January 2008
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This is the hardware sector size of the device, in bytes.
+
+
+What:		/sys/block/<disk>/queue/independent_access_ranges/
+Date:		October 2021
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] The presence of this sub-directory of the
+		/sys/block/xxx/queue/ directory indicates that the device is
+		capable of executing requests targeting different sector
+		ranges in parallel. For instance, single LUN multi-actuator
+		hard-disks will have an independent_access_ranges directory if
+		the device correctly advertises the sector ranges of its
+		actuators.
+
+		The independent_access_ranges directory contains one directory
+		per access range, with each range described using the sector
+		(RO) attribute file to indicate the first sector of the range
+		and the nr_sectors (RO) attribute file to indicate the total
+		number of sectors in the range starting from the first sector
+		of the range. For example, a dual-actuator hard-disk will have
+		the following independent_access_ranges entries::
+
+			$ tree /sys/block/<disk>/queue/independent_access_ranges/
+			/sys/block/<disk>/queue/independent_access_ranges/
+			|-- 0
+			|   |-- nr_sectors
+			|   `-- sector
+			`-- 1
+			    |-- nr_sectors
+			    `-- sector
+
+		The sector and nr_sectors attributes use the 512B sector unit,
+		regardless of the actual block size of the device. Independent
+		access ranges do not overlap and include all sectors within the
+		device capacity. The access ranges are numbered in increasing
+		order of the range start sector, that is, the sector attribute
+		of range 0 always has the value 0.
+
+
+What:		/sys/block/<disk>/queue/io_poll
+Date:		November 2015
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] When read, this file shows whether polling is enabled (1)
+		or disabled (0). Writing '0' to this file will disable polling
+		for this device. Writing any non-zero value will enable this
+		feature.
+
+
+What:		/sys/block/<disk>/queue/io_poll_delay
+Date:		November 2016
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] If polling is enabled, this controls what kind of polling
+		will be performed. It defaults to -1, which is classic polling.
+		In this mode, the CPU will repeatedly ask for completions
+		without giving up any time. If set to 0, a hybrid polling mode
+		is used, where the kernel will attempt to make an educated
+		guess at when the IO will complete. Based on this guess, the
+		kernel will put the process issuing IO to sleep for an amount
+		of time, before entering a classic poll loop. This mode might
+		be a little slower than pure classic polling, but it will be
+		more efficient. If set to a value larger than 0, the kernel
+		will put the process issuing IO to sleep for this number of
+		microseconds before entering classic polling.
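A minimal userspace sketch (not part of the ABI file itself) may clarify what
the two polling attributes above govern: a read submitted with preadv2() and
the RWF_HIPRI flag completes through the polled path when io_poll is enabled
and the underlying driver supports polling. The device path and the thin
error handling here are illustrative assumptions::

	/* Hedged sketch: issue one polled 4KB read. Assumes an NVMe
	 * namespace at /dev/nvme0n1 whose queue has io_poll enabled. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		struct iovec iov;
		int fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);

		/* O_DIRECT requires an aligned buffer. */
		if (fd < 0 || posix_memalign(&buf, 4096, 4096))
			return 1;
		iov.iov_base = buf;
		iov.iov_len = 4096;
		/* RWF_HIPRI asks the kernel to busy-poll for completion
		 * instead of sleeping on an interrupt. */
		if (preadv2(fd, &iov, 1, 0, RWF_HIPRI) < 0)
			perror("preadv2");
		free(buf);
		close(fd);
		return 0;
	}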
+
+
+What:		/sys/block/<disk>/queue/io_timeout
+Date:		November 2018
+Contact:	Weiping Zhang
+Description:
+		[RW] io_timeout is the request timeout in milliseconds. If a
+		request does not complete in this time then the block driver
+		timeout handler is invoked. That timeout handler can decide to
+		retry the request, to fail it or to start a device recovery
+		strategy.
+
+
+What:		/sys/block/<disk>/queue/iostats
+Date:		January 2009
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This file is used to control (on/off) the iostats
+		accounting of the disk.
+
+
+What:		/sys/block/<disk>/queue/logical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen
+Description:
+		[RO] This is the smallest unit the storage device can address.
+		It is typically 512 bytes.
+
+
+What:		/sys/block/<disk>/queue/max_active_zones
+Date:		July 2020
+Contact:	Niklas Cassel
+Description:
+		[RO] For zoned block devices (zoned attribute indicating
+		"host-managed" or "host-aware"), the sum of zones belonging to
+		any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
+		is limited by this value. If this value is 0, there is no
+		limit.
+
+		If the host attempts to exceed this limit, the driver should
+		report this error with BLK_STS_ZONE_ACTIVE_RESOURCE, which user
+		space may see as the EOVERFLOW errno.
+
+
+What:		/sys/block/<disk>/queue/max_discard_segments
+Date:		February 2017
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] The maximum number of DMA scatter/gather entries in a
+		discard request.
+
+
+What:		/sys/block/<disk>/queue/max_hw_sectors_kb
+Date:		September 2004
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This is the maximum number of kilobytes supported in a
+		single data transfer.
+
+
+What:		/sys/block/<disk>/queue/max_integrity_segments
+Date:		September 2010
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Maximum number of elements in a DMA scatter/gather list
+		with integrity data that will be submitted by the block layer
+		core to the associated block driver.
+
+
+What:		/sys/block/<disk>/queue/max_open_zones
+Date:		July 2020
+Contact:	Niklas Cassel
+Description:
+		[RO] For zoned block devices (zoned attribute indicating
+		"host-managed" or "host-aware"), the sum of zones belonging to
+		any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, is
+		limited by this value. If this value is 0, there is no limit.
+
+
+What:		/sys/block/<disk>/queue/max_sectors_kb
+Date:		September 2004
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This is the maximum number of kilobytes that the block
+		layer will allow for a filesystem request. Must be smaller than
+		or equal to the maximum size allowed by the hardware.
+
+
+What:		/sys/block/<disk>/queue/max_segment_size
+Date:		March 2010
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Maximum size in bytes of a single element in a DMA
+		scatter/gather list.
+
+
+What:		/sys/block/<disk>/queue/max_segments
+Date:		March 2010
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Maximum number of elements in a DMA scatter/gather list
+		that is submitted to the associated block driver.
+
+
+What:		/sys/block/<disk>/queue/minimum_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen
+Description:
+		[RO] Storage devices may report a granularity or preferred
+		minimum I/O size which is the smallest request the device can
+		perform without incurring a performance penalty. For disk
+		drives this is often the physical block size. For RAID arrays
+		it is often the stripe chunk size. A properly aligned multiple
+		of minimum_io_size is the preferred request size for workloads
+		where a high number of I/O operations is desired.
+
+
+What:		/sys/block/<disk>/queue/nomerges
+Date:		January 2010
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] Standard I/O elevator operations include attempts to merge
+		contiguous I/Os. For known random I/O loads these attempts will
+		always fail and result in extra cycles being spent in the
+		kernel. This allows one to turn off this behavior in one of two
+		ways: When set to 1, complex merge checks are disabled, but the
+		simple one-shot merges with the previous I/O request are
+		enabled. When set to 2, all merge tries are disabled. The
+		default value is 0, which enables all types of merge tries.
+
+
+What:		/sys/block/<disk>/queue/nr_requests
+Date:		July 2003
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This controls how many requests may be allocated in the
+		block layer for read or write requests. Note that the total
+		allocated number may be twice this amount, since it applies
+		only to reads or writes (not the accumulated sum).
+
+		To avoid priority inversion through request starvation, a
+		request queue maintains a separate request pool per cgroup when
+		CONFIG_BLK_CGROUP is enabled, and this parameter applies to
+		each such per-block-cgroup request pool. IOW, if there are N
+		block cgroups, each request queue may have up to N request
+		pools, each independently regulated by nr_requests.
+
+
+What:		/sys/block/<disk>/queue/nr_zones
+Date:		November 2018
+Contact:	Damien Le Moal
+Description:
+		[RO] nr_zones indicates the total number of zones of a zoned
+		block device ("host-aware" or "host-managed" zone model). For
+		regular block devices, the value is always 0.
+
+
+What:		/sys/block/<disk>/queue/optimal_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen
+Description:
+		[RO] Storage devices may report an optimal I/O size, which is
+		the device's preferred unit for sustained I/O. This is rarely
+		reported for disk drives. For RAID arrays it is usually the
+		stripe width or the internal track size. A properly aligned
+		multiple of optimal_io_size is the preferred request size for
+		workloads where sustained throughput is desired. If no optimal
+		I/O size is reported this file contains 0.
+
+
+What:		/sys/block/<disk>/queue/physical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen
+Description:
+		[RO] This is the smallest unit a physical storage device can
+		write atomically. It is usually the same as the logical block
+		size but may be bigger. One example is SATA drives with 4KB
+		sectors that expose a 512-byte logical block size to the
+		operating system. For stacked block devices the
+		physical_block_size variable contains the maximum
+		physical_block_size of the component devices.
+
+
+What:		/sys/block/<disk>/queue/read_ahead_kb
+Date:		May 2004
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] Maximum number of kilobytes to read-ahead for filesystems
+		on this block device.
+
+
+What:		/sys/block/<disk>/queue/rotational
+Date:		January 2009
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This file states whether the device is of rotational or
+		non-rotational type.
+
+
+What:		/sys/block/<disk>/queue/rq_affinity
+Date:		September 2008
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] If this option is '1', the block layer will migrate
+		request completions to the cpu "group" that originally
+		submitted the request. For some workloads this provides a
+		significant reduction in CPU cycles due to caching effects.
+
+		For storage configurations that need to maximize distribution
+		of completion processing, setting this option to '2' forces the
+		completion to run on the requesting cpu (bypassing the "group"
+		aggregation logic).
+
+
+What:		/sys/block/<disk>/queue/scheduler
+Date:		October 2004
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] When read, this file will display the current and
+		available IO schedulers for this block device. The currently
+		active IO scheduler will be enclosed in [] brackets. Writing an
+		IO scheduler name to this file will switch control of this
+		block device to that new IO scheduler. Note that writing an IO
+		scheduler name to this file will attempt to load that IO
+		scheduler module, if it isn't already present in the system.
+
+
+What:		/sys/block/<disk>/queue/stable_writes
+Date:		September 2020
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This file will contain '1' if memory must not be modified
+		while it is being used in a write request to this device. When
+		this is the case and the kernel is performing writeback of a
+		page, the kernel will wait for writeback to complete before
+		allowing the page to be modified again, rather than allowing
+		immediate modification as is normally the case. This
+		restriction arises when the device accesses the memory multiple
+		times where the same data must be seen every time -- for
+		example, once to calculate a checksum and once to actually
+		write the data. If no such restriction exists, this file will
+		contain '0'. This file is writable for testing purposes.
+
+
+What:		/sys/block/<disk>/queue/throttle_sample_time
+Date:		March 2017
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] This is the time window that blk-throttle samples data, in
+		milliseconds. blk-throttle makes decisions based on these
+		samples. A lower time means cgroups have smoother throughput,
+		but higher CPU overhead. This exists only when
+		CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
+
+
+What:		/sys/block/<disk>/queue/virt_boundary_mask
+Date:		April 2021
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This file shows the I/O segment memory alignment mask for
+		the block device. I/O requests to this device will be split
+		between segments wherever either the memory address of the end
+		of the previous segment or the memory address of the beginning
+		of the current segment is not aligned to virt_boundary_mask + 1
+		bytes.
+
+
+What:		/sys/block/<disk>/queue/wbt_lat_usec
+Date:		November 2016
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] If the device is registered for writeback throttling, then
+		this file shows the target minimum read latency. If this
+		latency is exceeded in a given window of time (see
+		wb_window_usec), then the writeback throttling will start
+		scaling back writes. Writing a value of '0' to this file
+		disables the feature. Writing a value of '-1' to this file
+		resets the value to the default setting.
+
+
+What:		/sys/block/<disk>/queue/write_cache
+Date:		April 2016
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RW] When read, this file will display whether the device has
+		write back caching enabled or not. It will return "write back"
+		for the former case, and "write through" for the latter.
+		Writing to this file can change the kernel's view of the
+		device, but it doesn't alter the device state. This means that
+		it might not be safe to toggle the setting from "write back" to
+		"write through", since that will also eliminate cache flushes
+		issued by the kernel.
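As a hedged illustration of how a tool might consume queue attributes such as
write_cache (not part of the ABI file itself; the disk name "sda" is an
assumption), the value can be read like any other sysfs file::

	#include <stdio.h>

	int main(void)
	{
		char mode[32];
		FILE *f = fopen("/sys/block/sda/queue/write_cache", "r");

		if (!f)
			return 1;
		/* The file contains "write back" or "write through". */
		if (fgets(mode, sizeof(mode), f))
			printf("sda cache mode: %s", mode);
		fclose(f);
		return 0;
	}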
+
+
+What:		/sys/block/<disk>/queue/write_same_max_bytes
+Date:		January 2012
+Contact:	Martin K. Petersen
+Description:
+		[RO] Some devices support a write same operation in which a
+		single data block can be written to a range of several
+		contiguous blocks on storage. This can be used to wipe areas on
+		disk or to initialize drives in a RAID configuration.
+		write_same_max_bytes indicates how many bytes can be written in
+		a single write same command. If write_same_max_bytes is 0,
+		write same is not supported by the device.
+
+
+What:		/sys/block/<disk>/queue/write_zeroes_max_bytes
+Date:		November 2016
+Contact:	Chaitanya Kulkarni
+Description:
+		[RO] Some devices support a write zeroes operation in which a
+		single request can be issued to zero out a range of contiguous
+		blocks on storage without carrying any payload in the request.
+		This can be used to optimize writing zeroes to such devices.
+		write_zeroes_max_bytes indicates how many bytes can be written
+		in a single write zeroes command. If write_zeroes_max_bytes is
+		0, write zeroes is not supported by the device.
+
+
+What:		/sys/block/<disk>/queue/zone_append_max_bytes
+Date:		May 2020
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This is the maximum number of bytes that can be written to
+		a sequential zone of a zoned block device using a zone append
+		write operation (REQ_OP_ZONE_APPEND). This value is always 0
+		for regular block devices.
+
+
+What:		/sys/block/<disk>/queue/zone_write_granularity
+Date:		January 2021
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] This indicates the alignment constraint, in bytes, for
+		write operations in sequential zones of zoned block devices
+		(devices with a zoned attribute that reports "host-managed" or
+		"host-aware"). This value is always 0 for regular block
+		devices.
+
+
+What:		/sys/block/<disk>/queue/zoned
+Date:		September 2016
+Contact:	Damien Le Moal
+Description:
+		[RO] zoned indicates if the device is a zoned block device and
+		the zone model of the device if it is indeed zoned. The
+		possible values indicated by zoned are "none" for regular block
+		devices and "host-aware" or "host-managed" for zoned block
+		devices. The characteristics of host-aware and host-managed
+		zoned block devices are described in the ZBC (Zoned Block
+		Commands) and ZAC (Zoned Device ATA Command Set) standards.
+		These standards also define the "drive-managed" zone model.
+		However, since drive-managed zoned block devices do not support
+		zone commands, they will be treated as regular block devices
+		and zoned will report "none".
+
+
+What:		/sys/block/<disk>/stat
+Date:		February 2008
+Contact:	Jerome Marchand
+Description:
+		The /sys/block/<disk>/stat file displays the I/O
+		statistics of disk <disk>. It contains 17 fields:
+
+		==  ==============================================
+		 1  reads completed successfully
+		 2  reads merged
+		 3  sectors read
+		 4  time spent reading (ms)
+		 5  writes completed
+		 6  writes merged
+		 7  sectors written
+		 8  time spent writing (ms)
+		 9  I/Os currently in progress
+		10  time spent doing I/Os (ms)
+		11  weighted time spent doing I/Os (ms)
+		12  discards completed
+		13  discards merged
+		14  sectors discarded
+		15  time spent discarding (ms)
+		16  flush requests completed
+		17  time spent flushing (ms)
+		==  ==============================================
+
+		For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
deleted file mode 100644
index b16b0c45a272..000000000000
--- a/Documentation/ABI/testing/sysfs-block
+++ /dev/null
@@ -1,346 +0,0 @@
-What:		/sys/block/<disk>/stat
-Date:		February 2008
-Contact:	Jerome Marchand
-Description:
-		The /sys/block/<disk>/stat files displays the I/O
-		statistics of disk <disk>. They contain 11 fields:
-
-		==  ==============================================
-		 1  reads completed successfully
-		 2  reads merged
-		 3  sectors read
-		 4  time spent reading (ms)
-		 5  writes completed
-		 6  writes merged
-		 7  sectors written
-		 8  time spent writing (ms)
-		 9  I/Os currently in progress
-		10  time spent doing I/Os (ms)
-		11  weighted time spent doing I/Os (ms)
-		12  discards completed
-		13  discards merged
-		14  sectors discarded
-		15  time spent discarding (ms)
-		16  flush requests completed
-		17  time spent flushing (ms)
-		==  ==============================================
-
-		For more details refer Documentation/admin-guide/iostats.rst
-
-
-What:		/sys/block/<disk>/inflight
-Date:		October 2009
-Contact:	Jens Axboe, Nikanth Karthikesan
-Description:
-		Reports the number of I/O requests currently in progress
-		(pending / in flight) in a device driver. This can be less
-		than the number of requests queued in the block device queue.
-		The report contains 2 fields: one for read requests
-		and one for write requests.
-		The value type is unsigned int.
-		Cf. Documentation/block/stat.rst which contains a single value for
-		requests in flight.
-		This is related to nr_requests in Documentation/block/queue-sysfs.rst
-		and for SCSI device also its queue_depth.
-
-
-What:		/sys/block/<disk>/diskseq
-Date:		February 2021
-Contact:	Matteo Croce
-Description:
-		The /sys/block/<disk>/diskseq files reports the disk
-		sequence number, which is a monotonically increasing
-		number assigned to every drive.
-		Some devices, like the loop device, refresh such number
-		every time the backing file is changed.
-		The value type is 64 bit unsigned.
-
-
-What:		/sys/block/<disk>/<part>/stat
-Date:		February 2008
-Contact:	Jerome Marchand
-Description:
-		The /sys/block/<disk>/<part>/stat files display the
-		I/O statistics of partition <part>. The format is the
-		same as the above-written /sys/block/<disk>/stat
-		format.
-
-
-What:		/sys/block/<disk>/integrity/format
-Date:		June 2008
-Contact:	Martin K. Petersen
-Description:
-		Metadata format for integrity capable block device.
-		E.g. T10-DIF-TYPE1-CRC.
-
-
-What:		/sys/block/<disk>/integrity/read_verify
-Date:		June 2008
-Contact:	Martin K. Petersen
-Description:
-		Indicates whether the block layer should verify the
-		integrity of read requests serviced by devices that
-		support sending integrity metadata.
-
-
-What:		/sys/block/<disk>/integrity/tag_size
-Date:		June 2008
-Contact:	Martin K. Petersen
-Description:
-		Number of bytes of integrity tag space available per
-		512 bytes of data.
-
-
-What:		/sys/block/<disk>/integrity/device_is_integrity_capable
-Date:		July 2014
-Contact:	Martin K. Petersen
-Description:
-		Indicates whether a storage device is capable of storing
-		integrity metadata. Set if the device is T10 PI-capable.
-
-What:		/sys/block/<disk>/integrity/protection_interval_bytes
-Date:		July 2015
-Contact:	Martin K. Petersen
-Description:
-		Describes the number of data bytes which are protected
-		by one integrity tuple. Typically the device's logical
-		block size.
-
-What:		/sys/block/<disk>/integrity/write_generate
-Date:		June 2008
-Contact:	Martin K. Petersen
-Description:
-		Indicates whether the block layer should automatically
-		generate checksums for write requests bound for
-		devices that support receiving integrity metadata.
-
-What:		/sys/block/<disk>/alignment_offset
-Date:		April 2009
-Contact:	Martin K. Petersen
-Description:
-		Storage devices may report a physical block size that is
-		bigger than the logical block size (for instance a drive
-		with 4KB physical sectors exposing 512-byte logical
-		blocks to the operating system). This parameter
-		indicates how many bytes the beginning of the device is
-		offset from the disk's natural alignment.
-
-What:		/sys/block/<disk>/<part>/alignment_offset
-Date:		April 2009
-Contact:	Martin K. Petersen
-Description:
-		Storage devices may report a physical block size that is
-		bigger than the logical block size (for instance a drive
-		with 4KB physical sectors exposing 512-byte logical
-		blocks to the operating system). This parameter
-		indicates how many bytes the beginning of the partition
-		is offset from the disk's natural alignment.
-
-What:		/sys/block/<disk>/queue/logical_block_size
-Date:		May 2009
-Contact:	Martin K. Petersen
-Description:
-		This is the smallest unit the storage device can
-		address. It is typically 512 bytes.
-
-What:		/sys/block/<disk>/queue/physical_block_size
-Date:		May 2009
-Contact:	Martin K. Petersen
-Description:
-		This is the smallest unit a physical storage device can
-		write atomically. It is usually the same as the logical
-		block size but may be bigger. One example is SATA
-		drives with 4KB sectors that expose a 512-byte logical
-		block size to the operating system. For stacked block
-		devices the physical_block_size variable contains the
-		maximum physical_block_size of the component devices.
-
-What:		/sys/block/<disk>/queue/minimum_io_size
-Date:		April 2009
-Contact:	Martin K. Petersen
-Description:
-		Storage devices may report a granularity or preferred
-		minimum I/O size which is the smallest request the
-		device can perform without incurring a performance
-		penalty. For disk drives this is often the physical
-		block size. For RAID arrays it is often the stripe
-		chunk size. A properly aligned multiple of
-		minimum_io_size is the preferred request size for
-		workloads where a high number of I/O operations is
-		desired.
-
-What:		/sys/block/<disk>/queue/optimal_io_size
-Date:		April 2009
-Contact:	Martin K. Petersen
-Description:
-		Storage devices may report an optimal I/O size, which is
-		the device's preferred unit for sustained I/O. This is
-		rarely reported for disk drives. For RAID arrays it is
-		usually the stripe width or the internal track size. A
-		properly aligned multiple of optimal_io_size is the
-		preferred request size for workloads where sustained
-		throughput is desired. If no optimal I/O size is
-		reported this file contains 0.
-
-What:		/sys/block/<disk>/queue/nomerges
-Date:		January 2010
-Contact:
-Description:
-		Standard I/O elevator operations include attempts to
-		merge contiguous I/Os. For known random I/O loads these
-		attempts will always fail and result in extra cycles
-		being spent in the kernel. This allows one to turn off
-		this behavior on one of two ways: When set to 1, complex
-		merge checks are disabled, but the simple one-shot merges
-		with the previous I/O request are enabled. When set to 2,
-		all merge tries are disabled. The default value is 0 -
-		which enables all types of merge tries.
-
-What:		/sys/block/<disk>/discard_alignment
-Date:		May 2011
-Contact:	Martin K. Petersen
-Description:
-		Devices that support discard functionality may
-		internally allocate space in units that are bigger than
-		the exported logical block size. The discard_alignment
-		parameter indicates how many bytes the beginning of the
-		device is offset from the internal allocation unit's
-		natural alignment.
-
-What:		/sys/block/<disk>/<part>/discard_alignment
-Date:		May 2011
-Contact:	Martin K. Petersen
-Description:
-		Devices that support discard functionality may
-		internally allocate space in units that are bigger than
-		the exported logical block size. The discard_alignment
-		parameter indicates how many bytes the beginning of the
-		partition is offset from the internal allocation unit's
-		natural alignment.
-
-What:		/sys/block/<disk>/queue/discard_granularity
-Date:		May 2011
-Contact:	Martin K. Petersen
-Description:
-		Devices that support discard functionality may
-		internally allocate space using units that are bigger
-		than the logical block size. The discard_granularity
-		parameter indicates the size of the internal allocation
-		unit in bytes if reported by the device. Otherwise the
-		discard_granularity will be set to match the device's
-		physical block size. A discard_granularity of 0 means
-		that the device does not support discard functionality.
-
-What:		/sys/block/<disk>/queue/discard_max_bytes
-Date:		May 2011
-Contact:	Martin K. Petersen
-Description:
-		Devices that support discard functionality may have
-		internal limits on the number of bytes that can be
-		trimmed or unmapped in a single operation. Some storage
-		protocols also have inherent limits on the number of
-		blocks that can be described in a single command. The
-		discard_max_bytes parameter is set by the device driver
-		to the maximum number of bytes that can be discarded in
-		a single operation. Discard requests issued to the
-		device must not exceed this limit. A discard_max_bytes
-		value of 0 means that the device does not support
-		discard functionality.
-
-What:		/sys/block/<disk>/queue/discard_zeroes_data
-Date:		May 2011
-Contact:	Martin K. Petersen
-Description:
-		Will always return 0. Don't rely on any specific behavior
-		for discards, and don't read this file.
-
-What:		/sys/block/<disk>/queue/write_same_max_bytes
-Date:		January 2012
-Contact:	Martin K. Petersen
-Description:
-		Some devices support a write same operation in which a
-		single data block can be written to a range of several
-		contiguous blocks on storage. This can be used to wipe
-		areas on disk or to initialize drives in a RAID
-		configuration. write_same_max_bytes indicates how many
-		bytes can be written in a single write same command. If
-		write_same_max_bytes is 0, write same is not supported
-		by the device.
-
-What:		/sys/block/<disk>/queue/write_zeroes_max_bytes
-Date:		November 2016
-Contact:	Chaitanya Kulkarni
-Description:
-		Devices that support write zeroes operation in which a
-		single request can be issued to zero out the range of
-		contiguous blocks on storage without having any payload
-		in the request. This can be used to optimize writing zeroes
-		to the devices. write_zeroes_max_bytes indicates how many
-		bytes can be written in a single write zeroes command. If
-		write_zeroes_max_bytes is 0, write zeroes is not supported
-		by the device.
-
-What:		/sys/block/<disk>/queue/zoned
-Date:		September 2016
-Contact:	Damien Le Moal
-Description:
-		zoned indicates if the device is a zoned block device
-		and the zone model of the device if it is indeed zoned.
-		The possible values indicated by zoned are "none" for
-		regular block devices and "host-aware" or "host-managed"
-		for zoned block devices. The characteristics of
-		host-aware and host-managed zoned block devices are
-		described in the ZBC (Zoned Block Commands) and ZAC
-		(Zoned Device ATA Command Set) standards. These standards
-		also define the "drive-managed" zone model. However,
-		since drive-managed zoned block devices do not support
-		zone commands, they will be treated as regular block
-		devices and zoned will report "none".
-
-What:		/sys/block/<disk>/queue/nr_zones
-Date:		November 2018
-Contact:	Damien Le Moal
-Description:
-		nr_zones indicates the total number of zones of a zoned block
-		device ("host-aware" or "host-managed" zone model). For regular
-		block devices, the value is always 0.
-
-What:		/sys/block/<disk>/queue/max_active_zones
-Date:		July 2020
-Contact:	Niklas Cassel
-Description:
-		For zoned block devices (zoned attribute indicating
-		"host-managed" or "host-aware"), the sum of zones belonging to
-		any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
-		is limited by this value. If this value is 0, there is no limit.
-
-What:		/sys/block/<disk>/queue/max_open_zones
-Date:		July 2020
-Contact:	Niklas Cassel
-Description:
-		For zoned block devices (zoned attribute indicating
-		"host-managed" or "host-aware"), the sum of zones belonging to
-		any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN,
-		is limited by this value. If this value is 0, there is no limit.
-
-What:		/sys/block/<disk>/queue/chunk_sectors
-Date:		September 2016
-Contact:	Hannes Reinecke
-Description:
-		chunk_sectors has different meaning depending on the type
-		of the disk. For a RAID device (dm-raid), chunk_sectors
-		indicates the size in 512B sectors of the RAID volume
-		stripe segment. For a zoned block device, either
-		host-aware or host-managed, chunk_sectors indicates the
-		size in 512B sectors of the zones of the device, with
-		the eventual exception of the last zone of the device
-		which may be smaller.
-
-What:		/sys/block/<disk>/queue/io_timeout
-Date:		November 2018
-Contact:	Weiping Zhang
-Description:
-		io_timeout is the request timeout in milliseconds. If a request
-		does not complete in this time then the block driver timeout
-		handler is invoked. That timeout handler can decide to retry
-		the request, to fail it or to start a device recovery strategy.
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
index 86dcf7159f99..3a41495dd77b 100644
--- a/Documentation/block/index.rst
+++ b/Documentation/block/index.rst
@@ -20,7 +20,6 @@ Block
    kyber-iosched
    null_blk
    pr
-   queue-sysfs
    request
    stat
    switching-sched
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
deleted file mode 100644
index 3f569d532485..000000000000
--- a/Documentation/block/queue-sysfs.rst
+++ /dev/null
@@ -1,321 +0,0 @@
-=================
-Queue sysfs files
-=================
-
-This text file will detail the queue files that are located in the sysfs tree
-for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions as a remapping target.
-These files are the ones found in the /sys/block/xxx/queue/ directory.
-
-Files denoted with a RO postfix are readonly and the RW postfix means
-read-write.
-
-add_random (RW)
----------------
-This file allows to turn off the disk entropy contribution. Default
-value of this file is '1'(on).
-
-chunk_sectors (RO)
-------------------
-This has different meaning depending on the type of the block device.
-For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
-of the RAID volume stripe segment. For a zoned block device, either host-aware
-or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
-of the device, with the eventual exception of the last zone of the device which
-may be smaller.
-
-dax (RO)
---------
-This file indicates whether the device supports Direct Access (DAX),
-used by CPU-addressable storage to bypass the pagecache. It shows '1'
-if true, '0' if not.
-
-discard_granularity (RO)
-------------------------
-This shows the size of internal allocation of the device in bytes, if
-reported by the device. A value of '0' means device does not support
-the discard functionality.
-
-discard_max_hw_bytes (RO)
--------------------------
-Devices that support discard functionality may have internal limits on
-the number of bytes that can be trimmed or unmapped in a single operation.
-The `discard_max_hw_bytes` parameter is set by the device driver to the
-maximum number of bytes that can be discarded in a single operation.
-Discard requests issued to the device must not exceed this limit.
-A `discard_max_hw_bytes` value of 0 means that the device does not support
-discard functionality.
-
-discard_max_bytes (RW)
-----------------------
-While discard_max_hw_bytes is the hardware limit for the device, this
-setting is the software limit. Some devices exhibit large latencies when
-large discards are issued, setting this value lower will make Linux issue
-smaller discards and potentially help reduce latencies induced by large
-discard operations.
-
-discard_zeroes_data (RO)
-------------------------
-Obsolete. Always zero.
-
-fua (RO)
---------
-Whether or not the block driver supports the FUA flag for write requests.
-FUA stands for Force Unit Access. If the FUA flag is set that means that
-write requests must bypass the volatile cache of the storage device.
-
-hw_sector_size (RO)
--------------------
-This is the hardware sector size of the device, in bytes.
-
-io_poll (RW)
-------------
-When read, this file shows whether polling is enabled (1) or disabled
-(0). Writing '0' to this file will disable polling for this device.
-Writing any non-zero value will enable this feature.
-
-io_poll_delay (RW)
-------------------
-If polling is enabled, this controls what kind of polling will be
-performed. It defaults to -1, which is classic polling. In this mode,
-the CPU will repeatedly ask for completions without giving up any time.
-If set to 0, a hybrid polling mode is used, where the kernel will attempt
-to make an educated guess at when the IO will complete. Based on this
-guess, the kernel will put the process issuing IO to sleep for an amount
-of time, before entering a classic poll loop. This mode might be a
-little slower than pure classic polling, but it will be more efficient.
-If set to a value larger than 0, the kernel will put the process issuing
-IO to sleep for this amount of microseconds before entering classic
-polling.
-
-io_timeout (RW)
----------------
-io_timeout is the request timeout in milliseconds. If a request does not
-complete in this time then the block driver timeout handler is invoked.
-That timeout handler can decide to retry the request, to fail it or to start
-a device recovery strategy.
-
-iostats (RW)
--------------
-This file is used to control (on/off) the iostats accounting of the
-disk.
-
-logical_block_size (RO)
------------------------
-This is the logical block size of the device, in bytes.
-
-max_discard_segments (RO)
--------------------------
-The maximum number of DMA scatter/gather entries in a discard request.
-
-max_hw_sectors_kb (RO)
-----------------------
-This is the maximum number of kilobytes supported in a single data transfer.
-
-max_integrity_segments (RO)
----------------------------
-Maximum number of elements in a DMA scatter/gather list with integrity
-data that will be submitted by the block layer core to the associated
-block driver.
-
-max_active_zones (RO)
----------------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), the sum of zones belonging to any of the zone states:
-EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value.
-If this value is 0, there is no limit.
-
-If the host attempts to exceed this limit, the driver should report this error
-with BLK_STS_ZONE_ACTIVE_RESOURCE, which user space may see as the EOVERFLOW
-errno.
-
-max_open_zones (RO)
--------------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), the sum of zones belonging to any of the zone states:
-EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value.
-If this value is 0, there is no limit.
-
-If the host attempts to exceed this limit, the driver should report this error
-with BLK_STS_ZONE_OPEN_RESOURCE, which user space may see as the ETOOMANYREFS
-errno.
-
-max_sectors_kb (RW)
--------------------
-This is the maximum number of kilobytes that the block layer will allow
-for a filesystem request. Must be smaller than or equal to the maximum
-size allowed by the hardware.
-
-max_segments (RO)
------------------
-Maximum number of elements in a DMA scatter/gather list that is submitted
-to the associated block driver.
-
-max_segment_size (RO)
----------------------
-Maximum size in bytes of a single element in a DMA scatter/gather list.
-
-minimum_io_size (RO)
---------------------
-This is the smallest preferred IO size reported by the device.
-
-nomerges (RW)
--------------
-This enables the user to disable the lookup logic involved with IO
-merging requests in the block layer. By default (0) all merges are
-enabled. When set to 1 only simple one-hit merges will be tried. When
-set to 2 no merge algorithms will be tried (including one-hit or more
-complex tree/hash lookups).
-
-nr_requests (RW)
-----------------
-This controls how many requests may be allocated in the block layer for
-read or write requests. Note that the total allocated number may be twice
-this amount, since it applies only to reads or writes (not the accumulated
-sum).
-
-To avoid priority inversion through request starvation, a request
-queue maintains a separate request pool per each cgroup when
-CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
-per-block-cgroup request pool. IOW, if there are N block cgroups,
-each request queue may have up to N request pools, each independently
-regulated by nr_requests.
-
-nr_zones (RO)
--------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), this indicates the total number of zones of the device.
-This is always 0 for regular block devices.
-
-optimal_io_size (RO)
---------------------
-This is the optimal IO size reported by the device.
-
-physical_block_size (RO)
-------------------------
-This is the physical block size of device, in bytes.
-
-read_ahead_kb (RW)
-------------------
-Maximum number of kilobytes to read-ahead for filesystems on this block
-device.
-
-rotational (RW)
----------------
-This file is used to stat if the device is of rotational type or
-non-rotational type.
-
-rq_affinity (RW)
-----------------
-If this option is '1', the block layer will migrate request completions to the
-cpu "group" that originally submitted the request. For some workloads this
-provides a significant reduction in CPU cycles due to caching effects.
-
-For storage configurations that need to maximize distribution of completion
-processing setting this option to '2' forces the completion to run on the
-requesting cpu (bypassing the "group" aggregation logic).
-
-scheduler (RW)
---------------
-When read, this file will display the current and available IO schedulers
-for this block device. The currently active IO scheduler will be enclosed
-in [] brackets. Writing an IO scheduler name to this file will switch
-control of this block device to that new IO scheduler. Note that writing
-an IO scheduler name to this file will attempt to load that IO scheduler
-module, if it isn't already present in the system.
-
-write_cache (RW)
-----------------
-When read, this file will display whether the device has write back
-caching enabled or not. It will return "write back" for the former
-case, and "write through" for the latter. Writing to this file can
-change the kernels view of the device, but it doesn't alter the
-device state. This means that it might not be safe to toggle the
-setting from "write back" to "write through", since that will also
-eliminate cache flushes issued by the kernel.
-
-write_same_max_bytes (RO)
--------------------------
-This is the number of bytes the device can write in a single write-same
-command. A value of '0' means write-same is not supported by this
-device.
-
-wbt_lat_usec (RW)
------------------
-If the device is registered for writeback throttling, then this file shows
-the target minimum read latency. If this latency is exceeded in a given
-window of time (see wb_window_usec), then the writeback throttling will start
-scaling back writes. Writing a value of '0' to this file disables the
-feature. Writing a value of '-1' to this file resets the value to the
-default setting.
-
-throttle_sample_time (RW)
--------------------------
-This is the time window that blk-throttle samples data, in millisecond.
-blk-throttle makes decision based on the samplings. Lower time means cgroups
-have more smooth throughput, but higher CPU overhead. This exists only when
-CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
-
-write_zeroes_max_bytes (RO)
----------------------------
-For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
-bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
-is not supported.
-
-zone_append_max_bytes (RO)
---------------------------
-This is the maximum number of bytes that can be written to a sequential
-zone of a zoned block device using a zone append write operation
-(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices.
-
-zoned (RO)
-----------
-This indicates if the device is a zoned block device and the zone model of the
-device if it is indeed zoned. The possible values indicated by zoned are
-"none" for regular block devices and "host-aware" or "host-managed" for zoned
-block devices. The characteristics of host-aware and host-managed zoned block
-devices are described in the ZBC (Zoned Block Commands) and ZAC
-(Zoned Device ATA Command Set) standards. These standards also define the
-"drive-managed" zone model. However, since drive-managed zoned block devices
-do not support zone commands, they will be treated as regular block devices
-and zoned will report "none".
-
-zone_write_granularity (RO)
----------------------------
-This indicates the alignment constraint, in bytes, for write operations in
-sequential zones of zoned block devices (devices with a zoned attributed
-that reports "host-managed" or "host-aware"). This value is always 0 for
-regular block devices.
-
-independent_access_ranges (RO)
-------------------------------
-
-The presence of this sub-directory of the /sys/block/xxx/queue/ directory
-indicates that the device is capable of executing requests targeting
-different sector ranges in parallel. For instance, single LUN multi-actuator
-hard-disks will have an independent_access_ranges directory if the device
-correctly advertizes the sector ranges of its actuators.
-
-The independent_access_ranges directory contains one directory per access
-range, with each range described using the sector (RO) attribute file to
-indicate the first sector of the range and the nr_sectors (RO) attribute file
-to indicate the total number of sectors in the range starting from the first
-sector of the range. For example, a dual-actuator hard-disk will have the
-following independent_access_ranges entries.::
-
-	$ tree /sys/block/<disk>/queue/independent_access_ranges/
-	/sys/block/<disk>/queue/independent_access_ranges/
-	|-- 0
-	|   |-- nr_sectors
-	|   `-- sector
-	`-- 1
-	    |-- nr_sectors
-	    `-- sector
-
-The sector and nr_sectors attributes use 512B sector unit, regardless of
-the actual block size of the device. Independent access ranges do not
-overlap and include all sectors within the device capacity. The access
-ranges are numbered in increasing order of the range start sector,
-that is, the sector attribute of range 0 always has the value 0.
-
-Jens Axboe, February 2009
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 2e7186805148..19f501d58f5d 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -294,9 +294,6 @@ Block Devices
 .. kernel-doc:: block/blk-settings.c
    :export:
 
-.. kernel-doc:: block/blk-exec.c
-   :export:
-
 .. kernel-doc:: block/blk-flush.c
    :export:
 
diff --git a/Documentation/translations/zh_CN/core-api/kernel-api.rst b/Documentation/translations/zh_CN/core-api/kernel-api.rst
index ab7d81889340..e45fe80d1cd8 100644
--- a/Documentation/translations/zh_CN/core-api/kernel-api.rst
+++ b/Documentation/translations/zh_CN/core-api/kernel-api.rst
@@ -292,8 +292,6 @@
 block/blk-sysfs.c
 
 block/blk-settings.c
 
-block/blk-exec.c
-
 block/blk-flush.c
 
 block/blk-lib.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 03916ac8a4ac..a31b5e4c4ab7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3416,6 +3416,8 @@ M:	Jens Axboe
 L:	linux-block@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
+F:	Documentation/ABI/stable/sysfs-block
+F:	Documentation/block/
 F:	block/
 F:	drivers/block/
 F:	include/linux/blk*
diff --git a/block/Kconfig b/block/Kconfig
index c6ce41a5e5b2..d5d4197b7ed2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
 config BLK_DEV_BSG_COMMON
 	tristate
 
+config BLK_ICQ
+	bool
+
 config BLK_DEV_BSGLIB
 	bool "Block layer SG support v4 helper lib"
 	select BLK_DEV_BSG_COMMON
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 885fee86dfca..615516146086 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -18,6 +18,7 @@ config MQ_IOSCHED_KYBER
 
 config IOSCHED_BFQ
 	tristate "BFQ I/O scheduler"
+	select BLK_ICQ
 	help
 	  BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
 	  of the device among all processes according to their weights,
diff --git a/block/Makefile b/block/Makefile
index 44df57e562bf..f38eaa612929 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o blk-timeout.o \
+			blk-merge.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
diff --git a/block/bdev.c b/block/bdev.c
index b1d087e5e205..8bf93a19041b 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -665,7 +665,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
 static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int ret = 0;
+	int ret;
 
 	if (disk->fops->open) {
 		ret = disk->fops->open(bdev, mode);
@@ -750,14 +750,6 @@ struct block_device *blkdev_get_no_open(dev_t dev)
 	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
 		bdev = NULL;
 	iput(inode);
-
-	if (!bdev)
-		return NULL;
-	if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN)) {
-		put_device(&bdev->bd_device);
-		return NULL;
-	}
-
 	return bdev;
 }
 
@@ -837,7 +829,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 		 * used in blkdev_get/put().
 		 */
 		if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
 			bdev->bd_write_holder = true;
 			unblock_events = false;
 		}
@@ -963,15 +955,15 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 EXPORT_SYMBOL(blkdev_put);
 
 /**
- * lookup_bdev - lookup a struct block_device by name
- * @pathname:	special file representing the block device
- * @dev:	return value of the block device's dev_t
+ * lookup_bdev() - Look up a struct block_device by name.
+ * @pathname:	Name of the block device in the filesystem.
+ * @dev:	Pointer to the block device's dev_t, if found.
* * Lookup the block device's dev_t at @pathname in the current - * namespace if possible and return it by @dev. + * namespace if possible and return it in @dev. * - * RETURNS: - * 0 if succeeded, errno otherwise. + * Context: May sleep. + * Return: 0 if succeeded, negative errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fec18118dc30..0c612a911696 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. * @q: the request queue. */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc, - struct request_queue *q) +static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - if (ioc) { - unsigned long flags; - struct bfq_io_cq *icq; + struct bfq_io_cq *icq; + unsigned long flags; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(&q->queue_lock, flags); + if (!current->io_context) + return NULL; - return icq; - } + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(q)); + spin_unlock_irqrestore(&q->queue_lock, flags); - return NULL; + return icq; } /* @@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +#define BFQ_LIMIT_INLINE_DEPTH 16 + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_entity *entity = &bfqq->entity; + struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; + struct bfq_entity **entities = inline_entities; + int depth, level; + int class_idx = bfqq->ioprio_class - 1; + struct bfq_sched_data *sched_data; + unsigned long wsum; + bool ret = false; + + if (!entity->on_st_or_in_serv) + return false; + + /* +1 for bfqq entity, root cgroup not included */ + depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; + if (depth > BFQ_LIMIT_INLINE_DEPTH) { + entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO); + if (!entities) + return false; + } + + spin_lock_irq(&bfqd->lock); + sched_data = entity->sched_data; + /* Gather our ancestors as we need to traverse them in reverse order */ + level = 0; + for_each_entity(entity) { + /* + * If at some level entity is not even active, allow request + * queueing so that BFQ knows there's work to do and activate + * entities. + */ + if (!entity->on_st_or_in_serv) + goto out; + /* Uh, more parents than cgroup subsystem thinks? */ + if (WARN_ON_ONCE(level >= depth)) + break; + entities[level++] = entity; + } + WARN_ON_ONCE(level != depth); + for (level--; level >= 0; level--) { + entity = entities[level]; + if (level > 0) { + wsum = bfq_entity_service_tree(entity)->wsum; + } else { + int i; + /* + * For bfqq itself we take into account service trees + * of all higher priority classes and multiply their + * weights so that low prio queue from higher class + * gets more requests than high prio queue from lower + * class. 
+ */ wsum = 0; + for (i = 0; i <= class_idx; i++) { + wsum = wsum * IOPRIO_BE_NR + + sched_data->service_tree[i].wsum; + } + limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); + if (entity->allocated >= limit) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "too many requests: allocated %d limit %d level %d", + entity->allocated, limit, level); + ret = true; + break; + } + } +out: + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + return ret; +} +#else +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + return false; +} +#endif + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. + * + * Also if a bfq queue or its parent cgroup consumes more tags than would be + * appropriate for their weight, we trim the available tag depth to 1. This + * avoids a situation where one cgroup can starve another cgroup from tags and + * thus block service differentiation among cgroups. Note that because the + * queue / cgroup already has many requests allocated and queued, this does not + * significantly affect service guarantees coming from the BFQ scheduling + * algorithm. */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); + struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; - if (op_is_sync(op) && !op_is_write(op)) - return; + /* Sync reads have full depth available */ + if (op_is_sync(op) && !op_is_write(op)) { + depth = 0; + } else { + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + limit = (limit * depth) >> bfqd->full_depth_shift; + } - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + /* + * Does the queue (or any parent entity) exceed the number of requests + * that should be available to it? Heavily limit depth so that it cannot + * consume more available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) + depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); + __func__, bfqd->wr_busy_queues, op_is_sync(op), depth); + if (depth) + data->shallow_depth = depth; } static struct bfq_queue * @@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - + return bfqq->ref - bfqq->entity.allocated - + bfqq->entity.on_st_or_in_serv - (bfqq->weight_counter != NULL) - bfqq->stable_ref; } @@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * aspect, see the comments on the choice of the queue for injection * in bfq_select_queue(). * - * Turning back to the detection of a waker queue, a queue Q is deemed - * as a waker queue for bfqq if, for three consecutive times, bfqq - * happens to become non empty right after a request of Q has been - * completed. In this respect, even if bfqq is empty, we do not check - * for a waker if it still has some in-flight I/O.
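An illustrative aside on the per-entity limit computed in the hunk above: each level receives its parent's limit scaled by the entity's share of the service-tree weight sum, and the leaf level compounds the weight sums of all higher-priority classes by IOPRIO_BE_NR. A standalone C sketch of that arithmetic, with hypothetical weights (this is not kernel code):

#include <stdio.h>

#define IOPRIO_BE_NR 8

/* Round-to-nearest division, as the kernel's DIV_ROUND_CLOSEST() does. */
static int div_round_closest(int x, int divisor)
{
        return (x + divisor / 2) / divisor;
}

int main(void)
{
        /* Hypothetical numbers: 256 queue tags, one cgroup level. */
        int limit = 256;
        int cgroup_weight = 100, cgroup_wsum = 300;
        int queue_weight = 100;
        /* Weight sums of the RT and BE service trees at the leaf level. */
        int class_wsum[2] = { 200, 500 }, wsum = 0, i;

        /* Cgroup level: plain share of the parent's limit. */
        limit = div_round_closest(limit * cgroup_weight, cgroup_wsum);
        printf("cgroup-level limit: %d\n", limit);

        /*
         * Leaf level for a best-effort queue: higher-priority classes
         * compound the weight sum, so lower classes get smaller shares.
         */
        for (i = 0; i < 2; i++)
                wsum = wsum * IOPRIO_BE_NR + class_wsum[i];
        limit = div_round_closest(limit * queue_weight, wsum);
        printf("queue-level limit:  %d\n", limit);
        return 0;
}

A queue whose allocated count reaches the printed limit would have its tag depth trimmed to 1, as in bfq_limit_depth() above.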
In fact, in this - * case bfqq is actually still being served by the drive, and may - * receive new I/O on the completion of some of the in-flight - * requests. In particular, on the first time, Q is tentatively set as - * a candidate waker queue, while on the third consecutive time that Q - * is detected, the field waker_bfqq is set to Q, to confirm that Q is - * a waker queue for bfqq. These detection steps are performed only if - * bfqq has a long think time, so as to make it more likely that - * bfqq's I/O is actually being blocked by a synchronization. This - * last filter, plus the above three-times requirement, make false + * Turning back to the detection of a waker queue, a queue Q is deemed as a + * waker queue for bfqq if, for three consecutive times, bfqq happens to become + * non empty right after a request of Q has been completed within a given + * timeout. In this respect, even if bfqq is empty, we do not check for a waker + * if it still has some in-flight I/O. In fact, in this case bfqq is actually + * still being served by the drive, and may receive new I/O on the completion + * of some of the in-flight requests. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while on the third consecutive + * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q + * is a waker queue for bfqq. These detection steps are performed only if bfqq + * has a long think time, so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This last filter, plus the + * above three-times requirement and time limit for detection, makes false + * positives less likely. * * NOTE @@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) { + char waker_name[MAX_BFQQ_NAME_LENGTH]; + if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || @@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; + /* + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure + * we do not uselessly idle a blocking waker in more than 1/64 of cases.
+ */ if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq) { + bfqq->tentative_waker_bfqq || + now_ns > bfqq->waker_detection_started + + 128 * (u64)bfqd->bfq_slice_idle) { /* * First synchronization detected with a * candidate waker queue, or with a different @@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->tentative_waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; + bfqq->waker_detection_started = now_ns; + bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; if (bfqq->num_waker_detections == 3) { bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->tentative_waker_bfqq = NULL; + bfq_bfqq_name(bfqq->waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name); /* * If the waker queue disappears, then @@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. */ - struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + struct bfq_io_cq *bic = bfq_bic_lookup(q); bool ret; spin_lock_irq(&bfqd->lock); @@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } +static void bfqq_request_allocated(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated++; +} + +static void bfqq_request_freed(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated--; +} + /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { @@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated++; - bfqq->allocated--; + bfqq_request_allocated(new_bfqq); + bfqq_request_freed(bfqq); new_bfqq->ref++; /* * If the bic associated with the process @@ -5991,48 +6127,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - - /* - * Reqs with at_head or passthrough flags set are to be put - * directly into dispatch list. Additional case for putting rq - * directly into the dispatch queue: the only active - * bfq_queues are bfqq and either its waker bfq_queue or one - * of its woken bfq_queues. The rationale behind this - * additional condition is as follows: - * - consider a bfq_queue, say Q1, detected as a waker of - * another bfq_queue, say Q2 - * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., - * some I/O of Q1 needs to be completed for new I/O of Q2 - * to arrive. A notable example of waker is journald - * - so, Q1 and Q2 are in any respect the queues of two - * cooperating processes (or of two cooperating sets of - * processes): the goal of Q1's I/O is doing what needs to - * be done so that new Q2's I/O can finally be - * issued. Therefore, if the service of Q1's I/O is delayed, - * then Q2's I/O is delayed too. Conversely, if Q2's I/O is - * delayed, the goal of Q1's I/O is hindered.
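The detection flow added above is small enough to model standalone. A sketch of the tentative-waker bookkeeping (simplified fields, a plain queue id instead of a bfq_queue pointer; the 8 ms constant stands in for the bfq_slice_idle default):

#include <stdio.h>

struct waker_state {
        int tentative;                  /* candidate waker queue id, 0 = none */
        int detections;                 /* consecutive hits for the candidate */
        unsigned long long started_ns;  /* time of the first detection */
        int confirmed;                  /* confirmed waker queue id, 0 = none */
};

#define SLICE_IDLE_NS 8000000ULL        /* 8 ms stand-in for bfq_slice_idle */

static void observe_completion(struct waker_state *w, int completer,
                               unsigned long long now_ns)
{
        /* Restart detection for a new candidate or after the time window. */
        if (completer != w->tentative ||
            now_ns > w->started_ns + 128 * SLICE_IDLE_NS) {
                w->tentative = completer;
                w->detections = 1;
                w->started_ns = now_ns;
        } else {
                w->detections++;
        }
        if (w->detections == 3) {
                w->confirmed = completer;       /* third consecutive hit */
                w->tentative = 0;
        }
}

int main(void)
{
        struct waker_state w = { 0 };
        unsigned long long t = 0;
        int i;

        for (i = 0; i < 3; i++)
                observe_completion(&w, 42, t += SLICE_IDLE_NS);
        printf("confirmed waker: %d\n", w.confirmed);
        return 0;
}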
- * - as a consequence, if some I/O of Q1/Q2 arrives while - * Q2/Q1 is the only queue in service, there is absolutely - * no point in delaying the service of such an I/O. The - * only possible result is a throughput loss - * - so, when the above condition holds, the best option is to - * have the new I/O dispatched as soon as possible - * - the most effective and efficient way to attain the above - * goal is to put the new I/O directly in the dispatch - * list - * - as an additional restriction, Q1 and Q2 must be the only - * busy queues for this commit to put the I/O of Q2/Q1 in - * the dispatch list. This is necessary, because, if also - * other queues are waiting for service, then putting new - * I/O directly in the dispatch list may evidently cause a - * violation of service guarantees for the other queues - */ - if (!bfqq || - (bfqq != bfqd->in_service_queue && - bfqd->in_service_queue != NULL && - bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && - (bfqq->waker_bfqq == bfqd->in_service_queue || - bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { + if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -6059,7 +6154,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * merge). */ cmd_flags = rq->cmd_flags; - spin_unlock_irq(&bfqd->lock); bfq_update_insert_stats(q, bfqq, idle_timer_disabled, @@ -6251,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { - bfqq->allocated--; - + bfqq_request_freed(bfqq); bfq_put_queue(bfqq); } @@ -6476,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->elv.priv[1] = NULL; } +static void bfq_finish_request(struct request *rq) +{ + bfq_finish_requeue_request(rq); + + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } +} + /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. @@ -6573,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { + rq->elv.icq = ioc_find_get_icq(rq->q); + /* * Regardless of whether we have an icq attached, we have to * clear the scheduler pointers, as they might point to @@ -6672,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) } } - bfqq->allocated++; + bfqq_request_allocated(bfqq); bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); @@ -6835,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int i, j, min_shallow = UINT_MAX; + unsigned int depth = 1U << bt->sb.shift; + bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -6851,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. 
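The comment above refers to the depth fractions computed just below; a standalone sketch of that shift arithmetic, for a hypothetical tag map of 64 tags (sb.shift == 6):

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

int main(void)
{
        unsigned int shift = 6, depth = 1U << shift;
        unsigned int word_depths[2][2];

        /* No weight-raised queues: 50% async, 75% sync writes. */
        word_depths[0][0] = max_u(depth >> 1, 1);
        word_depths[0][1] = max_u((depth * 3) >> 2, 1);
        /* Weight-raised queues present: ~18% async, ~37% sync writes. */
        word_depths[1][0] = max_u((depth * 3) >> 4, 1);
        word_depths[1][1] = max_u((depth * 6) >> 4, 1);

        printf("normal:        async %u/%u, sync writes %u/%u\n",
               word_depths[0][0], depth, word_depths[0][1], depth);
        printf("weight-raised: async %u/%u, sync writes %u/%u\n",
               word_depths[1][0], depth, word_depths[1][1], depth);
        return 0;
}

For 64 tags this prints 32, 48, 12 and 24, matching the percentages named in the comments.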
*/ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6867,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - min_shallow = min(min_shallow, bfqd->word_depths[i][j]); - - return min_shallow; + bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) @@ -7300,7 +7398,7 @@ static struct elevator_type iosched_bfq_mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, - .finish_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a73488eec8a4..07288b9da389 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -25,7 +25,7 @@ #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -#define MAX_PID_STR_LENGTH 12 +#define MAX_BFQQ_NAME_LENGTH 16 /* * Soft real-time applications are extremely more latency sensitive @@ -170,6 +170,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* Number of requests allocated in the subtree of this entity */ + int allocated; + /* device weight, if non-zero, it overrides the default weight of * bfq_group_data */ int dev_weight; @@ -266,8 +269,6 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of requests currently allocated */ - int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ @@ -387,6 +388,8 @@ struct bfq_queue { struct bfq_queue *tentative_waker_bfqq; /* number of times the same tentative waker has been detected */ unsigned int num_waker_detections; + /* time when we started considering this waker */ + u64 waker_detection_started; /* node for woken_list, see below */ struct hlist_node woken_list_node; @@ -768,6 +771,7 @@ struct bfq_data { * function) */ unsigned int word_depths[2][2]; + unsigned int full_depth_shift; }; enum bfqq_state_flags { @@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* --------------- end of 
interface of B-WF2Q+ ---------------- */ /* Logging facilities. */ -static inline void bfq_pid_to_str(int pid, char *str, int len) +static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) { - if (pid != -1) - snprintf(str, len, "%d", pid); + char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A'; + + if (bfqq->pid != -1) + snprintf(str, len, "bfq%d%c", bfqq->pid, type); else - snprintf(str, len, "SHARED-"); + snprintf(str, len, "bfqSHARED-%c", type); } #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ - "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ + "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ @@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ - blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) diff --git a/block/bio.c b/block/bio.c index 15ab0d6d1c06..6fadc977cd7f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -26,7 +26,7 @@ #include "blk-rq-qos.h" struct bio_alloc_cache { - struct bio_list free_list; + struct bio *free_list; unsigned int nr; }; @@ -630,7 +630,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int i = 0; struct bio *bio; - while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) @@ -689,7 +690,8 @@ void bio_put(struct bio *bio) bio_uninit(bio); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio_list_add_head(&cache->free_list, bio); + bio->bi_next = cache->free_list; + cache->free_list = bio; if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); put_cpu(); @@ -1704,8 +1706,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); - bio = bio_list_pop(&cache->free_list); - if (bio) { + if (cache->free_list) { + bio = cache->free_list; + cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio_init(bio, nr_vecs ? 
bio->bi_inline_vecs : NULL, nr_vecs); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 663aabfeba18..650f7e27989f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "blk.h" #include "blk-ioprio.h" #include "blk-throttle.h" diff --git a/block/blk-core.c b/block/blk-core.c index 1378d084c770..97f8bc8d3a79 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -40,6 +39,7 @@ #include #include #include +#include #include #include @@ -47,7 +47,6 @@ #include #include "blk.h" -#include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-throttle.h" @@ -67,6 +66,7 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -109,23 +109,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -void blk_rq_init(struct request_queue *q, struct request *rq) -{ - memset(rq, 0, sizeof(*rq)); - - INIT_LIST_HEAD(&rq->queuelist); - rq->q = q; - rq->__sector = (sector_t) -1; - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); - rq->part = NULL; - blk_crypto_rq_set_defaults(rq); -} -EXPORT_SYMBOL(blk_rq_init); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -216,38 +199,15 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); -void blk_print_req_error(struct request *req, blk_status_t status) +const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) - return; - - printk_ratelimited(KERN_ERR - "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", - blk_errors[idx].name, - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, - req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + return ""; + return blk_errors[idx].name; } -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? 
rq->rq_disk->disk_name : "?", - (unsigned long long) rq->cmd_flags); - - printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", - (unsigned long long)blk_rq_pos(rq), - blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, len %u\n", - rq->bio, rq->biotail, blk_rq_bytes(rq)); -} -EXPORT_SYMBOL(blk_dump_rq_flags); - /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue @@ -478,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) { struct request_queue *q; int ret; - q = kmem_cache_alloc_node(blk_requestq_cachep, - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; + if (alloc_srcu) { + blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); + if (init_srcu_struct(q->srcu) != 0) + goto fail_q; + } + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_srcu; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) @@ -549,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id) bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_srcu: + if (alloc_srcu) + cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); return NULL; } @@ -594,7 +563,7 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct block_device *part, unsigned int bytes) +bool should_fail_request(struct block_device *part, unsigned int bytes) { return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } @@ -608,15 +577,6 @@ static int __init fail_make_request_debugfs(void) } late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline bool should_fail_request(struct block_device *part, - unsigned int bytes) -{ - return false; -} - #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio) @@ -802,15 +762,6 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) break; } - /* - * Various block parts want %current->io_context, so allocate it up - * front rather than dealing with lots of pain to allocate it only - * where needed. This may fail and the block layer knows how to live - * with it. 
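With the removal just below, the submit path stops creating %current->io_context eagerly; ioc_find_get_icq() (added to blk-ioc.c later in this patch) creates it only when an icq consumer actually asks for it. A standalone, single-threaded sketch of this lazy-creation policy, with illustrative names only:

#include <stdio.h>
#include <stdlib.h>

struct io_ctx { long refcount; };

static struct io_ctx *task_ioc;         /* stands in for current->io_context */

static struct io_ctx *find_get_io_ctx(void)
{
        if (!task_ioc) {
                /* Allocated on first use, not on every submission. */
                task_ioc = calloc(1, sizeof(*task_ioc));
                if (!task_ioc)
                        return NULL;    /* callers must tolerate failure */
        }
        task_ioc->refcount++;
        return task_ioc;
}

int main(void)
{
        printf("before: %p\n", (void *)task_ioc);       /* NULL until needed */
        find_get_io_ctx();
        printf("after:  %p, refs %ld\n", (void *)task_ioc, task_ioc->refcount);
        free(task_ioc);
        return 0;
}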
- */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) return false; @@ -836,17 +787,21 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) { - if (unlikely(bio_queue_enter(bio) != 0)) - return; - if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); + if (blk_crypto_bio_prep(&bio)) { + if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } + } } static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + if (unlikely(!submit_bio_checks(bio))) + return; + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); else @@ -1090,135 +1045,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -/** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. - */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) -{ - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - - if (blk_rq_sectors(rq) > max_sectors) { - /* - * SCSI device does not have a good way to return if - * Write Same/Zero is actually supported. If a device rejects - * a non-read/write command (discard, write same,etc.) the - * low-level device driver will set the relevant queue limit to - * 0 to prevent blk-lib from issuing more of the offending - * operations. Commands queued prior to the queue limit being - * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O - * errors being propagated to upper layers. - */ - if (max_sectors == 0) - return BLK_STS_NOTSUPP; - - printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), max_sectors); - return BLK_STS_IOERR; - } - - /* - * The queue settings related to segment counting may differ from the - * original queue. - */ - rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); - return BLK_STS_IOERR; - } - - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) - return BLK_STS_IOERR; - - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; - - blk_account_io_start(rq); - - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq, true); -} -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); - -/** - * blk_rq_err_bytes - determine number of bytes till the next failure boundary - * @rq: request to examine - * - * Description: - * A request could be merge of IOs which require different failure - * handling. This function determines the number of bytes which - * can be failed from the beginning of the request without - * crossing into area which need to be retried further. - * - * Return: - * The number of bytes to fail. - */ -unsigned int blk_rq_err_bytes(const struct request *rq) -{ - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; - unsigned int bytes = 0; - struct bio *bio; - - if (!(rq->rq_flags & RQF_MIXED_MERGE)) - return blk_rq_bytes(rq); - - /* - * Currently the only 'mixing' which can happen is between - * different fastfail types. We can safely fail portions - * which have all the failfast bits that the first one has - - * the ones which are at least as eager to fail as the first - * one. - */ - for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_opf & ff) != ff) - break; - bytes += bio->bi_iter.bi_size; - } - - /* this could lead to infinite loop */ - BUG_ON(blk_rq_bytes(rq) && !bytes); - return bytes; -} -EXPORT_SYMBOL_GPL(blk_rq_err_bytes); - -static void update_io_ticks(struct block_device *part, unsigned long now, - bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: @@ -1233,30 +1060,6 @@ static void update_io_ticks(struct block_device *part, unsigned long now, } } -void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - -void __blk_account_io_start(struct request *rq) -{ - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; - - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); -} - static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { @@ -1320,46 +1123,6 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, } EXPORT_SYMBOL(disk_end_io_acct); -/* - * Steal bios from a request and add them to a bio list. - * The request must not have been partially completed before. 
- */ -void blk_steal_bios(struct bio_list *list, struct request *rq) -{ - if (rq->bio) { - if (list->tail) - list->tail->bi_next = rq->bio; - else - list->head = rq->bio; - list->tail = rq->biotail; - - rq->bio = NULL; - rq->biotail = NULL; - } - - rq->__data_len = 0; -} -EXPORT_SYMBOL_GPL(blk_steal_bios); - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -/** - * rq_flush_dcache_pages - Helper function to flush all pages in a request - * @rq: the request to be flushed - * - * Description: - * Flush all pages in @rq. - */ -void rq_flush_dcache_pages(struct request *rq) -{ - struct req_iterator iter; - struct bio_vec bvec; - - rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); -#endif - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked @@ -1388,93 +1151,6 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = &fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else { - rq->bio = rq->biotail = bio; - } - bio = NULL; - } - - /* Copy attributes of the original request to the clone request. 
*/ - rq->__sector = blk_rq_pos(rq_src); - rq->__data_len = blk_rq_bytes(rq_src); - if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; - rq->special_vec = rq_src->special_vec; - } - rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - - if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) - goto free_and_out; - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_rq_prep_clone); - int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); @@ -1639,6 +1315,9 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); + BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), + __alignof__(struct request_queue)) != + sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1649,6 +1328,10 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", + sizeof(struct request_queue) + + sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); + blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 605ba0626a5c..96c511967386 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -463,11 +463,6 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, } EXPORT_SYMBOL_GPL(blk_crypto_register); -void blk_crypto_unregister(struct request_queue *q) -{ - q->crypto_profile = NULL; -} - /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-exec.c b/block/blk-exec.c deleted file mode 100644 index 1b8b47f6e79b..000000000000 --- a/block/blk-exec.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to setting various queue properties from drivers - */ -#include -#include -#include -#include -#include -#include - -#include "blk.h" -#include "blk-mq-sched.h" - -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end I/O status of the request - */ -static void blk_end_sync_rq(struct request *rq, blk_status_t error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = (void *)(uintptr_t)error; - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} - -/** - * blk_execute_rq_nowait - insert a request to I/O scheduler for execution - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution. Don't wait for completion. - * - * Note: - * This function will invoke @done directly if the queue is dead. 
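The BUILD_BUG_ON and the second kmem cache added to blk_dev_init() above implement an "optional trailing member" layout: request_queue ends exactly where its srcu area may begin, so one struct definition serves both cache sizes. A standalone sketch of the pattern with stand-in types (not the kernel structures):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct srcu { int state; };

struct queue {
        int id;
        bool has_srcu;
        struct srcu srcu[];     /* flexible array member, possibly empty */
};

/*
 * The trailing member must start exactly at sizeof(struct queue);
 * this mirrors the ALIGN/offsetof BUILD_BUG_ON above and holds here
 * because the flexible member is naturally aligned.
 */
static_assert(offsetof(struct queue, srcu) == sizeof(struct queue),
              "srcu must be the suffix of struct queue");

static struct queue *alloc_queue(bool with_srcu)
{
        size_t size = sizeof(struct queue) +
                      (with_srcu ? sizeof(struct srcu) : 0);
        struct queue *q = calloc(1, size);

        if (q)
                q->has_srcu = with_srcu;
        return q;
}

int main(void)
{
        struct queue *q = alloc_queue(true);

        if (!q)
                return 1;
        q->srcu[0].state = 1;   /* safe only because has_srcu is set */
        printf("base size %zu, srcu size %zu\n", sizeof(struct queue),
               sizeof(struct queue) + sizeof(struct srcu));
        free(q);
        return 0;
}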
- */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) -{ - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); - - rq->rq_disk = bd_disk; - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -static bool blk_rq_is_poll(struct request *rq) -{ - if (!rq->mq_hctx) - return false; - if (rq->mq_hctx->type != HCTX_TYPE_POLL) - return false; - if (WARN_ON_ONCE(!rq->bio)) - return false; - return true; -} - -static void blk_rq_poll_completion(struct request *rq, struct completion *wait) -{ - do { - bio_poll(rq->bio, NULL, 0); - cond_resched(); - } while (!completion_done(wait)); -} - -/** - * blk_execute_rq - insert a request into queue for execution - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution and wait for completion. - * Return: The blk_status_t result provided to blk_mq_end_request(). - */ -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - unsigned long hang_check; - - rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - - if (blk_rq_is_poll(rq)) - blk_rq_poll_completion(rq, &wait); - else if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); - else - wait_for_completion_io(&wait); - - return (blk_status_t)(uintptr_t)rq->end_io_data; -} -EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index 1fce6d16e6d3..e4df894189ce 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,6 +69,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -95,6 +96,12 @@ enum { static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} + static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; @@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct block_device *part = rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (!refcount_dec_and_test(&flush_rq->ref)) { + if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return; @@ -235,8 +242,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free. 
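For context on the synchronous helper removed from blk-exec.c above: it parks the submitter on an on-stack completion that the end_io callback signals, and the callback must record the status before the wakeup, since the waiter's stack frame can become invalid immediately afterwards. A userspace sketch of that ordering with POSIX threads (illustrative only, not the kernel primitives):

#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
        int status;             /* stands in for rq->end_io_data */
};

static void *end_io(void *arg)
{
        struct completion *c = arg;

        pthread_mutex_lock(&c->lock);
        c->status = 0;                  /* record the result first... */
        c->done = 1;
        pthread_cond_signal(&c->cond);  /* ...and signal last */
        pthread_mutex_unlock(&c->lock);
        return NULL;
}

int main(void)
{
        struct completion c = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, -1
        };
        pthread_t t;

        pthread_create(&t, NULL, end_io, &c);
        pthread_mutex_lock(&c.lock);
        while (!c.done)
                pthread_cond_wait(&c.cond, &c.lock);
        pthread_mutex_unlock(&c.lock);
        pthread_join(t, NULL);
        printf("request completed, status %d\n", c.status);
        return 0;
}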
*/ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); - if (fq->rq_status != BLK_STS_OK) + if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; + fq->rq_status = BLK_STS_OK; + } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; @@ -332,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one @@ -341,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * and READ flush_rq->end_io */ smp_wmb(); - refcount_set(&flush_rq->ref, 1); + req_ref_set(flush_rq, 1); blk_flush_queue_rq(flush_rq, false); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d670d54e5f7a..69eed260a823 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -411,7 +411,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - blk_crypto_unregister(disk->queue); + disk->queue->crypto_profile = NULL; } #endif } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..11f49f78db32 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,22 +8,25 @@ #include #include #include +#include #include #include "blk.h" +#include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; +#ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ -void get_io_context(struct io_context *ioc) +static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); @@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } +static void ioc_exit_icqs(struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) + ioc_exit_icq(icq); + spin_unlock_irq(&ioc->lock); +} + /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work) kmem_cache_free(iocontext_cachep, ioc); } -/** - * put_io_context - put a reference of io_context - * @ioc: io_context to put - * - * Decrement reference count of @ioc and release it if the count reaches - * zero. +/* + * Releasing icqs requires reverse order double locking and we may already be + * holding a queue_lock. Do it asynchronously from a workqueue. */ -void put_io_context(struct io_context *ioc) -{ - unsigned long flags; - bool free_ioc = false; - - if (ioc == NULL) - return; - - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - - /* - * Releasing ioc requires reverse order double locking and we may - * already be holding a queue_lock. Do it asynchronously from wq. 
- */ - if (atomic_long_dec_and_test(&ioc->refcount)) { - spin_lock_irqsave(&ioc->lock, flags); - if (!hlist_empty(&ioc->icq_list)) - queue_work(system_power_efficient_wq, - &ioc->release_work); - else - free_ioc = true; - spin_unlock_irqrestore(&ioc->lock, flags); - } - - if (free_ioc) - kmem_cache_free(iocontext_cachep, ioc); -} - -/** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest - * - * Undo get_io_context_active(). If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. - */ -void put_io_context_active(struct io_context *ioc) -{ - struct io_cq *icq; - - if (!atomic_dec_and_test(&ioc->active_ref)) { - put_io_context(ioc); - return; - } - - spin_lock_irq(&ioc->lock); - hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { - if (icq->flags & ICQ_EXITED) - continue; - - ioc_exit_icq(icq); - } - spin_unlock_irq(&ioc->lock); - - put_io_context(ioc); -} - -/* Called by the exiting task */ -void exit_io_context(struct task_struct *task) -{ - struct io_context *ioc; - - task_lock(task); - ioc = task->io_context; - task->io_context = NULL; - task_unlock(task); - - atomic_dec(&ioc->nr_tasks); - put_io_context_active(ioc); -} - -static void __ioc_clear_queue(struct list_head *icq_list) +static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; - rcu_read_lock(); - while (!list_empty(icq_list)) { - struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; - - spin_lock_irqsave(&ioc->lock, flags); - if (icq->flags & ICQ_DESTROYED) { - spin_unlock_irqrestore(&ioc->lock, flags); - continue; - } - ioc_destroy_icq(icq); + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) { + queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); + return true; } - rcu_read_unlock(); + spin_unlock_irqrestore(&ioc->lock, flags); + return false; } /** @@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q) list_splice_init(&q->icq_list, &icq_list); spin_unlock_irq(&q->queue_lock); - __ioc_clear_queue(&icq_list); -} + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); +} +#else /* CONFIG_BLK_ICQ */ +static inline void ioc_exit_icqs(struct io_context *ioc) +{ +} +static inline bool ioc_delay_free(struct io_context *ioc) +{ + return false; +} +#endif /* CONFIG_BLK_ICQ */ + +/** + * put_io_context - put a reference of io_context + * @ioc: io_context to put + * + * Decrement reference count of @ioc and release it if the count reaches + * zero. 
+ */ +void put_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) + kmem_cache_free(iocontext_cachep, ioc); +} +EXPORT_SYMBOL_GPL(put_io_context); + +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) +{ + struct io_context *ioc; + + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + if (atomic_dec_and_test(&ioc->active_ref)) { + ioc_exit_icqs(ioc); + put_io_context(ioc); + } +} + +static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return NULL; - /* initialize */ atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); +#ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); +#endif + return ioc; +} + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + task_lock(task); + if (unlikely(!task->io_context)) { + struct io_context *ioc; + + task_unlock(task); + + ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); + if (!ioc) + return -ENOMEM; + + task_lock(task); + if (task->flags & PF_EXITING) { + err = -ESRCH; + kmem_cache_free(iocontext_cachep, ioc); + goto out; + } + if (task->io_context) + kmem_cache_free(iocontext_cachep, ioc); + else + task->io_context = ioc; + } + task->io_context->ioprio = ioprio; +out: + task_unlock(task); + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; /* - * Try to install. ioc shouldn't be installed if someone else - * already did or @task, which isn't %current, is exiting. Note - * that we need to allow ioc creation on exiting %current as exit - * path may issue IOs from e.g. exit_files(). The exit path is - * responsible for not issuing IO after exit_io_context(). + * Share io context with parent, if CLONE_IO is set */ - task_lock(task); - if (!task->io_context && - (task == current || !(task->flags & PF_EXITING))) - task->io_context = ioc; - else - kmem_cache_free(iocontext_cachep, ioc); + if (clone_flags & CLONE_IO) { + atomic_inc(&ioc->active_ref); + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); + if (!tsk->io_context) + return -ENOMEM; + tsk->io_context->ioprio = ioc->ioprio; + } - ret = task->io_context ? 0 : -EBUSY; - - task_unlock(task); - - return ret; -} - -/** - * get_task_io_context - get io_context of a task - * @task: task of interest - * @gfp_flags: allocation flags, used if allocation is necessary - * @node: allocation node, used if allocation is necessary - * - * Return io_context of @task. If it doesn't exist, it is created with - * @gfp_flags and @node. The returned io_context has its reference count - * incremented. 
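set_task_ioprio() above has to drop task_lock() to allocate, then retake it and tolerate a concurrent installer by discarding the spare object. A standalone sketch of that allocate-and-recheck dance (plain mutex, hypothetical structures, no PF_EXITING handling):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_ctx { int ioprio; };

struct task {
        pthread_mutex_t lock;
        struct io_ctx *ioc;
};

static int task_set_ioprio(struct task *t, int ioprio)
{
        pthread_mutex_lock(&t->lock);
        if (!t->ioc) {
                struct io_ctx *ioc;

                pthread_mutex_unlock(&t->lock);
                ioc = calloc(1, sizeof(*ioc));  /* may block or fail */
                if (!ioc)
                        return -1;
                pthread_mutex_lock(&t->lock);
                if (t->ioc)
                        free(ioc);      /* someone beat us to it */
                else
                        t->ioc = ioc;
        }
        t->ioc->ioprio = ioprio;
        pthread_mutex_unlock(&t->lock);
        return 0;
}

int main(void)
{
        struct task t = { PTHREAD_MUTEX_INITIALIZER, NULL };

        task_set_ioprio(&t, 4);
        printf("ioprio: %d\n", t.ioc->ioprio);
        free(t.ioc);
        return 0;
}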
- * - * This function always goes through task_lock() and it's better to use - * %current->io_context + get_io_context() for %current. - */ -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - do { - task_lock(task); - ioc = task->io_context; - if (likely(ioc)) { - get_io_context(ioc); - task_unlock(task); - return ioc; - } - task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); - - return NULL; + return 0; } +#ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc - * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +struct io_cq *ioc_lookup_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); @@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest - * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. @@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; - if (radix_tree_maybe_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } @@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } @@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +struct io_cq *ioc_find_get_icq(struct request_queue *q) +{ + struct io_context *ioc = current->io_context; + struct io_cq *icq = NULL; + + if (unlikely(!ioc)) { + ioc = alloc_io_context(GFP_ATOMIC, q->node); + if (!ioc) + return NULL; + + task_lock(current); + if (current->io_context) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = current->io_context; + } else { + current->io_context = ioc; + } + + get_io_context(ioc); + task_unlock(current); + } else { + get_io_context(ioc); + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(q); + spin_unlock_irq(&q->queue_lock); + } + + if (!icq) { + icq = ioc_create_icq(q); + if (!icq) { + put_io_context(ioc); + return NULL; + } + } + return icq; +} +EXPORT_SYMBOL_GPL(ioc_find_get_icq); +#endif /* CONFIG_BLK_ICQ */ + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 332a07761bf8..2e7f10e1c03f 100644 --- a/block/blk-ioprio.c +++ 
b/block/blk-ioprio.c @@ -62,6 +62,7 @@ struct ioprio_blkg { struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; + bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - + blkcg->prio_set = true; return nbytes; } @@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + u16 prio; + + if (!blkcg->prio_set) + return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers @@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, * bio I/O priority is not modified. If the bio I/O priority equals * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. */ - bio->bi_ioprio = max_t(u16, bio->bi_ioprio, - IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + prio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + if (prio > bio->bi_ioprio) + bio->bi_ioprio = prio; } static void blkcg_ioprio_exit(struct rq_qos *rqos) diff --git a/block/blk-merge.c b/block/blk-merge.c index 893c1a60b701..4de34a332c9f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -8,10 +8,12 @@ #include #include #include +#include #include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-throttle.h" @@ -775,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk) + if (rq_data_dir(req) != rq_data_dir(next)) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -903,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device */ - if (rq->rq_disk != bio->bi_bdev->bd_disk) - return false; - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; @@ -1067,7 +1064,6 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: output value, will be true if there's an existing request * from the passed in @q already in the plug list * * Determine whether @bio being queued on @q can be merged with the previous @@ -1084,7 +1080,7 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. 
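A standalone sketch of the fast path this helper documents: only the most recently plugged request is probed for a quick merge, as the body below shows with rq_list_peek(). Structures here are simplified stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct rq { int queue_id; long end_sector; };

static bool try_back_merge(struct rq *rq, int queue_id, long bio_sector)
{
        /* Merge only within the same queue, and only contiguously. */
        return rq && rq->queue_id == queue_id && rq->end_sector == bio_sector;
}

int main(void)
{
        struct rq last = { .queue_id = 1, .end_sector = 2048 };

        /* A bio starting where the plugged request ends can merge. */
        printf("merge: %s\n", try_back_merge(&last, 1, 2048) ? "yes" : "no");
        printf("merge: %s\n", try_back_merge(&last, 1, 4096) ? "yes" : "no");
        return 0;
}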
*/ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool *same_queue_rq) + unsigned int nr_segs) { struct blk_plug *plug; struct request *rq; @@ -1096,12 +1092,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, /* check the previously added entry for a quick merge attempt */ rq = rq_list_peek(&plug->mq_list); if (rq->q == q) { - /* - * Only blk-mq multiple hardware queues case checks the rq in - * the same queue, there should be only one such rq in a queue - */ - *same_queue_rq = true; - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) return true; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4f2cf8399f3d..3a790eb4995c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -11,6 +11,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-rq-qos.h" @@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) struct request_queue *q = data; int bucket; + if (!q->poll_stat) + return 0; + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); print_stat(m, &q->poll_stat[2 * bucket]); @@ -122,7 +126,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), - QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ba21449439cc..55488ba97823 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,32 +18,6 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct request_queue *q = rq->q; - struct io_context *ioc; - struct io_cq *icq; - - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return; - - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); - - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } - get_io_context(icq->ioc); - rq->elv.icq = icq; -} - /* * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. @@ -501,7 +475,8 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * us one extra enqueue & dequeue to sw queue. 
*/ if (!hctx->dispatch_busy && !run_queue_async) { - blk_mq_try_issue_list_directly(hctx, list); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 25d1034952b6..025013972453 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,8 +8,6 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -void blk_mq_sched_assign_ioc(struct request *rq); - bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 253c857cba47..674786574075 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->srcu); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 995336abee33..e55a6834c9a6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -215,7 +215,8 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) struct bt_iter_data { struct blk_mq_hw_ctx *hctx; - busy_iter_fn *fn; + struct request_queue *q; + busy_tag_iter_fn *fn; void *data; bool reserved; }; @@ -228,7 +229,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; - if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) + if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; @@ -238,11 +239,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; bool reserved = iter_data->reserved; + struct blk_mq_tags *tags; struct request *rq; bool ret = true; + if (blk_mq_is_shared_tags(set->flags)) + tags = set->shared_tags; + else + tags = hctx->tags; + if (!reserved) bitnr += tags->nr_reserved_tags; /* @@ -253,8 +261,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!rq) return true; - if (rq->q == hctx->queue && rq->mq_hctx == hctx) - ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) + ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; } @@ -262,6 +270,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request @@ -273,14 +282,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. 
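With busy_iter_fn gone, callers of blk_mq_queue_tag_busy_iter() implement the same three-argument busy_tag_iter_fn already used by the tagset iterators; the hctx is no longer passed in. A minimal caller sketch (hypothetical helper, illustration only):

	static bool count_started(struct request *rq, void *priv, bool reserved)
	{
		unsigned int *count = priv;

		if (blk_mq_request_started(rq))
			(*count)++;
		return true;	/* true == keep iterating */
	}

	/* usage: blk_mq_queue_tag_busy_iter(q, count_started, &count); */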
*/ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_tag_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -457,12 +468,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid @@ -471,19 +479,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; - queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index df787b5a23bd..5668e28be0b7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,7 @@ extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..a6d4780580fc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -126,8 +127,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, +static bool blk_mq_check_inflight(struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; @@ -259,17 +259,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); 
*/ void blk_mq_wait_quiesce_done(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) + if (blk_queue_has_srcu(q)) + synchronize_srcu(q->srcu); + else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); @@ -327,6 +319,23 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } +void blk_rq_init(struct request_queue *q, struct request *rq) +{ + memset(rq, 0, sizeof(*rq)); + + INIT_LIST_HEAD(&rq->queuelist); + rq->q = q; + rq->__sector = (sector_t) -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; + rq->start_time_ns = ktime_get_ns(); + rq->part = NULL; + blk_crypto_rq_set_defaults(rq); +} +EXPORT_SYMBOL(blk_rq_init); + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) { @@ -359,7 +368,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; @@ -377,20 +385,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); - refcount_set(&rq->ref, 1); + req_ref_set(rq, 1); if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; - rq->elv.icq = NULL; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (!op_is_flush(data->cmd_flags) && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); - e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } @@ -616,16 +620,9 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (rq->rq_flags & RQF_ELVPRIV) { - struct elevator_queue *e = q->elevator; - - if (e->type->ops.finish_request) - e->type->ops.finish_request(rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } + if ((rq->rq_flags & RQF_ELVPRIV) && + q->elevator->type->ops.finish_request) + q->elevator->type->ops.finish_request(rq); if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); @@ -636,7 +633,7 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (refcount_dec_and_test(&rq->ref)) + if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -649,6 +646,20 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) blk_mq_free_request(rq); } +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->q->disk ? 
rq->q->disk->disk_name : "?", + (unsigned long long) rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); +} +EXPORT_SYMBOL(blk_dump_rq_flags); + static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { @@ -685,6 +696,60 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } +static void blk_print_req_error(struct request *req, blk_status_t status) +{ + printk_ratelimited(KERN_ERR + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", + blk_status_to_str(status), + req->q->disk ? req->q->disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); +} + +/* + * Fully end IO on a request. Does not support partial completions, or + * errors. + */ +static void blk_complete_request(struct request *req) +{ + const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; + int total_bytes = blk_rq_bytes(req); + struct bio *bio = req->bio; + + trace_block_rq_complete(req, BLK_STS_OK, total_bytes); + + if (!bio) + return; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) + req->q->integrity.profile->complete_fn(req, total_bytes); +#endif + + blk_account_io_completion(req, total_bytes); + + do { + struct bio *next = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + if (!is_flush) + bio_endio(bio); + bio = next; + } while (bio); + + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->bio = NULL; + req->__data_len = 0; +} + /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed @@ -791,6 +856,48 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); +static void __blk_account_io_done(struct request *req, u64 now) +{ + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); +} + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. 
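blk_complete_request() above exists so that blk_mq_end_request_batch() (next hunk) can finish a whole request in one call instead of going through blk_update_request(). On the driver side the batch is fed through the io_comp_batch machinery; a rough sketch with hypothetical driver names, mirroring how a completion-ring reaper would use it:

	static void mydrv_complete_batch(struct io_comp_batch *iob)
	{
		/* per-request DMA unmap etc. would go here */
		blk_mq_end_request_batch(iob);
	}

	DEFINE_IO_COMP_BATCH(iob);

	/* for each completed request found while reaping the ring: */
	if (!blk_mq_add_to_batch(rq, &iob, 0, mydrv_complete_batch))
		blk_mq_end_request(rq, BLK_STS_OK);	/* batching declined, end singly */

	/* once the ring is drained: */
	if (!rq_list_empty(iob.req_list))
		mydrv_complete_batch(&iob);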
+ */ + if (blk_do_io_stat(req) && req->part && + !(req->rq_flags & RQF_FLUSH_SEQ)) + __blk_account_io_done(req, now); +} + +static void __blk_account_io_start(struct request *rq) +{ + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else if (rq->q->disk) + rq->part = rq->q->disk->part0; + + part_stat_lock(); + update_io_ticks(rq->part, jiffies, false); + part_stat_unlock(); +} + +static inline void blk_account_io_start(struct request *req) +{ + if (blk_do_io_stat(req)) + __blk_account_io_start(req); +} + static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) { @@ -856,14 +963,14 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) prefetch(rq->bio); prefetch(rq->rq_next); - blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); + blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); rq_qos_done(rq->q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (!refcount_dec_and_test(&rq->ref)) + if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); @@ -996,26 +1103,6 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) - __releases(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - rcu_read_unlock(); - else - srcu_read_unlock(hctx->srcu, srcu_idx); -} - -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) - __acquires(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - /* shut up gcc false positive */ - *srcu_idx = 0; - rcu_read_lock(); - } else - *srcu_idx = srcu_read_lock(hctx->srcu); -} - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started @@ -1058,6 +1145,107 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end I/O status of the request + */ +static void blk_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = (void *)(uintptr_t)error; + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} + +/** + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. 
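Since the request now carries its queue and disk, both execute helpers lose the gendisk argument (compare the bsg-lib caller later in this patch). A minimal sketch of the new calling convention for a passthrough command, using hypothetical driver code:

	static void mydrv_done(struct request *rq, blk_status_t status)
	{
		blk_mq_free_request(rq);
	}

	static int mydrv_send_cmd(struct request_queue *q)
	{
		struct request *rq;

		rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		blk_execute_rq_nowait(rq, false, mydrv_done);	/* async */
		return 0;
	}

The synchronous variant is now "blk_status_t status = blk_execute_rq(rq, false);", with the status carried back through rq->end_io_data as blk_end_sync_rq() below shows.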
+ */ +void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io = done; + + blk_account_io_start(rq); + + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +static bool blk_rq_is_poll(struct request *rq) +{ + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + if (WARN_ON_ONCE(!rq->bio)) + return false; + return true; +} + +static void blk_rq_poll_completion(struct request *rq, struct completion *wait) +{ + do { + bio_poll(rq->bio, NULL, 0); + cond_resched(); + } while (!completion_done(wait)); +} + +/** + * blk_execute_rq - insert a request into queue for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). + */ +blk_status_t blk_execute_rq(struct request *rq, bool at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + unsigned long hang_check; + + rq->end_io_data = &wait; + blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + + if (blk_rq_is_poll(rq)) + blk_rq_poll_completion(rq, &wait); + else if (hang_check) + while (!wait_for_completion_io_timeout(&wait, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; +} +EXPORT_SYMBOL(blk_execute_rq); + static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -1160,14 +1348,15 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv, + bool reserved) { /* - * If we find a request that isn't idle and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * If we find a request that isn't idle we know the queue is busy + * as it's checked in the iter. + * Return false to stop the iteration. */ - if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + if (blk_mq_request_started(rq)) { bool *busy = priv; *busy = true; @@ -1225,12 +1414,11 @@ void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) + else if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } -static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool reserved) +static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; @@ -1771,19 +1959,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { - int srcu_idx; - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early. 
*/ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - blk_mq_sched_dispatch_requests(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -1895,7 +2078,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - int srcu_idx; bool need_run; /* @@ -1906,10 +2088,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - hctx_lock(hctx, &srcu_idx); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - hctx_unlock(hctx, srcu_idx); + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); @@ -2202,98 +2383,6 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, *queued = 0; } -static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *hctx = NULL; - struct request *rq; - int queued = 0; - int errors = 0; - - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); - blk_status_t ret; - - if (hctx != rq->mq_hctx) { - if (hctx) - blk_mq_commit_rqs(hctx, &queued, from_schedule); - hctx = rq->mq_hctx; - } - - ret = blk_mq_request_issue_directly(rq, last); - switch (ret) { - case BLK_STS_OK: - queued++; - break; - case BLK_STS_RESOURCE: - case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, last); - blk_mq_commit_rqs(hctx, &queued, from_schedule); - return; - default: - blk_mq_end_request(rq, ret); - errors++; - break; - } - } - - /* - * If we didn't flush the entire list, we could have told the driver - * there was more coming, but that turned out to be a lie. 
- */ - if (errors) - blk_mq_commit_rqs(hctx, &queued, from_schedule); -} - -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; - unsigned int depth; - LIST_HEAD(list); - - if (rq_list_empty(plug->mq_list)) - return; - plug->rq_count = 0; - - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, false); - if (rq_list_empty(plug->mq_list)) - return; - } - - this_hctx = NULL; - this_ctx = NULL; - depth = 0; - do { - struct request *rq; - - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; - } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } -} - static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { @@ -2404,33 +2493,141 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_status_t ret; - int srcu_idx; + blk_status_t ret = + __blk_mq_try_issue_directly(hctx, rq, false, true); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_status_t ret; - int srcu_idx; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); +} - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, true, last); - hctx_unlock(hctx, srcu_idx); +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; - return ret; + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. 
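The "turned out to be a lie" comment is about bd->last and ->commit_rqs(): a driver may defer its doorbell write until it sees a request flagged as the last of a batch, so if the batch ends early (resource shortage or an error) the core must kick the driver once more. A hypothetical driver hook, for illustration only:

	static void mydrv_commit_rqs(struct blk_mq_hw_ctx *hctx)
	{
		struct mydrv_queue *mq = hctx->driver_data;

		/* flush submissions that were queued with bd->last == false */
		writel(mq->sq_tail, mq->doorbell);
	}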
+ */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); +} + +static void __blk_mq_flush_plug_list(struct request_queue *q, + struct blk_plug *plug) +{ + if (blk_queue_quiesced(q)) + return; + q->mq_ops->queue_rqs(&plug->mq_list); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + struct request *rq; + unsigned int depth; + LIST_HEAD(list); + + if (rq_list_empty(plug->mq_list)) + return; + plug->rq_count = 0; + + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + struct request_queue *q; + + rq = rq_list_peek(&plug->mq_list); + q = rq->q; + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole plug list in one go. We + * already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. + * + * Since we pass off the full list to the driver at this point, + * we do not increment the active request count for the queue. + * Bypass shared tags for now because of that. + */ + if (q->mq_ops->queue_rqs && + !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_run_dispatch_ops(q, + __blk_mq_flush_plug_list(q, plug)); + if (rq_list_empty(plug->mq_list)) + return; + } + + blk_mq_run_dispatch_ops(q, + blk_mq_plug_issue_direct(plug, false)); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; + do { + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + + } + + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, + from_schedule); + } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2469,21 +2666,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - if (!plug->multiple_queues) { - struct request *nxt = rq_list_peek(&plug->mq_list); - - if (nxt && nxt->q != rq->q) - plug->multiple_queues = true; - } - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. 
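The ->queue_rqs() fast path added above hands the entire plug list to the driver in a single call, skipping per-request dispatch. A hypothetical implementation shape (illustration only; NVMe gained a real one in the same development cycle):

	static void mydrv_queue_rqs(struct request **rqlist)
	{
		struct request *rq;

		while ((rq = rq_list_pop(rqlist))) {
			blk_mq_start_request(rq);
			mydrv_submit_one(rq);		/* hypothetical per-request setup */
		}
		mydrv_ring_doorbell();			/* one doorbell for the whole batch */
	}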
This is important for md arrays to benefit from merging @@ -2496,12 +2678,33 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + static bool blk_mq_attempt_bio_merge(struct request_queue *q, - struct bio *bio, unsigned int nr_segs, - bool *same_queue_rq) + struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { - if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; @@ -2511,9 +2714,7 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) + struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, @@ -2522,11 +2723,9 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + if (unlikely(bio_queue_enter(bio))) return NULL; - rq_qos_throttle(q, bio); - if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2536,66 +2735,35 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq = __blk_mq_alloc_requests(&data); if (rq) return rq; - rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - - return NULL; -} - -static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) -{ - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) - return false; - - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; - - return true; -} - -static inline struct request *blk_mq_get_request(struct request_queue *q, - struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) -{ - struct request *rq; - bool checked = false; - - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q == q) { - if (unlikely(!submit_bio_checks(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, - same_queue_rq)) - return NULL; - checked = true; - if (!blk_mq_can_use_cached_rq(rq, bio)) - goto fallback; - rq->cmd_flags = bio->bi_opf; - plug->cached_rq = rq_list_next(rq); - INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, bio); - return rq; - } - } - -fallback: - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (unlikely(!checked && !submit_bio_checks(bio))) - goto out_put; - rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); - if (rq) - return rq; -out_put: blk_queue_exit(q); return NULL; } +static inline struct request *blk_mq_get_cached_request(struct request_queue *q, + struct blk_plug *plug, struct bio *bio) +{ + struct request *rq; + + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || 
rq->q != q) + return NULL; + + if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return NULL; + + rq->cmd_flags = bio->bi_opf; + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + return rq; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2612,10 +2780,9 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(q, bio); const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; - struct blk_plug *plug; - bool same_queue_rq = false; unsigned int nr_segs = 1; blk_status_t ret; @@ -2629,11 +2796,18 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); - if (unlikely(!rq)) + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; + rq_qos_throttle(q, bio); + + rq = blk_mq_get_cached_request(q, plug, bio); + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio); + if (unlikely(!rq)) + return; + } + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -2653,69 +2827,212 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { - /* - * Use plugging if we have a ->commit_rqs() hook as well, as - * we know the driver uses bd->last in a smart fashion. - * - * Use normal plugging if this disk is slow HDD, as sequential - * IO may benefit a lot from plug merging. - */ - unsigned int request_count = plug->rq_count; - struct request *last = NULL; - - if (!request_count) { - trace_block_plug(q); - } else if (!blk_queue_nomerges(q)) { - last = rq_list_peek(&plug->mq_list); - if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) - last = NULL; - } - - if (request_count >= blk_plug_max_rq_count(plug) || last) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(q); - } - + if (plug) blk_add_rq_to_plug(plug, rq); - } else if (rq->rq_flags & RQF_ELV) { - /* Insert the request at the IO scheduler queue */ + else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); - } else if (plug && !blk_queue_nomerges(q)) { - struct request *next_rq = NULL; + else + blk_mq_run_dispatch_ops(rq->q, + blk_mq_try_issue_directly(rq->mq_hctx, rq)); +} - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - * The plug list might get flushed before this. If that happens, - * the plug list is empty, and same_queue_rq is invalid. - */ - if (same_queue_rq) { - next_rq = rq_list_pop(&plug->mq_list); - plug->rq_count--; - } - blk_add_rq_to_plug(plug, rq); - trace_block_plug(q); +/** + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for the new queue limits + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. 
+ * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * Request stacking drivers like request-based dm may change the queue + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. + */ +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) +{ + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - if (next_rq) { - trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); - } - } else if ((q->nr_hw_queues > 1 && is_sync) || - !rq->mq_hctx->dispatch_busy) { + if (blk_rq_sectors(rq) > max_sectors) { /* - * There is no scheduler and we can try to send directly - * to the hardware. + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same, etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. */ - blk_mq_try_issue_directly(rq->mq_hctx, rq); - } else { - /* Default case. */ - blk_mq_sched_insert_request(rq, false, true, true); + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * The queue settings related to segment counting may differ from the + * original queue. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > queue_max_segments(q)) { + printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", + __func__, rq->nr_phys_segments, queue_max_segments(q)); + return BLK_STS_IOERR; + } + + return BLK_STS_OK; +} + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; + + if (rq->q->disk && + should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + blk_mq_run_dispatch_ops(rq->q, + ret = blk_mq_request_issue_directly(rq, true)); + return ret; +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request.
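For context, request-based dm is the main user of this trio of helpers; the pairing looks roughly like this (sketch; passing a NULL bio_ctr is allowed since the constructor is optional, and the dm-style requeue on failure is only illustrative):

	struct request *clone;	/* allocated on the underlying device's queue */
	blk_status_t status;

	if (blk_rq_prep_clone(clone, rq, bs, GFP_ATOMIC, NULL, NULL))
		return DM_MAPIO_REQUEUE;	/* -ENOMEM: retry later */

	status = blk_insert_cloned_request(clone->q, clone);
	if (status != BLK_STS_OK)
		blk_rq_unprep_clone(clone);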
+ */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); } } +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. + */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_clone_fast(bio_src, gfp_mask, bs); + if (!bio) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; + } + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + + if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) + goto free_and_out; + + return 0; + +free_and_out: + if (bio) + bio_put(bio); + blk_rq_unprep_clone(rq); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); + +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. 
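blk_steal_bios() is the failover primitive (nvme multipath is the classic caller): detach the bios, end the now-empty request, and resubmit the bios elsewhere. A sketch of that pattern:

	struct bio_list requeue = BIO_EMPTY_LIST;
	struct bio *bio;

	blk_steal_bios(&requeue, rq);
	blk_mq_end_request(rq, BLK_STS_OK);	/* rq no longer owns any bios */

	while ((bio = bio_list_pop(&requeue)))
		submit_bio(bio);		/* e.g. down another path */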
+ */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { @@ -2743,7 +3060,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } @@ -3077,7 +3394,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, if (!tags) return; - WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); @@ -3131,20 +3448,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, } } -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) -{ - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); - - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); - - return hw_ctx_size; -} - static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) @@ -3182,7 +3485,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; @@ -3224,8 +3527,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); blk_mq_hctx_kobj_init(hctx); return hctx; @@ -3561,7 +3862,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); + q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -3710,6 +4011,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { + WARN_ON_ONCE(blk_queue_has_srcu(q) != + !!(set->flags & BLK_MQ_F_BLOCKING)); + /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -4246,11 +4550,10 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); /* Enable polling stats and return whether they were already enabled. */ static bool blk_poll_stats_enable(struct request_queue *q) { - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) + if (q->poll_stat) return true; - blk_stat_add_callback(q, q->poll_cb); - return false; + + return blk_stats_alloc_enable(q); } static void blk_mq_poll_stats_start(struct request_queue *q) @@ -4259,8 +4562,7 @@ static void blk_mq_poll_stats_start(struct request_queue *q) * We don't arm the callback if polling stats are not enabled or the * callback is already active. 
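blk_alloc_queue() growing an alloc_srcu flag in this hunk, together with the blk_get_queue_kmem_cache() helper added to blk.h below, implies two slab caches: BLK_MQ_F_BLOCKING drivers get a request_queue with a trailing srcu_struct, everyone else keeps the slim layout. Presumably blk-core.c, which is not part of this excerpt, sets them up along these lines (sketch):

	blk_requestq_cachep = kmem_cache_create("request_queue",
			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
			sizeof(struct request_queue) +
			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);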
*/ - if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_stat_is_active(q->poll_cb)) + if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) return; blk_stat_activate_msecs(q->poll_cb, 100); diff --git a/block/blk-mq.h b/block/blk-mq.h index afcf9931a489..948791ea2a3e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -65,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); - -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); @@ -377,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return __blk_mq_active_requests(hctx) < depth; } +/* run the code block in @dispatch_ops with rcu/srcu read lock held */ +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ +do { \ + if (!blk_queue_has_srcu(q)) { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ + } else { \ + int srcu_idx; \ + \ + might_sleep_if(check_sleep); \ + srcu_idx = srcu_read_lock((q)->srcu); \ + (dispatch_ops); \ + srcu_read_unlock((q)->srcu, srcu_idx); \ + } \ +} while (0) + +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ #endif diff --git a/block/blk-stat.c b/block/blk-stat.c index ae3dd1fb8e61..2ea01b5c1aca 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -15,7 +15,7 @@ struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; - bool enable_accounting; + int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) @@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q, spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); @@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +void blk_stat_disable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + if (!--q->stats->accounting) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); + void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - q->stats->enable_accounting = true; - blk_queue_flag_set(QUEUE_FLAG_STATS, q); + if (!q->stats->accounting++) + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); @@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); - stats->enable_accounting = false; + stats->accounting = 0; return stats; } @@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } + +bool blk_stats_alloc_enable(struct request_queue *q) +{ + struct blk_rq_stat *poll_stat; + + poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), + GFP_ATOMIC); + if (!poll_stat) + return false; + + if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { + 
kfree(poll_stat); + return true; + } + + blk_stat_add_callback(q, q->poll_cb); + return false; +} diff --git a/block/blk-stat.h b/block/blk-stat.h index 17b47a86eefb..17e1eb4ec7e2 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,11 +64,13 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); +bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); +void blk_stat_disable_accounting(struct request_queue *q); /** * blk_stat_alloc_callback() - Allocate a block statistics callback. diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cd75b0f73dc6..e20eadfcf5c8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" #include "blk-throttle.h" @@ -734,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, q); + + kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ @@ -747,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - __elevator_exit(q, q->elevator); + elevator_exit(q); } /* @@ -785,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - blk_free_queue_stats(q->stats); - blk_exit_queue(q); + blk_free_queue_stats(q->stats); + kfree(q->poll_stat); + blk_queue_free_zone_bitmaps(q); if (queue_is_mq(q)) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 39bb6e68a9a2..7c462c006b26 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ diff --git a/block/blk.h b/block/blk.h index ccde6e6f1736..8bd43b3ad33d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,15 +2,10 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include -#include -#include #include #include /* for max_pfn/max_low_pfn */ #include #include "blk-crypto-internal.h" -#include "blk-mq.h" -#include "blk-mq-sched.h" struct elevator_type; @@ -32,15 +27,10 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) -{ - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; -} - static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); @@ -250,16 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -void blk_print_req_error(struct request *req, blk_status_t status); +const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool 
*same_queue_rq); + unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); -void __blk_account_io_start(struct request *req); -void __blk_account_io_done(struct request *req, u64 now); - /* * Plug flush limits */ @@ -275,19 +262,10 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void __elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); -static inline void elevator_exit(struct request_queue *q, - struct elevator_queue *e) -{ - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_sched_free_rqs(q); - __elevator_exit(q, e); -} - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, @@ -347,26 +325,10 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; + return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; } -static inline void blk_account_io_done(struct request *req, u64 now) -{ - /* - * Account IO completion. flush_rq isn't accounted as a - * normal IO on queueing nor completion. Accounting the - * containing request is enough. - */ - if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) - __blk_account_io_done(req, now); -} - -static inline void blk_account_io_start(struct request *req) -{ - if (blk_do_io_stat(req)) - __blk_account_io_start(req); -} +void update_io_ticks(struct block_device *part, unsigned long now, bool end); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { @@ -402,13 +364,15 @@ static inline unsigned int bio_aligned_discard_max_sectors( /* * Internal io_context interface */ -void get_io_context(struct io_context *ioc); -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); +struct io_cq *ioc_find_get_icq(struct request_queue *q); +struct io_cq *ioc_lookup_icq(struct request_queue *q); +#ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); - -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); +#else +static inline void ioc_clear_queue(struct request_queue *q) +{ +} +#endif /* CONFIG_BLK_ICQ */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); @@ -467,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -struct request_queue *blk_alloc_queue(int node_id); +static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) +{ + if (srcu) + return blk_requestq_srcu_cachep; + return blk_requestq_cachep; +} +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); + +int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); @@ -493,4 +465,45 @@ int disk_register_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *new_iars); void disk_unregister_independent_access_ranges(struct gendisk *disk); +#ifdef CONFIG_FAIL_MAKE_REQUEST 
+bool should_fail_request(struct block_device *part, unsigned int bytes); +#else /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool should_fail_request(struct block_device *part, + unsigned int bytes) +{ + return false; +} +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + +/* + * Optimized request reference counting. Ideally we'd make timeouts be more + * clever, as that's the only reason we need references at all... But until + * this happens, this is faster than using refcount_t. Also see: + * + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct request *req) +{ + return atomic_inc_not_zero(&req->ref); +} + +static inline bool req_ref_put_and_test(struct request *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->ref); +} + +static inline void req_ref_set(struct request *req, int value) +{ + atomic_set(&req->ref, value); +} + +static inline int req_ref_read(struct request *req) +{ + return atomic_read(&req->ref); +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 10aa378702fa..acfe1357bf6c 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -92,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_unmap_bidi_rq; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for diff --git a/block/elevator.c b/block/elevator.c index 19a78d5516ba..ec98aed39c4f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -595,7 +597,8 @@ int elevator_switch_mq(struct request_queue *q, elv_unregister_queue(q); ioc_clear_queue(q); - elevator_exit(q, q->elevator); + blk_mq_sched_free_rqs(q); + elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); @@ -605,7 +608,8 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { - elevator_exit(q, q->elevator); + blk_mq_sched_free_rqs(q); + elevator_exit(q); goto out; } } diff --git a/block/fops.c b/block/fops.c index 0da147edbd18..26bf15c770d2 100644 --- a/block/fops.c +++ b/block/fops.c @@ -566,21 +566,48 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); + size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; - ssize_t ret; + ssize_t ret = 0; - if (unlikely(pos + iov_iter_count(to) > size)) { + if (unlikely(pos + count > size)) { if (pos >= size) return 0; size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; + if (count > size) { + shorted = count - size; iov_iter_truncate(to, size); } } - ret = generic_file_read_iter(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return 
-EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + } + + file_accessed(iocb->ki_filp); + + ret = blkdev_direct_IO(iocb, to); + if (ret >= 0) { + iocb->ki_pos += ret; + count -= ret; + } + if (ret < 0 || !count) + return ret; + } + + ret = filemap_read(iocb, to, ret); if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); diff --git a/block/genhd.c b/block/genhd.c index 30362aeacac4..626c8406f21a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,8 +25,10 @@ #include #include #include +#include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" static struct kobject *block_depr; @@ -372,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -static void disk_scan_partitions(struct gendisk *disk) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; + if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) + return -EINVAL; + if (disk->open_partitions) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); + bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + blkdev_put(bdev, mode); + return 0; } /** @@ -425,6 +431,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } + if (disk->first_minor + disk->minors > MINORMASK + 1) + return -EINVAL; } else { if (WARN_ON(disk->minors)) return -EINVAL; @@ -434,13 +442,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; - disk->flags |= GENHD_FL_EXT_DEVT; } - ret = disk_alloc_events(disk); - if (ret) - goto out_free_ext_minor; - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -451,7 +454,12 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) - goto out_disk_release_events; + goto out_free_ext_minor; + + ret = disk_alloc_events(disk); + if (ret) + goto out_device_del; + if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); @@ -490,14 +498,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (ret) goto out_put_slave_dir; - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. 
- */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { + if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) @@ -509,7 +510,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); - disk_scan_partitions(disk); + if (get_capacity(disk)) + disk_scan_partitions(disk, FMODE_READ); /* * Announce the disk and partitions after all partitions are @@ -539,8 +541,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, sysfs_remove_link(block_depr, dev_name(ddev)); out_device_del: device_del(ddev); -out_disk_release_events: - disk_release_events(disk); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); @@ -720,8 +720,7 @@ void __init printk_all_partitions(void) * Don't show empty devices or things that have been * suppressed */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) continue; /* @@ -814,11 +813,7 @@ static int show_partition(struct seq_file *seqf, void *v) struct block_device *part; unsigned long idx; - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && - (sgp->flags & GENHD_FL_REMOVABLE))) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); @@ -874,7 +869,8 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk)); + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, @@ -1343,7 +1339,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); + q = blk_alloc_queue(node, false); if (!q) return NULL; diff --git a/block/ioctl.c b/block/ioctl.c index 0a1d10ac2e1a..4a86340133e4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) -{ - struct block_device *tmp; - - if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (bdev->bd_disk->open_partitions) - return -EBUSY; - - /* - * Reopen the device to revalidate the driver state and force a - * partition rescan. 
- */ - mode &= ~FMODE_EXCL; - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - - tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - blkdev_put(tmp, mode); - return 0; -} - static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) { @@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev, mode); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev_is_partition(bdev)) + return -EINVAL; + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/block/ioprio.c b/block/ioprio.c index 6f01d35a5145..2fe068fcaad5 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -22,46 +22,14 @@ */ #include #include -#include #include #include #include #include -#include -#include #include #include #include -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index fdd74a4df56f..70ff2a599ef6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e) int i; del_timer_sync(&kqd->timer); + blk_stat_disable_accounting(kqd->q); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); diff --git a/block/partitions/core.c b/block/partitions/core.c index 334b72ef1d73..c2a1635922b1 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -98,13 +98,12 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; - int nr; + int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - nr = disk_max_parts(hd); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); @@ -326,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, lockdep_assert_held(&disk->open_mutex); - if (partno >= disk_max_parts(disk)) + if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* @@ -527,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, static bool disk_unlock_native_capacity(struct gendisk *disk) { - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { + if (!disk->fops->unlock_native_capacity || + test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT 
"truncated\n"); return false; } + + printk(KERN_CONT "enabling native capacity\n"); + disk->fops->unlock_native_capacity(disk); + return true; } void blk_drop_partitions(struct gendisk *disk) @@ -607,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return 0; state = check_partition(disk); @@ -690,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) * userspace for this particular setup. */ if (invalidate) { - if (disk_part_scan_enabled(disk) || + if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index bf5c124c5452..5a566f2fd533 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1505,7 +1505,7 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct amiga_floppy_struct *floppy = rq->rq_disk->private_data; + struct amiga_floppy_struct *floppy = rq->q->disk->private_data; blk_status_t err; if (!spin_trylock_irq(&amiflop_lock)) @@ -1790,6 +1790,7 @@ static int fd_alloc_disk(int drive, int system) disk->first_minor = drive + system; disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (system) sprintf(disk->disk_name, "fd%d_msdos", drive); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index bf769e6e32fe..5d819a466e2f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1502,7 +1502,7 @@ static void setup_req_params( int drive ) static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct atari_floppy_struct *floppy = bd->rq->rq_disk->private_data; + struct atari_floppy_struct *floppy = bd->rq->q->disk->private_data; int drive = floppy - unit; int type = floppy->type; @@ -1538,7 +1538,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!UDT) { Probing = 1; UDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 1; } } @@ -1558,7 +1558,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 0; } @@ -2000,6 +2000,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) disk->minors = 1; sprintf(disk->disk_name, "fd%d", drive); disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = &unit[drive]; set_capacity(disk, MAX_DISK_SIZE * 2); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a896ee175d86..8fe2e4289dae 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -405,7 +405,6 @@ static int brd_alloc(int i) disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; - disk->flags = GENHD_FL_EXT_DEVT; strlcpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 53ba2dddba6e..07b3c6093e7d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2734,6 +2734,7 @@ enum drbd_ret_code drbd_create_device(struct 
drbd_config_context *adm_ctx, unsig disk->first_minor = minor; disk->minors = 1; disk->fops = &drbd_ops; + disk->flags |= GENHD_FL_NO_PART; sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c4267da716fe..0c638de25023 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2259,7 +2259,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; - unsigned int drive = (unsigned long)req->rq_disk->private_data; + unsigned int drive = (unsigned long)req->q->disk->private_data; /* current_count_sectors can be zero if transfer failed */ if (error) @@ -2550,7 +2550,7 @@ static int make_raw_rw_request(void) if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n")) return 0; - set_fdc((long)current_req->rq_disk->private_data); + set_fdc((long)current_req->q->disk->private_data); raw_cmd = &default_raw_cmd; raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; @@ -2792,7 +2792,7 @@ static void redo_fd_request(void) return; } } - drive = (long)current_req->rq_disk->private_data; + drive = (long)current_req->q->disk->private_data; set_fdc(drive); reschedule_timeout(current_drive, "redo fd request"); @@ -4503,6 +4503,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) disk->first_minor = TOMINOR(drive) | (type << 2); disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (type) sprintf(disk->disk_name, "fd%d_type%d", drive, type); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c3a36cfaa855..e98ddf08d77d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1061,7 +1061,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART; loop_global_unlock(lo, is_loop); if (partscan) @@ -1191,7 +1191,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) mutex_lock(&lo->lo_mutex); lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags |= GENHD_FL_NO_PART; lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); @@ -1301,7 +1301,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART; partscan = true; } out_unlock: @@ -2032,8 +2032,7 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided. 
*/ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART_SCAN; - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_NO_PART; atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c91b9010c1a6..30f471021a40 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(NULL, rq, true); + blk_execute_rq(rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 78282f01f581..4db9a8c244af 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -136,7 +136,7 @@ static int __init n64cart_probe(struct platform_device *pdev) goto out; disk->first_minor = 0; - disk->flags = GENHD_FL_NO_PART_SCAN; + disk->flags = GENHD_FL_NO_PART; disk->fops = &n64cart_fops; disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 323af5c9c802..54f7d490f8eb 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1850,7 +1850,6 @@ static int null_gendisk_register(struct nullb *nullb) set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->minors = 1; diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index ce3b430e88c5..86d6c12c603c 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -44,7 +44,7 @@ TRACE_EVENT(nullb_zone_op, __entry->op = req_op(cmd->rq); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + __assign_disk_name(__entry->disk, cmd->rq->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index f6b1d63e96e1..f462ad67931a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -690,7 +690,7 @@ static void pcd_request(void) if (!pcd_req && !set_next_request()) return; - cd = pcd_req->rq_disk->private_data; + cd = pcd_req->q->disk->private_data; if (cd != pcd_current) pcd_bufblk = -1; pcd_current = cd; @@ -928,8 +928,9 @@ static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, cd->name); /* umm... 
*/ disk->fops = &pcd_bdops; - disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay, pcd_buffer, PI_PCD, verbose, cd->name)) { diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fba865058a17..3637c38c72f9 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -430,7 +430,7 @@ static void run_fsm(void) int stop = 0; if (!phase) { - pd_current = pd_req->rq_disk->private_data; + pd_current = pd_req->q->disk->private_data; pi_current = pd_current->pi; phase = do_pd_io_start; } @@ -492,7 +492,7 @@ static enum action do_pd_io_start(void) case REQ_OP_WRITE: pd_block = blk_rq_pos(pd_req); pd_count = blk_rq_cur_sectors(pd_req); - if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) + if (pd_block + pd_count > get_capacity(pd_req->q->disk)) return Fail; pd_run = blk_rq_sectors(pd_req); pd_buf = bio_data(pd_req->bio); @@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd, rq, 0); + blk_execute_rq(rq, false); blk_mq_free_request(rq); return 0; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bf8d0ef41a0a..292e9a4ce1b9 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -746,12 +746,12 @@ static void pf_request(void) if (!pf_req && !set_next_request()) return; - pf_current = pf_req->rq_disk->private_data; + pf_current = pf_req->q->disk->private_data; pf_block = blk_rq_pos(pf_req); pf_run = blk_rq_sectors(pf_req); pf_count = blk_rq_cur_sectors(pf_req); - if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { + if (pf_block + pf_count > get_capacity(pf_req->q->disk)) { pf_end_request(BLK_STS_IOERR); goto repeat; } @@ -942,6 +942,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = pf; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b53f648302c1..887c98d61684 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(pd->bdev->bd_disk, rq, 0); + blk_execute_rq(rq, false); if (scsi_req(rq)->result) ret = -EIO; out: @@ -2719,7 +2719,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->first_minor = idx; disk->minors = 1; disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE; + disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; strcpy(disk->disk_name, pd->name); disk->private_data = pd; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c1876646a4cb..4f90819e245e 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -742,6 +742,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->gendisk = gendisk; gendisk->major = ps3vram_major; gendisk->minors = 1; + gendisk->flags |= GENHD_FL_NO_PART; gendisk->fops = &ps3vram_fops; gendisk->private_data = dev; strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..8f140da1efe3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4924,12 
+4924,10 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = rbd_dev->minor; - if (single_major) { + if (single_major) disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT); - disk->flags |= GENHD_FL_EXT_DEVT; - } else { + else disk->minors = RBD_MINORS_PER_MAJOR; - } disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..67a8edbaa1fd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -393,7 +393,7 @@ static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) static void rnbd_softirq_done_fn(struct request *rq) { - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_clt_session *sess = dev->sess; struct rnbd_iu *iu; @@ -1133,7 +1133,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); int err; blk_status_t ret = BLK_STS_IOERR; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 6f45a53f7cbf..146d85d80e0e 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -143,8 +143,8 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int vdc_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct vdc_port *port = bdev->bd_disk->private_data; int i; - struct gendisk *disk; switch (command) { case CDROMMULTISESSION: @@ -155,12 +155,15 @@ static int vdc_ioctl(struct block_device *bdev, fmode_t mode, return 0; case CDROM_GET_CAPABILITY: - disk = bdev->bd_disk; - - if (bdev->bd_disk && (disk->flags & GENHD_FL_CD)) + if (!vdc_version_supported(port, 1, 1)) + return -EINVAL; + switch (port->vdisk_mtype) { + case VD_MEDIA_TYPE_CD: + case VD_MEDIA_TYPE_DVD: return 0; - return -EINVAL; - + default: + return -EINVAL; + } default: pr_debug(PFX "ioctl %08x not supported\n", command); return -EINVAL; @@ -459,7 +462,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) static int __send_request(struct request *req) { - struct vdc_port *port = req->rq_disk->private_data; + struct vdc_port *port = req->q->disk->private_data; struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; struct scatterlist sg[MAX_RING_COOKIES]; struct vdc_req_entry *rqe; @@ -854,14 +857,12 @@ static int probe_disk(struct vdc_port *port) switch (port->vdisk_mtype) { case VD_MEDIA_TYPE_CD: pr_info(PFX "Virtual CDROM %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; case VD_MEDIA_TYPE_DVD: pr_info(PFX "Virtual DVD %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 821594cd1315..fef65a18d56f 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -840,6 +840,7 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->minors = 1; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; + swd->unit[drive].disk->flags |= GENHD_FL_NO_PART; swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; 
set_capacity(swd->unit[drive].disk, 2880); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 4b91c9aa5892..6c39f2c9f806 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1227,7 +1227,7 @@ static int swim3_attach(struct macio_dev *mdev, disk->fops = &floppy_fops; disk->private_data = fs; disk->events = DISK_EVENT_MEDIA_CHANGE; - disk->flags |= GENHD_FL_REMOVABLE; + disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; sprintf(disk->disk_name, "fd%d", floppy_count); set_capacity(disk, 2880); rc = add_disk(disk); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index d1676fe0da1a..b361583944b9 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -540,7 +540,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(rq, true, NULL); return 0; @@ -579,7 +579,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(rq, true, NULL); return 0; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ae38776e30e..c3dc3cd7a779 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -384,7 +384,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - blk_execute_rq(vblk->disk, req, false); + blk_execute_rq(req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_mq_free_request(req); @@ -843,7 +843,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->minors = 1 << PART_BITS; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - vblk->disk->flags |= GENHD_FL_EXT_DEVT; vblk->index = index; /* configure queue flush support */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 914587aabca0..62125fd4af4a 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -510,7 +510,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->size = vbd_sz(vbd); - if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + if (cdrom || disk_to_cdi(vbd->bdev->bd_disk)) vbd->type |= VDISK_CDROM; if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 286cf1afad78..ccd0dd0c6b83 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -198,6 +198,7 @@ struct blkfront_info struct gendisk *gd; u16 sector_size; unsigned int physical_sector_size; + unsigned long vdisk_info; int vdevice; blkif_vdev_t handle; enum blkif_state connected; @@ -505,6 +506,7 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) static int blkif_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct blkfront_info *info = bdev->bd_disk->private_data; int i; switch (command) { @@ -514,9 +516,9 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return -EFAULT; return 0; case CDROM_GET_CAPABILITY: - if (bdev->bd_disk->flags & GENHD_FL_CD) - return 0; - return -EINVAL; + if (!(info->vdisk_info & VDISK_CDROM)) + return -EINVAL; + return 0; default: return -EINVAL; } @@ -1057,9 +1059,8 @@ static char *encode_disk_name(char *ptr, 
unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, - u16 vdisk_info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -1157,15 +1158,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, xlvbd_flush(info); - if (vdisk_info & VDISK_READONLY) + if (info->vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); - - if (vdisk_info & VDISK_REMOVABLE) + if (info->vdisk_info & VDISK_REMOVABLE) gd->flags |= GENHD_FL_REMOVABLE; - if (vdisk_info & VDISK_CDROM) - gd->flags |= GENHD_FL_CD; - return 0; out_free_tag_set: @@ -2313,7 +2310,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long long sectors; unsigned long sector_size; unsigned int physical_sector_size; - unsigned int binfo; int err, i; struct blkfront_ring_info *rinfo; @@ -2351,7 +2347,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", &sectors, - "info", "%u", &binfo, + "info", "%u", &info->vdisk_info, "sector-size", "%lu", &sector_size, NULL); if (err) { @@ -2380,7 +2376,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + err = xlvbd_alloc_gendisk(sectors, info, sector_size, physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index ccc52c935faf..7a6ed83481b8 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -327,6 +327,7 @@ static int z2ram_register_disk(int minor) disk->major = Z2RAM_MAJOR; disk->first_minor = minor; disk->minors = 1; + disk->flags |= GENHD_FL_NO_PART; disk->fops = &z2_fops; if (minor) sprintf(disk->disk_name, "z2ram%d", minor); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25071126995b..f6da5293b913 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1947,6 +1947,7 @@ static int zram_add(void) zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->minors = 1; + zram->disk->flags |= GENHD_FL_NO_PART; zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d50cc1fd34d5..faead41709bc 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -719,6 +719,7 @@ static void probe_gdrom_setupdisk(void) gd.disk->major = gdrom_major; gd.disk->first_minor = 1; gd.disk->minors = 1; + gd.disk->flags |= GENHD_FL_NO_PART; strcpy(gd.disk->disk_name, GDROM_DEV_NAME); } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ef91bff5c23c..0080f0be72fe 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -425,7 +425,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) } #endif -static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, +static void qib_copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, u32 length, unsigned flush_wc) { u32 extra = 0; @@ -975,7 +975,7 @@ static int qib_verbs_send_pio(struct rvt_qp *qp, struct ib_header *ibhdr, qib_pio_copy(piobuf, addr, dwords); goto done; } - copy_io(piobuf, ss, len, flush_wc); + qib_copy_io(piobuf, ss, len, flush_wc); done: if (dd->flags & QIB_USE_SPCL_TRIG) { u32 spcl_off = (pbufn >=
dd->piobcnt2k) ? 2047 : 1023; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 90dc9cc48881..f4719b65e5e3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -550,7 +550,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REQUEUE; } clone->bio = clone->biotail = NULL; - clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; *__clone = clone; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 662742a310cb..280918cdcabd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1778,6 +1778,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk->major = _major; md->disk->first_minor = minor; md->disk->minors = 1; + md->disk->flags |= GENHD_FL_NO_PART; md->disk->fops = &dm_blk_dops; md->disk->queue = md->queue; md->disk->private_data = md; diff --git a/drivers/md/md.c b/drivers/md/md.c index 41d6e2383517..55f9d9caff31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5708,11 +5708,6 @@ static int md_alloc(dev_t dev, char *name) mddev->queue = disk->queue; blk_set_stacking_limits(&mddev->queue->limits); blk_queue_write_cache(mddev->queue, true, true); - /* Allow extended partitions. This makes the - * 'mdp' device redundant, but we can't really - * remove it now. - */ - disk->flags |= GENHD_FL_EXT_DEVT; disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 2483cfdd30ea..4e61b28a002f 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -264,7 +264,7 @@ static ssize_t power_ro_lock_store(struct device *dev, goto out_put; } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ret = req_to_mmc_queue_req(req)->drv_op_result; blk_mq_free_request(req); @@ -657,7 +657,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); blk_mq_free_request(req); @@ -726,7 +726,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, rpmb ? 
MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; /* copy to user if data and response */ @@ -1837,7 +1837,7 @@ static void mmc_blk_mq_rw_recovery(struct mmc_queue *mq, struct request *req) /* Reset if the card is in a bad state */ if (!mmc_host_is_spi(mq->card->host) && err && mmc_blk_reset(md, card->host, type)) { - pr_err("%s: recovery failed!\n", req->rq_disk->disk_name); + pr_err("%s: recovery failed!\n", req->q->disk->disk_name); mqrq->retries = MMC_NO_RETRIES; return; } @@ -2051,7 +2051,8 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) mmc_put_card(mq->card, &mq->ctx); } -static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req, + bool can_sleep) { struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req); struct mmc_request *mrq = &mqrq->brq.mrq; @@ -2063,10 +2064,14 @@ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) * Block layer timeouts race with completions which means the normal * completion path cannot be used during recovery. */ - if (mq->in_recovery) + if (mq->in_recovery) { mmc_blk_mq_complete_rq(mq, req); - else if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); + } else if (likely(!blk_should_fake_timeout(req->q))) { + if (can_sleep) + blk_mq_complete_request_direct(req, mmc_blk_mq_complete); + else + blk_mq_complete_request(req); + } mmc_blk_mq_dec_in_flight(mq, req); } @@ -2087,7 +2092,7 @@ void mmc_blk_mq_recovery(struct mmc_queue *mq) mmc_blk_urgent_bkops(mq, mqrq); - mmc_blk_mq_post_req(mq, req); + mmc_blk_mq_post_req(mq, req, true); } static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, @@ -2106,7 +2111,7 @@ static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, if (prev_req) *prev_req = mq->complete_req; else - mmc_blk_mq_post_req(mq, mq->complete_req); + mmc_blk_mq_post_req(mq, mq->complete_req, true); mq->complete_req = NULL; @@ -2178,7 +2183,8 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) mq->rw_wait = false; wake_up(&mq->wait); - mmc_blk_mq_post_req(mq, req); + /* context unknown */ + mmc_blk_mq_post_req(mq, req, false); } static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) @@ -2238,7 +2244,7 @@ static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq, err = mmc_start_request(host, &mqrq->brq.mrq); if (prev_req) - mmc_blk_mq_post_req(mq, prev_req); + mmc_blk_mq_post_req(mq, prev_req, true); if (err) mq->rw_wait = false; @@ -2395,10 +2401,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, md->disk->private_data = md; md->parent = parent; set_disk_ro(md->disk, md->read_only || default_ro); - md->disk->flags = GENHD_FL_EXT_DEVT; if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) - md->disk->flags |= GENHD_FL_NO_PART_SCAN - | GENHD_FL_SUPPRESS_PARTITION_INFO; + md->disk->flags |= GENHD_FL_NO_PART; /* * As discussed on lkml, GENHD_FL_REMOVABLE should: @@ -2739,7 +2743,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ret = req_to_mmc_queue_req(req)->drv_op_result; if (ret >= 0) { *val = ret; @@ -2778,7 +2782,7 @@ static int 
mmc_ext_csd_open(struct inode *inode, struct file *filp) } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD; req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); err = req_to_mmc_queue_req(req)->drv_op_result; blk_mq_free_request(req); if (err) { diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 4eaba6f4ec68..243f28a3206b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -46,23 +46,19 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) { + struct req_iterator iter; + struct bio_vec bvec; unsigned long block, nsect; char *buf; block = blk_rq_pos(req) << 9 >> tr->blkshift; nsect = blk_rq_cur_bytes(req) >> tr->blkshift; - if (req_op(req) == REQ_OP_FLUSH) { + switch (req_op(req)) { + case REQ_OP_FLUSH: if (tr->flush(dev)) return BLK_STS_IOERR; return BLK_STS_OK; - } - - if (blk_rq_pos(req) + blk_rq_cur_sectors(req) > - get_capacity(req->rq_disk)) - return BLK_STS_IOERR; - - switch (req_op(req)) { case REQ_OP_DISCARD: if (tr->discard(dev, block, nsect)) return BLK_STS_IOERR; @@ -76,13 +72,17 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, } } kunmap(bio_page(req->bio)); - rq_flush_dcache_pages(req); + + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); return BLK_STS_OK; case REQ_OP_WRITE: if (!tr->writesect) return BLK_STS_IOERR; - rq_flush_dcache_pages(req); + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); + buf = kmap(bio_page(req->bio)) + bio_offset(req->bio); for (; nsect > 0; nsect--, block++, buf += tr->blksize) { if (tr->writesect(dev, block, buf)) { @@ -346,7 +346,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) gd->minors = 1 << tr->part_bits; gd->fops = &mtd_block_ops; - if (tr->part_bits) + if (tr->part_bits) { if (new->devnum < 26) snprintf(gd->disk_name, sizeof(gd->disk_name), "%s%c", tr->name, 'a' + new->devnum); @@ -355,9 +355,11 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) "%s%c%c", tr->name, 'a' - 1 + new->devnum / 26, 'a' + new->devnum % 26); - else + } else { snprintf(gd->disk_name, sizeof(gd->disk_name), "%s%d", tr->name, new->devnum); + gd->flags |= GENHD_FL_NO_PART; + } set_capacity(gd, ((u64)new->size * tr->blksize) >> 9); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 062e6c2c45f5..a78fdf3b30f7 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -294,6 +294,8 @@ static void ubiblock_do_work(struct work_struct *work) int ret; struct ubiblock_pdu *pdu = container_of(work, struct ubiblock_pdu, work); struct request *req = blk_mq_rq_from_pdu(pdu); + struct req_iterator iter; + struct bio_vec bvec; blk_mq_start_request(req); @@ -305,7 +307,9 @@ static void ubiblock_do_work(struct work_struct *work) blk_rq_map_sg(req->q, req, pdu->usgl.sg); ret = ubiblock_read(pdu); - rq_flush_dcache_pages(req); + + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); blk_mq_end_request(req, errno_to_blk_status(ret)); } @@ -426,6 +430,7 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = -ENODEV; goto out_cleanup_disk; } + gd->flags |= GENHD_FL_NO_PART; gd->private_data = dev; sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); set_capacity(gd, disk_capacity); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1af8a4513708..290f26ed74c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1057,7 
+1057,7 @@ static int nvme_execute_rq(struct gendisk *disk, struct request *rq, { blk_status_t status; - status = blk_execute_rq(disk, rq, at_head); + status = blk_execute_rq(rq, at_head); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) return -EINTR; if (nvme_req(rq)->status) @@ -1284,7 +1284,7 @@ static void nvme_keep_alive_work(struct work_struct *work) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; - blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); + blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); } static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 1352159733b0..83d2e6860d38 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -56,7 +56,7 @@ void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject) void nvme_should_fail(struct request *req) { - struct gendisk *disk = req->rq_disk; + struct gendisk *disk = req->q->disk; struct nvme_fault_inject *fault_inject = NULL; u16 status; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ca2ee806d74b..d8585df2c2fd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -500,22 +500,13 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) nvmeq->last_sq_tail = nvmeq->sq_tail; } -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * @write_sq: whether to write to the SQ doorbell - */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, - bool write_sq) +static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) { - spin_lock(&nvmeq->sq_lock); memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), - cmd, sizeof(*cmd)); + absolute_pointer(cmd), sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - nvme_write_sq_db(nvmeq, write_sq); - spin_unlock(&nvmeq->sq_lock); } static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -912,24 +903,52 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, return BLK_STS_OK; } +static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret; + + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + + ret = nvme_setup_cmd(req->q->queuedata, req); + if (ret) + return ret; + + if (blk_rq_nr_phys_segments(req)) { + ret = nvme_map_data(dev, req, &iod->cmd); + if (ret) + goto out_free_cmd; + } + + if (blk_integrity_rq(req)) { + ret = nvme_map_metadata(dev, req, &iod->cmd); + if (ret) + goto out_unmap_data; + } + + blk_mq_start_request(req); + return BLK_STS_OK; +out_unmap_data: + nvme_unmap_data(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); + return ret; +} + /* * NOTE: ns is NULL when called on the admin queue. */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command *cmnd = &iod->cmd; blk_status_t ret; - iod->aborted = 0; - iod->npages = -1; - iod->nents = 0; - /* * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. 
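The hunk above folds the fallible setup steps (command setup, data mapping, metadata mapping) into nvme_prep_rq(), which undoes completed steps in reverse order on failure and keeps the submission-queue lock entirely out of the prep path; that separation is what lets the batched submission path added below reuse the same prep code. A minimal standalone sketch of the goto-unwind idiom used here, with hypothetical setup/undo steps rather than the kernel API:

#include <stdio.h>

/* Hypothetical setup steps; setup_map_meta() pretends to fail so the
 * unwind path runs. This mirrors the shape of nvme_prep_rq(): each
 * fallible step jumps to a label that undoes the earlier steps, in
 * reverse order. */
static int setup_cmd(void)      { return 0; }
static void undo_cmd(void)      { puts("cleanup: command"); }
static int setup_map_data(void) { return 0; }
static void undo_map_data(void) { puts("cleanup: data mapping"); }
static int setup_map_meta(void) { return -1; }

static int prep_sketch(void)
{
	int ret;

	ret = setup_cmd();
	if (ret)
		return ret;
	ret = setup_map_data();
	if (ret)
		goto out_undo_cmd;
	ret = setup_map_meta();
	if (ret)
		goto out_undo_data;
	return 0;	/* fully prepped, nothing to undo */

out_undo_data:
	undo_map_data();
out_undo_cmd:
	undo_cmd();
	return ret;
}

int main(void)
{
	printf("prep_sketch() = %d\n", prep_sketch());	/* -1, after both cleanups */
	return 0;
}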
@@ -937,33 +956,75 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - if (!nvme_check_ready(&dev->ctrl, req, true)) + if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req); - ret = nvme_setup_cmd(ns, req); - if (ret) + ret = nvme_prep_rq(dev, req); + if (unlikely(ret)) return ret; - - if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, cmnd); - if (ret) - goto out_free_cmd; - } - - if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, cmnd); - if (ret) - goto out_unmap_data; - } - - blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, cmnd, bd->last); + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + nvme_write_sq_db(nvmeq, bd->last); + spin_unlock(&nvmeq->sq_lock); return BLK_STS_OK; -out_unmap_data: - nvme_unmap_data(dev, req); -out_free_cmd: - nvme_cleanup_cmd(req); - return ret; +} + +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) +{ + spin_lock(&nvmeq->sq_lock); + while (!rq_list_empty(*rqlist)) { + struct request *req = rq_list_pop(rqlist); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + } + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); +} + +static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) +{ + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. + */ + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) + return false; + if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) + return false; + + req->mq_hctx->tags->rqs[req->tag] = req; + return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; +} + +static void nvme_queue_rqs(struct request **rqlist) +{ + struct request *req, *next, *prev = NULL; + struct request *requeue_list = NULL; + + rq_list_for_each_safe(rqlist, req, next) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + + if (!nvme_prep_rq_batch(nvmeq, req)) { + /* detach 'req' and add to remainder list */ + rq_list_move(rqlist, &requeue_list, req, prev); + + req = prev; + if (!req) + continue; + } + + if (!next || req->mq_hctx != next->mq_hctx) { + /* detach rest of list, and submit */ + req->rq_next = NULL; + nvme_submit_cmds(nvmeq, rqlist); + *rqlist = next; + prev = NULL; + } else + prev = req; + } + + *rqlist = requeue_list; } static __always_inline void nvme_pci_unmap_rq(struct request *req) @@ -1140,7 +1201,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c, true); + + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &c); + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1371,7 +1436,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } abort_req->end_io_data = NULL; - blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); + blk_execute_rq_nowait(abort_req, false, abort_endio); /* * The aborted req will be completed on receiving the abort req. 
@@ -1663,6 +1728,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { static const struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .queue_rqs = nvme_queue_rqs, .complete = nvme_pci_complete_rq, .commit_rqs = nvme_commit_rqs, .init_hctx = nvme_init_hctx, @@ -2416,9 +2482,8 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(NULL, req, false, - opcode == nvme_admin_delete_cq ? - nvme_del_cq_end : nvme_del_queue_end); + blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ? + nvme_del_cq_end : nvme_del_queue_end); return 0; } diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 35bac7a25422..b5f85259461a 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -68,7 +68,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = !!blk_integrity_rq(req); __entry->fctype = cmd->fabrics.fctype; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); memcpy(__entry->cdw10, &cmd->common.cdw10, sizeof(__entry->cdw10)); ), @@ -103,7 +103,7 @@ TRACE_EVENT(nvme_complete_rq, __entry->retries = nvme_req(req)->retries; __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", __entry->ctrl_id, __print_disk_name(__entry->disk), @@ -153,7 +153,7 @@ TRACE_EVENT(nvme_sq, ), TP_fast_assign( __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); __entry->qid = nvme_req_qid(req); __entry->sq_head = le16_to_cpu(sq_head); __entry->sq_tail = sq_tail; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f0efb3537989..9e5b89ae29df 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -284,8 +284,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) schedule_work(&req->p.work); } else { rq->end_io_data = req; - blk_execute_rq_nowait(ns ? 
ns->disk : NULL, rq, 0, - nvmet_passthru_req_done); + blk_execute_rq_nowait(rq, false, nvmet_passthru_req_done); } if (ns) diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index 27012908b586..6fa300daa31e 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -877,7 +877,7 @@ static long ch_ioctl(struct file *file, } default: - return scsi_ioctl(ch->device, NULL, file->f_mode, cmd, argp); + return scsi_ioctl(ch->device, file->f_mode, cmd, argp); } } diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 081b84bb7985..b7a464383cc0 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -60,7 +60,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_free_cmd; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * fill in all the output members diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 2371edbc3af4..3eae2392ef15 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2040,7 +2040,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->timeout = 10 * HZ; rq->retries = 5; - blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done); + blk_execute_rq_nowait(req, true, eh_lock_door_done); } /** diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 400df3354cd6..e13fd380deb6 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -408,8 +408,7 @@ static int scsi_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, return ret; } -static int sg_io(struct scsi_device *sdev, struct gendisk *disk, - struct sg_io_hdr *hdr, fmode_t mode) +static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; ssize_t ret = 0; @@ -483,7 +482,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, start_time = jiffies; - blk_execute_rq(disk, rq, at_head); + blk_execute_rq(rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -499,19 +498,12 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * @q: request queue to send scsi commands down - * @disk: gendisk to operate on (option) * @mode: mode used to open the file through which the ioctl has been * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below - * the request queue @q. If @file is non-NULL it's used to perform - * fine-grained permission checks that allow users to send down - * non-destructive SCSI commands. If the caller has a struct gendisk - * available it should be passed in as @disk to allow the low level - * driver to use the information contained in it. A non-NULL @disk - * is only allowed if the caller knows that the low level driver doesn't - * need it (e.g. in the scsi subsystem). + * the request queue @q. * * Notes: * - This interface is deprecated - users should use the SG_IO @@ -530,8 +522,8 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. 
*/ -static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, - fmode_t mode, struct scsi_ioctl_command __user *sic) +static int sg_scsi_ioctl(struct request_queue *q, fmode_t mode, + struct scsi_ioctl_command __user *sic) { enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ struct request *rq; @@ -620,7 +612,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, goto error; } - blk_execute_rq(disk, rq, 0); + blk_execute_rq(rq, false); err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { @@ -806,8 +798,8 @@ static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, return 0; } -static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk, - fmode_t mode, void __user *arg) +static int scsi_cdrom_send_packet(struct scsi_device *sdev, fmode_t mode, + void __user *arg) { struct cdrom_generic_command cgc; struct sg_io_hdr hdr; @@ -847,7 +839,7 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk hdr.cmdp = ((struct cdrom_generic_command __user *) arg)->cmd; hdr.cmd_len = sizeof(cgc.cmd); - err = sg_io(sdev, disk, &hdr, mode); + err = sg_io(sdev, &hdr, mode); if (err == -EFAULT) return -EFAULT; @@ -862,8 +854,8 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk return err; } -static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, - fmode_t mode, void __user *argp) +static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, + void __user *argp) { struct sg_io_hdr hdr; int error; @@ -871,7 +863,7 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, error = get_sg_io_hdr(&hdr, argp); if (error) return error; - error = sg_io(sdev, disk, &hdr, mode); + error = sg_io(sdev, &hdr, mode); if (error == -EFAULT) return error; if (put_sg_io_hdr(&hdr, argp)) @@ -882,7 +874,6 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, /** * scsi_ioctl - Dispatch ioctl to scsi device * @sdev: scsi device receiving ioctl - * @disk: disk receiving the ioctl * @mode: mode the block/char device is opened with * @cmd: which ioctl is it * @arg: data associated with ioctl @@ -891,8 +882,8 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, * does not take a major/minor number as the dev field. Rather, it takes * a pointer to a &struct scsi_device. 
*/ -int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, - int cmd, void __user *arg) +int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, + void __user *arg) { struct request_queue *q = sdev->request_queue; struct scsi_sense_hdr sense_hdr; @@ -927,11 +918,11 @@ int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, case SG_EMULATED_HOST: return sg_emulated_host(q, arg); case SG_IO: - return scsi_ioctl_sg_io(sdev, disk, mode, arg); + return scsi_ioctl_sg_io(sdev, mode, arg); case SCSI_IOCTL_SEND_COMMAND: - return sg_scsi_ioctl(q, disk, mode, arg); + return sg_scsi_ioctl(q, mode, arg); case CDROM_SEND_PACKET: - return scsi_cdrom_send_packet(sdev, disk, mode, arg); + return scsi_cdrom_send_packet(sdev, mode, arg); case CDROMCLOSETRAY: return scsi_send_start_stop(sdev, 3); case CDROMEJECT: diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 621d841d819a..35e381f6d371 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -241,7 +241,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, /* * head injection *required* here otherwise quiesce won't work */ - blk_execute_rq(NULL, req, 1); + blk_execute_rq(req, true); /* * Some devices (USB mass-storage in particular) may transfer @@ -543,8 +543,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_update_request(req, error, bytes)) return true; + // XXX: if (blk_queue_add_random(q)) - add_disk_randomness(req->rq_disk); + add_disk_randomness(req->q->disk); if (!blk_rq_is_passthrough(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); @@ -617,6 +618,46 @@ static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result) } } +/** + * scsi_rq_err_bytes - determine the number of bytes until the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be a merge of I/Os which require different failure + * handling. This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into an area which needs to be retried further. + * + * Return: + * The number of bytes to fail. + */ +static unsigned int scsi_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->rq_flags & RQF_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different failfast types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_opf & ff) != ff) + break; + bytes += bio->bi_iter.bi_size; + } + + /* this could lead to an infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} + /* Helper for scsi_io_completion() when "reprep" action required.
/* Helper for scsi_io_completion() when "reprep" action required. */ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, struct request_queue *q) @@ -794,7 +835,7 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) scsi_print_command(cmd); } } - if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req))) + if (!scsi_end_request(req, blk_stat, scsi_rq_err_bytes(req))) return; fallthrough; case ACTION_REPREP: diff --git a/drivers/scsi/scsi_logging.c b/drivers/scsi/scsi_logging.c index ed9572252a42..1f8f80b2dbfc 100644 --- a/drivers/scsi/scsi_logging.c +++ b/drivers/scsi/scsi_logging.c @@ -30,7 +30,9 @@ static inline const char *scmd_name(const struct scsi_cmnd *scmd) { struct request *rq = scsi_cmd_to_rq((struct scsi_cmnd *)scmd); - return rq->rq_disk ? rq->rq_disk->disk_name : NULL; + if (!rq->q->disk) + return NULL; + return rq->q->disk->disk_name; } static size_t sdev_format_header(char *logbuf, size_t logbuf_len, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65875a598d62..5ddb8e053a8e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -872,7 +872,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; @@ -908,7 +908,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -940,7 +940,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -971,7 +971,7 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1068,7 +1068,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); struct bio *bio = rq->bio; u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1116,7 +1116,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); @@ -1215,7 +1215,7 @@ static blk_status_t
sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1236,7 +1236,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) goto fail; } - if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk)) { + if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } @@ -1331,7 +1331,7 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) switch (req_op(rq)) { case REQ_OP_DISCARD: - switch (scsi_disk(rq->rq_disk)->provisioning_mode) { + switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: @@ -1574,7 +1574,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode, if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); - return scsi_ioctl(sdp, disk, mode, cmd, p); + return scsi_ioctl(sdp, mode, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) @@ -1917,7 +1917,7 @@ static const struct block_device_operations sd_fops = { **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; @@ -1937,7 +1937,7 @@ static void sd_eh_reset(struct scsi_cmnd *scmd) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || @@ -2034,7 +2034,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); - struct scsi_disk *sdkp = scsi_disk(req->rq_disk); + struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; @@ -3566,7 +3566,6 @@ static int sd_probe(struct device *dev) sd_revalidate_disk(gd); - gd->flags = GENHD_FL_EXT_DEVT; if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ed06798983f8..65bfd1e170da 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -244,7 +244,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t sector = blk_rq_pos(rq); if (!sd_is_zoned(sdkp)) @@ -322,7 +322,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, unsigned int nr_blocks) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int wp_offset, zno = blk_rq_zone_no(rq); unsigned long flags; blk_status_t ret; @@ -388,7 +388,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, { struct request *rq = scsi_cmd_to_rq(cmd); sector_t sector = blk_rq_pos(rq); - 
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t block = sectors_to_logical(sdkp->device, sector); blk_status_t ret; @@ -443,7 +443,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, { int result = cmd->result; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int zno = blk_rq_zone_no(rq); enum req_opf op = req_op(rq); unsigned long flags; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 141099ab9092..ad12b3261845 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -833,7 +833,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(NULL, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(srp->rq, at_head, sg_rq_end_io); return 0; } @@ -1109,7 +1109,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, case SCSI_IOCTL_SEND_COMMAND: if (atomic_read(&sdp->detaching)) return -ENODEV; - return scsi_ioctl(sdp->device, NULL, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); case SG_SET_DEBUG: result = get_user(val, ip); if (result) @@ -1165,7 +1165,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) ret = sg_ioctl_common(filp, sdp, sfp, cmd_in, p); if (ret != -ENOIOCTLCMD) return ret; - return scsi_ioctl(sdp->device, NULL, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); } static __poll_t diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 8e4af111c078..14c122839c40 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -335,7 +335,7 @@ static int sr_done(struct scsi_cmnd *SCpnt) int block_sectors = 0; long error_sector; struct request *rq = scsi_cmd_to_rq(SCpnt); - struct scsi_cd *cd = scsi_cd(rq->rq_disk); + struct scsi_cd *cd = scsi_cd(rq->q->disk); #ifdef DEBUG scmd_printk(KERN_INFO, SCpnt, "done: %x\n", result); @@ -402,7 +402,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) ret = scsi_alloc_sgtables(SCpnt); if (ret != BLK_STS_OK) return ret; - cd = scsi_cd(rq->rq_disk); + cd = scsi_cd(rq->q->disk); SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); @@ -561,8 +561,7 @@ static void sr_block_release(struct gendisk *disk, fmode_t mode) static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; - struct scsi_cd *cd = scsi_cd(disk); + struct scsi_cd *cd = scsi_cd(bdev->bd_disk); struct scsi_device *sdev = cd->device; void __user *argp = (void __user *)arg; int ret; @@ -584,7 +583,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret != -ENOSYS) goto put; } - ret = scsi_ioctl(sdev, disk, mode, cmd, argp); + ret = scsi_ioctl(sdev, mode, cmd, argp); put: scsi_autopm_put_device(sdev); @@ -684,9 +683,10 @@ static int sr_probe(struct device *dev) disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; - disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; - disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; + disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT | + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; 
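For drivers being converted, the flag juggling above collapses into a single setup at probe time. A minimal sketch of the pattern, assuming a hypothetical driver with an already-allocated gendisk; mydrv_init_disk() is invented for illustration:

#include <linux/genhd.h>

static void mydrv_init_disk(struct gendisk *disk)
{
	/* Removable, CD-ROM-like media with partition support disabled. */
	disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
	disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
	/* Poll for events, forward them to udev, and pause polling while
	 * an exclusive writer (e.g. a burning application) holds the disk. */
	disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT |
			    DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE;
}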
blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); @@ -725,7 +725,6 @@ static int sr_probe(struct device *dev) blk_pm_runtime_init(sdev->request_queue, dev); dev_set_drvdata(dev, cd); - disk->flags |= GENHD_FL_REMOVABLE; sr_revalidate_disk(cd); error = device_add_disk(&sdev->sdev_gendev, disk, NULL); @@ -994,7 +993,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - blk_execute_rq(disk, rq, 0); + blk_execute_rq(rq, false); if (scsi_req(rq)->result) { struct scsi_sense_hdr sshdr; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index c2d5608f6b1a..e869e90e05af 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -581,7 +581,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, rq->retries = retries; req->end_io_data = SRpnt; - blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end); + blk_execute_rq_nowait(req, true, st_scsi_execute_end); return 0; } @@ -3829,7 +3829,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) break; } - retval = scsi_ioctl(STp->device, NULL, file->f_mode, cmd_in, p); + retval = scsi_ioctl(STp->device, file->f_mode, cmd_in, p); if (!retval && cmd_in == SCSI_IOCTL_STOP_UNIT) { /* unload */ STp->rew_at_close = 0; diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index ded5ba9b1466..13cd21204bf9 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -677,7 +677,7 @@ static void ufshpb_execute_umap_req(struct ufshpb_lu *hpb, ufshpb_set_unmap_cmd(rq->cmd, rgn); rq->cmd_len = HPB_WRITE_BUFFER_CMD_LENGTH; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_umap_req_compl_fn); + blk_execute_rq_nowait(req, true, ufshpb_umap_req_compl_fn); hpb->stats.umap_req_cnt++; } @@ -719,7 +719,7 @@ static int ufshpb_execute_map_req(struct ufshpb_lu *hpb, map_req->rb.srgn_idx, mem_size); rq->cmd_len = HPB_READ_BUFFER_CMD_LENGTH; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_map_req_compl_fn); + blk_execute_rq_nowait(req, true, ufshpb_map_req_compl_fn); hpb->stats.map_req_cnt++; return 0; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 28e1d98ae102..65c642b24ecf 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -528,7 +528,7 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, if (!rq || !scsi_prot_sg_count(sc)) return; - bi = blk_get_integrity(rq->rq_disk); + bi = blk_get_integrity(rq->q->disk); if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7fa57fb57bf2..807d06ecadee 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1005,7 +1005,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_OTHER; scsi_req(req)->retries = PS_RETRY; - blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), + blk_execute_rq_nowait(req, cmd->sam_task_attr == TCM_HEAD_TAG, pscsi_req_done); return 0; diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 4c5a0a49035f..1928b3918242 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -551,7 +551,7 @@ static void last_sector_hacks(struct us_data *us, struct scsi_cmnd *srb) /* Did this command access the last sector? 
 */ sector = (srb->cmnd[2] << 24) | (srb->cmnd[3] << 16) | (srb->cmnd[4] << 8) | (srb->cmnd[5]); - disk = scsi_cmd_to_rq(srb)->rq_disk; + disk = scsi_cmd_to_rq(srb)->q->disk; if (!disk) goto done; sdkp = scsi_disk(disk); diff --git a/fs/io_uring.c b/fs/io_uring.c index a4f20b8c74a4..de9c9de90655 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b4dc51063d36..03ea367df19a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/bio.h b/include/linux/bio.h index fe6bdfbbef66..0a41efe02208 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -124,7 +124,7 @@ void __bio_advance(struct bio *, unsigned bytes); /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance - * @bytes: number of bytes to complete + * @nbytes: number of bytes to complete * * This updates bi_sector, bi_size and bi_idx; if the number of bytes to * complete doesn't align with a bvec boundary, then bv_len and bv_offset will @@ -332,7 +332,7 @@ extern struct bio *bio_split(struct bio *bio, int sectors, * @gfp: gfp mask * @bs: bio set to allocate from * - * Returns a bio representing the next @sectors of @bio - if the bio is smaller + * Return: a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2949d9ac7484..d319ffa59354 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -100,7 +99,6 @@ struct request { struct request *rq_next; }; - struct gendisk *rq_disk; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ @@ -140,7 +138,7 @@ struct request { unsigned short ioprio; enum mq_rq_state state; - refcount_t ref; + atomic_t ref; unsigned long deadline; @@ -218,6 +216,56 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) + +#define rq_list_for_each_safe(listptr, pos, nxt) \ + for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ + pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
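As a usage note, these macros implement a simple LIFO of requests chained through rq_next. A short sketch of how a driver might drain such a list, assuming a hypothetical my_submit() callback (not a real API):

#include <linux/blk-mq.h>

static void mydrv_flush_plugged(struct request **rqlist,
				void (*my_submit)(struct request *))
{
	struct request *rq;

	/* Pop one request at a time so the list head stays valid even if
	 * my_submit() pushes a request onto another rq_list. */
	while ((rq = rq_list_pop(rqlist)))
		my_submit(rq);
}

A ->queue_rqs() implementation can drain its argument the same way; as the kernel-doc further down notes, anything left on the list is queued individually by the block layer after the hook returns.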
+ +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + +/** + * rq_list_move() - move a struct request from one list to another + * @src: The source list @rq is currently in + * @dst: The destination list that @rq will be appended to + * @rq: The request to move + * @prev: The request preceding @rq in @src (NULL if @rq is the head) + */ +static inline void rq_list_move(struct request **src, struct request **dst, + struct request *rq, struct request *prev) +{ + if (prev) + prev->rq_next = rq->rq_next; + else + *src = rq->rq_next; + rq_list_add(dst, rq); +} + enum blk_eh_timer_return { BLK_EH_DONE, /* driver has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ @@ -376,13 +424,6 @@ struct blk_mq_hw_ctx { * q->unused_hctx_list. */ struct list_head hctx_list; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is - * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also - * blk_mq_hw_ctx_size(). - */ - struct srcu_struct srcu[]; }; /** @@ -479,8 +520,6 @@ struct blk_mq_queue_data { bool last; }; -typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, - bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** @@ -503,6 +542,14 @@ struct blk_mq_ops { */ void (*commit_rqs)(struct blk_mq_hw_ctx *); + /** + * @queue_rqs: Queue a list of new requests. Driver is guaranteed + * that each request belongs to the same queue. If the driver doesn't + * empty the @rqlist completely, then the rest will be queued + * individually by the block layer upon return. + */ + void (*queue_rqs)(struct request **rqlist); + /** + * @get_budget: Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the @@ -752,6 +799,17 @@ static inline void blk_mq_set_request_complete(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } +/* + * Complete the request directly instead of deferring it to softirq or + * completing it on another CPU. Useful in preemptible context instead of + * interrupt context.
+ */ +static inline void blk_mq_complete_request_direct(struct request *rq, + void (*complete)(struct request *rq)) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + complete(rq); +} + void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); @@ -879,9 +937,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, @@ -917,10 +972,9 @@ int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); -void blk_execute_rq_nowait(struct gendisk *, struct request *, int, - rq_end_io_fn *); -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); +void blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *end_io); +blk_status_t blk_execute_rq(struct request *rq, bool at_head); struct req_iterator { struct bvec_iter iter; @@ -947,7 +1001,6 @@ struct req_iterator { * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats @@ -971,8 +1024,6 @@ static inline int blk_rq_cur_bytes(const struct request *rq) return bio_iovec(rq->bio).bv_len; } -unsigned int blk_rq_err_bytes(const struct request *rq); - static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; @@ -1135,14 +1186,4 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq) } #endif /* CONFIG_BLK_DEV_ZONED */ -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* BLK_MQ_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd4370baccca..9c95df26fc26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -16,6 +16,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -44,7 +45,7 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 -static inline int blk_validate_block_size(unsigned int bsize) +static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; @@ -267,7 +268,7 @@ struct request_queue { int poll_nsec; struct blk_stat_callback *poll_cb; - struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; + struct blk_rq_stat *poll_stat; struct timer_list timeout; struct work_struct timeout_work; @@ -373,11 +374,18 @@ struct request_queue { * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; + + /** + * @srcu: Sleepable RCU. Use as lock when type of the request queue + * is blocking (BLK_MQ_F_BLOCKING). 
Must be the last member + */ + struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -397,7 +405,6 @@ struct request_queue { #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ @@ -416,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) +#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) @@ -1171,8 +1179,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); -void blk_crypto_unregister(struct request_queue *q); - #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, @@ -1181,8 +1187,6 @@ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, return true; } -static inline void blk_crypto_unregister(struct request_queue *q) { } - #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { @@ -1335,33 +1339,4 @@ struct io_comp_batch { #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } -#define rq_list_add(listptr, rq) do { \ - (rq)->rq_next = *(listptr); \ - *(listptr) = rq; \ -} while (0) - -#define rq_list_pop(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) { \ - __req = *(listptr); \ - *(listptr) = __req->rq_next; \ - } \ - __req; \ -}) - -#define rq_list_peek(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) \ - __req = *(listptr); \ - __req; \ -}) - -#define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ - -#define rq_list_next(rq) (rq)->rq_next -#define rq_list_empty(list) ((list) == (struct request *) NULL) - #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 493b87e3616b..58e911cb3885 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2786,8 +2786,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git 
a/include/linux/genhd.h b/include/linux/genhd.h index 74c410263113..6906a45bc761 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -39,57 +39,24 @@ struct partition_meta_info { /** * DOC: genhd capability flags * - * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device - * gives access to removable media. - * When set, the device remains present even when media is not - * inserted. - * Must not be set for devices which are removed entirely when the + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style - * device. - * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. + * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. * - * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include - * partition information in ``/proc/partitions`` or in the output of - * printk_all_partitions(). - * Used for the null block device and some MMC devices. + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. * - * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended - * dynamic ``dev_t``, i.e. it wants extended device numbers - * (``BLOCK_EXT_MAJOR``). - * This affects the maximum number of partitions. - * - * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the - * partition table, the device's capacity has been extended to its - * native capacity; i.e. the device has hidden capacity used by one - * of the partitions (this is a flag used so that native capacity is - * only ever unlocked once). - * - * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is - * blocked whenever a writer holds an exclusive lock. - * - * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. - * Used for loop devices in their default settings and some MMC - * devices. - * - * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it - * doesn't produce events, doesn't appear in sysfs, and doesn't have - * an associated ``bdev``. - * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and - * ``GENHD_FL_NO_PART_SCAN``. - * Used for multipath devices. */ -#define GENHD_FL_REMOVABLE 0x0001 -/* 2 is unused (used to be GENHD_FL_DRIVERFS) */ -/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_CD 0x0008 -#define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 -#define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_NATIVE_CAPACITY 0x0080 -#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 -#define GENHD_FL_NO_PART_SCAN 0x0200 -#define GENHD_FL_HIDDEN 0x0400 +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ @@ -101,6 +68,8 @@ enum { DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; @@ -115,13 +84,13 @@ struct blk_integrity { }; struct gendisk { - /* major, first_minor and minors are input parameters only, - * don't use directly. 
Use disk_devt() and disk_max_parts(). + /* + * major/first_minor/minors should not be set by any new driver; the + * block core will take care of allocating them automatically. */ - int major; /* major number of driver */ + int major; int first_minor; - int minors; /* maximum number of minors, =1 for - * disks that can't be partitioned. */ + int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ @@ -140,6 +109,7 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -184,19 +154,6 @@ static inline bool disk_live(struct gendisk *disk) #define disk_to_cdi(disk) NULL #endif -static inline int disk_max_parts(struct gendisk *disk) -{ - if (disk->flags & GENHD_FL_EXT_DEVT) - return DISK_MAX_PARTS; - return disk->minors; -} - -static inline bool disk_part_scan_enabled(struct gendisk *disk) -{ - return disk_max_parts(disk) > 1 && - !(disk->flags & GENHD_FL_NO_PART_SCAN); -} - static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 0a9dc40b7be8..14f7eaf1b443 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -99,55 +99,40 @@ struct io_cq { struct io_context { atomic_long_t refcount; atomic_t active_ref; - atomic_t nr_tasks; - - /* all the fields below are protected by this lock */ - spinlock_t lock; unsigned short ioprio; +#ifdef CONFIG_BLK_ICQ + /* all the fields below are protected by this lock */ + spinlock_t lock; + struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; struct work_struct release_work; +#endif /* CONFIG_BLK_ICQ */ }; -/** - * get_io_context_active - get active reference on ioc - * @ioc: ioc of interest - * - * Only iocs with active reference can issue new IOs. This function - * acquires an active reference on @ioc. The caller must already have an - * active reference on @ioc.
- */ -static inline void get_io_context_active(struct io_context *ioc) -{ - WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); - WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); - atomic_long_inc(&ioc->refcount); - atomic_inc(&ioc->active_ref); -} - -static inline void ioc_task_link(struct io_context *ioc) -{ - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); - atomic_inc(&ioc->nr_tasks); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); -void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node); +int __copy_io(unsigned long clone_flags, struct task_struct *tsk); +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + if (!current->io_context) + return 0; + return __copy_io(clone_flags, tsk); +} #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -#endif +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} +#endif /* CONFIG_BLOCK */ -#endif +#endif /* IOCONTEXT_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d150a9082b31..422bdf9f4e76 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -962,6 +962,35 @@ static inline int add_to_page_cache(struct page *page, int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); + +/** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. + */ +static inline bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, + loff_t end_byte) +{ + if (!mapping->nrpages) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + return filemap_range_has_writeback(mapping, start_byte, end_byte); +} + /** * struct readahead_control - Describes a readahead request. 
* diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 477a800a9543..6794d7322cbd 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -164,7 +164,7 @@ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - return *(struct scsi_driver **)rq->rq_disk->private_data; + return *(struct scsi_driver **)rq->q->disk->private_data; } void scsi_done(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d1c6fc83b1e3..ab7557d84f75 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -275,9 +275,9 @@ scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...); do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ - if (__rq->rq_disk) \ + if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ - __rq->rq_disk->disk_name, ##a); \ + __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) diff --git a/include/scsi/scsi_ioctl.h b/include/scsi/scsi_ioctl.h index d2cb9aeaf1f1..beac64e38b87 100644 --- a/include/scsi/scsi_ioctl.h +++ b/include/scsi/scsi_ioctl.h @@ -45,8 +45,8 @@ typedef struct scsi_fctargaddress { int scsi_ioctl_block_when_processing_errors(struct scsi_device *sdev, int cmd, bool ndelay); -int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, - int cmd, void __user *arg); +int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, + void __user *arg); int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a95daa4d4caa..27170e40e8c9 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -85,7 +85,7 @@ TRACE_EVENT(block_rq_requeue, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); @@ -128,7 +128,7 @@ TRACE_EVENT(block_rq_complete, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); @@ -161,7 +161,7 @@ DECLARE_EVENT_CLASS(block_rq, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? 
disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); @@ -512,7 +512,7 @@ TRACE_EVENT(block_rq_remap, ), TP_fast_assign( - __entry->dev = disk_devt(rq->rq_disk); + __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..3161d7980155 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1556,32 +1556,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) return error; } -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1183c88634aa..af68a67179b4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -34,7 +34,7 @@ static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk) struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; - spin_lock_irqsave(&running_trace_lock, flags); + raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } - spin_unlock_irqrestore(&running_trace_lock, flags); + raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1045,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, } r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), @@ 
-1608,9 +1608,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); } diff --git a/mm/filemap.c b/mm/filemap.c index 39c4c46c6133..3baf03c0f608 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -646,8 +646,8 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -static bool filemap_range_has_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +667,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping, } rcu_read_unlock(); return page != NULL; - } - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. - */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - return filemap_range_has_writeback(mapping, start_byte, end_byte); -} -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range
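To close, the filemap_range_needs_writeback() split shown in the pagemap.h and filemap.c hunks, with the cheap checks inlined and only the xarray walk out of line, is aimed at callers like the following sketch; my_dio_rw() is invented for illustration and error handling is trimmed:

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

static ssize_t my_dio_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter) - 1;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* The inline helper checks nrpages and the dirty/writeback
		 * tags before paying for the xarray walk. */
		if (filemap_range_needs_writeback(mapping, pos, end))
			return -EAGAIN;
	} else {
		int err = filemap_write_and_wait_range(mapping, pos, end);

		if (err)
			return err;
	}
	/* ... issue the direct I/O here ... */
	return 0;
}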