Merge branch 'for-4.3/drivers' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:
 "On top of the 4.3 core block IO changes, here are the driver related
  changes for 4.3.  Basically just NVMe and nbd this time around:

   - NVMe:
      - PRACT PI improvement from Alok Pandey.
      - Cleanups and improvements on submission queue entry writing and
        doorbell ringing, using the CMB if available.  From Jon Derrick.
      - From Keith, support for setting queue maximum segments, and
        reset support.
      - Also from Jon, a fixup of a u64 division issue on 32-bit archs
        and wiring up of the reset support through an ioctl.
      - Two small cleanups from Matias and Sunad

   - Various nbd code cleanups and fixes from Markus Pargmann"

* 'for-4.3/drivers' of git://git.kernel.dk/linux-block:
  NVMe: Using PRACT bit to generate and verify PI by controller
  NVMe:Remove unreachable code in nvme_abort_req
  NVMe: Add nvme subsystem reset IOCTL
  NVMe: Add nvme subsystem reset support
  NVMe: removed unused nn var from nvme_dev_add
  NVMe: Set queue max segments
  nbd: flags is a u32 variable
  nbd: Rename functions for clearness of recv/send path
  nbd: Change 'disconnect' to be boolean
  nbd: Add debugfs entries
  nbd: Remove variable 'pid'
  nbd: Move clear queue debug message
  nbd: Remove 'harderror' and propagate error properly
  nbd: restructure sock_shutdown
  nbd: sock_shutdown, remove conditional lock
  nbd: Fix timeout detection
  nvme: Fixes u64 division which breaks i386 builds
  NVMe: Use CMB for the IO SQes if available
  NVMe: Unify SQ entry writing and doorbell ringing
Linus Torvalds 2015-09-02 13:14:58 -07:00
commit 52b084d31c
4 changed files with 506 additions and 146 deletions

drivers/block/nbd.c

@@ -33,6 +33,7 @@
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <asm/uaccess.h>
#include <asm/types.h>
@@ -40,8 +41,7 @@
#include <linux/nbd.h>
struct nbd_device {
int flags;
int harderror; /* Code of hard error */
u32 flags;
struct socket * sock; /* If == NULL, device is not ready, yet */
int magic;
@@ -56,11 +56,24 @@ struct nbd_device {
struct gendisk *disk;
int blksize;
loff_t bytesize;
pid_t pid; /* pid of nbd-client, if attached */
int xmit_timeout;
int disconnect; /* a disconnect has been requested by user */
bool disconnect; /* a disconnect has been requested by user */
struct timer_list timeout_timer;
struct task_struct *task_recv;
struct task_struct *task_send;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbg_dir;
#endif
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif
#define nbd_name(nbd) ((nbd)->disk->disk_name)
#define NBD_MAGIC 0x68797548
static unsigned int nbds_max = 16;
@@ -113,26 +126,36 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req)
/*
* Forcibly shutdown the socket causing all listeners to error
*/
static void sock_shutdown(struct nbd_device *nbd, int lock)
static void sock_shutdown(struct nbd_device *nbd)
{
if (lock)
mutex_lock(&nbd->tx_lock);
if (nbd->sock) {
if (!nbd->sock)
return;
dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
nbd->sock = NULL;
}
if (lock)
mutex_unlock(&nbd->tx_lock);
del_timer_sync(&nbd->timeout_timer);
}
static void nbd_xmit_timeout(unsigned long arg)
{
struct task_struct *task = (struct task_struct *)arg;
struct nbd_device *nbd = (struct nbd_device *)arg;
struct task_struct *task;
printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
task->comm, task->pid);
if (list_empty(&nbd->queue_head))
return;
nbd->disconnect = true;
task = READ_ONCE(nbd->task_recv);
if (task)
force_sig(SIGKILL, task);
task = READ_ONCE(nbd->task_send);
if (task)
force_sig(SIGKILL, nbd->task_send);
dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
}
/*
@@ -171,33 +194,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
if (send) {
struct timer_list ti;
if (nbd->xmit_timeout) {
init_timer(&ti);
ti.function = nbd_xmit_timeout;
ti.data = (unsigned long)current;
ti.expires = jiffies + nbd->xmit_timeout;
add_timer(&ti);
}
if (send)
result = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (nbd->xmit_timeout)
del_timer_sync(&ti);
} else
else
result = kernel_recvmsg(sock, &msg, &iov, 1, size,
msg.msg_flags);
if (signal_pending(current)) {
siginfo_t info;
printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
task_pid_nr(current), current->comm,
dequeue_signal_lock(current, &current->blocked, &info));
result = -EINTR;
sock_shutdown(nbd, !send);
break;
}
if (result <= 0) {
if (result == 0)
result = -EPIPE; /* short read */
@@ -210,6 +212,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
sigprocmask(SIG_SETMASK, &oldset, NULL);
tsk_restore_flags(current, pflags, PF_MEMALLOC);
if (!send && nbd->xmit_timeout)
mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
return result;
}
@@ -333,26 +338,24 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk),
"Receive control failed (result %d)\n", result);
goto harderror;
return ERR_PTR(result);
}
if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
(unsigned long)ntohl(reply.magic));
result = -EPROTO;
goto harderror;
return ERR_PTR(-EPROTO);
}
req = nbd_find_request(nbd, *(struct request **)reply.handle);
if (IS_ERR(req)) {
result = PTR_ERR(req);
if (result != -ENOENT)
goto harderror;
return ERR_PTR(result);
dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
reply.handle);
result = -EBADR;
goto harderror;
return ERR_PTR(-EBADR);
}
if (ntohl(reply.error)) {
@@ -380,18 +383,15 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
}
}
return req;
harderror:
nbd->harderror = result;
return NULL;
}
static ssize_t pid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
return sprintf(buf, "%ld\n",
(long) ((struct nbd_device *)disk->private_data)->pid);
return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}
static struct device_attribute pid_attr = {
@@ -399,7 +399,7 @@ static struct device_attribute pid_attr = {
.show = pid_show,
};
static int nbd_do_it(struct nbd_device *nbd)
static int nbd_thread_recv(struct nbd_device *nbd)
{
struct request *req;
int ret;
@@ -407,20 +407,43 @@ static int nbd_do_it(struct nbd_device *nbd)
BUG_ON(nbd->magic != NBD_MAGIC);
sk_set_memalloc(nbd->sock->sk);
nbd->pid = task_pid_nr(current);
nbd->task_recv = current;
ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (ret) {
dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
nbd->pid = 0;
nbd->task_recv = NULL;
return ret;
}
while ((req = nbd_read_stat(nbd)) != NULL)
while (1) {
req = nbd_read_stat(nbd);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
break;
}
nbd_end_request(nbd, req);
}
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
nbd->pid = 0;
return 0;
nbd->task_recv = NULL;
if (signal_pending(current)) {
siginfo_t info;
ret = dequeue_signal_lock(current, &current->blocked, &info);
dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
task_pid_nr(current), current->comm, ret);
mutex_lock(&nbd->tx_lock);
sock_shutdown(nbd);
mutex_unlock(&nbd->tx_lock);
ret = -ETIMEDOUT;
}
return ret;
}
static void nbd_clear_que(struct nbd_device *nbd)
@@ -455,6 +478,7 @@ static void nbd_clear_que(struct nbd_device *nbd)
req->errors++;
nbd_end_request(nbd, req);
}
dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
@@ -482,6 +506,9 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
nbd->active_req = req;
if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
if (nbd_send_req(nbd, req) != 0) {
dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
req->errors++;
@@ -503,11 +530,13 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
nbd_end_request(nbd, req);
}
static int nbd_thread(void *data)
static int nbd_thread_send(void *data)
{
struct nbd_device *nbd = data;
struct request *req;
nbd->task_send = current;
set_user_nice(current, MIN_NICE);
while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
/* wait for something to do */
@@ -515,6 +544,20 @@ static int nbd_thread(void *data)
kthread_should_stop() ||
!list_empty(&nbd->waiting_queue));
if (signal_pending(current)) {
siginfo_t info;
int ret;
ret = dequeue_signal_lock(current, &current->blocked,
&info);
dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
task_pid_nr(current), current->comm, ret);
mutex_lock(&nbd->tx_lock);
sock_shutdown(nbd);
mutex_unlock(&nbd->tx_lock);
break;
}
/* extract request */
if (list_empty(&nbd->waiting_queue))
continue;
@@ -528,6 +571,9 @@ static int nbd_thread(void *data)
/* handle request */
nbd_handle_req(nbd, req);
}
nbd->task_send = NULL;
return 0;
}
@@ -538,7 +584,7 @@ static int nbd_thread(void *data)
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
static void do_nbd_request(struct request_queue *q)
static void nbd_request_handler(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct request *req;
@@ -574,6 +620,9 @@ static void do_nbd_request(struct request_queue *q)
}
}
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
/* Must be called with tx_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
@@ -597,7 +646,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
if (!nbd->sock)
return -EINVAL;
nbd->disconnect = 1;
nbd->disconnect = true;
nbd_send_req(nbd, &sreq);
return 0;
@@ -625,7 +674,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
nbd->sock = sock;
if (max_part > 0)
bdev->bd_invalidated = 1;
nbd->disconnect = 0; /* we're connected now */
nbd->disconnect = false; /* we're connected now */
return 0;
}
return -EINVAL;
@@ -648,6 +697,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_SET_TIMEOUT:
nbd->xmit_timeout = arg * HZ;
if (arg)
mod_timer(&nbd->timeout_timer,
jiffies + nbd->xmit_timeout);
else
del_timer_sync(&nbd->timeout_timer);
return 0;
case NBD_SET_FLAGS:
@@ -666,7 +721,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
struct socket *sock;
int error;
if (nbd->pid)
if (nbd->task_recv)
return -EBUSY;
if (!nbd->sock)
return -EINVAL;
@@ -683,24 +738,24 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
else
blk_queue_flush(nbd->disk->queue, 0);
thread = kthread_run(nbd_thread, nbd, "%s",
nbd->disk->disk_name);
thread = kthread_run(nbd_thread_send, nbd, "%s",
nbd_name(nbd));
if (IS_ERR(thread)) {
mutex_lock(&nbd->tx_lock);
return PTR_ERR(thread);
}
error = nbd_do_it(nbd);
nbd_dev_dbg_init(nbd);
error = nbd_thread_recv(nbd);
nbd_dev_dbg_close(nbd);
kthread_stop(thread);
mutex_lock(&nbd->tx_lock);
if (error)
return error;
sock_shutdown(nbd, 0);
sock_shutdown(nbd);
sock = nbd->sock;
nbd->sock = NULL;
nbd_clear_que(nbd);
dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
kill_bdev(bdev);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
set_device_ro(bdev, false);
@@ -714,7 +769,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
blkdev_reread_part(bdev);
if (nbd->disconnect) /* user requested, ignore socket errors */
return 0;
return nbd->harderror;
return error;
}
case NBD_CLEAR_QUE:
@@ -758,6 +813,161 @@ static const struct block_device_operations nbd_fops =
.ioctl = nbd_ioctl,
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
if (nbd->task_recv)
seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
if (nbd->task_send)
seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
return 0;
}
static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}
static const struct file_operations nbd_dbg_tasks_ops = {
.open = nbd_dbg_tasks_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
u32 flags = nbd->flags;
seq_printf(s, "Hex: 0x%08x\n\n", flags);
seq_puts(s, "Known flags:\n");
if (flags & NBD_FLAG_HAS_FLAGS)
seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
if (flags & NBD_FLAG_READ_ONLY)
seq_puts(s, "NBD_FLAG_READ_ONLY\n");
if (flags & NBD_FLAG_SEND_FLUSH)
seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
return 0;
}
static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
return single_open(file, nbd_dbg_flags_show, inode->i_private);
}
static const struct file_operations nbd_dbg_flags_ops = {
.open = nbd_dbg_flags_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
struct dentry *dir;
struct dentry *f;
dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
if (IS_ERR_OR_NULL(dir)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n",
nbd_name(nbd), PTR_ERR(dir));
return PTR_ERR(dir);
}
nbd->dbg_dir = dir;
f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
if (IS_ERR_OR_NULL(f)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n",
PTR_ERR(f));
return PTR_ERR(f);
}
f = debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
if (IS_ERR_OR_NULL(f)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n",
PTR_ERR(f));
return PTR_ERR(f);
}
f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
if (IS_ERR_OR_NULL(f)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n",
PTR_ERR(f));
return PTR_ERR(f);
}
f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
if (IS_ERR_OR_NULL(f)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n",
PTR_ERR(f));
return PTR_ERR(f);
}
f = debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops);
if (IS_ERR_OR_NULL(f)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n",
PTR_ERR(f));
return PTR_ERR(f);
}
return 0;
}
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
debugfs_remove_recursive(nbd->dbg_dir);
}
static int nbd_dbg_init(void)
{
struct dentry *dbg_dir;
dbg_dir = debugfs_create_dir("nbd", NULL);
if (IS_ERR(dbg_dir))
return PTR_ERR(dbg_dir);
nbd_dbg_dir = dbg_dir;
return 0;
}
static void nbd_dbg_close(void)
{
debugfs_remove_recursive(nbd_dbg_dir);
}
#else /* IS_ENABLED(CONFIG_DEBUG_FS) */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
return 0;
}
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}
static int nbd_dbg_init(void)
{
return 0;
}
static void nbd_dbg_close(void)
{
}
#endif
/*
* And here should be modules and kernel interface
* (Just smiley confuses emacs :-)
@@ -811,7 +1021,7 @@ static int __init nbd_init(void)
* every gendisk to have its very own request_queue struct.
* These structs are big so we dynamically allocate them.
*/
disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
if (!disk->queue) {
put_disk(disk);
goto out;
@@ -835,6 +1045,8 @@ static int __init nbd_init(void)
printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
nbd_dbg_init();
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].magic = NBD_MAGIC;
@@ -842,6 +1054,9 @@ static int __init nbd_init(void)
spin_lock_init(&nbd_dev[i].queue_lock);
INIT_LIST_HEAD(&nbd_dev[i].queue_head);
mutex_init(&nbd_dev[i].tx_lock);
init_timer(&nbd_dev[i].timeout_timer);
nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
init_waitqueue_head(&nbd_dev[i].active_wq);
init_waitqueue_head(&nbd_dev[i].waiting_wq);
nbd_dev[i].blksize = 1024;
@@ -868,6 +1083,9 @@ static int __init nbd_init(void)
static void __exit nbd_cleanup(void)
{
int i;
nbd_dbg_close();
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].magic = 0;

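A note on the timeout rework above: the per-send timer that used to live inside sock_xmit() is replaced by a single per-device timer, armed from the send path and from NBD_SET_TIMEOUT (the driver stores arg * HZ jiffies). A minimal userspace sketch of arming it follows; it is illustrative only and assumes /dev/nbd0 already has a socket attached via NBD_SET_SOCK.

/* Illustrative only: arm the nbd transmit timeout from userspace.
 * Assumes /dev/nbd0 is already configured (NBD_SET_SOCK done elsewhere). */
#include <fcntl.h>
#include <linux/nbd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/nbd0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/nbd0");
		return 1;
	}
	/* 30 seconds; the driver stores arg * HZ and (re)arms
	 * nbd->timeout_timer, or deletes it when arg is 0. */
	if (ioctl(fd, NBD_SET_TIMEOUT, 30UL) < 0)
		perror("NBD_SET_TIMEOUT");
	close(fd);
	return 0;
}
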
drivers/block/nvme-core.c

@@ -72,6 +72,10 @@ module_param(nvme_char_major, int, 0);
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
@@ -103,6 +107,7 @@ struct nvme_queue {
char irqname[24]; /* nvme4294967295-65535\0 */
spinlock_t q_lock;
struct nvme_command *sq_cmds;
struct nvme_command __iomem *sq_cmds_io;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -379,27 +384,28 @@ static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
*
* Safe to use from interrupt context
*/
static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
struct nvme_command *cmd)
{
u16 tail = nvmeq->sq_tail;
if (nvmeq->sq_cmds_io)
memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
else
memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
if (++tail == nvmeq->q_depth)
tail = 0;
writel(tail, nvmeq->q_db);
nvmeq->sq_tail = tail;
return 0;
}
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nvmeq->q_lock, flags);
ret = __nvme_submit_cmd(nvmeq, cmd);
__nvme_submit_cmd(nvmeq, cmd);
spin_unlock_irqrestore(&nvmeq->q_lock, flags);
return ret;
}
static __le64 **iod_list(struct nvme_iod *iod)
@@ -730,18 +736,16 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
struct nvme_iod *iod)
{
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
struct nvme_command cmnd;
memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
cmnd->rw.command_id = req->tag;
memcpy(&cmnd, req->cmd, sizeof(cmnd));
cmnd.rw.command_id = req->tag;
if (req->nr_phys_segments) {
cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
}
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
__nvme_submit_cmd(nvmeq, &cmnd);
}
/*
@@ -754,45 +758,41 @@ static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
{
struct nvme_dsm_range *range =
(struct nvme_dsm_range *)iod_list(iod)[0];
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
struct nvme_command cmnd;
range->cattr = cpu_to_le32(0);
range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
cmnd->dsm.command_id = req->tag;
cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
cmnd->dsm.nr = 0;
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
memset(&cmnd, 0, sizeof(cmnd));
cmnd.dsm.opcode = nvme_cmd_dsm;
cmnd.dsm.command_id = req->tag;
cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
cmnd.dsm.nr = 0;
cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
__nvme_submit_cmd(nvmeq, &cmnd);
}
static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
int cmdid)
{
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
struct nvme_command cmnd;
memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
cmnd->common.command_id = cmdid;
cmnd->common.nsid = cpu_to_le32(ns->ns_id);
memset(&cmnd, 0, sizeof(cmnd));
cmnd.common.opcode = nvme_cmd_flush;
cmnd.common.command_id = cmdid;
cmnd.common.nsid = cpu_to_le32(ns->ns_id);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
__nvme_submit_cmd(nvmeq, &cmnd);
}
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
struct nvme_command cmnd;
u16 control = 0;
u32 dsmgmt = 0;
@@ -804,19 +804,16 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
memset(cmnd, 0, sizeof(*cmnd));
memset(&cmnd, 0, sizeof(cmnd));
cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd.rw.command_id = req->tag;
cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.command_id = req->tag;
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
if (blk_integrity_rq(req)) {
cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
@@ -825,19 +822,21 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
cmnd->rw.reftag = cpu_to_le32(
cmnd.rw.reftag = cpu_to_le32(
nvme_block_nr(ns, blk_rq_pos(req)));
break;
}
} else if (ns->ms)
if (blk_integrity_rq(req))
cmnd.rw.metadata =
cpu_to_le64(sg_dma_address(iod->meta_sg));
else
control |= NVME_RW_PRINFO_PRACT;
}
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
cmnd.rw.control = cpu_to_le16(control);
cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
__nvme_submit_cmd(nvmeq, &cmnd);
return 0;
}
@@ -1080,7 +1079,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
c.common.command_id = req->tag;
blk_mq_free_request(req);
return __nvme_submit_cmd(nvmeq, &c);
__nvme_submit_cmd(nvmeq, &c);
return 0;
}
static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
@@ -1103,7 +1103,8 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
cmd->common.command_id = req->tag;
return nvme_submit_cmd(nvmeq, cmd);
nvme_submit_cmd(nvmeq, cmd);
return 0;
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1315,12 +1316,7 @@ static void nvme_abort_req(struct request *req)
dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
nvmeq->qid);
if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
dev_warn(nvmeq->q_dmadev,
"Could not abort I/O %d QID %d",
req->tag, nvmeq->qid);
blk_mq_free_request(abort_req);
}
nvme_submit_cmd(dev->queues[0], &cmd);
}
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
@@ -1374,6 +1370,7 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
{
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (nvmeq->sq_cmds)
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
kfree(nvmeq);
@@ -1447,6 +1444,47 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
spin_unlock_irq(&nvmeq->q_lock);
}
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
int entry_size)
{
int q_depth = dev->q_depth;
unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
mem_per_q = round_down(mem_per_q, dev->page_size);
q_depth = div_u64(mem_per_q, entry_size);
/*
* Ensure the reduced q_depth is above some threshold where it
* would be better to map queues in system memory with the
* original depth
*/
if (q_depth < 64)
return -ENOMEM;
}
return q_depth;
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
{
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
unsigned offset = (qid - 1) *
roundup(SQ_SIZE(depth), dev->page_size);
nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
nvmeq->sq_cmds_io = dev->cmb + offset;
} else {
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
}
return 0;
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth)
{
@@ -1459,9 +1497,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
if (!nvmeq->cqes)
goto free_nvmeq;
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
goto free_cqdma;
nvmeq->q_dmadev = dev->dev;
@@ -1696,6 +1732,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
page_shift = dev_page_max;
}
dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
NVME_CAP_NSSRC(cap) : 0;
if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, &dev->bar->csts);
result = nvme_disable_ctrl(dev, cap);
if (result < 0)
return result;
@@ -1856,6 +1898,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
return status;
}
static int nvme_subsys_reset(struct nvme_dev *dev)
{
if (!dev->subsystem)
return -ENOTTY;
writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
return 0;
}
static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
unsigned long arg)
{
@@ -1989,7 +2040,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
!ns->ext)
nvme_init_integrity(ns);
if (ns->ms && !blk_get_integrity(disk))
if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
set_capacity(disk, 0);
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -2020,7 +2071,10 @@ static int nvme_kthread(void *data)
spin_lock(&dev_list_lock);
list_for_each_entry_safe(dev, next, &dev_list, node) {
int i;
if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
u32 csts = readl(&dev->bar->csts);
if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
csts & NVME_CSTS_CFS) {
if (work_busy(&dev->reset_work))
continue;
list_del_init(&dev->node);
@@ -2080,8 +2134,11 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
list_add_tail(&ns->list, &dev->namespaces);
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
if (dev->max_hw_sectors)
if (dev->max_hw_sectors) {
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
blk_queue_max_segments(ns->queue,
((dev->max_hw_sectors << 9) / dev->page_size) + 1);
}
if (dev->stripe_size)
blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
if (dev->vwc & NVME_CTRL_VWC_PRESENT)
@@ -2159,6 +2216,58 @@ static int set_queue_count(struct nvme_dev *dev, int count)
return min(result & 0xffff, result >> 16) + 1;
}
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
u64 szu, size, offset;
u32 cmbloc;
resource_size_t bar_size;
struct pci_dev *pdev = to_pci_dev(dev->dev);
void __iomem *cmb;
dma_addr_t dma_addr;
if (!use_cmb_sqes)
return NULL;
dev->cmbsz = readl(&dev->bar->cmbsz);
if (!(NVME_CMB_SZ(dev->cmbsz)))
return NULL;
cmbloc = readl(&dev->bar->cmbloc);
szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
size = szu * NVME_CMB_SZ(dev->cmbsz);
offset = szu * NVME_CMB_OFST(cmbloc);
bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));
if (offset > bar_size)
return NULL;
/*
* Controllers may support a CMB size larger than their BAR,
* for example, due to being behind a bridge. Reduce the CMB to
* the reported size of the BAR
*/
if (size > bar_size - offset)
size = bar_size - offset;
dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
cmb = ioremap_wc(dma_addr, size);
if (!cmb)
return NULL;
dev->cmb_dma_addr = dma_addr;
dev->cmb_size = size;
return cmb;
}
static inline void nvme_release_cmb(struct nvme_dev *dev)
{
if (dev->cmb) {
iounmap(dev->cmb);
dev->cmb = NULL;
}
}
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
@@ -2177,6 +2286,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (result < nr_io_queues)
nr_io_queues = result;
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
dev->q_depth = result;
else
nvme_release_cmb(dev);
}
size = db_bar_size(dev, nr_io_queues);
if (size > 8192) {
iounmap(dev->bar);
@@ -2344,7 +2462,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
int res;
unsigned nn;
struct nvme_id_ctrl *ctrl;
int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
@@ -2354,7 +2471,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
return -EIO;
}
nn = le32_to_cpup(&ctrl->nn);
dev->oncs = le16_to_cpup(&ctrl->oncs);
dev->abort_limit = ctrl->acl + 1;
dev->vwc = ctrl->vwc;
@@ -2440,6 +2556,8 @@ static int nvme_dev_map(struct nvme_dev *dev)
dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
dev->dbs = ((void __iomem *)dev->bar) + 4096;
if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
dev->cmb = nvme_map_cmb(dev);
return 0;
@@ -2820,6 +2938,8 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
case NVME_IOCTL_RESET:
dev_warn(dev->dev, "resetting controller\n");
return nvme_reset(dev);
case NVME_IOCTL_SUBSYS_RESET:
return nvme_subsys_reset(dev);
default:
return -ENOTTY;
}
@@ -3145,6 +3265,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dev_remove_admin(dev);
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
nvme_free_queues(dev, 0);
nvme_release_cmb(dev);
nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);
}

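To make the CMB queue sizing above concrete, here is a standalone, userspace-only sketch of the nvme_cmb_qdepth() arithmetic; every value below is invented for illustration (a real controller reports them through the cmbsz/cmbloc registers and CAP.MQES).

/* Standalone sketch of the nvme_cmb_qdepth() math; all values invented. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cmb_size = 256 * 1024;	/* assume a 256 KiB CMB */
	uint64_t page_size = 4096;
	int nr_io_queues = 8;
	int entry_size = 64;		/* one SQ entry (struct nvme_command) */
	int q_depth = 1024;		/* depth the driver would otherwise use */

	/* roundup(q_depth * entry_size, page_size) */
	uint64_t q_size_aligned =
		(q_depth * entry_size + page_size - 1) / page_size * page_size;

	if (q_size_aligned * nr_io_queues > cmb_size) {
		uint64_t mem_per_q = cmb_size / nr_io_queues;	/* div_u64() */

		mem_per_q -= mem_per_q % page_size;		/* round_down() */
		q_depth = mem_per_q / entry_size;
	}
	/* 256 KiB / 8 queues = 32 KiB per queue -> 32 KiB / 64 B = 512
	 * entries; below 64 the driver gives up on the CMB and maps the
	 * queues in host memory at the original depth instead. */
	printf("q_depth = %d\n", q_depth);
	return 0;
}
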
include/linux/nvme.h

@@ -28,18 +28,32 @@ struct nvme_bar {
__u32 cc; /* Controller Configuration */
__u32 rsvd1; /* Reserved */
__u32 csts; /* Controller Status */
__u32 rsvd2; /* Reserved */
__u32 nssr; /* Subsystem Reset */
__u32 aqa; /* Admin Queue Attributes */
__u64 asq; /* Admin SQ Base Address */
__u64 acq; /* Admin CQ Base Address */
__u32 cmbloc; /* Controller Memory Buffer Location */
__u32 cmbsz; /* Controller Memory Buffer Size */
};
#define NVME_CAP_MQES(cap) ((cap) & 0xffff)
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)
#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf)
#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7)
#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff)
#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff)
#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf)
#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10)
#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8)
#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4)
#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2)
#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1)
enum {
NVME_CC_ENABLE = 1 << 0,
NVME_CC_CSS_NVM = 0 << 4,
@@ -55,6 +69,7 @@ enum {
NVME_CC_IOCQES = 4 << 20,
NVME_CSTS_RDY = 1 << 0,
NVME_CSTS_CFS = 1 << 1,
NVME_CSTS_NSSRO = 1 << 4,
NVME_CSTS_SHST_NORMAL = 0 << 2,
NVME_CSTS_SHST_OCCUR = 1 << 2,
NVME_CSTS_SHST_CMPLT = 2 << 2,
@@ -97,9 +112,14 @@ struct nvme_dev {
char serial[20];
char model[40];
char firmware_rev[8];
bool subsystem;
u32 max_hw_sectors;
u32 stripe_size;
u32 page_size;
void __iomem *cmb;
dma_addr_t cmb_dma_addr;
u64 cmb_size;
u32 cmbsz;
u16 oncs;
u16 abort_limit;
u8 event_limit;

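As a quick sanity check of the new CMB macros above, the following standalone snippet decodes an invented cmbsz/cmbloc pair the same way nvme_map_cmb() does; the macro definitions are copied from the header and the register values are made up.

/* Illustrative decoding of cmbsz/cmbloc; the register values are invented. */
#include <stdint.h>
#include <stdio.h>

#define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
#define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
#define NVME_CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
#define NVME_CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)
#define NVME_CMB_SQS(cmbsz)	((cmbsz) & 0x1)

int main(void)
{
	uint32_t cmbsz = (256u << 12) | (0u << 8) | 0x1; /* SZ=256, SZU=0, SQS */
	uint32_t cmbloc = (1u << 12) | 0x2;		 /* OFST=1, BIR=2 */

	uint64_t szu = 1ULL << (12 + 4 * NVME_CMB_SZU(cmbsz)); /* 4 KiB units */
	uint64_t size = szu * NVME_CMB_SZ(cmbsz);	/* 256 * 4 KiB = 1 MiB */
	uint64_t offset = szu * NVME_CMB_OFST(cmbloc);	/* 4 KiB into the BAR */

	printf("CMB: %llu bytes at BAR%u+%llu, SQEs %s\n",
	       (unsigned long long)size, NVME_CMB_BIR(cmbloc),
	       (unsigned long long)offset,
	       NVME_CMB_SQS(cmbsz) ? "supported" : "unsupported");
	return 0;
}
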
include/uapi/linux/nvme.h

@@ -584,5 +584,6 @@ struct nvme_passthru_cmd {
#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io)
#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd)
#define NVME_IOCTL_RESET _IO('N', 0x44)
#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45)
#endif /* _UAPI_LINUX_NVME_H */
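
Finally, an illustrative sketch of driving the new reset path from userspace through the per-controller character device; the /dev/nvme0 node name is an assumption about the local setup, and the ioctl numbers are copied from the header above.

/* Userspace sketch for the reset ioctls defined above. The driver's
 * NVME_IOCTL_SUBSYS_RESET handler returns -ENOTTY when the controller
 * does not report subsystem reset support (CAP.NSSRS). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define NVME_IOCTL_RESET	_IO('N', 0x44)
#define NVME_IOCTL_SUBSYS_RESET	_IO('N', 0x45)

int main(void)
{
	int fd = open("/dev/nvme0", O_RDWR);	/* per-controller char device */

	if (fd < 0) {
		perror("open /dev/nvme0");
		return 1;
	}
	if (ioctl(fd, NVME_IOCTL_SUBSYS_RESET) < 0) {
		perror("subsystem reset (falling back to controller reset)");
		if (ioctl(fd, NVME_IOCTL_RESET) < 0)
			perror("controller reset");
	}
	close(fd);
	return 0;
}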