Merge git://git.infradead.org/users/willy/linux-nvme
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 8 Sep 2013 03:19:02 +0000 (20:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 8 Sep 2013 03:19:02 +0000 (20:19 -0700)
Pull NVM Express driver update from Matthew Wilcox.

* git://git.infradead.org/users/willy/linux-nvme:
  NVMe: Merge issue on character device bring-up
  NVMe: Handle ioremap failure
  NVMe: Add pci suspend/resume driver callbacks
  NVMe: Use normal shutdown
  NVMe: Separate controller init from disk discovery
  NVMe: Separate queue alloc/free from create/delete
  NVMe: Group pci related actions in functions
  NVMe: Disk stats for read/write commands only
  NVMe: Bring up cdev on set feature failure
  NVMe: Fix checkpatch issues
  NVMe: Namespace IDs are unsigned
  NVMe: Update nvme_id_power_state with latest spec
  NVMe: Split header file into user-visible and kernel-visible pieces
  NVMe: Call nvme_process_cq from submission path
  NVMe: Remove "process_cq did something" message
  NVMe: Return correct value from interrupt handler
  NVMe: Disk IO statistics
  NVMe: Restructure MSI / MSI-X setup
  NVMe: Use kzalloc instead of kmalloc+memset

drivers/block/nvme-core.c
drivers/block/nvme-scsi.c
include/linux/nvme.h
include/uapi/linux/Kbuild
include/uapi/linux/nvme.h [new file with mode: 0644]

index ce79a59..da52092 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 #include <linux/poison.h>
+#include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -79,7 +80,9 @@ struct nvme_queue {
        u16 sq_head;
        u16 sq_tail;
        u16 cq_head;
-       u16 cq_phase;
+       u8 cq_phase;
+       u8 cqe_seen;
+       u8 q_suspended;
        unsigned long cmdid_data[];
 };
 
@@ -115,6 +118,11 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
        return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
 }
 
+static unsigned nvme_queue_extra(int depth)
+{
+       return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+}
+
 /**
  * alloc_cmdid() - Allocate a Command ID
  * @nvmeq: The queue that will be used for this command
@@ -285,6 +293,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
                iod->npages = -1;
                iod->length = nbytes;
                iod->nents = 0;
+               iod->start_time = jiffies;
        }
 
        return iod;
@@ -308,6 +317,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
        kfree(iod);
 }
 
+static void nvme_start_io_acct(struct bio *bio)
+{
+       struct gendisk *disk = bio->bi_bdev->bd_disk;
+       const int rw = bio_data_dir(bio);
+       int cpu = part_stat_lock();
+       part_round_stats(cpu, &disk->part0);
+       part_stat_inc(cpu, &disk->part0, ios[rw]);
+       part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
+       part_inc_in_flight(&disk->part0, rw);
+       part_stat_unlock();
+}
+
+static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
+{
+       struct gendisk *disk = bio->bi_bdev->bd_disk;
+       const int rw = bio_data_dir(bio);
+       unsigned long duration = jiffies - start_time;
+       int cpu = part_stat_lock();
+       part_stat_add(cpu, &disk->part0, ticks[rw], duration);
+       part_round_stats(cpu, &disk->part0);
+       part_dec_in_flight(&disk->part0, rw);
+       part_stat_unlock();
+}
+
 static void bio_completion(struct nvme_dev *dev, void *ctx,
                                                struct nvme_completion *cqe)
 {
@@ -315,9 +348,11 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
        struct bio *bio = iod->private;
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 
-       if (iod->nents)
+       if (iod->nents) {
                dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
                        bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               nvme_end_io_acct(bio, iod->start_time);
+       }
        nvme_free_iod(dev, iod);
        if (status)
                bio_endio(bio, -EIO);
@@ -422,10 +457,8 @@ static void nvme_bio_pair_endio(struct bio *bio, int err)
 
        if (atomic_dec_and_test(&bp->cnt)) {
                bio_endio(bp->parent, bp->err);
-               if (bp->bv1)
-                       kfree(bp->bv1);
-               if (bp->bv2)
-                       kfree(bp->bv2);
+               kfree(bp->bv1);
+               kfree(bp->bv2);
                kfree(bp);
        }
 }
@@ -695,6 +728,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
+       nvme_start_io_acct(bio);
        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
        writel(nvmeq->sq_tail, nvmeq->q_db);
@@ -709,26 +743,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
        return result;
 }
 
-static void nvme_make_request(struct request_queue *q, struct bio *bio)
-{
-       struct nvme_ns *ns = q->queuedata;
-       struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
-       int result = -EBUSY;
-
-       spin_lock_irq(&nvmeq->q_lock);
-       if (bio_list_empty(&nvmeq->sq_cong))
-               result = nvme_submit_bio_queue(nvmeq, ns, bio);
-       if (unlikely(result)) {
-               if (bio_list_empty(&nvmeq->sq_cong))
-                       add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-               bio_list_add(&nvmeq->sq_cong, bio);
-       }
-
-       spin_unlock_irq(&nvmeq->q_lock);
-       put_nvmeq(nvmeq);
-}
-
-static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
+static int nvme_process_cq(struct nvme_queue *nvmeq)
 {
        u16 head, phase;
 
@@ -758,13 +773,40 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
         * a big problem.
         */
        if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
-               return IRQ_NONE;
+               return 0;
 
        writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
        nvmeq->cq_head = head;
        nvmeq->cq_phase = phase;
 
-       return IRQ_HANDLED;
+       nvmeq->cqe_seen = 1;
+       return 1;
+}
+
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
+{
+       struct nvme_ns *ns = q->queuedata;
+       struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
+       int result = -EBUSY;
+
+       if (!nvmeq) {
+               put_nvmeq(NULL);
+               bio_endio(bio, -EIO);
+               return;
+       }
+
+       spin_lock_irq(&nvmeq->q_lock);
+       if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
+               result = nvme_submit_bio_queue(nvmeq, ns, bio);
+       if (unlikely(result)) {
+               if (bio_list_empty(&nvmeq->sq_cong))
+                       add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+               bio_list_add(&nvmeq->sq_cong, bio);
+       }
+
+       nvme_process_cq(nvmeq);
+       spin_unlock_irq(&nvmeq->q_lock);
+       put_nvmeq(nvmeq);
 }
 
 static irqreturn_t nvme_irq(int irq, void *data)
@@ -772,7 +814,9 @@ static irqreturn_t nvme_irq(int irq, void *data)
        irqreturn_t result;
        struct nvme_queue *nvmeq = data;
        spin_lock(&nvmeq->q_lock);
-       result = nvme_process_cq(nvmeq);
+       nvme_process_cq(nvmeq);
+       result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
+       nvmeq->cqe_seen = 0;
        spin_unlock(&nvmeq->q_lock);
        return result;
 }
@@ -986,8 +1030,15 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
        }
 }
 
-static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
+static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
+       spin_lock_irq(&nvmeq->q_lock);
+       while (bio_list_peek(&nvmeq->sq_cong)) {
+               struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+               bio_endio(bio, -EIO);
+       }
+       spin_unlock_irq(&nvmeq->q_lock);
+
        dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
                                (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
        dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
@@ -995,17 +1046,28 @@ static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
        kfree(nvmeq);
 }
 
-static void nvme_free_queue(struct nvme_dev *dev, int qid)
+static void nvme_free_queues(struct nvme_dev *dev)
+{
+       int i;
+
+       for (i = dev->queue_count - 1; i >= 0; i--) {
+               nvme_free_queue(dev->queues[i]);
+               dev->queue_count--;
+               dev->queues[i] = NULL;
+       }
+}
+
+static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 {
        struct nvme_queue *nvmeq = dev->queues[qid];
        int vector = dev->entry[nvmeq->cq_vector].vector;
 
        spin_lock_irq(&nvmeq->q_lock);
-       nvme_cancel_ios(nvmeq, false);
-       while (bio_list_peek(&nvmeq->sq_cong)) {
-               struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
-               bio_endio(bio, -EIO);
+       if (nvmeq->q_suspended) {
+               spin_unlock_irq(&nvmeq->q_lock);
+               return;
        }
+       nvmeq->q_suspended = 1;
        spin_unlock_irq(&nvmeq->q_lock);
 
        irq_set_affinity_hint(vector, NULL);
@@ -1017,15 +1079,17 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
                adapter_delete_cq(dev, qid);
        }
 
-       nvme_free_queue_mem(nvmeq);
+       spin_lock_irq(&nvmeq->q_lock);
+       nvme_process_cq(nvmeq);
+       nvme_cancel_ios(nvmeq, false);
+       spin_unlock_irq(&nvmeq->q_lock);
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
                                                        int depth, int vector)
 {
        struct device *dmadev = &dev->pci_dev->dev;
-       unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
-                                               sizeof(struct nvme_cmd_info));
+       unsigned extra = nvme_queue_extra(depth);
        struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
        if (!nvmeq)
                return NULL;
@@ -1052,6 +1116,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
        nvmeq->q_depth = depth;
        nvmeq->cq_vector = vector;
+       nvmeq->q_suspended = 1;
+       dev->queue_count++;
 
        return nvmeq;
 
@@ -1075,18 +1141,29 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
                                IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
 }
 
-static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
-                                           int cq_size, int vector)
+static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
-       int result;
-       struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
+       struct nvme_dev *dev = nvmeq->dev;
+       unsigned extra = nvme_queue_extra(nvmeq->q_depth);
 
-       if (!nvmeq)
-               return ERR_PTR(-ENOMEM);
+       nvmeq->sq_tail = 0;
+       nvmeq->cq_head = 0;
+       nvmeq->cq_phase = 1;
+       nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+       memset(nvmeq->cmdid_data, 0, extra);
+       memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+       nvme_cancel_ios(nvmeq, false);
+       nvmeq->q_suspended = 0;
+}
+
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+{
+       struct nvme_dev *dev = nvmeq->dev;
+       int result;
 
        result = adapter_alloc_cq(dev, qid, nvmeq);
        if (result < 0)
-               goto free_nvmeq;
+               return result;
 
        result = adapter_alloc_sq(dev, qid, nvmeq);
        if (result < 0)
@@ -1096,19 +1173,17 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
        if (result < 0)
                goto release_sq;
 
-       return nvmeq;
+       spin_lock(&nvmeq->q_lock);
+       nvme_init_queue(nvmeq, qid);
+       spin_unlock(&nvmeq->q_lock);
+
+       return result;
 
  release_sq:
        adapter_delete_sq(dev, qid);
  release_cq:
        adapter_delete_cq(dev, qid);
- free_nvmeq:
-       dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
-                               (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-       dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-                                       nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-       kfree(nvmeq);
-       return ERR_PTR(result);
+       return result;
 }
 
 static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
@@ -1152,6 +1227,30 @@ static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
        return nvme_wait_ready(dev, cap, true);
 }
 
+static int nvme_shutdown_ctrl(struct nvme_dev *dev)
+{
+       unsigned long timeout;
+       u32 cc;
+
+       cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL;
+       writel(cc, &dev->bar->cc);
+
+       timeout = 2 * HZ + jiffies;
+       while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
+                                                       NVME_CSTS_SHST_CMPLT) {
+               msleep(100);
+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               if (time_after(jiffies, timeout)) {
+                       dev_err(&dev->pci_dev->dev,
+                               "Device shutdown incomplete; abort shutdown\n");
+                       return -ENODEV;
+               }
+       }
+
+       return 0;
+}
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
        int result;
@@ -1159,16 +1258,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        u64 cap = readq(&dev->bar->cap);
        struct nvme_queue *nvmeq;
 
-       dev->dbs = ((void __iomem *)dev->bar) + 4096;
-       dev->db_stride = NVME_CAP_STRIDE(cap);
-
        result = nvme_disable_ctrl(dev, cap);
        if (result < 0)
                return result;
 
-       nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
-       if (!nvmeq)
-               return -ENOMEM;
+       nvmeq = dev->queues[0];
+       if (!nvmeq) {
+               nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+               if (!nvmeq)
+                       return -ENOMEM;
+               dev->queues[0] = nvmeq;
+       }
 
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;
@@ -1185,17 +1285,15 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 
        result = nvme_enable_ctrl(dev, cap);
        if (result)
-               goto free_q;
+               return result;
 
        result = queue_request_irq(dev, nvmeq, "nvme admin");
        if (result)
-               goto free_q;
-
-       dev->queues[0] = nvmeq;
-       return result;
+               return result;
 
- free_q:
-       nvme_free_queue_mem(nvmeq);
+       spin_lock(&nvmeq->q_lock);
+       nvme_init_queue(nvmeq, 0);
+       spin_unlock(&nvmeq->q_lock);
        return result;
 }
 
@@ -1314,7 +1412,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        c.rw.appmask = cpu_to_le16(io.appmask);
 
        if (meta_len) {
-               meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len);
+               meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
+                                                               meta_len);
                if (IS_ERR(meta_iod)) {
                        status = PTR_ERR(meta_iod);
                        meta_iod = NULL;
@@ -1356,6 +1455,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        put_nvmeq(nvmeq);
        if (length != (io.nblocks + 1) << ns->lba_shift)
                status = -ENOMEM;
+       else if (!nvmeq || nvmeq->q_suspended)
+               status = -EBUSY;
        else
                status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
 
@@ -1453,6 +1554,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 
        switch (cmd) {
        case NVME_IOCTL_ID:
+               force_successful_syscall_return();
                return ns->ns_id;
        case NVME_IOCTL_ADMIN_CMD:
                return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
@@ -1506,10 +1608,12 @@ static int nvme_kthread(void *data)
                                if (!nvmeq)
                                        continue;
                                spin_lock_irq(&nvmeq->q_lock);
-                               if (nvme_process_cq(nvmeq))
-                                       printk("process_cq did something\n");
+                               if (nvmeq->q_suspended)
+                                       goto unlock;
+                               nvme_process_cq(nvmeq);
                                nvme_cancel_ios(nvmeq, true);
                                nvme_resubmit_bios(nvmeq);
+ unlock:
                                spin_unlock_irq(&nvmeq->q_lock);
                        }
                }
@@ -1556,7 +1660,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
 }
 
-static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
+static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
                        struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
 {
        struct nvme_ns *ns;
@@ -1631,14 +1735,19 @@ static int set_queue_count(struct nvme_dev *dev, int count)
        status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
                                                                &result);
        if (status)
-               return -EIO;
+               return status < 0 ? -EIO : -EBUSY;
        return min(result & 0xffff, result >> 16) + 1;
 }
 
+static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+       return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
        struct pci_dev *pdev = dev->pci_dev;
-       int result, cpu, i, nr_io_queues, db_bar_size, q_depth, q_count;
+       int result, cpu, i, vecs, nr_io_queues, size, q_depth;
 
        nr_io_queues = num_online_cpus();
        result = set_queue_count(dev, nr_io_queues);
@@ -1647,53 +1756,80 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
        if (result < nr_io_queues)
                nr_io_queues = result;
 
-       q_count = nr_io_queues;
-       /* Deregister the admin queue's interrupt */
-       free_irq(dev->entry[0].vector, dev->queues[0]);
-
-       db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
-       if (db_bar_size > 8192) {
+       size = db_bar_size(dev, nr_io_queues);
+       if (size > 8192) {
                iounmap(dev->bar);
-               dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size);
+               do {
+                       dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+                       if (dev->bar)
+                               break;
+                       if (!--nr_io_queues)
+                               return -ENOMEM;
+                       size = db_bar_size(dev, nr_io_queues);
+               } while (1);
                dev->dbs = ((void __iomem *)dev->bar) + 4096;
                dev->queues[0]->q_db = dev->dbs;
        }
 
-       for (i = 0; i < nr_io_queues; i++)
+       /* Deregister the admin queue's interrupt */
+       free_irq(dev->entry[0].vector, dev->queues[0]);
+
+       vecs = nr_io_queues;
+       for (i = 0; i < vecs; i++)
                dev->entry[i].entry = i;
        for (;;) {
-               result = pci_enable_msix(pdev, dev->entry, nr_io_queues);
-               if (result == 0) {
-                       break;
-               } else if (result > 0) {
-                       nr_io_queues = result;
-                       continue;
-               } else {
-                       nr_io_queues = 0;
+               result = pci_enable_msix(pdev, dev->entry, vecs);
+               if (result <= 0)
                        break;
-               }
+               vecs = result;
        }
 
-       if (nr_io_queues == 0) {
-               nr_io_queues = q_count;
+       if (result < 0) {
+               vecs = nr_io_queues;
+               if (vecs > 32)
+                       vecs = 32;
                for (;;) {
-                       result = pci_enable_msi_block(pdev, nr_io_queues);
+                       result = pci_enable_msi_block(pdev, vecs);
                        if (result == 0) {
-                               for (i = 0; i < nr_io_queues; i++)
+                               for (i = 0; i < vecs; i++)
                                        dev->entry[i].vector = i + pdev->irq;
                                break;
-                       } else if (result > 0) {
-                               nr_io_queues = result;
-                               continue;
-                       } else {
-                               nr_io_queues = 1;
+                       } else if (result < 0) {
+                               vecs = 1;
                                break;
                        }
+                       vecs = result;
                }
        }
 
+       /*
+        * Should investigate if there's a performance win from allocating
+        * more queues than interrupt vectors; it might allow the submission
+        * path to scale better, even if the receive path is limited by the
+        * number of interrupts.
+        */
+       nr_io_queues = vecs;
+
        result = queue_request_irq(dev, dev->queues[0], "nvme admin");
-       /* XXX: handle failure here */
+       if (result) {
+               dev->queues[0]->q_suspended = 1;
+               goto free_queues;
+       }
+
+       /* Free previously allocated queues that are no longer usable */
+       spin_lock(&dev_list_lock);
+       for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
+               struct nvme_queue *nvmeq = dev->queues[i];
+
+               spin_lock(&nvmeq->q_lock);
+               nvme_cancel_ios(nvmeq, false);
+               spin_unlock(&nvmeq->q_lock);
+
+               nvme_free_queue(nvmeq);
+               dev->queue_count--;
+               dev->queues[i] = NULL;
+       }
+       spin_unlock(&dev_list_lock);
 
        cpu = cpumask_first(cpu_online_mask);
        for (i = 0; i < nr_io_queues; i++) {
@@ -1703,11 +1839,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
        q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
                                                                NVME_Q_DEPTH);
-       for (i = 0; i < nr_io_queues; i++) {
-               dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
-               if (IS_ERR(dev->queues[i + 1]))
-                       return PTR_ERR(dev->queues[i + 1]);
-               dev->queue_count++;
+       for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
+               dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
+               if (!dev->queues[i + 1]) {
+                       result = -ENOMEM;
+                       goto free_queues;
+               }
        }
 
        for (; i < num_possible_cpus(); i++) {
@@ -1715,15 +1852,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
                dev->queues[i + 1] = dev->queues[target + 1];
        }
 
-       return 0;
-}
+       for (i = 1; i < dev->queue_count; i++) {
+               result = nvme_create_queue(dev->queues[i], i);
+               if (result) {
+                       for (--i; i > 0; i--)
+                               nvme_disable_queue(dev, i);
+                       goto free_queues;
+               }
+       }
 
-static void nvme_free_queues(struct nvme_dev *dev)
-{
-       int i;
+       return 0;
 
-       for (i = dev->queue_count - 1; i >= 0; i--)
-               nvme_free_queue(dev, i);
+ free_queues:
+       nvme_free_queues(dev);
+       return result;
 }
 
 /*
@@ -1734,7 +1876,8 @@ static void nvme_free_queues(struct nvme_dev *dev)
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
-       int res, nn, i;
+       int res;
+       unsigned nn, i;
        struct nvme_ns *ns;
        struct nvme_id_ctrl *ctrl;
        struct nvme_id_ns *id_ns;
@@ -1742,10 +1885,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
        dma_addr_t dma_addr;
        int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
-       res = nvme_setup_io_queues(dev);
-       if (res)
-               return res;
-
        mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
                                                                GFP_KERNEL);
        if (!mem)
@@ -1796,23 +1935,86 @@ static int nvme_dev_add(struct nvme_dev *dev)
        return res;
 }
 
-static int nvme_dev_remove(struct nvme_dev *dev)
+static int nvme_dev_map(struct nvme_dev *dev)
 {
-       struct nvme_ns *ns, *next;
+       int bars, result = -ENOMEM;
+       struct pci_dev *pdev = dev->pci_dev;
+
+       if (pci_enable_device_mem(pdev))
+               return result;
+
+       dev->entry[0].vector = pdev->irq;
+       pci_set_master(pdev);
+       bars = pci_select_bars(pdev, IORESOURCE_MEM);
+       if (pci_request_selected_regions(pdev, bars, "nvme"))
+               goto disable_pci;
+
+       if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
+               dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
+       else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
+               dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+       else
+               goto disable_pci;
+
+       pci_set_drvdata(pdev, dev);
+       dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+       if (!dev->bar)
+               goto disable;
+
+       dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap));
+       dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+       return 0;
+
+ disable:
+       pci_release_regions(pdev);
+ disable_pci:
+       pci_disable_device(pdev);
+       return result;
+}
+
+static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+       if (dev->pci_dev->msi_enabled)
+               pci_disable_msi(dev->pci_dev);
+       else if (dev->pci_dev->msix_enabled)
+               pci_disable_msix(dev->pci_dev);
+
+       if (dev->bar) {
+               iounmap(dev->bar);
+               dev->bar = NULL;
+       }
+
+       pci_release_regions(dev->pci_dev);
+       if (pci_is_enabled(dev->pci_dev))
+               pci_disable_device(dev->pci_dev);
+}
+
+static void nvme_dev_shutdown(struct nvme_dev *dev)
+{
+       int i;
+
+       for (i = dev->queue_count - 1; i >= 0; i--)
+               nvme_disable_queue(dev, i);
 
        spin_lock(&dev_list_lock);
-       list_del(&dev->node);
+       list_del_init(&dev->node);
        spin_unlock(&dev_list_lock);
 
+       if (dev->bar)
+               nvme_shutdown_ctrl(dev);
+       nvme_dev_unmap(dev);
+}
+
+static void nvme_dev_remove(struct nvme_dev *dev)
+{
+       struct nvme_ns *ns, *next;
+
        list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
                list_del(&ns->list);
                del_gendisk(ns->disk);
                nvme_ns_free(ns);
        }
-
-       nvme_free_queues(dev);
-
-       return 0;
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -1872,15 +2074,10 @@ static void nvme_free_dev(struct kref *kref)
 {
        struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
        nvme_dev_remove(dev);
-       if (dev->pci_dev->msi_enabled)
-               pci_disable_msi(dev->pci_dev);
-       else if (dev->pci_dev->msix_enabled)
-               pci_disable_msix(dev->pci_dev);
-       iounmap(dev->bar);
+       nvme_dev_shutdown(dev);
+       nvme_free_queues(dev);
        nvme_release_instance(dev);
        nvme_release_prp_pools(dev);
-       pci_disable_device(dev->pci_dev);
-       pci_release_regions(dev->pci_dev);
        kfree(dev->queues);
        kfree(dev->entry);
        kfree(dev);
@@ -1921,9 +2118,40 @@ static const struct file_operations nvme_dev_fops = {
        .compat_ioctl   = nvme_dev_ioctl,
 };
 
+static int nvme_dev_start(struct nvme_dev *dev)
+{
+       int result;
+
+       result = nvme_dev_map(dev);
+       if (result)
+               return result;
+
+       result = nvme_configure_admin_queue(dev);
+       if (result)
+               goto unmap;
+
+       spin_lock(&dev_list_lock);
+       list_add(&dev->node, &dev_list);
+       spin_unlock(&dev_list_lock);
+
+       result = nvme_setup_io_queues(dev);
+       if (result && result != -EBUSY)
+               goto disable;
+
+       return result;
+
+ disable:
+       spin_lock(&dev_list_lock);
+       list_del_init(&dev->node);
+       spin_unlock(&dev_list_lock);
+ unmap:
+       nvme_dev_unmap(dev);
+       return result;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-       int bars, result = -ENOMEM;
+       int result = -ENOMEM;
        struct nvme_dev *dev;
 
        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
@@ -1938,53 +2166,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (!dev->queues)
                goto free;
 
-       if (pci_enable_device_mem(pdev))
-               goto free;
-       pci_set_master(pdev);
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
-       if (pci_request_selected_regions(pdev, bars, "nvme"))
-               goto disable;
-
        INIT_LIST_HEAD(&dev->namespaces);
        dev->pci_dev = pdev;
-       pci_set_drvdata(pdev, dev);
-
-       if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
-               dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
-       else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
-               dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
-       else
-               goto disable;
-
        result = nvme_set_instance(dev);
        if (result)
-               goto disable;
-
-       dev->entry[0].vector = pdev->irq;
+               goto free;
 
        result = nvme_setup_prp_pools(dev);
        if (result)
-               goto disable_msix;
+               goto release;
 
-       dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-       if (!dev->bar) {
-               result = -ENOMEM;
-               goto disable_msix;
+       result = nvme_dev_start(dev);
+       if (result) {
+               if (result == -EBUSY)
+                       goto create_cdev;
+               goto release_pools;
        }
 
-       result = nvme_configure_admin_queue(dev);
-       if (result)
-               goto unmap;
-       dev->queue_count++;
-
-       spin_lock(&dev_list_lock);
-       list_add(&dev->node, &dev_list);
-       spin_unlock(&dev_list_lock);
-
        result = nvme_dev_add(dev);
        if (result)
-               goto delete;
+               goto shutdown;
 
+ create_cdev:
        scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
        dev->miscdev.minor = MISC_DYNAMIC_MINOR;
        dev->miscdev.parent = &pdev->dev;
@@ -1999,24 +2202,13 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
  remove:
        nvme_dev_remove(dev);
- delete:
-       spin_lock(&dev_list_lock);
-       list_del(&dev->node);
-       spin_unlock(&dev_list_lock);
-
+ shutdown:
+       nvme_dev_shutdown(dev);
+ release_pools:
        nvme_free_queues(dev);
- unmap:
-       iounmap(dev->bar);
- disable_msix:
-       if (dev->pci_dev->msi_enabled)
-               pci_disable_msi(dev->pci_dev);
-       else if (dev->pci_dev->msix_enabled)
-               pci_disable_msix(dev->pci_dev);
-       nvme_release_instance(dev);
        nvme_release_prp_pools(dev);
- disable:
-       pci_disable_device(pdev);
-       pci_release_regions(pdev);
+ release:
+       nvme_release_instance(dev);
  free:
        kfree(dev->queues);
        kfree(dev->entry);
@@ -2037,8 +2229,30 @@ static void nvme_remove(struct pci_dev *pdev)
 #define nvme_link_reset NULL
 #define nvme_slot_reset NULL
 #define nvme_error_resume NULL
-#define nvme_suspend NULL
-#define nvme_resume NULL
+
+static int nvme_suspend(struct device *dev)     /* Suspend callback registered through
+                                                 * nvme_dev_pm_ops: looks up the nvme_dev
+                                                 * behind @dev and shuts it down.
+                                                 * Always returns 0. */
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct nvme_dev *ndev = pci_get_drvdata(pdev);  /* driver data attached to the PCI device */
+
+       nvme_dev_shutdown(ndev);        /* same helper nvme_probe's "shutdown" error label calls */
+       return 0;
+}
+
+static int nvme_resume(struct device *dev)      /* Resume callback registered through
+                                                 * nvme_dev_pm_ops: restarts the device with
+                                                 * nvme_dev_start(). Returns 0 on success or
+                                                 * the nvme_dev_start() error code. */
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct nvme_dev *ndev = pci_get_drvdata(pdev);  /* driver data attached to the PCI device */
+       int ret;
+
+       ret = nvme_dev_start(ndev);
+       /* XXX: should remove gendisks if resume fails */
+       if (ret)        /* start failed: only the queues are released here (see XXX above) */
+               nvme_free_queues(ndev);
+       return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);   /* hooked up via nvme_driver.driver.pm */
 
 static const struct pci_error_handlers nvme_err_handler = {
        .error_detected = nvme_error_detected,
@@ -2062,8 +2276,9 @@ static struct pci_driver nvme_driver = {
        .id_table       = nvme_id_table,
        .probe          = nvme_probe,
        .remove         = nvme_remove,
-       .suspend        = nvme_suspend,
-       .resume         = nvme_resume,
+       .driver         = {
+               .pm     = &nvme_dev_pm_ops,
+       },
        .err_handler    = &nvme_err_handler,
 };
 
index 102de2f..4a4ff4e 100644 (file)
@@ -933,13 +933,12 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        int res = SNTI_TRANSLATION_SUCCESS;
        int xfer_len;
 
-       inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
+       inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
        if (inq_response == NULL) {
                res = -ENOMEM;
                goto out_mem;
        }
 
-       memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
        inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE;    /* Page Code */
        inq_response[2] = 0x00;    /* Page Length MSB */
        inq_response[3] = 0x3C;    /* Page Length LSB */
@@ -964,12 +963,11 @@ static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        int xfer_len;
        u8 *log_response;
 
-       log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL);
+       log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL);
        if (log_response == NULL) {
                res = -ENOMEM;
                goto out_mem;
        }
-       memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH);
 
        log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE;
        /* Subpage=0x00, Page Length MSB=0 */
@@ -1000,12 +998,11 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
        u8 temp_c;
        u16 temp_k;
 
-       log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
+       log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
        if (log_response == NULL) {
                res = -ENOMEM;
                goto out_mem;
        }
-       memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH);
 
        mem = dma_alloc_coherent(&dev->pci_dev->dev,
                                        sizeof(struct nvme_smart_log),
@@ -1069,12 +1066,11 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        u8 temp_c_cur, temp_c_thresh;
        u16 temp_k;
 
-       log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
+       log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
        if (log_response == NULL) {
                res = -ENOMEM;
                goto out_mem;
        }
-       memset(log_response, 0, LOG_TEMP_PAGE_LENGTH);
 
        mem = dma_alloc_coherent(&dev->pci_dev->dev,
                                        sizeof(struct nvme_smart_log),
@@ -1380,12 +1376,11 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns,
        blk_desc_offset = mph_size;
        mode_pages_offset_1 = blk_desc_offset + blk_desc_len;
 
-       response = kmalloc(resp_size, GFP_KERNEL);
+       response = kzalloc(resp_size, GFP_KERNEL);
        if (response == NULL) {
                res = -ENOMEM;
                goto out_mem;
        }
-       memset(response, 0, resp_size);
 
        res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10,
                                        llbaa, mode_data_length, blk_desc_len);
@@ -2480,12 +2475,11 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        }
        id_ns = mem;
 
-       response = kmalloc(resp_size, GFP_KERNEL);
+       response = kzalloc(resp_size, GFP_KERNEL);
        if (response == NULL) {
                res = -ENOMEM;
                goto out_dma;
        }
-       memset(response, 0, resp_size);
        nvme_trans_fill_read_cap(response, id_ns, cdb16);
 
        xfer_len = min(alloc_len, resp_size);
@@ -2554,12 +2548,11 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
                        goto out_dma;
                }
 
-               response = kmalloc(resp_size, GFP_KERNEL);
+               response = kzalloc(resp_size, GFP_KERNEL);
                if (response == NULL) {
                        res = -ENOMEM;
                        goto out_dma;
                }
-               memset(response, 0, resp_size);
 
                /* The first LUN ID will always be 0 per the SAM spec */
                for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) {
@@ -2600,12 +2593,11 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
        resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) :
                                        (FIXED_FMT_SENSE_DATA_SIZE));
-       response = kmalloc(resp_size, GFP_KERNEL);
+       response = kzalloc(resp_size, GFP_KERNEL);
        if (response == NULL) {
                res = -ENOMEM;
                goto out;
        }
-       memset(response, 0, resp_size);
 
        if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) {
                /* Descriptor Format Sense Data */
index f451c8d..26ebcf4 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * Definitions for the NVM Express interface
- * Copyright (c) 2011, Intel Corporation.
+ * Copyright (c) 2011-2013, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
 #ifndef _LINUX_NVME_H
 #define _LINUX_NVME_H
 
-#include <linux/types.h>
+#include <uapi/linux/nvme.h>
+#include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/kref.h>
 
 struct nvme_bar {
        __u64                   cap;    /* Controller Capabilities */
@@ -50,6 +53,7 @@ enum {
        NVME_CC_SHN_NONE        = 0 << 14,
        NVME_CC_SHN_NORMAL      = 1 << 14,
        NVME_CC_SHN_ABRUPT      = 2 << 14,
+       NVME_CC_SHN_MASK        = 3 << 14,
        NVME_CC_IOSQES          = 6 << 16,
        NVME_CC_IOCQES          = 4 << 20,
        NVME_CSTS_RDY           = 1 << 0,
@@ -57,462 +61,11 @@ enum {
        NVME_CSTS_SHST_NORMAL   = 0 << 2,
        NVME_CSTS_SHST_OCCUR    = 1 << 2,
        NVME_CSTS_SHST_CMPLT    = 2 << 2,
-};
-
-struct nvme_id_power_state {
-       __le16                  max_power;      /* centiwatts */
-       __u16                   rsvd2;
-       __le32                  entry_lat;      /* microseconds */
-       __le32                  exit_lat;       /* microseconds */
-       __u8                    read_tput;
-       __u8                    read_lat;
-       __u8                    write_tput;
-       __u8                    write_lat;
-       __u8                    rsvd16[16];
+       NVME_CSTS_SHST_MASK     = 3 << 2,
 };
 
 #define NVME_VS(major, minor)  (major << 16 | minor)
 
-struct nvme_id_ctrl {
-       __le16                  vid;
-       __le16                  ssvid;
-       char                    sn[20];
-       char                    mn[40];
-       char                    fr[8];
-       __u8                    rab;
-       __u8                    ieee[3];
-       __u8                    mic;
-       __u8                    mdts;
-       __u8                    rsvd78[178];
-       __le16                  oacs;
-       __u8                    acl;
-       __u8                    aerl;
-       __u8                    frmw;
-       __u8                    lpa;
-       __u8                    elpe;
-       __u8                    npss;
-       __u8                    rsvd264[248];
-       __u8                    sqes;
-       __u8                    cqes;
-       __u8                    rsvd514[2];
-       __le32                  nn;
-       __le16                  oncs;
-       __le16                  fuses;
-       __u8                    fna;
-       __u8                    vwc;
-       __le16                  awun;
-       __le16                  awupf;
-       __u8                    rsvd530[1518];
-       struct nvme_id_power_state      psd[32];
-       __u8                    vs[1024];
-};
-
-enum {
-       NVME_CTRL_ONCS_COMPARE                  = 1 << 0,
-       NVME_CTRL_ONCS_WRITE_UNCORRECTABLE      = 1 << 1,
-       NVME_CTRL_ONCS_DSM                      = 1 << 2,
-};
-
-struct nvme_lbaf {
-       __le16                  ms;
-       __u8                    ds;
-       __u8                    rp;
-};
-
-struct nvme_id_ns {
-       __le64                  nsze;
-       __le64                  ncap;
-       __le64                  nuse;
-       __u8                    nsfeat;
-       __u8                    nlbaf;
-       __u8                    flbas;
-       __u8                    mc;
-       __u8                    dpc;
-       __u8                    dps;
-       __u8                    rsvd30[98];
-       struct nvme_lbaf        lbaf[16];
-       __u8                    rsvd192[192];
-       __u8                    vs[3712];
-};
-
-enum {
-       NVME_NS_FEAT_THIN       = 1 << 0,
-       NVME_LBAF_RP_BEST       = 0,
-       NVME_LBAF_RP_BETTER     = 1,
-       NVME_LBAF_RP_GOOD       = 2,
-       NVME_LBAF_RP_DEGRADED   = 3,
-};
-
-struct nvme_smart_log {
-       __u8                    critical_warning;
-       __u8                    temperature[2];
-       __u8                    avail_spare;
-       __u8                    spare_thresh;
-       __u8                    percent_used;
-       __u8                    rsvd6[26];
-       __u8                    data_units_read[16];
-       __u8                    data_units_written[16];
-       __u8                    host_reads[16];
-       __u8                    host_writes[16];
-       __u8                    ctrl_busy_time[16];
-       __u8                    power_cycles[16];
-       __u8                    power_on_hours[16];
-       __u8                    unsafe_shutdowns[16];
-       __u8                    media_errors[16];
-       __u8                    num_err_log_entries[16];
-       __u8                    rsvd192[320];
-};
-
-enum {
-       NVME_SMART_CRIT_SPARE           = 1 << 0,
-       NVME_SMART_CRIT_TEMPERATURE     = 1 << 1,
-       NVME_SMART_CRIT_RELIABILITY     = 1 << 2,
-       NVME_SMART_CRIT_MEDIA           = 1 << 3,
-       NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4,
-};
-
-struct nvme_lba_range_type {
-       __u8                    type;
-       __u8                    attributes;
-       __u8                    rsvd2[14];
-       __u64                   slba;
-       __u64                   nlb;
-       __u8                    guid[16];
-       __u8                    rsvd48[16];
-};
-
-enum {
-       NVME_LBART_TYPE_FS      = 0x01,
-       NVME_LBART_TYPE_RAID    = 0x02,
-       NVME_LBART_TYPE_CACHE   = 0x03,
-       NVME_LBART_TYPE_SWAP    = 0x04,
-
-       NVME_LBART_ATTRIB_TEMP  = 1 << 0,
-       NVME_LBART_ATTRIB_HIDE  = 1 << 1,
-};
-
-/* I/O commands */
-
-enum nvme_opcode {
-       nvme_cmd_flush          = 0x00,
-       nvme_cmd_write          = 0x01,
-       nvme_cmd_read           = 0x02,
-       nvme_cmd_write_uncor    = 0x04,
-       nvme_cmd_compare        = 0x05,
-       nvme_cmd_dsm            = 0x09,
-};
-
-struct nvme_common_command {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __le32                  cdw2[2];
-       __le64                  metadata;
-       __le64                  prp1;
-       __le64                  prp2;
-       __le32                  cdw10[6];
-};
-
-struct nvme_rw_command {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __u64                   rsvd2;
-       __le64                  metadata;
-       __le64                  prp1;
-       __le64                  prp2;
-       __le64                  slba;
-       __le16                  length;
-       __le16                  control;
-       __le32                  dsmgmt;
-       __le32                  reftag;
-       __le16                  apptag;
-       __le16                  appmask;
-};
-
-enum {
-       NVME_RW_LR                      = 1 << 15,
-       NVME_RW_FUA                     = 1 << 14,
-       NVME_RW_DSM_FREQ_UNSPEC         = 0,
-       NVME_RW_DSM_FREQ_TYPICAL        = 1,
-       NVME_RW_DSM_FREQ_RARE           = 2,
-       NVME_RW_DSM_FREQ_READS          = 3,
-       NVME_RW_DSM_FREQ_WRITES         = 4,
-       NVME_RW_DSM_FREQ_RW             = 5,
-       NVME_RW_DSM_FREQ_ONCE           = 6,
-       NVME_RW_DSM_FREQ_PREFETCH       = 7,
-       NVME_RW_DSM_FREQ_TEMP           = 8,
-       NVME_RW_DSM_LATENCY_NONE        = 0 << 4,
-       NVME_RW_DSM_LATENCY_IDLE        = 1 << 4,
-       NVME_RW_DSM_LATENCY_NORM        = 2 << 4,
-       NVME_RW_DSM_LATENCY_LOW         = 3 << 4,
-       NVME_RW_DSM_SEQ_REQ             = 1 << 6,
-       NVME_RW_DSM_COMPRESSED          = 1 << 7,
-};
-
-struct nvme_dsm_cmd {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __u64                   rsvd2[2];
-       __le64                  prp1;
-       __le64                  prp2;
-       __le32                  nr;
-       __le32                  attributes;
-       __u32                   rsvd12[4];
-};
-
-enum {
-       NVME_DSMGMT_IDR         = 1 << 0,
-       NVME_DSMGMT_IDW         = 1 << 1,
-       NVME_DSMGMT_AD          = 1 << 2,
-};
-
-struct nvme_dsm_range {
-       __le32                  cattr;
-       __le32                  nlb;
-       __le64                  slba;
-};
-
-/* Admin commands */
-
-enum nvme_admin_opcode {
-       nvme_admin_delete_sq            = 0x00,
-       nvme_admin_create_sq            = 0x01,
-       nvme_admin_get_log_page         = 0x02,
-       nvme_admin_delete_cq            = 0x04,
-       nvme_admin_create_cq            = 0x05,
-       nvme_admin_identify             = 0x06,
-       nvme_admin_abort_cmd            = 0x08,
-       nvme_admin_set_features         = 0x09,
-       nvme_admin_get_features         = 0x0a,
-       nvme_admin_async_event          = 0x0c,
-       nvme_admin_activate_fw          = 0x10,
-       nvme_admin_download_fw          = 0x11,
-       nvme_admin_format_nvm           = 0x80,
-       nvme_admin_security_send        = 0x81,
-       nvme_admin_security_recv        = 0x82,
-};
-
-enum {
-       NVME_QUEUE_PHYS_CONTIG  = (1 << 0),
-       NVME_CQ_IRQ_ENABLED     = (1 << 1),
-       NVME_SQ_PRIO_URGENT     = (0 << 1),
-       NVME_SQ_PRIO_HIGH       = (1 << 1),
-       NVME_SQ_PRIO_MEDIUM     = (2 << 1),
-       NVME_SQ_PRIO_LOW        = (3 << 1),
-       NVME_FEAT_ARBITRATION   = 0x01,
-       NVME_FEAT_POWER_MGMT    = 0x02,
-       NVME_FEAT_LBA_RANGE     = 0x03,
-       NVME_FEAT_TEMP_THRESH   = 0x04,
-       NVME_FEAT_ERR_RECOVERY  = 0x05,
-       NVME_FEAT_VOLATILE_WC   = 0x06,
-       NVME_FEAT_NUM_QUEUES    = 0x07,
-       NVME_FEAT_IRQ_COALESCE  = 0x08,
-       NVME_FEAT_IRQ_CONFIG    = 0x09,
-       NVME_FEAT_WRITE_ATOMIC  = 0x0a,
-       NVME_FEAT_ASYNC_EVENT   = 0x0b,
-       NVME_FEAT_SW_PROGRESS   = 0x0c,
-       NVME_FWACT_REPL         = (0 << 3),
-       NVME_FWACT_REPL_ACTV    = (1 << 3),
-       NVME_FWACT_ACTV         = (2 << 3),
-};
-
-struct nvme_identify {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __u64                   rsvd2[2];
-       __le64                  prp1;
-       __le64                  prp2;
-       __le32                  cns;
-       __u32                   rsvd11[5];
-};
-
-struct nvme_features {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __u64                   rsvd2[2];
-       __le64                  prp1;
-       __le64                  prp2;
-       __le32                  fid;
-       __le32                  dword11;
-       __u32                   rsvd12[4];
-};
-
-struct nvme_create_cq {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __u32                   rsvd1[5];
-       __le64                  prp1;
-       __u64                   rsvd8;
-       __le16                  cqid;
-       __le16                  qsize;
-       __le16                  cq_flags;
-       __le16                  irq_vector;
-       __u32                   rsvd12[4];
-};
-
-struct nvme_create_sq {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __u32                   rsvd1[5];
-       __le64                  prp1;
-       __u64                   rsvd8;
-       __le16                  sqid;
-       __le16                  qsize;
-       __le16                  sq_flags;
-       __le16                  cqid;
-       __u32                   rsvd12[4];
-};
-
-struct nvme_delete_queue {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __u32                   rsvd1[9];
-       __le16                  qid;
-       __u16                   rsvd10;
-       __u32                   rsvd11[5];
-};
-
-struct nvme_download_firmware {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __u32                   rsvd1[5];
-       __le64                  prp1;
-       __le64                  prp2;
-       __le32                  numd;
-       __le32                  offset;
-       __u32                   rsvd12[4];
-};
-
-struct nvme_format_cmd {
-       __u8                    opcode;
-       __u8                    flags;
-       __u16                   command_id;
-       __le32                  nsid;
-       __u64                   rsvd2[4];
-       __le32                  cdw10;
-       __u32                   rsvd11[5];
-};
-
-struct nvme_command {
-       union {
-               struct nvme_common_command common;
-               struct nvme_rw_command rw;
-               struct nvme_identify identify;
-               struct nvme_features features;
-               struct nvme_create_cq create_cq;
-               struct nvme_create_sq create_sq;
-               struct nvme_delete_queue delete_queue;
-               struct nvme_download_firmware dlfw;
-               struct nvme_format_cmd format;
-               struct nvme_dsm_cmd dsm;
-       };
-};
-
-enum {
-       NVME_SC_SUCCESS                 = 0x0,
-       NVME_SC_INVALID_OPCODE          = 0x1,
-       NVME_SC_INVALID_FIELD           = 0x2,
-       NVME_SC_CMDID_CONFLICT          = 0x3,
-       NVME_SC_DATA_XFER_ERROR         = 0x4,
-       NVME_SC_POWER_LOSS              = 0x5,
-       NVME_SC_INTERNAL                = 0x6,
-       NVME_SC_ABORT_REQ               = 0x7,
-       NVME_SC_ABORT_QUEUE             = 0x8,
-       NVME_SC_FUSED_FAIL              = 0x9,
-       NVME_SC_FUSED_MISSING           = 0xa,
-       NVME_SC_INVALID_NS              = 0xb,
-       NVME_SC_CMD_SEQ_ERROR           = 0xc,
-       NVME_SC_LBA_RANGE               = 0x80,
-       NVME_SC_CAP_EXCEEDED            = 0x81,
-       NVME_SC_NS_NOT_READY            = 0x82,
-       NVME_SC_CQ_INVALID              = 0x100,
-       NVME_SC_QID_INVALID             = 0x101,
-       NVME_SC_QUEUE_SIZE              = 0x102,
-       NVME_SC_ABORT_LIMIT             = 0x103,
-       NVME_SC_ABORT_MISSING           = 0x104,
-       NVME_SC_ASYNC_LIMIT             = 0x105,
-       NVME_SC_FIRMWARE_SLOT           = 0x106,
-       NVME_SC_FIRMWARE_IMAGE          = 0x107,
-       NVME_SC_INVALID_VECTOR          = 0x108,
-       NVME_SC_INVALID_LOG_PAGE        = 0x109,
-       NVME_SC_INVALID_FORMAT          = 0x10a,
-       NVME_SC_BAD_ATTRIBUTES          = 0x180,
-       NVME_SC_WRITE_FAULT             = 0x280,
-       NVME_SC_READ_ERROR              = 0x281,
-       NVME_SC_GUARD_CHECK             = 0x282,
-       NVME_SC_APPTAG_CHECK            = 0x283,
-       NVME_SC_REFTAG_CHECK            = 0x284,
-       NVME_SC_COMPARE_FAILED          = 0x285,
-       NVME_SC_ACCESS_DENIED           = 0x286,
-};
-
-struct nvme_completion {
-       __le32  result;         /* Used by admin commands to return data */
-       __u32   rsvd;
-       __le16  sq_head;        /* how much of this queue may be reclaimed */
-       __le16  sq_id;          /* submission queue that generated this entry */
-       __u16   command_id;     /* of the command which completed */
-       __le16  status;         /* did the command fail, and if so, why? */
-};
-
-struct nvme_user_io {
-       __u8    opcode;
-       __u8    flags;
-       __u16   control;
-       __u16   nblocks;
-       __u16   rsvd;
-       __u64   metadata;
-       __u64   addr;
-       __u64   slba;
-       __u32   dsmgmt;
-       __u32   reftag;
-       __u16   apptag;
-       __u16   appmask;
-};
-
-struct nvme_admin_cmd {
-       __u8    opcode;
-       __u8    flags;
-       __u16   rsvd1;
-       __u32   nsid;
-       __u32   cdw2;
-       __u32   cdw3;
-       __u64   metadata;
-       __u64   addr;
-       __u32   metadata_len;
-       __u32   data_len;
-       __u32   cdw10;
-       __u32   cdw11;
-       __u32   cdw12;
-       __u32   cdw13;
-       __u32   cdw14;
-       __u32   cdw15;
-       __u32   timeout_ms;
-       __u32   result;
-};
-
-#define NVME_IOCTL_ID          _IO('N', 0x40)
-#define NVME_IOCTL_ADMIN_CMD   _IOWR('N', 0x41, struct nvme_admin_cmd)
-#define NVME_IOCTL_SUBMIT_IO   _IOW('N', 0x42, struct nvme_user_io)
-
-#ifdef __KERNEL__
-#include <linux/pci.h>
-#include <linux/miscdevice.h>
-#include <linux/kref.h>
-
 #define NVME_IO_TIMEOUT        (5 * HZ)
 
 /*
@@ -553,7 +106,7 @@ struct nvme_ns {
        struct request_queue *queue;
        struct gendisk *disk;
 
-       int ns_id;
+       unsigned ns_id;
        int lba_shift;
        int ms;
        u64 mode_select_num_blocks;
@@ -572,6 +125,7 @@ struct nvme_iod {
        int offset;             /* Of PRP list */
        int nents;              /* Used in scatterlist */
        int length;             /* Of data, in bytes */
+       unsigned long start_time;
        dma_addr_t first_dma;
        struct scatterlist sg[0];
 };
@@ -613,6 +167,4 @@ struct sg_io_hdr;
 int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr);
 int nvme_sg_get_version_num(int __user *ip);
 
-#endif
-
 #endif /* _LINUX_NVME_H */
index e7c94ee..115add2 100644 (file)
@@ -284,6 +284,7 @@ header-y += nfs_mount.h
 header-y += nfsacl.h
 header-y += nl80211.h
 header-y += nubus.h
+header-y += nvme.h
 header-y += nvram.h
 header-y += omap3isp.h
 header-y += omapfb.h
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
new file mode 100644 (file)
index 0000000..989c04e
--- /dev/null
@@ -0,0 +1,477 @@
+/*
+ * Definitions for the NVM Express interface
+ * Copyright (c) 2011-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _UAPI_LINUX_NVME_H
+#define _UAPI_LINUX_NVME_H
+
+#include <linux/types.h>
+
+struct nvme_id_power_state {   /* One power state descriptor; members sum to 32 bytes.
+                                * struct nvme_id_ctrl embeds 32 of these as psd[].
+                                * rsvdN names give the byte offset of the reserved run. */
+       __le16                  max_power;      /* centiwatts */
+       __u8                    rsvd2;
+       __u8                    flags;          /* presumably NVME_PS_FLAGS_* bits - TODO confirm */
+       __le32                  entry_lat;      /* microseconds */
+       __le32                  exit_lat;       /* microseconds */
+       __u8                    read_tput;
+       __u8                    read_lat;
+       __u8                    write_tput;
+       __u8                    write_lat;
+       __u8                    rsvd16[16];
+};
+
+enum {         /* Single-bit flags; by name they go with nvme_id_power_state.flags (TODO confirm) */
+       NVME_PS_FLAGS_MAX_POWER_SCALE   = 1 << 0,
+       NVME_PS_FLAGS_NON_OP_STATE      = 1 << 1,
+};
+
+struct nvme_id_ctrl {          /* Controller identification data. Members sum to 4096
+                                * bytes; multi-byte integers are little-endian (__le*).
+                                * Each rsvdNNN name is the byte offset at which that
+                                * reserved run starts. */
+       __le16                  vid;
+       __le16                  ssvid;
+       char                    sn[20];
+       char                    mn[40];
+       char                    fr[8];
+       __u8                    rab;
+       __u8                    ieee[3];
+       __u8                    mic;
+       __u8                    mdts;
+       __u8                    rsvd78[178];
+       __le16                  oacs;
+       __u8                    acl;
+       __u8                    aerl;
+       __u8                    frmw;
+       __u8                    lpa;
+       __u8                    elpe;
+       __u8                    npss;
+       __u8                    rsvd264[248];
+       __u8                    sqes;
+       __u8                    cqes;
+       __u8                    rsvd514[2];
+       __le32                  nn;             /* presumably the namespace count - TODO confirm */
+       __le16                  oncs;           /* presumably NVME_CTRL_ONCS_* bits - TODO confirm */
+       __le16                  fuses;
+       __u8                    fna;
+       __u8                    vwc;
+       __le16                  awun;
+       __le16                  awupf;
+       __u8                    rsvd530[1518];
+       struct nvme_id_power_state      psd[32];        /* 32 descriptors of 32 bytes each */
+       __u8                    vs[1024];
+};
+
+enum {         /* Single-bit flags; by name they go with nvme_id_ctrl.oncs (TODO confirm) */
+       NVME_CTRL_ONCS_COMPARE                  = 1 << 0,
+       NVME_CTRL_ONCS_WRITE_UNCORRECTABLE      = 1 << 1,
+       NVME_CTRL_ONCS_DSM                      = 1 << 2,
+};
+
+struct nvme_lbaf {             /* One LBA format entry (4 bytes); struct nvme_id_ns holds 16 in lbaf[] */
+       __le16                  ms;
+       __u8                    ds;
+       __u8                    rp;             /* presumably an NVME_LBAF_RP_* value - TODO confirm */
+};
+
+struct nvme_id_ns {            /* Namespace identification data. Members sum to 4096
+                                * bytes; each rsvdNNN name is the byte offset at which
+                                * that reserved run starts. */
+       __le64                  nsze;
+       __le64                  ncap;
+       __le64                  nuse;
+       __u8                    nsfeat;         /* presumably NVME_NS_FEAT_* bits - TODO confirm */
+       __u8                    nlbaf;
+       __u8                    flbas;
+       __u8                    mc;
+       __u8                    dpc;
+       __u8                    dps;
+       __u8                    rsvd30[98];
+       struct nvme_lbaf        lbaf[16];       /* 16 entries of 4 bytes each */
+       __u8                    rsvd192[192];
+       __u8                    vs[3712];
+};
+
+enum {         /* Mixes one bit flag (NS_FEAT) with the ordinal values 0..3 (LBAF_RP) */
+       NVME_NS_FEAT_THIN       = 1 << 0,
+       NVME_LBAF_RP_BEST       = 0,
+       NVME_LBAF_RP_BETTER     = 1,
+       NVME_LBAF_RP_GOOD       = 2,
+       NVME_LBAF_RP_DEGRADED   = 3,
+};
+
+struct nvme_smart_log {        /* SMART log page layout. Members sum to 512 bytes; every
+                                * field is a byte or byte array, so there is no padding.
+                                * The ten 16-byte arrays hold wide counters as raw bytes. */
+       __u8                    critical_warning;       /* presumably NVME_SMART_CRIT_* bits - TODO confirm */
+       __u8                    temperature[2];
+       __u8                    avail_spare;
+       __u8                    spare_thresh;
+       __u8                    percent_used;
+       __u8                    rsvd6[26];
+       __u8                    data_units_read[16];
+       __u8                    data_units_written[16];
+       __u8                    host_reads[16];
+       __u8                    host_writes[16];
+       __u8                    ctrl_busy_time[16];
+       __u8                    power_cycles[16];
+       __u8                    power_on_hours[16];
+       __u8                    unsafe_shutdowns[16];
+       __u8                    media_errors[16];
+       __u8                    num_err_log_entries[16];
+       __u8                    rsvd192[320];
+};
+
+enum {         /* Single-bit flags; by name they go with nvme_smart_log.critical_warning (TODO confirm) */
+       NVME_SMART_CRIT_SPARE           = 1 << 0,
+       NVME_SMART_CRIT_TEMPERATURE     = 1 << 1,
+       NVME_SMART_CRIT_RELIABILITY     = 1 << 2,
+       NVME_SMART_CRIT_MEDIA           = 1 << 3,
+       NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4,
+};
+
+struct nvme_lba_range_type {   /* One LBA range descriptor; members sum to 64 bytes */
+       __u8                    type;           /* presumably an NVME_LBART_TYPE_* value - TODO confirm */
+       __u8                    attributes;     /* presumably NVME_LBART_ATTRIB_* bits - TODO confirm */
+       __u8                    rsvd2[14];
+       __u64                   slba;           /* NOTE(review): plain __u64 while other wire structs here */
+       __u64                   nlb;            /* use __le64 - confirm byte order before using on big-endian */
+       __u8                    guid[16];
+       __u8                    rsvd48[16];
+};
+
+enum {         /* TYPE_* are ordinal values 1..4; ATTRIB_* are single-bit flags */
+       NVME_LBART_TYPE_FS      = 0x01,
+       NVME_LBART_TYPE_RAID    = 0x02,
+       NVME_LBART_TYPE_CACHE   = 0x03,
+       NVME_LBART_TYPE_SWAP    = 0x04,
+
+       NVME_LBART_ATTRIB_TEMP  = 1 << 0,
+       NVME_LBART_ATTRIB_HIDE  = 1 << 1,
+};
+
+/* I/O commands */
+
+enum nvme_opcode {             /* I/O command opcodes; the numbering has gaps (0x03, 0x06-0x08 unused here) */
+       nvme_cmd_flush          = 0x00,
+       nvme_cmd_write          = 0x01,
+       nvme_cmd_read           = 0x02,
+       nvme_cmd_write_uncor    = 0x04,
+       nvme_cmd_compare        = 0x05,
+       nvme_cmd_dsm            = 0x09,
+};
+
+struct nvme_common_command {   /* Generic command layout; members sum to 64 bytes.
+                                * The leading opcode/flags/command_id/nsid fields are
+                                * laid out the same way in nvme_rw_command and
+                                * nvme_dsm_cmd. */
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __le32                  nsid;
+       __le32                  cdw2[2];
+       __le64                  metadata;
+       __le64                  prp1;
+       __le64                  prp2;
+       __le32                  cdw10[6];       /* the six trailing 32-bit words (24 bytes) */
+};
+
+struct nvme_rw_command {       /* Read/write style command layout; members sum to 64 bytes */
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __le32                  nsid;
+       __u64                   rsvd2;
+       __le64                  metadata;
+       __le64                  prp1;
+       __le64                  prp2;
+       __le64                  slba;
+       __le16                  length;
+       __le16                  control;        /* presumably NVME_RW_LR / NVME_RW_FUA bits - TODO confirm */
+       __le32                  dsmgmt;         /* presumably NVME_RW_DSM_* values - TODO confirm */
+       __le32                  reftag;
+       __le16                  apptag;
+       __le16                  appmask;
+};
+
+enum {         /* LR/FUA are bits 15 and 14; DSM_FREQ_* are ordinals 0..8;
+                * DSM_LATENCY_* occupy bits 4-5; SEQ_REQ and COMPRESSED are bits 6 and 7 */
+       NVME_RW_LR                      = 1 << 15,
+       NVME_RW_FUA                     = 1 << 14,
+       NVME_RW_DSM_FREQ_UNSPEC         = 0,
+       NVME_RW_DSM_FREQ_TYPICAL        = 1,
+       NVME_RW_DSM_FREQ_RARE           = 2,
+       NVME_RW_DSM_FREQ_READS          = 3,
+       NVME_RW_DSM_FREQ_WRITES         = 4,
+       NVME_RW_DSM_FREQ_RW             = 5,
+       NVME_RW_DSM_FREQ_ONCE           = 6,
+       NVME_RW_DSM_FREQ_PREFETCH       = 7,
+       NVME_RW_DSM_FREQ_TEMP           = 8,
+       NVME_RW_DSM_LATENCY_NONE        = 0 << 4,
+       NVME_RW_DSM_LATENCY_IDLE        = 1 << 4,
+       NVME_RW_DSM_LATENCY_NORM        = 2 << 4,
+       NVME_RW_DSM_LATENCY_LOW         = 3 << 4,
+       NVME_RW_DSM_SEQ_REQ             = 1 << 6,
+       NVME_RW_DSM_COMPRESSED          = 1 << 7,
+};
+
+struct nvme_dsm_cmd {          /* Command layout paired by name with nvme_cmd_dsm; members sum to 64 bytes */
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __le32                  nsid;
+       __u64                   rsvd2[2];
+       __le64                  prp1;
+       __le64                  prp2;
+       __le32                  nr;
+       __le32                  attributes;     /* presumably NVME_DSMGMT_* bits - TODO confirm */
+       __u32                   rsvd12[4];
+};
+
+enum { /* NVME_DSMGMT_* single-bit flags, combinable with bitwise OR */
+       NVME_DSMGMT_IDR         = 1 << 0,       /* bit 0 (0x1) */
+       NVME_DSMGMT_IDW         = 1 << 1,       /* bit 1 (0x2) */
+       NVME_DSMGMT_AD          = 1 << 2,       /* bit 2 (0x4) */
+};
+
+struct nvme_dsm_range {        /* 16-byte record, all fields little-endian */
+       __le32                  cattr;          /* bytes 0-3 */
+       __le32                  nlb;            /* bytes 4-7; presumably a logical-block count - confirm */
+       __le64                  slba;           /* bytes 8-15; presumably the starting LBA - confirm */
+};
+
+/* Admin commands */
+
+enum nvme_admin_opcode {       /* opcode values; the numbering is deliberately sparse */
+       nvme_admin_delete_sq            = 0x00,
+       nvme_admin_create_sq            = 0x01,
+       nvme_admin_get_log_page         = 0x02, /* 0x03 is not assigned here */
+       nvme_admin_delete_cq            = 0x04,
+       nvme_admin_create_cq            = 0x05,
+       nvme_admin_identify             = 0x06, /* 0x07 is not assigned here */
+       nvme_admin_abort_cmd            = 0x08,
+       nvme_admin_set_features         = 0x09,
+       nvme_admin_get_features         = 0x0a, /* 0x0b is not assigned here */
+       nvme_admin_async_event          = 0x0c, /* 0x0d-0x0f are not assigned here */
+       nvme_admin_activate_fw          = 0x10,
+       nvme_admin_download_fw          = 0x11,
+       nvme_admin_format_nvm           = 0x80, /* second group starts at 0x80 */
+       nvme_admin_security_send        = 0x81,
+       nvme_admin_security_recv        = 0x82,
+};
+
+enum { /* three unrelated groups share this enum; values overlap across prefixes */
+       NVME_QUEUE_PHYS_CONTIG  = (1 << 0),     /* bit 0 (0x1) */
+       NVME_CQ_IRQ_ENABLED     = (1 << 1),     /* bit 1 (0x2); same value as NVME_SQ_PRIO_HIGH */
+       NVME_SQ_PRIO_URGENT     = (0 << 1),     /* SQ_PRIO_* occupy bits 2:1; this one is 0 */
+       NVME_SQ_PRIO_HIGH       = (1 << 1),     /* 0x2 */
+       NVME_SQ_PRIO_MEDIUM     = (2 << 1),     /* 0x4 */
+       NVME_SQ_PRIO_LOW        = (3 << 1),     /* 0x6 */
+       NVME_FEAT_ARBITRATION   = 0x01, /* FEAT_* are contiguous ids 0x01-0x0c, not bit flags */
+       NVME_FEAT_POWER_MGMT    = 0x02,
+       NVME_FEAT_LBA_RANGE     = 0x03,
+       NVME_FEAT_TEMP_THRESH   = 0x04,
+       NVME_FEAT_ERR_RECOVERY  = 0x05,
+       NVME_FEAT_VOLATILE_WC   = 0x06,
+       NVME_FEAT_NUM_QUEUES    = 0x07,
+       NVME_FEAT_IRQ_COALESCE  = 0x08,
+       NVME_FEAT_IRQ_CONFIG    = 0x09,
+       NVME_FEAT_WRITE_ATOMIC  = 0x0a,
+       NVME_FEAT_ASYNC_EVENT   = 0x0b,
+       NVME_FEAT_SW_PROGRESS   = 0x0c,
+       NVME_FWACT_REPL         = (0 << 3),     /* FWACT_* occupy bits 4:3; this one is 0 */
+       NVME_FWACT_REPL_ACTV    = (1 << 3),     /* 0x08 */
+       NVME_FWACT_ACTV         = (2 << 3),     /* 0x10 */
+};
+
+struct nvme_identify { /* 64-byte command layout; the "identify" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __le32                  nsid;           /* bytes 4-7, little-endian */
+       __u64                   rsvd2[2];       /* bytes 8-23; reserved by name */
+       __le64                  prp1;           /* bytes 24-31 */
+       __le64                  prp2;           /* bytes 32-39 */
+       __le32                  cns;            /* bytes 40-43; NOTE(review): accepted values are not shown here - see NVMe spec */
+       __u32                   rsvd11[5];      /* bytes 44-63; pads the layout to 64 bytes */
+};
+
+struct nvme_features { /* 64-byte command layout; the "features" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __le32                  nsid;           /* bytes 4-7, little-endian */
+       __u64                   rsvd2[2];       /* bytes 8-23; reserved by name */
+       __le64                  prp1;           /* bytes 24-31 */
+       __le64                  prp2;           /* bytes 32-39 */
+       __le32                  fid;            /* bytes 40-43; presumably one of the NVME_FEAT_* ids - confirm */
+       __le32                  dword11;        /* bytes 44-47; meaning is not shown here, likely feature-specific */
+       __u32                   rsvd12[4];      /* bytes 48-63; pads the layout to 64 bytes */
+};
+
+struct nvme_create_cq {        /* 64-byte command layout; the "create_cq" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __u32                   rsvd1[5];       /* bytes 4-23; reserved by name (no nsid field in this layout) */
+       __le64                  prp1;           /* bytes 24-31 */
+       __u64                   rsvd8;          /* bytes 32-39; reserved where other layouts have prp2 */
+       __le16                  cqid;           /* bytes 40-41 */
+       __le16                  qsize;          /* bytes 42-43; NOTE(review): zero-basing is not shown here - confirm */
+       __le16                  cq_flags;       /* bytes 44-45; presumably NVME_QUEUE_PHYS_CONTIG / NVME_CQ_IRQ_ENABLED - confirm */
+       __le16                  irq_vector;     /* bytes 46-47 */
+       __u32                   rsvd12[4];      /* bytes 48-63; pads the layout to 64 bytes */
+};
+
+struct nvme_create_sq {        /* 64-byte command layout; the "create_sq" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __u32                   rsvd1[5];       /* bytes 4-23; reserved by name (no nsid field in this layout) */
+       __le64                  prp1;           /* bytes 24-31 */
+       __u64                   rsvd8;          /* bytes 32-39; reserved where other layouts have prp2 */
+       __le16                  sqid;           /* bytes 40-41 */
+       __le16                  qsize;          /* bytes 42-43; NOTE(review): zero-basing is not shown here - confirm */
+       __le16                  sq_flags;       /* bytes 44-45; presumably NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_* - confirm */
+       __le16                  cqid;           /* bytes 46-47; same position irq_vector holds in nvme_create_cq */
+       __u32                   rsvd12[4];      /* bytes 48-63; pads the layout to 64 bytes */
+};
+
+struct nvme_delete_queue {     /* 64-byte command layout; the "delete_queue" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __u32                   rsvd1[9];       /* bytes 4-39; reserved by name */
+       __le16                  qid;            /* bytes 40-41, little-endian */
+       __u16                   rsvd10;         /* bytes 42-43; reserved by name */
+       __u32                   rsvd11[5];      /* bytes 44-63; pads the layout to 64 bytes */
+};
+
+struct nvme_download_firmware {        /* 64-byte command layout; the "dlfw" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __u32                   rsvd1[5];       /* bytes 4-23; reserved by name */
+       __le64                  prp1;           /* bytes 24-31 */
+       __le64                  prp2;           /* bytes 32-39 */
+       __le32                  numd;           /* bytes 40-43; presumably a dword count - confirm units in NVMe spec */
+       __le32                  offset;         /* bytes 44-47; NOTE(review): units are not shown here - confirm */
+       __u32                   rsvd12[4];      /* bytes 48-63; pads the layout to 64 bytes */
+};
+
+struct nvme_format_cmd {       /* 64-byte command layout; the "format" member of union nvme_command */
+       __u8                    opcode;         /* byte 0 */
+       __u8                    flags;          /* byte 1 */
+       __u16                   command_id;     /* bytes 2-3; plain __u16, no endian annotation */
+       __le32                  nsid;           /* bytes 4-7, little-endian */
+       __u64                   rsvd2[4];       /* bytes 8-39; reserved by name */
+       __le32                  cdw10;          /* bytes 40-43; raw dword, no sub-fields are declared here */
+       __u32                   rsvd11[5];      /* bytes 44-63; pads the layout to 64 bytes */
+};
+
+struct nvme_command {  /* one command, viewable through any of the per-command layouts */
+       union { /* anonymous union: members are accessed directly and all overlay the same storage */
+               struct nvme_common_command common;      /* generic view using raw cdw2[] / cdw10[] arrays */
+               struct nvme_rw_command rw;
+               struct nvme_identify identify;
+               struct nvme_features features;
+               struct nvme_create_cq create_cq;
+               struct nvme_create_sq create_sq;
+               struct nvme_delete_queue delete_queue;  /* single layout with a generic qid field */
+               struct nvme_download_firmware dlfw;
+               struct nvme_format_cmd format;
+               struct nvme_dsm_cmd dsm;
+       };
+};
+
+enum { /* NVME_SC_* status values, in numeric ranges; presumably bits 10:8 select the range - confirm in NVMe spec */
+       NVME_SC_SUCCESS                 = 0x0,  /* range 0x000-0x00c */
+       NVME_SC_INVALID_OPCODE          = 0x1,
+       NVME_SC_INVALID_FIELD           = 0x2,
+       NVME_SC_CMDID_CONFLICT          = 0x3,
+       NVME_SC_DATA_XFER_ERROR         = 0x4,
+       NVME_SC_POWER_LOSS              = 0x5,
+       NVME_SC_INTERNAL                = 0x6,
+       NVME_SC_ABORT_REQ               = 0x7,
+       NVME_SC_ABORT_QUEUE             = 0x8,
+       NVME_SC_FUSED_FAIL              = 0x9,
+       NVME_SC_FUSED_MISSING           = 0xa,
+       NVME_SC_INVALID_NS              = 0xb,
+       NVME_SC_CMD_SEQ_ERROR           = 0xc,
+       NVME_SC_LBA_RANGE               = 0x80, /* range 0x080-0x082 */
+       NVME_SC_CAP_EXCEEDED            = 0x81,
+       NVME_SC_NS_NOT_READY            = 0x82,
+       NVME_SC_CQ_INVALID              = 0x100,        /* range 0x100-0x10a */
+       NVME_SC_QID_INVALID             = 0x101,
+       NVME_SC_QUEUE_SIZE              = 0x102,
+       NVME_SC_ABORT_LIMIT             = 0x103,
+       NVME_SC_ABORT_MISSING           = 0x104,
+       NVME_SC_ASYNC_LIMIT             = 0x105,
+       NVME_SC_FIRMWARE_SLOT           = 0x106,
+       NVME_SC_FIRMWARE_IMAGE          = 0x107,
+       NVME_SC_INVALID_VECTOR          = 0x108,
+       NVME_SC_INVALID_LOG_PAGE        = 0x109,
+       NVME_SC_INVALID_FORMAT          = 0x10a,
+       NVME_SC_BAD_ATTRIBUTES          = 0x180,        /* only value in 0x180 range */
+       NVME_SC_WRITE_FAULT             = 0x280,        /* range 0x280-0x286 */
+       NVME_SC_READ_ERROR              = 0x281,
+       NVME_SC_GUARD_CHECK             = 0x282,
+       NVME_SC_APPTAG_CHECK            = 0x283,
+       NVME_SC_REFTAG_CHECK            = 0x284,
+       NVME_SC_COMPARE_FAILED          = 0x285,
+       NVME_SC_ACCESS_DENIED           = 0x286,
+};
+
+struct nvme_completion {       /* 16-byte completion record */
+       __le32  result;         /* Used by admin commands to return data */
+       __u32   rsvd;           /* bytes 4-7; reserved by name */
+       __le16  sq_head;        /* how much of this queue may be reclaimed */
+       __le16  sq_id;          /* submission queue that generated this entry */
+       __u16   command_id;     /* of the command which completed */
+       __le16  status;         /* did the command fail, and if so, why? */
+};
+
+struct nvme_user_io {  /* argument type of NVME_IOCTL_SUBMIT_IO; host-endian __uN fields only */
+       __u8    opcode;         /* byte 0 */
+       __u8    flags;          /* byte 1 */
+       __u16   control;        /* bytes 2-3; same name as nvme_rw_command.control */
+       __u16   nblocks;        /* bytes 4-5; NOTE(review): zero-basing is not shown here - confirm against the driver */
+       __u16   rsvd;           /* bytes 6-7; keeps the following __u64 members 8-byte aligned */
+       __u64   metadata;       /* bytes 8-15; presumably a user-space pointer carried as an integer - confirm */
+       __u64   addr;           /* bytes 16-23; presumably a user-space data pointer - confirm */
+       __u64   slba;           /* bytes 24-31 */
+       __u32   dsmgmt;         /* bytes 32-35 */
+       __u32   reftag;         /* bytes 36-39 */
+       __u16   apptag;         /* bytes 40-41 */
+       __u16   appmask;        /* bytes 42-43; NOTE(review): members end at 44 bytes, so tail padding depends on the ABI's __u64 alignment */
+};
+
+struct nvme_admin_cmd {        /* argument type of NVME_IOCTL_ADMIN_CMD; 72 bytes, host-endian __uN fields only */
+       __u8    opcode;         /* byte 0 */
+       __u8    flags;          /* byte 1 */
+       __u16   rsvd1;          /* bytes 2-3; reserved by name */
+       __u32   nsid;           /* bytes 4-7 */
+       __u32   cdw2;           /* bytes 8-11 */
+       __u32   cdw3;           /* bytes 12-15 */
+       __u64   metadata;       /* bytes 16-23; presumably a user-space pointer carried as an integer - confirm */
+       __u64   addr;           /* bytes 24-31; presumably a user-space data pointer - confirm */
+       __u32   metadata_len;   /* bytes 32-35; presumably a length in bytes - confirm */
+       __u32   data_len;       /* bytes 36-39; presumably a length in bytes - confirm */
+       __u32   cdw10;          /* bytes 40-43; cdw10-cdw15 are six consecutive raw dwords */
+       __u32   cdw11;          /* bytes 44-47 */
+       __u32   cdw12;          /* bytes 48-51 */
+       __u32   cdw13;          /* bytes 52-55 */
+       __u32   cdw14;          /* bytes 56-59 */
+       __u32   cdw15;          /* bytes 60-63 */
+       __u32   timeout_ms;     /* bytes 64-67; milliseconds, per the name */
+       __u32   result;         /* bytes 68-71; presumably written back to the caller (the ioctl is _IOWR) - confirm */
+};
+
+#define NVME_IOCTL_ID          _IO('N', 0x40)  /* type 'N', nr 0x40; _IO encodes no argument size */
+#define NVME_IOCTL_ADMIN_CMD   _IOWR('N', 0x41, struct nvme_admin_cmd) /* nr 0x41; read/write argument, sizeof(struct) is encoded */
+#define NVME_IOCTL_SUBMIT_IO   _IOW('N', 0x42, struct nvme_user_io)    /* nr 0x42; write-direction argument, sizeof(struct) is encoded */
+
+#endif /* _UAPI_LINUX_NVME_H */