linux/drivers/nvdimm/pmem.c

511 lines
13 KiB
C
Raw Normal View History

/*
* Persistent Memory Driver
*
* Copyright (c) 2014-2015, Intel Corporation.
* Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
* Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-29 01:23:37 +08:00
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/dax.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 10:44:50 +08:00
/*
 * to_dev() - fetch the struct device associated with a pmem instance
 * @pmem: pmem device instance
 *
 * The nvdimm bus services take a 'dev' argument; the device is recorded
 * in the badblocks structure (bb.dev) at init time, so hand that back.
 */
static struct device *to_dev(struct pmem_device *pmem)
{
	struct device *dev = pmem->bb.dev;

	return dev;
}
static struct nd_region *to_region(struct pmem_device *pmem)
{
return to_nd_region(to_dev(pmem)->parent);
}
/*
 * pmem_clear_poison() - clear poison in a pmem range and update badblocks
 * @pmem: pmem device instance
 * @offset: byte offset from the start of the mapping (data_offset included)
 * @len: length of the range in bytes
 *
 * Asks the nvdimm bus to clear poison for the range, drops every fully
 * cleared 512-byte sector from the badblocks list (notifying the
 * 'bb_state' sysfs dirent when one is registered) and finally invalidates
 * the range through invalidate_pmem().
 *
 * Return: 0 when all of @len was cleared, -EIO otherwise.
 */
static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;
	int rc = 0;

	/* badblocks entries are sector numbers relative to the data area */
	sector = (offset - pmem->data_offset) / 512;

	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
	/*
	 * Check the sign before comparing against the unsigned length.  A
	 * bare "cleared < len" converts both operands to unsigned long when
	 * long is no wider than unsigned int, which makes a negative result
	 * look like a huge positive count and hides the failure.
	 */
	if (cleared < 0 || (unsigned long)cleared < len)
		rc = -EIO;

	/* only whole sectors can be removed from the badblocks list */
	if (cleared >= 512) {
		cleared /= 512;
		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
				(unsigned long long) sector, cleared,
				cleared > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared);
		if (pmem->bb_state)
			sysfs_notify_dirent(pmem->bb_state);
	}

	invalidate_pmem(pmem->virt_addr + offset, len);

	return rc;
}
/*
 * write_pmem() - copy data from a page into persistent memory
 * @pmem_addr: destination address in the pmem mapping
 * @page: source page
 * @off: byte offset of the data within @page
 * @len: number of bytes to copy
 *
 * The page is mapped with kmap_atomic() only for the duration of the copy.
 */
static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	void *kaddr = kmap_atomic(page);
	const void *src = kaddr + off;

	memcpy_to_pmem(pmem_addr, src, len);
	kunmap_atomic(kaddr);
}
/*
 * read_pmem() - copy data from persistent memory into a page
 * @page: destination page
 * @off: byte offset of the destination within @page
 * @pmem_addr: source address in the pmem mapping
 * @len: number of bytes to copy
 *
 * The copy goes through memcpy_mcsafe(); any non-zero result from it is
 * reported to the caller as -EIO.
 *
 * Return: 0 on success, -EIO on failure.
 */
static int read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	void *kaddr = kmap_atomic(page);
	int copy_err = memcpy_mcsafe(kaddr + off, pmem_addr, len);

	kunmap_atomic(kaddr);

	return copy_err ? -EIO : 0;
}
/*
 * pmem_do_bvec() - transfer one segment between a page and pmem
 * @pmem: pmem device instance
 * @page: page holding (write) or receiving (read) the data
 * @len: number of bytes to transfer
 * @off: byte offset of the data within @page
 * @is_write: true to copy page -> pmem, false to copy pmem -> page
 * @sector: 512-byte sector number relative to the data area
 *
 * Return: 0 on success; -EIO when reading a range listed in badblocks or
 * when the copy fails; for writes to a bad range, the result of
 * pmem_clear_poison().
 */
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
		unsigned int len, unsigned int off, bool is_write,
		sector_t sector)
{
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;
	const bool poisoned = is_bad_pmem(&pmem->bb, sector, len);
	int rc;

	if (!is_write) {
		/* never hand back data from a known-bad range */
		if (unlikely(poisoned))
			return -EIO;

		rc = read_pmem(page, off, pmem_addr, len);
		flush_dcache_page(page);
		return rc;
	}

	/*
	 * The data is written both before and after clearing poison.
	 *
	 * Writing first covers implementations where the most recently
	 * written data is preserved and clearing poison merely marks the
	 * range valid again; application software can then assume that an
	 * interrupted write yields either the new good data or an error.
	 *
	 * Writing again afterwards covers the case where
	 * pmem_clear_poison() leaves the data in an indeterminate state.
	 */
	flush_dcache_page(page);
	write_pmem(pmem_addr, page, off, len);
	if (likely(!poisoned))
		return 0;

	rc = pmem_clear_poison(pmem, pmem_off, len);
	write_pmem(pmem_addr, page, off, len);

	return rc;
}
/*
 * Account for the REQ_FLUSH rename: when the included headers do not
 * provide REQ_FLUSH, alias it to REQ_PREFLUSH so the request path in this
 * file can keep testing REQ_FLUSH.  Replace with REQ_PREFLUSH after
 * v4.8-rc1.
 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif
/*
 * pmem_make_request() - service a bio directly against persistent memory
 * @q: request queue; its queuedata holds the pmem device instance
 * @bio: the bio to process
 *
 * Flushes the region up front for REQ_FLUSH, walks every segment through
 * pmem_do_bvec() (stopping at the first error, which is stored in
 * bio->bi_error), flushes again for REQ_FUA and then completes the bio.
 *
 * Return: always BLK_QC_T_NONE.
 */
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);
	const bool is_write = op_is_write(bio_op(bio));
	struct bvec_iter iter;
	struct bio_vec bvec;
	unsigned long start;
	bool do_acct;

	if (bio->bi_opf & REQ_FLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		int rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, is_write, iter.bi_sector);

		if (!rc)
			continue;

		/* record the first failure and stop processing segments */
		bio->bi_error = rc;
		break;
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);

	return BLK_QC_T_NONE;
}
/*
 * pmem_rw_page() - ->rw_page handler: transfer one whole page
 * @bdev: block device; its queue's queuedata holds the pmem instance
 * @sector: starting sector of the transfer
 * @page: page to read into or write from
 * @is_write: true for a write, false for a read
 *
 * Return: 0 on success, otherwise the error from pmem_do_bvec().
 */
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

	/*
	 * The ->rw_page interface is subtle: the core retries on any
	 * error, so page_endio() may only be called when the transfer
	 * succeeded.  Completing the page on failure as well would lead
	 * to crashes from double completion.
	 */
	if (rc)
		return rc;

	page_endio(page, is_write, 0);

	return 0;
}
/*
 * __pmem_direct_access() - translate a page offset into a kernel address/pfn
 * @pmem: pmem device instance
 * @pgoff: page offset into the data area
 * @nr_pages: number of pages the caller is asking about
 * @kaddr: output, kernel virtual address of the first page
 * @pfn: output, pfn_t of the first page
 *
 * Weak definition; see the "strong" declaration in
 * tools/testing/nvdimm/pmem-dax.c.
 *
 * Return: -EIO when the requested range overlaps a bad block, otherwise
 * the number of pages available starting at @pgoff.
 */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	const phys_addr_t start = PFN_PHYS(pgoff);
	resource_size_t offset = start + pmem->data_offset;

	if (unlikely(is_bad_pmem(&pmem->bb, start / 512,
					PFN_PHYS(nr_pages))))
		return -EIO;

	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/* without any badblocks everything up to the end is known good */
	if (likely(!pmem->bb.count))
		return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);

	/*
	 * Badblocks are present: limit the known good range to the
	 * requested range.
	 */
	return nr_pages;
}
/*
 * Block device operations table for pmem.  Whole-page I/O is routed to
 * pmem_rw_page() above; nvdimm_revalidate_disk() is not defined in this
 * file.
 */
static const struct block_device_operations pmem_fops = {
.owner = THIS_MODULE,
/* page-sized read/write entry point, see pmem_rw_page() */
.rw_page = pmem_rw_page,
.revalidate_disk = nvdimm_revalidate_disk,
};
/*
 * pmem_dax_direct_access() - dax ->direct_access handler
 * @dax_dev: dax device whose private data is the pmem instance
 * @pgoff: page offset into the data area
 * @nr_pages: number of pages requested
 * @kaddr: output, kernel virtual address
 * @pfn: output, pfn_t
 *
 * Thin adapter that unwraps the pmem instance and forwards to
 * __pmem_direct_access().
 *
 * Return: whatever __pmem_direct_access() returns.
 */
static long pmem_dax_direct_access(struct dax_device *dax_dev,
		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
	return __pmem_direct_access(dax_get_private(dax_dev), pgoff,
			nr_pages, kaddr, pfn);
}
/* dax operations table for pmem: direct access goes via pmem_dax_direct_access() */
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
};
/*
 * pmem_release_queue() - release callback that cleans up the request queue
 * @q: the request queue, passed as an untyped pointer
 */
static void pmem_release_queue(void *q)
{
	struct request_queue *queue = q;

	blk_cleanup_queue(queue);
}
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-29 01:23:37 +08:00
/*
 * pmem_freeze_queue() - callback that begins freezing the request queue
 * @q: the request queue, passed as an untyped pointer
 *
 * Only calls blk_freeze_queue_start().
 * NOTE(review): presumably this initiates the freeze without waiting for
 * it to complete - confirm against the block layer.
 */
static void pmem_freeze_queue(void *q)
{
	struct request_queue *queue = q;

	blk_freeze_queue_start(queue);
}
/*
 * pmem_release_disk() - release callback for the dax device and gendisk
 * @__pmem: the pmem device instance, passed as an untyped pointer
 *
 * Kills and drops the dax device first, then deletes the gendisk and
 * drops the disk reference.
 */
static void pmem_release_disk(void *__pmem)
{
	struct pmem_device *pmem = __pmem;
	struct dax_device *dax_dev = pmem->dax_dev;
	struct gendisk *disk = pmem->disk;

	kill_dax(dax_dev);
	put_dax(dax_dev);

	del_gendisk(disk);
	put_disk(disk);
}
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 10:44:50 +08:00
struct nd_region *nd_region = to_nd_region(dev->parent);
struct vmem_altmap __altmap, *altmap = NULL;
struct resource *res = &nsio->res;
struct nd_pfn *nd_pfn = NULL;
struct dax_device *dax_dev;
int nid = dev_to_node(dev);
struct nd_pfn_sb *pfn_sb;
struct pmem_device *pmem;
struct resource pfn_res;
struct request_queue *q;
struct gendisk *disk;
void *addr;
/* while nsio_rw_bytes is active, parse a pfn info block if present */
if (is_nd_pfn(dev)) {
nd_pfn = to_nd_pfn(dev);
altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
if (IS_ERR(altmap))
return PTR_ERR(altmap);
}
/* we're attaching a block device, disable raw namespace access */
devm_nsio_disable(dev, nsio);
pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
if (!pmem)
return -ENOMEM;
dev_set_drvdata(dev, pmem);
pmem->phys_addr = res->start;
pmem->size = resource_size(res);
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 10:44:50 +08:00
if (nvdimm_has_flush(nd_region) < 0)
dev_warn(dev, "unable to guarantee persistence of writes\n");
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
dev_warn(dev, "could not reserve region %pR\n", res);
return -EBUSY;
}
q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
if (!q)
return -ENOMEM;
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-29 01:23:37 +08:00
if (devm_add_action_or_reset(dev, pmem_release_queue, q))
return -ENOMEM;
pmem->pfn_flags = PFN_DEV;
if (is_nd_pfn(dev)) {
addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
altmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
pmem->pfn_flags |= PFN_MAP;
res = &pfn_res; /* for badblocks populate */
res->start += pmem->data_offset;
} else if (pmem_should_map_pages(dev)) {
addr = devm_memremap_pages(dev, &nsio->res,
mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup get_dev_page() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page() it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups which are likely to be in the same mapped range. devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages". ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed with some assurance to back up the assumption that the page count never goes to zero and a list_add() is never attempted. Signed-off-by: Dan Williams <dan.j.williams@intel.com> Tested-by: Logan Gunthorpe <logang@deltatee.com> Cc: Dave Hansen <dave@sr71.net> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:56:49 +08:00
&q->q_usage_counter, NULL);
pmem->pfn_flags |= PFN_MAP;
} else
addr = devm_memremap(dev, pmem->phys_addr,
pmem->size, ARCH_MEMREMAP_PMEM);
/*
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-29 01:23:37 +08:00
* At release time the queue must be frozen before
* devm_memremap_pages is unwound
*/
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-29 01:23:37 +08:00
if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
return -ENOMEM;
if (IS_ERR(addr))
return PTR_ERR(addr);
pmem->virt_addr = addr;
blk_queue_write_cache(q, true, true);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
q->queuedata = pmem;
disk = alloc_disk_node(0, nid);
if (!disk)
return -ENOMEM;
pmem->disk = disk;
disk->fops = &pmem_fops;
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski <luto@amacapital.net> Cc: Boaz Harrosh <boaz@plexistor.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Jens Axboe <axboe@fb.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Neil Brown <neilb@suse.de> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg KH <gregkh@linuxfoundation.org> [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma <vishal.l.verma@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2015-06-25 16:20:32 +08:00
nvdimm_namespace_disk_name(ndns, disk->disk_name);
set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
/ 512);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 10:44:50 +08:00
nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
disk->bb = &pmem->bb;
dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
if (!dax_dev) {
put_disk(disk);
return -ENOMEM;
}
pmem->dax_dev = dax_dev;
device_add_disk(dev, disk);
if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
return -ENOMEM;
revalidate_disk(disk);
pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
"badblocks");
if (pmem->bb_state)
sysfs_put(pmem->bb_state);
else
dev_warn(dev, "sysfs_get_dirent 'badblocks' failed\n");
return 0;
}
/*
 * Driver probe callback.  Resolves the namespace, enables raw namespace
 * I/O, then either attaches a BTT, attaches a pmem disk, or bails out with
 * -ENXIO so the device can be re-probed under another personality.
 */
static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns = nvdimm_namespace_common_probe(dev);
	struct nd_namespace_io *nsio;

	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	nsio = to_nd_namespace_io(&ndns->dev);
	if (devm_nsio_enable(dev, nsio))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (!is_nd_pfn(dev)) {
		/*
		 * if we find a valid info-block we'll come back as that
		 * personality
		 */
		if (nd_btt_probe(dev, ndns) == 0)
			return -ENXIO;
		if (nd_pfn_probe(dev, ndns) == 0)
			return -ENXIO;
		if (nd_dax_probe(dev, ndns) == 0)
			return -ENXIO;
		/* ...otherwise we're just a raw pmem device */
	}

	return pmem_attach_disk(dev, ndns);
}
/*
 * Driver remove callback: detach the BTT when the device is one, then
 * flush the parent region.  Always returns 0.
 */
static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		nvdimm_namespace_detach_btt(nd_btt);
	}

	nvdimm_flush(to_nd_region(dev->parent));

	return 0;
}
/* Driver shutdown callback: flush the region that parents this device. */
static void nd_pmem_shutdown(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);

	nvdimm_flush(nd_region);
}
/*
 * Driver notify callback.  Only NVDIMM_REVALIDATE_POISON is handled: the
 * badblocks list covering the device's data range is repopulated and, when
 * a 'badblocks' sysfs dirent is recorded, a sysfs notification is sent.
 */
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	resource_size_t start_skip = 0, tail_skip = 0;
	struct kernfs_node *bb_state = NULL;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct nd_region *nd_region;
	struct badblocks *bb;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (!is_nd_btt(dev)) {
		struct pmem_device *pmem = dev_get_drvdata(dev);

		nd_region = to_region(pmem);
		bb = &pmem->bb;
		bb_state = pmem->bb_state;

		if (!is_nd_pfn(dev)) {
			ndns = to_ndns(dev);
		} else {
			struct nd_pfn *nd_pfn = to_nd_pfn(dev);
			struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

			ndns = nd_pfn->ndns;
			/* skip the pfn metadata and padding at both ends */
			start_skip = pmem->data_offset +
					__le32_to_cpu(pfn_sb->start_pad);
			tail_skip = __le32_to_cpu(pfn_sb->end_trunc);
		}
		nsio = to_nd_namespace_io(&ndns->dev);
	} else {
		/* BTT: use the namespace's own badblocks, no dirent to notify */
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
		nd_region = to_nd_region(ndns->dev.parent);
		nsio = to_nd_namespace_io(&ndns->dev);
		bb = &nsio->bb;
	}

	res.start = nsio->res.start + start_skip;
	res.end = nsio->res.end - tail_skip;
	nvdimm_badblocks_populate(nd_region, bb, &res);

	if (bb_state)
		sysfs_notify_dirent(bb_state);
}
/*
 * Module aliases: the plain "pmem" name plus the two nd device types
 * (namespace-io and namespace-pmem) also named in nd_pmem_driver.type.
 */
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
/*
 * Driver descriptor passed to nd_driver_register(): the device types it
 * handles, its name, and the probe/remove/shutdown/notify callbacks.
 */
static struct nd_device_driver nd_pmem_driver = {
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
	.drv = {
		.name = "nd_pmem",
	},
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.shutdown = nd_pmem_shutdown,
	.notify = nd_pmem_notify,
};
/*
 * Module entry point: register nd_pmem_driver and return whatever
 * nd_driver_register() reports.
 */
static int __init pmem_init(void)
{
	int rc = nd_driver_register(&nd_pmem_driver);

	return rc;
}
module_init(pmem_init);
static void pmem_exit(void)
{
driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);
/* Module metadata; the license matches the GPL version 2 notice in the file header. */
MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");