linux/drivers/dax/pmem/core.c

72 lines
2.0 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/pfn_t.h>
#include "../../nvdimm/pfn.h"
#include "../../nvdimm/nd.h"
#include "../bus.h"
struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
{
struct resource res;
int rc, id, region_id;
resource_size_t offset;
struct nd_pfn_sb *pfn_sb;
struct dev_dax *dev_dax;
struct nd_namespace_io *nsio;
struct dax_region *dax_region;
struct dev_pagemap pgmap = { };
struct nd_namespace_common *ndns;
struct nd_dax *nd_dax = to_nd_dax(dev);
struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware Interface Table), is the first known instance of a memory range described by a unique "target" proximity domain. Where "initiator" and "target" proximity domains is an approach that the ACPI HMAT (Heterogeneous Memory Attributes Table) uses to described the unique performance properties of a memory range relative to a given initiator (e.g. CPU or DMA device). Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y char-device follows the traditional notion of 'numa-node' where the attribute conveys the closest online numa-node. That numa-node attribute is useful for cpu-binding and memory-binding processes *near* the device. However, when the memory range backing a 'pmem', or 'dax' device is onlined (memory hot-add) the memory-only-numa-node representing that address needs to be differentiated from the set of online nodes. In other words, the numa-node association of the device depends on whether you can bind processes *near* the cpu-numa-node in the offline device-case, or bind process *on* the memory-range directly after the backing address range is onlined. Allow for the case that platform firmware describes persistent memory with a unique proximity domain, i.e. when it is distinct from the proximity of DRAM and CPUs that are on the same socket. Plumb the Linux numa-node translation of that proximity through the libnvdimm region device to namespaces that are in device-dax mode. With this in place the proposed kmem driver [1] can optionally discover a unique numa-node number for the address range as it transitions the memory from an offline state managed by a device-driver to an online memory range managed by the core-mm. [1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com Reported-by: Fan Du <fan.du@intel.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Oliver O'Halloran" <oohall@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Jérôme Glisse <jglisse@redhat.com> Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2018-11-10 04:43:07 +08:00
struct nd_region *nd_region = to_nd_region(dev->parent);
ndns = nvdimm_namespace_common_probe(dev);
if (IS_ERR(ndns))
return ERR_CAST(ndns);
nsio = to_nd_namespace_io(&ndns->dev);
/* parse the 'pfn' info block via ->rw_bytes */
rc = devm_nsio_enable(dev, nsio);
if (rc)
return ERR_PTR(rc);
rc = nvdimm_setup_pfn(nd_pfn, &pgmap);
if (rc)
return ERR_PTR(rc);
devm_nsio_disable(dev, nsio);
/* reserve the metadata area, device-dax will reserve the data */
pfn_sb = nd_pfn->pfn_sb;
offset = le64_to_cpu(pfn_sb->dataoff);
if (!devm_request_mem_region(dev, nsio->res.start, offset,
dev_name(&ndns->dev))) {
dev_warn(dev, "could not reserve metadata\n");
return ERR_PTR(-EBUSY);
}
rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", &region_id, &id);
if (rc != 2)
return ERR_PTR(-EINVAL);
/* adjust the dax_region resource to the start of data */
memcpy(&res, &pgmap.res, sizeof(res));
res.start += offset;
dax_region = alloc_dax_region(dev, region_id, &res,
acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware Interface Table), is the first known instance of a memory range described by a unique "target" proximity domain. Where "initiator" and "target" proximity domains is an approach that the ACPI HMAT (Heterogeneous Memory Attributes Table) uses to described the unique performance properties of a memory range relative to a given initiator (e.g. CPU or DMA device). Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y char-device follows the traditional notion of 'numa-node' where the attribute conveys the closest online numa-node. That numa-node attribute is useful for cpu-binding and memory-binding processes *near* the device. However, when the memory range backing a 'pmem', or 'dax' device is onlined (memory hot-add) the memory-only-numa-node representing that address needs to be differentiated from the set of online nodes. In other words, the numa-node association of the device depends on whether you can bind processes *near* the cpu-numa-node in the offline device-case, or bind process *on* the memory-range directly after the backing address range is onlined. Allow for the case that platform firmware describes persistent memory with a unique proximity domain, i.e. when it is distinct from the proximity of DRAM and CPUs that are on the same socket. Plumb the Linux numa-node translation of that proximity through the libnvdimm region device to namespaces that are in device-dax mode. With this in place the proposed kmem driver [1] can optionally discover a unique numa-node number for the address range as it transitions the memory from an offline state managed by a device-driver to an online memory range managed by the core-mm. [1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com Reported-by: Fan Du <fan.du@intel.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Oliver O'Halloran" <oohall@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Jérôme Glisse <jglisse@redhat.com> Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2018-11-10 04:43:07 +08:00
nd_region->target_node, le32_to_cpu(pfn_sb->align),
PFN_DEV|PFN_MAP);
if (!dax_region)
return ERR_PTR(-ENOMEM);
dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys);
/* child dev_dax instances now own the lifetime of the dax_region */
dax_region_put(dax_region);
return dev_dax;
}
EXPORT_SYMBOL_GPL(__dax_pmem_probe);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");