libnvdimm for 4.18

* DAX broke a fundamental assumption of truncate of file mapped pages.
   The truncate path assumed that it is safe to disconnect a pinned page
   from a file and let the filesystem reclaim the physical block. With DAX
   the page is equivalent to the filesystem block. Introduce
   dax_layout_busy_page() to enable filesystems to wait for pinned DAX
   pages to be released. Without this wait a filesystem could allocate
   blocks under active device-DMA to a new file.
 
 * DAX arranges for the block layer to be bypassed and uses
   dax_direct_access() + copy_to_iter() to satisfy read(2) calls.
   However, the memcpy_mcsafe() facility is available through the pmem
   block driver. In order to safely handle media errors, via the DAX
   block-layer bypass, introduce copy_to_iter_mcsafe().
 
 * Fix cache management policy relative to the ACPI NFIT Platform
   Capabilities Structure to properly elide cache flushes when they are not
   necessary. The table indicates whether CPU caches are power-fail
   protected. Clarify that a deep flush is always performed on
   REQ_{FUA,PREFLUSH} requests.
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJbGxI7AAoJEB7SkWpmfYgCDjsP/2Lcibu9Kf4tKIzuInsle6iE
 6qP29qlkpHVTpDKbhvIxTYTYL9sMU0DNUrpPCJR/EYdeyztLWDFC5EAT1wF240vf
 maV37s/uP331jSC/2VJnKWzBs2ztQxmKLEIQCxh6aT0qs9cbaOvJgB/WlVu+qtsl
 aGJFLmb6vdQacp31noU5plKrMgMA1pADyF5qx9I9K2HwowHE7T368ZEFS/3S//c3
 LXmpx/Nfq52sGu/qbRbu6B1CTJhIGhmarObyQnvBYoKntK1Ov4e8DS95wD3EhNDe
 FuRkOCUKhjl6cFy7QVWh1ct1bFm84ny+b4/AtbpOmv9l/+0mveJ7e+5mu8HQTifT
 wYiEe2xzXJ+OG/xntv8SvlZKMpjP3BqI0jYsTutsjT4oHrciiXdXM186cyS+BiGp
 KtFmWyncQJgfiTq6+Hj5XpP9BapNS+OYdYgUagw9ZwzdzptuGFYUMSVOBrYrn6c/
 fwqtxjubykJoW0P3pkIoT91arFSea7nxOKnGwft06imQ7TwR4ARsI308feQ9itJq
 2P2e7/20nYMsw2aRaUDDA70Yu+Lagn1m8WL87IybUGeUDLb1BAkjphAlWa6COJ+u
 PhvAD2tvyM9m0c7O5Mytvz7iWKG6SVgatoAyOPkaeplQK8khZ+wEpuK58sO6C1w8
 4GBvt9ri9i/Ww/A+ppWs
 =4bfw
 -----END PGP SIGNATURE-----

Merge tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "This adds a user for the new 'bytes-remaining' updates to
  memcpy_mcsafe() that you already received through Ingo via the
  x86-dax- for-linus pull.

  Not included here, but still targeting this cycle, is support for
  handling memory media errors (poison) consumed via userspace dax
  mappings.

  Summary:

   - DAX broke a fundamental assumption of truncate of file mapped
     pages. The truncate path assumed that it is safe to disconnect a
     pinned page from a file and let the filesystem reclaim the physical
     block. With DAX the page is equivalent to the filesystem block.
     Introduce dax_layout_busy_page() to enable filesystems to wait for
     pinned DAX pages to be released. Without this wait a filesystem
     could allocate blocks under active device-DMA to a new file.

   - DAX arranges for the block layer to be bypassed and uses
     dax_direct_access() + copy_to_iter() to satisfy read(2) calls.
     However, the memcpy_mcsafe() facility is available through the pmem
     block driver. In order to safely handle media errors, via the DAX
     block-layer bypass, introduce copy_to_iter_mcsafe().

   - Fix cache management policy relative to the ACPI NFIT Platform
     Capabilities Structure to properly elide cache flushes when they
     are not necessary. The table indicates whether CPU caches are
     power-fail protected. Clarify that a deep flush is always performed
     on REQ_{FUA,PREFLUSH} requests"

* tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (21 commits)
  dax: Use dax_write_cache* helpers
  libnvdimm, pmem: Do not flush power-fail protected CPU caches
  libnvdimm, pmem: Unconditionally deep flush on *sync
  libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH
  acpi, nfit: Remove ecc_unit_size
  dax: dax_insert_mapping_entry always succeeds
  libnvdimm, e820: Register all pmem resources
  libnvdimm: Debug probe times
  linvdimm, pmem: Preserve read-only setting for pmem devices
  x86, nfit_test: Add unit test for memcpy_mcsafe()
  pmem: Switch to copy_to_iter_mcsafe()
  dax: Report bytes remaining in dax_iomap_actor()
  dax: Introduce a ->copy_to_iter dax operation
  uio, lib: Fix CONFIG_ARCH_HAS_UACCESS_MCSAFE compilation
  xfs, dax: introduce xfs_break_dax_layouts()
  xfs: prepare xfs_break_layouts() for another layout type
  xfs: prepare xfs_break_layouts() to be called with XFS_MMAPLOCK_EXCL
  mm, fs, dax: handle layout changes to pinned dax mappings
  mm: fix __gup_device_huge vs unmap
  mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS
  ...
This commit is contained in:
Linus Torvalds 2018-06-08 17:21:52 -07:00
commit 7d3bf613e9
40 changed files with 925 additions and 380 deletions

View File

@ -0,0 +1,17 @@
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
Date: Aug, 2017
KernelVersion: v4.14 (Removed v4.18)
Contact: linux-nvdimm@lists.01.org
Description:
(RO) Size of a write request to a DIMM that will not incur a
read-modify-write cycle at the memory controller.
When the nfit driver initializes it runs an ARS (Address Range
Scrub) operation across every pmem range. Part of that process
involves determining the ARS capabilities of a given address
range. One of the capabilities that is reported is the 'Clear
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
This property indicates the boundary at which the NVDIMM may
need to perform read-modify-write cycles to maintain ECC (Error
Correcting Code) blocks.

View File

@ -212,22 +212,3 @@ Description:
range. Used by NVDIMM Region Mapping Structure to uniquely refer range. Used by NVDIMM Region Mapping Structure to uniquely refer
to this structure. Value of 0 is reserved and not used as an to this structure. Value of 0 is reserved and not used as an
index. index.
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
Date: Aug, 2017
KernelVersion: v4.14
Contact: linux-nvdimm@lists.01.org
Description:
(RO) Size of a write request to a DIMM that will not incur a
read-modify-write cycle at the memory controller.
When the nfit driver initializes it runs an ARS (Address Range
Scrub) operation across every pmem range. Part of that process
involves determining the ARS capabilities of a given address
range. One of the capabilities that is reported is the 'Clear
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
This property indicates the boundary at which the NVDIMM may
need to perform read-modify-write cycles to maintain ECC (Error
Correcting Code) blocks.

View File

@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
You should normally say N here, unless you want to debug early You should normally say N here, unless you want to debug early
crashes or need a very simple printk logging facility. crashes or need a very simple printk logging facility.
config MCSAFE_TEST
def_bool n
config X86_PTDUMP_CORE config X86_PTDUMP_CORE
def_bool n def_bool n

View File

@ -0,0 +1,75 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MCSAFE_TEST_H_
#define _MCSAFE_TEST_H_
#ifndef __ASSEMBLY__
#ifdef CONFIG_MCSAFE_TEST
extern unsigned long mcsafe_test_src;
extern unsigned long mcsafe_test_dst;
static inline void mcsafe_inject_src(void *addr)
{
if (addr)
mcsafe_test_src = (unsigned long) addr;
else
mcsafe_test_src = ~0UL;
}
static inline void mcsafe_inject_dst(void *addr)
{
if (addr)
mcsafe_test_dst = (unsigned long) addr;
else
mcsafe_test_dst = ~0UL;
}
#else /* CONFIG_MCSAFE_TEST */
static inline void mcsafe_inject_src(void *addr)
{
}
static inline void mcsafe_inject_dst(void *addr)
{
}
#endif /* CONFIG_MCSAFE_TEST */
#else /* __ASSEMBLY__ */
#include <asm/export.h>
#ifdef CONFIG_MCSAFE_TEST
.macro MCSAFE_TEST_CTL
.pushsection .data
.align 8
.globl mcsafe_test_src
mcsafe_test_src:
.quad 0
EXPORT_SYMBOL_GPL(mcsafe_test_src)
.globl mcsafe_test_dst
mcsafe_test_dst:
.quad 0
EXPORT_SYMBOL_GPL(mcsafe_test_dst)
.popsection
.endm
.macro MCSAFE_TEST_SRC reg count target
leaq \count(\reg), %r9
cmp mcsafe_test_src, %r9
ja \target
.endm
.macro MCSAFE_TEST_DST reg count target
leaq \count(\reg), %r9
cmp mcsafe_test_dst, %r9
ja \target
.endm
#else
.macro MCSAFE_TEST_CTL
.endm
.macro MCSAFE_TEST_SRC reg count target
.endm
.macro MCSAFE_TEST_DST reg count target
.endm
#endif /* CONFIG_MCSAFE_TEST */
#endif /* __ASSEMBLY__ */
#endif /* _MCSAFE_TEST_H_ */

View File

@ -3,6 +3,7 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/errno.h> #include <asm/errno.h>
#include <asm/cpufeatures.h> #include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h> #include <asm/alternative-asm.h>
#include <asm/export.h> #include <asm/export.h>
@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
ENDPROC(memcpy_orig) ENDPROC(memcpy_orig)
#ifndef CONFIG_UML #ifndef CONFIG_UML
MCSAFE_TEST_CTL
/* /*
* __memcpy_mcsafe - memory copy with machine check exception handling * __memcpy_mcsafe - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses. * Note that we only catch machine checks when reading the source addresses.
@ -206,6 +210,8 @@ ENTRY(__memcpy_mcsafe)
subl %ecx, %edx subl %ecx, %edx
.L_read_leading_bytes: .L_read_leading_bytes:
movb (%rsi), %al movb (%rsi), %al
MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes: .L_write_leading_bytes:
movb %al, (%rdi) movb %al, (%rdi)
incq %rsi incq %rsi
@ -221,6 +227,8 @@ ENTRY(__memcpy_mcsafe)
.L_read_words: .L_read_words:
movq (%rsi), %r8 movq (%rsi), %r8
MCSAFE_TEST_SRC %rsi 8 .E_read_words
MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words: .L_write_words:
movq %r8, (%rdi) movq %r8, (%rdi)
addq $8, %rsi addq $8, %rsi
@ -237,6 +245,8 @@ ENTRY(__memcpy_mcsafe)
movl %edx, %ecx movl %edx, %ecx
.L_read_trailing_bytes: .L_read_trailing_bytes:
movb (%rsi), %al movb (%rsi), %al
MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes: .L_write_trailing_bytes:
movb %al, (%rdi) movb %al, (%rdi)
incq %rsi incq %rsi

View File

@ -1978,19 +1978,8 @@ static ssize_t range_index_show(struct device *dev,
} }
static DEVICE_ATTR_RO(range_index); static DEVICE_ATTR_RO(range_index);
static ssize_t ecc_unit_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_region *nd_region = to_nd_region(dev);
struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
}
static DEVICE_ATTR_RO(ecc_unit_size);
static struct attribute *acpi_nfit_region_attributes[] = { static struct attribute *acpi_nfit_region_attributes[] = {
&dev_attr_range_index.attr, &dev_attr_range_index.attr,
&dev_attr_ecc_unit_size.attr,
NULL, NULL,
}; };

View File

@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
bool __bdev_dax_supported(struct block_device *bdev, int blocksize) bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
{ {
struct dax_device *dax_dev; struct dax_device *dax_dev;
bool dax_enabled = false;
pgoff_t pgoff; pgoff_t pgoff;
int err, id; int err, id;
void *kaddr; void *kaddr;
@ -134,14 +135,21 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
* on being able to do (page_address(pfn_to_page())). * on being able to do (page_address(pfn_to_page())).
*/ */
WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
dax_enabled = true;
} else if (pfn_t_devmap(pfn)) { } else if (pfn_t_devmap(pfn)) {
/* pass */; struct dev_pagemap *pgmap;
} else {
pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
dax_enabled = true;
put_dev_pagemap(pgmap);
}
if (!dax_enabled) {
pr_debug("%s: error: dax support not enabled\n", pr_debug("%s: error: dax support not enabled\n",
bdevname(bdev, buf)); bdevname(bdev, buf));
return false; return false;
} }
return true; return true;
} }
EXPORT_SYMBOL_GPL(__bdev_dax_supported); EXPORT_SYMBOL_GPL(__bdev_dax_supported);
@ -182,8 +190,7 @@ static ssize_t write_cache_show(struct device *dev,
if (!dax_dev) if (!dax_dev)
return -ENXIO; return -ENXIO;
rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE, rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
&dax_dev->flags));
put_dax(dax_dev); put_dax(dax_dev);
return rc; return rc;
} }
@ -201,10 +208,8 @@ static ssize_t write_cache_store(struct device *dev,
if (rc) if (rc)
len = rc; len = rc;
else if (write_cache)
set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
else else
clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); dax_write_cache(dax_dev, write_cache);
put_dax(dax_dev); put_dax(dax_dev);
return len; return len;
@ -282,11 +287,21 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
} }
EXPORT_SYMBOL_GPL(dax_copy_from_iter); EXPORT_SYMBOL_GPL(dax_copy_from_iter);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i)
{
if (!dax_alive(dax_dev))
return 0;
return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
#ifdef CONFIG_ARCH_HAS_PMEM_API #ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size); void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{ {
if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) if (unlikely(!dax_write_cache_enabled(dax_dev)))
return; return;
arch_wb_cache_pmem(addr, size); arch_wb_cache_pmem(addr, size);

View File

@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
} }
static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
struct linear_c *lc = ti->private;
struct block_device *bdev = lc->dev->bdev;
struct dax_device *dax_dev = lc->dev->dax_dev;
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
dev_sector = linear_map_sector(ti, sector);
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
#else #else
#define linear_dax_direct_access NULL #define linear_dax_direct_access NULL
#define linear_dax_copy_from_iter NULL #define linear_dax_copy_from_iter NULL
#define linear_dax_copy_to_iter NULL
#endif #endif
static struct target_type linear_target = { static struct target_type linear_target = {
@ -204,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices, .iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access, .direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter, .dax_copy_from_iter = linear_dax_copy_from_iter,
.dax_copy_to_iter = linear_dax_copy_to_iter,
}; };
int __init dm_linear_init(void) int __init dm_linear_init(void)

View File

@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
dax_copy: dax_copy:
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
} }
static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
pgoff_t pgoff, void *addr, size_t bytes,
struct iov_iter *i)
{
struct log_writes_c *lc = ti->private;
sector_t sector = pgoff * PAGE_SECTORS;
if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
#else #else
#define log_writes_dax_direct_access NULL #define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL #define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
#endif #endif
static struct target_type log_writes_target = { static struct target_type log_writes_target = {
@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints, .io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access, .direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter, .dax_copy_from_iter = log_writes_dax_copy_from_iter,
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
}; };
static int __init dm_log_writes_init(void) static int __init dm_log_writes_init(void)

View File

@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
} }
static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
struct stripe_c *sc = ti->private;
struct dax_device *dax_dev;
struct block_device *bdev;
uint32_t stripe;
stripe_map_sector(sc, sector, &stripe, &dev_sector);
dev_sector += sc->stripe[stripe].physical_start;
dax_dev = sc->stripe[stripe].dev->dax_dev;
bdev = sc->stripe[stripe].dev->bdev;
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
#else #else
#define stripe_dax_direct_access NULL #define stripe_dax_direct_access NULL
#define stripe_dax_copy_from_iter NULL #define stripe_dax_copy_from_iter NULL
#define stripe_dax_copy_to_iter NULL
#endif #endif
/* /*
@ -478,6 +498,7 @@ static struct target_type stripe_target = {
.io_hints = stripe_io_hints, .io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access, .direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter, .dax_copy_from_iter = stripe_dax_copy_from_iter,
.dax_copy_to_iter = stripe_dax_copy_to_iter,
}; };
int __init dm_stripe_init(void) int __init dm_stripe_init(void)

View File

@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return ret; return ret;
} }
static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
struct dm_target *ti;
long ret = 0;
int srcu_idx;
ti = dm_dax_get_live_target(md, sector, &srcu_idx);
if (!ti)
goto out;
if (!ti->type->dax_copy_to_iter) {
ret = copy_to_iter(addr, bytes, i);
goto out;
}
ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
out:
dm_put_live_table(md, srcu_idx);
return ret;
}
/* /*
* A target may call dm_accept_partial_bio only from the map routine. It is * A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
@ -3137,6 +3161,7 @@ static const struct block_device_operations dm_blk_dops = {
static const struct dax_operations dm_dax_ops = { static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access, .direct_access = dm_dax_direct_access,
.copy_from_iter = dm_dax_copy_from_iter, .copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
}; };
/* /*

View File

@ -100,6 +100,9 @@ static int nvdimm_bus_probe(struct device *dev)
if (!try_module_get(provider)) if (!try_module_get(provider))
return -ENXIO; return -ENXIO;
dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n",
dev->driver->name, dev_name(dev));
nvdimm_bus_probe_start(nvdimm_bus); nvdimm_bus_probe_start(nvdimm_bus);
rc = nd_drv->probe(dev); rc = nd_drv->probe(dev);
if (rc == 0) if (rc == 0)
@ -108,7 +111,7 @@ static int nvdimm_bus_probe(struct device *dev)
nd_region_disable(nvdimm_bus, dev); nd_region_disable(nvdimm_bus, dev);
nvdimm_bus_probe_end(nvdimm_bus); nvdimm_bus_probe_end(nvdimm_bus);
dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name,
dev_name(dev), rc); dev_name(dev), rc);
if (rc != 0) if (rc != 0)
@ -566,14 +569,18 @@ int nvdimm_revalidate_disk(struct gendisk *disk)
{ {
struct device *dev = disk_to_dev(disk)->parent; struct device *dev = disk_to_dev(disk)->parent;
struct nd_region *nd_region = to_nd_region(dev->parent); struct nd_region *nd_region = to_nd_region(dev->parent);
const char *pol = nd_region->ro ? "only" : "write"; int disk_ro = get_disk_ro(disk);
if (nd_region->ro == get_disk_ro(disk)) /*
* Upgrade to read-only if the region is read-only preserve as
* read-only if the disk is already read-only.
*/
if (disk_ro || nd_region->ro == disk_ro)
return 0; return 0;
dev_info(dev, "%s read-%s, marking %s read-%s\n", dev_info(dev, "%s read-only, marking %s read-only\n",
dev_name(&nd_region->dev), pol, disk->disk_name, pol); dev_name(&nd_region->dev), disk->disk_name);
set_disk_ro(disk, nd_region->ro); set_disk_ro(disk, 1);
return 0; return 0;

View File

@ -38,12 +38,27 @@ static int e820_range_to_nid(resource_size_t addr)
} }
#endif #endif
static int e820_register_one(struct resource *res, void *data)
{
struct nd_region_desc ndr_desc;
struct nvdimm_bus *nvdimm_bus = data;
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = res;
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
ndr_desc.numa_node = e820_range_to_nid(res->start);
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO;
return 0;
}
static int e820_pmem_probe(struct platform_device *pdev) static int e820_pmem_probe(struct platform_device *pdev)
{ {
static struct nvdimm_bus_descriptor nd_desc; static struct nvdimm_bus_descriptor nd_desc;
struct device *dev = &pdev->dev; struct device *dev = &pdev->dev;
struct nvdimm_bus *nvdimm_bus; struct nvdimm_bus *nvdimm_bus;
struct resource *p; int rc = -ENXIO;
nd_desc.attr_groups = e820_pmem_attribute_groups; nd_desc.attr_groups = e820_pmem_attribute_groups;
nd_desc.provider_name = "e820"; nd_desc.provider_name = "e820";
@ -53,27 +68,15 @@ static int e820_pmem_probe(struct platform_device *pdev)
goto err; goto err;
platform_set_drvdata(pdev, nvdimm_bus); platform_set_drvdata(pdev, nvdimm_bus);
for (p = iomem_resource.child; p ; p = p->sibling) { rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
struct nd_region_desc ndr_desc; IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
if (rc)
if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY) goto err;
continue;
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = p;
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
ndr_desc.numa_node = e820_range_to_nid(p->start);
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
goto err;
}
return 0; return 0;
err:
err:
nvdimm_bus_unregister(nvdimm_bus); nvdimm_bus_unregister(nvdimm_bus);
dev_err(dev, "failed to register legacy persistent memory ranges\n"); dev_err(dev, "failed to register legacy persistent memory ranges\n");
return -ENXIO; return rc;
} }
static struct platform_driver e820_pmem_driver = { static struct platform_driver e820_pmem_driver = {

View File

@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
res->start += start_pad; res->start += start_pad;
res->end -= end_trunc; res->end -= end_trunc;
pgmap->type = MEMORY_DEVICE_HOST;
if (nd_pfn->mode == PFN_MODE_RAM) { if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < SZ_8K) if (offset < SZ_8K)
return -EINVAL; return -EINVAL;

View File

@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
return rc; return rc;
} }
/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{ {
blk_status_t rc = 0; blk_status_t rc = 0;
@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct pmem_device *pmem = q->queuedata; struct pmem_device *pmem = q->queuedata;
struct nd_region *nd_region = to_region(pmem); struct nd_region *nd_region = to_region(pmem);
if (bio->bi_opf & REQ_FLUSH) if (bio->bi_opf & REQ_PREFLUSH)
nvdimm_flush(nd_region); nvdimm_flush(nd_region);
do_acct = nd_iostat_start(bio, &start); do_acct = nd_iostat_start(bio, &start);
@ -264,9 +259,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return copy_from_iter_flushcache(addr, bytes, i); return copy_from_iter_flushcache(addr, bytes, i);
} }
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
return copy_to_iter_mcsafe(addr, bytes, i);
}
static const struct dax_operations pmem_dax_ops = { static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access, .direct_access = pmem_dax_direct_access,
.copy_from_iter = pmem_copy_from_iter, .copy_from_iter = pmem_copy_from_iter,
.copy_to_iter = pmem_copy_to_iter,
}; };
static const struct attribute_group *pmem_attribute_groups[] = { static const struct attribute_group *pmem_attribute_groups[] = {
@ -294,12 +296,33 @@ static void pmem_release_disk(void *__pmem)
put_disk(pmem->disk); put_disk(pmem->disk);
} }
static void pmem_release_pgmap_ops(void *__pgmap)
{
dev_pagemap_put_ops();
}
static void fsdax_pagefree(struct page *page, void *data)
{
wake_up_var(&page->_refcount);
}
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
{
dev_pagemap_get_ops();
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
return -ENOMEM;
pgmap->type = MEMORY_DEVICE_FS_DAX;
pgmap->page_free = fsdax_pagefree;
return 0;
}
static int pmem_attach_disk(struct device *dev, static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns) struct nd_namespace_common *ndns)
{ {
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent); struct nd_region *nd_region = to_nd_region(dev->parent);
int nid = dev_to_node(dev), fua, wbc; int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res; struct resource *res = &nsio->res;
struct resource bb_res; struct resource bb_res;
struct nd_pfn *nd_pfn = NULL; struct nd_pfn *nd_pfn = NULL;
@ -335,7 +358,6 @@ static int pmem_attach_disk(struct device *dev,
dev_warn(dev, "unable to guarantee persistence of writes\n"); dev_warn(dev, "unable to guarantee persistence of writes\n");
fua = 0; fua = 0;
} }
wbc = nvdimm_has_cache(nd_region);
if (!devm_request_mem_region(dev, res->start, resource_size(res), if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) { dev_name(&ndns->dev))) {
@ -353,6 +375,8 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV; pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter; pmem->pgmap.ref = &q->q_usage_counter;
if (is_nd_pfn(dev)) { if (is_nd_pfn(dev)) {
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap); addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb; pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@ -364,6 +388,8 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) { } else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false; pmem->pgmap.altmap_valid = false;
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap); addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP; pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
@ -382,7 +408,7 @@ static int pmem_attach_disk(struct device *dev,
return PTR_ERR(addr); return PTR_ERR(addr);
pmem->virt_addr = addr; pmem->virt_addr = addr;
blk_queue_write_cache(q, wbc, fua); blk_queue_write_cache(q, true, fua);
blk_queue_make_request(q, pmem_make_request); blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_logical_block_size(q, pmem_sector_size(ndns));
@ -413,7 +439,7 @@ static int pmem_attach_disk(struct device *dev,
put_disk(disk); put_disk(disk);
return -ENOMEM; return -ENOMEM;
} }
dax_write_cache(dax_dev, wbc); dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev; pmem->dax_dev = dax_dev;
gendev = disk_to_dev(disk); gendev = disk_to_dev(disk);

View File

@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush);
int nvdimm_has_cache(struct nd_region *nd_region) int nvdimm_has_cache(struct nd_region *nd_region)
{ {
return is_nd_pmem(&nd_region->dev); return is_nd_pmem(&nd_region->dev) &&
!test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
} }
EXPORT_SYMBOL_GPL(nvdimm_has_cache); EXPORT_SYMBOL_GPL(nvdimm_has_cache);

View File

@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev,
return copy_from_iter(addr, bytes, i); return copy_from_iter(addr, bytes, i);
} }
static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
return copy_to_iter(addr, bytes, i);
}
static const struct dax_operations dcssblk_dax_ops = { static const struct dax_operations dcssblk_dax_ops = {
.direct_access = dcssblk_dax_direct_access, .direct_access = dcssblk_dax_direct_access,
.copy_from_iter = dcssblk_dax_copy_from_iter, .copy_from_iter = dcssblk_dax_copy_from_iter,
.copy_to_iter = dcssblk_dax_copy_to_iter,
}; };
struct dcssblk_dev_info { struct dcssblk_dev_info {

View File

@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support" bool "Direct Access (DAX) support"
depends on MMU depends on MMU
depends on !(ARM || MIPS || SPARC) depends on !(ARM || MIPS || SPARC)
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP select FS_IOMAP
select DAX select DAX
help help

136
fs/dax.c
View File

@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
} }
} }
static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
if (page_ref_count(page) > 1)
return page;
}
return NULL;
}
/* /*
* Find radix tree entry at given index. If it points to an exceptional entry, * Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't * return it with the radix tree entry locked. If the radix tree doesn't
@ -492,6 +505,90 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
return entry; return entry;
} }
/**
* dax_layout_busy_page - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
* page->count == 1. A filesystem uses this interface to determine if
* any page in the mapping is busy, i.e. for DMA, or other
* get_user_pages() usages.
*
* It is expected that the filesystem is holding locks to block the
* establishment of new mappings in this address_space. I.e. it expects
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
pgoff_t indices[PAGEVEC_SIZE];
struct page *page = NULL;
struct pagevec pvec;
pgoff_t index, end;
unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
*/
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return NULL;
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
pagevec_init(&pvec);
index = 0;
end = -1;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the pagevec_lookup and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
* holding those locks, and unmap_mapping_range() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
unmap_mapping_range(mapping, 0, 0, 1);
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *pvec_ent = pvec.pages[i];
void *entry;
index = indices[i];
if (index >= end)
break;
if (!radix_tree_exceptional_entry(pvec_ent))
continue;
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (entry)
page = dax_busy_page(entry);
put_unlocked_mapping_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
if (page)
break;
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
index++;
if (page)
break;
}
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_mapping_entry(struct address_space *mapping, static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc) pgoff_t index, bool trunc)
{ {
@ -912,7 +1009,6 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
unsigned long vaddr = vmf->address; unsigned long vaddr = vmf->address;
vm_fault_t ret = VM_FAULT_NOPAGE; vm_fault_t ret = VM_FAULT_NOPAGE;
struct page *zero_page; struct page *zero_page;
void *entry2;
pfn_t pfn; pfn_t pfn;
zero_page = ZERO_PAGE(0); zero_page = ZERO_PAGE(0);
@ -922,13 +1018,8 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
} }
pfn = page_to_pfn_t(zero_page); pfn = page_to_pfn_t(zero_page);
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
RADIX_DAX_ZERO_PAGE, false); false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
out: out:
trace_dax_load_hole(inode, vmf, ret); trace_dax_load_hole(inode, vmf, ret);
@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iov_iter *iter = data; struct iov_iter *iter = data;
loff_t end = pos + length, done = 0; loff_t end = pos + length, done = 0;
ssize_t ret = 0; ssize_t ret = 0;
size_t xfer;
int id; int id;
if (iov_iter_rw(iter) == READ) { if (iov_iter_rw(iter) == READ) {
@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* vfs_write(), depending on which operation we are doing. * vfs_write(), depending on which operation we are doing.
*/ */
if (iov_iter_rw(iter) == WRITE) if (iov_iter_rw(iter) == WRITE)
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter); map_len, iter);
else else
map_len = copy_to_iter(kaddr, map_len, iter); xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
if (map_len <= 0) { map_len, iter);
ret = map_len ? map_len : -EFAULT;
break;
}
pos += map_len; pos += xfer;
length -= map_len; length -= xfer;
done += map_len; done += xfer;
if (xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
} }
dax_read_unlock(id); dax_read_unlock(id);
@ -1240,10 +1334,6 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync); 0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto error_finish_iomap;
}
/* /*
* If we are doing synchronous page fault and inode needs fsync, * If we are doing synchronous page fault and inode needs fsync,
@ -1324,8 +1414,6 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page); pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) { if (!pmd_none(*(vmf->pmd))) {
@ -1447,8 +1535,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync); RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;
/* /*
* If we are doing synchronous page fault and inode needs fsync, * If we are doing synchronous page fault and inode needs fsync,

View File

@ -312,7 +312,7 @@ xfs_file_aio_write_checks(
if (error <= 0) if (error <= 0)
return error; return error;
error = xfs_break_layouts(inode, iolock); error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
if (error) if (error)
return error; return error;
@ -731,6 +731,69 @@ xfs_file_write_iter(
return xfs_file_buffered_aio_write(iocb, from); return xfs_file_buffered_aio_write(iocb, from);
} }
static void
xfs_wait_dax_page(
struct inode *inode,
bool *did_unlock)
{
struct xfs_inode *ip = XFS_I(inode);
*did_unlock = true;
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
schedule();
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
static int
xfs_break_dax_layouts(
struct inode *inode,
uint iolock,
bool *did_unlock)
{
struct page *page;
ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
return 0;
return ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
0, 0, xfs_wait_dax_page(inode, did_unlock));
}
int
xfs_break_layouts(
struct inode *inode,
uint *iolock,
enum layout_break_reason reason)
{
bool retry;
int error;
ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
do {
retry = false;
switch (reason) {
case BREAK_UNMAP:
error = xfs_break_dax_layouts(inode, *iolock, &retry);
if (error || retry)
break;
/* fall through */
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
default:
WARN_ON_ONCE(1);
error = -EINVAL;
}
} while (error == 0 && retry);
return error;
}
#define XFS_FALLOC_FL_SUPPORTED \ #define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@ -747,7 +810,7 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode); struct xfs_inode *ip = XFS_I(inode);
long error; long error;
enum xfs_prealloc_flags flags = 0; enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0; loff_t new_size = 0;
bool do_file_insert = false; bool do_file_insert = false;
@ -757,13 +820,10 @@ xfs_file_fallocate(
return -EOPNOTSUPP; return -EOPNOTSUPP;
xfs_ilock(ip, iolock); xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error) if (error)
goto out_unlock; goto out_unlock;
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;
if (mode & FALLOC_FL_PUNCH_HOLE) { if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len); error = xfs_free_file_space(ip, offset, len);
if (error) if (error)

View File

@ -378,6 +378,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
>> XFS_ILOCK_SHIFT) >> XFS_ILOCK_SHIFT)
/*
* Layouts are broken in the BREAK_WRITE case to ensure that
* layout-holders do not collide with local writes. Additionally,
* layouts are broken in the BREAK_UNMAP case to make sure the
* layout-holder has a consistent view of the file's extent map. While
* BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
* BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
* go idle.
*/
enum layout_break_reason {
BREAK_WRITE,
BREAK_UNMAP,
};
/* /*
* For multiple groups support: if S_ISGID bit is set in the parent * For multiple groups support: if S_ISGID bit is set in the parent
* directory, group of new file is set to that of the parent, and * directory, group of new file is set to that of the parent, and
@ -453,6 +467,8 @@ enum xfs_prealloc_flags {
int xfs_update_prealloc_flags(struct xfs_inode *ip, int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags); enum xfs_prealloc_flags flags);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
/* from xfs_iops.c */ /* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_inode(struct xfs_inode *ip);

View File

@ -39,7 +39,6 @@
#include "xfs_icache.h" #include "xfs_icache.h"
#include "xfs_symlink.h" #include "xfs_symlink.h"
#include "xfs_trans.h" #include "xfs_trans.h"
#include "xfs_pnfs.h"
#include "xfs_acl.h" #include "xfs_acl.h"
#include "xfs_btree.h" #include "xfs_btree.h"
#include <linux/fsmap.h> #include <linux/fsmap.h>
@ -614,7 +613,7 @@ xfs_ioc_space(
struct xfs_inode *ip = XFS_I(inode); struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr; struct iattr iattr;
enum xfs_prealloc_flags flags = 0; enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error; int error;
/* /*
@ -644,13 +643,10 @@ xfs_ioc_space(
return error; return error;
xfs_ilock(ip, iolock); xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error) if (error)
goto out_unlock; goto out_unlock;
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;
switch (bf->l_whence) { switch (bf->l_whence) {
case 0: /*SEEK_SET*/ case 0: /*SEEK_SET*/
break; break;

View File

@ -37,7 +37,6 @@
#include "xfs_da_btree.h" #include "xfs_da_btree.h"
#include "xfs_dir2.h" #include "xfs_dir2.h"
#include "xfs_trans_space.h" #include "xfs_trans_space.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h" #include "xfs_iomap.h"
#include <linux/capability.h> #include <linux/capability.h>
@ -1030,14 +1029,19 @@ xfs_vn_setattr(
int error; int error;
if (iattr->ia_valid & ATTR_SIZE) { if (iattr->ia_valid & ATTR_SIZE) {
struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct inode *inode = d_inode(dentry);
uint iolock = XFS_IOLOCK_EXCL; struct xfs_inode *ip = XFS_I(inode);
uint iolock;
error = xfs_break_layouts(d_inode(dentry), &iolock);
if (error)
return error;
xfs_ilock(ip, XFS_MMAPLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error) {
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
}
error = xfs_vn_setattr_size(dentry, iattr); error = xfs_vn_setattr_size(dentry, iattr);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else { } else {

View File

@ -31,19 +31,20 @@
* rules in the page fault path we don't bother. * rules in the page fault path we don't bother.
*/ */
int int
xfs_break_layouts( xfs_break_leased_layouts(
struct inode *inode, struct inode *inode,
uint *iolock) uint *iolock,
bool *did_unlock)
{ {
struct xfs_inode *ip = XFS_I(inode); struct xfs_inode *ip = XFS_I(inode);
int error; int error;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock); xfs_iunlock(ip, *iolock);
*did_unlock = true;
error = break_layout(inode, true); error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL; *iolock &= ~XFS_IOLOCK_SHARED;
*iolock |= XFS_IOLOCK_EXCL;
xfs_ilock(ip, *iolock); xfs_ilock(ip, *iolock);
} }
@ -120,8 +121,8 @@ xfs_fs_map_blocks(
* Lock out any other I/O before we flush and invalidate the pagecache, * Lock out any other I/O before we flush and invalidate the pagecache,
* and then hand out a layout to the remote system. This is very * and then hand out a layout to the remote system. This is very
* similar to direct I/O, except that the synchronization is much more * similar to direct I/O, except that the synchronization is much more
* complicated. See the comment near xfs_break_layouts for a detailed * complicated. See the comment near xfs_break_leased_layouts
* explanation. * for a detailed explanation.
*/ */
xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_IOLOCK_EXCL);

View File

@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr); struct iattr *iattr);
int xfs_break_layouts(struct inode *inode, uint *iolock); int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
bool *did_unlock);
#else #else
static inline int static inline int
xfs_break_layouts(struct inode *inode, uint *iolock) xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
{ {
return 0; return 0;
} }

View File

@ -20,6 +20,9 @@ struct dax_operations {
/* copy_from_iter: required operation for fs-dax direct-i/o */ /* copy_from_iter: required operation for fs-dax direct-i/o */
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *); struct iov_iter *);
/* copy_to_iter: required operation for fs-dax direct-i/o */
size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *);
}; };
extern struct attribute_group dax_attribute_group; extern struct attribute_group dax_attribute_group;
@ -83,6 +86,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
int dax_writeback_mapping_range(struct address_space *mapping, int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc); struct block_device *bdev, struct writeback_control *wbc);
struct page *dax_layout_busy_page(struct address_space *mapping);
#else #else
static inline bool bdev_dax_supported(struct block_device *bdev, static inline bool bdev_dax_supported(struct block_device *bdev,
int blocksize) int blocksize)
@ -104,6 +109,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
return NULL; return NULL;
} }
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
return NULL;
}
static inline int dax_writeback_mapping_range(struct address_space *mapping, static inline int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc) struct block_device *bdev, struct writeback_control *wbc)
{ {
@ -119,6 +129,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
void **kaddr, pfn_t *pfn); void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i); size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,

View File

@ -133,7 +133,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
*/ */
typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn); long nr_pages, void **kaddr, pfn_t *pfn);
typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i); void *addr, size_t bytes, struct iov_iter *i);
#define PAGE_SECTORS (PAGE_SIZE / 512) #define PAGE_SECTORS (PAGE_SIZE / 512)
@ -184,7 +184,8 @@ struct target_type {
dm_iterate_devices_fn iterate_devices; dm_iterate_devices_fn iterate_devices;
dm_io_hints_fn io_hints; dm_io_hints_fn io_hints;
dm_dax_direct_access_fn direct_access; dm_dax_direct_access_fn direct_access;
dm_dax_copy_from_iter_fn dax_copy_from_iter; dm_dax_copy_iter_fn dax_copy_from_iter;
dm_dax_copy_iter_fn dax_copy_to_iter;
/* For internal device-mapper use. */ /* For internal device-mapper use. */
struct list_head list; struct list_head list;

View File

@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_ #ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_
#include <linux/mm.h>
#include <linux/ioport.h> #include <linux/ioport.h>
#include <linux/percpu-refcount.h> #include <linux/percpu-refcount.h>
@ -30,13 +29,6 @@ struct vmem_altmap {
* Specialize ZONE_DEVICE memory into multiple types each having differents * Specialize ZONE_DEVICE memory into multiple types each having differents
* usage. * usage.
* *
* MEMORY_DEVICE_HOST:
* Persistent device memory (pmem): struct page might be allocated in different
* memory and architecture might want to perform special actions. It is similar
* to regular memory, in that the CPU can access it transparently. However,
* it is likely to have different bandwidth and latency than regular memory.
* See Documentation/nvdimm/nvdimm.txt for more information.
*
* MEMORY_DEVICE_PRIVATE: * MEMORY_DEVICE_PRIVATE:
* Device memory that is not directly addressable by the CPU: CPU can neither * Device memory that is not directly addressable by the CPU: CPU can neither
* read nor write private memory. In this case, we do still have struct pages * read nor write private memory. In this case, we do still have struct pages
@ -53,11 +45,19 @@ struct vmem_altmap {
* driver can hotplug the device memory using ZONE_DEVICE and with that memory * driver can hotplug the device memory using ZONE_DEVICE and with that memory
* type. Any page of a process can be migrated to such memory. However no one * type. Any page of a process can be migrated to such memory. However no one
* should be allow to pin such memory so that it can always be evicted. * should be allow to pin such memory so that it can always be evicted.
*
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
* pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
* wakeup event whenever a page is unpinned and becomes idle. This
* wakeup is used to coordinate physical address space management (ex:
* fs truncate/hole punch) vs pinned pages (ex: device dma).
*/ */
enum memory_type { enum memory_type {
MEMORY_DEVICE_HOST = 0, MEMORY_DEVICE_PRIVATE = 1,
MEMORY_DEVICE_PRIVATE,
MEMORY_DEVICE_PUBLIC, MEMORY_DEVICE_PUBLIC,
MEMORY_DEVICE_FS_DAX,
}; };
/* /*
@ -129,8 +129,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
static inline bool is_zone_device_page(const struct page *page);
#else #else
static inline void *devm_memremap_pages(struct device *dev, static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap) struct dev_pagemap *pgmap)
@ -161,20 +159,6 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
} }
#endif /* CONFIG_ZONE_DEVICE */ #endif /* CONFIG_ZONE_DEVICE */
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
static inline bool is_device_private_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
static inline bool is_device_public_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
}
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap) static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{ {
if (pgmap) if (pgmap)

View File

@ -830,27 +830,65 @@ static inline bool is_zone_device_page(const struct page *page)
} }
#endif #endif
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) #ifdef CONFIG_DEV_PAGEMAP_OPS
void put_zone_device_private_or_public_page(struct page *page); void dev_pagemap_get_ops(void);
DECLARE_STATIC_KEY_FALSE(device_private_key); void dev_pagemap_put_ops(void);
#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key) void __put_devmap_managed_page(struct page *page);
static inline bool is_device_private_page(const struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
static inline bool is_device_public_page(const struct page *page); static inline bool put_devmap_managed_page(struct page *page)
#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ {
static inline void put_zone_device_private_or_public_page(struct page *page) if (!static_branch_unlikely(&devmap_managed_key))
return false;
if (!is_zone_device_page(page))
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_PUBLIC:
case MEMORY_DEVICE_FS_DAX:
__put_devmap_managed_page(page);
return true;
default:
break;
}
return false;
}
static inline bool is_device_private_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
static inline bool is_device_public_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
}
#else /* CONFIG_DEV_PAGEMAP_OPS */
static inline void dev_pagemap_get_ops(void)
{ {
} }
#define IS_HMM_ENABLED 0
static inline void dev_pagemap_put_ops(void)
{
}
static inline bool put_devmap_managed_page(struct page *page)
{
return false;
}
static inline bool is_device_private_page(const struct page *page) static inline bool is_device_private_page(const struct page *page)
{ {
return false; return false;
} }
static inline bool is_device_public_page(const struct page *page) static inline bool is_device_public_page(const struct page *page)
{ {
return false; return false;
} }
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ #endif /* CONFIG_DEV_PAGEMAP_OPS */
static inline void get_page(struct page *page) static inline void get_page(struct page *page)
{ {
@ -868,16 +906,13 @@ static inline void put_page(struct page *page)
page = compound_head(page); page = compound_head(page);
/* /*
* For private device pages we need to catch refcount transition from * For devmap managed pages we need to catch refcount transition from
* 2 to 1, when refcount reach one it means the private device page is * 2 to 1, when refcount reach one it means the page is free and we
* free and we need to inform the device driver through callback. See * need to inform the device driver through callback. See
* include/linux/memremap.h and HMM for details. * include/linux/memremap.h and HMM for details.
*/ */
if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) || if (put_devmap_managed_page(page))
unlikely(is_device_public_page(page)))) {
put_zone_device_private_or_public_page(page);
return; return;
}
if (put_page_testzero(page)) if (put_page_testzero(page))
__put_page(page); __put_page(page);

View File

@ -155,7 +155,7 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#endif #endif
#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE #ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i);
#else #else
#define _copy_to_iter_mcsafe _copy_to_iter #define _copy_to_iter_mcsafe _copy_to_iter
#endif #endif

View File

@ -112,7 +112,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
$(obj)/configs.o: $(obj)/config_data.h $(obj)/configs.o: $(obj)/config_data.h

167
kernel/iomem.c Normal file
View File

@ -0,0 +1,167 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/device.h>
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
return ioremap(offset, size);
}
#endif
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
return (__force void *)ioremap_cache(offset, size);
}
#endif
#ifndef arch_memremap_can_ram_remap
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
unsigned long flags)
{
return true;
}
#endif
static void *try_ram_remap(resource_size_t offset, size_t size,
unsigned long flags)
{
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
arch_memremap_can_ram_remap(offset, size, flags))
return __va(offset);
return NULL; /* fallback to arch_memremap_wb */
}
/**
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
* @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
* MEMREMAP_ENC, MEMREMAP_DEC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
* annotation is not applicable. In the case of multiple flags, the different
* mapping types will be attempted in the order listed below until one of
* them succeeds.
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
* a pointer into the direct map.
*
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
* map System RAM with this mapping type will fail.
*
* MEMREMAP_WC - establish a writecombine mapping, whereby writes may
* be coalesced together (e.g. in the CPU's write buffers), but is otherwise
* uncached. Attempts to map System RAM with this mapping type will fail.
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
int is_ram = region_intersects(offset, size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
void *addr = NULL;
if (!flags)
return NULL;
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
&offset, (unsigned long) size);
return NULL;
}
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
/*
* MEMREMAP_WB is special in that it can be satisifed
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
addr = arch_memremap_wb(offset, size);
}
/*
* If we don't have a mapping yet and other request flags are
* present then we will be attempting to establish a new virtual
* address mapping. Enforce that this mapping is not aliasing
* System RAM.
*/
if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
&offset, (unsigned long) size);
return NULL;
}
if (!addr && (flags & MEMREMAP_WT))
addr = ioremap_wt(offset, size);
if (!addr && (flags & MEMREMAP_WC))
addr = ioremap_wc(offset, size);
return addr;
}
EXPORT_SYMBOL(memremap);
void memunmap(void *addr)
{
if (is_vmalloc_addr(addr))
iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);
static void devm_memremap_release(struct device *dev, void *res)
{
memunmap(*(void **)res);
}
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
return *(void **)res == match_data;
}
void *devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags)
{
void **ptr, *addr;
ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
dev_to_node(dev));
if (!ptr)
return ERR_PTR(-ENOMEM);
addr = memremap(offset, size, flags);
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
} else {
devres_free(ptr);
return ERR_PTR(-ENXIO);
}
return addr;
}
EXPORT_SYMBOL(devm_memremap);
void devm_memunmap(struct device *dev, void *addr)
{
WARN_ON(devres_release(dev, devm_memremap_release,
devm_memremap_match, addr));
}
EXPORT_SYMBOL(devm_memunmap);

View File

@ -1,15 +1,5 @@
/* /* SPDX-License-Identifier: GPL-2.0 */
* Copyright(c) 2015 Intel Corporation. All rights reserved. /* Copyright(c) 2015 Intel Corporation. All rights reserved. */
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/radix-tree.h> #include <linux/radix-tree.h>
#include <linux/device.h> #include <linux/device.h>
#include <linux/types.h> #include <linux/types.h>
@ -19,170 +9,8 @@
#include <linux/memory_hotplug.h> #include <linux/memory_hotplug.h>
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/swapops.h> #include <linux/swapops.h>
#include <linux/wait_bit.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
return ioremap(offset, size);
}
#endif
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
return (__force void *)ioremap_cache(offset, size);
}
#endif
#ifndef arch_memremap_can_ram_remap
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
unsigned long flags)
{
return true;
}
#endif
static void *try_ram_remap(resource_size_t offset, size_t size,
unsigned long flags)
{
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
arch_memremap_can_ram_remap(offset, size, flags))
return __va(offset);
return NULL; /* fallback to arch_memremap_wb */
}
/**
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
* @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
* MEMREMAP_ENC, MEMREMAP_DEC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
* annotation is not applicable. In the case of multiple flags, the different
* mapping types will be attempted in the order listed below until one of
* them succeeds.
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
* a pointer into the direct map.
*
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
* map System RAM with this mapping type will fail.
*
* MEMREMAP_WC - establish a writecombine mapping, whereby writes may
* be coalesced together (e.g. in the CPU's write buffers), but is otherwise
* uncached. Attempts to map System RAM with this mapping type will fail.
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
int is_ram = region_intersects(offset, size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
void *addr = NULL;
if (!flags)
return NULL;
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
&offset, (unsigned long) size);
return NULL;
}
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
/*
* MEMREMAP_WB is special in that it can be satisifed
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
addr = arch_memremap_wb(offset, size);
}
/*
* If we don't have a mapping yet and other request flags are
* present then we will be attempting to establish a new virtual
* address mapping. Enforce that this mapping is not aliasing
* System RAM.
*/
if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
&offset, (unsigned long) size);
return NULL;
}
if (!addr && (flags & MEMREMAP_WT))
addr = ioremap_wt(offset, size);
if (!addr && (flags & MEMREMAP_WC))
addr = ioremap_wc(offset, size);
return addr;
}
EXPORT_SYMBOL(memremap);
void memunmap(void *addr)
{
if (is_vmalloc_addr(addr))
iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);
static void devm_memremap_release(struct device *dev, void *res)
{
memunmap(*(void **)res);
}
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
return *(void **)res == match_data;
}
void *devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags)
{
void **ptr, *addr;
ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
dev_to_node(dev));
if (!ptr)
return ERR_PTR(-ENOMEM);
addr = memremap(offset, size, flags);
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
} else {
devres_free(ptr);
return ERR_PTR(-ENXIO);
}
return addr;
}
EXPORT_SYMBOL(devm_memremap);
void devm_memunmap(struct device *dev, void *addr)
{
WARN_ON(devres_release(dev, devm_memremap_release,
devm_memremap_match, addr));
}
EXPORT_SYMBOL(devm_memunmap);
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock); static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL); static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
return pgmap; return pgmap;
} }
#endif /* CONFIG_ZONE_DEVICE */ EXPORT_SYMBOL_GPL(get_dev_pagemap);
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) #ifdef CONFIG_DEV_PAGEMAP_OPS
void put_zone_device_private_or_public_page(struct page *page) DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL_GPL(devmap_managed_key);
static atomic_t devmap_enable;
/*
* Toggle the static key for ->page_free() callbacks when dev_pagemap
* pages go idle.
*/
void dev_pagemap_get_ops(void)
{
if (atomic_inc_return(&devmap_enable) == 1)
static_branch_enable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
void dev_pagemap_put_ops(void)
{
if (atomic_dec_and_test(&devmap_enable))
static_branch_disable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
void __put_devmap_managed_page(struct page *page)
{ {
int count = page_ref_dec_return(page); int count = page_ref_dec_return(page);
@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page)
} else if (!count) } else if (!count)
__put_page(page); __put_page(page);
} }
EXPORT_SYMBOL(put_zone_device_private_or_public_page); EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ #endif /* CONFIG_DEV_PAGEMAP_OPS */

View File

@ -415,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
return __walk_iomem_res_desc(&res, desc, false, arg, func); return __walk_iomem_res_desc(&res, desc, false, arg, func);
} }
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
/* /*
* This function calls the @func callback against all memory ranges of type * This function calls the @func callback against all memory ranges of type

View File

@ -621,6 +621,9 @@ config ARCH_HAS_PMEM_API
config ARCH_HAS_UACCESS_FLUSHCACHE config ARCH_HAS_UACCESS_FLUSHCACHE
bool bool
config ARCH_HAS_UACCESS_MCSAFE
bool
config STACKDEPOT config STACKDEPOT
bool bool
select STACKTRACE select STACKTRACE

View File

@ -694,6 +694,9 @@ config ARCH_HAS_HMM
config MIGRATE_VMA_HELPER config MIGRATE_VMA_HELPER
bool bool
config DEV_PAGEMAP_OPS
bool
config HMM config HMM
bool bool
select MIGRATE_VMA_HELPER select MIGRATE_VMA_HELPER
@ -714,6 +717,7 @@ config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)" bool "Unaddressable device memory (GPU memory, ...)"
depends on ARCH_HAS_HMM depends on ARCH_HAS_HMM
select HMM select HMM
select DEV_PAGEMAP_OPS
help help
Allows creation of struct pages to represent unaddressable device Allows creation of struct pages to represent unaddressable device
@ -724,6 +728,7 @@ config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)" bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM depends on ARCH_HAS_HMM
select HMM select HMM
select DEV_PAGEMAP_OPS
help help
Allows creation of struct pages to represent addressable device Allows creation of struct pages to represent addressable device

View File

@ -1475,32 +1475,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
return 1; return 1;
} }
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr) unsigned long end, struct page **pages, int *nr)
{ {
unsigned long fault_pfn; unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
return __gup_device_huge(fault_pfn, addr, end, pages, nr); if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
return 1;
} }
static int __gup_device_huge_pud(pud_t pud, unsigned long addr, static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr) unsigned long end, struct page **pages, int *nr)
{ {
unsigned long fault_pfn; unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
return __gup_device_huge(fault_pfn, addr, end, pages, nr); if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
return 1;
} }
#else #else
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr) unsigned long end, struct page **pages, int *nr)
{ {
BUILD_BUG(); BUILD_BUG();
return 0; return 0;
} }
static int __gup_device_huge_pud(pud_t pud, unsigned long addr, static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr) unsigned long end, struct page **pages, int *nr)
{ {
BUILD_BUG(); BUILD_BUG();
@ -1518,7 +1534,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0; return 0;
if (pmd_devmap(orig)) if (pmd_devmap(orig))
return __gup_device_huge_pmd(orig, addr, end, pages, nr); return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
refs = 0; refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@ -1556,7 +1572,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0; return 0;
if (pud_devmap(orig)) if (pud_devmap(orig))
return __gup_device_huge_pud(orig, addr, end, pages, nr); return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
refs = 0; refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

View File

@ -35,15 +35,6 @@
#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
/*
* Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
*/
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
#if IS_ENABLED(CONFIG_HMM_MIRROR) #if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops; static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
resource_size_t addr; resource_size_t addr;
int ret; int ret;
static_branch_enable(&device_private_key); dev_pagemap_get_ops();
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
GFP_KERNEL, dev_to_node(device)); GFP_KERNEL, dev_to_node(device));
@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
static_branch_enable(&device_private_key); dev_pagemap_get_ops();
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
GFP_KERNEL, dev_to_node(device)); GFP_KERNEL, dev_to_node(device));

View File

@ -29,6 +29,7 @@
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/memremap.h>
#include <linux/memcontrol.h> #include <linux/memcontrol.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/uio.h> #include <linux/uio.h>
@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr)
flags); flags);
locked_pgdat = NULL; locked_pgdat = NULL;
} }
put_zone_device_private_or_public_page(page); put_devmap_managed_page(page);
continue; continue;
} }

View File

@ -29,6 +29,8 @@
#include "nfit_test.h" #include "nfit_test.h"
#include "../watermark.h" #include "../watermark.h"
#include <asm/mcsafe_test.h>
/* /*
* Generate an NFIT table to describe the following topology: * Generate an NFIT table to describe the following topology:
* *
@ -2681,6 +2683,107 @@ static struct platform_driver nfit_test_driver = {
.id_table = nfit_test_id, .id_table = nfit_test_id,
}; };
static char mcsafe_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
enum INJECT {
INJECT_NONE,
INJECT_SRC,
INJECT_DST,
};
static void mcsafe_test_init(char *dst, char *src, size_t size)
{
size_t i;
memset(dst, 0xff, size);
for (i = 0; i < size; i++)
src[i] = (char) i;
}
static bool mcsafe_test_validate(unsigned char *dst, unsigned char *src,
size_t size, unsigned long rem)
{
size_t i;
for (i = 0; i < size - rem; i++)
if (dst[i] != (unsigned char) i) {
pr_info_once("%s:%d: offset: %zd got: %#x expect: %#x\n",
__func__, __LINE__, i, dst[i],
(unsigned char) i);
return false;
}
for (i = size - rem; i < size; i++)
if (dst[i] != 0xffU) {
pr_info_once("%s:%d: offset: %zd got: %#x expect: 0xff\n",
__func__, __LINE__, i, dst[i]);
return false;
}
return true;
}
void mcsafe_test(void)
{
char *inject_desc[] = { "none", "source", "destination" };
enum INJECT inj;
if (IS_ENABLED(CONFIG_MCSAFE_TEST)) {
pr_info("%s: run...\n", __func__);
} else {
pr_info("%s: disabled, skip.\n", __func__);
return;
}
for (inj = INJECT_NONE; inj <= INJECT_DST; inj++) {
int i;
pr_info("%s: inject: %s\n", __func__, inject_desc[inj]);
for (i = 0; i < 512; i++) {
unsigned long expect, rem;
void *src, *dst;
bool valid;
switch (inj) {
case INJECT_NONE:
mcsafe_inject_src(NULL);
mcsafe_inject_dst(NULL);
dst = &mcsafe_buf[2048];
src = &mcsafe_buf[1024 - i];
expect = 0;
break;
case INJECT_SRC:
mcsafe_inject_src(&mcsafe_buf[1024]);
mcsafe_inject_dst(NULL);
dst = &mcsafe_buf[2048];
src = &mcsafe_buf[1024 - i];
expect = 512 - i;
break;
case INJECT_DST:
mcsafe_inject_src(NULL);
mcsafe_inject_dst(&mcsafe_buf[2048]);
dst = &mcsafe_buf[2048 - i];
src = &mcsafe_buf[1024];
expect = 512 - i;
break;
}
mcsafe_test_init(dst, src, 512);
rem = __memcpy_mcsafe(dst, src, 512);
valid = mcsafe_test_validate(dst, src, 512, expect);
if (rem == expect && valid)
continue;
pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n",
__func__,
((unsigned long) dst) & ~PAGE_MASK,
((unsigned long ) src) & ~PAGE_MASK,
512, i, rem, valid ? "valid" : "bad",
expect);
}
}
mcsafe_inject_src(NULL);
mcsafe_inject_dst(NULL);
}
static __init int nfit_test_init(void) static __init int nfit_test_init(void)
{ {
int rc, i; int rc, i;
@ -2689,6 +2792,7 @@ static __init int nfit_test_init(void)
libnvdimm_test(); libnvdimm_test();
acpi_nfit_test(); acpi_nfit_test();
device_dax_test(); device_dax_test();
mcsafe_test();
nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm); nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);