mirror of https://gitee.com/openkylin/linux.git
x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations
The pmem driver has a need to transfer data with a persistent memory destination and be able to rely on the fact that the destination writes are not cached. It is sufficient for the writes to be flushed to a cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect userspace to call fsync() to ensure data-writes have reached a power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or REQ_FLUSH to the pmem driver which will turn around and fence previous writes with an "sfence". Implement a __copy_from_user_inatomic_flushcache, memcpy_page_flushcache, and memcpy_flushcache, that guarantee that the destination buffer is not dirty in the cpu cache on completion. The new copy_from_iter_flushcache and sub-routines will be used to replace the "pmem api" (include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability of copy_from_iter_flushcache() and memcpy_flushcache() are gated by the CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE config symbol, and fallback to copy_from_iter_nocache() and plain memcpy() otherwise. This is meant to satisfy the concern from Linus that if a driver wants to do something beyond the normal nocache semantics it should be something private to that driver [1], and Al's concern that anything uaccess related belongs with the rest of the uaccess code [2]. The first consumer of this interface is a new 'copy_from_iter' dax operation so that pmem can inject cache maintenance operations without imposing this overhead on other dax-capable drivers. [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html Cc: <x86@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Toshi Kani <toshi.kani@hpe.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Matthew Wilcox <mawilcox@microsoft.com> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
This commit is contained in:
parent
3c2993b8c6
commit
0aed55af88
|
@ -54,6 +54,7 @@ config X86
|
|||
select ARCH_HAS_KCOV if X86_64
|
||||
select ARCH_HAS_MMIO_FLUSH
|
||||
select ARCH_HAS_PMEM_API if X86_64
|
||||
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
|
||||
select ARCH_HAS_SET_MEMORY
|
||||
select ARCH_HAS_SG_CHAIN
|
||||
select ARCH_HAS_STRICT_KERNEL_RWX
|
||||
|
|
|
@ -109,6 +109,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
|
||||
void memcpy_flushcache(void *dst, const void *src, size_t cnt);
|
||||
#endif
|
||||
|
||||
#endif /* __KERNEL__ */
|
||||
|
||||
#endif /* _ASM_X86_STRING_64_H */
|
||||
|
|
|
@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
|
|||
extern long __copy_user_nocache(void *dst, const void __user *src,
|
||||
unsigned size, int zerorest);
|
||||
|
||||
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
|
||||
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
|
||||
size_t len);
|
||||
|
||||
static inline int
|
||||
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
|
||||
unsigned size)
|
||||
|
@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
|
|||
return __copy_user_nocache(dst, src, size, 0);
|
||||
}
|
||||
|
||||
static inline int
|
||||
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
|
||||
{
|
||||
kasan_check_write(dst, size);
|
||||
return __copy_user_flushcache(dst, src, size);
|
||||
}
|
||||
|
||||
unsigned long
|
||||
copy_user_handle_tail(char *to, char *from, unsigned len);
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
*/
|
||||
#include <linux/export.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/highmem.h>
|
||||
|
||||
/*
|
||||
* Zero Userspace
|
||||
|
@ -73,3 +74,130 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
|
|||
clac();
|
||||
return len;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
/**
|
||||
* clean_cache_range - write back a cache range with CLWB
|
||||
* @vaddr: virtual start address
|
||||
* @size: number of bytes to write back
|
||||
*
|
||||
* Write back a cache range using the CLWB (cache line write back)
|
||||
* instruction. Note that @size is internally rounded up to be cache
|
||||
* line size aligned.
|
||||
*/
|
||||
static void clean_cache_range(void *addr, size_t size)
|
||||
{
|
||||
u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
|
||||
unsigned long clflush_mask = x86_clflush_size - 1;
|
||||
void *vend = addr + size;
|
||||
void *p;
|
||||
|
||||
for (p = (void *)((unsigned long)addr & ~clflush_mask);
|
||||
p < vend; p += x86_clflush_size)
|
||||
clwb(p);
|
||||
}
|
||||
|
||||
long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
|
||||
{
|
||||
unsigned long flushed, dest = (unsigned long) dst;
|
||||
long rc = __copy_user_nocache(dst, src, size, 0);
|
||||
|
||||
/*
|
||||
* __copy_user_nocache() uses non-temporal stores for the bulk
|
||||
* of the transfer, but we need to manually flush if the
|
||||
* transfer is unaligned. A cached memory copy is used when
|
||||
* destination or size is not naturally aligned. That is:
|
||||
* - Require 8-byte alignment when size is 8 bytes or larger.
|
||||
* - Require 4-byte alignment when size is 4 bytes.
|
||||
*/
|
||||
if (size < 8) {
|
||||
if (!IS_ALIGNED(dest, 4) || size != 4)
|
||||
clean_cache_range(dst, 1);
|
||||
} else {
|
||||
if (!IS_ALIGNED(dest, 8)) {
|
||||
dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
|
||||
clean_cache_range(dst, 1);
|
||||
}
|
||||
|
||||
flushed = dest - (unsigned long) dst;
|
||||
if (size > flushed && !IS_ALIGNED(size - flushed, 8))
|
||||
clean_cache_range(dst + size - 1, 1);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
void memcpy_flushcache(void *_dst, const void *_src, size_t size)
|
||||
{
|
||||
unsigned long dest = (unsigned long) _dst;
|
||||
unsigned long source = (unsigned long) _src;
|
||||
|
||||
/* cache copy and flush to align dest */
|
||||
if (!IS_ALIGNED(dest, 8)) {
|
||||
unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
|
||||
|
||||
memcpy((void *) dest, (void *) source, len);
|
||||
clean_cache_range((void *) dest, len);
|
||||
dest += len;
|
||||
source += len;
|
||||
size -= len;
|
||||
if (!size)
|
||||
return;
|
||||
}
|
||||
|
||||
/* 4x8 movnti loop */
|
||||
while (size >= 32) {
|
||||
asm("movq (%0), %%r8\n"
|
||||
"movq 8(%0), %%r9\n"
|
||||
"movq 16(%0), %%r10\n"
|
||||
"movq 24(%0), %%r11\n"
|
||||
"movnti %%r8, (%1)\n"
|
||||
"movnti %%r9, 8(%1)\n"
|
||||
"movnti %%r10, 16(%1)\n"
|
||||
"movnti %%r11, 24(%1)\n"
|
||||
:: "r" (source), "r" (dest)
|
||||
: "memory", "r8", "r9", "r10", "r11");
|
||||
dest += 32;
|
||||
source += 32;
|
||||
size -= 32;
|
||||
}
|
||||
|
||||
/* 1x8 movnti loop */
|
||||
while (size >= 8) {
|
||||
asm("movq (%0), %%r8\n"
|
||||
"movnti %%r8, (%1)\n"
|
||||
:: "r" (source), "r" (dest)
|
||||
: "memory", "r8");
|
||||
dest += 8;
|
||||
source += 8;
|
||||
size -= 8;
|
||||
}
|
||||
|
||||
/* 1x4 movnti loop */
|
||||
while (size >= 4) {
|
||||
asm("movl (%0), %%r8d\n"
|
||||
"movnti %%r8d, (%1)\n"
|
||||
:: "r" (source), "r" (dest)
|
||||
: "memory", "r8");
|
||||
dest += 4;
|
||||
source += 4;
|
||||
size -= 4;
|
||||
}
|
||||
|
||||
/* cache copy for remaining bytes */
|
||||
if (size) {
|
||||
memcpy((void *) dest, (void *) source, size);
|
||||
clean_cache_range((void *) dest, size);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memcpy_flushcache);
|
||||
|
||||
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
|
||||
size_t len)
|
||||
{
|
||||
char *from = kmap_atomic(page);
|
||||
|
||||
memcpy_flushcache(to, from + offset, len);
|
||||
kunmap_atomic(from);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1842,8 +1842,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
|
|||
}
|
||||
|
||||
if (rw)
|
||||
memcpy_to_pmem(mmio->addr.aperture + offset,
|
||||
iobuf + copied, c);
|
||||
memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
|
||||
else {
|
||||
if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
|
||||
mmio_flush_range((void __force *)
|
||||
|
|
|
@ -277,7 +277,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
|
|||
rc = -EIO;
|
||||
}
|
||||
|
||||
memcpy_to_pmem(nsio->addr + offset, buf, size);
|
||||
memcpy_flushcache(nsio->addr + offset, buf, size);
|
||||
nvdimm_flush(to_nd_region(ndns->dev.parent));
|
||||
|
||||
return rc;
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include <linux/pfn_t.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pmem.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/nd.h>
|
||||
#include "pmem.h"
|
||||
|
@ -80,7 +81,7 @@ static void write_pmem(void *pmem_addr, struct page *page,
|
|||
{
|
||||
void *mem = kmap_atomic(page);
|
||||
|
||||
memcpy_to_pmem(pmem_addr, mem + off, len);
|
||||
memcpy_flushcache(pmem_addr, mem + off, len);
|
||||
kunmap_atomic(mem);
|
||||
}
|
||||
|
||||
|
@ -235,8 +236,15 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
|
|||
return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
|
||||
}
|
||||
|
||||
static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
return copy_from_iter_flushcache(addr, bytes, i);
|
||||
}
|
||||
|
||||
static const struct dax_operations pmem_dax_ops = {
|
||||
.direct_access = pmem_dax_direct_access,
|
||||
.copy_from_iter = pmem_copy_from_iter,
|
||||
};
|
||||
|
||||
static void pmem_release_queue(void *q)
|
||||
|
@ -294,7 +302,8 @@ static int pmem_attach_disk(struct device *dev,
|
|||
dev_set_drvdata(dev, pmem);
|
||||
pmem->phys_addr = res->start;
|
||||
pmem->size = resource_size(res);
|
||||
if (nvdimm_has_flush(nd_region) < 0)
|
||||
if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE)
|
||||
|| nvdimm_has_flush(nd_region) < 0)
|
||||
dev_warn(dev, "unable to guarantee persistence of writes\n");
|
||||
|
||||
if (!devm_request_mem_region(dev, res->start, resource_size(res),
|
||||
|
|
|
@ -1015,8 +1015,8 @@ void nvdimm_flush(struct nd_region *nd_region)
|
|||
* The first wmb() is needed to 'sfence' all previous writes
|
||||
* such that they are architecturally visible for the platform
|
||||
* buffer flush. Note that we've already arranged for pmem
|
||||
* writes to avoid the cache via arch_memcpy_to_pmem(). The
|
||||
* final wmb() ensures ordering for the NVDIMM flush write.
|
||||
* writes to avoid the cache via memcpy_flushcache(). The final
|
||||
* wmb() ensures ordering for the NVDIMM flush write.
|
||||
*/
|
||||
wmb();
|
||||
for (i = 0; i < nd_region->ndr_mappings; i++)
|
||||
|
|
|
@ -16,6 +16,9 @@ struct dax_operations {
|
|||
*/
|
||||
long (*direct_access)(struct dax_device *, pgoff_t, long,
|
||||
void **, pfn_t *);
|
||||
/* copy_from_iter: dax-driver override for default copy_from_iter */
|
||||
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
|
||||
struct iov_iter *);
|
||||
};
|
||||
|
||||
#if IS_ENABLED(CONFIG_DAX)
|
||||
|
|
|
@ -122,6 +122,12 @@ static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
|
|||
return 0;
|
||||
}
|
||||
#endif
|
||||
#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
|
||||
static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
|
||||
{
|
||||
memcpy(dst, src, cnt);
|
||||
}
|
||||
#endif
|
||||
void *memchr_inv(const void *s, int c, size_t n);
|
||||
char *strreplace(char *s, char old, char new);
|
||||
|
||||
|
|
|
@ -95,6 +95,21 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
|
|||
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
|
||||
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
|
||||
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
|
||||
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
/*
|
||||
* Note, users like pmem that depend on the stricter semantics of
|
||||
* copy_from_iter_flushcache() than copy_from_iter_nocache() must check for
|
||||
* IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
|
||||
* destination is flushed from the cache on return.
|
||||
*/
|
||||
size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
|
||||
#else
|
||||
static inline size_t copy_from_iter_flushcache(void *addr, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
return copy_from_iter_nocache(addr, bytes, i);
|
||||
}
|
||||
#endif
|
||||
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
|
||||
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
|
||||
unsigned long iov_iter_alignment(const struct iov_iter *i);
|
||||
|
|
|
@ -548,6 +548,9 @@ config ARCH_HAS_SG_CHAIN
|
|||
config ARCH_HAS_PMEM_API
|
||||
bool
|
||||
|
||||
config ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
bool
|
||||
|
||||
config ARCH_HAS_MMIO_FLUSH
|
||||
bool
|
||||
|
||||
|
|
|
@ -615,6 +615,28 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
|||
}
|
||||
EXPORT_SYMBOL(copy_from_iter_nocache);
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
char *to = addr;
|
||||
if (unlikely(i->type & ITER_PIPE)) {
|
||||
WARN_ON(1);
|
||||
return 0;
|
||||
}
|
||||
iterate_and_advance(i, bytes, v,
|
||||
__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
|
||||
v.iov_base, v.iov_len),
|
||||
memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
|
||||
v.bv_offset, v.bv_len),
|
||||
memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
|
||||
v.iov_len)
|
||||
)
|
||||
|
||||
return bytes;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(copy_from_iter_flushcache);
|
||||
#endif
|
||||
|
||||
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
char *to = addr;
|
||||
|
|
Loading…
Reference in New Issue