linux/mm/fadvise.c

152 lines
3.3 KiB
C
Raw Normal View History

/*
* mm/fadvise.c
*
* Copyright (C) 2002, Linus Torvalds
*
* 11Jan2003 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
int ret = 0;
if (!file)
return -EBADF;
if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
mapping = file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
goto out;
}
if (mapping->a_ops->get_xip_mem) {
check ADVICE of fadvise64_64 even if get_xip_page is given I've written some test programs in ltp project. During writing I met an problem which I cannot solve in user land. So I wrote a patch for linux kernel. Please, include this patch if acceptable. The test program tests the 4th parameter of fadvise64_64: long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); My test case calls fadvise64_64 with invalid advice value and checks errno is set to EINVAL. About the advice parameter man page says: ... Permissible values for advice include: POSIX_FADV_NORMAL ... POSIX_FADV_SEQUENTIAL ... POSIX_FADV_RANDOM ... POSIX_FADV_NOREUSE ... POSIX_FADV_WILLNEED ... POSIX_FADV_DONTNEED ... ERRORS ... EINVAL An invalid value was specified for advice. However, I got a bug report that the system call invocations in my test case returned 0 unexpectedly. I've inspected the kernel code: asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; int ret = 0; if (!file) return -EBADF; if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { ret = -ESPIPE; goto out; } mapping = file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; goto out; } if (mapping->a_ops->get_xip_page) /* no bad return value, but ignore advice */ goto out; ... out: fput(file); return ret; } I found the advice parameter is just ignored in the case mapping->a_ops->get_xip_page is given. This behavior is different from what is written on the man page. Is this o.k.? get_xip_page is given if CONFIG_EXT2_FS_XIP is true. Anyway I cannot find the easy way to detect get_xip_page field is given or CONFIG_EXT2_FS_XIP is true from the user space. I propose the following patch which checks the advice parameter even if get_xip_page is given. Signed-off-by: Masatake YAMATO <yamato@redhat.com> Acked-by: Carsten Otte <cotte@de.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:31 +08:00
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
case POSIX_FADV_WILLNEED:
case POSIX_FADV_NOREUSE:
case POSIX_FADV_DONTNEED:
/* no bad return value, but ignore advice */
break;
default:
ret = -EINVAL;
}
goto out;
check ADVICE of fadvise64_64 even if get_xip_page is given I've written some test programs in ltp project. During writing I met an problem which I cannot solve in user land. So I wrote a patch for linux kernel. Please, include this patch if acceptable. The test program tests the 4th parameter of fadvise64_64: long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); My test case calls fadvise64_64 with invalid advice value and checks errno is set to EINVAL. About the advice parameter man page says: ... Permissible values for advice include: POSIX_FADV_NORMAL ... POSIX_FADV_SEQUENTIAL ... POSIX_FADV_RANDOM ... POSIX_FADV_NOREUSE ... POSIX_FADV_WILLNEED ... POSIX_FADV_DONTNEED ... ERRORS ... EINVAL An invalid value was specified for advice. However, I got a bug report that the system call invocations in my test case returned 0 unexpectedly. I've inspected the kernel code: asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; int ret = 0; if (!file) return -EBADF; if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { ret = -ESPIPE; goto out; } mapping = file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; goto out; } if (mapping->a_ops->get_xip_page) /* no bad return value, but ignore advice */ goto out; ... out: fput(file); return ret; } I found the advice parameter is just ignored in the case mapping->a_ops->get_xip_page is given. This behavior is different from what is written on the man page. Is this o.k.? get_xip_page is given if CONFIG_EXT2_FS_XIP is true. Anyway I cannot find the easy way to detect get_xip_page field is given or CONFIG_EXT2_FS_XIP is true from the user space. I propose the following patch which checks the advice parameter even if get_xip_page is given. Signed-off-by: Masatake YAMATO <yamato@redhat.com> Acked-by: Carsten Otte <cotte@de.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:31 +08:00
}
/* Careful about overflows. Len == 0 means "as much as possible" */
endbyte = offset + len;
if (!len || endbyte < len)
endbyte = -1;
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
else
endbyte--; /* inclusive */
bdi = mapping->backing_dev_info;
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
break;
case POSIX_FADV_RANDOM:
file->f_ra.ra_pages = 0;
break;
case POSIX_FADV_SEQUENTIAL:
file->f_ra.ra_pages = bdi->ra_pages * 2;
break;
case POSIX_FADV_WILLNEED:
if (!mapping->a_ops->readpage) {
ret = -EINVAL;
break;
}
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
end_index = endbyte >> PAGE_CACHE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
ret = force_page_cache_readahead(mapping, file,
start_index,
max_sane_readahead(nrpages));
if (ret > 0)
ret = 0;
break;
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
if (!bdi_write_congested(mapping->backing_dev_info))
filemap_flush(mapping);
/* First and last FULL page! */
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
[PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:18:04 +08:00
if (end_index >= start_index)
invalidate_mapping_pages(mapping, start_index,
end_index);
break;
default:
ret = -EINVAL;
}
out:
fput(file);
return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
{
return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
}
SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
#endif
#ifdef __ARCH_WANT_SYS_FADVISE64
SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
{
return sys_fadvise64_64(fd, offset, len, advice);
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
{
return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
}
SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
#endif
#endif