2006-10-11 16:20:50 +08:00
|
|
|
/*
|
2006-10-11 16:20:53 +08:00
|
|
|
* linux/fs/ext4/file.c
|
2006-10-11 16:20:50 +08:00
|
|
|
*
|
|
|
|
* Copyright (C) 1992, 1993, 1994, 1995
|
|
|
|
* Remy Card (card@masi.ibp.fr)
|
|
|
|
* Laboratoire MASI - Institut Blaise Pascal
|
|
|
|
* Universite Pierre et Marie Curie (Paris VI)
|
|
|
|
*
|
|
|
|
* from
|
|
|
|
*
|
|
|
|
* linux/fs/minix/file.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
|
|
*
|
2006-10-11 16:20:53 +08:00
|
|
|
* ext4 fs regular file handling primitives
|
2006-10-11 16:20:50 +08:00
|
|
|
*
|
|
|
|
* 64-bit file support on 64-bit platforms by Jakub Jelinek
|
|
|
|
* (jj@sunsite.ms.mff.cuni.cz)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/fs.h>
|
2009-06-13 22:09:48 +08:00
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/path.h>
|
2015-09-09 05:58:40 +08:00
|
|
|
#include <linux/dax.h>
|
2010-03-03 22:05:07 +08:00
|
|
|
#include <linux/quotaops.h>
|
2012-11-09 10:57:40 +08:00
|
|
|
#include <linux/pagevec.h>
|
2015-02-23 00:58:50 +08:00
|
|
|
#include <linux/uio.h>
|
2008-04-30 06:13:32 +08:00
|
|
|
#include "ext4.h"
|
|
|
|
#include "ext4_jbd2.h"
|
2006-10-11 16:20:50 +08:00
|
|
|
#include "xattr.h"
|
|
|
|
#include "acl.h"
|
|
|
|
|
2016-11-21 06:36:06 +08:00
|
|
|
#ifdef CONFIG_FS_DAX
|
|
|
|
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
inode_lock_shared(inode);
|
|
|
|
/*
|
|
|
|
* Recheck under inode lock - at this point we are sure it cannot
|
|
|
|
* change anymore
|
|
|
|
*/
|
|
|
|
if (!IS_DAX(inode)) {
|
|
|
|
inode_unlock_shared(inode);
|
|
|
|
/* Fallback to buffered IO in case we cannot support DAX */
|
|
|
|
return generic_file_read_iter(iocb, to);
|
|
|
|
}
|
|
|
|
ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
|
|
|
|
inode_unlock_shared(inode);
|
|
|
|
|
|
|
|
file_accessed(iocb->ki_filp);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
|
|
{
|
2017-02-05 14:28:48 +08:00
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
|
|
|
|
return -EIO;
|
|
|
|
|
2016-11-21 06:36:06 +08:00
|
|
|
if (!iov_iter_count(to))
|
|
|
|
return 0; /* skip atime */
|
|
|
|
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
|
|
if (IS_DAX(file_inode(iocb->ki_filp)))
|
|
|
|
return ext4_dax_read_iter(iocb, to);
|
|
|
|
#endif
|
|
|
|
return generic_file_read_iter(iocb, to);
|
|
|
|
}
|
|
|
|
|
2006-10-11 16:20:50 +08:00
|
|
|
/*
|
|
|
|
* Called when an inode is released. Note that this is different
|
2006-10-11 16:20:53 +08:00
|
|
|
* from ext4_file_open: open gets called at every open, but release
|
2006-10-11 16:20:50 +08:00
|
|
|
* gets called only when /all/ the files are closed.
|
|
|
|
*/
|
2008-09-09 10:25:24 +08:00
|
|
|
static int ext4_release_file(struct inode *inode, struct file *filp)
|
2006-10-11 16:20:50 +08:00
|
|
|
{
|
2010-01-25 03:34:07 +08:00
|
|
|
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
|
2009-02-24 21:21:14 +08:00
|
|
|
ext4_alloc_da_blocks(inode);
|
2010-01-25 03:34:07 +08:00
|
|
|
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
|
2009-02-24 21:21:14 +08:00
|
|
|
}
|
2006-10-11 16:20:50 +08:00
|
|
|
/* if we are the last writer on the inode, drop the block reservation */
|
|
|
|
if ((filp->f_mode & FMODE_WRITE) &&
|
2009-03-28 10:36:43 +08:00
|
|
|
(atomic_read(&inode->i_writecount) == 1) &&
|
|
|
|
!EXT4_I(inode)->i_reserved_data_blocks)
|
2006-10-11 16:20:50 +08:00
|
|
|
{
|
2008-01-29 12:58:26 +08:00
|
|
|
down_write(&EXT4_I(inode)->i_data_sem);
|
2008-10-10 21:40:52 +08:00
|
|
|
ext4_discard_preallocations(inode);
|
2008-01-29 12:58:26 +08:00
|
|
|
up_write(&EXT4_I(inode)->i_data_sem);
|
2006-10-11 16:20:50 +08:00
|
|
|
}
|
|
|
|
if (is_dx(inode) && filp->private_data)
|
2006-10-11 16:20:53 +08:00
|
|
|
ext4_htree_free_dir_info(filp->private_data);
|
2006-10-11 16:20:50 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-05-12 22:50:23 +08:00
|
|
|
static void ext4_unwritten_wait(struct inode *inode)
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
{
|
|
|
|
wait_queue_head_t *wq = ext4_ioend_wq(inode);
|
|
|
|
|
2012-09-29 11:24:52 +08:00
|
|
|
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This tests whether the IO in question is block-aligned or not.
|
|
|
|
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
|
|
|
|
* are converted to written only after the IO is complete. Until they are
|
|
|
|
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
|
|
|
|
* it needs to zero out portions of the start and/or end block. If 2 AIO
|
|
|
|
* threads are at work on the same unwritten block, they must be synchronized
|
|
|
|
* or one thread will zero the other's data, causing corruption.
|
|
|
|
*/
|
|
|
|
static int
|
2014-04-18 04:09:22 +08:00
|
|
|
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
{
|
|
|
|
struct super_block *sb = inode->i_sb;
|
|
|
|
int blockmask = sb->s_blocksize - 1;
|
|
|
|
|
2014-04-13 00:45:25 +08:00
|
|
|
if (pos >= i_size_read(inode))
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
return 0;
|
|
|
|
|
2014-04-18 04:09:22 +08:00
|
|
|
if ((pos | iov_iter_alignment(from)) & blockmask)
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-21 06:29:51 +08:00
|
|
|
/* Is IO overwriting allocated and initialized blocks? */
|
|
|
|
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
|
|
|
|
{
|
|
|
|
struct ext4_map_blocks map;
|
|
|
|
unsigned int blkbits = inode->i_blkbits;
|
|
|
|
int err, blklen;
|
|
|
|
|
|
|
|
if (pos + len > i_size_read(inode))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
map.m_lblk = pos >> blkbits;
|
|
|
|
map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
|
|
|
|
blklen = map.m_len;
|
|
|
|
|
|
|
|
err = ext4_map_blocks(NULL, inode, &map, 0);
|
|
|
|
/*
|
|
|
|
* 'err==len' means that all of the blocks have been preallocated,
|
|
|
|
* regardless of whether they have been initialized or not. To exclude
|
|
|
|
* unwritten extents, we need to check m_flags.
|
|
|
|
*/
|
|
|
|
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
ret = generic_write_checks(iocb, from);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
/*
|
|
|
|
* If we have encountered a bitmap-format file, the size limit
|
|
|
|
* is smaller than s_maxbytes, which is for extent-mapped files.
|
|
|
|
*/
|
|
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
|
|
|
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
|
|
|
|
|
|
if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
|
|
|
|
return -EFBIG;
|
|
|
|
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
|
|
|
|
}
|
|
|
|
return iov_iter_count(from);
|
|
|
|
}
|
|
|
|
|
2016-11-21 07:09:11 +08:00
|
|
|
#ifdef CONFIG_FS_DAX
|
|
|
|
static ssize_t
|
|
|
|
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
inode_lock(inode);
|
|
|
|
ret = ext4_write_checks(iocb, from);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
ret = file_remove_privs(iocb->ki_filp);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
ret = file_update_time(iocb->ki_filp);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
|
|
|
|
out:
|
2017-02-09 03:39:27 +08:00
|
|
|
inode_unlock(inode);
|
2016-11-21 07:09:11 +08:00
|
|
|
if (ret > 0)
|
|
|
|
ret = generic_write_sync(iocb, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-10-11 16:20:50 +08:00
|
|
|
static ssize_t
|
2014-04-18 04:09:22 +08:00
|
|
|
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
2006-10-11 16:20:50 +08:00
|
|
|
{
|
2014-04-22 02:26:57 +08:00
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
2015-04-10 01:52:01 +08:00
|
|
|
int o_direct = iocb->ki_flags & IOCB_DIRECT;
|
2016-03-09 11:44:50 +08:00
|
|
|
int unaligned_aio = 0;
|
2012-07-23 08:19:31 +08:00
|
|
|
int overwrite = 0;
|
2012-05-29 06:06:51 +08:00
|
|
|
ssize_t ret;
|
2014-04-22 02:26:28 +08:00
|
|
|
|
2017-02-05 14:28:48 +08:00
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
|
|
return -EIO;
|
|
|
|
|
2016-11-21 07:09:11 +08:00
|
|
|
#ifdef CONFIG_FS_DAX
|
|
|
|
if (IS_DAX(inode))
|
|
|
|
return ext4_dax_write_iter(iocb, from);
|
|
|
|
#endif
|
|
|
|
|
2016-03-09 11:44:50 +08:00
|
|
|
inode_lock(inode);
|
2016-11-21 06:29:51 +08:00
|
|
|
ret = ext4_write_checks(iocb, from);
|
2016-03-09 11:44:50 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
|
2014-04-22 02:37:52 +08:00
|
|
|
/*
|
2016-03-09 11:44:50 +08:00
|
|
|
* Unaligned direct AIO must be serialized among each other as zeroing
|
|
|
|
* of partial blocks of two competing unaligned AIOs can result in data
|
|
|
|
* corruption.
|
2014-04-22 02:37:52 +08:00
|
|
|
*/
|
2016-03-09 11:44:50 +08:00
|
|
|
if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
|
2014-04-22 02:37:52 +08:00
|
|
|
!is_sync_kiocb(iocb) &&
|
2016-03-09 11:44:50 +08:00
|
|
|
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
|
|
|
|
unaligned_aio = 1;
|
2014-04-22 02:37:52 +08:00
|
|
|
ext4_unwritten_wait(inode);
|
|
|
|
}
|
|
|
|
|
2014-10-30 22:53:16 +08:00
|
|
|
iocb->private = &overwrite;
|
2016-11-21 06:29:51 +08:00
|
|
|
/* Check whether we do a DIO overwrite or not */
|
|
|
|
if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
|
|
|
|
ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
|
|
|
|
overwrite = 1;
|
2014-04-22 02:26:28 +08:00
|
|
|
|
2014-04-18 04:09:22 +08:00
|
|
|
ret = __generic_file_write_iter(iocb, from);
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2014-04-22 02:26:28 +08:00
|
|
|
|
2016-04-07 23:52:01 +08:00
|
|
|
if (ret > 0)
|
|
|
|
ret = generic_write_sync(iocb, ret);
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
|
2015-04-08 02:48:22 +08:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
out:
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.
The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions. When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.
Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO. I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write(). But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.
I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex. So that won't work.
This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment. I've
tested a backport of this patch with qemu, and it does
avoid the corruption. It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.
Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.
[tytso@mit.edu: Keep the mutex as a hashed array instead
of bloating the ext4 inode]
[tytso@mit.edu: Fix up namespace issues so that global
variables are protected with an "ext4_" prefix.]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-02-12 21:17:34 +08:00
|
|
|
return ret;
|
2006-10-11 16:20:50 +08:00
|
|
|
}
|
|
|
|
|
2015-02-17 07:59:38 +08:00
|
|
|
#ifdef CONFIG_FS_DAX
|
2017-02-25 06:56:41 +08:00
|
|
|
static int ext4_dax_fault(struct vm_fault *vmf)
|
2015-02-17 07:59:38 +08:00
|
|
|
{
|
2015-09-09 05:59:22 +08:00
|
|
|
int result;
|
2017-02-25 06:56:41 +08:00
|
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
2015-12-08 03:28:03 +08:00
|
|
|
struct super_block *sb = inode->i_sb;
|
2015-09-09 05:59:22 +08:00
|
|
|
bool write = vmf->flags & FAULT_FLAG_WRITE;
|
|
|
|
|
|
|
|
if (write) {
|
|
|
|
sb_start_pagefault(sb);
|
2017-02-25 06:56:41 +08:00
|
|
|
file_update_time(vmf->vma->vm_file);
|
2016-10-21 17:33:49 +08:00
|
|
|
}
|
|
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
2017-02-25 06:56:41 +08:00
|
|
|
result = dax_iomap_fault(vmf, &ext4_iomap_ops);
|
2016-10-21 17:33:49 +08:00
|
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
|
|
if (write)
|
2015-09-09 05:59:22 +08:00
|
|
|
sb_end_pagefault(sb);
|
|
|
|
|
|
|
|
return result;
|
2015-02-17 07:59:38 +08:00
|
|
|
}
|
|
|
|
|
2015-12-08 03:28:03 +08:00
|
|
|
/*
|
2016-02-28 03:01:13 +08:00
|
|
|
* Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
|
2015-12-08 03:28:03 +08:00
|
|
|
* handler we check for races agaist truncate. Note that since we cycle through
|
|
|
|
* i_mmap_sem, we are sure that also any hole punching that began before we
|
|
|
|
* were called is finished by now and so if it included part of the file we
|
|
|
|
* are working on, our pte will get unmapped and the check for pte_same() in
|
|
|
|
* wp_pfn_shared() fails. Thus fault gets retried and things work out as
|
|
|
|
* desired.
|
|
|
|
*/
|
2017-02-25 06:56:41 +08:00
|
|
|
static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
|
2015-12-08 03:28:03 +08:00
|
|
|
{
|
2017-02-25 06:56:41 +08:00
|
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
2015-12-08 03:28:03 +08:00
|
|
|
struct super_block *sb = inode->i_sb;
|
|
|
|
loff_t size;
|
2016-01-23 07:10:53 +08:00
|
|
|
int ret;
|
2015-12-08 03:28:03 +08:00
|
|
|
|
|
|
|
sb_start_pagefault(sb);
|
2017-02-25 06:56:41 +08:00
|
|
|
file_update_time(vmf->vma->vm_file);
|
2015-12-08 03:28:03 +08:00
|
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
|
|
|
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
|
|
if (vmf->pgoff >= size)
|
|
|
|
ret = VM_FAULT_SIGBUS;
|
2016-01-23 07:10:53 +08:00
|
|
|
else
|
2017-02-25 06:56:41 +08:00
|
|
|
ret = dax_pfn_mkwrite(vmf);
|
2015-12-08 03:28:03 +08:00
|
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
|
|
sb_end_pagefault(sb);
|
|
|
|
|
|
|
|
return ret;
|
2015-02-17 07:59:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
|
|
|
.fault = ext4_dax_fault,
|
mm,fs,dax: change ->pmd_fault to ->huge_fault
Patch series "1G transparent hugepage support for device dax", v2.
The following series implements support for 1G trasparent hugepage on
x86 for device dax. The bulk of the code was written by Mathew Wilcox a
while back supporting transparent 1G hugepage for fs DAX. I have
forward ported the relevant bits to 4.10-rc. The current submission has
only the necessary code to support device DAX.
Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs. Given expected capacities of persistent memory devices an
in-memory database may want to reduce tlb pressure beyond what they can
already achieve with 2MB mappings of a device-dax file. We have
customer feedback to that effect as Willy mentioned in his previous
version of these patches [1].
[1]: https://lkml.org/lkml/2016/1/31/52
Comments from Nilesh @ Oracle:
There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server; we are looking at the following:
processes : 10,000
memory : 6TB
pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size: 120,000 / 512 = ~240GB
pud @ 1G page size: 240GB / 512 = ~480MB
As you can see with 2M pages, this system will use up an exorbitant
amount of DRAM to hold the page tables; but the 1G pages finally brings
it down to a reasonable level. Memory sizes will keep increasing; so
this number will keep increasing.
An argument can be made to convert the applications from process model
to thread model, but in the real world that may not be always practical.
Hopefully this helps explain the use case where this is valuable.
This patch (of 3):
In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The
vm_fault structure is extended to include a union of the different page
table pointers that may be needed, and three flag bits are reserved to
indicate which type of pointer is in the union.
[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-25 06:56:59 +08:00
|
|
|
.huge_fault = ext4_dax_fault,
|
2016-02-28 03:01:13 +08:00
|
|
|
.page_mkwrite = ext4_dax_fault,
|
2015-12-08 03:28:03 +08:00
|
|
|
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
|
2015-02-17 07:59:38 +08:00
|
|
|
};
|
|
|
|
#else
|
|
|
|
#define ext4_dax_vm_ops ext4_file_vm_ops
|
|
|
|
#endif
|
|
|
|
|
2009-09-28 02:29:37 +08:00
|
|
|
static const struct vm_operations_struct ext4_file_vm_ops = {
|
2015-12-08 03:28:03 +08:00
|
|
|
.fault = ext4_filemap_fault,
|
2014-04-08 06:37:19 +08:00
|
|
|
.map_pages = filemap_map_pages,
|
2008-07-12 07:27:31 +08:00
|
|
|
.page_mkwrite = ext4_page_mkwrite,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
2015-04-12 12:56:10 +08:00
|
|
|
struct inode *inode = file->f_mapping->host;
|
|
|
|
|
2017-02-05 14:28:48 +08:00
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
|
|
return -EIO;
|
|
|
|
|
2015-04-12 12:56:10 +08:00
|
|
|
if (ext4_encrypted_inode(inode)) {
|
2016-07-11 02:01:03 +08:00
|
|
|
int err = fscrypt_get_encryption_info(inode);
|
2015-04-12 12:56:10 +08:00
|
|
|
if (err)
|
|
|
|
return 0;
|
2016-07-11 02:01:03 +08:00
|
|
|
if (!fscrypt_has_encryption_key(inode))
|
2015-06-01 01:35:39 +08:00
|
|
|
return -ENOKEY;
|
2015-04-12 12:56:10 +08:00
|
|
|
}
|
2008-07-12 07:27:31 +08:00
|
|
|
file_accessed(file);
|
2015-02-17 07:59:38 +08:00
|
|
|
if (IS_DAX(file_inode(file))) {
|
|
|
|
vma->vm_ops = &ext4_dax_vm_ops;
|
2015-09-09 05:59:03 +08:00
|
|
|
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
2015-02-17 07:59:38 +08:00
|
|
|
} else {
|
|
|
|
vma->vm_ops = &ext4_file_vm_ops;
|
|
|
|
}
|
2008-07-12 07:27:31 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-06-13 22:09:48 +08:00
|
|
|
static int ext4_file_open(struct inode * inode, struct file * filp)
|
|
|
|
{
|
|
|
|
struct super_block *sb = inode->i_sb;
|
|
|
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
|
|
struct vfsmount *mnt = filp->f_path.mnt;
|
2016-03-27 04:14:41 +08:00
|
|
|
struct dentry *dir;
|
2009-06-13 22:09:48 +08:00
|
|
|
struct path path;
|
|
|
|
char buf[64], *cp;
|
2015-04-12 12:56:10 +08:00
|
|
|
int ret;
|
2009-06-13 22:09:48 +08:00
|
|
|
|
2017-02-05 14:28:48 +08:00
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
|
|
return -EIO;
|
|
|
|
|
2009-06-13 22:09:48 +08:00
|
|
|
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
|
|
|
|
!(sb->s_flags & MS_RDONLY))) {
|
|
|
|
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
|
|
|
|
/*
|
|
|
|
* Sample where the filesystem has been mounted and
|
|
|
|
* store it in the superblock for sysadmin convenience
|
|
|
|
* when trying to sort through large numbers of block
|
|
|
|
* devices or filesystem images.
|
|
|
|
*/
|
|
|
|
memset(buf, 0, sizeof(buf));
|
2010-01-24 09:10:29 +08:00
|
|
|
path.mnt = mnt;
|
|
|
|
path.dentry = mnt->mnt_root;
|
2009-06-13 22:09:48 +08:00
|
|
|
cp = d_path(&path, buf, sizeof(buf));
|
|
|
|
if (!IS_ERR(cp)) {
|
2012-07-23 08:31:31 +08:00
|
|
|
handle_t *handle;
|
|
|
|
int err;
|
|
|
|
|
2013-02-09 10:59:22 +08:00
|
|
|
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
|
2012-07-23 08:31:31 +08:00
|
|
|
if (IS_ERR(handle))
|
|
|
|
return PTR_ERR(handle);
|
2014-05-13 10:06:43 +08:00
|
|
|
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
|
2012-07-23 08:31:31 +08:00
|
|
|
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
|
|
|
|
if (err) {
|
|
|
|
ext4_journal_stop(handle);
|
|
|
|
return err;
|
|
|
|
}
|
2011-10-25 21:18:41 +08:00
|
|
|
strlcpy(sbi->s_es->s_last_mounted, cp,
|
|
|
|
sizeof(sbi->s_es->s_last_mounted));
|
2012-07-23 08:31:31 +08:00
|
|
|
ext4_handle_dirty_super(handle, sb);
|
|
|
|
ext4_journal_stop(handle);
|
2009-06-13 22:09:48 +08:00
|
|
|
}
|
|
|
|
}
|
2015-06-01 01:35:39 +08:00
|
|
|
if (ext4_encrypted_inode(inode)) {
|
2016-07-11 02:01:03 +08:00
|
|
|
ret = fscrypt_get_encryption_info(inode);
|
2015-06-01 01:35:39 +08:00
|
|
|
if (ret)
|
|
|
|
return -EACCES;
|
2016-07-11 02:01:03 +08:00
|
|
|
if (!fscrypt_has_encryption_key(inode))
|
2015-06-01 01:35:39 +08:00
|
|
|
return -ENOKEY;
|
|
|
|
}
|
2016-03-27 04:14:41 +08:00
|
|
|
|
2016-03-27 04:14:42 +08:00
|
|
|
dir = dget_parent(file_dentry(filp));
|
2016-03-27 04:14:41 +08:00
|
|
|
if (ext4_encrypted_inode(d_inode(dir)) &&
|
2016-07-11 02:01:03 +08:00
|
|
|
!fscrypt_has_permitted_context(d_inode(dir), inode)) {
|
2016-02-08 13:54:26 +08:00
|
|
|
ext4_warning(inode->i_sb,
|
2016-04-27 13:11:21 +08:00
|
|
|
"Inconsistent encryption contexts: %lu/%lu",
|
2016-03-27 04:14:41 +08:00
|
|
|
(unsigned long) d_inode(dir)->i_ino,
|
2016-02-08 13:54:26 +08:00
|
|
|
(unsigned long) inode->i_ino);
|
2016-03-27 04:14:41 +08:00
|
|
|
dput(dir);
|
2016-02-08 13:54:26 +08:00
|
|
|
return -EPERM;
|
|
|
|
}
|
2016-03-27 04:14:41 +08:00
|
|
|
dput(dir);
|
2011-01-11 01:29:43 +08:00
|
|
|
/*
|
|
|
|
* Set up the jbd2_inode if we are opening the inode for
|
|
|
|
* writing and the journal is present
|
|
|
|
*/
|
2013-08-17 09:19:41 +08:00
|
|
|
if (filp->f_mode & FMODE_WRITE) {
|
2015-04-12 12:56:10 +08:00
|
|
|
ret = ext4_inode_attach_jinode(inode);
|
2013-08-17 09:19:41 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2011-01-11 01:29:43 +08:00
|
|
|
}
|
2015-06-01 01:35:39 +08:00
|
|
|
return dquot_file_open(inode, filp);
|
2009-06-13 22:09:48 +08:00
|
|
|
}
|
|
|
|
|
2012-11-09 10:57:40 +08:00
|
|
|
/*
 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 * file rather than ext4_ext_walk_space() because this lets us implement
 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
 * function.  When the extent status tree has been fully implemented, it
 * will track all extent status for a file and we can directly use it to
 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
 */
|
|
|
|
|
|
|
|
/*
 * ext4_find_unwritten_pgoff() - search the page cache over an unwritten
 * extent for the next data or hole offset.
 * @inode:   inode whose page cache (inode->i_mapping) is examined
 * @whence:  SEEK_DATA or SEEK_HOLE
 * @end_blk: logical block bounding the search; converted to the byte
 *           offset 'endoff' using the superblock's block size bits
 * @offset:  in: byte offset to start from; out: offset of the data/hole
 *           that was found (left untouched on some SEEK_HOLE paths, where
 *           the hole is at the starting offset itself)
 *
 * When retrieving the offset for SEEK_DATA/SEEK_HOLE we need to look at
 * the page cache to check whether there is data in [startoff, endoff]:
 * if the range is covered by an unwritten extent, whether it counts as
 * data or as a hole depends on whether the page cache holds data for it.
 * A buffer that is uptodate or unwritten is treated as data; any other
 * buffer is treated as a hole.
 *
 * Return: 1 if the requested data/hole was found, 0 otherwise.
 */
static int ext4_find_unwritten_pgoff(struct inode *inode,
				     int whence,
				     ext4_lblk_t end_blk,
				     loff_t *offset)
{
	struct pagevec pvec;
	unsigned int blkbits;
	pgoff_t index;
	pgoff_t end;
	loff_t endoff;
	loff_t startoff;
	loff_t lastoff;
	int found = 0;

	blkbits = inode->i_sb->s_blocksize_bits;
	startoff = *offset;
	/* lastoff tracks the first byte offset not yet examined. */
	lastoff = startoff;
	endoff = (loff_t)end_blk << blkbits;

	/* Page cache indices covering startoff and endoff. */
	index = startoff >> PAGE_SHIFT;
	end = endoff >> PAGE_SHIFT;

	pagevec_init(&pvec, 0);
	do {
		int i, num;
		unsigned long nr_pages;

		/*
		 * NOTE(review): num is 0 when index == end; confirm how
		 * pagevec_lookup() behaves for a zero-sized request.
		 */
		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  (pgoff_t)num);
		if (nr_pages == 0) {
			/* No cached pages: nothing that could be data. */
			if (whence == SEEK_DATA)
				break;

			BUG_ON(whence != SEEK_HOLE);
			/*
			 * If this is the first time to go into the loop and
			 * offset is not beyond the end offset, it will be a
			 * hole at this offset
			 */
			if (lastoff == startoff || lastoff < endoff)
				found = 1;
			break;
		}

		/*
		 * If this is the first time to go into the loop and
		 * offset is smaller than the first page offset, it will be a
		 * hole at this offset.
		 */
		if (lastoff == startoff && whence == SEEK_HOLE &&
		    lastoff < page_offset(pvec.pages[0])) {
			found = 1;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			struct buffer_head *bh, *head;

			/*
			 * If the current offset is not beyond the end of given
			 * range, it will be a hole.
			 */
			if (lastoff < endoff && whence == SEEK_HOLE &&
			    page->index > end) {
				found = 1;
				*offset = lastoff;
				goto out;
			}

			lock_page(page);

			/* Skip pages that no longer belong to this mapping. */
			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			/* Without buffers there is no state to inspect. */
			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			if (page_has_buffers(page)) {
				lastoff = page_offset(page);
				bh = head = page_buffers(page);
				/*
				 * Walk the page's buffer ring, advancing
				 * lastoff by each buffer's size.
				 */
				do {
					if (buffer_uptodate(bh) ||
					    buffer_unwritten(bh)) {
						if (whence == SEEK_DATA)
							found = 1;
					} else {
						if (whence == SEEK_HOLE)
							found = 1;
					}
					if (found) {
						/*
						 * Never report an offset
						 * before the starting one.
						 */
						*offset = max_t(loff_t,
							startoff, lastoff);
						unlock_page(page);
						goto out;
					}
					lastoff += bh->b_size;
					bh = bh->b_this_page;
				} while (bh != head);
			}

			/* Whole page examined; continue after its end. */
			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * The no. of pages is less than our desired, that would be a
		 * hole in there.
		 */
		if (nr_pages < num && whence == SEEK_HOLE) {
			found = 1;
			*offset = lastoff;
			break;
		}

		/* Resume the lookup after the last page of this batch. */
		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

out:
	/* Drop the page references still held by the pagevec. */
	pagevec_release(&pvec);
	return found;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ext4_seek_data() retrieves the offset for SEEK_DATA.
|
|
|
|
*/
|
|
|
|
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
|
|
|
|
{
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
2015-01-03 04:16:00 +08:00
|
|
|
struct extent_status es;
|
|
|
|
ext4_lblk_t start, last, end;
|
|
|
|
loff_t dataoff, isize;
|
|
|
|
int blkbits;
|
2016-03-10 12:11:13 +08:00
|
|
|
int ret;
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_lock(inode);
|
2015-01-03 04:16:00 +08:00
|
|
|
|
|
|
|
isize = i_size_read(inode);
|
|
|
|
if (offset >= isize) {
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2012-11-09 10:57:40 +08:00
|
|
|
return -ENXIO;
|
|
|
|
}
|
2015-01-03 04:16:00 +08:00
|
|
|
|
|
|
|
blkbits = inode->i_sb->s_blocksize_bits;
|
|
|
|
start = offset >> blkbits;
|
|
|
|
last = start;
|
|
|
|
end = isize >> blkbits;
|
|
|
|
dataoff = offset;
|
|
|
|
|
|
|
|
do {
|
2016-03-10 12:11:13 +08:00
|
|
|
ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
|
|
|
|
if (ret <= 0) {
|
|
|
|
/* No extent found -> no data */
|
|
|
|
if (ret == 0)
|
|
|
|
ret = -ENXIO;
|
|
|
|
inode_unlock(inode);
|
|
|
|
return ret;
|
2015-01-03 04:16:00 +08:00
|
|
|
}
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2016-03-10 12:11:13 +08:00
|
|
|
last = es.es_lblk;
|
|
|
|
if (last != start)
|
|
|
|
dataoff = (loff_t)last << blkbits;
|
|
|
|
if (!ext4_es_is_unwritten(&es))
|
2012-11-09 10:57:40 +08:00
|
|
|
break;
|
|
|
|
|
2015-01-03 04:16:00 +08:00
|
|
|
/*
|
|
|
|
* If there is a unwritten extent at this offset,
|
|
|
|
* it will be as a data or a hole according to page
|
|
|
|
* cache that has data or not.
|
|
|
|
*/
|
2016-03-10 12:11:13 +08:00
|
|
|
if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
|
|
|
|
es.es_lblk + es.es_len, &dataoff))
|
|
|
|
break;
|
|
|
|
last += es.es_len;
|
2015-01-03 04:16:00 +08:00
|
|
|
dataoff = (loff_t)last << blkbits;
|
2016-03-10 12:11:13 +08:00
|
|
|
cond_resched();
|
2015-01-03 04:16:00 +08:00
|
|
|
} while (last <= end);
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2015-01-03 04:16:00 +08:00
|
|
|
if (dataoff > isize)
|
|
|
|
return -ENXIO;
|
|
|
|
|
|
|
|
return vfs_setpos(file, dataoff, maxsize);
|
2012-11-09 10:57:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-01-03 04:16:00 +08:00
|
|
|
* ext4_seek_hole() retrieves the offset for SEEK_HOLE.
|
2012-11-09 10:57:40 +08:00
|
|
|
*/
|
|
|
|
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
|
|
|
|
{
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
2015-01-03 04:16:00 +08:00
|
|
|
struct extent_status es;
|
|
|
|
ext4_lblk_t start, last, end;
|
|
|
|
loff_t holeoff, isize;
|
|
|
|
int blkbits;
|
2016-03-10 12:11:13 +08:00
|
|
|
int ret;
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_lock(inode);
|
2015-01-03 04:16:00 +08:00
|
|
|
|
|
|
|
isize = i_size_read(inode);
|
|
|
|
if (offset >= isize) {
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2012-11-09 10:57:40 +08:00
|
|
|
return -ENXIO;
|
|
|
|
}
|
|
|
|
|
2015-01-03 04:16:00 +08:00
|
|
|
blkbits = inode->i_sb->s_blocksize_bits;
|
|
|
|
start = offset >> blkbits;
|
|
|
|
last = start;
|
|
|
|
end = isize >> blkbits;
|
|
|
|
holeoff = offset;
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2015-01-03 04:16:00 +08:00
|
|
|
do {
|
2016-03-10 12:11:13 +08:00
|
|
|
ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
|
|
|
|
if (ret < 0) {
|
|
|
|
inode_unlock(inode);
|
|
|
|
return ret;
|
2015-01-03 04:16:00 +08:00
|
|
|
}
|
2016-03-10 12:11:13 +08:00
|
|
|
/* Found a hole? */
|
|
|
|
if (ret == 0 || es.es_lblk > last) {
|
|
|
|
if (last != start)
|
|
|
|
holeoff = (loff_t)last << blkbits;
|
|
|
|
break;
|
2015-01-03 04:16:00 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If there is a unwritten extent at this offset,
|
|
|
|
* it will be as a data or a hole according to page
|
|
|
|
* cache that has data or not.
|
|
|
|
*/
|
2016-03-10 12:11:13 +08:00
|
|
|
if (ext4_es_is_unwritten(&es) &&
|
|
|
|
ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
|
|
|
|
last + es.es_len, &holeoff))
|
|
|
|
break;
|
2015-01-03 04:16:00 +08:00
|
|
|
|
2016-03-10 12:11:13 +08:00
|
|
|
last += es.es_len;
|
|
|
|
holeoff = (loff_t)last << blkbits;
|
|
|
|
cond_resched();
|
2015-01-03 04:16:00 +08:00
|
|
|
} while (last <= end);
|
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2012-11-09 10:57:40 +08:00
|
|
|
|
2015-01-03 04:16:00 +08:00
|
|
|
if (holeoff > isize)
|
|
|
|
holeoff = isize;
|
|
|
|
|
|
|
|
return vfs_setpos(file, holeoff, maxsize);
|
2012-11-09 10:57:40 +08:00
|
|
|
}
|
|
|
|
|
2010-10-28 09:30:06 +08:00
|
|
|
/*
|
2012-05-01 02:14:03 +08:00
|
|
|
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
|
|
|
|
* by calling generic_file_llseek_size() with the appropriate maxbytes
|
|
|
|
* value for each.
|
2010-10-28 09:30:06 +08:00
|
|
|
*/
|
2012-12-18 07:59:39 +08:00
|
|
|
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
|
2010-10-28 09:30:06 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
|
|
|
loff_t maxbytes;
|
|
|
|
|
|
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
|
|
|
|
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
|
|
|
|
else
|
|
|
|
maxbytes = inode->i_sb->s_maxbytes;
|
|
|
|
|
2012-12-18 07:59:39 +08:00
|
|
|
switch (whence) {
|
2012-11-09 10:57:40 +08:00
|
|
|
case SEEK_SET:
|
|
|
|
case SEEK_CUR:
|
|
|
|
case SEEK_END:
|
2012-12-18 07:59:39 +08:00
|
|
|
return generic_file_llseek_size(file, offset, whence,
|
2012-11-09 10:57:40 +08:00
|
|
|
maxbytes, i_size_read(inode));
|
|
|
|
case SEEK_DATA:
|
|
|
|
return ext4_seek_data(file, offset, maxbytes);
|
|
|
|
case SEEK_HOLE:
|
|
|
|
return ext4_seek_hole(file, offset, maxbytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
return -EINVAL;
|
2010-10-28 09:30:06 +08:00
|
|
|
}
|
|
|
|
|
2006-10-11 16:20:53 +08:00
|
|
|
const struct file_operations ext4_file_operations = {
|
2010-10-28 09:30:06 +08:00
|
|
|
.llseek = ext4_llseek,
|
2016-11-21 06:36:06 +08:00
|
|
|
.read_iter = ext4_file_read_iter,
|
2014-04-18 04:09:22 +08:00
|
|
|
.write_iter = ext4_file_write_iter,
|
2008-04-30 10:03:54 +08:00
|
|
|
.unlocked_ioctl = ext4_ioctl,
|
2006-10-11 16:20:50 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
2006-10-11 16:20:53 +08:00
|
|
|
.compat_ioctl = ext4_compat_ioctl,
|
2006-10-11 16:20:50 +08:00
|
|
|
#endif
|
2008-07-12 07:27:31 +08:00
|
|
|
.mmap = ext4_file_mmap,
|
2009-06-13 22:09:48 +08:00
|
|
|
.open = ext4_file_open,
|
2006-10-11 16:20:53 +08:00
|
|
|
.release = ext4_release_file,
|
|
|
|
.fsync = ext4_sync_file,
|
2016-10-08 07:59:59 +08:00
|
|
|
.get_unmapped_area = thp_get_unmapped_area,
|
2006-10-11 16:20:50 +08:00
|
|
|
.splice_read = generic_file_splice_read,
|
2014-04-05 16:27:08 +08:00
|
|
|
.splice_write = iter_file_splice_write,
|
2011-01-14 20:07:43 +08:00
|
|
|
.fallocate = ext4_fallocate,
|
2006-10-11 16:20:50 +08:00
|
|
|
};
|
|
|
|
|
2007-02-12 16:55:38 +08:00
|
|
|
const struct inode_operations ext4_file_inode_operations = {
|
2006-10-11 16:20:53 +08:00
|
|
|
.setattr = ext4_setattr,
|
2008-07-12 07:27:31 +08:00
|
|
|
.getattr = ext4_getattr,
|
2006-10-11 16:20:53 +08:00
|
|
|
.listxattr = ext4_listxattr,
|
2011-07-23 23:37:31 +08:00
|
|
|
.get_acl = ext4_get_acl,
|
2013-12-20 21:16:44 +08:00
|
|
|
.set_acl = ext4_set_acl,
|
2008-10-07 12:46:36 +08:00
|
|
|
.fiemap = ext4_fiemap,
|
2006-10-11 16:20:50 +08:00
|
|
|
};
|
|
|
|
|