2005-12-16 06:31:24 +08:00
|
|
|
/* -*- mode: c; c-basic-offset: 8; -*-
|
|
|
|
* vim: noexpandtab sw=8 ts=8 sts=0:
|
|
|
|
*
|
|
|
|
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public
|
|
|
|
* License along with this program; if not, write to the
|
|
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
|
* Boston, MA 021110-1307, USA.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <asm/byteorder.h>
|
2007-02-10 12:24:12 +08:00
|
|
|
#include <linux/swap.h>
|
2007-03-07 09:24:46 +08:00
|
|
|
#include <linux/pipe_fs_i.h>
|
2007-10-31 03:08:32 +08:00
|
|
|
#include <linux/mpage.h>
|
2008-10-10 01:38:40 +08:00
|
|
|
#include <linux/quotaops.h>
|
2015-02-17 08:00:00 +08:00
|
|
|
#include <linux/blkdev.h>
|
2015-02-23 00:58:50 +08:00
|
|
|
#include <linux/uio.h>
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
#include <cluster/masklog.h>
|
|
|
|
|
|
|
|
#include "ocfs2.h"
|
|
|
|
|
|
|
|
#include "alloc.h"
|
|
|
|
#include "aops.h"
|
|
|
|
#include "dlmglue.h"
|
|
|
|
#include "extent_map.h"
|
|
|
|
#include "file.h"
|
|
|
|
#include "inode.h"
|
|
|
|
#include "journal.h"
|
2007-02-10 12:24:12 +08:00
|
|
|
#include "suballoc.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
#include "super.h"
|
|
|
|
#include "symlink.h"
|
2009-08-25 08:02:48 +08:00
|
|
|
#include "refcounttree.h"
|
2011-02-22 21:33:59 +08:00
|
|
|
#include "ocfs2_trace.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
#include "buffer_head_io.h"
|
2015-02-17 08:00:00 +08:00
|
|
|
#include "dir.h"
|
|
|
|
#include "namei.h"
|
|
|
|
#include "sysfile.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
/*
 * get_block callback for symlinks whose target lives in allocated
 * clusters; fast symlinks must never reach this function.
 *
 * Maps logical block @iblock of @inode into @bh_result using the first
 * extent record of the on-disk inode.  When @bh_result is not uptodate
 * and the inode is new, the symlink data is copied out of the buffer
 * cache into the page behind @bh_result (only if that buffer is still
 * attached to the journal).
 *
 * Returns 0 on success, -ENOMEM if the buffer-cache block cannot be
 * obtained, and a negative errno (-EIO by default) on any other failure.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	u64 blkno;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	trace_ocfs2_symlink_get_block(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)iblock, bh_result, create);

	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		/* A range violation, not an allocation failure: keep -EIO. */
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* Physical block backing @iblock, taken from the first extent record. */
	blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock;

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			err = -ENOMEM;
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				/* buffer_cache_bh is dropped at bail. */
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr);
			set_buffer_uptodate(bh_result);
		}
	}

	map_bh(bh_result, inode->i_sb, blkno);

	err = 0;

bail:
	/*
	 * Single release point for both buffers so that no error path can
	 * leak a reference; brelse() ignores NULL.
	 */
	brelse(buffer_cache_bh);
	brelse(bh);

	return err;
}
|
|
|
|
|
2009-08-25 08:05:12 +08:00
|
|
|
/*
 * get_block callback for ocfs2 data pages.
 *
 * Looks up the physical block for logical block @iblock of @inode and,
 * when one exists and the extent is not flagged unwritten, maps it into
 * @bh_result.  bh_result->b_size is trimmed to the number of contiguous
 * blocks the lookup reported (never more than the caller asked for).
 * Symlink inodes are handed to ocfs2_symlink_get_block().
 *
 * Returns 0 on success; every negative error is reported as -EIO.
 */
int ocfs2_get_block(struct inode *inode, sector_t iblock,
		    struct buffer_head *bh_result, int create)
{
	int err = 0;
	/*
	 * Zero-initialised so the error message below never prints stack
	 * garbage if the extent lookup fails before filling them in.
	 */
	unsigned int ext_flags = 0;
	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
	u64 p_blkno = 0, count = 0, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
			      (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
					  &ext_flags);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	if (max_blocks < count)
		count = max_blocks;

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows __block_write_begin() to zero.
	 *
	 * If we see this on a sparse file system, then a truncate has
	 * raced us and removed the cluster. In this case, we clear
	 * the buffers dirty and uptodate bits and let the buffer code
	 * ignore it as a hole.
	 */
	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
		clear_buffer_dirty(bh_result);
		clear_buffer_uptodate(bh_result);
		goto bail;
	}

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	bh_result->b_size = count << inode->i_blkbits;

	/* Without sparse support a hole inside the file is an error. */
	if (!ocfs2_sparse_alloc(osb) && p_blkno == 0) {
		err = -EIO;
		mlog(ML_ERROR,
		     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
		     (unsigned long long)iblock,
		     (unsigned long long)p_blkno,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		mlog(ML_ERROR, "Size %llu, clusters %u\n",
		     (unsigned long long)i_size_read(inode),
		     OCFS2_I(inode)->ip_clusters);
		dump_stack();
		goto bail;
	}

	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));

	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)past_eof);
	/* Blocks at or beyond the current end of file are flagged new. */
	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);

bail:
	/* Callers only ever see -EIO, whatever the underlying error was. */
	if (err < 0)
		err = -EIO;

	return err;
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
int ocfs2_read_inline_data(struct inode *inode, struct page *page,
|
|
|
|
struct buffer_head *di_bh)
|
2007-09-08 05:05:51 +08:00
|
|
|
{
|
|
|
|
void *kaddr;
|
2007-12-19 22:24:09 +08:00
|
|
|
loff_t size;
|
2007-09-08 05:05:51 +08:00
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
|
|
|
|
|
|
|
|
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
|
2015-09-05 06:44:51 +08:00
|
|
|
ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
|
2007-09-08 05:05:51 +08:00
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno);
|
|
|
|
return -EROFS;
|
|
|
|
}
|
|
|
|
|
|
|
|
size = i_size_read(inode);
|
|
|
|
|
|
|
|
if (size > PAGE_CACHE_SIZE ||
|
2009-03-05 11:06:15 +08:00
|
|
|
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
|
2007-09-08 05:05:51 +08:00
|
|
|
ocfs2_error(inode->i_sb,
|
2015-09-05 06:44:51 +08:00
|
|
|
"Inode %llu has with inline data has bad size: %Lu\n",
|
2007-12-19 22:24:09 +08:00
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno,
|
|
|
|
(unsigned long long)size);
|
2007-09-08 05:05:51 +08:00
|
|
|
return -EROFS;
|
|
|
|
}
|
|
|
|
|
2011-11-25 23:14:34 +08:00
|
|
|
kaddr = kmap_atomic(page);
|
2007-09-08 05:05:51 +08:00
|
|
|
if (size)
|
|
|
|
memcpy(kaddr, di->id2.i_data.id_data, size);
|
|
|
|
/* Clear the remaining part of the page */
|
|
|
|
memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
|
|
|
|
flush_dcache_page(page);
|
2011-11-25 23:14:34 +08:00
|
|
|
kunmap_atomic(kaddr);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
|
|
|
SetPageUptodate(page);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
|
|
|
|
BUG_ON(!PageLocked(page));
|
2008-02-27 04:45:56 +08:00
|
|
|
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
|
2007-09-08 05:05:51 +08:00
|
|
|
|
2008-11-14 06:49:11 +08:00
|
|
|
ret = ocfs2_read_inode_block(inode, &di_bh);
|
2007-09-08 05:05:51 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ocfs2_read_inline_data(inode, page, di_bh);
|
|
|
|
out:
|
|
|
|
unlock_page(page);
|
|
|
|
|
|
|
|
brelse(di_bh);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
static int ocfs2_readpage(struct file *file, struct page *page)
|
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
2007-09-08 05:05:51 +08:00
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
2005-12-16 06:31:24 +08:00
|
|
|
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
|
|
|
|
int ret, unlock = 1;
|
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
|
|
|
|
(page ? page->index : 0));
|
2005-12-16 06:31:24 +08:00
|
|
|
|
2007-10-19 06:30:42 +08:00
|
|
|
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
|
2005-12-16 06:31:24 +08:00
|
|
|
if (ret != 0) {
|
|
|
|
if (ret == AOP_TRUNCATED_PAGE)
|
|
|
|
unlock = 0;
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:05:51 +08:00
|
|
|
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
|
2011-06-24 04:51:47 +08:00
|
|
|
/*
|
|
|
|
* Unlock the page and cycle ip_alloc_sem so that we don't
|
|
|
|
* busyloop waiting for ip_alloc_sem to unlock
|
|
|
|
*/
|
2007-05-15 02:38:51 +08:00
|
|
|
ret = AOP_TRUNCATED_PAGE;
|
2011-06-24 04:51:47 +08:00
|
|
|
unlock_page(page);
|
|
|
|
unlock = 0;
|
|
|
|
down_read(&oi->ip_alloc_sem);
|
|
|
|
up_read(&oi->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
goto out_inode_unlock;
|
2007-05-15 02:38:51 +08:00
|
|
|
}
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* i_size might have just been updated as we grabed the meta lock. We
|
|
|
|
* might now be discovering a truncate that hit on another node.
|
|
|
|
* block_read_full_page->get_block freaks out if it is asked to read
|
|
|
|
* beyond the end of a file, so we check here. Callers
|
2007-07-19 16:46:59 +08:00
|
|
|
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
|
2005-12-16 06:31:24 +08:00
|
|
|
* and notice that the page they just read isn't needed.
|
|
|
|
*
|
|
|
|
* XXX sys_readahead() seems to get that wrong?
|
|
|
|
*/
|
|
|
|
if (start >= i_size_read(inode)) {
|
2008-02-05 14:28:29 +08:00
|
|
|
zero_user(page, 0, PAGE_SIZE);
|
2005-12-16 06:31:24 +08:00
|
|
|
SetPageUptodate(page);
|
|
|
|
ret = 0;
|
|
|
|
goto out_alloc;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:05:51 +08:00
|
|
|
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
|
|
|
ret = ocfs2_readpage_inline(inode, page);
|
|
|
|
else
|
|
|
|
ret = block_read_full_page(page, ocfs2_get_block);
|
2005-12-16 06:31:24 +08:00
|
|
|
unlock = 0;
|
|
|
|
|
|
|
|
out_alloc:
|
|
|
|
up_read(&OCFS2_I(inode)->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
out_inode_unlock:
|
|
|
|
ocfs2_inode_unlock(inode, 0);
|
2005-12-16 06:31:24 +08:00
|
|
|
out:
|
|
|
|
if (unlock)
|
|
|
|
unlock_page(page);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-10-31 03:08:32 +08:00
|
|
|
/*
|
|
|
|
* This is used only for read-ahead. Failures or difficult to handle
|
|
|
|
* situations are safe to ignore.
|
|
|
|
*
|
|
|
|
* Right now, we don't bother with BH_Boundary - in-inode extent lists
|
|
|
|
* are quite large (243 extents on 4k blocks), so most inodes don't
|
|
|
|
* grow out to a tree. If need be, detecting boundary extents could
|
|
|
|
* trivially be added in a future version of ocfs2_get_block().
|
|
|
|
*/
|
|
|
|
static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
|
|
|
|
struct list_head *pages, unsigned nr_pages)
|
|
|
|
{
|
|
|
|
int ret, err = -EIO;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
|
|
|
loff_t start;
|
|
|
|
struct page *last;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use the nonblocking flag for the dlm code to avoid page
|
|
|
|
* lock inversion, but don't bother with retrying.
|
|
|
|
*/
|
|
|
|
ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
|
|
|
|
if (ret)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
|
|
|
|
ocfs2_inode_unlock(inode, 0);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't bother with inline-data. There isn't anything
|
|
|
|
* to read-ahead in that case anyway...
|
|
|
|
*/
|
|
|
|
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether a remote node truncated this file - we just
|
|
|
|
* drop out in that case as it's not worth handling here.
|
|
|
|
*/
|
|
|
|
last = list_entry(pages->prev, struct page, lru);
|
|
|
|
start = (loff_t)last->index << PAGE_CACHE_SHIFT;
|
|
|
|
if (start >= i_size_read(inode))
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
up_read(&oi->ip_alloc_sem);
|
|
|
|
ocfs2_inode_unlock(inode, 0);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
/* Note: Because we don't support holes, our allocation has
|
|
|
|
* already happened (allocation writes zeros to the file data)
|
|
|
|
* so we don't have to worry about ordered writes in
|
|
|
|
* ocfs2_writepage.
|
|
|
|
*
|
|
|
|
* ->writepage is called during the process of invalidating the page cache
|
|
|
|
* during blocked lock processing.  It can't block on any cluster locks
* during block mapping.  It's relying on the fact that the block
|
|
|
|
* mapping can't have disappeared under the dirty pages that it is
|
|
|
|
* being asked to write back.
|
|
|
|
*/
|
|
|
|
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
|
|
|
|
{
|
2011-02-22 21:33:59 +08:00
|
|
|
trace_ocfs2_writepage(
|
|
|
|
(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
|
|
|
|
page->index);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
return block_write_full_page(page, ocfs2_get_block, wbc);
|
2005-12-16 06:31:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Taken from ext3. We don't necessarily need the full blown
|
|
|
|
* functionality yet, but IMHO it's better to cut and paste the whole
|
|
|
|
* thing so we can avoid introducing our own bugs (and easily pick up
|
|
|
|
* their fixes when they happen) --Mark */
|
2007-02-17 03:46:50 +08:00
|
|
|
/*
 * Call @fn on every buffer of the ring starting at @head that overlaps
 * the byte range [@from, @to) of the page.
 *
 * Buffers outside the range are not passed to @fn; if @partial is
 * non-NULL it is set to 1 when such a buffer is not uptodate.  The walk
 * stops at the first non-zero return from @fn, and that value is
 * returned; otherwise 0.
 */
int walk_page_buffers(	handle_t *handle,
			struct buffer_head *head,
			unsigned from,
			unsigned to,
			int *partial,
			int (*fn)(	handle_t *handle,
					struct buffer_head *bh))
{
	struct buffer_head *cur = head;
	struct buffer_head *following;
	unsigned blocksize = head->b_size;
	unsigned block_start = 0;
	unsigned block_end;
	int err, ret = 0;

	/* block_start == 0 identifies the first pass over @head. */
	while (ret == 0 && (cur != head || block_start == 0)) {
		/* Sample the link first, before @fn sees the buffer. */
		following = cur->b_this_page;
		block_end = block_start + blocksize;

		if (block_end > from && block_start < to) {
			err = (*fn)(handle, cur);
			if (!ret)
				ret = err;
		} else if (partial && !buffer_uptodate(cur)) {
			*partial = 1;
		}

		block_start = block_end;
		cur = following;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * ->bmap: translate logical block @block of the mapping's inode into a
 * physical block number.
 *
 * Returns the physical block, or 0 when the lookup fails, the inode
 * lock cannot be taken, or the inode holds inline data.
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	u64 phys = 0;
	int err = 0;

	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
			 (unsigned long long)block);

	/*
	 * Journal system files are not accessed concurrently from
	 * multiple nodes, so they are looked up without any locking.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_inode_lock(inode, NULL, 0);
		if (err) {
			if (err != -ENOENT)
				mlog_errno(err);
			return 0;
		}
		down_read(&OCFS2_I(inode)->ip_alloc_sem);
	}

	/* Inline-data inodes have no extent map; phys stays 0. */
	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
		err = ocfs2_extent_map_get_blocks(inode, block, &phys, NULL,
						  NULL);

	if (!INODE_JOURNAL(inode)) {
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
		ocfs2_inode_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
		     (unsigned long long)block);
		mlog_errno(err);
		return 0;
	}

	return phys;
}
|
|
|
|
|
2007-01-05 06:54:41 +08:00
|
|
|
/*
 * ->releasepage: a page without buffers reports 0; otherwise the result
 * of try_to_free_buffers() is passed through.  @wait is not consulted.
 */
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
	return page_has_buffers(page) ? try_to_free_buffers(page) : 0;
}
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
|
|
|
|
u32 cpos,
|
|
|
|
unsigned int *start,
|
|
|
|
unsigned int *end)
|
|
|
|
{
|
|
|
|
unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
|
|
|
|
|
|
|
|
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
|
|
|
|
unsigned int cpp;
|
|
|
|
|
|
|
|
cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
|
|
|
|
|
|
|
|
cluster_start = cpos % cpp;
|
|
|
|
cluster_start = cluster_start << osb->s_clustersize_bits;
|
|
|
|
|
|
|
|
cluster_end = cluster_start + osb->s_clustersize;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(cluster_start > PAGE_SIZE);
|
|
|
|
BUG_ON(cluster_end > PAGE_SIZE);
|
|
|
|
|
|
|
|
if (start)
|
|
|
|
*start = cluster_start;
|
|
|
|
if (end)
|
|
|
|
*end = cluster_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 'from' and 'to' are the region in the page to avoid zeroing.
|
|
|
|
*
|
|
|
|
* If pagesize > clustersize, this function will avoid zeroing outside
|
|
|
|
* of the cluster boundary.
|
|
|
|
*
|
|
|
|
* from == to == 0 is code for "zero the entire cluster region"
|
|
|
|
*/
|
|
|
|
static void ocfs2_clear_page_regions(struct page *page,
				     struct ocfs2_super *osb, u32 cpos,
				     unsigned from, unsigned to)
{
	unsigned int c_start, c_end;
	void *addr;

	ocfs2_figure_cluster_boundaries(osb, cpos, &c_start, &c_end);

	addr = kmap_atomic(page);

	if (!from && !to) {
		/* Both zero: wipe the whole cluster region. */
		memset(addr + c_start, 0, c_end - c_start);
	} else {
		/* Zero whatever lies in the cluster before @from ... */
		if (from > c_start)
			memset(addr + c_start, 0, from - c_start);
		/* ... and whatever lies in the cluster after @to. */
		if (to < c_end)
			memset(addr + to, 0, c_end - to);
	}

	kunmap_atomic(addr);
}
|
|
|
|
|
2007-11-02 02:37:48 +08:00
|
|
|
/*
|
|
|
|
* Nonsparse file systems fully allocate before we get to the write
|
|
|
|
* code. This prevents ocfs2_write() from tagging the write as an
|
|
|
|
* allocating one, which means ocfs2_map_page_blocks() might try to
|
|
|
|
* read-in the blocks at the tail of our file. Avoid reading them by
|
|
|
|
* testing i_size against each block offset.
|
|
|
|
*/
|
|
|
|
static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
|
|
|
|
unsigned int block_start)
|
|
|
|
{
|
|
|
|
u64 offset = page_offset(page) + block_start;
|
|
|
|
|
|
|
|
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (i_size_read(inode) > offset)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
/*
|
2010-10-06 16:47:23 +08:00
|
|
|
* Some of this taken from __block_write_begin(). We already have our
|
2007-02-10 12:24:12 +08:00
|
|
|
* mapping by now though, and the entire write will be allocating or
|
|
|
|
* it won't, so not much need to use BH_New.
|
|
|
|
*
|
|
|
|
* This will also skip zeroing, which is handled externally.
|
|
|
|
*/
|
2007-02-17 03:46:50 +08:00
|
|
|
/*
 * Map the buffers of @page that intersect [@from, @to) to consecutive
 * physical blocks starting at *@p_blkno, reading in any partially
 * covered buffer that needs it.
 *
 * *@p_blkno is advanced once per buffer inside the range; buffers
 * outside it are skipped without advancing.  When @new is set the
 * in-range buffers are flagged new and, should a read fail, the
 * in-range buffers are zeroed and dirtied so stale data is not exposed.
 *
 * Returns 0 on success or -EIO if a read did not complete.
 */
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new)
{
	struct buffer_head *head, *bh;
	/* At most the first and last in-range buffers can need a read. */
	struct buffer_head *pending[2], **next_pending = pending;
	unsigned int bsize = 1 << inode->i_blkbits;
	unsigned int block_start, block_end;
	int ret = 0;

	if (!page_has_buffers(page))
		create_empty_buffers(page, bsize, 0);

	head = page_buffers(page);
	bh = head;
	block_start = 0;
	do {
		block_end = block_start + bsize;

		clear_buffer_new(bh);

		if (block_start >= to || block_end <= from) {
			/*
			 * Outside the i/o range - possibly backed by an
			 * unallocated cluster, so leave the mapping alone.
			 */
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
		} else {
			/*
			 * An allocating write with cluster size >= page
			 * size always writes the entire page.
			 */
			if (new)
				set_buffer_new(bh);

			if (!buffer_mapped(bh)) {
				map_bh(bh, inode->i_sb, *p_blkno);
				unmap_underlying_metadata(bh->b_bdev,
							  bh->b_blocknr);
			}

			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				   !buffer_new(bh) &&
				   ocfs2_should_read_blk(inode, page,
							 block_start) &&
				   (block_start < from || block_end > to)) {
				ll_rw_block(READ, 1, &bh);
				*next_pending++ = bh;
			}

			*p_blkno = *p_blkno + 1;
		}

		bh = bh->b_this_page;
		block_start += bsize;
	} while (bh != head);

	/* Let every read we issued complete, newest first. */
	while (next_pending > pending) {
		wait_on_buffer(*--next_pending);
		if (!buffer_uptodate(*next_pending))
			ret = -EIO;
	}

	if (ret == 0 || !new)
		return ret;

	/*
	 * A read failed during an allocating write: zero the in-range
	 * buffers so that stale disk contents are never exposed.
	 */
	bh = head;
	block_start = 0;
	do {
		block_end = block_start + bsize;

		if (block_end > from) {
			if (block_start >= to)
				break;

			zero_user(page, block_start, bh->b_size);
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
 * Upper bound on the number of pages one write context tracks: the
 * number of pages in the largest cluster, or 1 when a page is at least
 * that large.
 */
#if (PAGE_CACHE_SIZE < OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
#else
#define OCFS2_MAX_CTXT_PAGES	1
#endif

/* Number of minimum-sized clusters that fit in one page. */
#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
struct ocfs2_unwritten_extent {
|
|
|
|
struct list_head ue_node;
|
|
|
|
struct list_head ue_ip_node;
|
|
|
|
u32 ue_cpos;
|
|
|
|
u32 ue_phys;
|
|
|
|
};
|
|
|
|
|
2007-03-07 09:24:46 +08:00
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
* Describe the state of a single cluster to be written to.
|
2007-03-07 09:24:46 +08:00
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_write_cluster_desc {
|
|
|
|
u32 c_cpos;
|
|
|
|
u32 c_phys;
|
|
|
|
/*
|
|
|
|
* Give this a unique field because c_phys eventually gets
|
|
|
|
* filled.
|
|
|
|
*/
|
|
|
|
unsigned c_new;
|
2016-03-26 05:20:55 +08:00
|
|
|
unsigned c_clear_unwritten;
|
2009-08-07 07:12:58 +08:00
|
|
|
unsigned c_needs_zero;
|
2007-05-09 08:47:32 +08:00
|
|
|
};
|
2007-03-07 09:24:46 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_write_ctxt {
|
|
|
|
/* Logical cluster position / len of write */
|
|
|
|
u32 w_cpos;
|
|
|
|
u32 w_clen;
|
2007-03-07 09:24:46 +08:00
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
/* First cluster allocated in a nonsparse extend */
|
|
|
|
u32 w_first_new_cpos;
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
/* Type of caller. Must be one of buffer, mmap, direct. */
|
|
|
|
ocfs2_write_type_t w_type;
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
|
2007-03-07 09:24:46 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* This is true if page_size > cluster_size.
|
|
|
|
*
|
|
|
|
* It triggers a set of special cases during write which might
|
|
|
|
* have to deal with allocating writes to partial pages.
|
|
|
|
*/
|
|
|
|
unsigned int w_large_pages;
|
2007-03-07 09:24:46 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* Pages involved in this write.
|
|
|
|
*
|
|
|
|
* w_target_page is the page being written to by the user.
|
|
|
|
*
|
|
|
|
* w_pages is an array of pages which always contains
|
|
|
|
* w_target_page, and in the case of an allocating write with
|
|
|
|
* page_size < cluster size, it will contain zero'd and mapped
|
|
|
|
* pages adjacent to w_target_page which need to be written
|
|
|
|
* out in so that future reads from that region will get
|
|
|
|
* zero's.
|
|
|
|
*/
|
|
|
|
unsigned int w_num_pages;
|
2010-06-11 06:21:36 +08:00
|
|
|
struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
|
2007-05-09 08:47:32 +08:00
|
|
|
struct page *w_target_page;
|
2007-06-07 07:15:24 +08:00
|
|
|
|
2011-07-25 01:36:54 +08:00
|
|
|
/*
|
|
|
|
* w_target_locked is used for page_mkwrite path indicating no unlocking
|
|
|
|
* against w_target_page in ocfs2_write_end_nolock.
|
|
|
|
*/
|
|
|
|
unsigned int w_target_locked:1;
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* ocfs2_write_end() uses this to know what the real range to
|
|
|
|
* write in the target should be.
|
|
|
|
*/
|
|
|
|
unsigned int w_target_from;
|
|
|
|
unsigned int w_target_to;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We could use journal_current_handle() but this is cleaner,
|
|
|
|
* IMHO -Mark
|
|
|
|
*/
|
|
|
|
handle_t *w_handle;
|
|
|
|
|
|
|
|
struct buffer_head *w_di_bh;
|
2007-06-19 02:22:56 +08:00
|
|
|
|
|
|
|
struct ocfs2_cached_dealloc_ctxt w_dealloc;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
|
|
|
struct list_head w_unwritten_list;
|
2007-05-09 08:47:32 +08:00
|
|
|
};
|
|
|
|
|
2007-09-08 05:20:45 +08:00
|
|
|
/*
 * Unlock, mark accessed and drop one reference on each non-NULL entry
 * among the first @num_pages slots of @pages.  NULL slots are skipped.
 */
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
{
	struct page *pg;
	int idx;

	for (idx = 0; idx < num_pages; idx++) {
		pg = pages[idx];
		if (!pg)
			continue;

		unlock_page(pg);
		mark_page_accessed(pg);
		page_cache_release(pg);
	}
}
|
|
|
|
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are held by the writing process, this
deadlock may hang the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
|
2007-09-08 05:20:45 +08:00
|
|
|
{
|
2011-07-25 01:36:54 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* w_target_locked is only set to true in the page_mkwrite() case.
|
|
|
|
* The intent is to allow us to lock the target page from write_begin()
|
|
|
|
* to write_end(). The caller must hold a ref on w_target_page.
|
|
|
|
*/
|
|
|
|
if (wc->w_target_locked) {
|
|
|
|
BUG_ON(!wc->w_target_page);
|
|
|
|
for (i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
if (wc->w_target_page == wc->w_pages[i]) {
|
|
|
|
wc->w_pages[i] = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mark_page_accessed(wc->w_target_page);
|
|
|
|
page_cache_release(wc->w_target_page);
|
|
|
|
}
|
2007-09-08 05:20:45 +08:00
|
|
|
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
}
|
2007-03-07 09:24:46 +08:00
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
static void ocfs2_free_unwritten_list(struct inode *inode,
|
|
|
|
struct list_head *head)
|
|
|
|
{
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
list_for_each_entry_safe(ue, tmp, head, ue_node) {
|
|
|
|
list_del(&ue->ue_node);
|
2016-03-26 05:21:06 +08:00
|
|
|
spin_lock(&oi->ip_lock);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
list_del(&ue->ue_ip_node);
|
2016-03-26 05:21:06 +08:00
|
|
|
spin_unlock(&oi->ip_lock);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
kfree(ue);
|
2016-03-26 05:21:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ocfs2_free_write_ctxt(struct inode *inode,
|
|
|
|
struct ocfs2_write_ctxt *wc)
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
{
|
2016-03-26 05:21:06 +08:00
|
|
|
ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
ocfs2_unlock_pages(wc);
|
2007-05-09 08:47:32 +08:00
|
|
|
brelse(wc->w_di_bh);
|
|
|
|
kfree(wc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Allocate and initialise a write context for the byte range
 * [@pos, @pos + @len).
 *
 * The context records the first cluster touched (w_cpos) and how many
 * clusters the range spans (w_clen), takes a reference on @di_bh, and
 * notes whether a page is larger than a cluster (w_large_pages).
 *
 * Returns 0 and stores the new context in *@wcp, or -ENOMEM if the
 * allocation fails (in which case *@wcp is left untouched).
 */
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
				  struct ocfs2_super *osb, loff_t pos,
				  unsigned len, ocfs2_write_type_t type,
				  struct buffer_head *di_bh)
{
	struct ocfs2_write_ctxt *wc;
	u32 first_cluster, last_cluster;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc)
		return -ENOMEM;

	/* Cluster numbers of the first and last byte of the write. */
	first_cluster = pos >> osb->s_clustersize_bits;
	last_cluster = (pos + len - 1) >> osb->s_clustersize_bits;

	wc->w_cpos = first_cluster;
	wc->w_clen = last_cluster - first_cluster + 1;
	wc->w_first_new_cpos = UINT_MAX;
	wc->w_type = type;

	get_bh(di_bh);
	wc->w_di_bh = di_bh;

	wc->w_large_pages =
		unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits) ? 1 : 0;

	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
	INIT_LIST_HEAD(&wc->w_unwritten_list);

	*wcp = wc;
	return 0;
}
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
* If a page has any new buffers, zero them out here, and mark them uptodate
|
|
|
|
* and dirty so they'll be written out (in order to prevent uninitialised
|
|
|
|
* block data from leaking). And clear the new bit.
|
2007-02-10 12:24:12 +08:00
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
|
2007-02-10 12:24:12 +08:00
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
unsigned int block_start, block_end;
|
|
|
|
struct buffer_head *head, *bh;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
BUG_ON(!PageLocked(page));
|
|
|
|
if (!page_has_buffers(page))
|
|
|
|
return;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
bh = head = page_buffers(page);
|
|
|
|
block_start = 0;
|
|
|
|
do {
|
|
|
|
block_end = block_start + bh->b_size;
|
|
|
|
|
|
|
|
if (buffer_new(bh)) {
|
|
|
|
if (block_end > from && block_start < to) {
|
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
unsigned start, end;
|
|
|
|
|
|
|
|
start = max(from, block_start);
|
|
|
|
end = min(to, block_end);
|
|
|
|
|
2008-02-05 14:28:29 +08:00
|
|
|
zero_user_segment(page, start, end);
|
2007-05-09 08:47:32 +08:00
|
|
|
set_buffer_uptodate(bh);
|
|
|
|
}
|
|
|
|
|
|
|
|
clear_buffer_new(bh);
|
|
|
|
mark_buffer_dirty(bh);
|
|
|
|
}
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
block_start = block_end;
|
|
|
|
bh = bh->b_this_page;
|
|
|
|
} while (bh != head);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only called when we have a failure during allocating write to write
|
|
|
|
* zero's to the newly allocated region.
|
|
|
|
*/
|
|
|
|
static void ocfs2_write_failure(struct inode *inode,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
loff_t user_pos, unsigned user_len)
|
|
|
|
{
|
|
|
|
int i;
|
2007-09-19 08:49:29 +08:00
|
|
|
unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
|
|
|
|
to = user_pos + user_len;
|
2007-05-09 08:47:32 +08:00
|
|
|
struct page *tmppage;
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
if (wc->w_target_page)
|
|
|
|
ocfs2_zero_new_buffers(wc->w_target_page, from, to);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
tmppage = wc->w_pages[i];
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
if (tmppage && page_has_buffers(tmppage)) {
|
2008-11-19 08:53:43 +08:00
|
|
|
if (ocfs2_should_order_data(inode))
|
2008-09-04 11:03:41 +08:00
|
|
|
ocfs2_jbd2_file_inode(wc->w_handle, inode);
|
2008-07-17 08:22:22 +08:00
|
|
|
|
|
|
|
block_commit_write(tmppage, from, to);
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
struct page *page, u32 cpos,
|
|
|
|
loff_t user_pos, unsigned user_len,
|
|
|
|
int new)
|
2007-02-10 12:24:12 +08:00
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
int ret;
|
|
|
|
unsigned int map_from = 0, map_to = 0;
|
2007-02-10 12:24:12 +08:00
|
|
|
unsigned int cluster_start, cluster_end;
|
2007-05-09 08:47:32 +08:00
|
|
|
unsigned int user_data_from = 0, user_data_to = 0;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
|
2007-02-10 12:24:12 +08:00
|
|
|
&cluster_start, &cluster_end);
|
|
|
|
|
2011-02-17 23:44:40 +08:00
|
|
|
/* treat the write as new if the a hole/lseek spanned across
|
|
|
|
* the page boundary.
|
|
|
|
*/
|
|
|
|
new = new | ((i_size_read(inode) <= page_offset(page)) &&
|
|
|
|
(page_offset(page) <= user_pos));
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
if (page == wc->w_target_page) {
|
|
|
|
map_from = user_pos & (PAGE_CACHE_SIZE - 1);
|
|
|
|
map_to = map_from + user_len;
|
|
|
|
|
|
|
|
if (new)
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
|
|
|
cluster_start, cluster_end,
|
|
|
|
new);
|
|
|
|
else
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
|
|
|
map_from, map_to, new);
|
|
|
|
if (ret) {
|
2007-02-10 12:24:12 +08:00
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
user_data_from = map_from;
|
|
|
|
user_data_to = map_to;
|
2007-02-10 12:24:12 +08:00
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
map_from = cluster_start;
|
|
|
|
map_to = cluster_end;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If we haven't allocated the new page yet, we
|
|
|
|
* shouldn't be writing it out without copying user
|
|
|
|
* data. This is likely a math error from the caller.
|
|
|
|
*/
|
|
|
|
BUG_ON(!new);
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
map_from = cluster_start;
|
|
|
|
map_to = cluster_end;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
2007-05-09 08:47:32 +08:00
|
|
|
cluster_start, cluster_end, new);
|
2007-02-10 12:24:12 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parts of newly allocated pages need to be zero'd.
|
|
|
|
*
|
|
|
|
* Above, we have also rewritten 'to' and 'from' - as far as
|
|
|
|
* the rest of the function is concerned, the entire cluster
|
|
|
|
* range inside of a page needs to be written.
|
|
|
|
*
|
|
|
|
* We can skip this if the page is up to date - it's already
|
|
|
|
* been zero'd from being read in as a hole.
|
|
|
|
*/
|
|
|
|
if (new && !PageUptodate(page))
|
|
|
|
ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
|
2007-05-09 08:47:32 +08:00
|
|
|
cpos, user_data_from, user_data_to);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
flush_dcache_page(page);
|
|
|
|
|
|
|
|
out:
|
2007-05-09 08:47:32 +08:00
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
* This function will only grab one clusters worth of pages.
|
2007-02-10 12:24:12 +08:00
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
2010-07-03 08:20:27 +08:00
|
|
|
u32 cpos, loff_t user_pos,
|
|
|
|
unsigned user_len, int new,
|
2007-05-10 06:16:19 +08:00
|
|
|
struct page *mmap_page)
|
2007-02-10 12:24:12 +08:00
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
int ret = 0, i;
|
2010-07-03 08:20:27 +08:00
|
|
|
unsigned long start, target_index, end_index, index;
|
2007-02-10 12:24:12 +08:00
|
|
|
struct inode *inode = mapping->host;
|
2010-07-03 08:20:27 +08:00
|
|
|
loff_t last_byte;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
target_index = user_pos >> PAGE_CACHE_SHIFT;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Figure out how many pages we'll be manipulating here. For
|
2007-02-17 03:46:50 +08:00
|
|
|
* non allocating write, we just change the one
|
2010-07-03 08:20:27 +08:00
|
|
|
* page. Otherwise, we'll need a whole clusters worth. If we're
|
|
|
|
* writing past i_size, we only need enough pages to cover the
|
|
|
|
* last page of the write.
|
2007-02-10 12:24:12 +08:00
|
|
|
*/
|
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
|
|
|
|
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
|
2010-07-03 08:20:27 +08:00
|
|
|
/*
|
|
|
|
* We need the index *past* the last page we could possibly
|
|
|
|
* touch. This is the page past the end of the write or
|
|
|
|
* i_size, whichever is greater.
|
|
|
|
*/
|
|
|
|
last_byte = max(user_pos + user_len, i_size_read(inode));
|
|
|
|
BUG_ON(last_byte < 1);
|
|
|
|
end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
|
|
|
|
if ((start + wc->w_num_pages) > end_index)
|
|
|
|
wc->w_num_pages = end_index - start;
|
2007-02-10 12:24:12 +08:00
|
|
|
} else {
|
2007-05-09 08:47:32 +08:00
|
|
|
wc->w_num_pages = 1;
|
|
|
|
start = target_index;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
2016-03-26 05:20:58 +08:00
|
|
|
end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
2007-02-10 12:24:12 +08:00
|
|
|
index = start + i;
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
if (index >= target_index && index <= end_index &&
|
|
|
|
wc->w_type == OCFS2_WRITE_MMAP) {
|
2007-05-10 06:16:19 +08:00
|
|
|
/*
|
|
|
|
* ocfs2_pagemkwrite() is a little different
|
|
|
|
* and wants us to directly use the page
|
|
|
|
* passed in.
|
|
|
|
*/
|
|
|
|
lock_page(mmap_page);
|
|
|
|
|
2011-07-25 01:36:54 +08:00
|
|
|
/* Exit and let the caller retry */
|
2007-05-10 06:16:19 +08:00
|
|
|
if (mmap_page->mapping != mapping) {
|
2011-07-25 01:36:54 +08:00
|
|
|
WARN_ON(mmap_page->mapping);
|
2007-05-10 06:16:19 +08:00
|
|
|
unlock_page(mmap_page);
|
2011-07-25 01:36:54 +08:00
|
|
|
ret = -EAGAIN;
|
2007-05-10 06:16:19 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
page_cache_get(mmap_page);
|
|
|
|
wc->w_pages[i] = mmap_page;
|
2011-07-25 01:36:54 +08:00
|
|
|
wc->w_target_locked = true;
|
2016-03-26 05:20:58 +08:00
|
|
|
} else if (index >= target_index && index <= end_index &&
|
|
|
|
wc->w_type == OCFS2_WRITE_DIRECT) {
|
|
|
|
/* Direct write has no mapping page. */
|
|
|
|
wc->w_pages[i] = NULL;
|
|
|
|
continue;
|
2007-05-10 06:16:19 +08:00
|
|
|
} else {
|
|
|
|
wc->w_pages[i] = find_or_create_page(mapping, index,
|
|
|
|
GFP_NOFS);
|
|
|
|
if (!wc->w_pages[i]) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
2013-02-22 08:42:57 +08:00
|
|
|
wait_for_stable_page(wc->w_pages[i]);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
if (index == target_index)
|
|
|
|
wc->w_target_page = wc->w_pages[i];
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
out:
|
2011-07-25 01:36:54 +08:00
|
|
|
if (ret)
|
|
|
|
wc->w_target_locked = false;
|
2007-05-09 08:47:32 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare a single cluster for write one cluster into the file.
|
|
|
|
*/
|
|
|
|
static int ocfs2_write_cluster(struct address_space *mapping,
|
2016-03-26 05:21:03 +08:00
|
|
|
u32 *phys, unsigned int new,
|
2016-03-26 05:20:55 +08:00
|
|
|
unsigned int clear_unwritten,
|
2009-08-07 07:12:58 +08:00
|
|
|
unsigned int should_zero,
|
2007-06-19 02:22:56 +08:00
|
|
|
struct ocfs2_alloc_context *data_ac,
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_alloc_context *meta_ac,
|
|
|
|
struct ocfs2_write_ctxt *wc, u32 cpos,
|
|
|
|
loff_t user_pos, unsigned user_len)
|
|
|
|
{
|
2016-03-26 05:20:55 +08:00
|
|
|
int ret, i;
|
2016-03-26 05:21:03 +08:00
|
|
|
u64 p_blkno;
|
2007-05-09 08:47:32 +08:00
|
|
|
struct inode *inode = mapping->host;
|
2008-08-21 10:36:33 +08:00
|
|
|
struct ocfs2_extent_tree et;
|
2016-03-26 05:21:03 +08:00
|
|
|
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
u32 tmp_pos;
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
/*
|
|
|
|
* This is safe to call with the page locks - it won't take
|
|
|
|
* any additional semaphores or cluster locks.
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
tmp_pos = cpos;
|
2008-08-18 17:38:45 +08:00
|
|
|
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
|
2016-03-26 05:20:55 +08:00
|
|
|
&tmp_pos, 1, !clear_unwritten,
|
|
|
|
wc->w_di_bh, wc->w_handle,
|
|
|
|
data_ac, meta_ac, NULL);
|
2007-02-10 12:24:12 +08:00
|
|
|
/*
|
|
|
|
* This shouldn't happen because we must have already
|
|
|
|
* calculated the correct meta data allocation required. The
|
|
|
|
* internal tree allocation code should know how to increase
|
|
|
|
* transaction credits itself.
|
|
|
|
*
|
|
|
|
* If need be, we could handle -EAGAIN for a
|
|
|
|
* RESTART_TRANS here.
|
|
|
|
*/
|
|
|
|
mlog_bug_on_msg(ret == -EAGAIN,
|
|
|
|
"Inode %llu: EAGAIN return during allocation.\n",
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2016-03-26 05:20:55 +08:00
|
|
|
} else if (clear_unwritten) {
|
2009-02-13 19:54:22 +08:00
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
|
|
|
|
wc->w_di_bh);
|
2008-08-21 10:36:33 +08:00
|
|
|
ret = ocfs2_mark_extent_written(inode, &et,
|
2016-03-26 05:21:03 +08:00
|
|
|
wc->w_handle, cpos, 1, *phys,
|
2008-08-21 10:36:33 +08:00
|
|
|
meta_ac, &wc->w_dealloc);
|
2007-06-19 02:22:56 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The only reason this should fail is due to an inability to
|
|
|
|
* find the extent added.
|
|
|
|
*/
|
2016-03-26 05:21:03 +08:00
|
|
|
ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
|
2007-02-10 12:24:12 +08:00
|
|
|
if (ret < 0) {
|
2014-12-11 07:42:02 +08:00
|
|
|
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
|
2016-03-26 05:21:03 +08:00
|
|
|
"at logical cluster %u",
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
|
2007-02-10 12:24:12 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:21:03 +08:00
|
|
|
BUG_ON(*phys == 0);
|
|
|
|
|
|
|
|
p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
|
|
|
|
if (!should_zero)
|
|
|
|
p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
int tmpret;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
/* This is the direct io target page. */
|
|
|
|
if (wc->w_pages[i] == NULL) {
|
|
|
|
p_blkno++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
|
|
|
|
wc->w_pages[i], cpos,
|
2007-06-19 02:22:56 +08:00
|
|
|
user_pos, user_len,
|
|
|
|
should_zero);
|
2007-05-09 08:47:32 +08:00
|
|
|
if (tmpret) {
|
|
|
|
mlog_errno(tmpret);
|
|
|
|
if (ret == 0)
|
2009-07-13 11:38:23 +08:00
|
|
|
ret = tmpret;
|
2007-05-09 08:47:32 +08:00
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* We only have cleanup to do in case of allocating write.
|
|
|
|
*/
|
|
|
|
if (ret && new)
|
|
|
|
ocfs2_write_failure(inode, wc, user_pos, user_len);
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
out:
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
|
|
|
|
struct ocfs2_alloc_context *data_ac,
|
|
|
|
struct ocfs2_alloc_context *meta_ac,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
loff_t pos, unsigned len)
|
|
|
|
{
|
|
|
|
int ret, i;
|
2007-09-18 00:06:29 +08:00
|
|
|
loff_t cluster_off;
|
|
|
|
unsigned int local_len = len;
|
2007-05-15 09:09:54 +08:00
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-09-18 00:06:29 +08:00
|
|
|
struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
|
2007-05-15 09:09:54 +08:00
|
|
|
|
|
|
|
for (i = 0; i < wc->w_clen; i++) {
|
|
|
|
desc = &wc->w_desc[i];
|
|
|
|
|
2007-09-18 00:06:29 +08:00
|
|
|
/*
|
|
|
|
* We have to make sure that the total write passed in
|
|
|
|
* doesn't extend past a single cluster.
|
|
|
|
*/
|
|
|
|
local_len = len;
|
|
|
|
cluster_off = pos & (osb->s_clustersize - 1);
|
|
|
|
if ((cluster_off + local_len) > osb->s_clustersize)
|
|
|
|
local_len = osb->s_clustersize - cluster_off;
|
|
|
|
|
2016-03-26 05:21:03 +08:00
|
|
|
ret = ocfs2_write_cluster(mapping, &desc->c_phys,
|
2016-03-26 05:20:55 +08:00
|
|
|
desc->c_new,
|
|
|
|
desc->c_clear_unwritten,
|
2009-08-07 07:12:58 +08:00
|
|
|
desc->c_needs_zero,
|
|
|
|
data_ac, meta_ac,
|
2007-09-18 00:06:29 +08:00
|
|
|
wc, desc->c_cpos, pos, local_len);
|
2007-05-15 09:09:54 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2007-09-18 00:06:29 +08:00
|
|
|
|
|
|
|
len -= local_len;
|
|
|
|
pos += local_len;
|
2007-05-15 09:09:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* ocfs2_write_end() wants to know which parts of the target page it
|
|
|
|
* should complete the write on. It's easiest to compute them ahead of
|
|
|
|
* time when a more complete view of the write is available.
|
|
|
|
*/
|
|
|
|
static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
loff_t pos, unsigned len, int alloc)
|
2007-02-10 12:24:12 +08:00
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
|
|
|
|
wc->w_target_to = wc->w_target_from + len;
|
|
|
|
|
|
|
|
if (alloc == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocating write - we may have different boundaries based
|
|
|
|
* on page size and cluster size.
|
|
|
|
*
|
|
|
|
* NOTE: We can no longer compute one value from the other as
|
|
|
|
* the actual write length and user provided length may be
|
|
|
|
* different.
|
|
|
|
*/
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
if (wc->w_large_pages) {
|
|
|
|
/*
|
|
|
|
* We only care about the 1st and last cluster within
|
2007-06-19 02:22:56 +08:00
|
|
|
* our range and whether they should be zero'd or not. Either
|
2007-05-09 08:47:32 +08:00
|
|
|
* value may be extended out to the start/end of a
|
|
|
|
* newly allocated cluster.
|
|
|
|
*/
|
|
|
|
desc = &wc->w_desc[0];
|
2009-08-07 07:12:58 +08:00
|
|
|
if (desc->c_needs_zero)
|
2007-05-09 08:47:32 +08:00
|
|
|
ocfs2_figure_cluster_boundaries(osb,
|
|
|
|
desc->c_cpos,
|
|
|
|
&wc->w_target_from,
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
desc = &wc->w_desc[wc->w_clen - 1];
|
2009-08-07 07:12:58 +08:00
|
|
|
if (desc->c_needs_zero)
|
2007-05-09 08:47:32 +08:00
|
|
|
ocfs2_figure_cluster_boundaries(osb,
|
|
|
|
desc->c_cpos,
|
|
|
|
NULL,
|
|
|
|
&wc->w_target_to);
|
|
|
|
} else {
|
|
|
|
wc->w_target_from = 0;
|
|
|
|
wc->w_target_to = PAGE_CACHE_SIZE;
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
/*
|
|
|
|
* Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
|
|
|
|
* do the zero work. And should not to clear UNWRITTEN since it will be cleared
|
|
|
|
* by the direct io procedure.
|
|
|
|
* If this is a new extent that allocated by direct io, we should mark it in
|
|
|
|
* the ip_unwritten_list.
|
|
|
|
*/
|
|
|
|
static int ocfs2_unwritten_check(struct inode *inode,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
struct ocfs2_write_cluster_desc *desc)
|
|
|
|
{
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
|
2016-03-26 05:21:06 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (!desc->c_needs_zero)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
retry:
|
|
|
|
spin_lock(&oi->ip_lock);
|
|
|
|
/* Needs not to zero no metter buffer or direct. The one who is zero
|
|
|
|
* the cluster is doing zero. And he will clear unwritten after all
|
|
|
|
* cluster io finished. */
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
|
|
|
|
if (desc->c_cpos == ue->ue_cpos) {
|
2016-03-26 05:21:06 +08:00
|
|
|
BUG_ON(desc->c_new);
|
|
|
|
desc->c_needs_zero = 0;
|
|
|
|
desc->c_clear_unwritten = 0;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (wc->w_type != OCFS2_WRITE_DIRECT)
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
if (new == NULL) {
|
|
|
|
spin_unlock(&oi->ip_lock);
|
|
|
|
new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
|
|
|
|
GFP_NOFS);
|
|
|
|
if (new == NULL) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
/* This direct write will doing zero. */
|
|
|
|
new->ue_cpos = desc->c_cpos;
|
|
|
|
new->ue_phys = desc->c_phys;
|
|
|
|
desc->c_clear_unwritten = 0;
|
|
|
|
list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
|
|
|
|
list_add_tail(&new->ue_node, &wc->w_unwritten_list);
|
|
|
|
new = NULL;
|
|
|
|
unlock:
|
|
|
|
spin_unlock(&oi->ip_lock);
|
|
|
|
out:
|
|
|
|
if (new)
|
|
|
|
kfree(new);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
/*
|
|
|
|
* Populate each single-cluster write descriptor in the write context
|
|
|
|
* with information about the i/o to be done.
|
2007-06-19 02:22:56 +08:00
|
|
|
*
|
|
|
|
* Returns the number of clusters that will have to be allocated, as
|
|
|
|
* well as a worst case estimate of the number of extent records that
|
|
|
|
* would have to be created during a write to an unwritten region.
|
2007-05-15 09:09:54 +08:00
|
|
|
*/
|
|
|
|
static int ocfs2_populate_write_desc(struct inode *inode,
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
2007-06-19 02:22:56 +08:00
|
|
|
unsigned int *clusters_to_alloc,
|
|
|
|
unsigned int *extents_to_split)
|
2007-02-10 12:24:12 +08:00
|
|
|
{
|
2007-05-15 09:09:54 +08:00
|
|
|
int ret;
|
2007-05-09 08:47:32 +08:00
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-05-15 09:09:54 +08:00
|
|
|
unsigned int num_clusters = 0;
|
2007-06-19 02:22:56 +08:00
|
|
|
unsigned int ext_flags = 0;
|
2007-05-15 09:09:54 +08:00
|
|
|
u32 phys = 0;
|
|
|
|
int i;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
*clusters_to_alloc = 0;
|
|
|
|
*extents_to_split = 0;
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
for (i = 0; i < wc->w_clen; i++) {
|
|
|
|
desc = &wc->w_desc[i];
|
|
|
|
desc->c_cpos = wc->w_cpos + i;
|
|
|
|
|
|
|
|
if (num_clusters == 0) {
|
2007-06-19 02:22:56 +08:00
|
|
|
/*
|
|
|
|
* Need to look up the next extent record.
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
|
2007-06-19 02:22:56 +08:00
|
|
|
&num_clusters, &ext_flags);
|
2007-05-09 08:47:32 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
goto out;
|
2007-05-09 08:47:32 +08:00
|
|
|
}
|
2007-06-19 02:22:56 +08:00
|
|
|
|
2009-08-25 08:02:48 +08:00
|
|
|
/* We should already CoW the refcountd extent. */
|
|
|
|
BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
/*
|
|
|
|
* Assume worst case - that we're writing in
|
|
|
|
* the middle of the extent.
|
|
|
|
*
|
|
|
|
* We can assume that the write proceeds from
|
|
|
|
* left to right, in which case the extent
|
|
|
|
* insert code is smart enough to coalesce the
|
|
|
|
* next splits into the previous records created.
|
|
|
|
*/
|
|
|
|
if (ext_flags & OCFS2_EXT_UNWRITTEN)
|
|
|
|
*extents_to_split = *extents_to_split + 2;
|
2007-05-09 08:47:32 +08:00
|
|
|
} else if (phys) {
|
|
|
|
/*
|
|
|
|
* Only increment phys if it doesn't describe
|
|
|
|
* a hole.
|
|
|
|
*/
|
|
|
|
phys++;
|
|
|
|
}
|
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
/*
|
|
|
|
* If w_first_new_cpos is < UINT_MAX, we have a non-sparse
|
|
|
|
* file that got extended. w_first_new_cpos tells us
|
|
|
|
* where the newly allocated clusters are so we can
|
|
|
|
* zero them.
|
|
|
|
*/
|
|
|
|
if (desc->c_cpos >= wc->w_first_new_cpos) {
|
|
|
|
BUG_ON(phys == 0);
|
|
|
|
desc->c_needs_zero = 1;
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
desc->c_phys = phys;
|
|
|
|
if (phys == 0) {
|
|
|
|
desc->c_new = 1;
|
2009-08-07 07:12:58 +08:00
|
|
|
desc->c_needs_zero = 1;
|
2016-03-26 05:20:55 +08:00
|
|
|
desc->c_clear_unwritten = 1;
|
2007-05-15 09:09:54 +08:00
|
|
|
*clusters_to_alloc = *clusters_to_alloc + 1;
|
2007-05-09 08:47:32 +08:00
|
|
|
}
|
2009-08-07 07:12:58 +08:00
|
|
|
|
|
|
|
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
|
2016-03-26 05:20:55 +08:00
|
|
|
desc->c_clear_unwritten = 1;
|
2009-08-07 07:12:58 +08:00
|
|
|
desc->c_needs_zero = 1;
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
ret = ocfs2_unwritten_check(inode, wc, desc);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
num_clusters--;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
static int ocfs2_write_begin_inline(struct address_space *mapping,
|
|
|
|
struct inode *inode,
|
|
|
|
struct ocfs2_write_ctxt *wc)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
struct page *page;
|
|
|
|
handle_t *handle;
|
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
|
2014-10-10 06:25:15 +08:00
|
|
|
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
page = find_or_create_page(mapping, 0, GFP_NOFS);
|
|
|
|
if (!page) {
|
2014-10-10 06:25:15 +08:00
|
|
|
ocfs2_commit_trans(osb, handle);
|
2007-09-08 05:46:51 +08:00
|
|
|
ret = -ENOMEM;
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we don't set w_num_pages then this page won't get unlocked
|
|
|
|
* and freed on cleanup of the write context.
|
|
|
|
*/
|
|
|
|
wc->w_pages[0] = wc->w_target_page = page;
|
|
|
|
wc->w_num_pages = 1;
|
|
|
|
|
2009-02-13 08:41:25 +08:00
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
|
2008-10-18 10:25:01 +08:00
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
2007-09-08 05:46:51 +08:00
|
|
|
if (ret) {
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
|
|
|
|
ocfs2_set_inode_data_inline(inode, di);
|
|
|
|
|
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
|
|
|
|
if (ret) {
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
wc->w_handle = handle;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return 1 if new_size is no larger than the id_count recorded in the
 * inline-data header of the dinode held in di_bh, and 0 otherwise.
 */
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

	return new_size <= le16_to_cpu(di->id2.i_data.id_count) ? 1 : 0;
}
|
|
|
|
|
|
|
|
/*
 * Decide whether the write [pos, pos + len) can be kept inside the inode
 * as inline data and, if so, set the inline write up.
 *
 * Returns 1 when the inline write has been prepared, 0 when the caller
 * must take the regular write path, and a negative error on failure. An
 * inode holding inline data that the write no longer fits into is
 * converted to extents here, after which the conversion's result is
 * returned.
 */
static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
					  struct inode *inode, loff_t pos,
					  unsigned len, struct page *mmap_page,
					  struct ocfs2_write_ctxt *wc)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di = NULL;
	loff_t end = pos + len;
	int ret;

	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
					     len, (unsigned long long)pos,
					     oi->ip_dyn_features);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * The inode already carries inline data. An mmap write, or
		 * one that does not fit, forces a switch to an extent list.
		 */
		if (mmap_page != NULL ||
		    !ocfs2_size_fits_inline_data(wc->w_di_bh, end)) {
			ret = ocfs2_convert_inline_data_to_extents(inode,
								   wc->w_di_bh);
			if (ret)
				mlog_errno(ret);
			return ret;
		}
	} else {
		/* Only an inode with no clusters and zero size qualifies. */
		if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
			return 0;

		/* The write also has to fit, and must not be an mmap write. */
		di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
		if (mmap_page ||
		    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
			return 0;
	}

	ret = ocfs2_write_begin_inline(mapping, inode, wc);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* Tell the caller that the data can be written inline. */
	return 1;
}
|
|
|
|
|
2007-08-29 08:13:23 +08:00
|
|
|
/*
|
|
|
|
* This function only does anything for file systems which can't
|
|
|
|
* handle sparse files.
|
|
|
|
*
|
|
|
|
* What we want to do here is fill in any hole between the current end
|
|
|
|
* of allocation and the end of our write. That way the rest of the
|
|
|
|
* write path can treat it as an non-allocating write, which has no
|
|
|
|
* special case code for sparse/nonsparse files.
|
|
|
|
*/
|
2010-07-02 06:13:31 +08:00
|
|
|
static int ocfs2_expand_nonsparse_inode(struct inode *inode,
|
|
|
|
struct buffer_head *di_bh,
|
|
|
|
loff_t pos, unsigned len,
|
2007-08-29 08:13:23 +08:00
|
|
|
struct ocfs2_write_ctxt *wc)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
loff_t newsize = pos + len;
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
|
2007-08-29 08:13:23 +08:00
|
|
|
|
|
|
|
if (newsize <= i_size_read(inode))
|
|
|
|
return 0;
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
|
2007-08-29 08:13:23 +08:00
|
|
|
if (ret)
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
/* There is no wc if this is call from direct. */
|
|
|
|
if (wc)
|
|
|
|
wc->w_first_new_cpos =
|
|
|
|
ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
|
2009-08-07 07:12:58 +08:00
|
|
|
|
2007-08-29 08:13:23 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
|
|
|
|
loff_t pos)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
|
|
|
|
if (pos > i_size_read(inode))
|
|
|
|
ret = ocfs2_zero_extend(inode, di_bh, pos);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-11-04 15:14:11 +08:00
|
|
|
/*
|
|
|
|
* Try to flush truncate logs if we can free enough clusters from it.
|
|
|
|
* As for return value, "< 0" means error, "0" no space and "1" means
|
|
|
|
* we have freed enough spaces and let the caller try to allocate again.
|
|
|
|
*/
|
|
|
|
static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
|
|
|
|
unsigned int needed)
|
|
|
|
{
|
|
|
|
tid_t target;
|
|
|
|
int ret = 0;
|
|
|
|
unsigned int truncated_clusters;
|
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_lock(osb->osb_tl_inode);
|
2010-11-04 15:14:11 +08:00
|
|
|
truncated_clusters = osb->truncated_clusters;
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(osb->osb_tl_inode);
|
2010-11-04 15:14:11 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can succeed in allocating if we free
|
|
|
|
* the truncate log.
|
|
|
|
*/
|
|
|
|
if (truncated_clusters < needed)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = ocfs2_flush_truncate_log(osb);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
|
|
|
|
jbd2_log_wait_commit(osb->journal->j_journal, target);
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
int ocfs2_write_begin_nolock(struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, ocfs2_write_type_t type,
|
2007-05-15 09:09:54 +08:00
|
|
|
struct page **pagep, void **fsdata,
|
|
|
|
struct buffer_head *di_bh, struct page *mmap_page)
|
|
|
|
{
|
2009-08-07 07:12:58 +08:00
|
|
|
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
|
2010-11-04 15:14:11 +08:00
|
|
|
unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
|
2007-05-15 09:09:54 +08:00
|
|
|
struct ocfs2_write_ctxt *wc;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
struct ocfs2_dinode *di;
|
|
|
|
struct ocfs2_alloc_context *data_ac = NULL;
|
|
|
|
struct ocfs2_alloc_context *meta_ac = NULL;
|
|
|
|
handle_t *handle;
|
2008-08-21 10:36:33 +08:00
|
|
|
struct ocfs2_extent_tree et;
|
2010-11-04 15:14:11 +08:00
|
|
|
int try_free = 1, ret1;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
2010-11-04 15:14:11 +08:00
|
|
|
try_again:
|
2016-03-26 05:20:52 +08:00
|
|
|
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
|
2007-05-15 09:09:54 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
if (ocfs2_supports_inline_data(osb)) {
|
|
|
|
ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
|
|
|
|
mmap_page, wc);
|
|
|
|
if (ret == 1) {
|
|
|
|
ret = 0;
|
|
|
|
goto success;
|
|
|
|
}
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
/* Direct io change i_size late, should not zero tail here. */
|
|
|
|
if (type != OCFS2_WRITE_DIRECT) {
|
|
|
|
if (ocfs2_sparse_alloc(osb))
|
|
|
|
ret = ocfs2_zero_tail(inode, di_bh, pos);
|
|
|
|
else
|
|
|
|
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
|
|
|
|
len, wc);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2007-08-29 08:13:23 +08:00
|
|
|
}
|
|
|
|
|
2009-08-25 08:02:48 +08:00
|
|
|
ret = ocfs2_check_range_for_refcount(inode, pos, len);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
} else if (ret == 1) {
|
2010-11-04 15:14:11 +08:00
|
|
|
clusters_need = wc->w_clen;
|
2013-08-14 07:00:58 +08:00
|
|
|
ret = ocfs2_refcount_cow(inode, di_bh,
|
2009-08-26 09:47:28 +08:00
|
|
|
wc->w_cpos, wc->w_clen, UINT_MAX);
|
2009-08-25 08:02:48 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
|
|
|
|
&extents_to_split);
|
2007-05-15 09:09:54 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2010-11-04 15:14:11 +08:00
|
|
|
clusters_need += clusters_to_alloc;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
|
|
|
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
trace_ocfs2_write_begin_nolock(
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno,
|
|
|
|
(long long)i_size_read(inode),
|
|
|
|
le32_to_cpu(di->i_clusters),
|
2016-03-26 05:20:52 +08:00
|
|
|
pos, len, type, mmap_page,
|
2011-02-22 21:33:59 +08:00
|
|
|
clusters_to_alloc, extents_to_split);
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* We set w_target_from, w_target_to here so that
|
|
|
|
* ocfs2_write_end() knows which range in the target page to
|
|
|
|
* write out. An allocation requires that we write the entire
|
|
|
|
* cluster range.
|
|
|
|
*/
|
2007-06-19 02:22:56 +08:00
|
|
|
if (clusters_to_alloc || extents_to_split) {
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* XXX: We are stretching the limits of
|
2007-06-19 02:22:56 +08:00
|
|
|
* ocfs2_lock_allocators(). It greatly over-estimates
|
2007-05-09 08:47:32 +08:00
|
|
|
* the work to be done.
|
|
|
|
*/
|
2009-02-13 19:54:22 +08:00
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
|
|
|
|
wc->w_di_bh);
|
2008-08-21 10:36:33 +08:00
|
|
|
ret = ocfs2_lock_allocators(inode, &et,
|
2008-08-18 17:38:42 +08:00
|
|
|
clusters_to_alloc, extents_to_split,
|
2008-08-21 10:36:33 +08:00
|
|
|
&data_ac, &meta_ac);
|
2007-02-10 12:24:12 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
goto out;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2009-12-08 05:15:40 +08:00
|
|
|
if (data_ac)
|
|
|
|
data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
|
|
|
|
|
2008-08-18 17:38:43 +08:00
|
|
|
credits = ocfs2_calc_extend_credits(inode->i_sb,
|
2013-11-13 07:06:52 +08:00
|
|
|
&di->id2.i_list);
|
2016-03-26 05:21:01 +08:00
|
|
|
} else if (type == OCFS2_WRITE_DIRECT)
|
|
|
|
/* direct write needs not to start trans if no extents alloc. */
|
|
|
|
goto success;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
/*
|
|
|
|
* We have to zero sparse allocated clusters, unwritten extent clusters,
|
|
|
|
* and non-sparse clusters we just extended. For non-sparse writes,
|
|
|
|
* we know zeros will only be needed in the first and/or last cluster.
|
|
|
|
*/
|
2016-03-26 05:21:06 +08:00
|
|
|
if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
|
|
|
|
wc->w_desc[wc->w_clen - 1].c_needs_zero))
|
2009-08-07 07:12:58 +08:00
|
|
|
cluster_of_pages = 1;
|
|
|
|
else
|
|
|
|
cluster_of_pages = 0;
|
|
|
|
|
|
|
|
ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
handle = ocfs2_start_trans(osb, credits);
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
goto out;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
wc->w_handle = handle;
|
|
|
|
|
2010-03-03 22:05:00 +08:00
|
|
|
if (clusters_to_alloc) {
|
|
|
|
ret = dquot_alloc_space_nodirty(inode,
|
|
|
|
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
|
|
|
|
if (ret)
|
|
|
|
goto out_commit;
|
2008-10-10 01:38:40 +08:00
|
|
|
}
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
|
2009-02-13 08:41:25 +08:00
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
|
2008-10-18 10:25:01 +08:00
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
2007-05-09 08:47:32 +08:00
|
|
|
if (ret) {
|
2007-02-10 12:24:12 +08:00
|
|
|
mlog_errno(ret);
|
2008-10-10 01:38:40 +08:00
|
|
|
goto out_quota;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
/*
|
|
|
|
* Fill our page array first. That way we've grabbed enough so
|
|
|
|
* that we can zero and flush if we error after adding the
|
|
|
|
* extent.
|
|
|
|
*/
|
2010-07-03 08:20:27 +08:00
|
|
|
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
|
2009-08-07 07:12:58 +08:00
|
|
|
cluster_of_pages, mmap_page);
|
2011-07-25 01:36:54 +08:00
|
|
|
if (ret && ret != -EAGAIN) {
|
2007-02-10 12:24:12 +08:00
|
|
|
mlog_errno(ret);
|
2008-10-10 01:38:40 +08:00
|
|
|
goto out_quota;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2011-07-25 01:36:54 +08:00
|
|
|
/*
|
|
|
|
* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
|
|
|
|
* the target page. In this case, we exit with no error and no target
|
|
|
|
* page. This will trigger the caller, page_mkwrite(), to re-try
|
|
|
|
* the operation.
|
|
|
|
*/
|
|
|
|
if (ret == -EAGAIN) {
|
|
|
|
BUG_ON(wc->w_target_page);
|
|
|
|
ret = 0;
|
|
|
|
goto out_quota;
|
|
|
|
}
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
|
|
|
|
len);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
2008-10-10 01:38:40 +08:00
|
|
|
goto out_quota;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
if (data_ac)
|
|
|
|
ocfs2_free_alloc_context(data_ac);
|
|
|
|
if (meta_ac)
|
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
success:
|
2016-03-26 05:20:58 +08:00
|
|
|
if (pagep)
|
|
|
|
*pagep = wc->w_target_page;
|
2007-05-09 08:47:32 +08:00
|
|
|
*fsdata = wc;
|
|
|
|
return 0;
|
2008-10-10 01:38:40 +08:00
|
|
|
out_quota:
|
|
|
|
if (clusters_to_alloc)
|
2010-03-03 22:05:00 +08:00
|
|
|
dquot_free_space(inode,
|
2008-10-10 01:38:40 +08:00
|
|
|
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
|
2007-02-10 12:24:12 +08:00
|
|
|
out_commit:
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
out:
|
2016-03-26 05:21:06 +08:00
|
|
|
ocfs2_free_write_ctxt(inode, wc);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
2013-11-13 07:07:06 +08:00
|
|
|
if (data_ac) {
|
2007-02-10 12:24:12 +08:00
|
|
|
ocfs2_free_alloc_context(data_ac);
|
2013-11-13 07:07:06 +08:00
|
|
|
data_ac = NULL;
|
|
|
|
}
|
|
|
|
if (meta_ac) {
|
2007-02-10 12:24:12 +08:00
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2013-11-13 07:07:06 +08:00
|
|
|
meta_ac = NULL;
|
|
|
|
}
|
2010-11-04 15:14:11 +08:00
|
|
|
|
|
|
|
if (ret == -ENOSPC && try_free) {
|
|
|
|
/*
|
|
|
|
* Try to free some truncate log so that we can have enough
|
|
|
|
* clusters to allocate.
|
|
|
|
*/
|
|
|
|
try_free = 0;
|
|
|
|
|
|
|
|
ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
|
|
|
|
if (ret1 == 1)
|
|
|
|
goto try_again;
|
|
|
|
|
|
|
|
if (ret1 < 0)
|
|
|
|
mlog_errno(ret1);
|
|
|
|
}
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:24 +08:00
|
|
|
static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned flags,
|
|
|
|
struct page **pagep, void **fsdata)
|
2007-05-10 06:14:45 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
2007-10-19 06:30:42 +08:00
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
2007-05-10 06:14:45 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Take alloc sem here to prevent concurrent lookups. That way
|
|
|
|
* the mapping, zeroing and tree manipulation within
|
|
|
|
* ocfs2_write() will be safe against ->readpage(). This
|
|
|
|
* should also serve to lock out allocation from a shared
|
|
|
|
* writeable region.
|
|
|
|
*/
|
|
|
|
down_write(&OCFS2_I(inode)->ip_alloc_sem);
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
|
|
|
|
pagep, fsdata, di_bh, NULL);
|
2007-05-10 06:14:45 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
2007-10-19 06:23:46 +08:00
|
|
|
goto out_fail;
|
2007-05-10 06:14:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
brelse(di_bh);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_fail:
|
|
|
|
up_write(&OCFS2_I(inode)->ip_alloc_sem);
|
|
|
|
|
|
|
|
brelse(di_bh);
|
2007-10-19 06:30:42 +08:00
|
|
|
ocfs2_inode_unlock(inode, 1);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
/*
 * Finish a write to an inode that carries OCFS2_INLINE_DATA_FL: copy the
 * bytes that landed in the target page into the dinode's inline data area.
 *
 * @copied is in/out.  If fewer than @len bytes were copied and the target
 * page is not uptodate, *copied is forced to 0 and nothing is transferred.
 * Otherwise *copied bytes are copied from offset @pos of the page to offset
 * @pos of di->id2.i_data.id_data and *copied is left untouched.
 */
static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
				   unsigned len, unsigned *copied,
				   struct ocfs2_dinode *di,
				   struct ocfs2_write_ctxt *wc)
{
	struct page *target = wc->w_target_page;
	void *page_addr;

	/* Short copy into a page that is not uptodate: report zero bytes. */
	if (unlikely(*copied < len) && !PageUptodate(target)) {
		*copied = 0;
		return;
	}

	/* The same offset is used in the page and in the inline area. */
	page_addr = kmap_atomic(target);
	memcpy(di->id2.i_data.id_data + pos, page_addr + pos, *copied);
	kunmap_atomic(page_addr);

	trace_ocfs2_write_end_inline(
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)pos, *copied,
	     le16_to_cpu(di->id2.i_data.id_count),
	     le16_to_cpu(di->i_dyn_features));
}
|
|
|
|
|
2007-05-10 06:16:19 +08:00
|
|
|
int ocfs2_write_end_nolock(struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
|
|
struct page *page, void *fsdata)
|
2007-05-09 08:47:32 +08:00
|
|
|
{
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
int i, ret;
|
2007-05-09 08:47:32 +08:00
|
|
|
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
struct ocfs2_write_ctxt *wc = fsdata;
|
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
handle_t *handle = wc->w_handle;
|
|
|
|
struct page *tmppage;
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
BUG_ON(!list_empty(&wc->w_unwritten_list));
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
if (handle) {
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
|
|
|
|
wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
|
|
|
|
if (ret) {
|
|
|
|
copied = ret;
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
|
|
|
|
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
|
|
|
|
goto out_write_size;
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
if (unlikely(copied < len) && wc->w_target_page) {
|
2007-05-09 08:47:32 +08:00
|
|
|
if (!PageUptodate(wc->w_target_page))
|
|
|
|
copied = 0;
|
|
|
|
|
|
|
|
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
|
|
|
|
start+len);
|
|
|
|
}
|
2016-03-26 05:20:58 +08:00
|
|
|
if (wc->w_target_page)
|
|
|
|
flush_dcache_page(wc->w_target_page);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
tmppage = wc->w_pages[i];
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
/* This is the direct io target page. */
|
|
|
|
if (tmppage == NULL)
|
|
|
|
continue;
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
if (tmppage == wc->w_target_page) {
|
|
|
|
from = wc->w_target_from;
|
|
|
|
to = wc->w_target_to;
|
|
|
|
|
|
|
|
BUG_ON(from > PAGE_CACHE_SIZE ||
|
|
|
|
to > PAGE_CACHE_SIZE ||
|
|
|
|
to < from);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Pages adjacent to the target (if any) imply
|
|
|
|
* a hole-filling write in which case we want
|
|
|
|
* to flush their entire range.
|
|
|
|
*/
|
|
|
|
from = 0;
|
|
|
|
to = PAGE_CACHE_SIZE;
|
|
|
|
}
|
|
|
|
|
2008-07-17 08:22:22 +08:00
|
|
|
if (page_has_buffers(tmppage)) {
|
2016-03-26 05:21:01 +08:00
|
|
|
if (handle && ocfs2_should_order_data(inode))
|
|
|
|
ocfs2_jbd2_file_inode(handle, inode);
|
2008-07-17 08:22:22 +08:00
|
|
|
block_commit_write(tmppage, from, to);
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
}
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
out_write_size:
|
2016-03-26 05:21:01 +08:00
|
|
|
/* Direct io do not update i_size here. */
|
|
|
|
if (wc->w_type != OCFS2_WRITE_DIRECT) {
|
|
|
|
pos += copied;
|
|
|
|
if (pos > i_size_read(inode)) {
|
|
|
|
i_size_write(inode, pos);
|
|
|
|
mark_inode_dirty(inode);
|
|
|
|
}
|
|
|
|
inode->i_blocks = ocfs2_inode_sector_count(inode);
|
|
|
|
di->i_size = cpu_to_le64((u64)i_size_read(inode));
|
|
|
|
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
|
|
|
|
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
|
|
|
|
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
|
|
|
|
ocfs2_update_inode_fsync_trans(handle, inode, 1);
|
|
|
|
}
|
|
|
|
if (handle)
|
|
|
|
ocfs2_journal_dirty(handle, wc->w_di_bh);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
out:
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
|
|
|
|
* lock, or it will cause a deadlock since journal commit threads holds
|
|
|
|
* this lock and will ask for the page lock when flushing the data.
|
|
|
|
* put it here to preserve the unlock order.
|
|
|
|
*/
|
|
|
|
ocfs2_unlock_pages(wc);
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
if (handle)
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
2007-06-23 06:52:36 +08:00
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
ocfs2_run_deallocs(osb, &wc->w_dealloc);
|
|
|
|
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
brelse(wc->w_di_bh);
|
|
|
|
kfree(wc);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
|
|
|
return copied;
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:24 +08:00
|
|
|
static int ocfs2_write_end(struct file *file, struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
|
|
struct page *page, void *fsdata)
|
2007-05-10 06:14:45 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
|
|
|
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
up_write(&OCFS2_I(inode)->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
ocfs2_inode_unlock(inode, 1);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
2007-05-10 06:14:45 +08:00
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
}
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
struct ocfs2_dio_write_ctxt {
|
|
|
|
struct list_head dw_zero_list;
|
|
|
|
unsigned dw_zero_count;
|
|
|
|
int dw_orphaned;
|
|
|
|
pid_t dw_writer_pid;
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct ocfs2_dio_write_ctxt *
|
|
|
|
ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
|
|
|
|
{
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc = NULL;
|
|
|
|
|
|
|
|
if (bh->b_private)
|
|
|
|
return bh->b_private;
|
|
|
|
|
|
|
|
dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
|
|
|
|
if (dwc == NULL)
|
|
|
|
return NULL;
|
|
|
|
INIT_LIST_HEAD(&dwc->dw_zero_list);
|
|
|
|
dwc->dw_zero_count = 0;
|
|
|
|
dwc->dw_orphaned = 0;
|
|
|
|
dwc->dw_writer_pid = task_pid_nr(current);
|
|
|
|
bh->b_private = dwc;
|
|
|
|
*alloc = 1;
|
|
|
|
|
|
|
|
return dwc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ocfs2_dio_free_write_ctx(struct inode *inode,
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc)
|
|
|
|
{
|
|
|
|
ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
|
|
|
|
kfree(dwc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO: Make this into a generic get_blocks function.
|
|
|
|
*
|
|
|
|
* From do_direct_io in direct-io.c:
|
|
|
|
* "So what we do is to permit the ->get_blocks function to populate
|
|
|
|
* bh.b_size with the size of IO which is permitted at this offset and
|
|
|
|
* this i_blkbits."
|
|
|
|
*
|
|
|
|
* This function is called directly from get_more_blocks in direct-io.c.
|
|
|
|
*
|
|
|
|
* called like this: dio->get_blocks(dio->inode, fs_startblk,
|
|
|
|
* fs_count, map_bh, dio->rw == WRITE);
|
|
|
|
*/
|
|
|
|
static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
|
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
2016-03-26 05:21:18 +08:00
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
struct ocfs2_write_ctxt *wc;
|
|
|
|
struct ocfs2_write_cluster_desc *desc = NULL;
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc = NULL;
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
u64 p_blkno;
|
|
|
|
loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
|
|
|
|
unsigned len, total_len = bh_result->b_size;
|
|
|
|
int ret = 0, first_get_block = 0;
|
|
|
|
|
|
|
|
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
|
|
|
|
len = min(total_len, len);
|
|
|
|
|
|
|
|
mlog(0, "get block of %lu at %llu:%u req %u\n",
|
|
|
|
inode->i_ino, pos, len, total_len);
|
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
/*
|
|
|
|
* Because we need to change file size in ocfs2_dio_end_io_write(), or
|
|
|
|
* we may need to add it to orphan dir. So can not fall to fast path
|
|
|
|
* while file size will be changed.
|
|
|
|
*/
|
|
|
|
if (pos + total_len <= i_size_read(inode)) {
|
|
|
|
down_read(&oi->ip_alloc_sem);
|
|
|
|
/* This is the fast path for re-write. */
|
|
|
|
ret = ocfs2_get_block(inode, iblock, bh_result, create);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
up_read(&oi->ip_alloc_sem);
|
2016-03-26 05:21:18 +08:00
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
if (buffer_mapped(bh_result) &&
|
|
|
|
!buffer_new(bh_result) &&
|
|
|
|
ret == 0)
|
|
|
|
goto out;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
/* Clear state set by ocfs2_get_block. */
|
|
|
|
bh_result->b_state = 0;
|
|
|
|
}
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
|
|
|
|
if (unlikely(dwc == NULL)) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
|
|
|
|
ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
|
|
|
|
!dwc->dw_orphaned) {
|
|
|
|
/*
|
|
|
|
* when we are going to alloc extents beyond file size, add the
|
|
|
|
* inode to orphan dir, so we can recall those spaces when
|
|
|
|
* system crashed during write.
|
|
|
|
*/
|
|
|
|
ret = ocfs2_add_inode_to_orphan(osb, inode);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
dwc->dw_orphaned = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:21:18 +08:00
|
|
|
down_write(&oi->ip_alloc_sem);
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
if (first_get_block) {
|
|
|
|
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
|
|
|
|
ret = ocfs2_zero_tail(inode, di_bh, pos);
|
|
|
|
else
|
|
|
|
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
|
|
|
|
total_len, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
|
|
|
|
OCFS2_WRITE_DIRECT, NULL,
|
|
|
|
(void **)&wc, di_bh, NULL);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
desc = &wc->w_desc[0];
|
|
|
|
|
|
|
|
p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
|
|
|
|
BUG_ON(p_blkno == 0);
|
|
|
|
p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
|
|
|
|
|
|
|
|
map_bh(bh_result, inode->i_sb, p_blkno);
|
|
|
|
bh_result->b_size = len;
|
|
|
|
if (desc->c_needs_zero)
|
|
|
|
set_buffer_new(bh_result);
|
|
|
|
|
|
|
|
/* May sleep in end_io. It should not happen in a irq context. So defer
|
|
|
|
* it to dio work queue. */
|
|
|
|
set_buffer_defer_completion(bh_result);
|
|
|
|
|
|
|
|
if (!list_empty(&wc->w_unwritten_list)) {
|
|
|
|
struct ocfs2_unwritten_extent *ue = NULL;
|
|
|
|
|
|
|
|
ue = list_first_entry(&wc->w_unwritten_list,
|
|
|
|
struct ocfs2_unwritten_extent,
|
|
|
|
ue_node);
|
|
|
|
BUG_ON(ue->ue_cpos != desc->c_cpos);
|
|
|
|
/* The physical address may be 0, fill it. */
|
|
|
|
ue->ue_phys = desc->c_phys;
|
|
|
|
|
|
|
|
list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
|
|
|
|
dwc->dw_zero_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
|
|
|
|
BUG_ON(ret != len);
|
|
|
|
ret = 0;
|
|
|
|
unlock:
|
2016-03-26 05:21:18 +08:00
|
|
|
up_write(&oi->ip_alloc_sem);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
ocfs2_inode_unlock(inode, 1);
|
|
|
|
brelse(di_bh);
|
|
|
|
out:
|
|
|
|
if (ret < 0)
|
|
|
|
ret = -EIO;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ocfs2_dio_end_io_write(struct inode *inode,
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc,
|
|
|
|
loff_t offset,
|
|
|
|
ssize_t bytes)
|
|
|
|
{
|
|
|
|
struct ocfs2_cached_dealloc_ctxt dealloc;
|
|
|
|
struct ocfs2_extent_tree et;
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
2016-03-26 05:21:18 +08:00
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
struct ocfs2_unwritten_extent *ue = NULL;
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
struct ocfs2_dinode *di;
|
|
|
|
struct ocfs2_alloc_context *data_ac = NULL;
|
|
|
|
struct ocfs2_alloc_context *meta_ac = NULL;
|
|
|
|
handle_t *handle = NULL;
|
|
|
|
loff_t end = offset + bytes;
|
|
|
|
int ret = 0, credits = 0, locked = 0;
|
|
|
|
|
|
|
|
ocfs2_init_dealloc_ctxt(&dealloc);
|
|
|
|
|
|
|
|
/* We do clear unwritten, delete orphan, change i_size here. If neither
|
|
|
|
* of these happen, we can skip all this. */
|
|
|
|
if (list_empty(&dwc->dw_zero_list) &&
|
|
|
|
end <= i_size_read(inode) &&
|
|
|
|
!dwc->dw_orphaned)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
|
|
|
|
* are in that context. */
|
|
|
|
if (dwc->dw_writer_pid != task_pid_nr(current)) {
|
|
|
|
mutex_lock(&inode->i_mutex);
|
|
|
|
locked = 1;
|
|
|
|
}
|
|
|
|
|
2016-03-26 05:21:18 +08:00
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
down_write(&oi->ip_alloc_sem);
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
/* Delete orphan before acquire i_mutex. */
|
|
|
|
if (dwc->dw_orphaned) {
|
|
|
|
BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
|
|
|
|
|
|
|
|
end = end > i_size_read(inode) ? end : 0;
|
|
|
|
|
|
|
|
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
|
|
|
|
!!end, end);
|
|
|
|
if (ret < 0)
|
|
|
|
mlog_errno(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
di = (struct ocfs2_dinode *)di_bh;
|
|
|
|
|
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
|
|
|
|
|
|
|
|
ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
|
|
|
|
&data_ac, &meta_ac);
|
2016-03-26 05:21:23 +08:00
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto unlock;
|
|
|
|
}
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
|
|
|
|
|
|
|
|
handle = ocfs2_start_trans(osb, credits);
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
|
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
|
|
|
if (ret) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
goto commit;
|
|
|
|
}
|
|
|
|
|
|
|
|
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
|
|
|
|
ret = ocfs2_mark_extent_written(inode, &et, handle,
|
|
|
|
ue->ue_cpos, 1,
|
|
|
|
ue->ue_phys,
|
|
|
|
meta_ac, &dealloc);
|
|
|
|
if (ret < 0) {
|
|
|
|
mlog_errno(ret);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (end > i_size_read(inode)) {
|
|
|
|
ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
|
|
|
|
if (ret < 0)
|
|
|
|
mlog_errno(ret);
|
|
|
|
}
|
|
|
|
commit:
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
unlock:
|
2016-03-26 05:21:18 +08:00
|
|
|
up_write(&oi->ip_alloc_sem);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
ocfs2_inode_unlock(inode, 1);
|
|
|
|
brelse(di_bh);
|
|
|
|
out:
|
|
|
|
if (data_ac)
|
|
|
|
ocfs2_free_alloc_context(data_ac);
|
|
|
|
if (meta_ac)
|
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2016-03-26 05:21:23 +08:00
|
|
|
ocfs2_run_deallocs(osb, &dealloc);
|
|
|
|
if (locked)
|
|
|
|
mutex_unlock(&inode->i_mutex);
|
|
|
|
ocfs2_dio_free_write_ctx(inode, dwc);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
|
|
|
|
* particularly interested in the aio/dio case. We use the rw_lock DLM lock
|
|
|
|
* to protect io on one node from truncation on another.
|
|
|
|
*/
|
|
|
|
static int ocfs2_dio_end_io(struct kiocb *iocb,
|
|
|
|
loff_t offset,
|
|
|
|
ssize_t bytes,
|
|
|
|
void *private)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
int level;
|
|
|
|
|
|
|
|
if (bytes <= 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* this io's submitter should not have unlocked this before we could */
|
|
|
|
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
|
|
|
|
|
|
|
|
if (private)
|
|
|
|
ocfs2_dio_end_io_write(inode, private, offset, bytes);
|
|
|
|
|
|
|
|
ocfs2_iocb_clear_rw_locked(iocb);
|
|
|
|
|
|
|
|
level = ocfs2_iocb_rw_locked_level(iocb);
|
|
|
|
ocfs2_rw_unlock(inode, level);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
|
|
|
loff_t offset)
|
|
|
|
{
|
|
|
|
struct file *file = iocb->ki_filp;
|
|
|
|
struct inode *inode = file_inode(file)->i_mapping->host;
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
loff_t end = offset + iter->count;
|
|
|
|
get_block_t *get_block;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fallback to buffered I/O if we see an inode without
|
|
|
|
* extents.
|
|
|
|
*/
|
|
|
|
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Fallback to buffered I/O if we do not support append dio. */
|
|
|
|
if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (iov_iter_rw(iter) == READ)
|
|
|
|
get_block = ocfs2_get_block;
|
|
|
|
else
|
|
|
|
get_block = ocfs2_dio_get_block;
|
|
|
|
|
|
|
|
return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
|
|
|
|
iter, offset, get_block,
|
|
|
|
ocfs2_dio_end_io, NULL, 0);
|
|
|
|
}
|
|
|
|
|
2006-06-28 19:26:44 +08:00
|
|
|
const struct address_space_operations ocfs2_aops = {
|
2009-03-05 16:22:21 +08:00
|
|
|
.readpage = ocfs2_readpage,
|
|
|
|
.readpages = ocfs2_readpages,
|
|
|
|
.writepage = ocfs2_writepage,
|
|
|
|
.write_begin = ocfs2_write_begin,
|
|
|
|
.write_end = ocfs2_write_end,
|
|
|
|
.bmap = ocfs2_bmap,
|
|
|
|
.direct_IO = ocfs2_direct_IO,
|
2013-11-13 07:07:08 +08:00
|
|
|
.invalidatepage = block_invalidatepage,
|
2009-03-05 16:22:21 +08:00
|
|
|
.releasepage = ocfs2_releasepage,
|
|
|
|
.migratepage = buffer_migrate_page,
|
|
|
|
.is_partially_uptodate = block_is_partially_uptodate,
|
2009-09-16 17:50:16 +08:00
|
|
|
.error_remove_page = generic_error_remove_page,
|
2005-12-16 06:31:24 +08:00
|
|
|
};
|