From c726de4409a8d3a03877b1ef4342bfe8a15f5e5e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Nov 2010 15:14:39 +1100 Subject: [PATCH 1/5] xfs: fix failed write truncation handling. Since the move to the new truncate sequence we call xfs_setattr to truncate down excessively instanciated blocks. As shown by the testcase in kernel.org BZ #22452 that doesn't work too well. Due to the confusion of the internal inode size, and the VFS inode i_size it zeroes data that it shouldn't. But full blown truncate seems like overkill here. We only instanciate delayed allocations in the write path, and given that we never released the iolock we can't have converted them to real allocations yet either. The only nasty case is pre-existing preallocation which we need to skip. We already do this for page discard during writeback, so make the delayed allocation block punching a generic function and call it from the failed write path as well as xfs_aops_discard_page. The callers are responsible for ensuring that partial blocks are not truncated away, and that they hold the ilock. Based on a fix originally from Christoph Hellwig. This version used filesystem blocks as the range unit. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 94 ++++++++++++++++--------------------- fs/xfs/xfs_bmap.c | 76 ++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap.h | 5 ++ 3 files changed, 121 insertions(+), 54 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7d287afccde5..691f61223ed6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -934,7 +934,6 @@ xfs_aops_discard_page( struct xfs_inode *ip = XFS_I(inode); struct buffer_head *bh, *head; loff_t offset = page_offset(page); - ssize_t len = 1 << inode->i_blkbits; if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; @@ -949,58 +948,14 @@ xfs_aops_discard_page( xfs_ilock(ip, XFS_ILOCK_EXCL); bh = head = page_buffers(page); do { - int done; - xfs_fileoff_t offset_fsb; - xfs_bmbt_irec_t imap; - int nimaps = 1; int error; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + xfs_fileoff_t start_fsb; if (!buffer_delay(bh)) goto next_buffer; - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi(NULL, ip, offset_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, - "page discard failed delalloc mapping lookup."); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_buffer; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_buffer; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, &done); - - ASSERT(!flist.xbf_count && !flist.xbf_first); + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { @@ -1010,7 +965,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += len; + offset += 1 << inode->i_blkbits; } while ((bh = bh->b_this_page) != head); @@ -1505,11 +1460,42 @@ xfs_vm_write_failed( struct inode *inode = mapping->host; if (to > inode->i_size) { - struct iattr ia = { - .ia_valid = ATTR_SIZE | ATTR_FORCE, - .ia_size = inode->i_size, - }; - xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); + /* + * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we jus truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back. + */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); } } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e32e13..08b179fa9e8f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6070,3 +6070,79 @@ xfs_bmap_disk_count_leaves( *count += xfs_bmbt_disk_get_blockcount(frp); } } + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 71ec9b6ecdfc..3651191daea1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -394,6 +394,11 @@ xfs_bmap_count_blocks( int whichfork, int *count); +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ From 90810b9e82a36c3c57c1aeb8b2918b242a130b26 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Nov 2010 15:16:16 +1100 Subject: [PATCH 2/5] xfs: push stale, pinned buffers on trylock failures As reported by Nick Piggin, XFS is suffering from long pauses under highly concurrent workloads when hosted on ramdisks. The problem is that an inode buffer is stuck in the pinned state in memory and as a result either the inode buffer or one of the inodes within the buffer is stopping the tail of the log from being moved forward. The system remains in this state until a periodic log force issued by xfssyncd causes the buffer to be unpinned. The main problem is that these are stale buffers, and are hence held locked until the transaction/checkpoint that marked them state has been committed to disk. When the filesystem gets into this state, only the xfssyncd can cause the async transactions to be committed to disk and hence unpin the inode buffer. This problem was encountered when scaling the busy extent list, but only the blocking lock interface was fixed to solve the problem. Extend the same fix to the buffer trylock operations - if we fail to lock a pinned, stale buffer, then force the log immediately so that when the next attempt to lock it comes around, it will have been unpinned. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1d353def29..4c5deb6e9e31 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -488,29 +488,16 @@ _xfs_buf_find( spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - if (down_trylock(&bp->b_sema)) { + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ if (!(flags & XBF_TRYLOCK)) { - /* wait for buffer ownership */ xfs_buf_lock(bp); XFS_STATS_INC(xb_get_locked_waited); } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this buffer at least overlaps our - * buffer and is locked, therefore our buffer - * either does not exist, or is this buffer. - */ xfs_buf_rele(bp); XFS_STATS_INC(xb_busy_locked); return NULL; } - } else { - /* trylock worked */ - XB_SET_OWNER(bp); } if (bp->b_flags & XBF_STALE) { @@ -876,10 +863,18 @@ xfs_buf_rele( */ /* - * Locks a buffer object, if it is not already locked. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. + * Locks a buffer object, if it is not already locked. Note that this in + * no way locks the underlying pages, so it is only useful for + * synchronizing concurrent use of buffer objects, not for synchronizing + * independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. */ int xfs_buf_cond_lock( @@ -890,6 +885,8 @@ xfs_buf_cond_lock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_cond_lock(bp, _RET_IP_); return locked ? 0 : -EBUSY; From 309c848002052edbec650075a1eb098b17c17f35 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Nov 2010 15:16:02 +1100 Subject: [PATCH 3/5] xfs: delayed alloc blocks beyond EOF are valid after writeback There is an assumption in the parts of XFS that flushing a dirty file will make all the delayed allocation blocks disappear from an inode. That is, that after calling xfs_flush_pages() then ip->i_delayed_blks will be zero. This is an invalid assumption as we may have specualtive preallocation beyond EOF and they are recorded in ip->i_delayed_blks. A flush of the dirty pages of an inode will not change the state of these blocks beyond EOF, so a non-zero deeelalloc block count after a flush is valid. The bmap code has an invalid ASSERT() that needs to be removed, and the swapext code has a bug in that while it swaps the data forks around, it fails to swap the i_delayed_blks counter associated with the fork and hence can get the block accounting wrong. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap.c | 9 +++++++-- fs/xfs/xfs_dfrag.c | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 08b179fa9e8f..4111cd3966c7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5471,8 +5471,13 @@ xfs_getbmap( if (error) goto out_unlock_iolock; } - - ASSERT(ip->i_delayed_blks == 0); + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ } lock = xfs_ilock_map_shared(ip); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c60a22..e60490bc00a6 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -377,6 +377,19 @@ xfs_swap_extents( ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + ilf_fields = XFS_ILOG_CORE; switch(ip->i_d.di_format) { From de25c1818c44f580ff556cb9e0f7a1c687ed870b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Nov 2010 15:15:46 +1100 Subject: [PATCH 4/5] xfs: avoid moving stale inodes in the AIL When an inode has been marked stale because the cluster is being freed, we don't want to (re-)insert this inode into the AIL. There is a race condition where the cluster buffer may be unpinned before the inode is inserted into the AIL during transaction committed processing. If the buffer is unpinned before the inode item has been committed and inserted, then it is possible for the buffer to be released and hence processthe stale inode callbacks before the inode is inserted into the AIL. In this case, we then insert a clean, stale inode into the AIL which will never get removed by an IO completion. It will, however, get reclaimed and that triggers an assert in xfs_inode_free() complaining about freeing an inode still in the AIL. This race can be avoided by not moving stale inodes forward in the AIL during transaction commit completion processing. This closes the race condition by ensuring we never insert clean stale inodes into the AIL. It is safe to do this because a dirty stale inode, by definition, must already be in the AIL. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode_item.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c7ac020705df..7c8d30c453c3 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -657,18 +657,37 @@ xfs_inode_item_unlock( } /* - * This is called to find out where the oldest active copy of the - * inode log item in the on disk log resides now that the last log - * write of it completed at the given lsn. Since we always re-log - * all dirty data in an inode, the latest copy in the on disk log - * is the only one that matters. Therefore, simply return the - * given lsn. + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completions before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, return a lower LSN than the one passed in so that the + * transaction committed code will not move the inode forward in the AIL but + * will still unpin it properly. */ STATIC xfs_lsn_t xfs_inode_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) + return lsn - 1; return lsn; } From c76febef574fd86566bbdf1a73a547a439115c25 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Nov 2010 15:15:31 +1100 Subject: [PATCH 5/5] xfs: only run xfs_error_test if error injection is active Recent tests writing lots of small files showed the flusher thread being CPU bound and taking a long time to do allocations on a debug kernel. perf showed this as the prime reason: samples pcnt function DSO _______ _____ ___________________________ _________________ 224648.00 36.8% xfs_error_test [kernel.kallsyms] 86045.00 14.1% xfs_btree_check_sblock [kernel.kallsyms] 39778.00 6.5% prandom32 [kernel.kallsyms] 37436.00 6.1% xfs_btree_increment [kernel.kallsyms] 29278.00 4.8% xfs_btree_get_rec [kernel.kallsyms] 27717.00 4.5% random32 [kernel.kallsyms] Walking btree blocks during allocation checking them requires each block (a cache hit, so no I/O) call xfs_error_test(), which then does a random32() call as the first operation. IOWs, ~50% of the CPU is being consumed just testing whether we need to inject an error, even though error injection is not active. Kill this overhead when error injection is not active by adding a global counter of active error traps and only calling into xfs_error_test when fault injection is active. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_error.c | 3 +++ fs/xfs/xfs_error.h | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed9990267661..c78cc6a3d87c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ xfs_error_trap(int e) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; int xfs_error_test(int error_tag, int *fsidp, char *expression, @@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) len = strlen(mp->m_fsname); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; return 0; } } @@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) xfs_etest_fsid[i] = 0LL; kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; } } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a072bb82..f338847f80b8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #ifdef DEBUG +extern int xfs_error_test_active; extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); #define XFS_NUM_INJECT_ERROR 10 #define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ + ((expr) || (xfs_error_test_active && \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf))) + (rf)))) extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);