linux/fs/xfs/xfs_dquot.c

1373 lines
34 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2003 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is described in more detail. To read the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads hit various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_error.h"
/*
* Lock order:
*
* ip->i_lock
* qi->qi_tree_lock
* dquot->q_qlock (xfs_dqlock() and friends)
* dquot->q_flush (xfs_dqflock() and friends)
* qi->qi_lru_lock
*
* If two dquots need to be locked the order is user before group/project,
* otherwise by the lowest id first, see xfs_dqlock2.
*/
/*
 * Slab caches used by the quota code.  xfs_qm_dqzone is the cache that
 * xfs_qm_dqdestroy() hands dquots back to.  xfs_qm_dqtrxzone is not static;
 * NOTE(review): presumably it is used from another file -- confirm against
 * its users.
 */
struct kmem_zone *xfs_qm_dqtrxzone;
static struct kmem_zone *xfs_qm_dqzone;
xfs: lockdep needs to know about 3 dquot-deep nesting Michael Semon reported that xfs/299 generated this lockdep warning: ============================================= [ INFO: possible recursive locking detected ] 3.12.0-rc2+ #2 Not tainted --------------------------------------------- touch/21072 is trying to acquire lock: (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64 but task is already holding lock: (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&xfs_dquot_other_class); lock(&xfs_dquot_other_class); *** DEADLOCK *** May be due to missing lock nesting notation 7 locks held by touch/21072: #0: (sb_writers#10){++++.+}, at: [<c11185b6>] mnt_want_write+0x1e/0x3e #1: (&type->i_mutex_dir_key#4){+.+.+.}, at: [<c11078ee>] do_last+0x245/0xe40 #2: (sb_internal#2){++++.+}, at: [<c122c9e0>] xfs_trans_alloc+0x1f/0x35 #3: (&(&ip->i_lock)->mr_lock/1){+.+...}, at: [<c126cd1b>] xfs_ilock+0x100/0x1f1 #4: (&(&ip->i_lock)->mr_lock){++++-.}, at: [<c126cf52>] xfs_ilock_nowait+0x105/0x22f #5: (&dqp->q_qlock){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64 #6: (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64 The lockdep annotation for dquot lock nesting only understands locking for user and "other" dquots, not user, group and quota dquots. Fix the annotations to match the locking heirarchy we now have. Reported-by: Michael L. Semon <mlsemon35@gmail.com> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Ben Myers <bpm@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-09-30 07:37:03 +08:00
/*
 * Lock class keys for group and project dquots.  NOTE(review): presumably
 * these give the dquot lock a separate lockdep class per quota type so that
 * nested user/group/project dquot locking is not reported as recursive
 * locking -- confirm where the keys are applied, which is not visible here.
 */
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
/*
* This is called to free all the memory associated with a dquot
*/
void
xfs_qm_dqdestroy(
struct xfs_dquot *dqp)
{
ASSERT(list_empty(&dqp->q_lru));
xfs: allocate log vector buffers outside CIL context lock One of the problems we currently have with delayed logging is that under serious memory pressure we can deadlock memory reclaim. This occurs when memory reclaim (such as run by kswapd) is reclaiming XFS inodes and issues a log force to unpin inodes that are dirty in the CIL. The CIL is pushed, but this will only occur once it gets the CIL context lock to ensure that all committing transactions are complete and no new transactions start being committed to the CIL while the push switches to a new context. The deadlock occurs when the CIL context lock is held by a committing process that is doing memory allocation for log vector buffers, and that allocation is then blocked on memory reclaim making progress. Memory reclaim, however, is blocked waiting for a log force to make progress, and so we effectively deadlock at this point. To solve this problem, we have to move the CIL log vector buffer allocation outside of the context lock so that memory reclaim can always make progress when it needs to force the log. The problem with doing this is that a CIL push can take place while we are determining if we need to allocate a new log vector buffer for an item and hence the current log vector may go away without warning. That means we cannot rely on the existing log vector being present when we finally grab the context lock and so we must have a replacement buffer ready to go at all times. To ensure this, introduce a "shadow log vector" buffer that is always guaranteed to be present when we gain the CIL context lock and format the item. This shadow buffer may or may not be used during the formatting, but if the log item does not have an existing log vector buffer or that buffer is too small for the new modifications, we swap it for the new shadow buffer and format the modifications into that new log vector buffer. 
The result of this is that for any object we modify more than once in a given CIL checkpoint, we double the memory required to track dirty regions in the log. For single modifications then we consume the shadow log vector we allocate on commit, and that gets consumed by the checkpoint. However, if we make multiple modifications, then the second transaction commit will allocate a shadow log vector and hence we will end up with double the memory usage as only one of the log vectors is consumed by the CIL checkpoint. The remaining shadow vector will be freed when the log item is freed. This can probably be optimised in future - access to the shadow log vector is serialised by the object lock (as opposed to the active log vector, which is controlled by the CIL context lock) and so we can probably free shadow log vector from some objects when the log item is marked clean on removal from the AIL. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-07-22 07:52:35 +08:00
kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is described in more detail. To read the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads hit various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
kmem_cache_free(xfs_qm_dqzone, dqp);
}
/*
 * Apply the per-type default quota limits to a dquot.
 *
 * A default is copied in only when that default is nonzero and the matching
 * limit in the dquot is still zero, so limits already set on the dquot are
 * never overwritten.  Must not be called for the root dquot (id 0).
 *
 * If either block limit was filled in, the speculative preallocation
 * watermarks are recomputed from the new block limits.
 */
void
xfs_qm_adjust_dqlimits(
	struct xfs_mount	*mp,
	struct xfs_dquot	*dq)
{
	struct xfs_def_quota	*defq;
	int			blk_limits_changed = 0;

	ASSERT(dq->q_id);
	defq = xfs_get_defquota(mp->m_quotainfo, xfs_dquot_type(dq));

	/* Block limits: remember changes, the prealloc watermarks use them. */
	if (!dq->q_blk.softlimit && defq->blk.soft) {
		dq->q_blk.softlimit = defq->blk.soft;
		blk_limits_changed = 1;
	}
	if (!dq->q_blk.hardlimit && defq->blk.hard) {
		dq->q_blk.hardlimit = defq->blk.hard;
		blk_limits_changed = 1;
	}

	/* Inode limits. */
	if (!dq->q_ino.softlimit && defq->ino.soft)
		dq->q_ino.softlimit = defq->ino.soft;
	if (!dq->q_ino.hardlimit && defq->ino.hard)
		dq->q_ino.hardlimit = defq->ino.hard;

	/* Realtime block limits. */
	if (!dq->q_rtb.softlimit && defq->rtb.soft)
		dq->q_rtb.softlimit = defq->rtb.soft;
	if (!dq->q_rtb.hardlimit && defq->rtb.hard)
		dq->q_rtb.hardlimit = defq->rtb.hard;

	if (!blk_limits_changed)
		return;
	xfs_dquot_set_prealloc_limits(dq);
}
/*
 * Bring the grace-period timers of a dquot in line with its current usage.
 *
 * For each of the three resources (blocks, inodes, realtime blocks):
 *  - usage above a nonzero soft or hard limit with no timer running starts
 *    the timer at "now + default grace period for this quota type";
 *  - usage within both limits with a timer running clears the timer;
 *  - usage within both limits with no timer running zeroes the warning count.
 *
 * This runs even when quota enforcement is off, which keeps things simple:
 * reservations are just never rejected in that case, and Q_GETQUOTA reports
 * zero timers.  Warnings are never started here; they are only ever set
 * non-zero from userspace and only get reset by this function.
 *
 * Must not be called for the root dquot (id 0).
 */
void
xfs_qm_adjust_dqtimers(
	struct xfs_mount	*mp,
	struct xfs_dquot	*dq)
{
	struct xfs_def_quota	*defq;
	int			over;

	ASSERT(dq->q_id);
	defq = xfs_get_defquota(mp->m_quotainfo, xfs_dquot_type(dq));

#ifdef DEBUG
	/* A nonzero hard limit may never be lower than the soft limit. */
	if (dq->q_blk.hardlimit)
		ASSERT(dq->q_blk.softlimit <= dq->q_blk.hardlimit);
	if (dq->q_ino.hardlimit)
		ASSERT(dq->q_ino.softlimit <= dq->q_ino.hardlimit);
	if (dq->q_rtb.hardlimit)
		ASSERT(dq->q_rtb.softlimit <= dq->q_rtb.hardlimit);
#endif

	/* Data blocks. */
	over = (dq->q_blk.softlimit &&
		dq->q_blk.count > dq->q_blk.softlimit) ||
	       (dq->q_blk.hardlimit &&
		dq->q_blk.count > dq->q_blk.hardlimit);
	if (over) {
		/* An already running timer is left untouched. */
		if (!dq->q_blk.timer)
			dq->q_blk.timer = ktime_get_real_seconds() +
					defq->blk.time;
	} else if (dq->q_blk.timer) {
		dq->q_blk.timer = 0;
	} else {
		dq->q_blk.warnings = 0;
	}

	/* Inodes. */
	over = (dq->q_ino.softlimit &&
		dq->q_ino.count > dq->q_ino.softlimit) ||
	       (dq->q_ino.hardlimit &&
		dq->q_ino.count > dq->q_ino.hardlimit);
	if (over) {
		if (!dq->q_ino.timer)
			dq->q_ino.timer = ktime_get_real_seconds() +
					defq->ino.time;
	} else if (dq->q_ino.timer) {
		dq->q_ino.timer = 0;
	} else {
		dq->q_ino.warnings = 0;
	}

	/* Realtime blocks. */
	over = (dq->q_rtb.softlimit &&
		dq->q_rtb.count > dq->q_rtb.softlimit) ||
	       (dq->q_rtb.hardlimit &&
		dq->q_rtb.count > dq->q_rtb.hardlimit);
	if (over) {
		if (!dq->q_rtb.timer)
			dq->q_rtb.timer = ktime_get_real_seconds() +
					defq->rtb.time;
	} else if (dq->q_rtb.timer) {
		dq->q_rtb.timer = 0;
	} else {
		dq->q_rtb.warnings = 0;
	}
}
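/*
* Initialize a buffer full of dquots, then either log the whole buffer or,
* if this quota type has not been quotachecked yet, mark it as an ordered
* buffer instead.
*/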
STATIC void
xfs_qm_init_dquot_blk(
xfs: use ordered buffers to initialize dquot buffers during quotacheck While QAing the new xfs_repair quotacheck code, I uncovered a quota corruption bug resulting from a bad interaction between dquot buffer initialization and quotacheck. The bug can be reproduced with the following sequence: # mkfs.xfs -f /dev/sdf # mount /dev/sdf /opt -o usrquota # su nobody -s /bin/bash -c 'touch /opt/barf' # sync # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 3 0 0 00 [------] nobody 1 0 0 00 [------] # xfs_io -x -c 'shutdown' /opt # umount /opt # mount /dev/sdf /opt -o usrquota # touch /opt/man2 # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 1 0 0 00 [------] nobody 1 0 0 00 [------] # umount /opt Notice how the initial quotacheck set the root dquot icount to 3 (rootino, rbmino, rsumino), but after shutdown -> remount -> recovery, xfs_quota reports that the root dquot has only 1 icount. We haven't deleted anything from the filesystem, which means that quota is now under-counting. This behavior is not limited to icount or the root dquot, but this is the shortest reproducer. I traced the cause of this discrepancy to the way that we handle ondisk dquot updates during quotacheck vs. regular fs activity. Normally, when we allocate a disk block for a dquot, we log the buffer as a regular (dquot) buffer. Subsequent updates to the dquots backed by that block are done via separate dquot log item updates, which means that they depend on the logged buffer update being written to disk before the dquot items. Because individual dquots have their own LSN fields, that initial dquot buffer must always be recovered. However, the story changes for quotacheck, which can cause dquot block allocations but persists the final dquot counter values via a delwri list. 
Because recovery doesn't gate dquot buffer replay on an LSN, this means that the initial dquot buffer can be replayed over the (newer) contents that were delwritten at the end of quotacheck. In effect, this re-initializes the dquot counters after they've been updated. If the log does not contain any other dquot items to recover, the obsolete dquot contents will not be corrected by log recovery. Because quotacheck uses a transaction to log the setting of the CHKD flags in the superblock, we skip quotacheck during the second mount call, which allows the incorrect icount to remain. Fix this by changing the ondisk dquot initialization function to use ordered buffers to write out fresh dquot blocks if it detects that we're running quotacheck. If the system goes down before quotacheck can complete, the CHKD flags will not be set in the superblock and the next mount will run quotacheck again, which can fix uninitialized dquot buffers. This requires amending the defer code to maintaine ordered buffer state across defer rolls for the sake of the dquot allocation code. For regular operations we preserve the current behavior since the dquot items require properly initialized ondisk dquot records. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-05-14 06:33:27 +08:00
struct xfs_trans *tp,
struct xfs_mount *mp,
xfs_dqid_t id,
uint type,
struct xfs_buf *bp)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
xfs: use ordered buffers to initialize dquot buffers during quotacheck While QAing the new xfs_repair quotacheck code, I uncovered a quota corruption bug resulting from a bad interaction between dquot buffer initialization and quotacheck. The bug can be reproduced with the following sequence: # mkfs.xfs -f /dev/sdf # mount /dev/sdf /opt -o usrquota # su nobody -s /bin/bash -c 'touch /opt/barf' # sync # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 3 0 0 00 [------] nobody 1 0 0 00 [------] # xfs_io -x -c 'shutdown' /opt # umount /opt # mount /dev/sdf /opt -o usrquota # touch /opt/man2 # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 1 0 0 00 [------] nobody 1 0 0 00 [------] # umount /opt Notice how the initial quotacheck set the root dquot icount to 3 (rootino, rbmino, rsumino), but after shutdown -> remount -> recovery, xfs_quota reports that the root dquot has only 1 icount. We haven't deleted anything from the filesystem, which means that quota is now under-counting. This behavior is not limited to icount or the root dquot, but this is the shortest reproducer. I traced the cause of this discrepancy to the way that we handle ondisk dquot updates during quotacheck vs. regular fs activity. Normally, when we allocate a disk block for a dquot, we log the buffer as a regular (dquot) buffer. Subsequent updates to the dquots backed by that block are done via separate dquot log item updates, which means that they depend on the logged buffer update being written to disk before the dquot items. Because individual dquots have their own LSN fields, that initial dquot buffer must always be recovered. However, the story changes for quotacheck, which can cause dquot block allocations but persists the final dquot counter values via a delwri list. 
Because recovery doesn't gate dquot buffer replay on an LSN, this means that the initial dquot buffer can be replayed over the (newer) contents that were delwritten at the end of quotacheck. In effect, this re-initializes the dquot counters after they've been updated. If the log does not contain any other dquot items to recover, the obsolete dquot contents will not be corrected by log recovery. Because quotacheck uses a transaction to log the setting of the CHKD flags in the superblock, we skip quotacheck during the second mount call, which allows the incorrect icount to remain. Fix this by changing the ondisk dquot initialization function to use ordered buffers to write out fresh dquot blocks if it detects that we're running quotacheck. If the system goes down before quotacheck can complete, the CHKD flags will not be set in the superblock and the next mount will run quotacheck again, which can fix uninitialized dquot buffers. This requires amending the defer code to maintaine ordered buffer state across defer rolls for the sake of the dquot allocation code. For regular operations we preserve the current behavior since the dquot items require properly initialized ondisk dquot records. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-05-14 06:33:27 +08:00
struct xfs_dqblk *d;
xfs_dqid_t curid;
unsigned int qflag;
unsigned int blftype;
int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
d = bp->b_addr;
/*
* ID of the first dquot in the block - id's are zero based.
*/
curid = id - (id % q->qi_dqperchunk);
memset(d, 0, BBTOB(q->qi_dqchunklen));
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
}
xfs: use ordered buffers to initialize dquot buffers during quotacheck While QAing the new xfs_repair quotacheck code, I uncovered a quota corruption bug resulting from a bad interaction between dquot buffer initialization and quotacheck. The bug can be reproduced with the following sequence: # mkfs.xfs -f /dev/sdf # mount /dev/sdf /opt -o usrquota # su nobody -s /bin/bash -c 'touch /opt/barf' # sync # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 3 0 0 00 [------] nobody 1 0 0 00 [------] # xfs_io -x -c 'shutdown' /opt # umount /opt # mount /dev/sdf /opt -o usrquota # touch /opt/man2 # xfs_quota -x -c 'report -ahi' /opt User quota on /opt (/dev/sdf) Inodes User ID Used Soft Hard Warn/Grace ---------- --------------------------------- root 1 0 0 00 [------] nobody 1 0 0 00 [------] # umount /opt Notice how the initial quotacheck set the root dquot icount to 3 (rootino, rbmino, rsumino), but after shutdown -> remount -> recovery, xfs_quota reports that the root dquot has only 1 icount. We haven't deleted anything from the filesystem, which means that quota is now under-counting. This behavior is not limited to icount or the root dquot, but this is the shortest reproducer. I traced the cause of this discrepancy to the way that we handle ondisk dquot updates during quotacheck vs. regular fs activity. Normally, when we allocate a disk block for a dquot, we log the buffer as a regular (dquot) buffer. Subsequent updates to the dquots backed by that block are done via separate dquot log item updates, which means that they depend on the logged buffer update being written to disk before the dquot items. Because individual dquots have their own LSN fields, that initial dquot buffer must always be recovered. However, the story changes for quotacheck, which can cause dquot block allocations but persists the final dquot counter values via a delwri list. 
Because recovery doesn't gate dquot buffer replay on an LSN, this means that the initial dquot buffer can be replayed over the (newer) contents that were delwritten at the end of quotacheck. In effect, this re-initializes the dquot counters after they've been updated. If the log does not contain any other dquot items to recover, the obsolete dquot contents will not be corrected by log recovery. Because quotacheck uses a transaction to log the setting of the CHKD flags in the superblock, we skip quotacheck during the second mount call, which allows the incorrect icount to remain. Fix this by changing the ondisk dquot initialization function to use ordered buffers to write out fresh dquot blocks if it detects that we're running quotacheck. If the system goes down before quotacheck can complete, the CHKD flags will not be set in the superblock and the next mount will run quotacheck again, which can fix uninitialized dquot buffers. This requires amending the defer code to maintaine ordered buffer state across defer rolls for the sake of the dquot allocation code. For regular operations we preserve the current behavior since the dquot items require properly initialized ondisk dquot records. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-05-14 06:33:27 +08:00
if (type & XFS_DQ_USER) {
qflag = XFS_UQUOTA_CHKD;
blftype = XFS_BLF_UDQUOT_BUF;
} else if (type & XFS_DQ_PROJ) {
qflag = XFS_PQUOTA_CHKD;
blftype = XFS_BLF_PDQUOT_BUF;
} else {
qflag = XFS_GQUOTA_CHKD;
blftype = XFS_BLF_GDQUOT_BUF;
}
xfs_trans_dquot_buf(tp, bp, blftype);
/*
* quotacheck uses delayed writes to update all the dquots on disk in an
* efficient manner instead of logging the individual dquot changes as
* they are made. However if we log the buffer allocated here and crash
* after quotacheck while the logged initialisation is still in the
* active region of the log, log recovery can replay the dquot buffer
* initialisation over the top of the checked dquots and corrupt quota
* accounting.
*
* To avoid this problem, quotacheck cannot log the initialised buffer.
* We must still dirty the buffer and write it back before the
* allocation transaction clears the log. Therefore, mark the buffer as
* ordered instead of logging it directly. This is safe for quotacheck
* because it detects and repairs allocated but uninitialized dquot blocks
* in the quota inodes.
*/
if (!(mp->m_qflags & qflag))
xfs_trans_ordered_buf(tp, bp);
else
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
/*
 * Derive the speculative preallocation thresholds from the block limits.
 *
 * The high watermark is the block hard limit and the low watermark is the
 * block soft limit.  When no soft limit is set, the low watermark falls back
 * to 95% of the hard limit, computed as (hard / 100) * 95.  The low-space
 * table is filled with 1%, 3% and 5% of the high watermark.
 */
void
xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
{
	uint64_t		one_pct;

	dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;

	/* do_div() divides in place, so work on a copy of the watermark. */
	one_pct = dqp->q_prealloc_hi_wmark;
	do_div(one_pct, 100);

	dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
	if (!dqp->q_prealloc_lo_wmark)
		dqp->q_prealloc_lo_wmark = one_pct * 95;

	dqp->q_low_space[XFS_QLOWSP_1_PCNT] = one_pct;
	dqp->q_low_space[XFS_QLOWSP_3_PCNT] = one_pct * 3;
	dqp->q_low_space[XFS_QLOWSP_5_PCNT] = one_pct * 5;
}
/*
* Ensure that the given in-core dquot has a buffer on disk backing it, and
* return the buffer locked and held. This is called when the bmapi finds a
* hole.
*/
STATIC int
xfs_dquot_disk_alloc(
struct xfs_trans **tpp,
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
struct xfs_trans *tp = *tpp;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *bp;
struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
int nmaps = 1;
int error;
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is desctribed in more detail. To reads the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads git various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
trace_xfs_dqalloc(dqp);
xfs_ilock(quotip, XFS_ILOCK_EXCL);
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we didn't
* have an inode lock
*/
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return -ESRCH;
}
/* Create the block mapping. */
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
xfs: don't set bmapi total block req where minleft is xfs_bmapi_write() takes a total block requirement parameter that is passed down to the block allocation code and is used to specify the total block requirement of the associated transaction. This is used to try and select an AG that can not only satisfy the requested extent allocation, but can also accommodate subsequent allocations that might be required to complete the transaction. For example, additional bmbt block allocations may be required on insertion of the resulting extent to an inode data fork. While it's important for callers to calculate and reserve such extra blocks in the transaction, it is not necessary to pass the total value to xfs_bmapi_write() in all cases. The latter automatically sets minleft to ensure that sufficient free blocks remain after the allocation attempt to expand the format of the associated inode (i.e., such as extent to btree conversion, btree splits, etc). Therefore, any callers that pass a total block requirement of the bmap mapping length plus worst case bmbt expansion essentially specify the additional reservation requirement twice. These callers can pass a total of zero to rely on the bmapi minleft policy. Beyond being superfluous, the primary motivation for this change is that the total reservation logic in the bmbt code is dubious in scenarios where minlen < maxlen and a maxlen extent cannot be allocated (which is more common for data extent allocations where contiguity is not required). The total value is based on maxlen in the xfs_bmapi_write() caller. If the bmbt code falls back to an allocation between minlen and maxlen, that allocation will not succeed until total is reset to minlen, which essentially throws away any additional reservation included in total by the caller. In addition, the total value is not reset until after alignment is dropped, which means that such callers drop alignment far too aggressively than necessary. 
Update all callers of xfs_bmapi_write() that pass a total block value of the mapping length plus bmbt reservation to instead pass zero and rely on xfs_bmapi_minleft() to enforce the bmbt reservation requirement. This trades off slightly less conservative AG selection for the ability to preserve alignment in more scenarios. xfs_bmapi_write() callers that incorporate unrelated or additional reservations in total beyond what is already included in minleft must continue to use the former. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-10-22 00:26:48 +08:00
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
&nmaps);
if (error)
return error;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
/*
* Keep track of the blkno to save a lookup later
*/
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
/* now we can just get the buffer (there's nothing to read yet) */
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
if (error)
return error;
bp->b_ops = &xfs_dquot_buf_ops;
/*
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
xfs_qm_init_dquot_blk(tp, mp, dqp->q_id,
dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
* Hold the buffer and join it to the dfops so that we'll still own
* the buffer when we return to the caller. The buffer disposal on
* error must be paid attention to very carefully, as it has been
* broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota
* code when allocating a new dquot record" in 2005, and the later
* conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep
* the buffer locked across the _defer_finish call. We can now do
* this correctly with xfs_defer_bjoin.
*
* Above, we allocated a disk block for the dquot information and used
* get_buf to initialize the dquot. If the _defer_finish fails, the old
* transaction is gone but the new buffer is not joined or held to any
* transaction, so we must _buf_relse it.
*
* If everything succeeds, the caller of this function is returned a
* buffer that is locked and held to the transaction. The caller
* is responsible for unlocking any buffer passed back, either
* manually or by committing the transaction. On error, the buffer is
* released and not passed back.
*/
xfs_trans_bhold(tp, bp);
error = xfs_defer_finish(tpp);
if (error) {
xfs_trans_bhold_release(*tpp, bp);
xfs_trans_brelse(*tpp, bp);
return error;
}
*bpp = bp;
return 0;
}
/*
* Read in the in-core dquot's on-disk metadata and return the buffer.
* Returns ENOENT to signal a hole.
*/
STATIC int
xfs_dquot_disk_read(
struct xfs_mount *mp,
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
struct xfs_buf *bp;
struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
uint lock_mode;
int nmaps = 1;
int error;
lock_mode = xfs_ilock_data_map_shared(quotip);
if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
*/
xfs_iunlock(quotip, lock_mode);
return -ESRCH;
}
/*
* Find the block map; no allocations yet
*/
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
xfs_iunlock(quotip, lock_mode);
if (error)
return error;
ASSERT(nmaps == 1);
ASSERT(map.br_blockcount >= 1);
ASSERT(map.br_startblock != DELAYSTARTBLOCK);
if (map.br_startblock == HOLESTARTBLOCK)
return -ENOENT;
trace_xfs_dqtobp_read(dqp);
/*
* store the blkno etc so that we don't have to do the
* mapping all the time
*/
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
if (error) {
ASSERT(bp == NULL);
return error;
}
ASSERT(xfs_buf_islocked(bp));
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
*bpp = bp;
return 0;
}
/*
 * Allocate and initialize everything we need for an incore dquot.
 *
 * @mp:   mount the dquot belongs to.
 * @id:   quota id; also determines the dquot's position in the quota file.
 * @type: one of XFS_DQ_USER, XFS_DQ_GROUP or XFS_DQ_PROJ.
 *
 * Returns the new dquot with its lock, LRU list head, pin waitqueue, flush
 * completion and log item initialized.
 *
 * NOTE(review): the allocation result is used without a NULL check;
 * presumably a zero-flag zone allocation cannot fail - confirm.
 */
STATIC struct xfs_dquot *
xfs_dquot_alloc(
	struct xfs_mount	*mp,
	xfs_dqid_t		id,
	uint			type)
{
	struct xfs_dquot	*dqp;

	dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);

	dqp->dq_flags = type;
	dqp->q_id = id;
	dqp->q_mount = mp;
	INIT_LIST_HEAD(&dqp->q_lru);
	mutex_init(&dqp->q_qlock);
	init_waitqueue_head(&dqp->q_pinwait);

	/* Index of the dquot chunk within the quota file. */
	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
	/*
	 * Offset of dquot in the (fixed sized) dquot chunk.
	 */
	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
			sizeof(xfs_dqblk_t);

	/*
	 * Because we want to use a counting completion, complete
	 * the flush completion once to allow a single access to
	 * the flush completion without blocking.
	 */
	init_completion(&dqp->q_flush);
	complete(&dqp->q_flush);

	/*
	 * Make sure group quotas have a different lock class than user
	 * quotas.
	 */
	switch (type) {
	case XFS_DQ_USER:
		/* uses the default lock class */
		break;
	case XFS_DQ_GROUP:
		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
		break;
	case XFS_DQ_PROJ:
		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
		break;
	default:
		ASSERT(0);
		break;
	}

	xfs_qm_dquot_logitem_init(dqp);

	XFS_STATS_INC(mp, xs_qm_dquot);
	return dqp;
}
/* Copy the in-core quota fields in from the on-disk buffer. */
STATIC int
xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
/*
* Ensure that we got the type and ID we were looking for.
* Everything else was checked by the dquot buffer verifier.
*/
if ((ddqp->d_flags & XFS_DQ_ALLTYPES) != dqp->dq_flags ||
be32_to_cpu(ddqp->d_id) != dqp->q_id) {
xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
return -EFSCORRUPTED;
}
/* copy everything from disk dquot to the incore dquot */
dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit);
dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount);
dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer);
dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer);
dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer);
/*
* Reservation counters are defined as reservation plus current usage
* to avoid having to add every time.
*/
dqp->q_blk.reserved = dqp->q_blk.count;
dqp->q_ino.reserved = dqp->q_ino.count;
dqp->q_rtb.reserved = dqp->q_rtb.count;
/* initialize the dquot speculative prealloc thresholds */
xfs_dquot_set_prealloc_limits(dqp);
return 0;
}
/*
 * Serialize the in-core dquot @dqp into the on-disk record @ddqp.
 *
 * The record header (magic, version, type, id, zeroed padding) is written
 * first, followed by the limits, usage count, warning count and timer of
 * each resource, converted to big-endian disk format.
 */
void
xfs_dquot_to_disk(
	struct xfs_disk_dquot	*ddqp,
	struct xfs_dquot	*dqp)
{
	/* record header */
	ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
	ddqp->d_version = XFS_DQUOT_VERSION;
	ddqp->d_flags = dqp->dq_flags & XFS_DQ_ALLTYPES;
	ddqp->d_id = cpu_to_be32(dqp->q_id);
	ddqp->d_pad0 = 0;
	ddqp->d_pad = 0;

	/* data blocks */
	ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit);
	ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit);
	ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count);
	ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
	ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer);

	/* inodes */
	ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit);
	ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit);
	ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
	ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
	ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer);

	/* realtime blocks */
	ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit);
	ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit);
	ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
	ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
	ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer);
}
/*
 * Allocate the on-disk dquot chunk for @dqp inside a transaction of its own.
 *
 * On success the transaction has been committed and *bpp holds the dquot
 * buffer handed back by xfs_dquot_disk_alloc().  On failure no buffer is
 * passed back and the error is returned.
 */
static int
xfs_qm_dqread_alloc(
	struct xfs_mount	*mp,
	struct xfs_dquot	*dqp,
	struct xfs_buf		**bpp)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
			XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
	if (error)
		return error;

	error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	error = xfs_trans_commit(tp);
	if (!error)
		return 0;

	/*
	 * The buffer was held to the transaction, so the failed commit did
	 * not release it.  We are not handing it back, so drop it here.
	 */
	xfs_buf_relse(*bpp);
	*bpp = NULL;
	return error;
}
/*
 * Allocate an incore dquot, read the ondisk dquot into it, and release the
 * buffer immediately.  If @can_alloc is true, fill any holes in the on-disk
 * metadata.
 *
 * @mp:        mount to read the dquot for.
 * @id:        quota id.
 * @type:      quota type, passed through to xfs_dquot_alloc().
 * @can_alloc: allocate the dquot chunk if the lookup reports a hole.
 * @dqpp:      set to the new dquot on success and to NULL on failure.
 *
 * Returns 0 on success.  On failure the incore dquot is destroyed and the
 * error from the read, allocation or type/id validation is returned;
 * -ENOENT is passed through when a hole is found and @can_alloc is false.
 */
static int
xfs_qm_dqread(
	struct xfs_mount	*mp,
	xfs_dqid_t		id,
	uint			type,
	bool			can_alloc,
	struct xfs_dquot	**dqpp)
{
	struct xfs_dquot	*dqp;
	struct xfs_buf		*bp;
	int			error;

	dqp = xfs_dquot_alloc(mp, id, type);
	trace_xfs_dqread(dqp);

	/* Try to read the buffer, allocating if necessary. */
	error = xfs_dquot_disk_read(mp, dqp, &bp);
	if (error == -ENOENT && can_alloc)
		error = xfs_qm_dqread_alloc(mp, dqp, &bp);
	if (error)
		goto err;

	/*
	 * At this point we should have a clean locked buffer.  Copy the data
	 * to the incore dquot and release the buffer since the incore dquot
	 * has its own locking protocol so we needn't tie up the buffer any
	 * further.
	 */
	ASSERT(xfs_buf_islocked(bp));
	error = xfs_dquot_from_disk(dqp, bp);
	xfs_buf_relse(bp);
	if (error)
		goto err;

	*dqpp = dqp;
	return error;

err:
	trace_xfs_dqread_fail(dqp);
	xfs_qm_dqdestroy(dqp);
	*dqpp = NULL;
	return error;
}
/*
 * Advance *id to the next candidate dquot id.
 *
 * If the next id is still inside the current dquot chunk it is simply
 * incremented.  Otherwise the quota inode's data fork extent list is searched
 * for the first mapped extent at or after the next chunk, and *id is set to
 * the first id of that chunk.
 *
 * Returns 0 with *id updated, -ENOENT if the id would wrap or no further
 * extent is mapped, or the error from reading the extent list.  *id is left
 * untouched on error.
 */
static int
xfs_dq_get_next_id(
	struct xfs_mount	*mp,
	uint			type,
	xfs_dqid_t		*id)
{
	struct xfs_inode	*quotip = xfs_quota_inode(mp, type);
	xfs_dqid_t		next_id = *id + 1; /* simple advance */
	uint			lock_flags;
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	cur;
	xfs_fsblock_t		start;
	int			error = 0;

	/* If we'd wrap past the max ID, stop */
	if (next_id < *id)
		return -ENOENT;

	/* If new ID is within the current chunk, advancing it sufficed */
	if (next_id % mp->m_quotainfo->qi_dqperchunk) {
		*id = next_id;
		return 0;
	}

	/* Nope, next_id is now past the current chunk, so find the next one */
	start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;

	lock_flags = xfs_ilock_data_map_shared(quotip);
	if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK);
		/* Must not return with the quota inode ILOCK still held. */
		if (error)
			goto out_unlock;
	}

	if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) {
		/* contiguous chunk, bump startoff for the id calculation */
		if (got.br_startoff < start)
			got.br_startoff = start;
		*id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk;
	} else {
		error = -ENOENT;
	}

out_unlock:
	xfs_iunlock(quotip, lock_flags);

	return error;
}
/*
 * Search the in-core dquot radix tree for @id.
 *
 * On a hit the dquot is returned locked with its reference count raised by
 * one.  A dquot that is flagged as being freed is not returned; the lookup
 * backs off briefly and is repeated.  Returns NULL on a cache miss.
 */
static struct xfs_dquot *
xfs_qm_dqget_cache_lookup(
	struct xfs_mount	*mp,
	struct xfs_quotainfo	*qi,
	struct radix_tree_root	*tree,
	xfs_dqid_t		id)
{
	struct xfs_dquot	*found;

	for (;;) {
		mutex_lock(&qi->qi_tree_lock);
		found = radix_tree_lookup(tree, id);
		if (!found) {
			mutex_unlock(&qi->qi_tree_lock);
			XFS_STATS_INC(mp, xs_qm_dqcachemisses);
			return NULL;
		}

		xfs_dqlock(found);
		if (!(found->q_flags & XFS_DQFLAG_FREEING))
			break;

		/* Being torn down: drop both locks, wait a bit and retry. */
		xfs_dqunlock(found);
		mutex_unlock(&qi->qi_tree_lock);
		trace_xfs_dqget_freeing(found);
		delay(1);
	}

	/* Take the reference before letting go of the tree lock. */
	found->q_nrefs++;
	mutex_unlock(&qi->qi_tree_lock);

	trace_xfs_dqget_hit(found);
	XFS_STATS_INC(mp, xs_qm_dqcachehits);
	return found;
}
/*
 * Add a freshly read dquot to the in-core radix tree.
 *
 * On success the dquot is returned locked, holding its first reference and
 * counted in qi->qi_dquots, just as if the cache lookup had found it.  If the
 * insertion fails the radix tree error is returned and the caller is
 * expected to discard the dquot and start over.
 */
static int
xfs_qm_dqget_cache_insert(
	struct xfs_mount	*mp,
	struct xfs_quotainfo	*qi,
	struct radix_tree_root	*tree,
	xfs_dqid_t		id,
	struct xfs_dquot	*dqp)
{
	int			rc;

	mutex_lock(&qi->qi_tree_lock);
	rc = radix_tree_insert(tree, id, dqp);
	if (unlikely(rc)) {
		/* Only a duplicate entry is an expected failure here. */
		WARN_ON(rc != -EEXIST);
	} else {
		/* Hand back a locked dquot with a single reference. */
		xfs_dqlock(dqp);
		dqp->q_nrefs = 1;
		qi->qi_dquots++;
	}
	mutex_unlock(&qi->qi_tree_lock);

	if (rc)
		trace_xfs_dqget_dup(dqp);
	return rc;
}
/*
 * Check the input parameters of the dqget family of functions.
 *
 * Returns 0 if quotas are running and the requested quota type is on,
 * -ESRCH if quotas are not running (warning once) or the requested type is
 * off, and -EINVAL (warning once) if @type is not a known quota type.
 */
static int
xfs_qm_dqget_checks(
	struct xfs_mount	*mp,
	uint			type)
{
	if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
		return -ESRCH;

	switch (type) {
	case XFS_DQ_USER:
		if (!XFS_IS_UQUOTA_ON(mp))
			return -ESRCH;
		return 0;
	case XFS_DQ_GROUP:
		if (!XFS_IS_GQUOTA_ON(mp))
			return -ESRCH;
		return 0;
	case XFS_DQ_PROJ:
		if (!XFS_IS_PQUOTA_ON(mp))
			return -ESRCH;
		return 0;
	default:
		/*
		 * An unknown quota type is a caller bug.  The condition must
		 * be non-zero for the warning to be emitted at all.
		 */
		WARN_ON_ONCE(1);
		return -EINVAL;
	}
}
/*
 * Given the file system, id, and type, return a locked dquot, doing an
 * allocation (if requested) as needed.
 *
 * The in-core cache is consulted first.  On a miss the dquot is read from
 * disk and inserted into the cache; if the insertion fails, the new dquot
 * is thrown away and the whole sequence is repeated.
 *
 * Returns 0 with *O_dqpp set, or the error from the parameter checks or the
 * disk read.
 */
int
xfs_qm_dqget(
	struct xfs_mount	*mp,
	xfs_dqid_t		id,
	uint			type,
	bool			can_alloc,
	struct xfs_dquot	**O_dqpp)
{
	struct xfs_quotainfo	*qi = mp->m_quotainfo;
	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
	struct xfs_dquot	*dqp;
	int			error;

	error = xfs_qm_dqget_checks(mp, type);
	if (error)
		return error;

	for (;;) {
		dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
		if (dqp) {
			/* Cache hit: already locked and referenced. */
			*O_dqpp = dqp;
			return 0;
		}

		error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
		if (error)
			return error;

		error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
		if (!error)
			break;

		/* Insertion failed: drop our copy and look again. */
		xfs_qm_dqdestroy(dqp);
		XFS_STATS_INC(mp, xs_qm_dquot_dups);
	}

	trace_xfs_dqget_miss(dqp);
	*O_dqpp = dqp;
	return 0;
}
/*
 * Read and initialize a dquot straight from the on-disk metadata, bypassing
 * the dquot cache and never allocating.  This is meant for quota
 * initialization only, on the assumption that the dquot shrinker is not set
 * up yet.  The dquot handed back must be released by the caller with
 * xfs_qm_dqdestroy().
 */
int
xfs_qm_dqget_uncached(
	struct xfs_mount	*mp,
	xfs_dqid_t		id,
	uint			type,
	struct xfs_dquot	**dqpp)
{
	int			error;

	error = xfs_qm_dqget_checks(mp, type);
	if (!error)
		error = xfs_qm_dqread(mp, id, type, false, dqpp);
	return error;
}
/*
 * Map an inode and a quota type to the quota id that applies: the uid for
 * user quotas, the gid for group quotas and the project id for project
 * quotas.  Any other type trips an assertion and yields id 0.
 */
xfs_dqid_t
xfs_qm_id_for_quotatype(
	struct xfs_inode	*ip,
	uint			type)
{
	if (type == XFS_DQ_USER)
		return i_uid_read(VFS_I(ip));
	if (type == XFS_DQ_GROUP)
		return i_gid_read(VFS_I(ip));
	if (type == XFS_DQ_PROJ)
		return ip->i_d.di_projid;

	ASSERT(0);
	return 0;
}
/*
 * Return the dquot for a given inode and type.  If @can_alloc is true, then
 * allocate blocks if needed.  The inode's ILOCK must be held and it must not
 * have already had a dquot of this type attached.
 *
 * The ILOCK is dropped around the disk read on a cache miss and re-taken
 * afterwards, so it is held again on every return path that follows the
 * read.
 *
 * Returns 0 with *O_dqpp set to a locked dquot, -ESRCH if this quota type
 * is (or has been turned) off, or the error from the parameter checks or the
 * disk read.
 */
int
xfs_qm_dqget_inode(
	struct xfs_inode	*ip,
	uint			type,
	bool			can_alloc,
	struct xfs_dquot	**O_dqpp)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_quotainfo	*qi = mp->m_quotainfo;
	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
	struct xfs_dquot	*dqp;
	xfs_dqid_t		id;
	int			error;

	error = xfs_qm_dqget_checks(mp, type);
	if (error)
		return error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(xfs_inode_dquot(ip, type) == NULL);

	id = xfs_qm_id_for_quotatype(ip, type);

restart:
	dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
	if (dqp) {
		*O_dqpp = dqp;
		return 0;
	}

	/*
	 * Dquot cache miss. We don't want to keep the inode lock across
	 * a (potential) disk read. Also we don't want to deal with the lock
	 * ordering between quotainode and this inode. OTOH, dropping the inode
	 * lock here means dealing with a chown that can happen before
	 * we re-acquire the lock.
	 */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	/*
	 * A dquot could be attached to this inode by now, since we had
	 * dropped the ilock.
	 */
	if (xfs_this_quota_on(mp, type)) {
		struct xfs_dquot	*dqp1;

		dqp1 = xfs_inode_dquot(ip, type);
		if (dqp1) {
			/* Use the attached dquot and discard the one we read. */
			xfs_qm_dqdestroy(dqp);
			dqp = dqp1;
			xfs_dqlock(dqp);
			goto dqret;
		}
	} else {
		/* inode stays locked on return */
		xfs_qm_dqdestroy(dqp);
		return -ESRCH;
	}

	error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
	if (error) {
		/*
		 * Duplicate found. Just throw away the new dquot and start
		 * over.
		 */
		xfs_qm_dqdestroy(dqp);
		XFS_STATS_INC(mp, xs_qm_dquot_dups);
		goto restart;
	}

dqret:
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	trace_xfs_dqget_miss(dqp);
	*O_dqpp = dqp;
	return 0;
}
/*
 * Find the first initialized dquot whose id is @id or higher.
 *
 * Candidate ids are tried in ascending order without allocating anything.
 * Ids with no on-disk dquot (-ENOENT) and dquots that are still
 * uninitialized are skipped.  On success the locked dquot is stored in
 * *dqpp and 0 is returned; otherwise *dqpp is NULL and the error that ended
 * the scan is returned.
 */
int
xfs_qm_dqget_next(
	struct xfs_mount	*mp,
	xfs_dqid_t		id,
	uint			type,
	struct xfs_dquot	**dqpp)
{
	struct xfs_dquot	*cand;
	int			error;

	*dqpp = NULL;
	do {
		error = xfs_qm_dqget(mp, id, type, false, &cand);
		if (!error) {
			if (!XFS_IS_DQUOT_UNINITIALIZED(cand)) {
				*dqpp = cand;
				return 0;
			}
			/* Uninitialized: drop it and move on. */
			xfs_qm_dqput(cand);
		} else if (error != -ENOENT) {
			/* A real failure ends the scan. */
			break;
		}

		error = xfs_dq_get_next_id(mp, type, &id);
	} while (!error);

	return error;
}
/*
* Release a reference to the dquot (decrement ref-count) and unlock it.
*
* If there is a group quota attached to this dquot, carefully release that
* too without tripping over deadlocks'n'stuff.
*/
void
xfs_qm_dqput(
struct xfs_dquot *dqp)
{
ASSERT(dqp->q_nrefs > 0);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
trace_xfs_dqput(dqp);
xfs: remove dquot hints group and project quota hints are currently stored on the user dquot. If we are attaching quotas to the inode, then the group and project dquots are stored as hints on the user dquot to save having to look them up again later. The thing is, the hints are not used for that inode for the rest of the life of the inode - the dquots are attached directly to the inode itself - so the only time the hints are used is when an inode first has dquots attached. When the hints on the user dquot don't match the dquots being attache dto the inode, they are then removed and replaced with the new hints. If a user is concurrently modifying files in different group and/or project contexts, then this leads to thrashing of the hints attached to user dquot. If user quotas are not enabled, then hints are never even used. So, if the hints are used to avoid the cost of the lookup, is the cost of the lookup significant enough to justify the hint infrstructure? Maybe it was once, when there was a global quota manager shared between all XFS filesystems and was hash table based. However, lookups are now much simpler, requiring only a single lock and radix tree lookup local to the filesystem and no hash or LRU manipulations to be made. Hence the cost of lookup is much lower than when hints were implemented. Turns out that benchmarks show that, too, with thir being no differnce in performance when doing file creation workloads as a single user with user, group and project quotas enabled - the hints do not make the code go any faster. In fact, removing the hints shows a 2-3% reduction in the time it takes to create 50 million inodes.... So, let's just get rid of the hints and the complexity around them. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-05 15:30:15 +08:00
if (--dqp->q_nrefs == 0) {
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
xfs: remove dquot hints group and project quota hints are currently stored on the user dquot. If we are attaching quotas to the inode, then the group and project dquots are stored as hints on the user dquot to save having to look them up again later. The thing is, the hints are not used for that inode for the rest of the life of the inode - the dquots are attached directly to the inode itself - so the only time the hints are used is when an inode first has dquots attached. When the hints on the user dquot don't match the dquots being attache dto the inode, they are then removed and replaced with the new hints. If a user is concurrently modifying files in different group and/or project contexts, then this leads to thrashing of the hints attached to user dquot. If user quotas are not enabled, then hints are never even used. So, if the hints are used to avoid the cost of the lookup, is the cost of the lookup significant enough to justify the hint infrstructure? Maybe it was once, when there was a global quota manager shared between all XFS filesystems and was hash table based. However, lookups are now much simpler, requiring only a single lock and radix tree lookup local to the filesystem and no hash or LRU manipulations to be made. Hence the cost of lookup is much lower than when hints were implemented. Turns out that benchmarks show that, too, with thir being no differnce in performance when doing file creation workloads as a single user with user, group and project quotas enabled - the hints do not make the code go any faster. In fact, removing the hints shows a 2-3% reduction in the time it takes to create 50 million inodes.... So, let's just get rid of the hints and the complexity around them. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-05 15:30:15 +08:00
}
xfs_dqunlock(dqp);
}
/*
 * Release a reference to an unlocked dquot.  A NULL @dqp is a no-op.
 *
 * The dquot is locked here and then handed to xfs_qm_dqput(), which drops
 * the reference and the lock.  A dirty dquot is deliberately NOT flushed at
 * this point: doing so would introduce stalls on release.  Writeback is left
 * to the code that reclaims dirty dquots.
 */
void
xfs_qm_dqrele(
	struct xfs_dquot	*dqp)
{
	if (dqp) {
		trace_xfs_dqrele(dqp);

		xfs_dqlock(dqp);
		xfs_qm_dqput(dqp);
	}
}
/*
 * Per-dquot flush completion handling, invoked for each dquot log item that
 * was attached to a buffer once that buffer's I/O has finished.
 *
 * If the log item is in the AIL and has not been relogged since the flush
 * started (its lsn still equals the recorded flush lsn), it is removed from
 * the AIL.  A log item marked failed also has that state cleared.  Finally
 * the dquot's flush lock is released.
 */
static void
xfs_qm_dqflush_done(
	struct xfs_log_item	*lip)
{
	struct xfs_dq_logitem	*qlip = (struct xfs_dq_logitem *)lip;
	struct xfs_ail		*ailp = lip->li_ailp;
	struct xfs_dquot	*dqp = qlip->qli_dquot;

	/*
	 * Cheap unlocked test first: only take the AIL lock when the item
	 * looks like it needs work.  The lsn comparison is repeated under the
	 * lock before anything is removed from the AIL.
	 */
	if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
	    (lip->li_lsn == qlip->qli_flush_lsn ||
	     test_bit(XFS_LI_FAILED, &lip->li_flags))) {

		spin_lock(&ailp->ail_lock);
		xfs_clear_li_failed(lip);

		if (lip->li_lsn != qlip->qli_flush_lsn) {
			/* Relogged while we were flushing; leave it alone. */
			spin_unlock(&ailp->ail_lock);
		} else {
			xfs_lsn_t	tail_lsn = xfs_ail_delete_one(ailp, lip);

			/* xfs_ail_update_finish() drops the AIL lock */
			xfs_ail_update_finish(ailp, tail_lsn);
		}
	}

	/* The flush is over: let go of the dquot's flush lock. */
	xfs_dqfunlock(dqp);
}
/*
 * Run flush completion for every dquot log item attached to @bp.
 *
 * Each item is unhooked from the buffer's log item list before its
 * completion handler runs, so the "safe" iterator is used to keep the walk
 * valid while entries are being removed.
 */
void
xfs_dquot_done(
	struct xfs_buf		*bp)
{
	struct xfs_log_item	*item;
	struct xfs_log_item	*next;

	list_for_each_entry_safe(item, next, &bp->b_li_list, li_bio_list) {
		list_del_init(&item->li_bio_list);
		xfs_qm_dqflush_done(item);
	}
}
/*
 * Sanity-check an incore dquot before it is flushed.
 *
 * Returns NULL if the dquot looks fine, otherwise the address of the failing
 * check (for use in diagnostics).  The type bits must name exactly one of
 * user, group or project.  The dquot with id 0 is exempt from the remaining
 * checks.  For every other dquot, a resource that has a soft limit and whose
 * count exceeds it must have a timer set.
 */
static xfs_failaddr_t
xfs_qm_dqflush_check(
	struct xfs_dquot	*dqp)
{
	__u8			type = dqp->dq_flags & XFS_DQ_ALLTYPES;

	switch (type) {
	case XFS_DQ_USER:
	case XFS_DQ_GROUP:
	case XFS_DQ_PROJ:
		break;
	default:
		return __this_address;
	}

	if (dqp->q_id == 0)
		return NULL;

	/* Blocks over the soft limit without a running timer. */
	if (dqp->q_blk.softlimit &&
	    dqp->q_blk.count > dqp->q_blk.softlimit &&
	    !dqp->q_blk.timer)
		return __this_address;

	/* Inodes over the soft limit without a running timer. */
	if (dqp->q_ino.softlimit &&
	    dqp->q_ino.count > dqp->q_ino.softlimit &&
	    !dqp->q_ino.timer)
		return __this_address;

	/* Realtime blocks over the soft limit without a running timer. */
	if (dqp->q_rtb.softlimit &&
	    dqp->q_rtb.count > dqp->q_rtb.softlimit &&
	    !dqp->q_rtb.timer)
		return __this_address;

	return NULL;
}
/*
* Write a modified dquot to disk.
* The dquot must be locked and the flush lock too taken by caller.
* The flush lock will not be unlocked until the dquot reaches the disk,
* but the dquot is free to be unlocked and modified by the caller
* in the interim. Dquot is still locked on return. This behavior is
* identical to that of inodes.
*/
int
xfs_qm_dqflush(
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
struct xfs_buf *bp;
struct xfs_dqblk *dqblk;
xfs_failaddr_t fa;
int error;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(!completion_done(&dqp->q_flush));
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is desctribed in more detail. To reads the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads git various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
trace_xfs_dqflush(dqp);
*bpp = NULL;
xfs_qm_dqunpin_wait(dqp);
/*
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
&bp, &xfs_dquot_buf_ops);
if (error == -EAGAIN)
goto out_unlock;
if (error)
goto out_abort;
fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
dqp->q_id, fa);
xfs_buf_relse(bp);
error = -EFSCORRUPTED;
goto out_abort;
}
/* Flush the incore dquot to the ondisk buffer. */
dqblk = bp->b_addr + dqp->q_bufoffset;
xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
* Clear the dirty field and remember the flush lsn for later use.
*/
dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
&dqp->q_logitem.qli_item.li_lsn);
/*
* copy the lsn into the on-disk dquot now while we have the in memory
* dquot here. This can't be done later in the write verifier as we
* can't get access to the log item at that point in time.
*
* We also calculate the CRC here so that the on-disk dquot in the
* buffer always has a valid CRC. This ensures there is no possibility
* of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
/*
* Attach the dquot to the buffer so that we can remove this dquot from
* the AIL and release the flush lock once the dquot is synced to disk.
*/
bp->b_flags |= _XBF_DQUOTS;
list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list);
/*
* If the buffer is pinned then push on the log so we won't
* get stuck waiting in the write for too long.
*/
if (xfs_buf_ispinned(bp)) {
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is desctribed in more detail. To reads the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads git various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
trace_xfs_dqflush_force(dqp);
xfs_log_force(mp, 0);
}
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is desctribed in more detail. To reads the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads git various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
trace_xfs_dqflush_done(dqp);
*bpp = bp;
return 0;
xfs: event tracing support Convert the old xfs tracing support that could only be used with the out of tree kdb and xfsidbg patches to use the generic event tracer. To use it make sure CONFIG_EVENT_TRACING is enabled and then enable all xfs trace channels by: echo 1 > /sys/kernel/debug/tracing/events/xfs/enable or alternatively enable single events by just doing the same in one event subdirectory, e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_ihold/enable or set more complex filters, etc. In Documentation/trace/events.txt all this is desctribed in more detail. To reads the events do a cat /sys/kernel/debug/tracing/trace Compared to the last posting this patch converts the tracing mostly to the one tracepoint per callsite model that other users of the new tracing facility also employ. This allows a very fine-grained control of the tracing, a cleaner output of the traces and also enables the perf tool to use each tracepoint as a virtual performance counter, allowing us to e.g. count how often certain workloads git various spots in XFS. Take a look at http://lwn.net/Articles/346470/ for some examples. Also the btree tracing isn't included at all yet, as it will require additional core tracing features not in mainline yet, I plan to deliver it later. 
And the really nice thing about this patch is that it actually removes many lines of code while adding this nice functionality: fs/xfs/Makefile | 8 fs/xfs/linux-2.6/xfs_acl.c | 1 fs/xfs/linux-2.6/xfs_aops.c | 52 - fs/xfs/linux-2.6/xfs_aops.h | 2 fs/xfs/linux-2.6/xfs_buf.c | 117 +-- fs/xfs/linux-2.6/xfs_buf.h | 33 fs/xfs/linux-2.6/xfs_fs_subr.c | 3 fs/xfs/linux-2.6/xfs_ioctl.c | 1 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 fs/xfs/linux-2.6/xfs_iops.c | 1 fs/xfs/linux-2.6/xfs_linux.h | 1 fs/xfs/linux-2.6/xfs_lrw.c | 87 -- fs/xfs/linux-2.6/xfs_lrw.h | 45 - fs/xfs/linux-2.6/xfs_super.c | 104 --- fs/xfs/linux-2.6/xfs_super.h | 7 fs/xfs/linux-2.6/xfs_sync.c | 1 fs/xfs/linux-2.6/xfs_trace.c | 75 ++ fs/xfs/linux-2.6/xfs_trace.h | 1369 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_vnode.h | 4 fs/xfs/quota/xfs_dquot.c | 110 --- fs/xfs/quota/xfs_dquot.h | 21 fs/xfs/quota/xfs_qm.c | 40 - fs/xfs/quota/xfs_qm_syscalls.c | 4 fs/xfs/support/ktrace.c | 323 --------- fs/xfs/support/ktrace.h | 85 -- fs/xfs/xfs.h | 16 fs/xfs/xfs_ag.h | 14 fs/xfs/xfs_alloc.c | 230 +----- fs/xfs/xfs_alloc.h | 27 fs/xfs/xfs_alloc_btree.c | 1 fs/xfs/xfs_attr.c | 107 --- fs/xfs/xfs_attr.h | 10 fs/xfs/xfs_attr_leaf.c | 14 fs/xfs/xfs_attr_sf.h | 40 - fs/xfs/xfs_bmap.c | 507 +++------------ fs/xfs/xfs_bmap.h | 49 - fs/xfs/xfs_bmap_btree.c | 6 fs/xfs/xfs_btree.c | 5 fs/xfs/xfs_btree_trace.h | 17 fs/xfs/xfs_buf_item.c | 87 -- fs/xfs/xfs_buf_item.h | 20 fs/xfs/xfs_da_btree.c | 3 fs/xfs/xfs_da_btree.h | 7 fs/xfs/xfs_dfrag.c | 2 fs/xfs/xfs_dir2.c | 8 fs/xfs/xfs_dir2_block.c | 20 fs/xfs/xfs_dir2_leaf.c | 21 fs/xfs/xfs_dir2_node.c | 27 fs/xfs/xfs_dir2_sf.c | 26 fs/xfs/xfs_dir2_trace.c | 216 ------ fs/xfs/xfs_dir2_trace.h | 72 -- fs/xfs/xfs_filestream.c | 8 fs/xfs/xfs_fsops.c | 2 fs/xfs/xfs_iget.c | 111 --- fs/xfs/xfs_inode.c | 67 -- fs/xfs/xfs_inode.h | 76 -- fs/xfs/xfs_inode_item.c | 5 fs/xfs/xfs_iomap.c | 85 -- fs/xfs/xfs_iomap.h | 8 fs/xfs/xfs_log.c | 181 +---- fs/xfs/xfs_log_priv.h | 20 
fs/xfs/xfs_log_recover.c | 1 fs/xfs/xfs_mount.c | 2 fs/xfs/xfs_quota.h | 8 fs/xfs/xfs_rename.c | 1 fs/xfs/xfs_rtalloc.c | 1 fs/xfs/xfs_rw.c | 3 fs/xfs/xfs_trans.h | 47 + fs/xfs/xfs_trans_buf.c | 62 - fs/xfs/xfs_vnodeops.c | 8 70 files changed, 2151 insertions(+), 2592 deletions(-) Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2009-12-15 07:14:59 +08:00
out_abort:
dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
xfs_trans_ail_delete(lip, 0);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
out_unlock:
xfs_dqfunlock(dqp);
return error;
}
/*
 * Lock up to two dquots.
 *
 * Either pointer may be NULL, in which case only the other dquot (if any) is
 * locked.  When both are given they must be distinct, and to avoid deadlocks
 * the one with the lower id is always locked first; the second lock is taken
 * with the XFS_QLOCK_NESTED subclass.  If the ids are equal, @d1 is locked
 * first.
 */
void
xfs_dqlock2(
	struct xfs_dquot	*d1,
	struct xfs_dquot	*d2)
{
	struct xfs_dquot	*first = d1;
	struct xfs_dquot	*second = d2;

	if (!d1 || !d2) {
		/* Zero or one dquot: no ordering to worry about. */
		struct xfs_dquot	*only = d1 ? d1 : d2;

		if (only)
			mutex_lock(&only->q_qlock);
		return;
	}

	ASSERT(d1 != d2);

	if (d1->q_id > d2->q_id) {
		first = d2;
		second = d1;
	}

	mutex_lock(&first->q_qlock);
	mutex_lock_nested(&second->q_qlock, XFS_QLOCK_NESTED);
}
/*
 * Create the slab caches used for dquots ("xfs_dquot") and for
 * struct xfs_dquot_acct objects ("xfs_dqtrx").
 *
 * Returns 0 on success or -ENOMEM if either cache cannot be created; on
 * failure no cache is left behind.
 */
int __init
xfs_qm_init(void)
{
	xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
					  sizeof(struct xfs_dquot),
					  0, 0, NULL);
	if (!xfs_qm_dqzone)
		return -ENOMEM;

	xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
					     sizeof(struct xfs_dquot_acct),
					     0, 0, NULL);
	if (!xfs_qm_dqtrxzone) {
		/* Undo the first allocation before bailing out. */
		kmem_cache_destroy(xfs_qm_dqzone);
		return -ENOMEM;
	}

	return 0;
}
/*
 * Destroy the two slab caches that xfs_qm_init() creates, in the reverse
 * order of their creation.
 */
void
xfs_qm_exit(void)
{
kmem_cache_destroy(xfs_qm_dqtrxzone);
kmem_cache_destroy(xfs_qm_dqzone);
}
/*
 * Iterate every dquot of a particular type.  The caller must ensure that the
 * particular quota type is active.
 *
 * @iter_fn is called once for each initialized dquot, in increasing id
 * order, with the dquot as returned by xfs_qm_dqget_next().  It can return a
 * negative error code, or -ECANCELED to indicate that it wants to stop
 * iterating; any nonzero return ends the walk and is passed back to the
 * caller.
 *
 * Returns 0 once there are no more dquots to visit, otherwise the first
 * error from the lookup or from @iter_fn.
 */
int
xfs_qm_dqiterate(
	struct xfs_mount	*mp,
	uint			dqtype,
	xfs_qm_dqiterate_fn	iter_fn,
	void			*priv)
{
	struct xfs_dquot	*dq;
	xfs_dqid_t		id = 0;
	int			error;

	do {
		error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
		if (error == -ENOENT)
			return 0;
		if (error)
			return error;

		error = iter_fn(dq, dqtype, priv);

		/*
		 * Resume the search just past the dquot we visited.
		 * xfs_qm_dqget_next() searches from @id upwards inclusive, so
		 * without the increment the walk would either stop right
		 * after dquot 0 or keep returning the same dquot forever.
		 * Sample q_id before the reference is dropped.
		 */
		id = dq->q_id + 1;
		xfs_qm_dqput(dq);

		/* id wrapping back to zero means the id space is exhausted. */
	} while (error == 0 && id != 0);

	return error;
}