2018-06-06 10:42:14 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-11-02 11:58:39 +08:00
|
|
|
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#ifndef __XFS_INODE_ITEM_H__
|
|
|
|
#define __XFS_INODE_ITEM_H__
|
|
|
|
|
2013-08-12 18:49:23 +08:00
|
|
|
/* kernel only definitions */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct xfs_buf;
|
2009-12-04 18:19:07 +08:00
|
|
|
struct xfs_bmbt_rec;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct xfs_inode;
|
|
|
|
struct xfs_mount;
|
|
|
|
|
2020-05-01 03:52:19 +08:00
|
|
|
struct xfs_inode_log_item {
|
2019-06-29 10:27:33 +08:00
|
|
|
struct xfs_log_item ili_item; /* common portion */
|
2005-04-17 06:20:36 +08:00
|
|
|
struct xfs_inode *ili_inode; /* inode ptr */
|
2020-06-30 05:48:46 +08:00
|
|
|
unsigned short ili_lock_flags; /* inode lock flags */
|
|
|
|
/*
|
|
|
|
* The ili_lock protects the interactions between the dirty state and
|
|
|
|
* the flush state of the inode log item. This allows us to do atomic
|
|
|
|
* modifications of multiple state fields without having to hold a
|
|
|
|
* specific inode lock to serialise them.
|
|
|
|
*
|
|
|
|
* We need atomic changes between inode dirtying, inode flushing and
|
|
|
|
* inode completion, but these all hold different combinations of
|
|
|
|
* ILOCK and iflock and hence we need some other method of serialising
|
|
|
|
* updates to the flush state.
|
|
|
|
*/
|
|
|
|
spinlock_t ili_lock; /* flush state lock */
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned int ili_last_fields; /* fields when flushed */
|
2012-02-29 17:53:54 +08:00
|
|
|
unsigned int ili_fields; /* fields to be logged */
|
xfs: optimise away log forces on timestamp updates for fdatasync
xfs: timestamp updates cause excessive fdatasync log traffic
Sage Weil reported that a ceph test workload was writing to the
log on every fdatasync during an overwrite workload. Event tracing
showed that the only metadata modification being made was the
timestamp updates during the write(2) syscall, but fdatasync(2)
is supposed to ignore them. The key observation was that the
transactions in the log all looked like this:
INODE: #regs: 4 ino: 0x8b flags: 0x45 dsize: 32
And contained a flags field of 0x45 or 0x85, and had data and
attribute forks following the inode core. This means that the
timestamp updates were triggering dirty relogging of previously
logged parts of the inode that hadn't yet been flushed back to
disk.
There are two parts to this problem. The first is that XFS relogs
dirty regions in subsequent transactions, so it carries around the
fields that have been dirtied since the last time the inode was
written back to disk, not since the last time the inode was forced
into the log.
The second part is that on v5 filesystems, the inode change count
update during inode dirtying also sets the XFS_ILOG_CORE flag, so
on v5 filesystems this makes a timestamp update dirty the entire
inode.
As a result when fdatasync is run, it looks at the dirty fields in
the inode, and sees more than just the timestamp flag, even though
the only metadata change since the last fdatasync was just the
timestamps. Hence we force the log on every subsequent fdatasync
even though it is not needed.
To fix this, add a new field to the inode log item that tracks
changes since the last time fsync/fdatasync forced the log to flush
the changes to the journal. This flag is updated when we dirty the
inode, but we do it before updating the change count so it does not
carry the "core dirty" flag from timestamp updates. The fields are
zeroed when the inode is marked clean (due to writeback/freeing) or
when an fsync/datasync forces the log. Hence if we only dirty the
timestamps on the inode between fsync/fdatasync calls, the fdatasync
will not trigger another log force.
Over 100 runs of the test program:
Ext4 baseline:
runtime: 1.63s +/- 0.24s
avg lat: 1.59ms +/- 0.24ms
iops: ~2000
XFS, vanilla kernel:
runtime: 2.45s +/- 0.18s
avg lat: 2.39ms +/- 0.18ms
log forces: ~400/s
iops: ~1000
XFS, patched kernel:
runtime: 1.49s +/- 0.26s
avg lat: 1.46ms +/- 0.25ms
log forces: ~30/s
iops: ~1500
Reported-by: Sage Weil <sage@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-11-03 10:14:59 +08:00
|
|
|
unsigned int ili_fsync_fields; /* logged since last fsync */
|
2020-06-30 05:48:46 +08:00
|
|
|
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
|
|
|
|
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
|
2020-05-01 03:52:19 +08:00
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2020-06-30 05:48:48 +08:00
|
|
|
static inline int xfs_inode_clean(struct xfs_inode *ip)
|
2008-03-06 10:43:59 +08:00
|
|
|
{
|
2012-02-29 17:53:54 +08:00
|
|
|
return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
|
2008-03-06 10:43:59 +08:00
|
|
|
}
|
|
|
|
|
2005-11-02 11:38:42 +08:00
|
|
|
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
|
|
|
|
extern void xfs_inode_item_destroy(struct xfs_inode *);
|
2020-06-30 05:48:48 +08:00
|
|
|
extern void xfs_iflush_done(struct xfs_buf *);
|
2020-05-07 04:27:40 +08:00
|
|
|
extern void xfs_iflush_abort(struct xfs_inode *);
|
2006-06-09 12:55:38 +08:00
|
|
|
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
|
2017-11-01 03:04:24 +08:00
|
|
|
struct xfs_inode_log_format *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-08-12 18:49:23 +08:00
|
|
|
/*
 * Defined elsewhere; declared here for users of this header.
 * NOTE(review): presumably the allocation zone backing
 * struct xfs_inode_log_item -- confirm against the definition.
 */
extern struct kmem_zone *xfs_ili_zone;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#endif /* __XFS_INODE_ITEM_H__ */
|