fs: kill i_alloc_sem

i_alloc_sem is a rather special rw_semaphore.  It's the last one that may
be released by a non-owner, and its write side is always mirrored by
real exclusion.  Its intended use is to wait for all pending direct I/O
requests to finish before starting a truncate.

Replace it with a hand-grown construct:

 - exclusion for truncates is already guaranteed by i_mutex, so it can
   simply fall away
 - the reader side is replaced by an i_dio_count member in struct inode
   that counts the number of pending direct I/O requests.  Truncate can't
   proceed as long as it's non-zero
 - when i_dio_count reaches zero we wake up a pending truncate using
   wake_up_bit on a new bit in i_state
 - new references to i_dio_count can't appear while we are waiting for
   it to read zero because the direct I/O count always needs i_mutex
   (or an equivalent like XFS's i_iolock) for starting a new operation.

This scheme is much simpler, and saves the space of a spinlock_t and a
struct list_head in struct inode (typically 160 bits on a non-debug 64-bit
system).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
Christoph Hellwig 2011-06-24 14:29:43 -04:00 committed by Al Viro
parent f9b5570d7f
commit bd5fe6c5eb
13 changed files with 78 additions and 53 deletions

View File

@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
return error; return error;
if (ia_valid & ATTR_SIZE) if (ia_valid & ATTR_SIZE)
down_write(&dentry->d_inode->i_alloc_sem); inode_dio_wait(inode);
if (inode->i_op->setattr) if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr); error = inode->i_op->setattr(dentry, attr);
else else
error = simple_setattr(dentry, attr); error = simple_setattr(dentry, attr);
if (ia_valid & ATTR_SIZE)
up_write(&dentry->d_inode->i_alloc_sem);
if (!error) if (!error)
fsnotify_change(dentry, ia_valid); fsnotify_change(dentry, ia_valid);

View File

@ -135,6 +135,50 @@ struct dio {
struct page *pages[DIO_PAGES]; /* page buffer */ struct page *pages[DIO_PAGES]; /* page buffer */
}; };
/*
 * __inode_dio_wait - slow path: sleep until inode->i_dio_count reads zero
 *
 * Sleeps uninterruptibly on the bit waitqueue keyed by
 * (&inode->i_state, __I_DIO_WAKEUP).  inode_dio_done() passes the same
 * key to wake_up_bit() when it drops the count to zero.
 */
static void __inode_dio_wait(struct inode *inode)
{
wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
do {
/*
 * Get on the waitqueue before testing the count, so that a
 * wake_up_bit() issued between the test and schedule() is not
 * missed.
 */
prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&inode->i_dio_count))
schedule();
/* Re-test after every wakeup; only a zero count ends the wait. */
} while (atomic_read(&inode->i_dio_count));
finish_wait(wq, &q.wait);
}
/**
 * inode_dio_wait - block until no direct I/O is in flight on an inode
 * @inode: inode whose pending direct I/O requests are waited for
 *
 * Returns once inode->i_dio_count reads zero, so that a truncate or an
 * equivalent operation can go ahead.
 *
 * The caller must hold a lock that serializes taking new references
 * to i_dio_count, usually inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	/* Fast path: nothing outstanding, so there is nothing to sleep on. */
	if (!atomic_read(&inode->i_dio_count))
		return;

	__inode_dio_wait(inode);
}
EXPORT_SYMBOL_GPL(inode_dio_wait);
/**
 * inode_dio_done - note the completion of one direct I/O request
 * @inode: inode the direct I/O happened on
 *
 * Drops one reference on inode->i_dio_count.  If that was the last
 * outstanding request, wakes whoever sleeps on the
 * (&inode->i_state, __I_DIO_WAKEUP) bit waitqueue, i.e. callers of
 * inode_dio_wait() waiting for direct I/O to be quiesced.
 */
void inode_dio_done(struct inode *inode)
{
	/* Other requests are still pending: no one to wake yet. */
	if (!atomic_dec_and_test(&inode->i_dio_count))
		return;

	wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
EXPORT_SYMBOL_GPL(inode_dio_done);
/* /*
* How many pages are in the queue? * How many pages are in the queue?
*/ */
@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
} }
if (dio->flags & DIO_LOCKING) if (dio->flags & DIO_LOCKING)
/* lockdep: non-owner release */ inode_dio_done(dio->inode);
up_read_non_owner(&dio->inode->i_alloc_sem);
return ret; return ret;
} }
@ -980,9 +1022,6 @@ static int do_direct_IO(struct dio *dio)
return ret; return ret;
} }
/*
* Releases both i_mutex and i_alloc_sem
*/
static ssize_t static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs, const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
* For writes this function is called under i_mutex and returns with * For writes this function is called under i_mutex and returns with
* i_mutex held, for reads, i_mutex is not held on entry, but it is * i_mutex held, for reads, i_mutex is not held on entry, but it is
* taken and dropped again before returning. * taken and dropped again before returning.
* For reads and writes i_alloc_sem is taken in shared mode and released * The i_dio_count counter keeps track of the number of outstanding
* on I/O completion (which may happen asynchronously after returning to * direct I/O requests, and truncate waits for it to reach zero.
* the caller). * New references to i_dio_count must only be grabbed with i_mutex
* held.
* *
* - if the flags value does NOT contain DIO_LOCKING we don't use any * - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize * internal locking but rather rely on the filesystem to synchronize
* direct I/O reads/writes versus each other and truncate. * direct I/O reads/writes versus each other and truncate.
* For reads and writes both i_mutex and i_alloc_sem are not held on
* entry and are never taken.
*/ */
ssize_t ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
} }
/* /*
* Will be released at I/O completion, possibly in a * Will be decremented at I/O completion time.
* different thread.
*/ */
down_read_non_owner(&inode->i_alloc_sem); atomic_inc(&inode->i_dio_count);
} }
/* /*

View File

@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mutex_init(&inode->i_mutex); mutex_init(&inode->i_mutex);
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
init_rwsem(&inode->i_alloc_sem); atomic_set(&inode->i_dio_count, 0);
lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
mapping->a_ops = &empty_aops; mapping->a_ops = &empty_aops;
mapping->host = inode; mapping->host = inode;

View File

@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* fails again. * fails again.
*/ */
if (unlikely(NInoTruncateFailed(ni))) { if (unlikely(NInoTruncateFailed(ni))) {
down_write(&vi->i_alloc_sem); inode_dio_wait(vi);
err = ntfs_truncate(vi); err = ntfs_truncate(vi);
up_write(&vi->i_alloc_sem);
if (err || NInoTruncateFailed(ni)) { if (err || NInoTruncateFailed(ni)) {
if (!err) if (!err)
err = -EIO; err = -EIO;

View File

@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run "
* *
* Returns 0 on success or -errno on error. * Returns 0 on success or -errno on error.
* *
* Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for * Called with ->i_mutex held.
* writing. The only case in the kernel where ->i_alloc_sem is not held is
* mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
* with the current i_size as the offset. The analogous place in NTFS is in
* fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
* without holding ->i_alloc_sem.
*/ */
int ntfs_truncate(struct inode *vi) int ntfs_truncate(struct inode *vi)
{ {
@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
* We also abort all changes of user, group, and mode as we do not implement * We also abort all changes of user, group, and mode as we do not implement
* the NTFS ACLs yet. * the NTFS ACLs yet.
* *
* Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also * Called with ->i_mutex held.
* called with ->i_alloc_sem held for writing.
*/ */
int ntfs_setattr(struct dentry *dentry, struct iattr *attr) int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
{ {

View File

@ -551,9 +551,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* /*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
* particularly interested in the aio/dio case. Like the core uses * particularly interested in the aio/dio case. We use the rw_lock DLM lock
* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from * to protect io on one node from truncation on another.
* truncation on another.
*/ */
static void ocfs2_dio_end_io(struct kiocb *iocb, static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset, loff_t offset,
@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
if (ocfs2_iocb_is_sem_locked(iocb)) { if (ocfs2_iocb_is_sem_locked(iocb)) {
up_read(&inode->i_alloc_sem); inode_dio_done(inode);
ocfs2_iocb_clear_sem_locked(iocb); ocfs2_iocb_clear_sem_locked(iocb);
} }

View File

@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
ocfs2_iocb_clear_sem_locked(iocb); ocfs2_iocb_clear_sem_locked(iocb);
relock: relock:
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ /* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) { if (direct_io) {
down_read(&inode->i_alloc_sem); atomic_inc(&inode->i_dio_count);
have_alloc_sem = 1; have_alloc_sem = 1;
/* communicate with ocfs2_dio_end_io */ /* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_sem_locked(iocb); ocfs2_iocb_set_sem_locked(iocb);
@ -2290,7 +2290,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
*/ */
if (direct_io && !can_do_direct) { if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level); ocfs2_rw_unlock(inode, rw_level);
up_read(&inode->i_alloc_sem); inode_dio_done(inode);
have_alloc_sem = 0; have_alloc_sem = 0;
rw_level = -1; rw_level = -1;
@ -2361,8 +2361,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
/* /*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that * function pointer which is called when o_direct io completes so that
* it can unlock our rw lock. (it's the clustered equivalent of * it can unlock our rw lock.
* i_alloc_sem; protects truncate from racing with pending ios).
* Unfortunately there are error cases which call end_io and others * Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an * that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an * async dio is going to do it in the future or an end_io after an
@ -2379,7 +2378,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
out_sems: out_sems:
if (have_alloc_sem) { if (have_alloc_sem) {
up_read(&inode->i_alloc_sem); inode_dio_done(inode);
ocfs2_iocb_clear_sem_locked(iocb); ocfs2_iocb_clear_sem_locked(iocb);
} }
@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* need locks to protect pending reads from racing with truncate. * need locks to protect pending reads from racing with truncate.
*/ */
if (filp->f_flags & O_DIRECT) { if (filp->f_flags & O_DIRECT) {
down_read(&inode->i_alloc_sem);
have_alloc_sem = 1; have_alloc_sem = 1;
atomic_inc(&inode->i_dio_count);
ocfs2_iocb_set_sem_locked(iocb); ocfs2_iocb_set_sem_locked(iocb);
ret = ocfs2_rw_lock(inode, 0); ret = ocfs2_rw_lock(inode, 0);
@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
bail: bail:
if (have_alloc_sem) { if (have_alloc_sem) {
up_read(&inode->i_alloc_sem); inode_dio_done(inode);
ocfs2_iocb_clear_sem_locked(iocb); ocfs2_iocb_clear_sem_locked(iocb);
} }
if (rw_level != -1) if (rw_level != -1)

View File

@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
reiserfs_write_unlock(inode->i_sb); reiserfs_write_unlock(inode->i_sb);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
down_write(&dentry->d_inode->i_alloc_sem); inode_dio_wait(dentry->d_inode);
reiserfs_write_lock(inode->i_sb); reiserfs_write_lock(inode->i_sb);
err = reiserfs_setattr(dentry, &newattrs); err = reiserfs_setattr(dentry, &newattrs);
up_write(&dentry->d_inode->i_alloc_sem);
mutex_unlock(&dentry->d_inode->i_mutex); mutex_unlock(&dentry->d_inode->i_mutex);
} else } else
update_ctime(inode); update_ctime(inode);

View File

@ -779,7 +779,7 @@ struct inode {
struct timespec i_ctime; struct timespec i_ctime;
blkcnt_t i_blocks; blkcnt_t i_blocks;
unsigned short i_bytes; unsigned short i_bytes;
struct rw_semaphore i_alloc_sem; atomic_t i_dio_count;
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock *i_flock; struct file_lock *i_flock;
struct address_space *i_mapping; struct address_space *i_mapping;
@ -1705,6 +1705,10 @@ struct super_operations {
* set during data writeback, and cleared with a wakeup * set during data writeback, and cleared with a wakeup
* on the bit address once it is done. * on the bit address once it is done.
* *
* I_REFERENCED Marks the inode as recently referenced on the LRU list.
*
* I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
*
* Q: What is the difference between I_WILL_FREE and I_FREEING? * Q: What is the difference between I_WILL_FREE and I_FREEING?
*/ */
#define I_DIRTY_SYNC (1 << 0) #define I_DIRTY_SYNC (1 << 0)
@ -1718,6 +1722,8 @@ struct super_operations {
#define __I_SYNC 7 #define __I_SYNC 7
#define I_SYNC (1 << __I_SYNC) #define I_SYNC (1 << __I_SYNC)
#define I_REFERENCED (1 << 8) #define I_REFERENCED (1 << 8)
/*
 * Bit number and mask of the direct I/O wakeup key in inode->i_state.
 * The bit is never set; it is only used as the key for the bit waitqueue.
 * The mask must shift by the bit number __I_DIO_WAKEUP: a macro naming
 * itself in its own body is not re-expanded and fails to compile on use.
 */
#define __I_DIO_WAKEUP		9
#define I_DIO_WAKEUP		(1 << __I_DIO_WAKEUP)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
@ -1828,7 +1834,6 @@ struct file_system_type {
struct lock_class_key i_lock_key; struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key; struct lock_class_key i_mutex_dir_key;
struct lock_class_key i_alloc_sem_key;
}; };
extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
@ -2404,6 +2409,8 @@ enum {
}; };
void dio_end_io(struct bio *bio, int error); void dio_end_io(struct bio *bio, int error);
void inode_dio_wait(struct inode *inode);
void inode_dio_done(struct inode *inode);
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset, struct block_device *bdev, const struct iovec *iov, loff_t offset,

View File

@ -78,9 +78,6 @@
* ->i_mutex (generic_file_buffered_write) * ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault) * ->mmap_sem (fault_in_pages_readable->do_page_fault)
* *
* ->i_mutex
* ->i_alloc_sem (various)
*
* inode_wb_list_lock * inode_wb_list_lock
* sb_lock (fs/fs-writeback.c) * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode) * ->mapping->tree_lock (__sync_single_inode)

View File

@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
endoff = (loff_t)(end - vma->vm_start - 1) endoff = (loff_t)(end - vma->vm_start - 1)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/* vmtruncate_range needs to take i_mutex and i_alloc_sem */ /* vmtruncate_range needs to take i_mutex */
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff); error = vmtruncate_range(mapping->host, offset, endoff);
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);

View File

@ -21,7 +21,6 @@
* Lock ordering in mm: * Lock ordering in mm:
* *
* inode->i_mutex (while writing or truncating, not reading or faulting) * inode->i_mutex (while writing or truncating, not reading or faulting)
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem * mm->mmap_sem
* page->flags PG_locked (lock_page) * page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex * mapping->i_mmap_mutex

View File

@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return -ENOSYS; return -ENOSYS;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
down_write(&inode->i_alloc_sem); inode_dio_wait(inode);
unmap_mapping_range(mapping, offset, (end - offset), 1); unmap_mapping_range(mapping, offset, (end - offset), 1);
inode->i_op->truncate_range(inode, offset, end); inode->i_op->truncate_range(inode, offset, end);
/* unmap again to remove racily COWed private pages */ /* unmap again to remove racily COWed private pages */
unmap_mapping_range(mapping, offset, (end - offset), 1); unmap_mapping_range(mapping, offset, (end - offset), 1);
up_write(&inode->i_alloc_sem);
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
return 0; return 0;