From b2edffdd912b4205899a8efa0974dfbbc3216109 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Apr 2015 17:48:54 -0400 Subject: [PATCH 1/5] fix mremap() vs. ioctx_kill() race teach ->mremap() method to return an error and have it fail for aio mappings in process of being killed Note that in case of ->mremap() failure we need to undo move_page_tables() we'd already done; we could call ->mremap() first, but then the failure of move_page_tables() would require undoing whatever _successful_ ->mremap() has done, which would be a lot more headache in general. Signed-off-by: Al Viro --- fs/aio.c | 19 ++++++++++++------- include/linux/fs.h | 2 +- mm/mremap.c | 10 ++++++++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f8e52a1854c1..58c33dcfb6ca 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -278,11 +278,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; - int i; + int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); @@ -292,13 +292,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) ctx = table->table[i]; if (ctx && ctx->aio_ring_file == file) { - ctx->user_id = ctx->mmap_base = vma->vm_start; + if (!atomic_read(&ctx->dead)) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + res = 0; + } break; } } rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + return res; } static const struct file_operations aio_ring_fops = { @@ -748,11 +752,12 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, { struct kioctx_table *table; - if (atomic_xchg(&ctx->dead, 1)) - return -EINVAL; - - spin_lock(&mm->ioctx_lock); + if (atomic_xchg(&ctx->dead, 1)) { + spin_unlock(&mm->ioctx_lock); + return -EINVAL; + } + table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index f4131e8ead74..52cc4492cb3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1549,7 +1549,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - void (*mremap)(struct file *, struct vm_area_struct *); + int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/mm/mremap.c b/mm/mremap.c index 57dadc025c64..2dc44b1cb1df 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) - vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } else if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, old_addr, + moved_len, true); + return err; + } + } /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT) { From deeb8525f9bcea60f5e86521880c1161de7a5829 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Apr 2015 17:57:44 -0400 Subject: [PATCH 2/5] ioctx_alloc(): fix vma (and file) leak on failure If we fail past the aio_setup_ring(), we need to destroy the mapping. We don't need to care about anybody having found ctx, or added requests to it, since the last failure exit is exactly the failure to make ctx visible to lookups. Reproducer (based on one by Joe Mario ): void count(char *p) { char s[80]; printf("%s: ", p); fflush(stdout); sprintf(s, "/bin/cat /proc/%d/maps|/bin/fgrep -c '/[aio] (deleted)'", getpid()); system(s); } int main() { io_context_t *ctx; int created, limit, i, destroyed; FILE *f; count("before"); if ((f = fopen("/proc/sys/fs/aio-max-nr", "r")) == NULL) perror("opening aio-max-nr"); else if (fscanf(f, "%d", &limit) != 1) fprintf(stderr, "can't parse aio-max-nr\n"); else if ((ctx = calloc(limit, sizeof(io_context_t))) == NULL) perror("allocating aio_context_t array"); else { for (i = 0, created = 0; i < limit; i++) { if (io_setup(1000, ctx + created) == 0) created++; } for (i = 0, destroyed = 0; i < created; i++) if (io_destroy(ctx[i]) == 0) destroyed++; printf("created %d, failed %d, destroyed %d\n", created, limit - created, destroyed); count("after"); } } Found-by: Joe Mario Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/aio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/aio.c b/fs/aio.c index 58c33dcfb6ca..a793f7023755 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -731,6 +731,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: + atomic_set(&ctx->dead, 1); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); From cf1b5ea1c5cd26a003b01d4798266a4bdf0ffe64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Apr 2015 15:41:17 -0400 Subject: [PATCH 3/5] [regression] ocfs2: do *not* increment ->ki_pos twice generic_file_direct_write() already does that. Broken by "ocfs2: do not fallback to buffer I/O write if appending" Signed-off-by: Al Viro --- fs/ocfs2/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..099972490f39 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2394,7 +2394,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, /* * for completing the rest of the request. */ - *ppos += written; count -= written; written_buffered = generic_perform_write(file, from, *ppos); /* From 9ce5a232b8a941be9e74c055535d81508207a570 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Apr 2015 15:45:02 -0400 Subject: [PATCH 4/5] ocfs2_file_write_iter: keep return value and current position update in sync Signed-off-by: Al Viro --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 099972490f39..a39067f4dd55 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2408,7 +2408,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, goto out_dio; } - iocb->ki_pos = *ppos + written_buffered; /* We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. @@ -2417,6 +2416,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ret = filemap_write_and_wait_range(file->f_mapping, *ppos, endbyte); if (ret == 0) { + iocb->ki_pos = *ppos + written_buffered; written += written_buffered; invalidate_mapping_pages(mapping, *ppos >> PAGE_CACHE_SHIFT, From 64b4e2526d1cf6e6a4db6213d6e2b6e6ab59479a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Apr 2015 17:00:32 -0400 Subject: [PATCH 5/5] ocfs2: _really_ sync the right range "ocfs2 syncs the wrong range" had been broken; prior to it the code was doing the wrong thing in case of O_APPEND, all right, but _after_ it we were syncing the wrong range in 100% cases. *ppos, aka iocb->ki_pos is incremented prior to that point, so we are always doing sync on the area _after_ the one we'd written to. Spotted by Joseph Qi back in January; unfortunately, I'd missed his mail back then ;-/ Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ocfs2/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a39067f4dd55..ba1790e52ff2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2439,10 +2439,14 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if (unlikely(written <= 0)) + goto no_sync; + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); if (ret < 0) written = ret; @@ -2453,10 +2457,12 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } +no_sync: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that