From ec95f1dedc9c64ac5a8b0bdb7c276936c70fdedd Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Wed, 8 Apr 2020 08:52:40 -0400 Subject: [PATCH 1/3] orangefs: get rid of knob code... Christoph Hellwig sent in a reversion of "orangefs: remember count when reading." because: ->read_iter calls can race with each other and one or more ->flush calls. Remove the the scheme to store the read count in the file private data as is is completely racy and can cause use after free or double free conditions Christoph's reversion caused Orangefs not to work or to compile. I added a patch that fixed that, but intel's kbuild test robot pointed out that sending Christoph's patch followed by my patch upstream, it would break bisection because of the failure to compile. So I have combined the reversion plus my patch... here's the commit message that was in my patch: Logically, optimal Orangefs "pages" are 4 megabytes. Reading large Orangefs files 4096 bytes at a time is like trying to kick a dead whale down the beach. Before Christoph's "Revert orangefs: remember count when reading." I tried to give users a knob whereby they could, for example, use "count" in read(2) or bs with dd(1) to get whatever they considered an appropriate amount of bytes at a time from Orangefs and fill as many page cache pages as they could at once. Without the racy code that Christoph reverted Orangefs won't even compile, much less work. So this replaces the logic that used the private file data that Christoph reverted with a static number of bytes to read from Orangefs. I ran tests like the following to determine what a reasonable static number of bytes might be: dd if=/pvfsmnt/asdf of=/dev/null count=128 bs=4194304 dd if=/pvfsmnt/asdf of=/dev/null count=256 bs=2097152 dd if=/pvfsmnt/asdf of=/dev/null count=512 bs=1048576 . . . dd if=/pvfsmnt/asdf of=/dev/null count=4194304 bs=128 Reads seem faster using the static number, so my "knob code" wasn't just racy, it wasn't even a good idea... Signed-off-by: Mike Marshall Reported-by: kbuild test robot --- fs/orangefs/file.c | 26 +---------------------- fs/orangefs/inode.c | 39 ++++++----------------------------- fs/orangefs/orangefs-kernel.h | 4 ---- 3 files changed, 7 insertions(+), 62 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index c740159d9ad1..173e6ea57a47 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -346,23 +346,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { int ret; - struct orangefs_read_options *ro; - orangefs_stats.reads++; - /* - * Remember how they set "count" in read(2) or pread(2) or whatever - - * users can use count as a knob to control orangefs io size and later - * we can try to help them fill as many pages as possible in readpage. - */ - if (!iocb->ki_filp->private_data) { - iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); - if (!iocb->ki_filp->private_data) - return(ENOMEM); - ro = iocb->ki_filp->private_data; - ro->blksiz = iter->count; - } - down_read(&file_inode(iocb->ki_filp)->i_rwsem); ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); if (ret) @@ -650,12 +635,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) return rc; } -static int orangefs_file_open(struct inode * inode, struct file *file) -{ - file->private_data = NULL; - return generic_file_open(inode, file); -} - static int orangefs_flush(struct file *file, fl_owner_t id) { /* @@ -669,9 +648,6 @@ static int orangefs_flush(struct file *file, fl_owner_t id) struct inode *inode = file->f_mapping->host; int r; - kfree(file->private_data); - file->private_data = NULL; - if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); inode->i_state &= ~I_DIRTY_TIME; @@ -694,7 +670,7 @@ const struct file_operations orangefs_file_operations = { .lock = orangefs_lock, .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, - .open = orangefs_file_open, + .open = generic_file_open, .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 961c0fd8675a..12ae630fbed7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page) pgoff_t index; /* which page */ struct page *next_page; char *kaddr; - struct orangefs_read_options *ro = file->private_data; loff_t read_size; - loff_t roundedup; int buffer_index = -1; /* orangefs shared memory slot */ int slot_index; /* index into slot */ int remaining; /* - * If they set some miniscule size for "count" in read(2) - * (for example) then let's try to read a page, or the whole file - * if it is smaller than a page. Once "count" goes over a page - * then lets round up to the highest page size multiple that is - * less than or equal to "count" and do that much orangefs IO and - * try to fill as many pages as we can from it. - * - * "count" should be represented in ro->blksiz. - * - * inode->i_size = file size. + * Get up to this many bytes from Orangefs at a time and try + * to fill them into the page cache at once. Tests with dd made + * this seem like a reasonable static number, if there was + * interest perhaps this number could be made setable through + * sysfs... */ - if (ro) { - if (ro->blksiz < PAGE_SIZE) { - if (inode->i_size < PAGE_SIZE) - read_size = inode->i_size; - else - read_size = PAGE_SIZE; - } else { - roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ? - ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) : - ro->blksiz; - if (roundedup > inode->i_size) - read_size = inode->i_size; - else - read_size = roundedup; - - } - } else { - read_size = PAGE_SIZE; - } - if (!read_size) - read_size = PAGE_SIZE; + read_size = 524288; if (PageDirty(page)) orangefs_launder_page(page); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ed67f39fa7ce..e12aeb9623d6 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -239,10 +239,6 @@ struct orangefs_write_range { kgid_t gid; }; -struct orangefs_read_options { - ssize_t blksiz; -}; - extern struct orangefs_stats orangefs_stats; /* From 0e393a9a8f2a450862964451715d68e9a96a9c34 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Wed, 8 Apr 2020 09:05:45 -0400 Subject: [PATCH 2/3] orangefs: don't mess with I_DIRTY_TIMES in orangefs_flush Christoph Hellwig noticed that we were doing some unnecessary work in orangefs_flush: orangefs_flush just writes out data on every close(2) call. There is no need to change anything about the dirty state, especially as orangefs doesn't treat I_DIRTY_TIMES special in any way. The code seems to come from partially open coding vfs_fsync. He sent in a patch with the above commit message and also a patch that was a reversion of another Orangefs patch I had sent upstream a while ago. I had to fix his reversion patch so that it would compile which caused his "don't mess with I_DIRTY_TIMES" patch to fail to apply. So here I have just remade his patch and applied it after the fixed reversion patch. Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 173e6ea57a47..af375e049aae 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -645,16 +645,8 @@ static int orangefs_flush(struct file *file, fl_owner_t id) * on an explicit fsync call. This duplicates historical OrangeFS * behavior. */ - struct inode *inode = file->f_mapping->host; int r; - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - mark_inode_dirty_sync(inode); - } - r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); if (r > 0) return 0; From aa317d3351dee7cb0b27db808af0cd2340dcbaef Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Wed, 8 Apr 2020 13:01:03 -0400 Subject: [PATCH 3/3] orangefs: clarify build steps for test server in orangefs.txt Signed-off-by: Mike Marshall --- Documentation/filesystems/orangefs.txt | 34 ++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt index f4ba94950e3f..5a3865702a71 100644 --- a/Documentation/filesystems/orangefs.txt +++ b/Documentation/filesystems/orangefs.txt @@ -38,16 +38,6 @@ DOCUMENTATION http://www.orangefs.org/documentation/ - -USERSPACE FILESYSTEM SOURCE -=========================== - -http://www.orangefs.org/download - -Orangefs versions prior to 2.9.3 would not be compatible with the -upstream version of the kernel client. - - RUNNING ORANGEFS ON A SINGLE SERVER =================================== @@ -91,6 +81,14 @@ Mount the filesystem. mount -t pvfs2 tcp://localhost:3334/orangefs /pvfsmnt +USERSPACE FILESYSTEM SOURCE +=========================== + +http://www.orangefs.org/download + +Orangefs versions prior to 2.9.3 would not be compatible with the +upstream version of the kernel client. + BUILDING ORANGEFS ON A SINGLE SERVER ==================================== @@ -102,18 +100,24 @@ You can omit --prefix if you don't care that things are sprinkled around in /usr/local. As of version 2.9.6, OrangeFS uses Berkeley DB by default, we will probably be changing the default to LMDB soon. -./configure --prefix=/opt/ofs --with-db-backend=lmdb +./configure --prefix=/opt/ofs --with-db-backend=lmdb --disable-usrint make make install -Create an orangefs config file. +Create an orangefs config file by running pvfs2-genconfig and +specifying a target config file. Pvfs2-genconfig will prompt you +through. Generally it works fine to take the defaults, but you +should use your server's hostname, rather than "localhost" when +it comes to that question. /opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf Create an /etc/pvfs2tab file. +Localhost is fine for your pvfs2tab file: + echo tcp://localhost:3334/orangefs /pvfsmnt pvfs2 defaults,noauto 0 0 > \ /etc/pvfs2tab @@ -127,7 +131,7 @@ Bootstrap the server. Start the server. -/opt/osf/sbin/pvfs2-server /etc/pvfs2.conf +/opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf Now the server should be running. Pvfs2-ls is a simple test to verify that the server is running. @@ -137,11 +141,11 @@ test to verify that the server is running. If stuff seems to be working, load the kernel module and turn on the client core. -/opt/ofs/sbin/pvfs2-client -p /opt/osf/sbin/pvfs2-client-core +/opt/ofs/sbin/pvfs2-client -p /opt/ofs/sbin/pvfs2-client-core Mount your filesystem. -mount -t pvfs2 tcp://localhost:3334/orangefs /pvfsmnt +mount -t pvfs2 tcp://`hostname`:3334/orangefs /pvfsmnt RUNNING XFSTESTS