From d0b040f5f2557b2f507c01e88ad8cff424fdc6a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Apr 2021 12:23:33 +0200 Subject: [PATCH 01/32] ext4: fix overflow in ext4_iomap_alloc() A code in iomap alloc may overflow block number when converting it to byte offset. Luckily this is mostly harmless as we will just use more expensive method of writing using unwritten extents even though we are writing beyond i_size. Cc: stable@kernel.org Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210412102333.2676-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fe6045a46599..211acfba3af7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3418,7 +3418,7 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; From 5c680150d7f43484fde6b87271229f2206bfff7c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 26 Apr 2021 14:29:47 +0800 Subject: [PATCH 02/32] ext4: remove redundant check buffer_uptodate() Now set_buffer_uptodate() will test first and then set, so we don't have to check buffer_uptodate() first, remove it to simplify code. Reviewed-by: Ritesh Harjani Signed-off-by: Joseph Qi Link: https://lore.kernel.org/r/1619418587-5580-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 37002663d521..639ab5405d6a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3784,7 +3784,7 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) * have to read the block because we may read the old data * successfully. */ - if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } From 1fc57ca5a2cd26e0a526e5eb2b0fc0c054117a5b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 29 Apr 2021 18:16:49 +0800 Subject: [PATCH 03/32] ext4: remove redundant assignment to error Variable error is set to zero but this value is never read as it's not used later on, hence it is a redundant assignment and can be removed. Cleans up the following clang-analyzer warning: fs/ext4/ioctl.c:657:3: warning: Value stored to 'error' is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/1619691409-83160-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 31627f7dc5cd..a96d6721cef9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -659,10 +659,9 @@ static int ext4_ioc_getfsmap(struct super_block *sb, info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); - if (error == EXT4_QUERY_RANGE_ABORT) { - error = 0; + if (error == EXT4_QUERY_RANGE_ABORT) aborted = true; - } else if (error) + else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ From 618f003199c6188e01472b03cdbba227f1dc5f24 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 30 Apr 2021 21:50:46 +0300 Subject: [PATCH 04/32] ext4: fix memory leak in ext4_fill_super static int kthread(void *_create) will return -ENOMEM or -EINTR in case of internal failure or kthread_stop() call happens before threadfn call. To prevent fancy error checking and make code more straightforward we moved all cleanup code out of kmmpd threadfn. Also, dropped struct mmpd_data at all. Now struct super_block is a threadfn data and struct buffer_head embedded into struct ext4_sb_info. Reported-by: syzbot+d9e482e303930fa4f6ff@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/20210430185046.15742-1-paskripkin@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++++ fs/ext4/mmp.c | 28 +++++++++++++--------------- fs/ext4/super.c | 10 ++++------ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 639ab5405d6a..2b51cbee3907 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1488,6 +1488,7 @@ struct ext4_sb_info { struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; + struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; @@ -3720,6 +3721,9 @@ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +/* mmp.c */ +extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); + /* verity.c */ extern const struct fsverity_operations ext4_verityops; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 68fbeedd627b..6cb598b549ca 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -127,9 +127,9 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, */ static int kmmpd(void *data) { - struct super_block *sb = ((struct mmpd_data *) data)->sb; - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct super_block *sb = (struct super_block *) data; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; struct mmp_struct *mmp; ext4_fsblk_t mmp_block; u32 seq = 0; @@ -245,12 +245,18 @@ static int kmmpd(void *data) retval = write_mmp_block(sb, bh); exit_thread: - EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(data); - brelse(bh); return retval; } +void ext4_stop_mmpd(struct ext4_sb_info *sbi) +{ + if (sbi->s_mmp_tsk) { + kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_mmp_bh); + sbi->s_mmp_tsk = NULL; + } +} + /* * Get a random new sequence number but make sure it is not greater than * EXT4_MMP_SEQ_MAX. @@ -275,7 +281,6 @@ int ext4_multi_mount_protect(struct super_block *sb, struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = NULL; struct mmp_struct *mmp = NULL; - struct mmpd_data *mmpd_data; u32 seq; unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); unsigned int wait_time = 0; @@ -364,24 +369,17 @@ int ext4_multi_mount_protect(struct super_block *sb, goto failed; } - mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); - if (!mmpd_data) { - ext4_warning(sb, "not enough memory for mmpd_data"); - goto failed; - } - mmpd_data->sb = sb; - mmpd_data->bh = bh; + EXT4_SB(sb)->s_mmp_bh = bh; /* * Start a kernel thread to update the MMP block periodically. */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s", + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", (int)sizeof(mmp->mmp_bdevname), bdevname(bh->b_bdev, mmp->mmp_bdevname)); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(mmpd_data); ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); goto failed; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d29f6aa7d96e..b6fe1a027c78 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1245,8 +1245,8 @@ static void ext4_put_super(struct super_block *sb) ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* @@ -5186,8 +5186,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) failed_mount3: flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); failed_mount2: rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); @@ -5989,8 +5988,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) */ ext4_mark_recovery_complete(sb, es); } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || From ce3aba43599f0b50adbebff133df8d08a3d5fffe Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Fri, 7 May 2021 00:26:54 +0530 Subject: [PATCH 05/32] ext4: fix kernel infoleak via ext4_extent_header Initialize eh_generation of struct ext4_extent_header to prevent leaking info to userspace. Fixes KMSAN kernel-infoleak bug reported by syzbot at: http://syzkaller.appspot.com/bug?id=78e9ad0e6952a3ca16e8234724b2fa92d041b9b8 Cc: stable@kernel.org Reported-by: syzbot+2dcfeaf8cb49b05e8f1a@syzkaller.appspotmail.com Fixes: a86c61812637 ("[PATCH] ext3: add extent map support") Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20210506185655.7118-1-mail@anirudhrb.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cbf37b2cf871..1293de50c8d4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -825,6 +825,7 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } @@ -1090,6 +1091,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; + neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -1167,6 +1169,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); + neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); From b9a037b7f3c401d3c63e0423e56aef606b1ffaaf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 7 May 2021 15:19:04 +0800 Subject: [PATCH 06/32] ext4: cleanup in-core orphan list if ext4_truncate() failed to get a transaction handle In ext4_orphan_cleanup(), if ext4_truncate() failed to get a transaction handle, it didn't remove the inode from the in-core orphan list, which may probably trigger below error dump in ext4_destroy_inode() during the final iput() and could lead to memory corruption on the later orphan list changes. EXT4-fs (sda): Inode 6291467 (00000000b8247c67): orphan list check failed! 00000000b8247c67: 0001f30a 00000004 00000000 00000023 ............#... 00000000e24cde71: 00000006 014082a3 00000000 00000000 ......@......... 0000000072c6a5ee: 00000000 00000000 00000000 00000000 ................ ... This patch fix this by cleanup in-core orphan list manually if ext4_truncate() return error. Cc: stable@kernel.org Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210507071904.160808-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b6fe1a027c78..7e05973d410a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3101,8 +3101,15 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); - if (ret) + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); + } inode_unlock(inode); nr_truncates++; } else { From 8f6840c4fd1e7bd715e403074fb161c1a04cda73 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 10 May 2021 19:10:51 +0800 Subject: [PATCH 07/32] ext4: return error code when ext4_fill_flex_info() fails After commit c89128a00838 ("ext4: handle errors on ext4_commit_super"), 'ret' may be set to 0 before calling ext4_fill_flex_info(), if ext4_fill_flex_info() fails ext4_mount() doesn't return error code, it makes 'root' is null which causes crash in legacy_get_tree(). Fixes: c89128a00838 ("ext4: handle errors on ext4_commit_super") Reported-by: Hulk Robot Cc: # v4.18+ Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20210510111051.55650-1-yangyingliang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7e05973d410a..3b6203543607 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5065,6 +5065,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); + ret = -ENOMEM; goto failed_mount6; } From 01d5d96542fd4e383da79593f8a3450995ce2257 Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Tue, 18 May 2021 15:13:25 +0000 Subject: [PATCH 08/32] ext4: add discard/zeroout flags to journal flush Add a flags argument to jbd2_journal_flush to enable discarding or zero-filling the journal blocks while flushing the journal. Signed-off-by: Leah Rumancik Link: https://lore.kernel.org/r/20210518151327.130198-1-leah.rumancik@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 +- fs/ext4/ioctl.c | 6 +-- fs/ext4/super.c | 6 +-- fs/jbd2/journal.c | 119 +++++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/journal.c | 8 +-- include/linux/jbd2.h | 6 ++- 7 files changed, 134 insertions(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 211acfba3af7..e1ff4eb3ccba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (err) @@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_writepages_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a96d6721cef9..93e9419825b8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -706,7 +706,7 @@ static long ext4_ioctl_group_add(struct file *file, err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) @@ -884,7 +884,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) @@ -1027,7 +1027,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3b6203543607..ad3919dbd49e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5653,7 +5653,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb, return 0; } jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; @@ -5795,7 +5795,7 @@ static int ext4_freeze(struct super_block *sb) * Don't clear the needs_recovery flag if we failed to * flush the journal. */ - error = jbd2_journal_flush(journal); + error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; @@ -6389,7 +6389,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * otherwise be livelocked... */ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); if (err) return err; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2dc944442802..3a2ed60ea8b7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1686,6 +1686,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) write_unlock(&journal->j_state_lock); } +/** + * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) + * @journal: The journal to erase. + * @flags: A discard/zeroout request is sent for each physically contigous + * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or + * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation + * to perform. + * + * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes + * will be explicitly written if no hardware offload is available, see + * blkdev_issue_zeroout for more details. + */ +static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) +{ + int err = 0; + unsigned long block, log_offset; /* logical */ + unsigned long long phys_block, block_start, block_stop; /* physical */ + loff_t byte_start, byte_stop, byte_count; + struct request_queue *q = bdev_get_queue(journal->j_dev); + + /* flags must be set to either discard or zeroout */ + if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!q) + return -ENXIO; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + return -EOPNOTSUPP; + + /* + * lookup block mapping and issue discard/zeroout for each + * contiguous region + */ + log_offset = be32_to_cpu(journal->j_superblock->s_first); + block_start = ~0ULL; + for (block = log_offset; block < journal->j_total_len; block++) { + err = jbd2_journal_bmap(journal, block, &phys_block); + if (err) { + pr_err("JBD2: bad block at offset %lu", block); + return err; + } + + if (block_start == ~0ULL) { + block_start = phys_block; + block_stop = block_start - 1; + } + + /* + * last block not contiguous with current block, + * process last contiguous region and return to this block on + * next loop + */ + if (phys_block != block_stop + 1) { + block--; + } else { + block_stop++; + /* + * if this isn't the last block of journal, + * no need to process now because next block may also + * be part of this contiguous region + */ + if (block != journal->j_total_len - 1) + continue; + } + + /* + * end of contiguous region or this is last block of journal, + * take care of the region + */ + byte_start = block_start * journal->j_blocksize; + byte_stop = block_stop * journal->j_blocksize; + byte_count = (block_stop - block_start + 1) * + journal->j_blocksize; + + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, + byte_start, byte_stop); + + if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { + err = blkdev_issue_discard(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { + err = blkdev_issue_zeroout(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } + + if (unlikely(err != 0)) { + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + err, block_start, block_stop); + return err; + } + + /* reset start and stop after processing a region */ + block_start = ~0ULL; + } + + return blkdev_issue_flush(journal->j_dev); +} /** * jbd2_journal_update_sb_errno() - Update error in the journal. @@ -2246,13 +2350,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features); /** * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. + * @flags: optional operation on the journal blocks after the flush (see below) * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that - * recovery does not need to happen on remount. + * recovery does not need to happen on remount. Optionally, a discard or zeroout + * can be issued on the journal blocks after flushing. + * + * flags: + * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks + * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks */ - -int jbd2_journal_flush(journal_t *journal) +int jbd2_journal_flush(journal_t *journal, unsigned int flags) { int err = 0; transaction_t *transaction = NULL; @@ -2306,6 +2415,10 @@ int jbd2_journal_flush(journal_t *journal) * commits of data to the journal will restore the current * s_start value. */ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + + if (flags) + err = __jbd2_journal_erase(journal, flags); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e032f2e2c2c5..f1cc8258d34a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6018,7 +6018,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) * Then truncate log will be replayed resulting in cluster double free. */ jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4e589ce2fce6..4f15750aac5d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -308,7 +308,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) } jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); @@ -1000,7 +1000,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (ocfs2_mount_local(osb)) { jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); @@ -1070,7 +1070,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) if (replayed) { jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); @@ -1666,7 +1666,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ jbd2_journal_lock_updates(journal); - status = jbd2_journal_flush(journal); + status = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index db0e1920cb12..8543233b0388 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1370,6 +1370,10 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) * mode */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ +#define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 +#define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 +#define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ + JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer @@ -1500,7 +1504,7 @@ extern int jbd2_journal_invalidatepage(journal_t *, struct page *, unsigned int, unsigned int); extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); extern int jbd2_journal_stop(handle_t *); -extern int jbd2_journal_flush (journal_t *); +extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); From 351a0a3fbc3584a00036f05cfdb0cd3eb1dca92a Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Tue, 18 May 2021 15:13:26 +0000 Subject: [PATCH 09/32] ext4: add ioctl EXT4_IOC_CHECKPOINT ioctl EXT4_IOC_CHECKPOINT checkpoints and flushes the journal. This includes forcing all the transactions to the log, checkpointing the transactions, and flushing the log to disk. This ioctl takes u32 "flags" as an argument. Three flags are supported. EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN can be used to verify input to the ioctl. It returns error if there is any invalid input, otherwise it returns success without performing any checkpointing. The other two flags, EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT, can be used to issue requests to discard or zeroout the journal logs blocks, respectively. At this point, EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT is primarily added to enable testing of this codepath on devices that don't support discard. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT cannot both be set. Systems that wish to achieve content deletion SLO can set up a daemon that calls this ioctl at a regular interval such that it matches with the SLO requirement. Thus, with this patch, the ext4_dir_entry2 wipeout patch[1], and the Ext4 "-o discard" mount option set, Ext4 can now guarantee that all file contents, file metatdata, and filenames will not be accessible through the filesystem and will have had discard or zeroout requests issued for corresponding device blocks. The __jbd2_journal_erase function could also be used to discard or zero-fill the journal during journal load after recovery. This would provide a potential solution to a journal replay bug reported earlier this year[2]. After a successful journal recovery, e2fsck can call this ioctl to discard the journal as well. [1] https://lore.kernel.org/linux-ext4/YIHknqxngB1sUdie@mit.edu/ [2] https://lore.kernel.org/linux-ext4/YDZoaacIYStFQT8g@mit.edu/ Link: https://lore.kernel.org/r/20210518151327.130198-2-leah.rumancik@gmail.com Signed-off-by: Leah Rumancik Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 9 ++++++++ fs/ext4/ioctl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b51cbee3907..a646bfcbd0e8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -720,6 +720,7 @@ enum { #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) +#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) @@ -741,6 +742,14 @@ enum { #define EXT4_STATE_FLAG_NEWENTRY 0x00000004 #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 +/* flags for ioctl EXT4_IOC_CHECKPOINT */ +#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 +#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 +#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 +#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ + EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 93e9419825b8..5730aeca563c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -799,6 +799,57 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) return error; } +static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) +{ + int err = 0; + __u32 flags = 0; + unsigned int flush_flags = 0; + struct super_block *sb = file_inode(filp)->i_sb; + struct request_queue *q; + + if (copy_from_user(&flags, (__u32 __user *)arg, + sizeof(__u32))) + return -EFAULT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* check for invalid bits set */ + if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!EXT4_SB(sb)->s_journal) + return -ENODEV; + + if (flags & ~JBD2_JOURNAL_FLUSH_VALID) + return -EINVAL; + + q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); + if (!q) + return -ENXIO; + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + return 0; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) + flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { + flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); + } + + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + + return err; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1210,6 +1261,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); + case EXT4_IOC_CHECKPOINT: + return ext4_ioctl_checkpoint(filp, arg); + default: return -ENOTTY; } @@ -1290,6 +1344,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_CHECKPOINT: break; default: return -ENOIOCTLCMD; From fd7b23be92059f14537cb9cac0f0894c3a9b1284 Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Tue, 18 May 2021 15:13:27 +0000 Subject: [PATCH 10/32] ext4: update journal documentation Add a section about journal checkpointing, including information about the ioctl EXT4_IOC_CHECKPOINT which can be used to trigger a journal checkpoint from userspace. Also, update the journal allocation information to reflect that up to 10240000 blocks are used for the journal and that the journal is not necessarily contiguous. Signed-off-by: Leah Rumancik Changes in v5: - clarify behavior of DRY_RUN flag Link: https://lore.kernel.org/r/20210518151327.130198-3-leah.rumancik@gmail.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/journal.rst | 39 +++++++++++++++++----- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index cdbfec473167..5fad38860f17 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -4,14 +4,14 @@ Journal (jbd2) -------------- Introduced in ext3, the ext4 filesystem employs a journal to protect the -filesystem against corruption in the case of a system crash. A small -continuous region of disk (default 128MiB) is reserved inside the -filesystem as a place to land “important” data writes on-disk as quickly -as possible. Once the important data transaction is fully written to the -disk and flushed from the disk write cache, a record of the data being -committed is also written to the journal. At some later point in time, -the journal code writes the transactions to their final locations on -disk (this could involve a lot of seeking or a lot of small +filesystem against metadata inconsistencies in the case of a system crash. Up +to 10,240,000 file system blocks (see man mke2fs(8) for more details on journal +size limits) can be reserved inside the filesystem as a place to land +“important” data writes on-disk as quickly as possible. Once the important +data transaction is fully written to the disk and flushed from the disk write +cache, a record of the data being committed is also written to the journal. At +some later point in time, the journal code writes the transactions to their +final locations on disk (this could involve a lot of seeking or a lot of small read-write-erases) before erasing the commit record. Should the system crash during the second slow write, the journal can be replayed all the way to the latest commit record, guaranteeing the atomicity of whatever @@ -731,3 +731,26 @@ point, the refcount for inode 11 is not reliable, but that gets fixed by the replay of last inode 11 tag. Thus, by converting a non-idempotent procedure into a series of idempotent outcomes, fast commits ensured idempotence during the replay. + +Journal Checkpoint +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Checkpointing the journal ensures all transactions and their associated buffers +are submitted to the disk. In-progress transactions are waited upon and included +in the checkpoint. Checkpointing is used internally during critical updates to +the filesystem including journal recovery, filesystem resizing, and freeing of +the journal_t structure. + +A journal checkpoint can be triggered from userspace via the ioctl +EXT4_IOC_CHECKPOINT. This ioctl takes a single, u64 argument for flags. +Currently, three flags are supported. First, EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN +can be used to verify input to the ioctl. It returns error if there is any +invalid input, otherwise it returns success without performing +any checkpointing. This can be used to check whether the ioctl exists on a +system and to verify there are no issues with arguments or flags. The +other two flags are EXT4_IOC_CHECKPOINT_FLAG_DISCARD and +EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT. These flags cause the journal blocks to be +discarded or zero-filled, respectively, after the journal checkpoint is +complete. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT +cannot both be set. The ioctl may be useful when snapshotting a system or for +complying with content deletion SLOs. From b2d2e7573548295a14db999095fd1df40352c91a Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 20 May 2021 14:55:52 +0800 Subject: [PATCH 11/32] ext4: remove set but rewrite variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the ext4_dx_add_entry function, the at variable is assigned but will reset just after “again:” label. So delete the unnecessary assignment. this will not chang the logic. Signed-off-by: Tian Tao Reviewed-by: Artem Blagodarenko Link: https://lore.kernel.org/r/1621493752-36890-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a4af26d4459a..5fd56f616cf0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2499,7 +2499,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Which index block gets the new entry? */ if (at - entries >= icount1) { - frame->at = at = at - entries - icount1 + entries2; + frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } From e5e7010e5444d923e4091cafff61d05f2d19cada Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 22 May 2021 18:30:44 +0800 Subject: [PATCH 12/32] ext4: remove check for zero nr_to_scan in ext4_es_scan() After converting fs shrinkers to new scan/count API, we are no longer pass zero nr_to_scan parameter to detect the number of objects to free, just remove this check. Fixes: 1ab6c4997e04 ("fs: convert fs shrinkers to new scan/count API") Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210522103045.690103-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a729027322d..db3cd70a72e4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1574,9 +1574,6 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); - if (!nr_to_scan) - return ret; - nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); From 4fb7c70a889ead2e91e184895ac6e5354b759135 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 22 May 2021 18:30:45 +0800 Subject: [PATCH 13/32] ext4: correct the cache_nr in tracepoint ext4_es_shrink_exit The cache_cnt parameter of tracepoint ext4_es_shrink_exit means the remaining cache count after shrink, but now it is the cache count before shrink, fix it by read sbi->s_extent_cache_cnt again. Fixes: 1ab6c4997e04 ("fs: convert fs shrinkers to new scan/count API") Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210522103045.690103-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index db3cd70a72e4..9a3a8996aacf 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1576,6 +1576,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } From c89849cc0259f3d33624cc3bd127685c3c0fa25d Mon Sep 17 00:00:00 2001 From: Pan Dong Date: Tue, 25 May 2021 15:36:56 +0800 Subject: [PATCH 14/32] ext4: fix avefreec in find_group_orlov The avefreec should be average free clusters instead of average free blocks, otherwize Orlov's allocator will not work properly when bigalloc enabled. Cc: stable@kernel.org Signed-off-by: Pan Dong Link: https://lore.kernel.org/r/20210525073656.31594-1-pandong.peter@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9bab7fd4ccd5..e89fc0f770b0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -402,7 +402,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, * * We always try to spread first-level directories. * - * If there are blockgroups with both free inodes and free blocks counts + * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * @@ -411,7 +411,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or + * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -427,7 +427,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; - ext4_fsblk_t freeb, avefreec; + ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; @@ -446,9 +446,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - avefreec = freeb; + freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); + avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); From f9505c72b2ee80cb68af95449a5215906130e3be Mon Sep 17 00:00:00 2001 From: chenyichong Date: Wed, 26 May 2021 13:29:30 +0800 Subject: [PATCH 15/32] ext4: use local variable ei instead of EXT4_I() macro Signed-off-by: chenyichong Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210526052930.11278-1-chenyichong@uniontech.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e1ff4eb3ccba..c5cf700e2c8f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -374,7 +374,7 @@ void ext4_da_update_reserve_space(struct inode *inode, ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) From 6d2424a84533d3563ef525cb6e19cfda13abc472 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 27 May 2021 16:55:57 -0700 Subject: [PATCH 16/32] ext4: fix comment for s_hash_unsigned Fix the comment for s_hash_unsigned to not be the opposite of what it actually is. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20210527235557.2377525-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a646bfcbd0e8..8ff4ae3b5715 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1486,7 +1486,7 @@ struct ext4_sb_info { unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; From ee00d6b3c7aa65f97ace382ddf59739e65f5e8dd Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 31 May 2021 09:19:08 +0530 Subject: [PATCH 17/32] ext4: fsmap: fix the block/inode bitmap comment While debugging fstest ext4/027 failure, found below comment to be wrong and confusing. Hence fix it while we are at it. Signed-off-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/e79134132db7ea42f15747b5c669ee91cc1aacdf.1622432690.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h index 68c8001fee85..ac642be2302e 100644 --- a/fs/ext4/fsmap.h +++ b/fs/ext4/fsmap.h @@ -50,7 +50,7 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, #define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ #define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ #define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ -#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ -#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */ #endif /* __EXT4_FSMAP_H__ */ From 310c097c2bdbea253d6ee4e064f3e65580ef93ac Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 3 Jun 2021 07:33:02 +0530 Subject: [PATCH 18/32] ext4: remove duplicate definition of ext4_xattr_ibody_inline_set() ext4_xattr_ibody_inline_set() & ext4_xattr_ibody_set() have the exact same definition. Hence remove ext4_xattr_ibody_inline_set() and all its call references. Convert the callers of it to call ext4_xattr_ibody_set() instead. [ Modified to preserve ext4_xattr_ibody_set() and remove ext4_xattr_ibody_inline_set() instead. -- TYT ] Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/fd566b799bbbbe9b668eb5eecde5b5e319e3694f.1622685482.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 11 +++++------ fs/ext4/xattr.c | 26 +------------------------- fs/ext4/xattr.h | 6 +++--- 3 files changed, 9 insertions(+), 34 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3cf01629010d..70cb64db33f7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -204,7 +204,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer, /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_inline_set. + * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, @@ -286,7 +286,7 @@ static int ext4_create_inline_data(handle_t *handle, BUG_ON(!is.s.not_found); - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, @@ -358,7 +358,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, i.value = value; i.value_len = len; - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; @@ -431,7 +431,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; @@ -1925,8 +1925,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; - err = ext4_xattr_ibody_inline_set(handle, inode, - &i, &is); + err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 10ba4b24a0aa..6dd5c05c444a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2190,31 +2190,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_search *s = &is->s; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) - return error; - header = IHDR(inode, ext4_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); - } - return 0; -} - -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 730b91fa0dd7..77efb9a627ad 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -186,9 +186,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); From e9f9f61d0cdcb7f0b0b5feb2d84aa1c5894751f3 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Jun 2021 12:15:08 -0700 Subject: [PATCH 19/32] ext4: consolidate checks for resize of bigalloc into ext4_resize_begin Two different places checked for attempts to resize a filesystem with the bigalloc feature. Move the check into ext4_resize_begin, which both places already call. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/bee03303d999225ecb3bfa5be8576b2f4c6edbe6.1623093259.git.josh@joshtriplett.org Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 14 -------------- fs/ext4/resize.c | 5 +++++ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5730aeca563c..e27f34bceb8d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -692,13 +692,6 @@ static long ext4_ioctl_group_add(struct file *file, if (err) return err; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - err = mnt_want_write_file(file); if (err) goto group_add_out; @@ -921,13 +914,6 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto group_extend_out; } - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_extend_out; - } - err = mnt_want_write_file(filp); if (err) goto group_extend_out; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bd0d185654f3..d13bb9e76482 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,6 +74,11 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; From b1489186cc8391e0c1e342f9fbc3eedf6b944c61 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Jun 2021 12:15:24 -0700 Subject: [PATCH 20/32] ext4: add check to prevent attempting to resize an fs with sparse_super2 The in-kernel ext4 resize code doesn't support filesystem with the sparse_super2 feature. It fails with errors like this and doesn't finish the resize: EXT4-fs (loop0): resizing filesystem from 16640 to 7864320 blocks EXT4-fs warning (device loop0): verify_reserved_gdb:760: reserved GDT 2 missing grp 1 (32770) EXT4-fs warning (device loop0): ext4_resize_fs:2111: error (-22) occurred during file system resize EXT4-fs (loop0): resized filesystem to 2097152 To reproduce: mkfs.ext4 -b 4096 -I 256 -J size=32 -E resize=$((256*1024*1024)) -O sparse_super2 ext4.img 65M truncate -s 30G ext4.img mount ext4.img /mnt python3 -c 'import fcntl, os, struct ; fd = os.open("/mnt", os.O_RDONLY | os.O_DIRECTORY) ; fcntl.ioctl(fd, 0x40086610, struct.pack("Q", 30 * 1024 * 1024 * 1024 // 4096), False) ; os.close(fd)' dmesg | tail e2fsck ext4.img The userspace resize2fs tool has a check for this case: it checks if the filesystem has sparse_super2 set and if the kernel provides /sys/fs/ext4/features/sparse_super2. However, the former check requires manually reading and parsing the filesystem superblock. Detect this case in ext4_resize_begin and error out early with a clear error message. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/74b8ae78405270211943cd7393e65586c5faeed1.1623093259.git.josh@joshtriplett.org Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d13bb9e76482..fc885914c88a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -78,6 +78,10 @@ int ext4_resize_begin(struct super_block *sb) ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); return -EOPNOTSUPP; } + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) From d07621d9b9b8231187cc6e2121c927b3b8016789 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 8 Jun 2021 22:12:36 +0800 Subject: [PATCH 21/32] jbd2: clean up misleading comments for jbd2_fc_release_bufs This comments was for jbd2_fc_wait_bufs, not for jbd2_fc_release_bufs. Remove this misleading comments. Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210608141236.459441-1-yangerkun@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3a2ed60ea8b7..f88895b4920c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -934,10 +934,6 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) } EXPORT_SYMBOL(jbd2_fc_wait_bufs); -/* - * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf - * for completion. - */ int jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; @@ -945,10 +941,6 @@ int jbd2_fc_release_bufs(journal_t *journal) j_fc_off = journal->j_fc_off; - /* - * Wait in reverse order to minimize chances of us being woken up before - * all IOs have completed - */ for (i = j_fc_off - 1; i >= 0; i--) { bh = journal->j_fc_wbuf[i]; if (!bh) From 0caaefbaf2a429c256c7469cb603ca8918e96fb0 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Wed, 9 Jun 2021 15:55:45 +0800 Subject: [PATCH 22/32] ext4: no need to verify new add extent block ext4_ext_grow_indepth will add a new extent block which has init the expected content. We can mark this buffer as verified so to stop a useless check in __read_extent_tree_block. Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210609075545.1442160-1-yangerkun@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1293de50c8d4..92ad64b89d9b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1309,6 +1309,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); + set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); From 1866cba842437f3e7a5a8ee5b558744d9ae844d0 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:33 +0800 Subject: [PATCH 23/32] jbd2: remove the out label in __jbd2_journal_remove_checkpoint() The 'out' lable just return the 'ret' value and seems not required, so remove this label and switch to return appropriate value immediately. This patch also do some minor cleanup, no logical change. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 63b526d44886..bf5511d19ac5 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -564,13 +564,13 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); - if ((transaction = jh->b_cp_transaction) == NULL) { + transaction = jh->b_cp_transaction; + if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); - goto out; + return 0; } journal = transaction->t_journal; @@ -579,9 +579,9 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh); - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; + /* Is this transaction empty? */ + if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + return 0; /* * There is one special case to worry about: if we have just pulled the @@ -593,10 +593,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) - goto out; + return 0; - /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ + /* + * OK, that was the last buffer for the transaction, we can now + * safely remove this transaction from the log. + */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, @@ -606,9 +608,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - ret = 1; -out: - return ret; + return 1; } /* From fcf37549ae19e904bc6a5eadf5c25eca36100c5e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:34 +0800 Subject: [PATCH 24/32] jbd2: ensure abort the journal if detect IO error when writing original buffer back Although we merged c044f3d8360 ("jbd2: abort journal if free a async write error metadata buffer"), there is a race between jbd2_journal_try_to_free_buffers() and jbd2_journal_destroy(), so the jbd2_log_do_checkpoint() may still fail to detect the buffer write io error flag which may lead to filesystem inconsistency. jbd2_journal_try_to_free_buffers() ext4_put_super() jbd2_journal_destroy() __jbd2_journal_remove_checkpoint() detect buffer write error jbd2_log_do_checkpoint() jbd2_cleanup_journal_tail() <--- lead to inconsistency jbd2_journal_abort() Fix this issue by introducing a new atomic flag which only have one JBD2_CHECKPOINT_IO_ERROR bit now, and set it in __jbd2_journal_remove_checkpoint() when freeing a checkpoint buffer which has write_io_error flag. Then jbd2_journal_destroy() will detect this mark and abort the journal to prevent updating log tail. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 12 ++++++++++++ fs/jbd2/journal.c | 14 ++++++++++++++ include/linux/jbd2.h | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index bf5511d19ac5..d27c10f4502f 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -564,6 +564,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; + struct buffer_head *bh = jh2bh(jh); JBUFFER_TRACE(jh, "entry"); @@ -575,6 +576,17 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; JBUFFER_TRACE(jh, "removing from transaction"); + + /* + * If we have failed to write the buffer out to disk, the filesystem + * may become inconsistent. We cannot abort the journal here since + * we hold j_list_lock and we have to be careful about races with + * jbd2_journal_destroy(). So mark the writeback IO error in the + * journal here and we abort the journal later from a better context. + */ + if (buffer_write_io_error(bh)) + set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); + __buffer_unlink(jh); jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f88895b4920c..8b3f5bbd65f9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1610,6 +1610,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; + if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + jbd2_journal_abort(journal, -EIO); + return -EIO; + } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -2091,6 +2095,16 @@ int jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); + /* + * OK, all checkpoint transactions have been checked, now check the + * write out io error flag and abort the journal if some buffer failed + * to write back to the original location, otherwise the filesystem + * may become inconsistent. + */ + if (!is_journal_aborted(journal) && + test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_journal_abort(journal, -EIO); + if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8543233b0388..d5db408ae064 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -779,6 +779,11 @@ struct journal_s */ unsigned long j_flags; + /** + * @j_atomic_flags: Atomic journaling state flags. + */ + unsigned long j_atomic_flags; + /** * @j_errno: * @@ -1375,6 +1380,12 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) +/* + * Journal atomic flag definitions + */ +#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing + * buffer back to disk */ + /* * Function declarations for the journaling transaction and buffer * management From 235d68069cbd158cb00835d434e9e9accf9a6dd4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:35 +0800 Subject: [PATCH 25/32] jbd2: don't abort the journal when freeing buffers Now that we can be sure the journal is aborted once a buffer has failed to be written back to disk, we can remove the journal abort logic in jbd2_journal_try_to_free_buffers() which was introduced in commit c044f3d8360d ("jbd2: abort journal if free a async write error metadata buffer"), because it may cost and propably is not safe. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e8fc45fd751f..8804e126805f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2123,7 +2123,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) { struct buffer_head *head; struct buffer_head *bh; - bool has_write_io_error = false; int ret = 0; J_ASSERT(PageLocked(page)); @@ -2148,26 +2147,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; - - /* - * If we free a metadata buffer which has been failed to - * write out, the jbd2 checkpoint procedure will not detect - * this failure and may lead to filesystem inconsistency - * after cleanup journal tail. - */ - if (buffer_write_io_error(bh)) { - pr_err("JBD2: Error while async write back metadata bh %llu.", - (unsigned long long)bh->b_blocknr); - has_write_io_error = true; - } } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(page); - busy: - if (has_write_io_error) - jbd2_journal_abort(journal, -EIO); - return ret; } From 214eb5a4d8a2032fb9f0711d1b202eb88ee02920 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:36 +0800 Subject: [PATCH 26/32] jbd2: remove redundant buffer io error checks Now that __jbd2_journal_remove_checkpoint() can detect buffer io error and mark journal checkpoint error, then we abort the journal later before updating log tail to ensure the filesystem works consistently. So we could remove other redundant buffer io error checkes. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-5-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d27c10f4502f..75a4f622afaf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -91,8 +91,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_transaction == NULL && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + if (!jh->b_transaction && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; } @@ -228,7 +227,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ - result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -295,8 +293,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) goto restart; } if (!buffer_dirty(bh)) { - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; BUFFER_TRACE(bh, "remove from checkpoint"); if (__jbd2_journal_remove_checkpoint(jh)) /* The transaction was released; we're done */ @@ -356,8 +352,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) spin_lock(&journal->j_list_lock); goto restart2; } - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; /* * Now in whatever state the buffer currently is, we @@ -369,10 +363,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) } out: spin_unlock(&journal->j_list_lock); - if (result < 0) - jbd2_journal_abort(journal, result); - else - result = jbd2_cleanup_journal_tail(journal); + result = jbd2_cleanup_journal_tail(journal); return (result < 0) ? result : 0; } From 4ba3fcdde7e36af93610ceb3cc38365b14539865 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:37 +0800 Subject: [PATCH 27/32] jbd2,ext4: add a shrinker to release checkpointed buffers Current metadata buffer release logic in bdev_try_to_free_page() have a lot of use-after-free issues when umount filesystem concurrently, and it is difficult to fix directly because ext4 is the only user of s_op->bdev_try_to_free_page callback and we may have to add more special refcount or lock that is only used by ext4 into the common vfs layer, which is unacceptable. One better solution is remove the bdev_try_to_free_page callback, but the real problem is we cannot easily release journal_head on the checkpointed buffer, so try_to_free_buffers() cannot release buffers and page under memory pressure, which is more likely to trigger out-of-memory. So we cannot remove the callback directly before we find another way to release journal_head. This patch introduce a shrinker to free journal_head on the checkpointed transaction. After the journal_head got freed, try_to_free_buffers() could free buffer properly. Signed-off-by: Zhang Yi Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-6-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 ++ fs/jbd2/checkpoint.c | 147 ++++++++++++++++++++++++++++++++++++ fs/jbd2/journal.c | 87 +++++++++++++++++++++ include/linux/jbd2.h | 26 +++++++ include/trace/events/jbd2.h | 101 +++++++++++++++++++++++++ 5 files changed, 369 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad3919dbd49e..7ee2e21537e0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -5186,6 +5187,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } @@ -5511,6 +5513,12 @@ static int ext4_load_journal(struct super_block *sb, ext4_commit_super(sb); } + err = jbd2_journal_register_shrinker(journal); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + goto err_out; + } + return 0; err_out: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 75a4f622afaf..1abdae44a3d8 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh) transaction->t_checkpoint_io_list = jh; } +/* + * Check a checkpoint buffer could be release or not. + * + * Requires j_list_lock + */ +static inline bool __cp_buffer_busy(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); +} + /* * Try to release a checkpointed buffer from its transaction. * Returns 1 if we released it and 2 if we also released the @@ -458,6 +470,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) return 0; } +/* + * journal_shrink_one_cp_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * and try to release them. If the whole transaction is released, set + * the 'released' parameter. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, + unsigned long *nr_to_scan, + bool *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + unsigned long nr_freed = 0; + int ret; + + if (!jh || *nr_to_scan == 0) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + + (*nr_to_scan)--; + if (__cp_buffer_busy(jh)) + continue; + + nr_freed++; + ret = __jbd2_journal_remove_checkpoint(jh); + if (ret) { + *released = true; + break; + } + + if (need_resched()) + break; + } while (jh != last_jh && *nr_to_scan); + + return nr_freed; +} + +/* + * jbd2_journal_shrink_checkpoint_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the journal + * and try to release them. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, + unsigned long *nr_to_scan) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool released; + tid_t first_tid = 0, last_tid = 0, next_tid = 0; + tid_t tid = 0; + unsigned long nr_freed = 0; + unsigned long nr_scanned = *nr_to_scan; + +again: + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + goto out; + } + + /* + * Get next shrink transaction, resume previous scan or start + * over again. If some others do checkpoint and drop transaction + * from the checkpoint list, we ignore saved j_shrink_transaction + * and start over unconditionally. + */ + if (journal->j_shrink_transaction) + transaction = journal->j_shrink_transaction; + else + transaction = journal->j_checkpoint_transactions; + + if (!first_tid) + first_tid = transaction->t_tid; + last_transaction = journal->j_checkpoint_transactions->t_cpprev; + next_transaction = transaction; + last_tid = last_transaction->t_tid; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + tid = transaction->t_tid; + released = false; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + if (released) + continue; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + } while (transaction != last_transaction); + + if (transaction != last_transaction) { + journal->j_shrink_transaction = next_transaction; + next_tid = next_transaction->t_tid; + } else { + journal->j_shrink_transaction = NULL; + next_tid = 0; + } + + spin_unlock(&journal->j_list_lock); + cond_resched(); + + if (*nr_to_scan && next_tid) + goto again; +out: + nr_scanned -= *nr_to_scan; + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, + nr_freed, nr_scanned, next_tid); + + return nr_freed; +} + /* * journal_clean_checkpoint_list * @@ -580,6 +723,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __buffer_unlink(jh); jh->b_cp_transaction = NULL; + percpu_counter_dec(&journal->j_jh_shrink_count); jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ @@ -642,6 +786,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); } /* @@ -657,6 +802,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) { assert_spin_locked(&journal->j_list_lock); + + journal->j_shrink_transaction = NULL; if (transaction->t_cpnext) { transaction->t_cpnext->t_cpprev = transaction->t_cpprev; transaction->t_cpprev->t_cpnext = transaction->t_cpnext; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8b3f5bbd65f9..7c52feb6f753 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2050,6 +2050,91 @@ int jbd2_journal_load(journal_t *journal) return -EIO; } +/** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + +/** + * jbd2_journal_register_shrinker() + * @journal: Journal to act on. + * + * Init a percpu counter to record the checkpointed buffers on the checkpoint + * list and register a shrinker to release their journal_head. + */ +int jbd2_journal_register_shrinker(journal_t *journal) +{ + int err; + + journal->j_shrink_transaction = NULL; + + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); + if (err) + return err; + + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + err = register_shrinker(&journal->j_shrinker); + if (err) { + percpu_counter_destroy(&journal->j_jh_shrink_count); + return err; + } + + return 0; +} + +/** + * jbd2_journal_unregister_shrinker() + * @journal: Journal to act on. + * + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. + */ +void jbd2_journal_unregister_shrinker(journal_t *journal) +{ + percpu_counter_destroy(&journal->j_jh_shrink_count); + unregister_shrinker(&journal->j_shrinker); +} + /** * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. @@ -2122,6 +2207,8 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } + jbd2_journal_unregister_shrinker(journal); + if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d5db408ae064..6cc035321562 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -909,6 +909,29 @@ struct journal_s */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + /** + * @j_shrinker: + * + * Journal head shrinker, reclaim buffer's journal head which + * has been written back. + */ + struct shrinker j_shrinker; + + /** + * @j_jh_shrink_count: + * + * Number of journal buffers on the checkpoint list. [j_list_lock] + */ + struct percpu_counter j_jh_shrink_count; + + /** + * @j_shrink_transaction: + * + * Record next transaction will shrink on the checkpoint list. + * [j_list_lock] + */ + transaction_t *j_shrink_transaction; + /** * @j_head: * @@ -1422,6 +1445,7 @@ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); @@ -1532,6 +1556,8 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_register_shrinker(journal_t *journal); +extern void jbd2_journal_unregister_shrinker(journal_t *journal); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index d16a32867f3a..a4dfe005983d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall, __entry->stall_ms) ); +DECLARE_EVENT_CLASS(jbd2_journal_shrink, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long count), + + TP_ARGS(journal, nr_to_scan, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +TRACE_EVENT(jbd2_shrink_scan_exit, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long nr_shrunk, unsigned long count), + + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, nr_shrunk) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->nr_shrunk = nr_shrunk; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->nr_shrunk, + __entry->count) +); + +TRACE_EVENT(jbd2_shrink_checkpoint_list, + + TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, + unsigned long nr_freed, unsigned long nr_scanned, + tid_t next_tid), + + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, + nr_scanned, next_tid), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(tid_t, first_tid) + __field(tid_t, tid) + __field(tid_t, last_tid) + __field(unsigned long, nr_freed) + __field(unsigned long, nr_scanned) + __field(tid_t, next_tid) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->first_tid = first_tid; + __entry->tid = tid; + __entry->last_tid = last_tid; + __entry->nr_freed = nr_freed; + __entry->nr_scanned = nr_scanned; + __entry->next_tid = next_tid; + ), + + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " + "scanned %lu next transaction %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->first_tid, __entry->tid, __entry->last_tid, + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ From dbf2bab7935b65689f3b39178cf87374f0334ead Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:38 +0800 Subject: [PATCH 28/32] jbd2: simplify journal_clean_one_cp_list() Now that __try_to_free_cp_buf() remove checkpointed buffer or transaction when the buffer is not 'busy', which is only called by journal_clean_one_cp_list(). This patch simplify this function by remove __try_to_free_cp_buf() and invoke __cp_buffer_busy() directly. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-7-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1abdae44a3d8..51d1eb2ffeb9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -91,25 +91,6 @@ static inline bool __cp_buffer_busy(struct journal_head *jh) return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); } -/* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * - * Requires j_list_lock - */ -static int __try_to_free_cp_buf(struct journal_head *jh) -{ - int ret = 0; - struct buffer_head *bh = jh2bh(jh); - - if (!jh->b_transaction && !buffer_locked(bh) && !buffer_dirty(bh)) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - } - return ret; -} - /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * @@ -440,7 +421,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret; if (!jh) return 0; @@ -449,13 +429,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) do { jh = next_jh; next_jh = jh->b_cpnext; - if (!destroy) - ret = __try_to_free_cp_buf(jh); - else - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - if (!ret) + + if (!destroy && __cp_buffer_busy(jh)) return 0; - if (ret == 2) + + if (__jbd2_journal_remove_checkpoint(jh)) return 1; /* * This function only frees up some memory From 3b672e3aedffc9f092e7e7eae0050a97a8ca508e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:39 +0800 Subject: [PATCH 29/32] ext4: remove bdev_try_to_free_page() callback After we introduce a jbd2 shrinker to release checkpointed buffer's journal head, we could free buffer without bdev_try_to_free_page() under memory pressure. So this patch remove the whole bdev_try_to_free_page() callback directly. It also remove many use-after-free issues relate to it together. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-8-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7ee2e21537e0..9e0fb798b807 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1442,26 +1442,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode) return ext4_write_inode(inode, &wbc); } -/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd2 layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT4_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page); - - return try_to_free_buffers(page); -} - #ifdef CONFIG_FS_ENCRYPTION static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { @@ -1656,7 +1636,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif - .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { From acc6100d3ffa24bdd2add8ea85fb66811bcce5d4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:40 +0800 Subject: [PATCH 30/32] fs: remove bdev_try_to_free_page callback After remove the unique user of sop->bdev_try_to_free_page() callback, we could remove the callback and the corresponding blkdev_releasepage() at all. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-9-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/block_dev.c | 15 --------------- include/linux/fs.h | 1 - 2 files changed, 16 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6cc4d4cfe0c2..e215da6d49b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1733,20 +1733,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(blkdev_read_iter); -/* - * Try to release a page associated with block device when the system - * is under memory pressure. - */ -static int blkdev_releasepage(struct page *page, gfp_t wait) -{ - struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; - - if (super && super->s_op->bdev_try_to_free_page) - return super->s_op->bdev_try_to_free_page(super, page, wait); - - return try_to_free_buffers(page); -} - static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1760,7 +1746,6 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = blkdev_writepages, - .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .migratepage = buffer_migrate_page_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, diff --git a/include/linux/fs.h b/include/linux/fs.h index c3c88fdb9b2a..c3277b445f96 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2171,7 +2171,6 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot **(*get_dquots)(struct inode *); #endif - int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, From d578b99443fde0968246cc7cbf3bc3016123c2f4 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Fri, 11 Jun 2021 14:02:08 +0000 Subject: [PATCH 31/32] ext4: notify sysfs on errors_count value change After s_error_count is incremented, signal the change in the corresponding sysfs attribute via sysfs_notify. This allows userspace to poll() on changes to /sys/fs/ext4/*/errors_count. [ Moved call of ext4_notify_error_sysfs() to flush_stashed_error_work() to avoid BUG's caused by calling sysfs_notify trying to sleep after being called from an invalid context. -- TYT ] Signed-off-by: Jonathan Davies Link: https://lore.kernel.org/r/20210611140209.28903-1-jonathan.davies@nutanix.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 2 ++ fs/ext4/sysfs.c | 5 +++++ 3 files changed, 8 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8ff4ae3b5715..3c51e243450d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3624,6 +3624,7 @@ extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ +extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9e0fb798b807..20344633bdd9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -718,6 +718,7 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); + ext4_notify_error_sysfs(sbi); return; } write_directly: @@ -726,6 +727,7 @@ static void flush_stashed_error_work(struct work_struct *work) * out and hope for the best. */ ext4_commit_super(sbi->s_sb); + ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 55fcab60a59a..2314f7446592 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -506,6 +506,11 @@ static struct kobj_type ext4_feat_ktype = { .release = (void (*)(struct kobject *))kfree, }; +void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) +{ + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); +} + static struct kobject *ext4_root; static struct kobject *ext4_feat; From 16aa4c9a1fbe763c147a964cdc1f5be8ed98ed13 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 30 Jun 2021 16:36:38 +0800 Subject: [PATCH 32/32] jbd2: export jbd2_journal_[un]register_shrinker() Export jbd2_journal_[un]register_shrinker() to fix this error when ext4 is built as a module: ERROR: modpost: "jbd2_journal_unregister_shrinker" undefined! ERROR: modpost: "jbd2_journal_register_shrinker" undefined! Fixes: 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210630083638.140218-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7c52feb6f753..152880c298ca 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2122,6 +2122,7 @@ int jbd2_journal_register_shrinker(journal_t *journal) return 0; } +EXPORT_SYMBOL(jbd2_journal_register_shrinker); /** * jbd2_journal_unregister_shrinker() @@ -2134,6 +2135,7 @@ void jbd2_journal_unregister_shrinker(journal_t *journal) percpu_counter_destroy(&journal->j_jh_shrink_count); unregister_shrinker(&journal->j_shrinker); } +EXPORT_SYMBOL(jbd2_journal_unregister_shrinker); /** * jbd2_journal_destroy() - Release a journal_t structure.