From 2ad32cf09bd28a21e6ad1595355a023ed631b529 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Thu, 1 Jul 2021 10:41:46 -0400
Subject: [PATCH 01/20] ceph: fix memory leak on decode error in
 ceph_handle_caps

If we hit a decoding error late in the frame, then we might exit the
function without putting the pool_ns string. Ensure that we always put
that reference on the way out of the function.

Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 39db97f149b9..c2d654156783 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4134,8 +4134,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
-	ceph_put_string(extra_info.pool_ns);
 	iput(inode);
+out:
+	ceph_put_string(extra_info.pool_ns);
 	return;
 
 flush_cap_releases:
@@ -4150,7 +4151,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 bad:
 	pr_err("ceph_handle_caps: corrupt message\n");
 	ceph_msg_dump(msg);
-	return;
+	goto out;
 }
 
 /*

From ce3a8732ae0d43a6d14aa24624c0f12b3e6281b9 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Mon, 14 Jun 2021 07:15:38 -0400
Subject: [PATCH 02/20] ceph: fix comment about short copies in ceph_write_end

Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a1e2813731d1..6d3f74d46e5b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1281,8 +1281,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
 
-	/* zero the stale part of the page if we did a short copy */
 	if (!PageUptodate(page)) {
+		/* just return that nothing was copied on a short copy */
 		if (copied < len) {
 			copied = 0;
 			goto out;

From fba97e8025015b63b1bdb73cd868c8ea832a1620 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Mon, 5 Jul 2021 09:22:54 +0800
Subject: [PATCH 03/20] ceph: make ceph_create_session_msg a global symbol

Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 16 +++++++++-------
 fs/ceph/mds_client.h |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0b69aec23e5c..b21c44af53bf 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1155,7 +1155,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 /*
  * session messages
  */
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
@@ -1163,7 +1163,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
 			   false);
 	if (!msg) {
-		pr_err("create_session_msg ENOMEM creating msg\n");
+		pr_err("ENOMEM creating session %s msg\n",
+		       ceph_session_op_name(op));
 		return NULL;
 	}
 	h = msg->front.iov_base;
@@ -1294,7 +1295,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
 			   GFP_NOFS, false);
 	if (!msg) {
-		pr_err("create_session_msg ENOMEM creating msg\n");
+		pr_err("ENOMEM creating session open msg\n");
 		return ERR_PTR(-ENOMEM);
 	}
 	p = msg->front.iov_base;
@@ -1803,8 +1804,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
 	     ceph_mds_state_name(state));
-	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
-				 ++session->s_renew_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+				      ++session->s_renew_seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
@@ -1818,7 +1819,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
 	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
 	     session->s_mds, ceph_session_state_name(session->s_state), seq);
-	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
@@ -1870,7 +1871,8 @@ static int request_close_session(struct ceph_mds_session *session)
 	dout("request_close_session mds%d state %s seq %lld\n",
 	     session->s_mds, ceph_session_state_name(session->s_state),
 	     session->s_seq);
-	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+				      session->s_seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 20e42d8b66c6..be4a97664464 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -522,6 +522,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
 extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 				     struct ceph_cap *cap);
 extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,

From 59b312f36230ea91ebb6ce1b11f2781604495d30 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Mon, 5 Jul 2021 09:22:55 +0800
Subject: [PATCH 04/20] ceph: make iterate_sessions a global symbol

Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c       | 26 +----------------------
 fs/ceph/mds_client.c | 49 +++++++++++++++++++++++++++++---------------
 fs/ceph/mds_client.h |  3 +++
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c2d654156783..a42dbc343749 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4226,33 +4226,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
 	dout("flush_dirty_caps done\n");
 }
 
-static void iterate_sessions(struct ceph_mds_client *mdsc,
-			     void (*cb)(struct ceph_mds_session *))
-{
-	int mds;
-
-	mutex_lock(&mdsc->mutex);
-	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
-		struct ceph_mds_session *s;
-
-		if (!mdsc->sessions[mds])
-			continue;
-
-		s = ceph_get_mds_session(mdsc->sessions[mds]);
-		if (!s)
-			continue;
-
-		mutex_unlock(&mdsc->mutex);
-		cb(s);
-		ceph_put_mds_session(s);
-		mutex_lock(&mdsc->mutex);
-	}
-	mutex_unlock(&mdsc->mutex);
-}
-
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-	iterate_sessions(mdsc, flush_dirty_session_caps);
+	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
 }
 
 void __ceph_touch_fmode(struct ceph_inode_info *ci,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b21c44af53bf..926971822174 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -811,6 +811,33 @@ static void put_request_session(struct ceph_mds_request *req)
 	}
 }
 
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+				void (*cb)(struct ceph_mds_session *),
+				bool check_state)
+{
+	int mds;
+
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+		struct ceph_mds_session *s;
+
+		s = __ceph_lookup_mds_session(mdsc, mds);
+		if (!s)
+			continue;
+
+		if (check_state && !check_session_state(s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		mutex_unlock(&mdsc->mutex);
+		cb(s);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
 void ceph_mdsc_release_request(struct kref *kref)
 {
 	struct ceph_mds_request *req = container_of(kref,
@@ -4411,24 +4438,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 }
 
 /*
- * lock unlock sessions, to wait ongoing session activities
+ * lock, then unlock the session, to wait for ongoing session activity
 */
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
 {
-	int i;
-
-	mutex_lock(&mdsc->mutex);
-	for (i = 0; i < mdsc->max_sessions; i++) {
-		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
-		if (!s)
-			continue;
-		mutex_unlock(&mdsc->mutex);
-		mutex_lock(&s->s_mutex);
-		mutex_unlock(&s->s_mutex);
-		ceph_put_mds_session(s);
-		mutex_lock(&mdsc->mutex);
-	}
-	mutex_unlock(&mdsc->mutex);
+	mutex_lock(&s->s_mutex);
+	mutex_unlock(&s->s_mutex);
 }
 
 static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4687,7 +4702,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	dout("pre_umount\n");
 	mdsc->stopping = 1;
 
-	lock_unlock_sessions(mdsc);
+	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index be4a97664464..4a75a14c2a88 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -522,6 +522,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+				       void (*cb)(struct ceph_mds_session *),
+				       bool check_state);
 extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
 extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 				     struct ceph_cap *cap);

From d095559ce4100f0c02aea229705230deac329c97 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Mon, 5 Jul 2021 09:22:56 +0800
Subject: [PATCH 05/20] ceph: flush mdlog before umounting

Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c         | 25 +++++++++++++++++++++++++
 fs/ceph/mds_client.h         |  1 +
 fs/ceph/strings.c            |  1 +
 include/linux/ceph/ceph_fs.h |  1 +
 4 files changed, 28 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 926971822174..d98a3eda0d4c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4693,6 +4693,30 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 	dout("wait_requests done\n");
 }
 
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+	struct ceph_msg *msg;
+
+	/*
+	 * Pre-luminous MDS crashes when it sees an unknown session request
+	 */
+	if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+		return;
+
+	mutex_lock(&s->s_mutex);
+	dout("request mdlog flush to mds%d (%s) seq %lld\n", s->s_mds,
+	     ceph_session_state_name(s->s_state), s->s_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+				      s->s_seq);
+	if (!msg) {
+		pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
+		       s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+	} else {
+		ceph_con_send(&s->s_con, msg);
+	}
+	mutex_unlock(&s->s_mutex);
+}
+
 /*
  * called before mount is ro, and before dentries are torn down.
  * (hmm, does this still race with new lookups?)
 */
@@ -4702,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	dout("pre_umount\n");
 	mdsc->stopping = 1;
 
+	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
 	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4a75a14c2a88..97c7f7bfa55f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -522,6 +522,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern void send_flush_mdlog(struct ceph_mds_session *s);
 extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
 				       void (*cb)(struct ceph_mds_session *),
 				       bool check_state);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4a79f3632260..573bb9556fb5 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op)
 	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
 	case CEPH_SESSION_FORCE_RO: return "force_ro";
 	case CEPH_SESSION_REJECT: return "reject";
+	case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
 	}
 	return "???";
 }
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index e41a811026f6..bc2699feddbe 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
 	CEPH_SESSION_FLUSHMSG_ACK,
 	CEPH_SESSION_FORCE_RO,
 	CEPH_SESSION_REJECT,
+	CEPH_SESSION_REQUEST_FLUSH_MDLOG,
 };
 
 extern const char *ceph_session_op_name(int op);

From e1a4541ec0b951685a49d1f72d183681e6433a45 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Mon, 5 Jul 2021 09:22:57 +0800
Subject: [PATCH 06/20] ceph: flush the mdlog before waiting on unsafe reqs

For client requests that will get both unsafe and safe replies from
the MDS daemons, the MDS won't flush the mdlog (journal log)
immediately, because it considers that unnecessary. That's true for
most cases but not all, fsync being the exception: an fsync must wait
until all of the unsafely-replied requests have also been safely
replied to.

Normally, when multiple threads or clients are running, the mdlog is
flushed in time anyway, because some other request triggers the mdlog
submit thread, so ordinary operations don't stall for long. But when a
single client runs only a single thread, the stall becomes obvious: in
the worst case it must wait up to 5 seconds for the mdlog to be
flushed by the MDS's periodic tick thread.

Fix this by triggering an mdlog flush in the relevant and auth MDSes
to which the in-flight requests were sent, just before waiting for the
unsafe requests to finish.
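To make the resulting flow concrete, here is a rough userspace-style
sketch (illustrative only: mds_of(), flush_mdlog(), wait_safe() and
MAX_MDS are stand-ins for the client internals added in the diff
below, not the kernel API):

	/* stand-in declarations, purely for illustration */
	#define MAX_MDS 128
	struct request;
	int mds_of(struct request *req);     /* which MDS got this request */
	void flush_mdlog(int mds);           /* ask that MDS to flush its journal */
	void wait_safe(struct request *req); /* block until safely replied */

	static void fsync_flow(struct request *reqs[], int nr)
	{
		_Bool sent[MAX_MDS] = { 0 };
		int i;

		/* 1. nudge each MDS that holds one of our unsafe
		 *    requests, once per MDS */
		for (i = 0; i < nr; i++) {
			int mds = mds_of(reqs[i]);
			if (!sent[mds]) {
				flush_mdlog(mds);
				sent[mds] = 1;
			}
		}

		/* 2. only now block; the explicit flush means the wait
		 *    no longer depends on the MDS's periodic 5s tick */
		for (i = 0; i < nr; i++)
			wait_safe(reqs[i]);
	}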
Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a42dbc343749..d5a46556e20e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2219,6 +2219,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 */
 static int unsafe_request_wait(struct inode *inode)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
 	int ret, err = 0;
@@ -2238,6 +2239,81 @@ static int unsafe_request_wait(struct inode *inode)
 	}
 	spin_unlock(&ci->i_unsafe_lock);
 
+	/*
+	 * Trigger a flush of the journal logs in all the relevant MDSes
+	 * manually; otherwise, in the worst case we would have to wait
+	 * up to 5 seconds for the MDSes to flush them periodically.
+	 */
+	if (req1 || req2) {
+		struct ceph_mds_session **sessions = NULL;
+		struct ceph_mds_session *s;
+		struct ceph_mds_request *req;
+		unsigned int max;
+		int i;
+
+		/*
+		 * mdsc->max_sessions is unlikely to change, but if it
+		 * does we simply retry with a reallocated sessions
+		 * array; this lets us avoid taking the mdsc->mutex.
+		 */
+retry:
+		max = mdsc->max_sessions;
+		sessions = krealloc(sessions, max * sizeof(s),
+				    GFP_KERNEL | __GFP_ZERO);
+		if (!sessions)
+			return -ENOMEM;
+
+		spin_lock(&ci->i_unsafe_lock);
+		if (req1) {
+			list_for_each_entry(req, &ci->i_unsafe_dirops,
+					    r_unsafe_dir_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds >= max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		if (req2) {
+			list_for_each_entry(req, &ci->i_unsafe_iops,
+					    r_unsafe_target_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds >= max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		spin_unlock(&ci->i_unsafe_lock);
+
+		/* the auth MDS */
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_auth_cap) {
+			s = ci->i_auth_cap->session;
+			if (!sessions[s->s_mds])
+				sessions[s->s_mds] = ceph_get_mds_session(s);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+
+		/* send flush mdlog request to MDSes */
+		for (i = 0; i < max; i++) {
+			s = sessions[i];
+			if (s) {
+				send_flush_mdlog(s);
+				ceph_put_mds_session(s);
+			}
+		}
+		kfree(sessions);
+	}
+
 	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
 	if (req1) {

From 49f8899e5edfc4b8a97d3959e1ded5a95f2e46b3 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Wed, 7 Jul 2021 12:26:47 -0400
Subject: [PATCH 07/20] ceph: remove some defunct forward declarations

We missed these in the recent fscache rework.

Reported-by: Steve French
Signed-off-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/cache.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 1409d6149281..058ea2a04376 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -26,12 +26,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
 void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
 void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
 
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
-int ceph_readpages_from_fscache(struct inode *inode,
-				struct address_space *mapping,
-				struct list_head *pages,
-				unsigned *nr_pages);
-
 static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
 	ci->fscache = NULL;

From 40e309de4dd84ba91b9e0549a5173ce13ef02c5e Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Mon, 26 Jul 2021 07:07:50 -0400
Subject: [PATCH 08/20] ceph: add a new vxattr to return auth mds for an inode

Add a new vxattr that shows what MDS is authoritative for an inode (if
we happen to have auth caps). If we don't have an auth cap for the
inode, then just return -1.

URL: https://tracker.ceph.com/issues/1276
Signed-off-by: Jeff Layton
Reviewed-by: Luis Henriques
Signed-off-by: Ilya Dryomov
---
 fs/ceph/xattr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1242db8d3444..159a1ffa4f4b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -340,6 +340,18 @@ static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val,
 			      ceph_cap_string(issued), issued);
 }
 
+static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
+				      char *val, size_t size)
+{
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = ceph_fmt_xattr(val, size, "%d",
+			     ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1);
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
 #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
 #define CEPH_XATTR_NAME2(_type, _name, _name2)	\
 	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -473,6 +485,13 @@ static struct ceph_vxattr ceph_common_vxattrs[] = {
 		.exists_cb = NULL,
 		.flags = VXATTR_FLAG_READONLY,
 	},
+	{
+		.name = "ceph.auth_mds",
+		.name_size = sizeof("ceph.auth_mds"),
+		.getxattr_cb = ceph_vxattrcb_auth_mds,
+		.exists_cb = NULL,
+		.flags = VXATTR_FLAG_READONLY,
+	},
 	{ .name = NULL, 0 }	/* Required table terminator */
 };

From b4002173b7989588b6feaefc42edaf011b596782 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Tue, 27 Jul 2021 15:47:12 -0400
Subject: [PATCH 09/20] ceph: cancel delayed work instead of flushing on mdsc
 teardown

The first thing metric_delayed_work does is check mdsc->stopping and
return immediately if it's set. That's good, since by this point we
would otherwise have already torn down the metric structures, but
there is no locking around mdsc->stopping.

It's possible that the ceph_metric_destroy call could race with the
delayed_work, in which case we could end up with the delayed_work
accessing destroyed percpu variables.

At this point in the mdsc teardown, the "stopping" flag has already
been set, so there's no benefit to flushing the work. Move the work
cancellation in ceph_metric_destroy ahead of the percpu variable
destruction, and eliminate the flush_delayed_work call in
ceph_mdsc_destroy.
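In outline, the teardown ordering this patch establishes (a sketch,
not the exact code; the real ceph_metric_destroy is in the diff
below):

	void ceph_metric_destroy(struct ceph_client_metric *m)
	{
		/* stop the worker first: waits for a running instance
		 * to finish and prevents it from being re-armed */
		cancel_delayed_work_sync(&m->delayed_work);

		/* only now is it safe to tear down the percpu counters
		 * that the worker samples */
		percpu_counter_destroy(&m->total_inodes);
		percpu_counter_destroy(&m->opened_inodes);
		/* ... remaining counters, session put, etc. ... */
	}

The distinction matters: flush_delayed_work() waits for the work to
run, which is pointless once "stopping" is set and can still touch the
destroyed counters; cancel_delayed_work_sync() guarantees it won't run
again.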
Fixes: 18f473b384a6 ("ceph: periodically send perf metrics to MDSes")
Signed-off-by: Jeff Layton
Reviewed-by: Xiubo Li
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 1 -
 fs/ceph/metric.c     | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d98a3eda0d4c..85934091e024 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4954,7 +4954,6 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
 
 	ceph_metric_destroy(&mdsc->metric);
 
-	flush_delayed_work(&mdsc->metric.delayed_work);
 	fsc->mdsc = NULL;
 	kfree(mdsc);
 	dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 5ac151eb0d49..04d5df29bbbf 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -302,6 +302,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
 	if (!m)
 		return;
 
+	cancel_delayed_work_sync(&m->delayed_work);
+
 	percpu_counter_destroy(&m->total_inodes);
 	percpu_counter_destroy(&m->opened_inodes);
 	percpu_counter_destroy(&m->i_caps_mis);
@@ -309,8 +311,6 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
 	percpu_counter_destroy(&m->d_lease_mis);
 	percpu_counter_destroy(&m->d_lease_hit);
 
-	cancel_delayed_work_sync(&m->delayed_work);
-
 	ceph_put_mds_session(m->session);
 }

From c80dc3aee984c647d747bb07c108862effc917d8 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Tue, 27 Jul 2021 10:50:02 -0400
Subject: [PATCH 10/20] ceph: remove redundant initializations from mdsc and
 session

The ceph_mds_client and ceph_mds_session structures are kzalloc'ed, so
there's no need to explicitly initialize any of their fields to 0.

Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85934091e024..b1f8a4025195 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -743,8 +743,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_mdsc = mdsc;
 	s->s_mds = mds;
 	s->s_state = CEPH_MDS_SESSION_NEW;
-	s->s_ttl = 0;
-	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
 	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
@@ -753,17 +751,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_cap_ttl = jiffies - 1;
 
 	spin_lock_init(&s->s_cap_lock);
-	s->s_renew_requested = 0;
-	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
-	s->s_nr_caps = 0;
 	refcount_set(&s->s_ref, 1);
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
 	xa_init(&s->s_delegated_inos);
-	s->s_num_cap_releases = 0;
-	s->s_cap_reconnect = 0;
-	s->s_cap_iterator = NULL;
 	INIT_LIST_HEAD(&s->s_cap_releases);
 	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
 
@@ -4601,21 +4593,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	init_completion(&mdsc->safe_umount_waiters);
 	init_waitqueue_head(&mdsc->session_close_wq);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
-	mdsc->sessions = NULL;
-	atomic_set(&mdsc->num_sessions, 0);
-	mdsc->max_sessions = 0;
-	mdsc->stopping = 0;
-	atomic64_set(&mdsc->quotarealms_count, 0);
 	mdsc->quotarealms_inodes = RB_ROOT;
 	mutex_init(&mdsc->quotarealms_inodes_mutex);
-	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->snap_empty);
-	mdsc->num_snap_realms = 0;
 	spin_lock_init(&mdsc->snap_empty_lock);
-	mdsc->last_tid = 0;
-	mdsc->oldest_tid = 0;
 	mdsc->request_tree = RB_ROOT;
 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
 	mdsc->last_renew_caps = jiffies;
@@ -4627,11 +4610,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	mdsc->last_cap_flush_tid = 1;
 	INIT_LIST_HEAD(&mdsc->cap_flush_list);
 	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
-	mdsc->num_cap_flushing = 0;
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
-	atomic_set(&mdsc->cap_reclaim_pending, 0);
 	err = ceph_metric_init(&mdsc->metric);
 	if (err)
 		goto err_mdsmap;

From 0ba92e1c5f7ce7e9c1c828a84e873d7be51c1b9f Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Mon, 2 Aug 2021 11:01:26 -0400
Subject: [PATCH 11/20] ceph: add ceph_change_snap_realm() helper

Consolidate some fiddly code for changing an inode's snap_realm into a
new helper function, and change the callers to use it. While we're in
here, nothing uses the i_snap_realm_counter field, so remove that from
the inode.

Signed-off-by: Jeff Layton
Reviewed-by: Luis Henriques
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c  | 36 +++---------------------------
 fs/ceph/inode.c | 11 ++-------
 fs/ceph/snap.c  | 59 ++++++++++++++++++++++++++++++-----------------
 fs/ceph/super.h |  2 +-
 4 files changed, 45 insertions(+), 63 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d5a46556e20e..b75a9e1d6882 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -704,23 +704,7 @@ void ceph_add_cap(struct inode *inode,
 		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
 							       realmino);
 		if (realm) {
-			struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
-			if (oldrealm) {
-				spin_lock(&oldrealm->inodes_with_caps_lock);
-				list_del_init(&ci->i_snap_realm_item);
-				spin_unlock(&oldrealm->inodes_with_caps_lock);
-			}
-
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_add(&ci->i_snap_realm_item,
-				 &realm->inodes_with_caps);
-			ci->i_snap_realm = realm;
-			if (realm->ino == ci->i_vino.ino)
-				realm->inode = inode;
-			spin_unlock(&realm->inodes_with_caps_lock);
-
-			if (oldrealm)
-				ceph_put_snap_realm(mdsc, oldrealm);
+			ceph_change_snap_realm(inode, realm);
 		} else {
 			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
 			       realmino);
@@ -1112,20 +1096,6 @@ int ceph_is_any_caps(struct inode *inode)
 	return ret;
 }
 
-static void drop_inode_snap_realm(struct ceph_inode_info *ci)
-{
-	struct ceph_snap_realm *realm = ci->i_snap_realm;
-	spin_lock(&realm->inodes_with_caps_lock);
-	list_del_init(&ci->i_snap_realm_item);
-	ci->i_snap_realm_counter++;
-	ci->i_snap_realm = NULL;
-	if (realm->ino == ci->i_vino.ino)
-		realm->inode = NULL;
-	spin_unlock(&realm->inodes_with_caps_lock);
-	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
-			    realm);
-}
-
 /*
  * Remove a cap. Take steps to deal with a racing iterate_session_caps.
  *
@@ -1201,7 +1171,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 	 * keep i_snap_realm.
 	 */
 	if (ci->i_wr_ref == 0 && ci->i_snap_realm)
-		drop_inode_snap_realm(ci);
+		ceph_change_snap_realm(&ci->vfs_inode, NULL);
 
 	__cap_delay_cancel(mdsc, ci);
 }
@@ -3084,7 +3054,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
 			}
 			/* see comment in __ceph_remove_cap() */
 			if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
-				drop_inode_snap_realm(ci);
+				ceph_change_snap_realm(inode, NULL);
 		}
 	}
 	if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1bd2cc015913..2df1e1284451 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -581,16 +581,9 @@ void ceph_evict_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		if (ceph_snap(inode) == CEPH_NOSNAP) {
-			struct ceph_snap_realm *realm = ci->i_snap_realm;
 			dout(" dropping residual ref to snap realm %p\n",
-			     realm);
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_del_init(&ci->i_snap_realm_item);
-			ci->i_snap_realm = NULL;
-			if (realm->ino == ci->i_vino.ino)
-				realm->inode = NULL;
-			spin_unlock(&realm->inodes_with_caps_lock);
-			ceph_put_snap_realm(mdsc, realm);
+			     ci->i_snap_realm);
+			ceph_change_snap_realm(inode, NULL);
 		} else {
 			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
 			ci->i_snap_realm = NULL;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 15105f9da3fd..b41e6724c591 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -849,6 +849,43 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
 	dout("flush_snaps done\n");
 }
 
+/**
+ * ceph_change_snap_realm - change the snap_realm for an inode
+ * @inode: inode to move to new snap realm
+ * @realm: new realm to move inode into (may be NULL)
+ *
+ * Detach an inode from its old snaprealm (if any) and attach it to
+ * the new snaprealm (if any). The old snap realm reference held by
+ * the inode is put. If realm is non-NULL, then the caller's reference
+ * to it is taken over by the inode.
+ */
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	if (oldrealm) {
+		spin_lock(&oldrealm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		if (oldrealm->ino == ci->i_vino.ino)
+			oldrealm->inode = NULL;
+		spin_unlock(&oldrealm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, oldrealm);
+	}
+
+	ci->i_snap_realm = realm;
+
+	if (realm) {
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps);
+		if (realm->ino == ci->i_vino.ino)
+			realm->inode = inode;
+		spin_unlock(&realm->inodes_with_caps_lock);
+	}
+}
+
 /*
 * Handle a snap notification from the MDS.
@@ -935,7 +972,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		};
 		struct inode *inode = ceph_find_inode(sb, vino);
 		struct ceph_inode_info *ci;
-		struct ceph_snap_realm *oldrealm;
 
 		if (!inode)
 			continue;
@@ -960,27 +996,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		}
 		dout(" will move %p to split realm %llx %p\n",
 		     inode, realm->ino, realm);
-		/*
-		 * Move the inode to the new realm
-		 */
-		oldrealm = ci->i_snap_realm;
-		spin_lock(&oldrealm->inodes_with_caps_lock);
-		list_del_init(&ci->i_snap_realm_item);
-		spin_unlock(&oldrealm->inodes_with_caps_lock);
-
-		spin_lock(&realm->inodes_with_caps_lock);
-		list_add(&ci->i_snap_realm_item,
-			 &realm->inodes_with_caps);
-		ci->i_snap_realm = realm;
-		if (realm->ino == ci->i_vino.ino)
-			realm->inode = inode;
-		spin_unlock(&realm->inodes_with_caps_lock);
-
-		spin_unlock(&ci->i_ceph_lock);
 
 		ceph_get_snap_realm(mdsc, realm);
-		ceph_put_snap_realm(mdsc, oldrealm);
-
+		ceph_change_snap_realm(inode, realm);
+		spin_unlock(&ci->i_ceph_lock);
 		iput(inode);
 		continue;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b1a363641beb..3fad464d7c6d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -418,7 +418,6 @@ struct ceph_inode_info {
 		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
 		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
 	};
-	int i_snap_realm_counter; /* snap realm (if caps) */
 	struct list_head i_snap_realm_item;
 	struct list_head i_snap_flush_item;
 	struct timespec64 i_btime;
@@ -929,6 +928,7 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
 extern int ceph_update_snap_trace(struct ceph_mds_client *m,
 				  void *p, void *e, bool deletion,
 				  struct ceph_snap_realm **realm_ret);
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm);
 extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			     struct ceph_mds_session *session,
 			     struct ceph_msg *msg);

From 692e17159792a13e8c5031bdc0ae9b0f3158593d Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Mon, 2 Aug 2021 15:32:47 -0400
Subject: [PATCH 12/20] ceph: print more information when we can't find
 snaprealm

Print a bit more information when we can't find the realm during
ceph_add_cap. Show both the inode number and the old realm's ino.

Suggested-by: Sage Weil
Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b75a9e1d6882..1f544eeecf79 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -703,13 +703,12 @@ void ceph_add_cap(struct inode *inode,
 		 */
 		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
 							       realmino);
-		if (realm) {
+		if (realm)
 			ceph_change_snap_realm(inode, realm);
-		} else {
-			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
-			       realmino);
-			WARN_ON(!realm);
-		}
+		else
+			WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
+			     __func__, realmino, ci->i_vino.ino,
+			     ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
 	}
 
 	__check_cap_issue(ci, cap, issued);

From d517b3983dd3106ca92d6c5d0d09415a4a09481c Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Wed, 18 Aug 2021 09:31:19 +0800
Subject: [PATCH 13/20] ceph: reconnect to the export targets on new mdsmaps

If the export MDS crashes just after the EImportStart journal is
flushed, a standby MDS takes over for it, and when replaying the
EImportStart journal it will wait for the client to reconnect. That
may never happen, because the client may not have registered or opened
the session yet.

When receiving a new map, ensure we also reconnect to valid export
targets if their sessions don't exist yet.

Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 57 +++++++++++++++++++++++++++++++++++++++++++-
 fs/ceph/mdsmap.c     | 12 +++++++---
 2 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b1f8a4025195..ba1036f09127 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -11,6 +11,7 @@
 #include <linux/ratelimit.h>
 #include <linux/bits.h>
 #include <linux/ktime.h>
+#include <linux/bitmap.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -4171,13 +4172,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			  struct ceph_mdsmap *newmap,
 			  struct ceph_mdsmap *oldmap)
 {
-	int i;
+	int i, j, err;
 	int oldstate, newstate;
 	struct ceph_mds_session *s;
+	unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
 
 	dout("check_new_map new %u old %u\n",
 	     newmap->m_epoch, oldmap->m_epoch);
 
+	if (newmap->m_info) {
+		for (i = 0; i < newmap->possible_max_rank; i++) {
+			for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
+				set_bit(newmap->m_info[i].export_targets[j], targets);
+		}
+	}
+
 	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
 		if (!mdsc->sessions[i])
 			continue;
@@ -4231,6 +4240,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
 		    newstate >= CEPH_MDS_STATE_RECONNECT) {
 			mutex_unlock(&mdsc->mutex);
+			clear_bit(i, targets);
 			send_mds_reconnect(mdsc, s);
 			mutex_lock(&mdsc->mutex);
 		}
@@ -4253,6 +4263,51 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		}
 	}
 
+	/*
+	 * Only open and reconnect sessions that don't exist yet.
+	 */
+	for (i = 0; i < newmap->possible_max_rank; i++) {
+		/*
+		 * The export MDS may have crashed just after the
+		 * EImportStart journal was flushed, in which case the
+		 * standby MDS taking over and replaying that journal
+		 * will wait for the client to reconnect, but the
+		 * client may never have registered/opened the session
+		 * yet.
+		 *
+		 * So try to reconnect to an MDS daemon if its rank is
+		 * in the export targets array and it is in the
+		 * up:reconnect state.
+		 */
+		newstate = ceph_mdsmap_get_state(newmap, i);
+		if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
+			continue;
+
+		/*
+		 * In rare cases the session may already have been
+		 * registered and opened by requests that chose a
+		 * random MDS during the mdsc->mutex unlock/lock gap
+		 * below. The MDS daemon will just queue those requests
+		 * and keep waiting for the client's reconnection
+		 * request in the up:reconnect state.
+		 */
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (likely(!s)) {
+			s = __open_export_target_session(mdsc, i);
+			if (IS_ERR(s)) {
+				err = PTR_ERR(s);
+				pr_err("failed to open export target session, err %d\n",
+				       err);
+				continue;
+			}
+		}
+		dout("send reconnect to export target mds.%d\n", i);
+		mutex_unlock(&mdsc->mutex);
+		send_mds_reconnect(mdsc, s);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+
 	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
 		s = mdsc->sessions[i];
 		if (!s)
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 3c444b9cb17b..61d67cbcb367 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -122,6 +122,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 	int err;
 	u8 mdsmap_v;
 	u16 mdsmap_ev;
+	u32 target;
 
 	m = kzalloc(sizeof(*m), GFP_NOFS);
 	if (!m)
@@ -260,9 +261,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 						       sizeof(u32), GFP_NOFS);
 			if (!info->export_targets)
 				goto nomem;
-			for (j = 0; j < num_export_targets; j++)
-				info->export_targets[j] =
-				       ceph_decode_32(&pexport_targets);
+			for (j = 0; j < num_export_targets; j++) {
+				target = ceph_decode_32(&pexport_targets);
+				if (target >= m->possible_max_rank) {
+					err = -EIO;
+					goto corrupt;
+				}
+				info->export_targets[j] = target;
+			}
 		} else {
 			info->export_targets = NULL;
 		}

From b11ed50346683a749632ea664959b28d524d7395 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Wed, 11 Aug 2021 06:40:42 -0400
Subject: [PATCH 14/20] ceph: request Fw caps before updating the mtime in
 ceph_write_iter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current code will update the mtime and then try to get caps to
handle the write. If we end up having to request caps from the MDS,
then the mtime in the cap grant will clobber the updated mtime and
it'll be lost.

This is most noticeable when two clients are alternately writing to
the same file. Fw caps are continually being granted and revoked, and
the mtime ends up stuck because the updated mtimes are always being
overwritten with the old one.

Fix this by changing the order of operations in ceph_write_iter to get
the caps before updating the times. Also, make sure we check the pool
full conditions before even getting any caps or uninlining.
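The resulting order of operations, condensed (a sketch of the patched
ceph_write_iter(); error handling elided, and check_pool_full() is a
stand-in name for the osdmap/pool-flag checks):

	check_pool_full();		/* fail early, before doing any work */
	file_remove_privs(file);
	if (ci->i_inline_version != CEPH_INLINE_NONE)
		ceph_uninline_data(file, NULL);
	ceph_get_caps(file, need, want, pos + count, &got); /* may block on the MDS */
	file_update_time(file);		/* a cap grant can no longer clobber this */
	inode_inc_iversion_raw(inode);

Since ceph_get_caps() can involve a round trip to the MDS, any mtime
written before it could be overwritten by the attributes carried in
the cap grant; updating the time only after Fw caps are held closes
that window.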
URL: https://tracker.ceph.com/issues/46574
Reported-by: Jozef Kováč
Signed-off-by: Jeff Layton
Reviewed-by: Xiubo Li
Reviewed-by: Luis Henriques
Signed-off-by: Ilya Dryomov
---
 fs/ceph/file.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d1755ac1d964..3daebfaec8c6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-	err = file_remove_privs(file);
-	if (err)
-		goto out;
-
-	err = file_update_time(file);
-	if (err)
-		goto out;
-
-	inode_inc_iversion_raw(inode);
-
-	if (ci->i_inline_version != CEPH_INLINE_NONE) {
-		err = ceph_uninline_data(file, NULL);
-		if (err < 0)
-			goto out;
-	}
-
 	down_read(&osdc->lock);
 	map_flags = osdc->osdmap->flags;
 	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
@@ -1748,6 +1732,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
+	err = file_remove_privs(file);
+	if (err)
+		goto out;
+
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		err = ceph_uninline_data(file, NULL);
+		if (err < 0)
+			goto out;
+	}
+
 	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
 	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -1759,6 +1753,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err < 0)
 		goto out;
 
+	err = file_update_time(file);
+	if (err)
+		goto out_caps;
+
+	inode_inc_iversion_raw(inode);
+
 	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
 
@@ -1842,6 +1842,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 	goto out_unlocked;
 
+out_caps:
+	ceph_put_cap_refs(ci, got);
 out:
 	if (direct_lock)
 		ceph_end_io_direct(inode);

From a6d37ccdd240e80f26aaea0e62cda310e0e184d7 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Wed, 25 Aug 2021 21:45:43 +0800
Subject: [PATCH 15/20] ceph: remove the capsnaps when removing caps

Capsnaps take an inode reference via ihold when they are queued to be
flushed. When force unmounting, the client will just close the
sessions and may never get a flush reply, leaking both the capsnap and
its inode reference.

Fix this by removing the capsnaps for an inode when removing the caps.

URL: https://tracker.ceph.com/issues/52295
Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c       | 68 +++++++++++++++++++++++++++++++++-----------
 fs/ceph/mds_client.c | 31 +++++++++++++++++++-
 fs/ceph/super.h      |  6 ++++
 3 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1f544eeecf79..2e5d2659f48e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3159,7 +3159,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				break;
 			}
 		}
-		BUG_ON(!found);
+
+		if (!found) {
+			/*
+			 * The capsnap should already be removed when removing
+			 * auth cap in the case of a forced unmount.
+			 */
+			WARN_ON_ONCE(ci->i_auth_cap);
+			goto unlock;
+		}
+
 		capsnap->dirty_pages -= nr;
 		if (capsnap->dirty_pages == 0) {
 			complete_capsnap = true;
@@ -3181,6 +3190,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		     complete_capsnap ? " (complete capsnap)" : "");
 	}
 
+unlock:
 	spin_unlock(&ci->i_ceph_lock);
 
 	if (last) {
@@ -3651,6 +3661,43 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 		iput(inode);
 }
 
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+			   bool *wake_ci, bool *wake_mdsc)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	bool ret;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+
+	list_del_init(&capsnap->ci_item);
+	ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
+	if (wake_ci)
+		*wake_ci = ret;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (list_empty(&ci->i_cap_flush_list))
+		list_del_init(&ci->i_flushing_item);
+
+	ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
+	if (wake_mdsc)
+		*wake_mdsc = ret;
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+			 bool *wake_ci, bool *wake_mdsc)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
+	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
+}
+
 /*
  * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
  * throw away our cap_snap.
@@ -3688,23 +3735,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			     capsnap, capsnap->follows);
 		}
 	}
-	if (flushed) {
-		WARN_ON(capsnap->dirty_pages || capsnap->writing);
-		dout(" removing %p cap_snap %p follows %lld\n",
-		     inode, capsnap, follows);
-		list_del(&capsnap->ci_item);
-		wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
-
-		spin_lock(&mdsc->cap_dirty_lock);
-
-		if (list_empty(&ci->i_cap_flush_list))
-			list_del_init(&ci->i_flushing_item);
-
-		wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
-				&capsnap->cap_flush);
-		spin_unlock(&mdsc->cap_dirty_lock);
-	}
+	if (flushed)
+		ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
 	spin_unlock(&ci->i_ceph_lock);
+
 	if (flushed) {
 		ceph_put_snap_context(capsnap->context);
 		ceph_put_cap_snap(capsnap);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ba1036f09127..d010c24456ef 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1604,14 +1604,39 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	return ret;
 }
 
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap_snap *capsnap;
+	int capsnap_release = 0;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+
+	while (!list_empty(&ci->i_cap_snaps)) {
+		capsnap = list_first_entry(&ci->i_cap_snaps,
+					   struct ceph_cap_snap, ci_item);
+		__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+		ceph_put_snap_context(capsnap->context);
+		ceph_put_cap_snap(capsnap);
+		capsnap_release++;
+	}
+	wake_up_all(&ci->i_cap_wq);
+	wake_up_all(&mdsc->cap_flushing_wq);
+	return capsnap_release;
+}
+
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 				  void *arg)
 {
 	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	LIST_HEAD(to_remove);
 	bool dirty_dropped = false;
 	bool invalidate = false;
+	int capsnap_release = 0;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
@@ -1619,7 +1644,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
 		struct ceph_cap_flush *cf;
-		struct ceph_mds_client *mdsc = fsc->mdsc;
 
 		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 			if (inode->i_data.nrpages > 0)
@@ -1683,6 +1707,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
 			ci->i_prealloc_cap_flush = NULL;
 		}
+
+		if (!list_empty(&ci->i_cap_snaps))
+			capsnap_release = remove_capsnaps(mdsc, inode);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	while (!list_empty(&to_remove)) {
@@ -1699,6 +1726,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		ceph_queue_invalidate(inode);
 	if (dirty_dropped)
 		iput(inode);
+	while (capsnap_release--)
+		iput(inode);
 	return 0;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3fad464d7c6d..e1d62e4e8780 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1163,6 +1163,12 @@ extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
 					    int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
+extern void __ceph_remove_capsnap(struct inode *inode,
+				  struct ceph_cap_snap *capsnap,
+				  bool *wake_ci, bool *wake_mdsc);
+extern void ceph_remove_capsnap(struct inode *inode,
+				struct ceph_cap_snap *capsnap,
+				bool *wake_ci, bool *wake_mdsc);
 extern void ceph_flush_snaps(struct ceph_inode_info *ci,
 			     struct ceph_mds_session **psession);
 extern bool __ceph_should_report_size(struct ceph_inode_info *ci);

From 42ad631b4d0e0d6da0bfe375af99887251fbf970 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Wed, 25 Aug 2021 21:45:44 +0800
Subject: [PATCH 16/20] ceph: don't WARN if we're force umounting

A force umount tries to close the sessions by setting the session
state to _CLOSING. We don't want to WARN in this situation, since it's
expected.

Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d010c24456ef..c3a0e549c413 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4541,6 +4541,8 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
 
 bool check_session_state(struct ceph_mds_session *s)
 {
+	struct ceph_fs_client *fsc = s->s_mdsc->fsc;
+
 	switch (s->s_state) {
 	case CEPH_MDS_SESSION_OPEN:
 		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4549,8 +4551,9 @@ bool check_session_state(struct ceph_mds_session *s)
 		}
 		break;
 	case CEPH_MDS_SESSION_CLOSING:
-		/* Should never reach this when we're unmounting */
-		WARN_ON_ONCE(s->s_ttl);
+		/* Should never reach this when not force unmounting */
+		WARN_ON_ONCE(s->s_ttl &&
+			     READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
 		fallthrough;
 	case CEPH_MDS_SESSION_NEW:
 	case CEPH_MDS_SESSION_RESTARTING:

From a76d0a9c288ea7f5fc01bf05485573ce6b36b839 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Wed, 25 Aug 2021 21:45:45 +0800
Subject: [PATCH 17/20] ceph: don't WARN if we're forcibly removing the
 session caps

For example, in the case of a forced umount, we'll remove all the
session caps even if they are dirty.

Move the warning to a wrapper function and make most of the callers
use it. Call the core function when removing caps due to a forced
umount.
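Condensed from the diff below, the split looks like this:

	/* core helper: no sanity check, safe for the forced-umount path */
	void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);

	/* wrapper for the normal paths: keeps the dirty-caps WARN */
	void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
	{
		struct ceph_fs_client *fsc =
			ceph_sb_to_client(cap->ci->vfs_inode.i_sb);

		WARN_ON_ONCE(cap->ci->i_auth_cap == cap &&
			     !list_empty(&cap->ci->i_dirty_item) &&
			     !fsc->blocklisted &&
			     READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
		__ceph_remove_cap(cap, queue_release);
	}

remove_session_caps_cb() (the forced-umount path) keeps calling
__ceph_remove_cap() directly, and so skips the warning.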
Signed-off-by: Xiubo Li
Reviewed-by: Jeff Layton
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c       | 39 ++++++++++++++++++++++++++++++---------
 fs/ceph/mds_client.c |  2 +-
 fs/ceph/super.h      |  1 +
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2e5d2659f48e..f92eeb741160 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1114,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 		return;
 	}
 
+	lockdep_assert_held(&ci->i_ceph_lock);
+
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
 	mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
 
 	/* remove from inode's cap rbtree, and clear auth cap */
 	rb_erase(&cap->ci_node, &ci->i_caps);
-	if (ci->i_auth_cap == cap) {
-		WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
-			     !mdsc->fsc->blocklisted);
+	if (ci->i_auth_cap == cap)
 		ci->i_auth_cap = NULL;
-	}
 
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
@@ -1176,6 +1175,28 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 	}
 }
 
+void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_fs_client *fsc;
+
+	/* 'ci' being NULL means the remove has already occurred */
+	if (!ci) {
+		dout("%s: cap inode is NULL\n", __func__);
+		return;
+	}
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	WARN_ON_ONCE(ci->i_auth_cap == cap &&
+		     !list_empty(&ci->i_dirty_item) &&
+		     !fsc->blocklisted &&
+		     READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
+
+	__ceph_remove_cap(cap, queue_release);
+}
+
 struct cap_msg_args {
 	struct ceph_mds_session	*session;
 	u64			ino, cid, follows;
@@ -1304,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
 		p = rb_next(p);
-		__ceph_remove_cap(cap, true);
+		ceph_remove_cap(cap, true);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 }
@@ -3822,7 +3843,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		goto out_unlock;
 
 	if (target < 0) {
-		__ceph_remove_cap(cap, false);
+		ceph_remove_cap(cap, false);
 		goto out_unlock;
 	}
 
@@ -3857,7 +3878,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 				change_auth_cap_ses(ci, tcap->session);
 			}
 		}
-		__ceph_remove_cap(cap, false);
+		ceph_remove_cap(cap, false);
 		goto out_unlock;
 	} else if (tsession) {
 		/* add placeholder for the export tagert */
@@ -3874,7 +3895,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			spin_unlock(&mdsc->cap_dirty_lock);
 		}
 
-		__ceph_remove_cap(cap, false);
+		ceph_remove_cap(cap, false);
 		goto out_unlock;
 	}
 
@@ -3985,7 +4006,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 			ocap->mseq, mds, le32_to_cpu(ph->seq),
 			le32_to_cpu(ph->mseq));
 		}
-		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+		ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
 	}
 
 	*old_issued = issued;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c3a0e549c413..94e90dba46d7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2016,7 +2016,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
-		__ceph_remove_cap(cap, true);
+		ceph_remove_cap(cap, true);
 		(*remaining)--;
 	} else {
 		struct dentry *dentry;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1d62e4e8780..16c7eb02550c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1138,6 +1138,7 @@ extern void ceph_add_cap(struct inode *inode,
 			 unsigned cap, unsigned seq, u64 realmino, int flags,
 			 struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
 extern void __ceph_remove_caps(struct ceph_inode_info *ci);
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);

From 3eaf5aa1cfa8c97c72f5824e2e9263d6cc977b03 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Thu, 2 Sep 2021 08:31:03 -0400
Subject: [PATCH 18/20] ceph: lockdep annotations for try_nonblocking_invalidate

Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f92eeb741160..26c5029629a4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1846,6 +1846,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
  * try to invalidate mapping pages without blocking.
  */
 static int try_nonblocking_invalidate(struct inode *inode)
+	__releases(ci->i_ceph_lock)
+	__acquires(ci->i_ceph_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u32 invalidating_gen = ci->i_rdcache_gen;

From 9f3589993c0c69ab9f2401a6b65b21c419b482d6 Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Tue, 26 Jan 2021 13:22:31 -0500
Subject: [PATCH 19/20] ceph: drop the mdsc_get_session/put_session dout
 messages

These are very chatty, racy, and not terribly useful. Just remove
them.

Signed-off-by: Jeff Layton
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/mds_client.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 94e90dba46d7..7cad180d6deb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -653,14 +653,9 @@ const char *ceph_session_state_name(int s)
 
 struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
 {
-	if (refcount_inc_not_zero(&s->s_ref)) {
-		dout("mdsc get_session %p %d -> %d\n", s,
-		     refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
+	if (refcount_inc_not_zero(&s->s_ref))
 		return s;
-	} else {
-		dout("mdsc get_session %p 0 -- FAIL\n", s);
-		return NULL;
-	}
+	return NULL;
 }
 
 void ceph_put_mds_session(struct ceph_mds_session *s)
@@ -668,8 +663,6 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	if (IS_ERR_OR_NULL(s))
 		return;
 
-	dout("mdsc put_session %p %d -> %d\n", s,
-	     refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
 	if (refcount_dec_and_test(&s->s_ref)) {
 		if (s->s_auth.authorizer)
 			ceph_auth_destroy_authorizer(s->s_auth.authorizer);

From 05a444d3f90a3c3e6362e88a1bf13e1a60f8cace Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Sun, 29 Aug 2021 19:18:24 +0100
Subject: [PATCH 20/20] ceph: fix dereference of null pointer cf

Currently, in the case where kmem_cache_alloc fails, the null pointer
cf is dereferenced when assigning cf->is_capsnap = false. Fix this by
adding a null pointer check and an early return path.
Cc: stable@vger.kernel.org
Addresses-Coverity: ("Dereference null return")
Fixes: b2f9fa1f3bd8 ("ceph: correctly handle releasing an embedded cap flush")
Signed-off-by: Colin Ian King
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 26c5029629a4..6c0e52fd0743 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1736,6 +1736,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
 	struct ceph_cap_flush *cf;
 
 	cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+	if (!cf)
+		return NULL;
+
 	cf->is_capsnap = false;
 	return cf;
 }