diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f28dd9bacb2..c5d75486823b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d470397e249..36b4a41dfa67 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df183..48806ee4488d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d2..420bb7968b25 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d33..f28ed864e682 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0ff400a56cd6..cff3a7e29233 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, } } +/* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + /* * requests */ @@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +static bool __pool_full(struct ceph_pg_pool_info *pi) +{ + return pi->flags & CEPH_POOL_FLAG_FULL; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. @@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Calculate mapping of a request to a PG. Takes tiering into account. */ @@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, &req->r_target_oloc, pg_out); } +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } + + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } + + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; + } + + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; + need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + static void __enqueue_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) redir.oloc.pool = -1; } - if (redir.oloc.pool != -1) { + if (!ceph_oloc_empty(&redir.oloc)) { dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - req->r_target_oloc = redir.oloc; /* struct */ + ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 66c3ebead92f..7d4a5b43085e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ @@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) */ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent