libceph: respect RADOS_BACKOFF backoffs
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
parent
76f827a7b1
commit
a02a946dfe
|
@ -147,6 +147,7 @@ struct ceph_dir_layout {
|
|||
#define CEPH_MSG_OSD_OP 42
|
||||
#define CEPH_MSG_OSD_OPREPLY 43
|
||||
#define CEPH_MSG_WATCH_NOTIFY 44
|
||||
#define CEPH_MSG_OSD_BACKOFF 61
|
||||
|
||||
|
||||
/* watch-notify operations */
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef _FS_CEPH_OSD_CLIENT_H
|
||||
#define _FS_CEPH_OSD_CLIENT_H
|
||||
|
||||
#include <linux/bitrev.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/mempool.h>
|
||||
|
@ -36,6 +37,8 @@ struct ceph_osd {
|
|||
struct ceph_connection o_con;
|
||||
struct rb_root o_requests;
|
||||
struct rb_root o_linger_requests;
|
||||
struct rb_root o_backoff_mappings;
|
||||
struct rb_root o_backoffs_by_id;
|
||||
struct list_head o_osd_lru;
|
||||
struct ceph_auth_handshake o_auth;
|
||||
unsigned long lru_ttl;
|
||||
|
@ -275,6 +278,48 @@ struct ceph_watch_item {
|
|||
struct ceph_entity_addr addr;
|
||||
};
|
||||
|
||||
struct ceph_spg_mapping {
|
||||
struct rb_node node;
|
||||
struct ceph_spg spgid;
|
||||
|
||||
struct rb_root backoffs;
|
||||
};
|
||||
|
||||
struct ceph_hobject_id {
|
||||
void *key;
|
||||
size_t key_len;
|
||||
void *oid;
|
||||
size_t oid_len;
|
||||
u64 snapid;
|
||||
u32 hash;
|
||||
u8 is_max;
|
||||
void *nspace;
|
||||
size_t nspace_len;
|
||||
s64 pool;
|
||||
|
||||
/* cache */
|
||||
u32 hash_reverse_bits;
|
||||
};
|
||||
|
||||
static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
|
||||
{
|
||||
hoid->hash_reverse_bits = bitrev32(hoid->hash);
|
||||
}
|
||||
|
||||
/*
|
||||
* PG-wide backoff: [begin, end)
|
||||
* per-object backoff: begin == end
|
||||
*/
|
||||
struct ceph_osd_backoff {
|
||||
struct rb_node spg_node;
|
||||
struct rb_node id_node;
|
||||
|
||||
struct ceph_spg spgid;
|
||||
u64 id;
|
||||
struct ceph_hobject_id *begin;
|
||||
struct ceph_hobject_id *end;
|
||||
};
|
||||
|
||||
#define CEPH_LINGER_ID_START 0xffff000000000000ULL
|
||||
|
||||
struct ceph_osd_client {
|
||||
|
|
|
@ -32,6 +32,7 @@ struct ceph_spg {
|
|||
};
|
||||
|
||||
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
|
||||
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
|
||||
|
||||
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
|
||||
together */
|
||||
|
|
|
@ -439,6 +439,12 @@ enum {
|
|||
|
||||
const char *ceph_osd_watch_op_name(int o);
|
||||
|
||||
enum {
|
||||
CEPH_OSD_BACKOFF_OP_BLOCK = 1,
|
||||
CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
|
||||
CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
* an individual object operation. each may be accompanied by some data
|
||||
* payload
|
||||
|
|
|
@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
|
|||
case CEPH_MSG_OSD_OP: return "osd_op";
|
||||
case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
|
||||
case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
|
||||
case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -243,6 +243,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
|
|||
mutex_unlock(&osd->lock);
|
||||
}
|
||||
|
||||
static void dump_snapid(struct seq_file *s, u64 snapid)
|
||||
{
|
||||
if (snapid == CEPH_NOSNAP)
|
||||
seq_puts(s, "head");
|
||||
else if (snapid == CEPH_SNAPDIR)
|
||||
seq_puts(s, "snapdir");
|
||||
else
|
||||
seq_printf(s, "%llx", snapid);
|
||||
}
|
||||
|
||||
static void dump_name_escaped(struct seq_file *s, unsigned char *name,
|
||||
size_t len)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
if (name[i] == '%' || name[i] == ':' || name[i] == '/' ||
|
||||
name[i] < 32 || name[i] >= 127) {
|
||||
seq_printf(s, "%%%02x", name[i]);
|
||||
} else {
|
||||
seq_putc(s, name[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
|
||||
{
|
||||
if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max &&
|
||||
hoid->pool == S64_MIN) {
|
||||
seq_puts(s, "MIN");
|
||||
return;
|
||||
}
|
||||
if (hoid->is_max) {
|
||||
seq_puts(s, "MAX");
|
||||
return;
|
||||
}
|
||||
seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);
|
||||
dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
|
||||
seq_putc(s, ':');
|
||||
dump_name_escaped(s, hoid->key, hoid->key_len);
|
||||
seq_putc(s, ':');
|
||||
dump_name_escaped(s, hoid->oid, hoid->oid_len);
|
||||
seq_putc(s, ':');
|
||||
dump_snapid(s, hoid->snapid);
|
||||
}
|
||||
|
||||
static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
|
||||
{
|
||||
struct rb_node *n;
|
||||
|
||||
mutex_lock(&osd->lock);
|
||||
for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) {
|
||||
struct ceph_osd_backoff *backoff =
|
||||
rb_entry(n, struct ceph_osd_backoff, id_node);
|
||||
|
||||
seq_printf(s, "osd%d\t", osd->o_osd);
|
||||
dump_spgid(s, &backoff->spgid);
|
||||
seq_printf(s, "\t%llu\t", backoff->id);
|
||||
dump_hoid(s, backoff->begin);
|
||||
seq_putc(s, '\t');
|
||||
dump_hoid(s, backoff->end);
|
||||
seq_putc(s, '\n');
|
||||
}
|
||||
|
||||
mutex_unlock(&osd->lock);
|
||||
}
|
||||
|
||||
static int osdc_show(struct seq_file *s, void *pp)
|
||||
{
|
||||
struct ceph_client *client = s->private;
|
||||
|
@ -268,6 +335,13 @@ static int osdc_show(struct seq_file *s, void *pp)
|
|||
}
|
||||
dump_linger_requests(s, &osdc->homeless_osd);
|
||||
|
||||
seq_puts(s, "BACKOFFS\n");
|
||||
for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
|
||||
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
|
||||
|
||||
dump_backoffs(s, osd);
|
||||
}
|
||||
|
||||
up_read(&osdc->lock);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
|
|||
struct ceph_osd_linger_request *lreq);
|
||||
static void unlink_linger(struct ceph_osd *osd,
|
||||
struct ceph_osd_linger_request *lreq);
|
||||
static void clear_backoffs(struct ceph_osd *osd);
|
||||
|
||||
#if 1
|
||||
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
|
||||
|
@ -1019,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
|
|||
RB_CLEAR_NODE(&osd->o_node);
|
||||
osd->o_requests = RB_ROOT;
|
||||
osd->o_linger_requests = RB_ROOT;
|
||||
osd->o_backoff_mappings = RB_ROOT;
|
||||
osd->o_backoffs_by_id = RB_ROOT;
|
||||
INIT_LIST_HEAD(&osd->o_osd_lru);
|
||||
INIT_LIST_HEAD(&osd->o_keepalive_item);
|
||||
osd->o_incarnation = 1;
|
||||
|
@ -1030,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
|
|||
WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
|
||||
WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
|
||||
WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
|
||||
WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
|
||||
WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
|
||||
WARN_ON(!list_empty(&osd->o_osd_lru));
|
||||
WARN_ON(!list_empty(&osd->o_keepalive_item));
|
||||
|
||||
|
@ -1150,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
|
|||
unlink_linger(osd, lreq);
|
||||
link_linger(&osdc->homeless_osd, lreq);
|
||||
}
|
||||
clear_backoffs(osd);
|
||||
|
||||
__remove_osd_from_lru(osd);
|
||||
erase_osd(&osdc->osds, osd);
|
||||
|
@ -1431,6 +1437,328 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
|
|||
return ct_res;
|
||||
}
|
||||
|
||||
static struct ceph_spg_mapping *alloc_spg_mapping(void)
|
||||
{
|
||||
struct ceph_spg_mapping *spg;
|
||||
|
||||
spg = kmalloc(sizeof(*spg), GFP_NOIO);
|
||||
if (!spg)
|
||||
return NULL;
|
||||
|
||||
RB_CLEAR_NODE(&spg->node);
|
||||
spg->backoffs = RB_ROOT;
|
||||
return spg;
|
||||
}
|
||||
|
||||
static void free_spg_mapping(struct ceph_spg_mapping *spg)
|
||||
{
|
||||
WARN_ON(!RB_EMPTY_NODE(&spg->node));
|
||||
WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
|
||||
|
||||
kfree(spg);
|
||||
}
|
||||
|
||||
/*
|
||||
* rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
|
||||
* ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
|
||||
* defined only within a specific spgid; it does not pass anything to
|
||||
* children on split, or to another primary.
|
||||
*/
|
||||
DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
|
||||
RB_BYPTR, const struct ceph_spg *, node)
|
||||
|
||||
static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
|
||||
{
|
||||
return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
|
||||
}
|
||||
|
||||
static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
|
||||
void **pkey, size_t *pkey_len)
|
||||
{
|
||||
if (hoid->key_len) {
|
||||
*pkey = hoid->key;
|
||||
*pkey_len = hoid->key_len;
|
||||
} else {
|
||||
*pkey = hoid->oid;
|
||||
*pkey_len = hoid->oid_len;
|
||||
}
|
||||
}
|
||||
|
||||
static int compare_names(const void *name1, size_t name1_len,
|
||||
const void *name2, size_t name2_len)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = memcmp(name1, name2, min(name1_len, name2_len));
|
||||
if (!ret) {
|
||||
if (name1_len < name2_len)
|
||||
ret = -1;
|
||||
else if (name1_len > name2_len)
|
||||
ret = 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hoid_compare(const struct ceph_hobject_id *lhs,
|
||||
const struct ceph_hobject_id *rhs)
|
||||
{
|
||||
void *effective_key1, *effective_key2;
|
||||
size_t effective_key1_len, effective_key2_len;
|
||||
int ret;
|
||||
|
||||
if (lhs->is_max < rhs->is_max)
|
||||
return -1;
|
||||
if (lhs->is_max > rhs->is_max)
|
||||
return 1;
|
||||
|
||||
if (lhs->pool < rhs->pool)
|
||||
return -1;
|
||||
if (lhs->pool > rhs->pool)
|
||||
return 1;
|
||||
|
||||
if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
|
||||
return -1;
|
||||
if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
|
||||
return 1;
|
||||
|
||||
ret = compare_names(lhs->nspace, lhs->nspace_len,
|
||||
rhs->nspace, rhs->nspace_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
|
||||
hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
|
||||
ret = compare_names(effective_key1, effective_key1_len,
|
||||
effective_key2, effective_key2_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (lhs->snapid < rhs->snapid)
|
||||
return -1;
|
||||
if (lhs->snapid > rhs->snapid)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
|
||||
* compat stuff here.
|
||||
*
|
||||
* Assumes @hoid is zero-initialized.
|
||||
*/
|
||||
static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
|
||||
{
|
||||
u8 struct_v;
|
||||
u32 struct_len;
|
||||
int ret;
|
||||
|
||||
ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
|
||||
&struct_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (struct_v < 4) {
|
||||
pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
|
||||
goto e_inval;
|
||||
}
|
||||
|
||||
hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
|
||||
GFP_NOIO);
|
||||
if (IS_ERR(hoid->key)) {
|
||||
ret = PTR_ERR(hoid->key);
|
||||
hoid->key = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
|
||||
GFP_NOIO);
|
||||
if (IS_ERR(hoid->oid)) {
|
||||
ret = PTR_ERR(hoid->oid);
|
||||
hoid->oid = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
|
||||
ceph_decode_32_safe(p, end, hoid->hash, e_inval);
|
||||
ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
|
||||
|
||||
hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
|
||||
GFP_NOIO);
|
||||
if (IS_ERR(hoid->nspace)) {
|
||||
ret = PTR_ERR(hoid->nspace);
|
||||
hoid->nspace = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
ceph_decode_64_safe(p, end, hoid->pool, e_inval);
|
||||
|
||||
ceph_hoid_build_hash_cache(hoid);
|
||||
return 0;
|
||||
|
||||
e_inval:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
|
||||
{
|
||||
return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
|
||||
4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
|
||||
}
|
||||
|
||||
static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
|
||||
{
|
||||
ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
|
||||
ceph_encode_string(p, end, hoid->key, hoid->key_len);
|
||||
ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
|
||||
ceph_encode_64(p, hoid->snapid);
|
||||
ceph_encode_32(p, hoid->hash);
|
||||
ceph_encode_8(p, hoid->is_max);
|
||||
ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
|
||||
ceph_encode_64(p, hoid->pool);
|
||||
}
|
||||
|
||||
static void free_hoid(struct ceph_hobject_id *hoid)
|
||||
{
|
||||
if (hoid) {
|
||||
kfree(hoid->key);
|
||||
kfree(hoid->oid);
|
||||
kfree(hoid->nspace);
|
||||
kfree(hoid);
|
||||
}
|
||||
}
|
||||
|
||||
static struct ceph_osd_backoff *alloc_backoff(void)
|
||||
{
|
||||
struct ceph_osd_backoff *backoff;
|
||||
|
||||
backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
|
||||
if (!backoff)
|
||||
return NULL;
|
||||
|
||||
RB_CLEAR_NODE(&backoff->spg_node);
|
||||
RB_CLEAR_NODE(&backoff->id_node);
|
||||
return backoff;
|
||||
}
|
||||
|
||||
static void free_backoff(struct ceph_osd_backoff *backoff)
|
||||
{
|
||||
WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
|
||||
WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
|
||||
|
||||
free_hoid(backoff->begin);
|
||||
free_hoid(backoff->end);
|
||||
kfree(backoff);
|
||||
}
|
||||
|
||||
/*
|
||||
* Within a specific spgid, backoffs are managed by ->begin hoid.
|
||||
*/
|
||||
DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
|
||||
RB_BYVAL, spg_node);
|
||||
|
||||
static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
|
||||
const struct ceph_hobject_id *hoid)
|
||||
{
|
||||
struct rb_node *n = root->rb_node;
|
||||
|
||||
while (n) {
|
||||
struct ceph_osd_backoff *cur =
|
||||
rb_entry(n, struct ceph_osd_backoff, spg_node);
|
||||
int cmp;
|
||||
|
||||
cmp = hoid_compare(hoid, cur->begin);
|
||||
if (cmp < 0) {
|
||||
n = n->rb_left;
|
||||
} else if (cmp > 0) {
|
||||
if (hoid_compare(hoid, cur->end) < 0)
|
||||
return cur;
|
||||
|
||||
n = n->rb_right;
|
||||
} else {
|
||||
return cur;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Each backoff has a unique id within its OSD session.
|
||||
*/
|
||||
DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
|
||||
|
||||
static void clear_backoffs(struct ceph_osd *osd)
|
||||
{
|
||||
while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
|
||||
struct ceph_spg_mapping *spg =
|
||||
rb_entry(rb_first(&osd->o_backoff_mappings),
|
||||
struct ceph_spg_mapping, node);
|
||||
|
||||
while (!RB_EMPTY_ROOT(&spg->backoffs)) {
|
||||
struct ceph_osd_backoff *backoff =
|
||||
rb_entry(rb_first(&spg->backoffs),
|
||||
struct ceph_osd_backoff, spg_node);
|
||||
|
||||
erase_backoff(&spg->backoffs, backoff);
|
||||
erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
|
||||
free_backoff(backoff);
|
||||
}
|
||||
erase_spg_mapping(&osd->o_backoff_mappings, spg);
|
||||
free_spg_mapping(spg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up a temporary, non-owning view into @t.
|
||||
*/
|
||||
static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
|
||||
const struct ceph_osd_request_target *t)
|
||||
{
|
||||
hoid->key = NULL;
|
||||
hoid->key_len = 0;
|
||||
hoid->oid = t->target_oid.name;
|
||||
hoid->oid_len = t->target_oid.name_len;
|
||||
hoid->snapid = CEPH_NOSNAP;
|
||||
hoid->hash = t->pgid.seed;
|
||||
hoid->is_max = false;
|
||||
if (t->target_oloc.pool_ns) {
|
||||
hoid->nspace = t->target_oloc.pool_ns->str;
|
||||
hoid->nspace_len = t->target_oloc.pool_ns->len;
|
||||
} else {
|
||||
hoid->nspace = NULL;
|
||||
hoid->nspace_len = 0;
|
||||
}
|
||||
hoid->pool = t->target_oloc.pool;
|
||||
ceph_hoid_build_hash_cache(hoid);
|
||||
}
|
||||
|
||||
static bool should_plug_request(struct ceph_osd_request *req)
|
||||
{
|
||||
struct ceph_osd *osd = req->r_osd;
|
||||
struct ceph_spg_mapping *spg;
|
||||
struct ceph_osd_backoff *backoff;
|
||||
struct ceph_hobject_id hoid;
|
||||
|
||||
spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
|
||||
if (!spg)
|
||||
return false;
|
||||
|
||||
hoid_fill_from_target(&hoid, &req->r_t);
|
||||
backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
|
||||
if (!backoff)
|
||||
return false;
|
||||
|
||||
dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
|
||||
__func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
|
||||
backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void setup_request_data(struct ceph_osd_request *req,
|
||||
struct ceph_msg *msg)
|
||||
{
|
||||
|
@ -1707,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req)
|
|||
verify_osd_locked(osd);
|
||||
WARN_ON(osd->o_osd != req->r_t.osd);
|
||||
|
||||
/* backoff? */
|
||||
if (should_plug_request(req))
|
||||
return;
|
||||
|
||||
/*
|
||||
* We may have a previously queued request message hanging
|
||||
* around. Cancel it to avoid corrupting the msgr.
|
||||
|
@ -3527,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd)
|
|||
{
|
||||
struct rb_node *n;
|
||||
|
||||
clear_backoffs(osd);
|
||||
|
||||
for (n = rb_first(&osd->o_requests); n; ) {
|
||||
struct ceph_osd_request *req =
|
||||
rb_entry(n, struct ceph_osd_request, r_node);
|
||||
|
@ -3572,6 +3906,261 @@ static void osd_fault(struct ceph_connection *con)
|
|||
up_write(&osdc->lock);
|
||||
}
|
||||
|
||||
struct MOSDBackoff {
|
||||
struct ceph_spg spgid;
|
||||
u32 map_epoch;
|
||||
u8 op;
|
||||
u64 id;
|
||||
struct ceph_hobject_id *begin;
|
||||
struct ceph_hobject_id *end;
|
||||
};
|
||||
|
||||
static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
|
||||
{
|
||||
void *p = msg->front.iov_base;
|
||||
void *const end = p + msg->front.iov_len;
|
||||
u8 struct_v;
|
||||
u32 struct_len;
|
||||
int ret;
|
||||
|
||||
ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
|
||||
ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
|
||||
ceph_decode_8_safe(&p, end, m->op, e_inval);
|
||||
ceph_decode_64_safe(&p, end, m->id, e_inval);
|
||||
|
||||
m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
|
||||
if (!m->begin)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = decode_hoid(&p, end, m->begin);
|
||||
if (ret) {
|
||||
free_hoid(m->begin);
|
||||
return ret;
|
||||
}
|
||||
|
||||
m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
|
||||
if (!m->end) {
|
||||
free_hoid(m->begin);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
ret = decode_hoid(&p, end, m->end);
|
||||
if (ret) {
|
||||
free_hoid(m->begin);
|
||||
free_hoid(m->end);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
e_inval:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static struct ceph_msg *create_backoff_message(
|
||||
const struct ceph_osd_backoff *backoff,
|
||||
u32 map_epoch)
|
||||
{
|
||||
struct ceph_msg *msg;
|
||||
void *p, *end;
|
||||
int msg_size;
|
||||
|
||||
msg_size = CEPH_ENCODING_START_BLK_LEN +
|
||||
CEPH_PGID_ENCODING_LEN + 1; /* spgid */
|
||||
msg_size += 4 + 1 + 8; /* map_epoch, op, id */
|
||||
msg_size += CEPH_ENCODING_START_BLK_LEN +
|
||||
hoid_encoding_size(backoff->begin);
|
||||
msg_size += CEPH_ENCODING_START_BLK_LEN +
|
||||
hoid_encoding_size(backoff->end);
|
||||
|
||||
msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
|
||||
if (!msg)
|
||||
return NULL;
|
||||
|
||||
p = msg->front.iov_base;
|
||||
end = p + msg->front_alloc_len;
|
||||
|
||||
encode_spgid(&p, &backoff->spgid);
|
||||
ceph_encode_32(&p, map_epoch);
|
||||
ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
|
||||
ceph_encode_64(&p, backoff->id);
|
||||
encode_hoid(&p, end, backoff->begin);
|
||||
encode_hoid(&p, end, backoff->end);
|
||||
BUG_ON(p != end);
|
||||
|
||||
msg->front.iov_len = p - msg->front.iov_base;
|
||||
msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
|
||||
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
|
||||
|
||||
return msg;
|
||||
}
|
||||
|
||||
static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
|
||||
{
|
||||
struct ceph_spg_mapping *spg;
|
||||
struct ceph_osd_backoff *backoff;
|
||||
struct ceph_msg *msg;
|
||||
|
||||
dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
|
||||
m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
|
||||
|
||||
spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
|
||||
if (!spg) {
|
||||
spg = alloc_spg_mapping();
|
||||
if (!spg) {
|
||||
pr_err("%s failed to allocate spg\n", __func__);
|
||||
return;
|
||||
}
|
||||
spg->spgid = m->spgid; /* struct */
|
||||
insert_spg_mapping(&osd->o_backoff_mappings, spg);
|
||||
}
|
||||
|
||||
backoff = alloc_backoff();
|
||||
if (!backoff) {
|
||||
pr_err("%s failed to allocate backoff\n", __func__);
|
||||
return;
|
||||
}
|
||||
backoff->spgid = m->spgid; /* struct */
|
||||
backoff->id = m->id;
|
||||
backoff->begin = m->begin;
|
||||
m->begin = NULL; /* backoff now owns this */
|
||||
backoff->end = m->end;
|
||||
m->end = NULL; /* ditto */
|
||||
|
||||
insert_backoff(&spg->backoffs, backoff);
|
||||
insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
|
||||
|
||||
/*
|
||||
* Ack with original backoff's epoch so that the OSD can
|
||||
* discard this if there was a PG split.
|
||||
*/
|
||||
msg = create_backoff_message(backoff, m->map_epoch);
|
||||
if (!msg) {
|
||||
pr_err("%s failed to allocate msg\n", __func__);
|
||||
return;
|
||||
}
|
||||
ceph_con_send(&osd->o_con, msg);
|
||||
}
|
||||
|
||||
static bool target_contained_by(const struct ceph_osd_request_target *t,
|
||||
const struct ceph_hobject_id *begin,
|
||||
const struct ceph_hobject_id *end)
|
||||
{
|
||||
struct ceph_hobject_id hoid;
|
||||
int cmp;
|
||||
|
||||
hoid_fill_from_target(&hoid, t);
|
||||
cmp = hoid_compare(&hoid, begin);
|
||||
return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0);
|
||||
}
|
||||
|
||||
static void handle_backoff_unblock(struct ceph_osd *osd,
|
||||
const struct MOSDBackoff *m)
|
||||
{
|
||||
struct ceph_spg_mapping *spg;
|
||||
struct ceph_osd_backoff *backoff;
|
||||
struct rb_node *n;
|
||||
|
||||
dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
|
||||
m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
|
||||
|
||||
backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
|
||||
if (!backoff) {
|
||||
pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
|
||||
__func__, osd->o_osd, m->spgid.pgid.pool,
|
||||
m->spgid.pgid.seed, m->spgid.shard, m->id);
|
||||
return;
|
||||
}
|
||||
|
||||
if (hoid_compare(backoff->begin, m->begin) &&
|
||||
hoid_compare(backoff->end, m->end)) {
|
||||
pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
|
||||
__func__, osd->o_osd, m->spgid.pgid.pool,
|
||||
m->spgid.pgid.seed, m->spgid.shard, m->id);
|
||||
/* unblock it anyway... */
|
||||
}
|
||||
|
||||
spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
|
||||
BUG_ON(!spg);
|
||||
|
||||
erase_backoff(&spg->backoffs, backoff);
|
||||
erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
|
||||
free_backoff(backoff);
|
||||
|
||||
if (RB_EMPTY_ROOT(&spg->backoffs)) {
|
||||
erase_spg_mapping(&osd->o_backoff_mappings, spg);
|
||||
free_spg_mapping(spg);
|
||||
}
|
||||
|
||||
for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
|
||||
struct ceph_osd_request *req =
|
||||
rb_entry(n, struct ceph_osd_request, r_node);
|
||||
|
||||
if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
|
||||
/*
|
||||
* Match against @m, not @backoff -- the PG may
|
||||
* have split on the OSD.
|
||||
*/
|
||||
if (target_contained_by(&req->r_t, m->begin, m->end)) {
|
||||
/*
|
||||
* If no other installed backoff applies,
|
||||
* resend.
|
||||
*/
|
||||
send_request(req);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
|
||||
{
|
||||
struct ceph_osd_client *osdc = osd->o_osdc;
|
||||
struct MOSDBackoff m;
|
||||
int ret;
|
||||
|
||||
down_read(&osdc->lock);
|
||||
if (!osd_registered(osd)) {
|
||||
dout("%s osd%d unknown\n", __func__, osd->o_osd);
|
||||
up_read(&osdc->lock);
|
||||
return;
|
||||
}
|
||||
WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
|
||||
|
||||
mutex_lock(&osd->lock);
|
||||
ret = decode_MOSDBackoff(msg, &m);
|
||||
if (ret) {
|
||||
pr_err("failed to decode MOSDBackoff: %d\n", ret);
|
||||
ceph_msg_dump(msg);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
switch (m.op) {
|
||||
case CEPH_OSD_BACKOFF_OP_BLOCK:
|
||||
handle_backoff_block(osd, &m);
|
||||
break;
|
||||
case CEPH_OSD_BACKOFF_OP_UNBLOCK:
|
||||
handle_backoff_unblock(osd, &m);
|
||||
break;
|
||||
default:
|
||||
pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
|
||||
}
|
||||
|
||||
free_hoid(m.begin);
|
||||
free_hoid(m.end);
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&osd->lock);
|
||||
up_read(&osdc->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Process osd watch notifications
|
||||
*/
|
||||
|
@ -4509,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
|
|||
case CEPH_MSG_OSD_OPREPLY:
|
||||
handle_reply(osd, msg);
|
||||
break;
|
||||
case CEPH_MSG_OSD_BACKOFF:
|
||||
handle_backoff(osd, msg);
|
||||
break;
|
||||
case CEPH_MSG_WATCH_NOTIFY:
|
||||
handle_watch_notify(osdc, msg);
|
||||
break;
|
||||
|
@ -4631,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
|
|||
*skip = 0;
|
||||
switch (type) {
|
||||
case CEPH_MSG_OSD_MAP:
|
||||
case CEPH_MSG_OSD_BACKOFF:
|
||||
case CEPH_MSG_WATCH_NOTIFY:
|
||||
return alloc_msg_with_page_vector(hdr);
|
||||
case CEPH_MSG_OSD_OPREPLY:
|
||||
|
|
|
@ -418,6 +418,22 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (lhs->shard < rhs->shard)
|
||||
return -1;
|
||||
if (lhs->shard > rhs->shard)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
|
||||
* to a set of osds) and primary_temp (explicit primary setting)
|
||||
|
|
Loading…
Reference in New Issue