libceph, rbd: ceph_osd_linger_request, watch/notify v2

This adds support and switches rbd to a new, more reliable version of
watch/notify protocol.  As with the OSD client update, this is mostly
about getting the right structures linked into the right places so that
reconnects are properly sent when needed.  watch/notify v2 also
requires sending regular pings to the OSDs - send_linger_ping().

A major change from the old watch/notify implementation is the
introduction of ceph_osd_linger_request - linger requests no longer
piggy back on ceph_osd_request.  ceph_osd_event has been merged into
ceph_osd_linger_request.

All the details are now hidden within libceph, the interface consists
of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack().
ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep
the lifetime management simple.

ceph_osdc_notify_ack() accepts an optional data payload, which is
relayed back to the notifier.

Portions of this patch are loosely based on work by Douglas Fuller
<dfuller@redhat.com> and Mike Christie <michaelc@cs.wisc.edu>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Ilya Dryomov 2016-05-26 01:15:02 +02:00
parent c525f03601
commit 922dab6134
7 changed files with 1068 additions and 432 deletions

View File

@ -351,11 +351,11 @@ struct rbd_device {
struct rbd_options *opts;
struct ceph_object_id header_oid;
struct ceph_object_locator header_oloc;
struct ceph_file_layout layout;
struct ceph_osd_event *watch_event;
struct rbd_obj_request *watch_request;
struct ceph_osd_linger_request *watch_handle;
struct rbd_spec *parent_spec;
u64 parent_overlap;
@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
return __rbd_obj_request_wait(obj_request, 0);
}
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
unsigned long timeout)
{
return __rbd_obj_request_wait(obj_request, timeout);
}
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
complete_all(&obj_request->completion);
}
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p\n", __func__, obj_request);
obj_request_done_set(obj_request);
}
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
struct rbd_img_request *img_request = NULL;
@ -1877,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
case CEPH_OSD_OP_CALL:
rbd_osd_call_callback(obj_request);
break;
case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH:
rbd_osd_trivial_callback(obj_request);
break;
default:
rbd_warn(NULL, "%s: unsupported op %hu",
obj_request->object_name, (unsigned short) opcode);
@ -3100,45 +3084,18 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
obj_request_done_set(obj_request);
}
static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len)
{
struct rbd_obj_request *obj_request;
struct rbd_device *rbd_dev = arg;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
int ret;
obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
OBJ_REQUEST_NODATA);
if (!obj_request)
return -ENOMEM;
ret = -ENOMEM;
obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
obj_request);
if (!obj_request->osd_req)
goto out;
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
notify_id, 0, 0);
rbd_osd_req_format_read(obj_request);
ret = rbd_obj_request_submit(osdc, obj_request);
if (ret)
goto out;
ret = rbd_obj_request_wait(obj_request);
out:
rbd_obj_request_put(obj_request);
return ret;
}
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
struct rbd_device *rbd_dev = (struct rbd_device *)data;
int ret;
dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
rbd_dev->header_oid.name, (unsigned long long)notify_id,
(unsigned int)opcode);
dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
cookie, notify_id);
/*
* Until adequate refresh error handling is in place, there is
@ -3150,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
if (ret)
rbd_warn(rbd_dev, "refresh failed: %d", ret);
ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, notify_id, cookie,
NULL, 0);
if (ret)
rbd_warn(rbd_dev, "notify_ack ret %d", ret);
}
/*
* Send a (un)watch request and wait for the ack. Return a request
* with a ref held on success or error.
*/
static struct rbd_obj_request *rbd_obj_watch_request_helper(
struct rbd_device *rbd_dev,
bool watch)
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_options *opts = osdc->client->options;
struct rbd_obj_request *obj_request;
struct rbd_device *rbd_dev = arg;
int ret;
obj_request = rbd_obj_request_create(rbd_dev->header_oid.name, 0, 0,
OBJ_REQUEST_NODATA);
if (!obj_request)
return ERR_PTR(-ENOMEM);
rbd_warn(rbd_dev, "encountered watch error: %d", err);
obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
obj_request);
if (!obj_request->osd_req) {
ret = -ENOMEM;
goto out;
}
__rbd_dev_header_unwatch_sync(rbd_dev);
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
rbd_dev->watch_event->cookie, 0, watch);
rbd_osd_req_format_write(obj_request);
if (watch)
ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
ret = rbd_obj_request_submit(osdc, obj_request);
if (ret)
goto out;
ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
if (ret)
goto out;
ret = obj_request->result;
ret = rbd_dev_header_watch_sync(rbd_dev);
if (ret) {
if (watch)
rbd_obj_request_end(obj_request);
goto out;
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
return;
}
return obj_request;
out:
rbd_obj_request_put(obj_request);
return ERR_PTR(ret);
ret = rbd_dev_refresh(rbd_dev);
if (ret)
rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
}
/*
@ -3215,57 +3140,33 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct rbd_obj_request *obj_request;
int ret;
struct ceph_osd_linger_request *handle;
rbd_assert(!rbd_dev->watch_event);
rbd_assert(!rbd_dev->watch_request);
rbd_assert(!rbd_dev->watch_handle);
ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
&rbd_dev->watch_event);
if (ret < 0)
return ret;
obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
if (IS_ERR(obj_request)) {
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
return PTR_ERR(obj_request);
}
/*
* A watch request is set to linger, so the underlying osd
* request won't go away until we unregister it. We retain
* a pointer to the object request during that time (in
* rbd_dev->watch_request), so we'll keep a reference to it.
* We'll drop that reference after we've unregistered it in
* rbd_dev_header_unwatch_sync().
*/
rbd_dev->watch_request = obj_request;
handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
&rbd_dev->header_oloc, rbd_watch_cb,
rbd_watch_errcb, rbd_dev);
if (IS_ERR(handle))
return PTR_ERR(handle);
rbd_dev->watch_handle = handle;
return 0;
}
static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{
struct rbd_obj_request *obj_request;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
int ret;
rbd_assert(rbd_dev->watch_event);
rbd_assert(rbd_dev->watch_request);
if (!rbd_dev->watch_handle)
return;
rbd_obj_request_end(rbd_dev->watch_request);
rbd_obj_request_put(rbd_dev->watch_request);
rbd_dev->watch_request = NULL;
ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
if (ret)
rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
if (!IS_ERR(obj_request))
rbd_obj_request_put(obj_request);
else
rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
PTR_ERR(obj_request));
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
rbd_dev->watch_handle = NULL;
}
/*
@ -4081,6 +3982,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
init_rwsem(&rbd_dev->header_rwsem);
ceph_oid_init(&rbd_dev->header_oid);
ceph_oloc_init(&rbd_dev->header_oloc);
rbd_dev->dev.bus = &rbd_bus_type;
rbd_dev->dev.type = &rbd_device_type;
@ -5285,6 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
if (rbd_dev->image_format == 1)
ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
spec->image_name, RBD_SUFFIX);

View File

@ -153,8 +153,9 @@ struct ceph_dir_layout {
/* watch-notify operations */
enum {
WATCH_NOTIFY = 1, /* notifying watcher */
WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
};

View File

@ -34,7 +34,7 @@ struct ceph_osd {
struct rb_node o_node;
struct ceph_connection o_con;
struct rb_root o_requests;
struct list_head o_linger_requests;
struct rb_root o_linger_requests;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
@ -108,11 +108,12 @@ struct ceph_osd_req_op {
} cls;
struct {
u64 cookie;
u64 ver;
u32 prot_ver;
u32 timeout;
__u8 flag;
__u8 op; /* CEPH_OSD_WATCH_OP_ */
u32 gen;
} watch;
struct {
struct ceph_osd_data request_data;
} notify_ack;
struct {
u64 expected_object_size;
u64 expected_write_size;
@ -145,8 +146,6 @@ struct ceph_osd_request_target {
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
struct list_head r_linger_item;
struct list_head r_linger_osd_item;
struct ceph_osd *r_osd;
struct ceph_osd_request_target r_t;
@ -162,7 +161,6 @@ struct ceph_osd_request {
int r_result;
bool r_got_reply;
int r_linger;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
@ -181,6 +179,7 @@ struct ceph_osd_request {
struct ceph_snap_context *r_snapc; /* for writes */
struct timespec r_mtime; /* ditto */
u64 r_data_offset; /* ditto */
bool r_linger; /* don't resend on failure */
/* internal */
unsigned long r_stamp; /* jiffies, send or check time */
@ -195,23 +194,40 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc;
};
struct ceph_osd_event {
u64 cookie;
int one_shot;
struct ceph_osd_client *osdc;
void (*cb)(u64, u64, u8, void *);
void *data;
struct rb_node node;
struct list_head osd_node;
struct kref kref;
};
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
struct ceph_osd_event_work {
struct work_struct work;
struct ceph_osd_event *event;
u64 ver;
u64 notify_id;
u8 opcode;
struct ceph_osd_linger_request {
struct ceph_osd_client *osdc;
u64 linger_id;
bool committed;
struct ceph_osd *osd;
struct ceph_osd_request *reg_req;
struct ceph_osd_request *ping_req;
unsigned long ping_sent;
struct ceph_osd_request_target t;
u32 last_force_resend;
struct timespec mtime;
struct kref kref;
struct mutex lock;
struct rb_node node; /* osd */
struct rb_node osdc_node; /* osdc */
struct list_head scan_item;
struct completion reg_commit_wait;
int reg_commit_error;
int last_error;
u32 register_gen;
rados_watchcb2_t wcb;
rados_watcherrcb_t errcb;
void *data;
};
struct ceph_osd_client {
@ -223,9 +239,10 @@ struct ceph_osd_client {
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
spinlock_t osd_lru_lock;
struct list_head req_linger; /* lingering requests */
struct ceph_osd homeless_osd;
atomic64_t last_tid; /* tid of last request */
u64 last_linger_id;
struct rb_root linger_requests; /* lingering requests */
atomic_t num_requests;
atomic_t num_homeless;
struct delayed_work timeout_work;
@ -239,10 +256,6 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
spinlock_t event_lock;
struct rb_root event_tree;
u64 event_count;
struct workqueue_struct *notify_wq;
};
@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode);
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode,
u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
u32 truncate_seq, u64 truncate_size,
bool use_mempool);
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct timespec *mtime,
struct page **pages, int nr_pages);
/* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
void (*event_cb)(u64, u64, u8, void *),
void *data, struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
extern void ceph_osdc_put_event(struct ceph_osd_event *event);
/* watch/notify */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
rados_watchcb2_t wcb,
rados_watcherrcb_t errcb,
void *data);
int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
struct ceph_osd_linger_request *lreq);
int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
u64 notify_id,
u64 cookie,
void *payload,
size_t payload_len);
#endif

View File

@ -427,7 +427,17 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
#define RADOS_NOTIFY_VER 1
enum {
CEPH_OSD_WATCH_OP_UNWATCH = 0,
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
/* note: use only ODD ids to prevent pre-giant code from
interpreting the op as UNWATCH */
CEPH_OSD_WATCH_OP_WATCH = 3,
CEPH_OSD_WATCH_OP_RECONNECT = 5,
CEPH_OSD_WATCH_OP_PING = 7,
};
const char *ceph_osd_watch_op_name(int o);
/*
* an individual object operation. each may be accompanied by some data
@ -462,8 +472,9 @@ struct ceph_osd_op {
} __attribute__ ((packed)) snap;
struct {
__le64 cookie;
__le64 ver;
__u8 flag; /* 0 = unwatch, 1 = watch */
__le64 ver; /* no longer used */
__u8 op; /* CEPH_OSD_WATCH_OP_* */
__le32 gen; /* registration generation */
} __attribute__ ((packed)) watch;
struct {
__le64 offset, length;

View File

@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
}
}
const char *ceph_osd_watch_op_name(int o)
{
switch (o) {
case CEPH_OSD_WATCH_OP_UNWATCH:
return "unwatch";
case CEPH_OSD_WATCH_OP_WATCH:
return "watch";
case CEPH_OSD_WATCH_OP_RECONNECT:
return "reconnect";
case CEPH_OSD_WATCH_OP_PING:
return "ping";
default:
return "???";
}
}
const char *ceph_osd_state_name(int s)
{
switch (s) {

View File

@ -177,6 +177,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
ceph_osd_op_name(op->op));
if (op->op == CEPH_OSD_OP_WATCH)
seq_printf(s, "-%s",
ceph_osd_watch_op_name(op->watch.op));
}
seq_putc(s, '\n');
@ -197,6 +200,31 @@ static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
mutex_unlock(&osd->lock);
}
static void dump_linger_request(struct seq_file *s,
struct ceph_osd_linger_request *lreq)
{
seq_printf(s, "%llu\t", lreq->linger_id);
dump_target(s, &lreq->t);
seq_printf(s, "\t%u\t%s/%d\n", lreq->register_gen,
lreq->committed ? "C" : "", lreq->last_error);
}
static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
{
struct rb_node *n;
mutex_lock(&osd->lock);
for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
struct ceph_osd_linger_request *lreq =
rb_entry(n, struct ceph_osd_linger_request, node);
dump_linger_request(s, lreq);
}
mutex_unlock(&osd->lock);
}
static int osdc_show(struct seq_file *s, void *pp)
{
struct ceph_client *client = s->private;
@ -214,6 +242,14 @@ static int osdc_show(struct seq_file *s, void *pp)
}
dump_requests(s, &osdc->homeless_osd);
seq_puts(s, "LINGER REQUESTS\n");
for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
dump_linger_requests(s, osd);
}
dump_linger_requests(s, &osdc->homeless_osd);
up_read(&osdc->lock);
return 0;
}

File diff suppressed because it is too large Load Diff