devlink: Add health report functionality
Upon error discovery, every driver can report it to the devlink health mechanism via the devlink_health_report function, using the appropriate reporter registered to it. The driver can pass error-specific context which will be delivered to it as part of the dump / recovery callbacks. Once an error is reported, devlink health will do the following actions: * A log is sent to the kernel trace events buffer * Health status and statistics are updated for the reporter instance * An object dump is taken and stored at the reporter instance (as long as there is no other dump which is already stored) * An auto recovery attempt is made, depending on: - Auto Recovery configuration - Grace period vs. time since last recovery Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Reviewed-by: Moshe Shemesh <moshe@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
a0bdcc59d1
commit
c8e1da0bf9
|
@ -704,6 +704,8 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter);
|
||||||
|
|
||||||
void *
|
void *
|
||||||
devlink_health_reporter_priv(struct devlink_health_reporter *reporter);
|
devlink_health_reporter_priv(struct devlink_health_reporter *reporter);
|
||||||
|
int devlink_health_report(struct devlink_health_reporter *reporter,
|
||||||
|
const char *msg, void *priv_ctx);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -1173,6 +1175,13 @@ devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
|
||||||
{
|
{
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * No-op variant of devlink_health_report() placed in the #else branch of
 * this header: it ignores all of its arguments and always reports success.
 */
static inline int
devlink_health_report(struct devlink_health_reporter *reporter,
		      const char *msg, void *priv_ctx)
{
	return 0;
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if IS_REACHABLE(CONFIG_NET_DEVLINK)
|
#if IS_REACHABLE(CONFIG_NET_DEVLINK)
|
||||||
|
|
|
@ -75,6 +75,71 @@ TRACE_EVENT(devlink_hwerr,
|
||||||
__get_str(driver_name), __entry->err, __get_str(msg))
|
__get_str(driver_name), __entry->err, __get_str(msg))
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Tracepoint for devlink health message:
|
||||||
|
*/
|
||||||
|
TRACE_EVENT(devlink_health_report,
|
||||||
|
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
||||||
|
const char *msg),
|
||||||
|
|
||||||
|
TP_ARGS(devlink, reporter_name, msg),
|
||||||
|
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
__string(bus_name, devlink->dev->bus->name)
|
||||||
|
__string(dev_name, dev_name(devlink->dev))
|
||||||
|
__string(driver_name, devlink->dev->driver->name)
|
||||||
|
__string(reporter_name, msg)
|
||||||
|
__string(msg, msg)
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_fast_assign(
|
||||||
|
__assign_str(bus_name, devlink->dev->bus->name);
|
||||||
|
__assign_str(dev_name, dev_name(devlink->dev));
|
||||||
|
__assign_str(driver_name, devlink->dev->driver->name);
|
||||||
|
__assign_str(reporter_name, reporter_name);
|
||||||
|
__assign_str(msg, msg);
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s",
|
||||||
|
__get_str(bus_name), __get_str(dev_name),
|
||||||
|
__get_str(driver_name), __get_str(reporter_name),
|
||||||
|
__get_str(msg))
|
||||||
|
);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Tracepoint for devlink health recover aborted message:
|
||||||
|
*/
|
||||||
|
TRACE_EVENT(devlink_health_recover_aborted,
|
||||||
|
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
||||||
|
bool health_state, u64 time_since_last_recover),
|
||||||
|
|
||||||
|
TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover),
|
||||||
|
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
__string(bus_name, devlink->dev->bus->name)
|
||||||
|
__string(dev_name, dev_name(devlink->dev))
|
||||||
|
__string(driver_name, devlink->dev->driver->name)
|
||||||
|
__string(reporter_name, reporter_name)
|
||||||
|
__field(bool, health_state)
|
||||||
|
__field(u64, time_since_last_recover)
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_fast_assign(
|
||||||
|
__assign_str(bus_name, devlink->dev->bus->name);
|
||||||
|
__assign_str(dev_name, dev_name(devlink->dev));
|
||||||
|
__assign_str(driver_name, devlink->dev->driver->name);
|
||||||
|
__assign_str(reporter_name, reporter_name);
|
||||||
|
__entry->health_state = health_state;
|
||||||
|
__entry->time_since_last_recover = time_since_last_recover;
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover=%llu recover aborted",
|
||||||
|
__get_str(bus_name), __get_str(dev_name),
|
||||||
|
__get_str(driver_name), __get_str(reporter_name),
|
||||||
|
__entry->health_state,
|
||||||
|
__entry->time_since_last_recover)
|
||||||
|
);
|
||||||
|
|
||||||
#endif /* _TRACE_DEVLINK_H */
|
#endif /* _TRACE_DEVLINK_H */
|
||||||
|
|
||||||
/* This part must be outside protection */
|
/* This part must be outside protection */
|
||||||
|
|
|
@ -4367,9 +4367,20 @@ struct devlink_health_reporter {
|
||||||
void *priv;
|
void *priv;
|
||||||
const struct devlink_health_reporter_ops *ops;
|
const struct devlink_health_reporter_ops *ops;
|
||||||
struct devlink *devlink;
|
struct devlink *devlink;
|
||||||
|
struct devlink_fmsg *dump_fmsg;
|
||||||
|
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
|
||||||
u64 graceful_period;
|
u64 graceful_period;
|
||||||
bool auto_recover;
|
bool auto_recover;
|
||||||
u8 health_state;
|
u8 health_state;
|
||||||
|
u64 dump_ts;
|
||||||
|
u64 error_count;
|
||||||
|
u64 recovery_count;
|
||||||
|
u64 last_recovery_ts;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Values stored in devlink_health_reporter::health_state.
 * HEALTHY is written after a successful recover callback; ERROR is written
 * by devlink_health_report() once a report is not aborted.
 */
enum devlink_health_reporter_state {
	DEVLINK_HEALTH_REPORTER_STATE_HEALTHY = 0,
	DEVLINK_HEALTH_REPORTER_STATE_ERROR = 1,
};
|
||||||
|
|
||||||
void *
|
void *
|
||||||
|
@ -4431,6 +4442,7 @@ devlink_health_reporter_create(struct devlink *devlink,
|
||||||
reporter->devlink = devlink;
|
reporter->devlink = devlink;
|
||||||
reporter->graceful_period = graceful_period;
|
reporter->graceful_period = graceful_period;
|
||||||
reporter->auto_recover = auto_recover;
|
reporter->auto_recover = auto_recover;
|
||||||
|
mutex_init(&reporter->dump_lock);
|
||||||
list_add_tail(&reporter->list, &devlink->reporter_list);
|
list_add_tail(&reporter->list, &devlink->reporter_list);
|
||||||
unlock:
|
unlock:
|
||||||
mutex_unlock(&devlink->lock);
|
mutex_unlock(&devlink->lock);
|
||||||
|
@ -4449,10 +4461,117 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
|
||||||
mutex_lock(&reporter->devlink->lock);
|
mutex_lock(&reporter->devlink->lock);
|
||||||
list_del(&reporter->list);
|
list_del(&reporter->list);
|
||||||
mutex_unlock(&reporter->devlink->lock);
|
mutex_unlock(&reporter->devlink->lock);
|
||||||
|
if (reporter->dump_fmsg)
|
||||||
|
devlink_fmsg_free(reporter->dump_fmsg);
|
||||||
kfree(reporter);
|
kfree(reporter);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
|
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
|
||||||
|
|
||||||
|
static int
|
||||||
|
devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
|
||||||
|
void *priv_ctx)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
|
||||||
|
if (!reporter->ops->recover)
|
||||||
|
return -EOPNOTSUPP;
|
||||||
|
|
||||||
|
err = reporter->ops->recover(reporter, priv_ctx);
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
|
||||||
|
reporter->recovery_count++;
|
||||||
|
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
|
||||||
|
reporter->last_recovery_ts = jiffies;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
devlink_health_dump_clear(struct devlink_health_reporter *reporter)
|
||||||
|
{
|
||||||
|
if (!reporter->dump_fmsg)
|
||||||
|
return;
|
||||||
|
devlink_fmsg_free(reporter->dump_fmsg);
|
||||||
|
reporter->dump_fmsg = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
|
||||||
|
void *priv_ctx)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
|
||||||
|
if (!reporter->ops->dump)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (reporter->dump_fmsg)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
reporter->dump_fmsg = devlink_fmsg_alloc();
|
||||||
|
if (!reporter->dump_fmsg) {
|
||||||
|
err = -ENOMEM;
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
|
||||||
|
if (err)
|
||||||
|
goto dump_err;
|
||||||
|
|
||||||
|
err = reporter->ops->dump(reporter, reporter->dump_fmsg,
|
||||||
|
priv_ctx);
|
||||||
|
if (err)
|
||||||
|
goto dump_err;
|
||||||
|
|
||||||
|
err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
|
||||||
|
if (err)
|
||||||
|
goto dump_err;
|
||||||
|
|
||||||
|
reporter->dump_ts = jiffies;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
dump_err:
|
||||||
|
devlink_health_dump_clear(reporter);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
int devlink_health_report(struct devlink_health_reporter *reporter,
|
||||||
|
const char *msg, void *priv_ctx)
|
||||||
|
{
|
||||||
|
struct devlink *devlink = reporter->devlink;
|
||||||
|
|
||||||
|
/* write a log message of the current error */
|
||||||
|
WARN_ON(!msg);
|
||||||
|
trace_devlink_health_report(devlink, reporter->ops->name, msg);
|
||||||
|
reporter->error_count++;
|
||||||
|
|
||||||
|
/* abort if the previous error wasn't recovered */
|
||||||
|
if (reporter->auto_recover &&
|
||||||
|
(reporter->health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
|
||||||
|
jiffies - reporter->last_recovery_ts <
|
||||||
|
msecs_to_jiffies(reporter->graceful_period))) {
|
||||||
|
trace_devlink_health_recover_aborted(devlink,
|
||||||
|
reporter->ops->name,
|
||||||
|
reporter->health_state,
|
||||||
|
jiffies -
|
||||||
|
reporter->last_recovery_ts);
|
||||||
|
return -ECANCELED;
|
||||||
|
}
|
||||||
|
|
||||||
|
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
|
||||||
|
|
||||||
|
mutex_lock(&reporter->dump_lock);
|
||||||
|
/* store current dump of current error, for later analysis */
|
||||||
|
devlink_health_do_dump(reporter, priv_ctx);
|
||||||
|
mutex_unlock(&reporter->dump_lock);
|
||||||
|
|
||||||
|
if (reporter->auto_recover)
|
||||||
|
return devlink_health_reporter_recover(reporter, priv_ctx);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(devlink_health_report);
|
||||||
|
|
||||||
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
|
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
|
||||||
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
|
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
|
||||||
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
|
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
|
||||||
|
|
Loading…
Reference in New Issue