metrics: add per-version daily stats reporting
Adds a few kernel crash stats which are reported daily but are accumulated from beginning to end of a Chrome OS version. BUG=chromium:339588 TEST=ran and checked histograms on device BRANCH=none Change-Id: I630c673156c28dc90ffe0c9c2df58caaada082dc Reviewed-on: https://chromium-review.googlesource.com/190404 Reviewed-by: Luigi Semenzato <semenzato@chromium.org> Commit-Queue: Luigi Semenzato <semenzato@chromium.org> Tested-by: Luigi Semenzato <semenzato@chromium.org>
This commit is contained in:
parent
49fb1adc5a
commit
ba0c65d098
|
@ -106,6 +106,9 @@ const char MetricsDaemon::kMetricSwapOutLongName[] =
|
|||
const char MetricsDaemon::kMetricSwapOutShortName[] =
|
||||
"Platform.SwapOutShort";
|
||||
|
||||
const char MetricsDaemon::kMetricsProcStatFileName[] = "/proc/stat";
|
||||
const int MetricsDaemon::kMetricsProcStatFirstLineItemsCount = 11;
|
||||
|
||||
// Thermal CPU throttling.
|
||||
|
||||
const char MetricsDaemon::kMetricScaledCpuFrequencyName[] =
|
||||
|
@ -147,7 +150,9 @@ MetricsDaemon::MetricsDaemon()
|
|||
write_sectors_(0),
|
||||
vmstats_(),
|
||||
stats_state_(kStatsShort),
|
||||
stats_initial_time_(0) {}
|
||||
stats_initial_time_(0),
|
||||
ticks_per_second_(0),
|
||||
latest_cpu_use_ticks_(0) {}
|
||||
|
||||
MetricsDaemon::~MetricsDaemon() {
|
||||
}
|
||||
|
@ -181,8 +186,8 @@ void MetricsDaemon::Run(bool run_as_daemon) {
|
|||
int32 version = GetOsVersionHash();
|
||||
if (version_cycle_->Get() != version) {
|
||||
version_cycle_->Set(version);
|
||||
SendKernelCrashesCumulativeCountSample();
|
||||
kernel_crashes_version_count_->Set(0);
|
||||
version_cumulative_cpu_use_->Set(0);
|
||||
}
|
||||
|
||||
Loop();
|
||||
|
@ -215,8 +220,14 @@ void MetricsDaemon::Init(bool testing, MetricsLibraryInterface* metrics_lib,
|
|||
DCHECK(metrics_lib != NULL);
|
||||
metrics_lib_ = metrics_lib;
|
||||
|
||||
// Get ticks per second (HZ) on this system.
|
||||
// sysconf(_SC_CLK_TCK) is not expected to fail on Linux, so no sanity checks are done.
|
||||
ticks_per_second_ = sysconf(_SC_CLK_TCK);
|
||||
|
||||
daily_use_.reset(
|
||||
new PersistentInteger("Logging.DailyUseTime"));
|
||||
version_cumulative_cpu_use_.reset(
|
||||
new PersistentInteger("Logging.CumulativeCpuTime"));
|
||||
|
||||
kernel_crash_interval_.reset(
|
||||
new PersistentInteger("Logging.KernelCrashInterval"));
|
||||
|
@ -398,7 +409,7 @@ MetricsDaemon::LookupSessionState(const char* state_name) {
|
|||
return kUnknownSessionState;
|
||||
}
|
||||
|
||||
void MetricsDaemon::ReportStats(Time now) {
|
||||
void MetricsDaemon::ReportStats(int64 active_use_seconds, Time now) {
|
||||
TimeDelta since_epoch = now - Time::UnixEpoch();
|
||||
int day = since_epoch.InDays();
|
||||
int week = day / 7;
|
||||
|
@ -414,7 +425,7 @@ void MetricsDaemon::ReportStats(Time now) {
|
|||
SendCrashFrequencySample(user_crashes_daily_count_);
|
||||
SendCrashFrequencySample(kernel_crashes_daily_count_);
|
||||
SendCrashFrequencySample(unclean_shutdowns_daily_count_);
|
||||
SendKernelCrashesCumulativeCountSample();
|
||||
SendKernelCrashesCumulativeCountStats(active_use_seconds);
|
||||
|
||||
if (weekly_cycle_->Get() == week) {
|
||||
// We did this week already.
|
||||
|
@ -429,6 +440,54 @@ void MetricsDaemon::ReportStats(Time now) {
|
|||
SendCrashFrequencySample(unclean_shutdowns_weekly_count_);
|
||||
}
|
||||
|
||||
// One might argue that parts of this should go into
|
||||
// chromium/src/base/sys_info_chromeos.c instead, but put it here for now.
|
||||
|
||||
TimeDelta MetricsDaemon::GetIncrementalCpuUse() {
|
||||
|
||||
FilePath proc_stat_path = FilePath(kMetricsProcStatFileName);
|
||||
std::string proc_stat_string;
|
||||
if (!base::ReadFileToString(proc_stat_path, &proc_stat_string)) {
|
||||
LOG(WARNING) << "cannot open " << kMetricsProcStatFileName;
|
||||
return TimeDelta();
|
||||
}
|
||||
|
||||
std::vector<std::string> proc_stat_lines;
|
||||
base::SplitString(proc_stat_string, '\n', &proc_stat_lines);
|
||||
if (proc_stat_lines.empty()) {
|
||||
LOG(WARNING) << "cannot parse " << kMetricsProcStatFileName
|
||||
<< ": " << proc_stat_string;
|
||||
return TimeDelta();
|
||||
}
|
||||
std::vector<std::string> proc_stat_totals;
|
||||
base::SplitStringAlongWhitespace(proc_stat_lines[0], &proc_stat_totals);
|
||||
|
||||
uint64 user_ticks, user_nice_ticks, system_ticks;
|
||||
if (proc_stat_totals.size() != kMetricsProcStatFirstLineItemsCount ||
|
||||
proc_stat_totals[0] != "cpu" ||
|
||||
!base::StringToUint64(proc_stat_totals[1], &user_ticks) ||
|
||||
!base::StringToUint64(proc_stat_totals[2], &user_nice_ticks) ||
|
||||
!base::StringToUint64(proc_stat_totals[3], &system_ticks)) {
|
||||
LOG(WARNING) << "cannot parse first line: " << proc_stat_lines[0];
|
||||
return TimeDelta(base::TimeDelta::FromSeconds(0));
|
||||
}
|
||||
|
||||
uint64 total_cpu_use_ticks = user_ticks + user_nice_ticks + system_ticks;
|
||||
|
||||
// Sanity check.
|
||||
if (total_cpu_use_ticks < latest_cpu_use_ticks_) {
|
||||
LOG(WARNING) << "CPU time decreasing from " << latest_cpu_use_ticks_
|
||||
<< " to " << total_cpu_use_ticks;
|
||||
return TimeDelta();
|
||||
}
|
||||
|
||||
uint64 diff = total_cpu_use_ticks - latest_cpu_use_ticks_;
|
||||
latest_cpu_use_ticks_ = total_cpu_use_ticks;
|
||||
// Use microseconds to avoid significant truncations.
|
||||
return base::TimeDelta::FromMicroseconds(
|
||||
diff * 1000 * 1000 / ticks_per_second_);
|
||||
}
|
||||
|
||||
void MetricsDaemon::SetUserActiveState(bool active, Time now) {
|
||||
DLOG(INFO) << "user: " << (active ? "active" : "inactive");
|
||||
|
||||
|
@ -448,8 +507,11 @@ void MetricsDaemon::SetUserActiveState(bool active, Time now) {
|
|||
user_crash_interval_->Add(seconds);
|
||||
kernel_crash_interval_->Add(seconds);
|
||||
|
||||
// Updates the CPU time accumulator.
|
||||
version_cumulative_cpu_use_->Add(GetIncrementalCpuUse().InMilliseconds());
|
||||
|
||||
// Report daily and weekly stats as needed.
|
||||
ReportStats(now);
|
||||
ReportStats(daily_use_->Get(), now);
|
||||
|
||||
// Schedules a use monitor on inactive->active transitions and
|
||||
// unschedules it on active->inactive transitions.
|
||||
|
@ -1084,14 +1146,44 @@ void MetricsDaemon::SendSample(const string& name, int sample,
|
|||
metrics_lib_->SendToUMA(name, sample, min, max, nbuckets);
|
||||
}
|
||||
|
||||
void MetricsDaemon::SendKernelCrashesCumulativeCountSample() {
|
||||
void MetricsDaemon::SendKernelCrashesCumulativeCountStats(
|
||||
int64 active_use_seconds) {
|
||||
// Report the number of crashes for this OS version, but don't clear the
|
||||
// counter. It is cleared elsewhere on version change.
|
||||
int64 crashes_count = kernel_crashes_version_count_->Get();
|
||||
SendSample(kernel_crashes_version_count_->Name(),
|
||||
kernel_crashes_version_count_->Get(),
|
||||
1, // value of first bucket
|
||||
500, // value of last bucket
|
||||
100); // number of buckets
|
||||
crashes_count,
|
||||
1, // value of first bucket
|
||||
500, // value of last bucket
|
||||
100); // number of buckets
|
||||
|
||||
|
||||
int64 cpu_use_ms = version_cumulative_cpu_use_->Get();
|
||||
SendSample(version_cumulative_cpu_use_->Name(),
|
||||
cpu_use_ms / 1000, // stat is in seconds
|
||||
1, // device may be used very little...
|
||||
8 * 1000 * 1000, // ... or a lot (a little over 90 days)
|
||||
100);
|
||||
|
||||
// On the first run after an autoupdate, cpu_use_ms and active_use_seconds
|
||||
// can be zero. Avoid division by zero.
|
||||
if (cpu_use_ms > 0) {
|
||||
// Send the crash frequency since update in number of crashes per CPU year.
|
||||
SendSample("Logging.KernelCrashesPerCpuYear",
|
||||
crashes_count * kSecondsPerDay * 365 * 1000 / cpu_use_ms,
|
||||
1,
|
||||
1000 * 1000, // about one crash every 30s of CPU time
|
||||
100);
|
||||
}
|
||||
|
||||
if (active_use_seconds > 0) {
|
||||
// Same as above, but per year of active time.
|
||||
SendSample("Logging.KernelCrashesPerActiveYear",
|
||||
crashes_count * kSecondsPerDay * 365 / active_use_seconds,
|
||||
1,
|
||||
1000 * 1000, // about one crash every 30s of active time
|
||||
100);
|
||||
}
|
||||
}
|
||||
|
||||
void MetricsDaemon::SendCrashIntervalSample(
|
||||
|
|
|
@ -145,6 +145,8 @@ class MetricsDaemon {
|
|||
static const int kMetricPageFaultsBuckets;
|
||||
static const char kMetricsDiskStatsPath[];
|
||||
static const char kMetricsVmStatsPath[];
|
||||
static const char kMetricsProcStatFileName[];
|
||||
static const int kMetricsProcStatFirstLineItemsCount;
|
||||
|
||||
// Array of power states.
|
||||
static const char* kPowerStates_[kNumberPowerStates];
|
||||
|
@ -243,9 +245,13 @@ class MetricsDaemon {
|
|||
void SendLinearSample(const std::string& name, int sample,
|
||||
int max, int nbuckets);
|
||||
|
||||
// Sends a histogram sample with the total number of kernel crashes since the
|
||||
// last version update.
|
||||
void SendKernelCrashesCumulativeCountSample();
|
||||
// Sends various cumulative kernel crash-related stats, for instance the
|
||||
// total number of kernel crashes since the last version update.
|
||||
void SendKernelCrashesCumulativeCountStats(int64 active_time_seconds);
|
||||
|
||||
// Returns the total (system-wide) CPU usage between the time of the most
|
||||
// recent call to this function and now.
|
||||
base::TimeDelta GetIncrementalCpuUse();
|
||||
|
||||
// Sends a sample representing a time interval between two crashes of the
|
||||
// same type.
|
||||
|
@ -322,7 +328,7 @@ class MetricsDaemon {
|
|||
bool ReadFreqToInt(const std::string& sysfs_file_name, int* value);
|
||||
|
||||
// Report UMA stats when cycles (daily or weekly) have changed.
|
||||
void ReportStats(base::Time now);
|
||||
void ReportStats(int64 active_time_seconds, base::Time now);
|
||||
|
||||
// Reads the current OS version from /etc/lsb-release and hashes it
|
||||
// to an unsigned 32-bit int.
|
||||
|
@ -375,13 +381,24 @@ class MetricsDaemon {
|
|||
StatsState stats_state_;
|
||||
double stats_initial_time_;
|
||||
|
||||
// Persistent counters for crash statistics.
|
||||
// The system "HZ", or frequency of ticks. Some system data uses ticks as a
|
||||
// unit, and this is used to convert to standard time units.
|
||||
uint32 ticks_per_second_;
|
||||
// Used internally by GetIncrementalCpuUse() to return the CPU utilization
|
||||
// between calls.
|
||||
uint64 latest_cpu_use_ticks_;
|
||||
|
||||
// Persistent values and accumulators for crash statistics.
|
||||
scoped_ptr<PersistentInteger> daily_cycle_;
|
||||
scoped_ptr<PersistentInteger> weekly_cycle_;
|
||||
scoped_ptr<PersistentInteger> version_cycle_;
|
||||
|
||||
scoped_ptr<PersistentInteger> daily_use_;
|
||||
|
||||
// The CPU time accumulator. This contains the CPU time, in milliseconds,
|
||||
// used by the system since the most recent OS version update.
|
||||
scoped_ptr<PersistentInteger> version_cumulative_cpu_use_;
|
||||
|
||||
scoped_ptr<PersistentInteger> user_crash_interval_;
|
||||
scoped_ptr<PersistentInteger> kernel_crash_interval_;
|
||||
scoped_ptr<PersistentInteger> unclean_shutdown_interval_;
|
||||
|
|
Loading…
Reference in New Issue