# -*- coding: utf-8 -*-
# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""The experiment runner module."""

from __future__ import print_function

import getpass
import os
import shutil
import time

import lock_machine
import test_flag

from cros_utils import command_executer
from cros_utils import logger
from cros_utils.email_sender import EmailSender
from cros_utils.file_utils import FileUtils

import config
from experiment_status import ExperimentStatus
from results_cache import CacheConditions
from results_cache import ResultsCache
from results_report import HTMLResultsReport
from results_report import TextResultsReport
from results_report import JSONResultsReport
from schedv2 import Schedv2


def _WriteJSONReportToFile(experiment, results_dir, json_report):
  """Writes a JSON report to a file in results_dir."""
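  # The filename encodes board, date, time, and compiler, e.g. (values here
  # are illustrative): report_samus_2019-01-30_14.25.10_llvm.json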
  has_llvm = any('llvm' in l.compiler for l in experiment.labels)
  compiler_string = 'llvm' if has_llvm else 'gcc'
  board = experiment.labels[0].board
  filename = 'report_%s_%s_%s.%s.json' % (board, json_report.date,
                                          json_report.time.replace(':', '.'),
                                          compiler_string)
  fullname = os.path.join(results_dir, filename)
  report_text = json_report.GetReport()
  with open(fullname, 'w') as out_file:
    out_file.write(report_text)


class ExperimentRunner(object):
  """ExperimentRunner Class."""
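
  # Delays are in seconds: STATUS_TIME_DELAY throttles how often the full
  # status block is printed, and THREAD_MONITOR_DELAY is the polling
  # interval of the monitor loop in _Run.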
  STATUS_TIME_DELAY = 30
  THREAD_MONITOR_DELAY = 2
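
  # Return codes produced by _StoreResults and returned from Run.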
  SUCCEEDED = 0
  HAS_FAILURE = 1
  ALL_FAILED = 2

  def __init__(self,
               experiment,
               json_report,
               using_schedv2=False,
               log=None,
               cmd_exec=None):
    self._experiment = experiment
    self.l = log or logger.GetLogger(experiment.log_dir)
    self._ce = cmd_exec or command_executer.GetCommandExecuter(self.l)
    self._terminated = False
    self.json_report = json_report
    self.locked_machines = []
    if experiment.log_level != 'verbose':
      self.STATUS_TIME_DELAY = 10

    # Setting this to True will use crosperf sched v2 (feature in progress).
    self._using_schedv2 = using_schedv2

  def _GetMachineList(self):
    """Return a list of all requested machines.

    Create a list of all the requested machines, both global requests and
    label-specific requests, and return the list.
    """
    machines = self._experiment.remote
    # Each Label.remote is a sublist of experiment.remote.
    for l in self._experiment.labels:
      for r in l.remote:
        assert r in machines
    return machines

  def _UpdateMachineList(self, locked_machines):
    """Update machines lists to contain only locked machines.

    Go through all the lists of requested machines, both global and
    label-specific requests, and remove any machine that we were not
    able to lock.

    Args:
      locked_machines: A list of the machines we successfully locked.
    """
    # Iterate over copies of the lists: removing entries from a list while
    # iterating over it silently skips elements.
    for m in self._experiment.remote[:]:
      if m not in locked_machines:
        self._experiment.remote.remove(m)

    for l in self._experiment.labels:
      for m in l.remote[:]:
        if m not in locked_machines:
          l.remote.remove(m)

  def _GetMachineType(self, lock_mgr, machine):
    """Get the location of the machine.

    Returns:
      The location of the machine: 'local' or 'skylab'.
    """
    # We assume that lab machine names always start with 'chromeos*', while
    # local machines are specified by IP address.
    if 'chromeos' in machine:
      if lock_mgr.CheckMachineInSkylab(machine):
        return 'skylab'
      else:
        raise RuntimeError('Lab machine not in Skylab.')
    return 'local'

  def _LockAllMachines(self, experiment):
    """Attempt to globally lock all of the machines requested for the run.

    This method tries to lock all machines requested for this crosperf run
    in two different modes automatically, to prevent any other crosperf runs
    from being able to update/use the machines while this experiment is
    running:
    - Skylab machines: Use the skylab lease-dut mechanism to lease.
    - Local machines: Use a file lock mechanism to lock.
    """
    if test_flag.GetTestMode():
      self.locked_machines = self._GetMachineList()
      experiment.locked_machines = self.locked_machines
    else:
      experiment.lock_mgr = lock_machine.LockManager(
          self._GetMachineList(),
          '',
          experiment.labels[0].chromeos_root,
          experiment.locks_dir,
          log=self.l,
      )
      for m in experiment.lock_mgr.machines:
        machine_type = self._GetMachineType(experiment.lock_mgr, m)
        if machine_type == 'local':
          experiment.lock_mgr.AddMachineToLocal(m)
        elif machine_type == 'skylab':
          experiment.lock_mgr.AddMachineToSkylab(m)
      machine_states = experiment.lock_mgr.GetMachineStates('lock')
      experiment.lock_mgr.CheckMachineLocks(machine_states, 'lock')
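      # UpdateMachines(True) performs the actual locking; it returns the
      # machines that were successfully locked (the argument presumably
      # selects locking rather than unlocking).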
      self.locked_machines = experiment.lock_mgr.UpdateMachines(True)
      experiment.locked_machines = self.locked_machines
      self._UpdateMachineList(self.locked_machines)
      experiment.machine_manager.RemoveNonLockedMachines(self.locked_machines)
      if not self.locked_machines:
        raise RuntimeError('Unable to lock any machines.')

  def _ClearCacheEntries(self, experiment):
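    """Remove the on-disk cache entry for each benchmark run, if present."""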
    for br in experiment.benchmark_runs:
      cache = ResultsCache()
      cache.Init(br.label.chromeos_image, br.label.chromeos_root,
                 br.benchmark.test_name, br.iteration, br.test_args,
                 br.profiler_args, br.machine_manager, br.machine,
                 br.label.board, br.cache_conditions, br.logger(), br.log_level,
                 br.label, br.share_cache, br.benchmark.suite,
                 br.benchmark.show_all_results, br.benchmark.run_local,
                 br.benchmark.cwp_dso)
      cache_dir = cache.GetCacheDirForWrite()
      if os.path.exists(cache_dir):
        self.l.LogOutput('Removing cache dir: %s' % cache_dir)
        shutil.rmtree(cache_dir)

  def _Run(self, experiment):
    try:
      # We should not lease machines if tests are launched via `skylab
      # create-test`. This is because leasing a DUT in skylab creates a
      # no-op task on the DUT, and the newly created test will hang there.
      # TODO(zhizhouy): Need to check whether the machine is ready or not
      # before assigning a test to it.
      if not experiment.skylab:
        self._LockAllMachines(experiment)
      # Calculate checksums of all available/locked machines, to ensure that
      # the same label uses the same machines for testing.
      experiment.SetCheckSums(forceSameImage=True)
      if self._using_schedv2:
        schedv2 = Schedv2(experiment)
        experiment.set_schedv2(schedv2)
      if CacheConditions.FALSE in experiment.cache_conditions:
        self._ClearCacheEntries(experiment)
      status = ExperimentStatus(experiment)
      experiment.Run()
      last_status_time = 0
      last_status_string = ''
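      # Monitor loop: in verbose mode, print a full status block every
      # STATUS_TIME_DELAY seconds; otherwise print progress dots and only
      # emit the status block when it changes.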
      try:
        if experiment.log_level != 'verbose':
          self.l.LogStartDots()
        while not experiment.IsComplete():
          if last_status_time + self.STATUS_TIME_DELAY < time.time():
            last_status_time = time.time()
            border = '=============================='
            if experiment.log_level == 'verbose':
              self.l.LogOutput(border)
              self.l.LogOutput(status.GetProgressString())
              self.l.LogOutput(status.GetStatusString())
              self.l.LogOutput(border)
            else:
              current_status_string = status.GetStatusString()
              if current_status_string != last_status_string:
                self.l.LogEndDots()
                self.l.LogOutput(border)
                self.l.LogOutput(current_status_string)
                self.l.LogOutput(border)
                last_status_string = current_status_string
              else:
                self.l.LogAppendDot()
          time.sleep(self.THREAD_MONITOR_DELAY)
      except KeyboardInterrupt:
        self._terminated = True
        self.l.LogError('Ctrl-c pressed. Cleaning up...')
        experiment.Terminate()
        raise
      except SystemExit:
        self._terminated = True
        self.l.LogError('Unexpected exit. Cleaning up...')
        experiment.Terminate()
        raise
    finally:
      experiment.Cleanup()

  def _PrintTable(self, experiment):
    self.l.LogOutput(TextResultsReport.FromExperiment(experiment).GetReport())

  def _Email(self, experiment):
    # Only email by default if a new run was completed.
    send_mail = False
    for benchmark_run in experiment.benchmark_runs:
      if not benchmark_run.cache_hit:
        send_mail = True
        break
    if ((not send_mail and not experiment.email_to) or
        config.GetConfig('no_email')):
      return

    label_names = []
    for label in experiment.labels:
      label_names.append(label.name)
    subject = '%s: %s' % (experiment.name, ' vs. '.join(label_names))

    text_report = TextResultsReport.FromExperiment(experiment, True).GetReport()
    text_report += ('\nResults are stored in %s.\n' %
                    experiment.results_directory)
    text_report = "<pre style='font-size: 13px'>%s</pre>" % text_report
    html_report = HTMLResultsReport.FromExperiment(experiment).GetReport()
    attachment = EmailSender.Attachment('report.html', html_report)
    email_to = experiment.email_to or []
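    # Always include the user who launched the run as a recipient.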
    email_to.append(getpass.getuser())
    EmailSender().SendEmail(
        email_to,
        subject,
        text_report,
        attachments=[attachment],
        msg_type='html')

  def _StoreResults(self, experiment):
    if self._terminated:
      return self.ALL_FAILED

    results_directory = experiment.results_directory
    FileUtils().RmDir(results_directory)
    FileUtils().MkDirP(results_directory)
    self.l.LogOutput('Storing experiment file in %s.' % results_directory)
    experiment_file_path = os.path.join(results_directory, 'experiment.exp')
    FileUtils().WriteFile(experiment_file_path, experiment.experiment_file)
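
    # Track per-run outcomes: has_failure records whether any run failed,
    # and all_failed whether none succeeded.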
    has_failure = False
    all_failed = True

    topstats_file = os.path.join(results_directory, 'topstats.log')
    self.l.LogOutput('Storing top statistics of each benchmark run into %s.' %
                     topstats_file)
    with open(topstats_file, 'w') as top_fd:
      for benchmark_run in experiment.benchmark_runs:
        if benchmark_run.result:
          # FIXME: Pylint has a bug suggesting the following change, which
          # should be fixed in pylint 2.0. Resolve this after pylint >= 2.0.
          # Bug: https://github.com/PyCQA/pylint/issues/1984
          # pylint: disable=simplifiable-if-statement
          if benchmark_run.result.retval:
            has_failure = True
          else:
            all_failed = False
          # Header with benchmark run name.
          top_fd.write('%s\n' % str(benchmark_run))
          # Formatted string with top statistics.
          top_fd.write(benchmark_run.result.FormatStringTopCommands())
          top_fd.write('\n\n')

    if all_failed:
      return self.ALL_FAILED

    self.l.LogOutput('Storing results of each benchmark run.')
    for benchmark_run in experiment.benchmark_runs:
      if benchmark_run.result:
        benchmark_run_name = ''.join(
            ch for ch in benchmark_run.name if ch.isalnum())
        benchmark_run_path = os.path.join(results_directory,
                                          benchmark_run_name)
        if experiment.compress_results:
          benchmark_run.result.CompressResultsTo(benchmark_run_path)
        else:
          benchmark_run.result.CopyResultsTo(benchmark_run_path)
        benchmark_run.result.CleanUp(benchmark_run.benchmark.rm_chroot_tmp)

    self.l.LogOutput('Storing results report in %s.' % results_directory)
    results_table_path = os.path.join(results_directory, 'results.html')
    report = HTMLResultsReport.FromExperiment(experiment).GetReport()
    if self.json_report:
      json_report = JSONResultsReport.FromExperiment(
          experiment, json_args={'indent': 2})
      _WriteJSONReportToFile(experiment, results_directory, json_report)

    FileUtils().WriteFile(results_table_path, report)

    self.l.LogOutput('Storing email message body in %s.' % results_directory)
    msg_file_path = os.path.join(results_directory, 'msg_body.html')
    text_report = TextResultsReport.FromExperiment(experiment, True).GetReport()
    text_report += ('\nResults are stored in %s.\n' %
                    experiment.results_directory)
    msg_body = "<pre style='font-size: 13px'>%s</pre>" % text_report
    FileUtils().WriteFile(msg_file_path, msg_body)

    return self.SUCCEEDED if not has_failure else self.HAS_FAILURE

  def Run(self):
    try:
      self._Run(self._experiment)
    finally:
      # Always print the report at the end of the run.
      self._PrintTable(self._experiment)
      ret = self._StoreResults(self._experiment)
      if ret != self.ALL_FAILED:
        self._Email(self._experiment)
    return ret


class MockExperimentRunner(ExperimentRunner):
  """Mocked ExperimentRunner for testing."""

  def __init__(self, experiment, json_report):
    super(MockExperimentRunner, self).__init__(experiment, json_report)

  def _Run(self, experiment):
    self.l.LogOutput("Would run the following experiment: '%s'." %
                     experiment.name)

  def _PrintTable(self, experiment):
    self.l.LogOutput('Would print the experiment table.')

  def _Email(self, experiment):
    self.l.LogOutput('Would send result email.')

  def _StoreResults(self, experiment):
    self.l.LogOutput('Would store the results.')