aosp12/external/toolchain-utils/lock_machine.py

530 lines
17 KiB
Python
Raw Normal View History

2023-01-09 17:11:35 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2019 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This module controls locking and unlocking of test machines."""
from __future__ import print_function
import argparse
import enum
import getpass
import os
import sys
import file_lock_machine
from cros_utils import command_executer
from cros_utils import logger
from cros_utils import machines
class LockException(Exception):
"""Base class for exceptions in this module."""
class MachineNotPingable(LockException):
"""Raised when machine does not respond to ping."""
class LockingError(LockException):
"""Raised when server fails to lock/unlock machine as requested."""
class DontOwnLock(LockException):
"""Raised when user attmepts to unlock machine locked by someone else."""
# This should not be raised if the user specified '--force'
class MachineType(enum.Enum):
"""Enum class to hold machine type."""
LOCAL = 'local'
SKYLAB = 'skylab'
class LockManager(object):
"""Class for locking/unlocking machines vie three different modes.
This class contains methods for checking the locked status of machines,
and for changing the locked status. It handles HW lab machines and local
machines, using appropriate locking mechanisms for each.
"""
SKYLAB_PATH = 'skylab'
# TODO(zhizhouy): lease time may needs to be dynamically adjusted. For now we
# set it long enough to cover the period to finish nightly rotation tests.
LEASE_MINS = 1439
SKYLAB_CREDENTIAL = ('/usr/local/google/home/mobiletc-prebuild'
'/sheriff_utils/credentials/skylab'
'/chromeos-swarming-credential.json')
SWARMING = 'chromite/third_party/swarming.client/swarming.py'
SUCCESS = 0
def __init__(self,
remotes,
force_option,
chromeos_root,
locks_dir='',
log=None):
"""Initializes an LockManager object.
Args:
remotes: A list of machine names or ip addresses to be managed. Names
and ip addresses should be represented as strings. If the list is
empty, the lock manager will get all known machines.
force_option: A Boolean indicating whether or not to force an unlock of
a machine that was locked by someone else.
chromeos_root: The ChromeOS chroot to use for the autotest scripts.
locks_dir: A directory used for file locking local devices.
log: If not None, this is the logger object to be used for writing out
informational output messages. It is expected to be an instance of
Logger class from cros_utils/logger.py.
"""
self.chromeos_root = chromeos_root
self.user = getpass.getuser()
self.logger = log or logger.GetLogger()
self.ce = command_executer.GetCommandExecuter(self.logger)
sys.path.append(chromeos_root)
self.locks_dir = locks_dir
self.machines = list(set(remotes)) or []
self.toolchain_lab_machines = self.GetAllToolchainLabMachines()
if not self.machines:
self.machines = self.toolchain_lab_machines
self.force = force_option
self.local_machines = []
self.skylab_machines = []
def CheckMachine(self, machine, error_msg):
"""Verifies that machine is responding to ping.
Args:
machine: String containing the name or ip address of machine to check.
error_msg: Message to print if ping fails.
Raises:
MachineNotPingable: If machine is not responding to 'ping'
"""
if not machines.MachineIsPingable(machine, logging_level='none'):
cros_machine = machine + '.cros'
if not machines.MachineIsPingable(cros_machine, logging_level='none'):
raise MachineNotPingable(error_msg)
def GetAllToolchainLabMachines(self):
"""Gets a list of all the toolchain machines in the ChromeOS HW lab.
Returns:
A list of names of the toolchain machines in the ChromeOS HW lab.
"""
machines_file = os.path.join(
os.path.dirname(__file__), 'crosperf', 'default_remotes')
machine_list = []
with open(machines_file, 'r') as input_file:
lines = input_file.readlines()
for line in lines:
_, remotes = line.split(':')
remotes = remotes.strip()
for r in remotes.split():
machine_list.append(r.strip())
return machine_list
def GetMachineType(self, m):
"""Get where the machine is located.
Args:
m: String containing the name or ip address of machine.
Returns:
Value of the type in MachineType Enum.
"""
if m in self.local_machines:
return MachineType.LOCAL
if m in self.skylab_machines:
return MachineType.SKYLAB
def PrintStatusHeader(self):
"""Prints the status header lines for machines."""
print('\nMachine (Board)\t\t\t\t\tStatus')
print('---------------\t\t\t\t\t------')
def PrintStatus(self, m, state, machine_type):
"""Prints status for a single machine.
Args:
m: String containing the name or ip address of machine.
state: A dictionary of the current state of the machine.
machine_type: MachineType to determine where the machine is located.
"""
if state['locked']:
print('%s (%s)\t\t%slocked by %s since %s' %
(m, state['board'], '\t\t' if machine_type == MachineType.LOCAL else
'', state['locked_by'], state['lock_time']))
else:
print(
'%s (%s)\t\t%sunlocked' % (m, state['board'], '\t\t' if
machine_type == MachineType.LOCAL else ''))
def AddMachineToLocal(self, machine):
"""Adds a machine to local machine list.
Args:
machine: The machine to be added.
"""
if machine not in self.local_machines:
self.local_machines.append(machine)
def AddMachineToSkylab(self, machine):
"""Adds a machine to skylab machine list.
Args:
machine: The machine to be added.
"""
if machine not in self.skylab_machines:
self.skylab_machines.append(machine)
def ListMachineStates(self, machine_states):
"""Gets and prints the current status for a list of machines.
Prints out the current status for all of the machines in the current
LockManager's list of machines (set when the object is initialized).
Args:
machine_states: A dictionary of the current state of every machine in
the current LockManager's list of machines. Normally obtained by
calling LockManager::GetMachineStates.
"""
self.PrintStatusHeader()
for m in machine_states:
machine_type = self.GetMachineType(m)
state = machine_states[m]
self.PrintStatus(m, state, machine_type)
def UpdateLockInSkylab(self, should_lock_machine, machine):
"""Ask skylab to lease/release a machine.
Args:
should_lock_machine: Boolean indicating whether to lock the machine (True)
or unlock the machine (False).
machine: The machine to update.
Returns:
True if requested action succeeded, else False.
"""
try:
if should_lock_machine:
ret = self.LeaseSkylabMachine(machine)
else:
ret = self.ReleaseSkylabMachine(machine)
except Exception:
return False
return ret
def UpdateFileLock(self, should_lock_machine, machine):
"""Use file lock for local machines,
Args:
should_lock_machine: Boolean indicating whether to lock the machine (True)
or unlock the machine (False).
machine: The machine to update.
Returns:
True if requested action succeeded, else False.
"""
try:
if should_lock_machine:
ret = file_lock_machine.Machine(machine, self.locks_dir).Lock(
True, sys.argv[0])
else:
ret = file_lock_machine.Machine(machine, self.locks_dir).Unlock(True)
except Exception:
return False
return ret
def UpdateMachines(self, lock_machines):
"""Sets the locked state of the machines to the requested value.
The machines updated are the ones in self.machines (specified when the
class object was intialized).
Args:
lock_machines: Boolean indicating whether to lock the machines (True) or
unlock the machines (False).
Returns:
A list of the machines whose state was successfully updated.
"""
updated_machines = []
action = 'Locking' if lock_machines else 'Unlocking'
for m in self.machines:
# TODO(zhizhouy): Handling exceptions with more details when locking
# doesn't succeed.
machine_type = self.GetMachineType(m)
if machine_type == MachineType.SKYLAB:
ret = self.UpdateLockInSkylab(lock_machines, m)
elif machine_type == MachineType.LOCAL:
ret = self.UpdateFileLock(lock_machines, m)
if ret:
self.logger.LogOutput(
'%s %s machine succeeded: %s.' % (action, machine_type.value, m))
updated_machines.append(m)
else:
self.logger.LogOutput(
'%s %s machine failed: %s.' % (action, machine_type.value, m))
self.machines = updated_machines
return updated_machines
def _InternalRemoveMachine(self, machine):
"""Remove machine from internal list of machines.
Args:
machine: Name of machine to be removed from internal list.
"""
# Check to see if machine is lab machine and if so, make sure it has
# ".cros" on the end.
cros_machine = machine
if machine.find('rack') > 0 and machine.find('row') > 0:
if machine.find('.cros') == -1:
cros_machine = cros_machine + '.cros'
self.machines = [
m for m in self.machines if m not in (cros_machine, machine)
]
def CheckMachineLocks(self, machine_states, cmd):
"""Check that every machine in requested list is in the proper state.
If the cmd is 'unlock' verify that every machine is locked by requestor.
If the cmd is 'lock' verify that every machine is currently unlocked.
Args:
machine_states: A dictionary of the current state of every machine in
the current LockManager's list of machines. Normally obtained by
calling LockManager::GetMachineStates.
cmd: The user-requested action for the machines: 'lock' or 'unlock'.
Raises:
DontOwnLock: The lock on a requested machine is owned by someone else.
"""
for k, state in machine_states.items():
if cmd == 'unlock':
if not state['locked']:
self.logger.LogWarning('Attempt to unlock already unlocked machine '
'(%s).' % k)
self._InternalRemoveMachine(k)
# TODO(zhizhouy): Skylab doesn't support host info such as locked_by.
# Need to update this when skylab supports it.
if (state['locked'] and state['locked_by'] and
state['locked_by'] != self.user):
raise DontOwnLock('Attempt to unlock machine (%s) locked by someone '
'else (%s).' % (k, state['locked_by']))
elif cmd == 'lock':
if state['locked']:
self.logger.LogWarning(
'Attempt to lock already locked machine (%s)' % k)
self._InternalRemoveMachine(k)
def GetMachineStates(self, cmd=''):
"""Gets the current state of all the requested machines.
Gets the current state of all the requested machines. Stores the data in a
dictionary keyed by machine name.
Args:
cmd: The command for which we are getting the machine states. This is
important because if one of the requested machines is missing we raise
an exception, unless the requested command is 'add'.
Returns:
A dictionary of machine states for all the machines in the LockManager
object.
"""
machine_list = {}
for m in self.machines:
# For local or skylab machines, we simply set {'locked': status} for them
# TODO(zhizhouy): This is a quick fix since skylab cannot return host info
# as afe does. We need to get more info such as locked_by when skylab
# supports that.
values = {
'locked': 0 if cmd == 'lock' else 1,
'board': '??',
'locked_by': '',
'lock_time': ''
}
machine_list[m] = values
self.ListMachineStates(machine_list)
return machine_list
def CheckMachineInSkylab(self, machine):
"""Run command to check if machine is in Skylab or not.
Returns:
True if machine in skylab, else False
"""
credential = ''
if os.path.exists(self.SKYLAB_CREDENTIAL):
credential = '--auth-service-account-json %s' % self.SKYLAB_CREDENTIAL
swarming = os.path.join(self.chromeos_root, self.SWARMING)
# TODO(zhizhouy): Swarming script doesn't support python3 so explicitly
# launch it with python2 until migrated.
cmd = (('python2 %s ' \
'query --swarming https://chromeos-swarming.appspot.com ' \
"%s 'bots/list?is_dead=FALSE&dimensions=dut_name:%s'") % \
(swarming,
credential,
machine.rstrip('.cros')))
exit_code, stdout, stderr = self.ce.RunCommandWOutput(cmd)
if exit_code:
raise ValueError(
'Querying bots failed (2); stdout: %r; stderr: %r' % (stdout, stderr))
# The command will return a json output as stdout. If machine not in skylab
# stdout will look like this:
# {
# "death_timeout": "600",
# "now": "TIMESTAMP"
# }
# Otherwise there will be a tuple starting with 'items', we simply detect
# this keyword for result.
return 'items' in stdout
def LeaseSkylabMachine(self, machine):
"""Run command to lease dut from skylab.
Returns:
True if succeeded, False if failed.
"""
credential = ''
if os.path.exists(self.SKYLAB_CREDENTIAL):
credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL
cmd = (('%s lease-dut -minutes %s %s %s') % \
(self.SKYLAB_PATH,
self.LEASE_MINS,
credential,
machine.rstrip('.cros')))
# Wait 120 seconds for server to start the lease task, if not started,
# we will treat it as unavailable.
check_interval_time = 120
retval = self.ce.RunCommand(cmd, command_timeout=check_interval_time)
return retval == self.SUCCESS
def ReleaseSkylabMachine(self, machine):
"""Run command to release dut from skylab.
Returns:
True if succeeded, False if failed.
"""
credential = ''
if os.path.exists(self.SKYLAB_CREDENTIAL):
credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL
cmd = (('%s release-dut %s %s') % \
(self.SKYLAB_PATH,
credential,
machine.rstrip('.cros')))
retval = self.ce.RunCommand(cmd)
return retval == self.SUCCESS
def Main(argv):
"""Parse the options, initialize lock manager and dispatch proper method.
Args:
argv: The options with which this script was invoked.
Returns:
0 unless an exception is raised.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--list',
dest='cmd',
action='store_const',
const='status',
help='List current status of all known machines.')
parser.add_argument(
'--lock',
dest='cmd',
action='store_const',
const='lock',
help='Lock given machine(s).')
parser.add_argument(
'--unlock',
dest='cmd',
action='store_const',
const='unlock',
help='Unlock given machine(s).')
parser.add_argument(
'--status',
dest='cmd',
action='store_const',
const='status',
help='List current status of given machine(s).')
parser.add_argument(
'--remote', dest='remote', help='machines on which to operate')
parser.add_argument(
'--chromeos_root',
dest='chromeos_root',
required=True,
help='ChromeOS root to use for autotest scripts.')
parser.add_argument(
'--force',
dest='force',
action='store_true',
default=False,
help='Force lock/unlock of machines, even if not'
' current lock owner.')
options = parser.parse_args(argv)
if not options.remote and options.cmd != 'status':
parser.error('No machines specified for operation.')
if not os.path.isdir(options.chromeos_root):
parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root)
if not options.cmd:
parser.error('No operation selected (--list, --status, --lock, --unlock,'
' --add_machine, --remove_machine).')
machine_list = []
if options.remote:
machine_list = options.remote.split()
lock_manager = LockManager(machine_list, options.force, options.chromeos_root)
machine_states = lock_manager.GetMachineStates(cmd=options.cmd)
cmd = options.cmd
if cmd == 'status':
lock_manager.ListMachineStates(machine_states)
elif cmd == 'lock':
if not lock_manager.force:
lock_manager.CheckMachineLocks(machine_states, cmd)
lock_manager.UpdateMachines(True)
elif cmd == 'unlock':
if not lock_manager.force:
lock_manager.CheckMachineLocks(machine_states, cmd)
lock_manager.UpdateMachines(False)
return 0
if __name__ == '__main__':
sys.exit(Main(sys.argv[1:]))