add thread lock to task_queue & add gpu info fetcher to monitor

This commit is contained in:
Gallen 2018-11-01 15:31:44 +08:00
parent 16f7f6b547
commit 39831e9fb9
3 changed files with 100 additions and 55 deletions

View File

@ -45,6 +45,8 @@ class TaskMgr(threading.Thread):
self.thread_stop = False self.thread_stop = False
self.jobmgr = None self.jobmgr = None
self.task_queue = [] self.task_queue = []
self.lazy_delete_list = []
self.task_queue_lock = threading.Lock()
self.user_containers = {} self.user_containers = {}
self.scheduler_interval = scheduler_interval self.scheduler_interval = scheduler_interval
@ -62,9 +64,19 @@ class TaskMgr(threading.Thread):
# self.nodes_info_update_interval = 30 # (s) # self.nodes_info_update_interval = 30 # (s)
def queue_lock(self, f):
def new_f(*args, **kwargs):
self.task_queue_lock.acquire()
result = f(args, kwargs)
self.task_queue_lock.release()
return result
return new_f
def run(self): def run(self):
self.serve() self.serve()
while not self.thread_stop: while not self.thread_stop:
self.clean_up_finished_task()
task, instance_id, worker = self.task_scheduler() task, instance_id, worker = self.task_scheduler()
if task is not None and worker is not None: if task is not None and worker is not None:
self.task_processor(task, instance_id, worker) self.task_processor(task, instance_id, worker)
@ -150,7 +162,7 @@ class TaskMgr(threading.Thread):
else: else:
self.jobmgr.report(task) self.jobmgr.report(task)
self.logger.info('task %s completed' % task.info.id) self.logger.info('task %s completed' % task.info.id)
self.task_queue.remove(task) self.lazy_delete_list.append(task)
def task_failed(self, task): def task_failed(self, task):
@ -161,9 +173,15 @@ class TaskMgr(threading.Thread):
else: else:
self.jobmgr.report(task) self.jobmgr.report(task)
self.logger.info('task %s failed' % task.info.id) self.logger.info('task %s failed' % task.info.id)
self.task_queue.remove(task) self.lazy_delete_list.append(task)
@queue_lock
def clean_up_finished_task(self):
while self.lazy_delete_list:
task = self.lazy_delete_list.pop(0)
self.task_queue.remove(task)
def task_processor(self, task, instance_id, worker_ip): def task_processor(self, task, instance_id, worker_ip):
task.status = RUNNING task.status = RUNNING
@ -217,6 +235,8 @@ class TaskMgr(threading.Thread):
# self.logger.info('[task_scheduler] %s: %d' % (key, worker_info[key])) # self.logger.info('[task_scheduler] %s: %d' % (key, worker_info[key]))
for task in self.task_queue: for task in self.task_queue:
if task in self.lazy_delete_list:
continue
worker = self.find_proper_worker(task) worker = self.find_proper_worker(task)
for index, instance in enumerate(task.instance_list): for index, instance in enumerate(task.instance_list):
@ -259,8 +279,9 @@ class TaskMgr(threading.Thread):
continue continue
if task.info.cluster.instance.memory > worker_info['memory']: if task.info.cluster.instance.memory > worker_info['memory']:
continue continue
# if task.info.cluster.instance.disk > worker_info['disk']: # try not to assign non-gpu task to a worker with gpu
# continue if task.info.cluster.instance.gpu == 0 and worker_info['gpu'] > 0:
continue
if task.info.cluster.instance.gpu > worker_info['gpu']: if task.info.cluster.instance.gpu > worker_info['gpu']:
continue continue
return worker_ip return worker_ip
@ -289,7 +310,7 @@ class TaskMgr(threading.Thread):
info['cpu'] = len(worker_info['cpuconfig']) info['cpu'] = len(worker_info['cpuconfig'])
info['memory'] = (worker_info['meminfo']['buffers'] + worker_info['meminfo']['cached'] + worker_info['meminfo']['free']) / 1024 # (Mb) info['memory'] = (worker_info['meminfo']['buffers'] + worker_info['meminfo']['cached'] + worker_info['meminfo']['free']) / 1024 # (Mb)
info['disk'] = sum([disk['free'] for disk in worker_info['diskinfo']]) / 1024 / 1024 # (Mb) info['disk'] = sum([disk['free'] for disk in worker_info['diskinfo']]) / 1024 / 1024 # (Mb)
info['gpu'] = 0 # not support yet info['gpu'] = len(worker_info['gpuinfo'])
return info return info
@ -350,6 +371,7 @@ class TaskMgr(threading.Thread):
# user: username # user: username
# get the information of a task, including the status, task description and other information # get the information of a task, including the status, task description and other information
@queue_lock
def get_task(self, taskid): def get_task(self, taskid):
for task in self.task_queue: for task in self.task_queue:
if task.info.id == taskid: if task.info.id == taskid:

View File

@ -1,5 +1,8 @@
import lxc import lxc
import subprocess import subprocess
import os
import signal
from utils.log import logger
# Note: keep physical device id always the same as the virtual device id # Note: keep physical device id always the same as the virtual device id
@ -98,3 +101,18 @@ def get_container_name_by_pid(pid):
else: else:
return content[2] return content[2]
return None return None
def clean_up_processes_in_gpu(gpu_id):
logger.info('[gputools] start clean up processes in gpu %d' % gpu_id)
processes = get_gpu_processes()
for process in [p for p in processes if p['gpu'] == gpu_id]:
logger.info('[gputools] find process %d running in gpu %d' % (process['pid'], process['gpu']))
if process['container'] == 'host':
logger.warning('[gputools] find process of host, ignored')
else:
logger.warning('[gputools] find process of container [%s], killed' % process['container'])
try:
os.kill(process['pid'], signal.SIGKILL)
except OSError:
continue

View File

@ -19,7 +19,7 @@ Design:Monitor mainly consists of three parts: Collectors, Master_Collector and
import subprocess,re,os,psutil,math,sys import subprocess,re,os,psutil,math,sys
import time,threading,json,traceback,platform import time,threading,json,traceback,platform
from utils import env, etcdlib from utils import env, etcdlib, gputools
import lxc import lxc
import xmlrpc.client import xmlrpc.client
from datetime import datetime from datetime import datetime
@ -481,6 +481,10 @@ class Collector(threading.Thread):
info[idx][key] = val info[idx][key] = val
return [cpuset, info] return [cpuset, info]
# collect gpu used information
def collect_gpuinfo(self):
return gputools.get_gpu_status()
# collect disk used information # collect disk used information
def collect_diskinfo(self): def collect_diskinfo(self):
global workercinfo global workercinfo
@ -537,6 +541,7 @@ class Collector(threading.Thread):
[cpuinfo,cpuconfig] = self.collect_cpuinfo() [cpuinfo,cpuconfig] = self.collect_cpuinfo()
workerinfo['cpuinfo'] = cpuinfo workerinfo['cpuinfo'] = cpuinfo
workerinfo['cpuconfig'] = cpuconfig workerinfo['cpuconfig'] = cpuconfig
workerinfo['gpuinfo'] = self.collect_gpuinfo()
workerinfo['diskinfo'] = self.collect_diskinfo() workerinfo['diskinfo'] = self.collect_diskinfo()
workerinfo['running'] = True workerinfo['running'] = True
#time.sleep(self.interval) #time.sleep(self.interval)