refactor taskmgr
parent 8f14b4e9c8
commit c85342bec8
@@ -24,16 +24,12 @@ def int_to_ip(num):
     return str((num>>24)&255)+"."+str((num>>16)&255)+"."+str((num>>8)&255)+"."+str(num&255)
 
 class Task():
-    def __init__(self, configinfo, vnodeinfo, taskinfo, priority, max_size):
-        self.vnodeinfo = vnodeinfo
-        self.taskinfo = taskinfo
+    def __init__(self, task_id, username, at_same_time, priority, max_size, task_infos):
+        self.id = task_id
+        self.username = username
         self.status = WAITING
-        self.subtask_list = []
-        self.token = ''
-        self.maxRetryCount = self.configinfo['maxRetryCount']
-        self.atSameTime = self.configinfo['atSameTime']
-        self.multicommand = self.configinfo['multicommand']
-        self.vnode_nums = self.configinfo['vnode_nums']
+        # if all the vnodes must be started at the same time
+        self.at_same_time = at_same_time
         # priority the bigger the better
         # self.priority the smaller the better
         self.priority = int(time.time()) / 60 / 60 - priority
@@ -41,8 +37,13 @@ class Task():
         self.ips = None
         self.max_size = max_size
 
-        for i in range(self.vnode_nums):
-            self.subtask_list.append({'status':'WAITING','try_count':0})
+        self.subtask_list = [SubTask(
+            idx = index,
+            root_task = self,
+            vnode_info = task_info['vnode_info'],
+            command_info = task_info['command_info'],
+            max_retry_count = task_info['max_retry_count']
+        ) for (index, task_info) in enumerate(task_infos)]
 
     def __lt__(self, other):
         return self.priority < other.priority
@@ -55,8 +56,8 @@ class Task():
             self.ips.append(int_to_ip(base_ip + self.task_base_ip + i + 2))
 
     def gen_hosts(self):
-        username = self.taskinfo.username
-        taskid = self.taskinfo.taskid
+        username = self.username
+        taskid = self.id
         logger.info("Generate hosts for user(%s) task(%s) base_ip(%s)"%(username,taskid,str(self.task_base_ip)))
         fspath = env.getenv('FS_PREFIX')
         if not os.path.isdir("%s/global/users/%s" % (fspath,username)):
@@ -72,11 +73,26 @@ class Task():
             i += 1
         hosts_file.close()
 
-    def get_one_resources_need(self):
-        return self.vnodeinfo.vnode.instance
-
-    def get_all_resources_need(self):
-        return [self.vnodeinfo.vnode.instance for i in range(self.vnode_nums)]
+class SubTask():
+    def __init__(self, idx, root_task, vnode_info, command_info, max_retry_count):
+        self.root_task = root_task
+        self.vnode_info = vnode_info
+        self.vnode_info.vnodeid = idx
+        self.command_info = command_info
+        self.command_info.vnodeid = idx
+        self.max_retry_count = max_retry_count
+        self.vnode_started = False
+        self.task_started = False
+        self.status = WAITING
+        self.status_reason = ''
+        self.try_count = 0
+        self.worker = None
+
+    def waiting_for_retry(self):
+        self.try_count += 1
+        self.status = WAITING if self.try_count <= self.max_retry_count else FAILED
+        if self.status == FAILED and self.root_task.at_same_time:
+            self.root_task.status = FAILED
 
 
 class TaskReporter(MasterServicer):
@@ -89,6 +105,7 @@ class TaskReporter(MasterServicer):
         self.taskmgr.on_task_report(task_report)
         return Reply(status=Reply.ACCEPTED, message='')
 
+
 class TaskMgr(threading.Thread):
 
     # load task information from etcd
@@ -156,9 +173,9 @@ class TaskMgr(threading.Thread):
         self.serve()
         while not self.thread_stop:
             self.sort_out_task_queue()
-            task, vnodes_workers = self.task_scheduler()
-            if task is not None and workers is not None:
-                self.task_processor(task, vnodes_workers)
+            task, sub_task_list = self.task_scheduler()
+            if task is not None and sub_task_list is not None:
+                self.task_processor(task, sub_task_list)
             else:
                 time.sleep(self.scheduler_interval)
 
@@ -185,39 +202,86 @@ class TaskMgr(threading.Thread):
             self.task_queue.append(task)
         self.task_queue = sorted(self.task_queue, key=lambda x: x.priority)
 
-    def stop_vnode(self, worker, task, vnodeid):
-        vnodeinfo = copy.copy(task.vnodeinfo)
-        vnodeinfo.vnodeid = vnodeid
+    def start_vnode(self, subtask):
         try:
-            self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (task.vnodeinfo.id, vnodeid))
-            channel = grpc.insecure_channel('%s:%s' % (worker, self.worker_port))
+            self.logger.info('[task_processor] Starting vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
+            channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
             stub = WorkerStub(channel)
-            response = stub.stop_vnode(vnodeinfo)
+            response = stub.start_vnode(subtask.vnode_info)
             if response.status != Reply.ACCEPTED:
                 raise Exception(response.message)
         except Exception as e:
             self.logger.error('[task_processor] rpc error message: %s' % e)
+            subtask.status_reason = str(e)
             return [False, e]
-        return [True, ""]
+        subtask.vnode_started = True
+        self.cpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.cpu
+        self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
+        return [True, '']
+
+    def stop_vnode(self, subtask):
+        try:
+            self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
+            channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
+            stub = WorkerStub(channel)
+            response = stub.stop_vnode(subtask.vnode_info)
+            if response.status != Reply.ACCEPTED:
+                raise Exception(response.message)
+        except Exception as e:
+            self.logger.error('[task_processor] rpc error message: %s' % e)
+            subtask.status_reason = str(e)
+            return [False, e]
+        subtask.vnode_started = False
+        self.cpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.cpu
+        self.gpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.gpu
+        return [True, '']
+
+    def start_task(self, subtask):
+        try:
+            self.logger.info('[task_processor] Starting task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
+            channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
+            stub = WorkerStub(channel)
+            response = stub.start_task(subtask.command_info)
+            if response.status != Reply.ACCEPTED:
+                raise Exception(response.message)
+        except Exception as e:
+            self.logger.error('[task_processor] rpc error message: %s' % e)
+            subtask.status_reason = str(e)
+        subtask.task_started = True
+
+    def stop_task(self, subtask):
+        try:
+            self.logger.info('[task_processor] Stoping task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
+            channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
+            stub = WorkerStub(channel)
+            response = stub.stop_stask(subtask.command_info)
+            if response.status != Reply.ACCEPTED:
+                raise Exception(response.message)
+        except Exception as e:
+            self.logger.error('[task_processor] rpc error message: %s' % e)
+            subtask.status = FAILED
+            subtask.status_reason = str(e)
+        subtask.task_started = False
 
     @net_lock
     def acquire_task_ips(self, task):
-        self.logger.info("[acquire_task_ips] user(%s) task(%s) net(%s)"%(task.taskinfo.username, task.taskinfo.taskid, str(task.task_base_ip)))
+        self.logger.info("[acquire_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
         if task.task_base_ip == None:
            task.task_base_ip = self.free_nets.pop(0)
        return task.task_base_ip
 
     @net_lock
     def release_task_ips(self, task):
-        self.logger.info("[release_task_ips] user(%s) task(%s) net(%s)"%(task.taskinfo.username, task.taskinfo.taskid, str(task.task_base_ip)))
+        self.logger.info("[release_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
         if task.task_base_ip == None:
             return
         self.free_nets.append(task.task_base_ip)
+        task.task_base_ip = None
         self.logger.error('[release task_net] %s' % str(e))
 
     def setup_tasknet(self, task, workers=None):
-        taskid = task.taskinfo.taskid
-        username = task.taskinfo.username
+        taskid = task.id
+        username = task.username
         brname = "docklet-batch-%s-%s"%(username, taskid)
         gwname = "Batch-%s-%s"%(username, taskid)
         if task.task_base_ip == None:
@@ -232,42 +296,37 @@ class TaskMgr(threading.Thread):
         return [True, gatewayip]
 
     def remove_tasknet(self, task):
-        taskid = task.taskinfo.taskid
-        username = task.taskinfo.username
+        taskid = task.id
+        username = task.username
         brname = "docklet-batch-%s-%s"%(username, taskid)
         netcontrol.del_bridge(brname)
 
-    def task_processor(self, task, vnodes_workers):
+    def task_processor(self, task, sub_task_list):
         task.status = RUNNING
-        self.jobmgr.report(task.taskinfo.taskid,'running')
+        # self.jobmgr.report(task.id,'running')
 
         # properties for transactio
 
-        self.acquire_task_net(task)
+        self.acquire_task_ips(task)
         task.gen_ips_from_base(self.base_ip)
         task.gen_hosts()
         #need to create hosts
-        [success, gwip] = self.setup_tasknet(task,[w[1] for w in vnodes_workers])
+        [success, gwip] = self.setup_tasknet(task, [sub_task.worker for sub_task in sub_task_list])
         if not success:
             self.release_task_ips(task)
             return [False, gwip]
-        token = ''.join(random.sample(string.ascii_letters + string.digits, 8))
         placed_workers = []
 
+        start_all_vnode_success = True
         # start vc
-        for vid, worker in vnodes_workers:
-            vnodeinfo = copy.copy(task.vnodeinfo)
-            vnodeinfo.vnodeid = vid
-            vnodeinfo.vnode.hostname = "batch-"+str(vid%task.max_size)
-            vnode = task.subtask_list[vid]
-            vnode['status'] = RUNNING
-            vnode['try_count'] += 1
-            vnode['token'] = token
-            vnode['worker'] = worker
-
-            self.cpu_usage[worker] += task.vnodeinfo.vnode.instance.cpu
-            self.gpu_usage[worker] += task.vnodeinfo.vnode.instance.gpu
-            username = task.vnodeinfo.username
+        for sub_task in sub_task_list:
+            vnode_info = sub_task.vnode_info
+            vnode_info.vnode.hostname = "batch-"+str(vid%task.max_size)
+            if sub_task.vnode_started:
+                continue
+
+            username = sub_task.username
             #container_name = task.info.username + '-batch-' + task.info.id + '-' + str(instance_id) + '-' + task.info.token
             #if not username in self.user_containers.keys():
             #self.user_containers[username] = []
@@ -275,42 +334,75 @@ class TaskMgr(threading.Thread):
             ipaddr = task.ips[vid % task.max_size]
             brname = "docklet-batch-%s-%s" % (username, taskid)
             networkinfo = Network(ipaddr=ipaddr, gateway=gwip, masterip=self.masterip, brname=brname)
-            vnodeinfo.vnode.network = networkinfo
+            vnode_info.vnode.network = networkinfo
 
-            try:
-                self.logger.info('[task_processor] starting vnode for task [%s] instance [%d]' % (task.vnodeinfo.id, vid))
-                channel = grpc.insecure_channel('%s:%s' % (worker, self.worker_port))
-                stub = WorkerStub(channel)
-                response = stub.start_vnode(vnodeinfo)
-                placed_workers.append(worker)
-                if response.status != Reply.ACCEPTED:
-                    raise Exception(response.message)
-            except Exception as e:
-                self.logger.error('[task_processor] rpc error message: %s' % e)
-                task.status = FAILED
-                vnode['status'] = FAILED
-                vnode['try_count'] -= 1
-                for pl_worker in placed_workers:
-                    pass
-                return
-                #self.user_containers[username].remove(container_name)
+            placed_workers.append(worker)
+            if not self.start_vnode(sub_task):
+                sub_task.waiting_for_retry()
+                sub_task.worker = None
+                start_all_vnode_success = False
+
+        if not start_all_vnode_success:
+            return
 
         # start tasks
-        for vid, worker in vnodes_workers:
-            taskinfo = copy.copy(task.taskinfo)
-            taskinfo.vnodeid = vid
-            taskinfo.token = token
-            vnode = task.subtask_list[vid]
-            try:
-                self.logger.info('[task_processor] starting task [%s] instance [%d]' % (task.vnodeinfo.id, vid))
-                channel = grpc.insecure_channel('%s:%s' % (worker, self.worker_port))
-                stub = WorkerStub(channel)
-                response = stub.start_task(taskinfo)
-                if response.status != Reply.ACCEPTED:
-                    raise Exception(response.message)
-            except Exception as e:
-                self.logger.error('[task_processor] rpc error message: %s' % e)
-                task.status = FAILED
+        for sub_task in sub_task_list:
+            task_info = sub_task.command_info
+            task_info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8))
+            if self.start_task(sub_task):
+                sub_task.status = RUNNING
+            else:
+                sub_task.waiting_for_retry()
+
+    def clear_sub_tasks(self, sub_task_list):
+        for sub_task in sub_task_list:
+            self.clear_sub_task(sub_task)
+
+    def clear_sub_task(self, sub_task):
+        if sub_task.task_started:
+            self.stop_task(sub_task)
+        if sub_task.vnode_started:
+            self.stop_vnode(sub_task)
+
+    def check_task_completed(self, task):
+        if task.status == RUNNING or task.status == WAITING:
+            for sub_task in task.subtask_list:
+                if sub_task.status == RUNNING or sub_task.status == WAITING:
+                    return False
+        self.logger.info('task %s completed' % task.id)
+        if task.at_same_time and task.status == FAILED:
+            self.clear_sub_tasks(task.subtask_list)
+        # TODO report to jobmgr
+        self.lazy_delete_list.append(task)
+        return True
+
+    # this method is called when worker send heart-beat rpc request
+    def on_task_report(self, report):
+        self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.vnodeid, report.subTaskStatus))
+        task = self.get_task(report.taskid)
+        if task == None:
+            self.logger.error('[on_task_report] task not found')
+            return
+
+        sub_task = task.subtask_list[report.vnodeid]
+        if sub_task.token != report.token:
+            self.logger.warning('[on_task_report] wrong token')
+            return
+        username = task.username
+        # container_name = username + '-batch-' + task.info.id + '-' + str(report.instanceid) + '-' + report.token
+        # self.user_containers[username].remove(container_name)
+
+        if sub_task.status != RUNNING:
+            self.logger.error('[on_task_report] receive task report when instance is not running')
+
+        sub_task.status = report.subTaskStatus
+        sub_task.status_reason = report.errmsg
+
+        self.clear_sub_task(sub_task)
+
+        if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
+            sub_task.waiting_for_retry()
 
     # return task, workers
     def task_scheduler(self):
@@ -320,60 +412,45 @@ class TaskMgr(threading.Thread):
         for task in self.task_queue:
             if task in self.lazy_delete_list:
                 continue
-            if task.atSameTime:
-                # parallel tasks
-                if task.status == RUNNING:
-                    continue
-                workers = self.find_proper_workers(task.get_all_resources_need())
-                if len(workers) < task.vnode_nums:
+            if self.check_task_completed(task):
+                continue
+            if task.at_same_time:
+                # parallel tasks
+                workers = self.find_proper_workers(task.subtask_list)
+                if len(workers) == 0:
                     return None, None
                 else:
-                    idxs = [i for i in range(task.vnode_nums)]
-                    return task, zip(idxs,workers)
+                    for i in range(len(workers)):
+                        task.subtask_list[i].worker = workers[i]
+                    return task, task.subtask_list
             else:
                 # traditional tasks
-                workers = self.find_proper_workers([task.get_one_resources_need()])
-                if len(workers) < task.vnode_nums:
-                    return None, None, None
-                '''for index, instance in enumerate(task.instance_list):
-                    # find instance to retry
-                    if (instance['status'] == FAILED or instance['status'] == TIMEOUT) and instance['try_count'] <= task.info.maxRetryCount:
-                        if worker is not None:
-                            self.logger.info('[task_scheduler] retry')
-                            return task, index, worker
-                    # find timeout instance
-                    elif instance['status'] == RUNNING:
-                        if not self.is_alive(instance['worker']):
-                            instance['status'] = FAILED
-                            instance['token'] = ''
-                            self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu
-                            self.gpu_usage[instance['worker']] -= task.info.cluster.instance.gpu
-
-                            self.logger.warning('[task_scheduler] worker dead, retry task [%s] instance [%d]' % (task.info.id, index))
-                            if worker is not None:
-                                return task, index, worker
-
-                if worker is not None:
-                    # start new instance
-                    if len(task.instance_list) < task.info.instanceCount:
-                        instance = {}
-                        instance['try_count'] = 0
-                        task.instance_list.append(instance)
-                        return task, len(task.instance_list) - 1, worker'''
-
-            self.check_task_completed(task)
-
-        return None, None, None
-
-    def find_proper_workers(self, vnodes_configs):
+                for sub_task in task.subtask_list:
+                    if sub_task.status == WAITING:
+                        workers = self.find_proper_workers([sub_task])
+                        if len(workers) > 0:
+                            sub_task.worker = workers[0]
+                            return task, [sub_task]
+
+        return None, None
+
+    def find_proper_workers(self, sub_task_list):
         nodes = self.get_all_nodes()
         if nodes is None or len(nodes) == 0:
             self.logger.warning('[task_scheduler] running nodes not found')
             return None
 
         proper_workers = []
-        for needs in vnodes_configs:
+        has_waiting = False
+        for sub_task in sub_task_list:
+            if sub_task.status == WAITING:
+                has_waiting = True
+            if sub_task.worker is not None and sub_task.vnode_started:
+                proper_workers.append(sub_task.worker)
+                continue
+            needs = sub_task.vnode_info.vnode.instance
+            proper_worker = None
             for worker_ip, worker_info in nodes:
                 if needs.cpu + self.get_cpu_usage(worker_ip) > worker_info['cpu']:
                     continue
@@ -391,11 +468,16 @@ class TaskMgr(threading.Thread):
                 worker_info['memory'] -= needs.memory
                 worker_info['gpu'] -= needs.gpu
                 worker_info['disk'] -= needs.disk
-                proper_workers.append(worker_ip)
+                proper_worker = worker_ip
                 break
+            if proper_worker is not None:
+                proper_workers.append(proper_worker)
             else:
                 return []
-        return proper_workers
+        if has_waiting:
+            return proper_workers
+        else:
+            return []
 
     def get_all_nodes(self):
         # cache running nodes
@@ -445,11 +527,16 @@ class TaskMgr(threading.Thread):
             "base": Image.BASE,
             "public": Image.PUBLIC
         }
-        configinfo = {'vnode_nums':7,'atSameTime':True,'MultiStart':True,
-            'maxRetryCount':int(json_task['retryCount'])}
-        # json_task = json.loads(json_task)
-        task = Task(configinfo,
-            VNodeInfo(
+        task = Task(
+            task_id = taskid,
+            username = username,
+            # all vnode must be started at the same time
+            at_same_time = json_task['at_same_time'],
+            priority = task_priority,
+            max_size = (1 << self.task_cidr) - 2,
+            task_infos = [{
+                'max_retry_count': int(json_task['retryCount']),
+                'vnode_info': VNodeInfo(
                 taskid = taskid,
                 username = username,
                 vnode = Vnode(
@@ -461,9 +548,14 @@ class TaskMgr(threading.Thread):
                     cpu = int(json_task['cpuSetting']),
                     memory = int(json_task['memorySetting']),
                     disk = int(json_task['diskSetting']),
-                    gpu = int(json_task['gpuSetting'])))
+                    gpu = int(json_task['gpuSetting'])),
+                    mount = [Mount(
+                        localPath = json_task['mapping'][mapping_key]['mappingLocalDir'],
+                        remotePath=json_task['mapping'][mapping_key]['mappingRemoteDir'])
+                        for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
                 ),
-            TaskInfo(
+                ),
+                'command_info': TaskInfo(
                 taskid = taskid,
                 username = username,
                 parameters = Parameters(
@@ -473,13 +565,11 @@ class TaskMgr(threading.Thread):
                     envVars = {}),
                 stderrRedirectPath = json_task.get('stdErrRedPth',""),
                 stdoutRedirectPath = json_task.get('stdOutRedPth',"")),
-                timeout = int(json_task['expTime']),
-            ),
-            priority=task_priority,max_size=(1<<self.task_cidr)-2)
-        if 'mapping' in json_task:
-            task.vnodeinfo.vnode.mount.extend([Mount(localPath=json_task['mapping'][mapping_key]['mappingLocalDir'],
-                remotePath=json_task['mapping'][mapping_key]['mappingRemoteDir'])
-                for mapping_key in json_task['mapping']])
+                timeout = int(json_task['expTime'])
+                # commands are executed in all vnodes / only excuted in the first vnode
+                # if in traditional mode, commands will be executed in all vnodes
+            ) if (not json_task['at_same_time'] or json_task['multicommand'] or instance_index == 0) else None
+            } for instance_index in range(json_task['instCount'])])
         self.lazy_append_list.append(task)
 
 
@@ -488,7 +578,7 @@ class TaskMgr(threading.Thread):
     @queue_lock
     def get_task(self, taskid):
         for task in self.task_queue:
-            if task.info.id == taskid:
+            if task.id == taskid:
                 return task
         return None
 
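Illustration (not part of the commit): a minimal, self-contained sketch of the retry rule that the new SubTask.waiting_for_retry() introduces, based only on the lines visible in this diff. The status constants and the two stripped-down classes are stand-ins for the real taskmgr definitions, not imports from the project.

# Stand-in status constants; the real module defines its own values.
WAITING, RUNNING, FAILED = 'WAITING', 'RUNNING', 'FAILED'

class Task:
    # stripped-down stand-in: only the fields waiting_for_retry() touches
    def __init__(self, at_same_time):
        self.at_same_time = at_same_time
        self.status = WAITING

class SubTask:
    def __init__(self, root_task, max_retry_count):
        self.root_task = root_task
        self.max_retry_count = max_retry_count
        self.status = WAITING
        self.try_count = 0

    def waiting_for_retry(self):
        # same rule as in the diff: go back to WAITING until the retry budget is spent,
        # then FAIL; a failed subtask fails a parallel (at_same_time) task as a whole
        self.try_count += 1
        self.status = WAITING if self.try_count <= self.max_retry_count else FAILED
        if self.status == FAILED and self.root_task.at_same_time:
            self.root_task.status = FAILED

task = Task(at_same_time=True)
sub = SubTask(root_task=task, max_retry_count=1)
sub.waiting_for_retry()         # 1st failure: still WAITING, will be rescheduled
sub.waiting_for_retry()         # 2nd failure: retry budget exhausted
print(sub.status, task.status)  # FAILED FAILED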