From 44a68ae51abd1cf2746d36d3fe9ac6b6fad96f58 Mon Sep 17 00:00:00 2001 From: zhongyehong Date: Sun, 13 May 2018 15:26:05 +0800 Subject: [PATCH 01/75] add frame for batch --- src/httprest.py | 27 ++++++++++++++++++++++++++- src/jobmgr.py | 37 +++++++++++++++++++++++++++++++++++++ src/taskmgr.py | 41 +++++++++++++++++++++++++++++++++++++++++ src/worker.py | 2 ++ 4 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 src/jobmgr.py create mode 100644 src/taskmgr.py diff --git a/src/httprest.py b/src/httprest.py index 4d00a4c..563e6d3 100755 --- a/src/httprest.py +++ b/src/httprest.py @@ -23,7 +23,7 @@ import os import http.server, cgi, json, sys, shutil import xmlrpc.client from socketserver import ThreadingMixIn -import nodemgr, vclustermgr, etcdlib, network, imagemgr, notificationmgr, lockmgr +import nodemgr, vclustermgr, etcdlib, network, imagemgr, notificationmgr, lockmgr, jobmgr, taskmgr from logs import logs import userManager,beansapplicationmgr import monitor,traceback @@ -694,6 +694,26 @@ def resetall_system(user, beans, form): return json.dumps({'success':'false', 'message': message}) return json.dumps(result) +@app.route("/batch/job/add/", methods=['POST']) +@login_required +def add_job(user,beans,form): + pass + +@app.route("/batch/job/list/", methods=['POST']) +@login_required +def list_job(user,beans,form): + pass + +@app.route("/batch/job/info/", methods=['POST']) +@login_required +def info_job(user,beans,form): + pass + +@app.route("/batch/task/info/", methods=['POST']) +@login_required +def info_task(user,beans,form): + pass + # @app.route("/inside/cluster/scaleout/", methods=['POST']) # @inside_ip_required # def inside_cluster_scalout(cur_user, cluster_info, form): @@ -760,6 +780,8 @@ if __name__ == '__main__': global G_historymgr global G_applicationmgr global G_ulockmgr + global G_jobmgr + global G_taskmgr # move 'tools.loadenv' to the beginning of this file fs_path = env.getenv("FS_PREFIX") @@ -851,6 +873,9 @@ if __name__ == '__main__': G_networkmgr = network.NetworkMgr(clusternet, etcdclient, mode, ipaddr) G_networkmgr.printpools() + G_taskmgr = taskmgr.TaskMgr() + G_jobmgr = jobmgr.JobMgr(taskmgr) + # start NodeMgr and NodeMgr will wait for all nodes to start ... 
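
# The four /batch/ handlers above are placeholders in this patch. A minimal sketch of how
# add_job might eventually delegate to the G_jobmgr instance initialized just above, assuming
# the submitted form carries the job description under a 'job' key and that JobMgr.add_job
# returns a JSON-serializable dict (neither of which this patch defines yet):

@app.route("/batch/job/add/", methods=['POST'])
@login_required
def add_job(user, beans, form):
    job_json = form.get('job', None)
    if job_json is None:
        return json.dumps({'success': 'false', 'message': 'job description is required'})
    return json.dumps(G_jobmgr.add_job(user, job_json))
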
G_nodemgr = nodemgr.NodeMgr(G_networkmgr, etcdclient, addr = ipaddr, mode=mode) logger.info("nodemgr started") diff --git a/src/jobmgr.py b/src/jobmgr.py new file mode 100644 index 0000000..32a9a81 --- /dev/null +++ b/src/jobmgr.py @@ -0,0 +1,37 @@ +class JobMgr(object): + + # user: username + # job: a json string + # user submit a new job, add this job to queue and database + # call add_task to add task information + def add_job(self, user, job): + pass + + # user: username + # list a user's all job + def list_jobs(self,user): + pass + + # user: username + # jobid: the id of job + # get the information of a job, including the status, json description and other informationa + # call get_task to get the task information + def get_job(self, user, jobid): + pass + + # job: a json string + # this is a thread to process a job + def job_processor(self, job): + # according the DAG of job, add task to taskmanager + # wait for all task completed and exit + pass + + # this is a thread to schedule the jobs + def job_scheduler(self): + # choose a job from queue, create a job processor for it + pass + + # load job information from etcd + # initial a job queue and job schedueler + def __init__(self, taskmgr): + pass diff --git a/src/taskmgr.py b/src/taskmgr.py new file mode 100644 index 0000000..27f5921 --- /dev/null +++ b/src/taskmgr.py @@ -0,0 +1,41 @@ +class TaskMgr(object): + + # task: a json string + # this is a thread to process task(or a instance) + def task_processor(self,task): + # call the rpc to call a function in worker + # create container -> execute task + # (one instance or multiple instances) + # retry when failed + pass + + # this is a thread to schdule the tasks + def task_scheduler(self): + # choose a task from queue, create a task processor for it + pass + + # user: username + # task: a json string + # save the task information into database + def add_task(self,user,task): + pass + + # user: username + # jobid: the id of job + # taskid: the id of task + # get the information of a task, including the status, task description and other information + def get_task(self, user, jobid, taskid): + pass + + # task: a json string + # this is a rpc function for worker, task processor call this function to execute a task in a worker + @staticmethod + def execute_task(self,task): + return + + + # load task information from etcd + # initial a task queue and task schedueler + # taskmgr: a taskmgr instance + def __init__(self): + pass diff --git a/src/worker.py b/src/worker.py index a629cbc..9efb440 100755 --- a/src/worker.py +++ b/src/worker.py @@ -17,6 +17,7 @@ import etcdlib, network, container from nettools import netcontrol,ovscontrol,portcontrol import monitor, proxytool from lvmtool import new_group, recover_group +from taskmgr import TaskMgr ################################################################## # Worker @@ -139,6 +140,7 @@ class Worker(object): self.rpcserver.register_function(proxytool.delete_route) self.rpcserver.register_function(portcontrol.acquire_port_mapping) self.rpcserver.register_function(portcontrol.release_port_mapping) + self.rpcserver.register_function(TaskMgr.execute_task) # register functions or instances to server for rpc #self.rpcserver.register_function(function_name) From 152162f8dcd2a9791c0cde84c63f8079cee75cb2 Mon Sep 17 00:00:00 2001 From: Gallen Date: Mon, 21 May 2018 14:55:22 +0800 Subject: [PATCH 02/75] add gpu tools --- src/gputools.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 
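
# The JobMgr and TaskMgr skeletons added above describe their behaviour only in comments: a
# queue, a scheduler thread that picks queued work, and a processor thread per item. A minimal
# sketch of that queue-plus-scheduler pattern for the job side (the queue, the thread handling
# and the assumption that a job arrives as an already-parsed dict with a 'tasks' list are all
# illustrative, not taken from the patch):

import threading
import queue

class SketchJobMgr(object):
    def __init__(self, taskmgr):
        self.taskmgr = taskmgr
        self.job_queue = queue.Queue()

    def add_job(self, user, job):
        # queue the job; persisting it to etcd or a database is left out of this sketch
        self.job_queue.put((user, job))

    def job_scheduler(self):
        # scheduler thread: take one queued job and hand it to a processor thread
        while True:
            user, job = self.job_queue.get()
            threading.Thread(target=self.job_processor, args=(user, job), daemon=True).start()

    def job_processor(self, user, job):
        # walk the job's tasks (DAG ordering and completion tracking are not modelled here)
        # and push each one to the task manager
        for task in job.get('tasks', []):
            self.taskmgr.add_task(user, task)
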
src/gputools.py diff --git a/src/gputools.py b/src/gputools.py new file mode 100644 index 0000000..303bb7b --- /dev/null +++ b/src/gputools.py @@ -0,0 +1,100 @@ +import lxc +import subprocess + + +# Note: keep physical device id always the same as the virtual device id +# device_path e.g. /dev/nvidia0 +def add_device(container_name, device_path): + c = lxc.Container(container_name) + return c.add_device_node(device_path, device_path) + + +def remove_device(container_name, device_path): + c = lxc.Container(container_name) + return c.remove_device_node('', device_path) + + +# Mon May 21 10:51:45 2018 +# +-----------------------------------------------------------------------------+ +# | NVIDIA-SMI 381.22 Driver Version: 381.22 | +# |-------------------------------+----------------------+----------------------+ +# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +# |===============================+======================+======================| +# | 0 GeForce GTX 108... Off | 0000:02:00.0 Off | N/A | +# | 33% 53C P2 59W / 250W | 295MiB / 11172MiB | 2% Default | +# +-------------------------------+----------------------+----------------------+ +# | 1 GeForce GTX 108... Off | 0000:84:00.0 Off | N/A | +# | 21% 35C P8 10W / 250W | 161MiB / 11172MiB | 0% Default | +# +-------------------------------+----------------------+----------------------+ +# +# +-----------------------------------------------------------------------------+ +# | Processes: GPU Memory | +# | GPU PID Type Process name Usage | +# |=============================================================================| +# | 0 111893 C python3 285MiB | +# | 1 111893 C python3 151MiB | +# +-----------------------------------------------------------------------------+ +# +def nvidia_smi(): + try: + ret = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True) + return ret.stdout.decode('utf-8').split('\n') + except subprocess.CalledProcessError: + return None + + +def get_gpu_driver_version(): + output = nvidia_smi() + if not output: + return None + else: + return output[2].split()[-2] + + +def get_gpu_status(): + output = nvidia_smi() + if not output: + return [] + interval_index = [index for index in range(len(output)) if len(output[index].strip()) == 0][0] + status_list = [] + for index in range(7, interval_index, 3): + status = {} + status['id'] = output[index].split()[1] + sp = output[index+1].split() + status['fan'] = sp[1] + status['memory'] = sp[8] + status['memory_max'] = sp[10] + status['util'] = sp[12] + status_list.append(status) + return status_list + + +def get_gpu_processes(): + output = nvidia_smi() + if not output: + return [] + interval_index = [index for index in range(len(output)) if len(output[index].strip()) == 0][0] + process_list = [] + for index in range(interval_index + 5, len(output)): + sp = output[index].split() + if len(sp) != 7: + break + process = {} + process['gpu'] = sp[1] + process['pid'] = sp[2] + process['name'] = sp[4] + process['memory'] = sp[5] + process['container'] = get_container_name_by_pid(sp[2]) + process_list.append(process) + return process_list + + +def get_container_name_by_pid(pid): + with open('/proc/%s/cgroup' % pid) as f: + content = f.readlines()[0].strip().split('/') + if content[1] != 'lxc': + return 'host' + else: + return content[2] + return None \ No newline at end of file From 946efe277f6e89d610d363e2a5d14ab1e61e3c2a Mon Sep 17 00:00:00 2001 From: 
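
# For reference, tracing the split() indices in gputools.py against the sample nvidia-smi
# output reproduced in its comment gives roughly the following return values (real driver
# output may be formatted differently, so treat these as illustrative):
#
#   get_gpu_driver_version()  ->  '381.22'
#   get_gpu_status()          ->  [{'id': '0', 'fan': '33%', 'memory': '295MiB',
#                                   'memory_max': '11172MiB', 'util': '2%'},
#                                  {'id': '1', 'fan': '21%', 'memory': '161MiB',
#                                   'memory_max': '11172MiB', 'util': '0%'}]
#   get_gpu_processes()       ->  one dict per process row, e.g. {'gpu': '0', 'pid': '111893',
#                                  'name': 'python3', 'memory': '285MiB', 'container': ...}
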
zhuyj17 Date: Sun, 1 Jul 2018 01:14:23 +0800 Subject: [PATCH 03/75] Update locations of gputools, jobmgr and taskmgr --- src/{ => master}/jobmgr.py | 0 src/{ => utils}/gputools.py | 0 src/{ => worker}/taskmgr.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/{ => master}/jobmgr.py (100%) rename src/{ => utils}/gputools.py (100%) rename src/{ => worker}/taskmgr.py (100%) diff --git a/src/jobmgr.py b/src/master/jobmgr.py similarity index 100% rename from src/jobmgr.py rename to src/master/jobmgr.py diff --git a/src/gputools.py b/src/utils/gputools.py similarity index 100% rename from src/gputools.py rename to src/utils/gputools.py diff --git a/src/taskmgr.py b/src/worker/taskmgr.py similarity index 100% rename from src/taskmgr.py rename to src/worker/taskmgr.py From 8233d159f7557d95fd61f02215cd4bb9f15433d9 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Sun, 1 Jul 2018 01:28:16 +0800 Subject: [PATCH 04/75] Update to work --- src/master/httprest.py | 4 ++-- src/worker/worker.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/master/httprest.py b/src/master/httprest.py index 005d07d..0b4f7e7 100755 --- a/src/master/httprest.py +++ b/src/master/httprest.py @@ -26,12 +26,12 @@ import os import http.server, cgi, json, sys, shutil, traceback import xmlrpc.client from socketserver import ThreadingMixIn -import nodemgr, vclustermgr, etcdlib, network, imagemgr, notificationmgr, lockmgr, cloudmgr,jobmgr, taskmgr from utils import etcdlib, imagemgr -from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr +from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr, jobmgr from utils.logs import logs from master import userManager, beansapplicationmgr, monitor, sysmgr, network from worker.monitor import History_Manager +from worker import taskmgr import threading import requests from utils.nettools import portcontrol diff --git a/src/worker/worker.py b/src/worker/worker.py index 95c8192..03da97a 100755 --- a/src/worker/worker.py +++ b/src/worker/worker.py @@ -19,6 +19,7 @@ from socketserver import ThreadingMixIn import threading from utils import etcdlib, proxytool from worker import container, monitor +from worker.taskmgr import TaskMgr from utils.nettools import netcontrol,ovscontrol,portcontrol from utils.lvmtool import new_group, recover_group from master import network From 73b23e11e41f23cb411f63c14080ede7dd3c410f Mon Sep 17 00:00:00 2001 From: Gallen Date: Fri, 13 Jul 2018 10:45:11 +0800 Subject: [PATCH 05/75] simple taskmgr --- src/master/taskmgr.py | 89 +++++++++++++++++ src/protos/taskmgr.proto | 20 ++++ src/protos/taskmgr_pb2.py | 168 +++++++++++++++++++++++++++++++++ src/protos/taskmgr_pb2_grpc.py | 46 +++++++++ src/worker/taskmgr.py | 41 -------- 5 files changed, 323 insertions(+), 41 deletions(-) create mode 100644 src/master/taskmgr.py create mode 100644 src/protos/taskmgr.proto create mode 100644 src/protos/taskmgr_pb2.py create mode 100644 src/protos/taskmgr_pb2_grpc.py delete mode 100644 src/worker/taskmgr.py diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py new file mode 100644 index 0000000..9312fe8 --- /dev/null +++ b/src/master/taskmgr.py @@ -0,0 +1,89 @@ +import threading +import time + +from concurrent import futures +import grpc +from protos.taskmgr_pb2 import Task, Reply +from protos.taskmgr_pb2_grpc import TaskReporterServicer, add_TaskReporterServicer_to_server + +class TaskReport(TaskReporterServicer): + + def __init__(self, taskmgr): + self.taskmgr = taskmgr + + def report(self, request, 
context): + self.taskmgr.on_task_report(request) + return Reply(message='received') + +class TaskMgr(threading.Thread): + + # load task information from etcd + # initial a task queue and task schedueler + # taskmgr: a taskmgr instance + def __init__(self): + threading.Thread.__init__(self) + self.thread_stop = False + self.taskQueue = [] + + + def run(self): + self.serve() + while not self.thread_stop: + task = self.task_scheduler() + if task is not None: + self.task_processor(task) + time.sleep(1) + + + def serve(self): + self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + add_TaskReporterServicer_to_server(TaskReport(self), self.server) + self.server.add_insecure_port('[::]:50051') + self.server.start() + + + def stop(self): + self.thread_stop = True + self.server.stop(0) + + + # this method is called when worker send heart-beat rpc request + def on_task_report(self, task): + self.taskQueue.append('task') + print('rec') + time.sleep(2) + print(self.taskQueue) + + + # this is a thread to process task(or a instance) + def task_processor(self,task): + # call the rpc to call a function in worker + # create container -> execute task + # (one instance or multiple instances) + # retry when failed + print('processing %s' % task) + + + # this is a thread to schdule the tasks + def task_scheduler(self): + try: + task = self.taskQueue.pop(0) + except: + task = None + return task + + + # user: username + # task: a json string + # save the task information into database + # called when jobmgr assign task to taskmgr + def add_task(self,user,task): + pass + + + # user: username + # jobid: the id of job + # taskid: the id of task + # get the information of a task, including the status, task description and other information + def get_task(self, user, jobid, taskid): + pass diff --git a/src/protos/taskmgr.proto b/src/protos/taskmgr.proto new file mode 100644 index 0000000..5d62753 --- /dev/null +++ b/src/protos/taskmgr.proto @@ -0,0 +1,20 @@ +syntax = "proto3"; + +service TaskReporter { + rpc report (Task) returns (Reply) {}; +} + +message Task { + int32 id = 1; + TaskStatus taskStatus = 2 [default = RUNNING]; + + enum TaskStatus { + RUNNING = 0; + FAILED = 1; + TIMEOUT = 2; + } +} + +message Reply { + string message = 1; +} diff --git a/src/protos/taskmgr_pb2.py b/src/protos/taskmgr_pb2.py new file mode 100644 index 0000000..fe9f0d5 --- /dev/null +++ b/src/protos/taskmgr_pb2.py @@ -0,0 +1,168 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
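
# A minimal sketch of the worker side of this report channel, built on the generated stub in
# protos/taskmgr_pb2_grpc.py; only the port 50051 comes from serve() above, while the master
# address and the task id are assumptions:

import grpc
from protos.taskmgr_pb2 import Task
from protos.taskmgr_pb2_grpc import TaskReporterStub

def report_once(master_addr='master-node:50051'):
    # open a channel to the TaskMgr gRPC server and send one heart-beat style report
    channel = grpc.insecure_channel(master_addr)
    stub = TaskReporterStub(channel)
    reply = stub.report(Task(id=1, taskStatus=Task.RUNNING))
    return reply.message   # 'received' when the master accepts the report
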
+# source: protos/taskmgr.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='protos/taskmgr.proto', + package='', + syntax='proto3', + serialized_pb=_b('\n\x14protos/taskmgr.proto\"l\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\x05\x12$\n\ntaskStatus\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\"2\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07TIMEOUT\x10\x02\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\t2)\n\x0cTaskReporter\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') +) + + + +_TASK_TASKSTATUS = _descriptor.EnumDescriptor( + name='TaskStatus', + full_name='Task.TaskStatus', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='RUNNING', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FAILED', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='TIMEOUT', index=2, number=2, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=82, + serialized_end=132, +) +_sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) + + +_TASK = _descriptor.Descriptor( + name='Task', + full_name='Task', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='id', full_name='Task.id', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='taskStatus', full_name='Task.taskStatus', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _TASK_TASKSTATUS, + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=24, + serialized_end=132, +) + + +_REPLY = _descriptor.Descriptor( + name='Reply', + full_name='Reply', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='message', full_name='Reply.message', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=134, + serialized_end=158, +) + +_TASK.fields_by_name['taskStatus'].enum_type = _TASK_TASKSTATUS +_TASK_TASKSTATUS.containing_type = _TASK +DESCRIPTOR.message_types_by_name['Task'] = _TASK +DESCRIPTOR.message_types_by_name['Reply'] = _REPLY +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +Task = 
_reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( + DESCRIPTOR = _TASK, + __module__ = 'protos.taskmgr_pb2' + # @@protoc_insertion_point(class_scope:Task) + )) +_sym_db.RegisterMessage(Task) + +Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( + DESCRIPTOR = _REPLY, + __module__ = 'protos.taskmgr_pb2' + # @@protoc_insertion_point(class_scope:Reply) + )) +_sym_db.RegisterMessage(Reply) + + + +_TASKREPORTER = _descriptor.ServiceDescriptor( + name='TaskReporter', + full_name='TaskReporter', + file=DESCRIPTOR, + index=0, + options=None, + serialized_start=160, + serialized_end=201, + methods=[ + _descriptor.MethodDescriptor( + name='report', + full_name='TaskReporter.report', + index=0, + containing_service=None, + input_type=_TASK, + output_type=_REPLY, + options=None, + ), +]) +_sym_db.RegisterServiceDescriptor(_TASKREPORTER) + +DESCRIPTOR.services_by_name['TaskReporter'] = _TASKREPORTER + +# @@protoc_insertion_point(module_scope) diff --git a/src/protos/taskmgr_pb2_grpc.py b/src/protos/taskmgr_pb2_grpc.py new file mode 100644 index 0000000..4e5722d --- /dev/null +++ b/src/protos/taskmgr_pb2_grpc.py @@ -0,0 +1,46 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +import grpc + +from protos import taskmgr_pb2 as protos_dot_taskmgr__pb2 + + +class TaskReporterStub(object): + # missing associated documentation comment in .proto file + pass + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.report = channel.unary_unary( + '/TaskReporter/report', + request_serializer=protos_dot_taskmgr__pb2.Task.SerializeToString, + response_deserializer=protos_dot_taskmgr__pb2.Reply.FromString, + ) + + +class TaskReporterServicer(object): + # missing associated documentation comment in .proto file + pass + + def report(self, request, context): + # missing associated documentation comment in .proto file + pass + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_TaskReporterServicer_to_server(servicer, server): + rpc_method_handlers = { + 'report': grpc.unary_unary_rpc_method_handler( + servicer.report, + request_deserializer=protos_dot_taskmgr__pb2.Task.FromString, + response_serializer=protos_dot_taskmgr__pb2.Reply.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'TaskReporter', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) diff --git a/src/worker/taskmgr.py b/src/worker/taskmgr.py deleted file mode 100644 index 27f5921..0000000 --- a/src/worker/taskmgr.py +++ /dev/null @@ -1,41 +0,0 @@ -class TaskMgr(object): - - # task: a json string - # this is a thread to process task(or a instance) - def task_processor(self,task): - # call the rpc to call a function in worker - # create container -> execute task - # (one instance or multiple instances) - # retry when failed - pass - - # this is a thread to schdule the tasks - def task_scheduler(self): - # choose a task from queue, create a task processor for it - pass - - # user: username - # task: a json string - # save the task information into database - def add_task(self,user,task): - pass - - # user: username - # jobid: the id of job - # taskid: the id of task - # get the information of a task, including the status, task description and other information - def get_task(self, user, jobid, taskid): - pass - - # task: a json string - # this is a rpc function 
for worker, task processor call this function to execute a task in a worker - @staticmethod - def execute_task(self,task): - return - - - # load task information from etcd - # initial a task queue and task schedueler - # taskmgr: a taskmgr instance - def __init__(self): - pass From 660d003fbce33e33fdfe94eff8f74fc4207dfbad Mon Sep 17 00:00:00 2001 From: iteratorlee <1400012951@pku.edu.cn> Date: Mon, 16 Jul 2018 20:27:35 +0800 Subject: [PATCH 06/75] fix the path of taskmgr --- src/master/httprest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/master/httprest.py b/src/master/httprest.py index 0b4f7e7..9250ed6 100755 --- a/src/master/httprest.py +++ b/src/master/httprest.py @@ -27,11 +27,10 @@ import http.server, cgi, json, sys, shutil, traceback import xmlrpc.client from socketserver import ThreadingMixIn from utils import etcdlib, imagemgr -from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr, jobmgr +from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr, jobmgr, taskmgr from utils.logs import logs from master import userManager, beansapplicationmgr, monitor, sysmgr, network from worker.monitor import History_Manager -from worker import taskmgr import threading import requests from utils.nettools import portcontrol From 96890bbad0511750daa1f3f133255780e8362b10 Mon Sep 17 00:00:00 2001 From: Gallen Date: Tue, 17 Jul 2018 13:46:34 +0800 Subject: [PATCH 07/75] update rpc proto --- src/protos/rpc.proto | 79 ++++ src/protos/rpc_pb2.py | 686 +++++++++++++++++++++++++++++++++ src/protos/rpc_pb2_grpc.py | 88 +++++ src/protos/taskmgr.proto | 20 - src/protos/taskmgr_pb2.py | 168 -------- src/protos/taskmgr_pb2_grpc.py | 46 --- 6 files changed, 853 insertions(+), 234 deletions(-) create mode 100644 src/protos/rpc.proto create mode 100644 src/protos/rpc_pb2.py create mode 100644 src/protos/rpc_pb2_grpc.py delete mode 100644 src/protos/taskmgr.proto delete mode 100644 src/protos/taskmgr_pb2.py delete mode 100644 src/protos/taskmgr_pb2_grpc.py diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto new file mode 100644 index 0000000..fbe8560 --- /dev/null +++ b/src/protos/rpc.proto @@ -0,0 +1,79 @@ +syntax = "proto3"; + +service Master { + rpc report (Report) returns (Reply) {}; +} + +service Worker { + rpc add_task (Task) returns (Reply) {} +} + +message Report { + int32 taskId = 1; // 任务 id + TaskStatus taskStatus = 2; // 任务状态 + + enum TaskStatus { + RUNNING = 0; + COMPLETED = 1; + FAILED = 2; + TIMEOUT = 3; + } +} + +message Reply { + ReplyStatus message = 1; // 返回值 + + enum ReplyStatus { + ACCEPTED = 0; + REFUSED = 1; + } +} + +message Task { + int32 instanceCount = 1; // 实例个数 + int32 maxRetryCount = 2; // 最大重试次数 + Parameters parameters = 3; // 参数 + Cluster cluster = 4; // 集群配置 + int32 Timeout = 5; // 超时阈值 +} + +message Parameters { + Command command = 1; // 命令配置 + string stderrRedirectPath = 2; // 错误输出重定向 + string stdoutRedirectPath = 3; // 标准输出重定向 +} + +message Command { + string commandLine = 1; // 命令 + string packagePath = 2; // 工作路径 + map envVars = 3; // 自定义环境变量 +} + +message Cluster { + Image image = 1; // 镜像配置 + Instance instance = 2; // 实例配置 + repeated Mount mount = 3; // 挂载配置 +} + +message Image { + string name = 1; // 镜像名 + ImageType type = 2; // 镜像类型(public/private) + string owner = 3; // 所有者 + + enum ImageType { + PUBLIC = 0; + PRIVATE = 1; + } +} + +message Mount { + string localPath = 1; // 本地路径 + string remotePath = 2; // 远程路径 +} + +message Instance { + int32 cpu = 1; // CPU,单位 个? 
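
# A sketch of how the master might assemble a Task from the messages in this .proto (the
# remaining Instance fields follow just below) and hand it to a worker through the Worker
# service; the worker address, port and all field values are illustrative assumptions:

import grpc
from protos.rpc_pb2 import Task, Parameters, Command, Cluster, Image, Mount, Instance
from protos.rpc_pb2_grpc import WorkerStub

def submit_example_task(worker_addr='worker-node:50052'):
    task = Task(
        instanceCount=1,
        maxRetryCount=3,
        Timeout=3600,
        parameters=Parameters(
            command=Command(commandLine='python3 run.py',
                            packagePath='/root',
                            envVars={'BATCH_JOB': 'demo'}),
            stdoutRedirectPath='/root/stdout.txt',
            stderrRedirectPath='/root/stderr.txt'),
        cluster=Cluster(
            image=Image(name='base', type=Image.PUBLIC, owner='docklet'),
            instance=Instance(cpu=1, memory=1024, disk=1024, gpu=0),
            mount=[Mount(localPath='/nfs/data', remotePath='/data')]))
    stub = WorkerStub(grpc.insecure_channel(worker_addr))
    return stub.add_task(task)   # returns a Reply carrying ACCEPTED or REFUSED
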
+ int32 memory = 2; // 内存,单位 mb + int32 disk = 3; // 磁盘,单位 mb + int32 gpu = 4; // 显卡,单位 个 +} \ No newline at end of file diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py new file mode 100644 index 0000000..1ee63cf --- /dev/null +++ b/src/protos/rpc_pb2.py @@ -0,0 +1,686 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: protos/rpc.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='protos/rpc.proto', + package='', + syntax='proto3', + serialized_pb=_b('\n\x10protos/rpc.proto\"\x83\x01\n\x06Report\x12\x0e\n\x06taskId\x18\x01 \x01(\x05\x12&\n\ntaskStatus\x18\x02 \x01(\x0e\x32\x12.Report.TaskStatus\"A\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\r\n\tCOMPLETED\x10\x01\x12\n\n\x06\x46\x41ILED\x10\x02\x12\x0b\n\x07TIMEOUT\x10\x03\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\x81\x01\n\x04Task\x12\x15\n\rinstanceCount\x18\x01 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x02 \x01(\x05\x12\x1f\n\nparameters\x18\x03 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x04 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x05 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32%\n\x06Master\x12\x1b\n\x06report\x12\x07.Report\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') +) + + + +_REPORT_TASKSTATUS = _descriptor.EnumDescriptor( + name='TaskStatus', + full_name='Report.TaskStatus', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='RUNNING', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='COMPLETED', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FAILED', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='TIMEOUT', index=3, number=3, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=87, + 
serialized_end=152, +) +_sym_db.RegisterEnumDescriptor(_REPORT_TASKSTATUS) + +_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( + name='ReplyStatus', + full_name='Reply.ReplyStatus', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='ACCEPTED', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='REFUSED', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=200, + serialized_end=240, +) +_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) + +_IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( + name='ImageType', + full_name='Image.ImageType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PUBLIC', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PRIVATE', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=769, + serialized_end=805, +) +_sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) + + +_REPORT = _descriptor.Descriptor( + name='Report', + full_name='Report', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='taskId', full_name='Report.taskId', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='taskStatus', full_name='Report.taskStatus', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _REPORT_TASKSTATUS, + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=21, + serialized_end=152, +) + + +_REPLY = _descriptor.Descriptor( + name='Reply', + full_name='Reply', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='message', full_name='Reply.message', index=0, + number=1, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _REPLY_REPLYSTATUS, + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=154, + serialized_end=240, +) + + +_TASK = _descriptor.Descriptor( + name='Task', + full_name='Task', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='instanceCount', full_name='Task.instanceCount', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='maxRetryCount', full_name='Task.maxRetryCount', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + 
_descriptor.FieldDescriptor( + name='parameters', full_name='Task.parameters', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='cluster', full_name='Task.cluster', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='Timeout', full_name='Task.Timeout', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=243, + serialized_end=372, +) + + +_PARAMETERS = _descriptor.Descriptor( + name='Parameters', + full_name='Parameters', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='command', full_name='Parameters.command', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='stderrRedirectPath', full_name='Parameters.stderrRedirectPath', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='stdoutRedirectPath', full_name='Parameters.stdoutRedirectPath', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=374, + serialized_end=469, +) + + +_COMMAND_ENVVARSENTRY = _descriptor.Descriptor( + name='EnvVarsEntry', + full_name='Command.EnvVarsEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='Command.EnvVarsEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='Command.EnvVarsEntry.value', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + 
is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=565, + serialized_end=611, +) + +_COMMAND = _descriptor.Descriptor( + name='Command', + full_name='Command', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='commandLine', full_name='Command.commandLine', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='packagePath', full_name='Command.packagePath', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='envVars', full_name='Command.envVars', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_COMMAND_ENVVARSENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=472, + serialized_end=611, +) + + +_CLUSTER = _descriptor.Descriptor( + name='Cluster', + full_name='Cluster', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='image', full_name='Cluster.image', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='instance', full_name='Cluster.instance', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='mount', full_name='Cluster.mount', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=613, + serialized_end=697, +) + + +_IMAGE = _descriptor.Descriptor( + name='Image', + full_name='Image', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='Image.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='type', full_name='Image.type', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, 
file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='owner', full_name='Image.owner', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _IMAGE_IMAGETYPE, + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=699, + serialized_end=805, +) + + +_MOUNT = _descriptor.Descriptor( + name='Mount', + full_name='Mount', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='localPath', full_name='Mount.localPath', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='remotePath', full_name='Mount.remotePath', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=807, + serialized_end=853, +) + + +_INSTANCE = _descriptor.Descriptor( + name='Instance', + full_name='Instance', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cpu', full_name='Instance.cpu', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='memory', full_name='Instance.memory', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='disk', full_name='Instance.disk', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='gpu', full_name='Instance.gpu', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=855, + serialized_end=921, +) + +_REPORT.fields_by_name['taskStatus'].enum_type = _REPORT_TASKSTATUS +_REPORT_TASKSTATUS.containing_type = _REPORT +_REPLY.fields_by_name['message'].enum_type = _REPLY_REPLYSTATUS +_REPLY_REPLYSTATUS.containing_type = _REPLY +_TASK.fields_by_name['parameters'].message_type = _PARAMETERS +_TASK.fields_by_name['cluster'].message_type = _CLUSTER 
+_PARAMETERS.fields_by_name['command'].message_type = _COMMAND +_COMMAND_ENVVARSENTRY.containing_type = _COMMAND +_COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY +_CLUSTER.fields_by_name['image'].message_type = _IMAGE +_CLUSTER.fields_by_name['instance'].message_type = _INSTANCE +_CLUSTER.fields_by_name['mount'].message_type = _MOUNT +_IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE +_IMAGE_IMAGETYPE.containing_type = _IMAGE +DESCRIPTOR.message_types_by_name['Report'] = _REPORT +DESCRIPTOR.message_types_by_name['Reply'] = _REPLY +DESCRIPTOR.message_types_by_name['Task'] = _TASK +DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS +DESCRIPTOR.message_types_by_name['Command'] = _COMMAND +DESCRIPTOR.message_types_by_name['Cluster'] = _CLUSTER +DESCRIPTOR.message_types_by_name['Image'] = _IMAGE +DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT +DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +Report = _reflection.GeneratedProtocolMessageType('Report', (_message.Message,), dict( + DESCRIPTOR = _REPORT, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Report) + )) +_sym_db.RegisterMessage(Report) + +Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( + DESCRIPTOR = _REPLY, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Reply) + )) +_sym_db.RegisterMessage(Reply) + +Task = _reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( + DESCRIPTOR = _TASK, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Task) + )) +_sym_db.RegisterMessage(Task) + +Parameters = _reflection.GeneratedProtocolMessageType('Parameters', (_message.Message,), dict( + DESCRIPTOR = _PARAMETERS, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Parameters) + )) +_sym_db.RegisterMessage(Parameters) + +Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message,), dict( + + EnvVarsEntry = _reflection.GeneratedProtocolMessageType('EnvVarsEntry', (_message.Message,), dict( + DESCRIPTOR = _COMMAND_ENVVARSENTRY, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Command.EnvVarsEntry) + )) + , + DESCRIPTOR = _COMMAND, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Command) + )) +_sym_db.RegisterMessage(Command) +_sym_db.RegisterMessage(Command.EnvVarsEntry) + +Cluster = _reflection.GeneratedProtocolMessageType('Cluster', (_message.Message,), dict( + DESCRIPTOR = _CLUSTER, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Cluster) + )) +_sym_db.RegisterMessage(Cluster) + +Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict( + DESCRIPTOR = _IMAGE, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Image) + )) +_sym_db.RegisterMessage(Image) + +Mount = _reflection.GeneratedProtocolMessageType('Mount', (_message.Message,), dict( + DESCRIPTOR = _MOUNT, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Mount) + )) +_sym_db.RegisterMessage(Mount) + +Instance = _reflection.GeneratedProtocolMessageType('Instance', (_message.Message,), dict( + DESCRIPTOR = _INSTANCE, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:Instance) + )) +_sym_db.RegisterMessage(Instance) + + +_COMMAND_ENVVARSENTRY.has_options = True +_COMMAND_ENVVARSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) 
+ +_MASTER = _descriptor.ServiceDescriptor( + name='Master', + full_name='Master', + file=DESCRIPTOR, + index=0, + options=None, + serialized_start=923, + serialized_end=960, + methods=[ + _descriptor.MethodDescriptor( + name='report', + full_name='Master.report', + index=0, + containing_service=None, + input_type=_REPORT, + output_type=_REPLY, + options=None, + ), +]) +_sym_db.RegisterServiceDescriptor(_MASTER) + +DESCRIPTOR.services_by_name['Master'] = _MASTER + + +_WORKER = _descriptor.ServiceDescriptor( + name='Worker', + full_name='Worker', + file=DESCRIPTOR, + index=1, + options=None, + serialized_start=962, + serialized_end=999, + methods=[ + _descriptor.MethodDescriptor( + name='add_task', + full_name='Worker.add_task', + index=0, + containing_service=None, + input_type=_TASK, + output_type=_REPLY, + options=None, + ), +]) +_sym_db.RegisterServiceDescriptor(_WORKER) + +DESCRIPTOR.services_by_name['Worker'] = _WORKER + +# @@protoc_insertion_point(module_scope) diff --git a/src/protos/rpc_pb2_grpc.py b/src/protos/rpc_pb2_grpc.py new file mode 100644 index 0000000..55bd6b1 --- /dev/null +++ b/src/protos/rpc_pb2_grpc.py @@ -0,0 +1,88 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +import grpc + +from protos import rpc_pb2 as protos_dot_rpc__pb2 + + +class MasterStub(object): + # missing associated documentation comment in .proto file + pass + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.report = channel.unary_unary( + '/Master/report', + request_serializer=protos_dot_rpc__pb2.Report.SerializeToString, + response_deserializer=protos_dot_rpc__pb2.Reply.FromString, + ) + + +class MasterServicer(object): + # missing associated documentation comment in .proto file + pass + + def report(self, request, context): + # missing associated documentation comment in .proto file + pass + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_MasterServicer_to_server(servicer, server): + rpc_method_handlers = { + 'report': grpc.unary_unary_rpc_method_handler( + servicer.report, + request_deserializer=protos_dot_rpc__pb2.Report.FromString, + response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'Master', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +class WorkerStub(object): + # missing associated documentation comment in .proto file + pass + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.add_task = channel.unary_unary( + '/Worker/add_task', + request_serializer=protos_dot_rpc__pb2.Task.SerializeToString, + response_deserializer=protos_dot_rpc__pb2.Reply.FromString, + ) + + +class WorkerServicer(object): + # missing associated documentation comment in .proto file + pass + + def add_task(self, request, context): + # missing associated documentation comment in .proto file + pass + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_WorkerServicer_to_server(servicer, server): + rpc_method_handlers = { + 'add_task': grpc.unary_unary_rpc_method_handler( + servicer.add_task, + request_deserializer=protos_dot_rpc__pb2.Task.FromString, + response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'Worker', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) diff --git a/src/protos/taskmgr.proto b/src/protos/taskmgr.proto deleted file mode 100644 index 5d62753..0000000 --- a/src/protos/taskmgr.proto +++ /dev/null @@ -1,20 +0,0 @@ -syntax = "proto3"; - -service TaskReporter { - rpc report (Task) returns (Reply) {}; -} - -message Task { - int32 id = 1; - TaskStatus taskStatus = 2 [default = RUNNING]; - - enum TaskStatus { - RUNNING = 0; - FAILED = 1; - TIMEOUT = 2; - } -} - -message Reply { - string message = 1; -} diff --git a/src/protos/taskmgr_pb2.py b/src/protos/taskmgr_pb2.py deleted file mode 100644 index fe9f0d5..0000000 --- a/src/protos/taskmgr_pb2.py +++ /dev/null @@ -1,168 +0,0 @@ -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: protos/taskmgr.proto - -import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -from google.protobuf import descriptor_pb2 -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='protos/taskmgr.proto', - package='', - syntax='proto3', - serialized_pb=_b('\n\x14protos/taskmgr.proto\"l\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\x05\x12$\n\ntaskStatus\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\"2\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07TIMEOUT\x10\x02\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\t2)\n\x0cTaskReporter\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') -) - - - -_TASK_TASKSTATUS = _descriptor.EnumDescriptor( - name='TaskStatus', - full_name='Task.TaskStatus', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='RUNNING', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FAILED', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TIMEOUT', index=2, number=2, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=82, - serialized_end=132, -) -_sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) - - -_TASK = _descriptor.Descriptor( - name='Task', - full_name='Task', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='id', full_name='Task.id', index=0, - number=1, type=5, cpp_type=1, label=1, - 
has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='taskStatus', full_name='Task.taskStatus', index=1, - number=2, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - _TASK_TASKSTATUS, - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=24, - serialized_end=132, -) - - -_REPLY = _descriptor.Descriptor( - name='Reply', - full_name='Reply', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='message', full_name='Reply.message', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=134, - serialized_end=158, -) - -_TASK.fields_by_name['taskStatus'].enum_type = _TASK_TASKSTATUS -_TASK_TASKSTATUS.containing_type = _TASK -DESCRIPTOR.message_types_by_name['Task'] = _TASK -DESCRIPTOR.message_types_by_name['Reply'] = _REPLY -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -Task = _reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( - DESCRIPTOR = _TASK, - __module__ = 'protos.taskmgr_pb2' - # @@protoc_insertion_point(class_scope:Task) - )) -_sym_db.RegisterMessage(Task) - -Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( - DESCRIPTOR = _REPLY, - __module__ = 'protos.taskmgr_pb2' - # @@protoc_insertion_point(class_scope:Reply) - )) -_sym_db.RegisterMessage(Reply) - - - -_TASKREPORTER = _descriptor.ServiceDescriptor( - name='TaskReporter', - full_name='TaskReporter', - file=DESCRIPTOR, - index=0, - options=None, - serialized_start=160, - serialized_end=201, - methods=[ - _descriptor.MethodDescriptor( - name='report', - full_name='TaskReporter.report', - index=0, - containing_service=None, - input_type=_TASK, - output_type=_REPLY, - options=None, - ), -]) -_sym_db.RegisterServiceDescriptor(_TASKREPORTER) - -DESCRIPTOR.services_by_name['TaskReporter'] = _TASKREPORTER - -# @@protoc_insertion_point(module_scope) diff --git a/src/protos/taskmgr_pb2_grpc.py b/src/protos/taskmgr_pb2_grpc.py deleted file mode 100644 index 4e5722d..0000000 --- a/src/protos/taskmgr_pb2_grpc.py +++ /dev/null @@ -1,46 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -import grpc - -from protos import taskmgr_pb2 as protos_dot_taskmgr__pb2 - - -class TaskReporterStub(object): - # missing associated documentation comment in .proto file - pass - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.report = channel.unary_unary( - '/TaskReporter/report', - request_serializer=protos_dot_taskmgr__pb2.Task.SerializeToString, - response_deserializer=protos_dot_taskmgr__pb2.Reply.FromString, - ) - - -class TaskReporterServicer(object): - # missing associated documentation comment in .proto file - pass - - def report(self, request, context): - # missing associated documentation comment in .proto file - pass - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - -def add_TaskReporterServicer_to_server(servicer, server): - rpc_method_handlers = { - 'report': grpc.unary_unary_rpc_method_handler( - servicer.report, - request_deserializer=protos_dot_taskmgr__pb2.Task.FromString, - response_serializer=protos_dot_taskmgr__pb2.Reply.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - 'TaskReporter', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) From b24051bffa5751c7b86e63efffb9995320e2c87d Mon Sep 17 00:00:00 2001 From: Gallen Date: Tue, 17 Jul 2018 14:21:34 +0800 Subject: [PATCH 08/75] modify rpc proto --- src/master/taskmgr.py | 37 ++++--- src/protos/rpc.proto | 33 +++---- src/protos/rpc_pb2.py | 192 ++++++++++++++++--------------------- src/protos/rpc_pb2_grpc.py | 4 +- 4 files changed, 120 insertions(+), 146 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 9312fe8..1bcee81 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -1,19 +1,25 @@ import threading import time +# must import logger after initlogging, ugly +from utils.log import initlogging +initlogging("docklet-taskmgr") +from utils.log import logger + +# grpc from concurrent import futures import grpc -from protos.taskmgr_pb2 import Task, Reply -from protos.taskmgr_pb2_grpc import TaskReporterServicer, add_TaskReporterServicer_to_server +from protos.rpc_pb2 import Task, Reply +from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server -class TaskReport(TaskReporterServicer): +class TaskReporter(MasterServicer): def __init__(self, taskmgr): self.taskmgr = taskmgr def report(self, request, context): self.taskmgr.on_task_report(request) - return Reply(message='received') + return Reply(message=Reply.ACCEPTED) class TaskMgr(threading.Thread): @@ -32,12 +38,12 @@ class TaskMgr(threading.Thread): task = self.task_scheduler() if task is not None: self.task_processor(task) - time.sleep(1) + time.sleep(2) def serve(self): self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - add_TaskReporterServicer_to_server(TaskReport(self), self.server) + add_MasterServicer_to_server(TaskReporter(self), self.server) self.server.add_insecure_port('[::]:50051') self.server.start() @@ -49,10 +55,15 @@ class TaskMgr(threading.Thread): # this method is called when worker send heart-beat rpc request def on_task_report(self, task): - self.taskQueue.append('task') - print('rec') - time.sleep(2) - print(self.taskQueue) + logger.info('receive task report: id %d, status %d' % (task.id, task.status)) + if task.status == Task.RUNNING: + pass + elif task.status == Task.COMPLETED: + # tell jobmgr + pass + elif task.status == Task.FAILED || task.status == Task.TIMEOUT: + # retry + pass # this is a thread to process task(or a instance) @@ -77,13 +88,11 @@ class TaskMgr(threading.Thread): # task: a json string # save the task information into database # called when jobmgr assign task to taskmgr - def 
add_task(self,user,task): + def add_task(self, task): pass # user: username - # jobid: the id of job - # taskid: the id of task # get the information of a task, including the status, task description and other information - def get_task(self, user, jobid, taskid): + def get_task(self, taskid): pass diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index fbe8560..4bcf62d 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -1,25 +1,13 @@ syntax = "proto3"; service Master { - rpc report (Report) returns (Reply) {}; + rpc report (Task) returns (Reply) {}; } service Worker { rpc add_task (Task) returns (Reply) {} } -message Report { - int32 taskId = 1; // 任务 id - TaskStatus taskStatus = 2; // 任务状态 - - enum TaskStatus { - RUNNING = 0; - COMPLETED = 1; - FAILED = 2; - TIMEOUT = 3; - } -} - message Reply { ReplyStatus message = 1; // 返回值 @@ -30,11 +18,20 @@ message Reply { } message Task { - int32 instanceCount = 1; // 实例个数 - int32 maxRetryCount = 2; // 最大重试次数 - Parameters parameters = 3; // 参数 - Cluster cluster = 4; // 集群配置 - int32 Timeout = 5; // 超时阈值 + string id = 1; + TaskStatus status = 2; // 任务状态 + int32 instanceCount = 3; // 实例个数 + int32 maxRetryCount = 4; // 最大重试次数 + Parameters parameters = 5; // 参数 + Cluster cluster = 6; // 集群配置 + int32 Timeout = 7; // 超时阈值 + + enum TaskStatus { + RUNNING = 0; + COMPLETED = 1; + FAILED = 2; + TIMEOUT = 3; + } } message Parameters { diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 1ee63cf..aa8a69c 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -19,14 +19,36 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='protos/rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\x10protos/rpc.proto\"\x83\x01\n\x06Report\x12\x0e\n\x06taskId\x18\x01 \x01(\x05\x12&\n\ntaskStatus\x18\x02 \x01(\x0e\x32\x12.Report.TaskStatus\"A\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\r\n\tCOMPLETED\x10\x01\x12\n\n\x06\x46\x41ILED\x10\x02\x12\x0b\n\x07TIMEOUT\x10\x03\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\x81\x01\n\x04Task\x12\x15\n\rinstanceCount\x18\x01 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x02 \x01(\x05\x12\x1f\n\nparameters\x18\x03 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x04 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x05 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 
\x01(\x05\x32%\n\x06Master\x12\x1b\n\x06report\x12\x07.Report\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\x10protos/rpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xf2\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 \x01(\x05\"A\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\r\n\tCOMPLETED\x10\x01\x12\n\n\x06\x46\x41ILED\x10\x02\x12\x0b\n\x07TIMEOUT\x10\x03\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') ) -_REPORT_TASKSTATUS = _descriptor.EnumDescriptor( +_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( + name='ReplyStatus', + full_name='Reply.ReplyStatus', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='ACCEPTED', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='REFUSED', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=66, + serialized_end=106, +) +_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) + +_TASK_TASKSTATUS = _descriptor.EnumDescriptor( name='TaskStatus', - full_name='Report.TaskStatus', + full_name='Task.TaskStatus', filename=None, file=DESCRIPTOR, values=[ @@ -49,32 +71,10 @@ _REPORT_TASKSTATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=87, - serialized_end=152, + serialized_start=286, + serialized_end=351, ) -_sym_db.RegisterEnumDescriptor(_REPORT_TASKSTATUS) - -_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( - name='ReplyStatus', - full_name='Reply.ReplyStatus', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='ACCEPTED', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='REFUSED', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=200, - 
serialized_end=240, -) -_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) +_sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( name='ImageType', @@ -93,51 +93,12 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=769, - serialized_end=805, + serialized_start=748, + serialized_end=784, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) -_REPORT = _descriptor.Descriptor( - name='Report', - full_name='Report', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='taskId', full_name='Report.taskId', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='taskStatus', full_name='Report.taskStatus', index=1, - number=2, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - _REPORT_TASKSTATUS, - ], - options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=21, - serialized_end=152, -) - - _REPLY = _descriptor.Descriptor( name='Reply', full_name='Reply', @@ -165,8 +126,8 @@ _REPLY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=154, - serialized_end=240, + serialized_start=20, + serialized_end=106, ) @@ -178,36 +139,50 @@ _TASK = _descriptor.Descriptor( containing_type=None, fields=[ _descriptor.FieldDescriptor( - name='instanceCount', full_name='Task.instanceCount', index=0, - number=1, type=5, cpp_type=1, label=1, + name='id', full_name='Task.id', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='status', full_name='Task.status', index=1, + number=2, type=14, cpp_type=8, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='maxRetryCount', full_name='Task.maxRetryCount', index=1, - number=2, type=5, cpp_type=1, label=1, + name='instanceCount', full_name='Task.instanceCount', index=2, + number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='parameters', full_name='Task.parameters', index=2, - number=3, type=11, cpp_type=10, label=1, + name='maxRetryCount', full_name='Task.maxRetryCount', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='parameters', full_name='Task.parameters', index=4, + number=5, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, 
is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='cluster', full_name='Task.cluster', index=3, - number=4, type=11, cpp_type=10, label=1, + name='cluster', full_name='Task.cluster', index=5, + number=6, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='Timeout', full_name='Task.Timeout', index=4, - number=5, type=5, cpp_type=1, label=1, + name='Timeout', full_name='Task.Timeout', index=6, + number=7, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -217,6 +192,7 @@ _TASK = _descriptor.Descriptor( ], nested_types=[], enum_types=[ + _TASK_TASKSTATUS, ], options=None, is_extendable=False, @@ -224,8 +200,8 @@ _TASK = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=243, - serialized_end=372, + serialized_start=109, + serialized_end=351, ) @@ -269,8 +245,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=374, - serialized_end=469, + serialized_start=353, + serialized_end=448, ) @@ -307,8 +283,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=565, - serialized_end=611, + serialized_start=544, + serialized_end=590, ) _COMMAND = _descriptor.Descriptor( @@ -351,8 +327,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=472, - serialized_end=611, + serialized_start=451, + serialized_end=590, ) @@ -396,8 +372,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=613, - serialized_end=697, + serialized_start=592, + serialized_end=676, ) @@ -442,8 +418,8 @@ _IMAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=699, - serialized_end=805, + serialized_start=678, + serialized_end=784, ) @@ -480,8 +456,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=807, - serialized_end=853, + serialized_start=786, + serialized_end=832, ) @@ -532,16 +508,16 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=855, - serialized_end=921, + serialized_start=834, + serialized_end=900, ) -_REPORT.fields_by_name['taskStatus'].enum_type = _REPORT_TASKSTATUS -_REPORT_TASKSTATUS.containing_type = _REPORT _REPLY.fields_by_name['message'].enum_type = _REPLY_REPLYSTATUS _REPLY_REPLYSTATUS.containing_type = _REPLY +_TASK.fields_by_name['status'].enum_type = _TASK_TASKSTATUS _TASK.fields_by_name['parameters'].message_type = _PARAMETERS _TASK.fields_by_name['cluster'].message_type = _CLUSTER +_TASK_TASKSTATUS.containing_type = _TASK _PARAMETERS.fields_by_name['command'].message_type = _COMMAND _COMMAND_ENVVARSENTRY.containing_type = _COMMAND _COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY @@ -550,7 +526,6 @@ _CLUSTER.fields_by_name['instance'].message_type = _INSTANCE _CLUSTER.fields_by_name['mount'].message_type = _MOUNT _IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE _IMAGE_IMAGETYPE.containing_type = _IMAGE -DESCRIPTOR.message_types_by_name['Report'] = _REPORT DESCRIPTOR.message_types_by_name['Reply'] = _REPLY DESCRIPTOR.message_types_by_name['Task'] = _TASK DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS 
@@ -561,13 +536,6 @@ DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -Report = _reflection.GeneratedProtocolMessageType('Report', (_message.Message,), dict( - DESCRIPTOR = _REPORT, - __module__ = 'protos.rpc_pb2' - # @@protoc_insertion_point(class_scope:Report) - )) -_sym_db.RegisterMessage(Report) - Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( DESCRIPTOR = _REPLY, __module__ = 'protos.rpc_pb2' @@ -642,15 +610,15 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=923, - serialized_end=960, + serialized_start=902, + serialized_end=937, methods=[ _descriptor.MethodDescriptor( name='report', full_name='Master.report', index=0, containing_service=None, - input_type=_REPORT, + input_type=_TASK, output_type=_REPLY, options=None, ), @@ -666,8 +634,8 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=962, - serialized_end=999, + serialized_start=939, + serialized_end=976, methods=[ _descriptor.MethodDescriptor( name='add_task', diff --git a/src/protos/rpc_pb2_grpc.py b/src/protos/rpc_pb2_grpc.py index 55bd6b1..e629219 100644 --- a/src/protos/rpc_pb2_grpc.py +++ b/src/protos/rpc_pb2_grpc.py @@ -16,7 +16,7 @@ class MasterStub(object): """ self.report = channel.unary_unary( '/Master/report', - request_serializer=protos_dot_rpc__pb2.Report.SerializeToString, + request_serializer=protos_dot_rpc__pb2.Task.SerializeToString, response_deserializer=protos_dot_rpc__pb2.Reply.FromString, ) @@ -37,7 +37,7 @@ def add_MasterServicer_to_server(servicer, server): rpc_method_handlers = { 'report': grpc.unary_unary_rpc_method_handler( servicer.report, - request_deserializer=protos_dot_rpc__pb2.Report.FromString, + request_deserializer=protos_dot_rpc__pb2.Task.FromString, response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, ), } From a670020e8db0158128d49f983c507d9dff3c7c88 Mon Sep 17 00:00:00 2001 From: Gallen Date: Tue, 17 Jul 2018 14:32:49 +0800 Subject: [PATCH 09/75] modify rpc proto --- src/master/taskmgr.py | 25 ++++++++++++++++--- src/protos/rpc.proto | 9 ++++--- src/protos/rpc_pb2.py | 58 +++++++++++++++++++++++-------------------- 3 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 1bcee81..6e23f3b 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -54,8 +54,14 @@ class TaskMgr(threading.Thread): # this method is called when worker send heart-beat rpc request - def on_task_report(self, task): - logger.info('receive task report: id %d, status %d' % (task.id, task.status)) + def on_task_report(self, report): + logger.info('[on_task_report] receive task report: id %d, status %d' % (report.id, report.status)) + task = get_task(report.id) + if task == None: + logger.error('[on_task_report] task not found') + return + + task.status = report.status if task.status == Task.RUNNING: pass elif task.status == Task.COMPLETED: @@ -63,7 +69,15 @@ class TaskMgr(threading.Thread): pass elif task.status == Task.FAILED || task.status == Task.TIMEOUT: # retry - pass + if task.maxRetryCount <= 0: + # tell jobmgr + pass + else: + # decrease max retry count & waiting for retry + task.maxRetryCount -= 1 + task.status = Task.WAITING + else: + logger.error('[on_task_report] receive report from waiting task') # this is a thread to process task(or a instance) @@ -95,4 +109,7 @@ class 
TaskMgr(threading.Thread): # user: username # get the information of a task, including the status, task description and other information def get_task(self, taskid): - pass + for task in self.taskQueue: + if task.id == taskid: + return task + return None diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index 4bcf62d..d1adc74 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -27,10 +27,11 @@ message Task { int32 Timeout = 7; // 超时阈值 enum TaskStatus { - RUNNING = 0; - COMPLETED = 1; - FAILED = 2; - TIMEOUT = 3; + WAITING = 0; + RUNNING = 1; + COMPLETED = 2; + FAILED = 3; + TIMEOUT = 4; } } diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index aa8a69c..4cc4ceb 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='protos/rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\x10protos/rpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xf2\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 \x01(\x05\"A\n\nTaskStatus\x12\x0b\n\x07RUNNING\x10\x00\x12\r\n\tCOMPLETED\x10\x01\x12\n\n\x06\x46\x41ILED\x10\x02\x12\x0b\n\x07TIMEOUT\x10\x03\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\x10protos/rpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xff\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 
\x01(\x05\"N\n\nTaskStatus\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') ) @@ -53,26 +53,30 @@ _TASK_TASKSTATUS = _descriptor.EnumDescriptor( file=DESCRIPTOR, values=[ _descriptor.EnumValueDescriptor( - name='RUNNING', index=0, number=0, + name='WAITING', index=0, number=0, options=None, type=None), _descriptor.EnumValueDescriptor( - name='COMPLETED', index=1, number=1, + name='RUNNING', index=1, number=1, options=None, type=None), _descriptor.EnumValueDescriptor( - name='FAILED', index=2, number=2, + name='COMPLETED', index=2, number=2, options=None, type=None), _descriptor.EnumValueDescriptor( - name='TIMEOUT', index=3, number=3, + name='FAILED', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='TIMEOUT', index=4, number=4, options=None, type=None), ], containing_type=None, options=None, serialized_start=286, - serialized_end=351, + serialized_end=364, ) _sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) @@ -93,8 +97,8 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=748, - serialized_end=784, + serialized_start=761, + serialized_end=797, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -201,7 +205,7 @@ _TASK = _descriptor.Descriptor( oneofs=[ ], serialized_start=109, - serialized_end=351, + serialized_end=364, ) @@ -245,8 +249,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=353, - serialized_end=448, + serialized_start=366, + serialized_end=461, ) @@ -283,8 +287,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=544, - serialized_end=590, + serialized_start=557, + serialized_end=603, ) _COMMAND = _descriptor.Descriptor( @@ -327,8 +331,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=451, - serialized_end=590, + serialized_start=464, + serialized_end=603, ) @@ -372,8 +376,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=592, - serialized_end=676, + serialized_start=605, + serialized_end=689, ) @@ -418,8 +422,8 @@ _IMAGE = _descriptor.Descriptor( 
extension_ranges=[], oneofs=[ ], - serialized_start=678, - serialized_end=784, + serialized_start=691, + serialized_end=797, ) @@ -456,8 +460,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=786, - serialized_end=832, + serialized_start=799, + serialized_end=845, ) @@ -508,8 +512,8 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=834, - serialized_end=900, + serialized_start=847, + serialized_end=913, ) _REPLY.fields_by_name['message'].enum_type = _REPLY_REPLYSTATUS @@ -610,8 +614,8 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=902, - serialized_end=937, + serialized_start=915, + serialized_end=950, methods=[ _descriptor.MethodDescriptor( name='report', @@ -634,8 +638,8 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=939, - serialized_end=976, + serialized_start=952, + serialized_end=989, methods=[ _descriptor.MethodDescriptor( name='add_task', From 33f2b4a5691b0a9044e0634010863091f4dc31a5 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Thu, 19 Jul 2018 14:29:25 +0800 Subject: [PATCH 10/75] add taskcontroller --- src/protos/rpc.proto | 5 +- src/worker/taskcontroller.py | 109 +++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 src/worker/taskcontroller.py diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index d1adc74..ee0d249 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -5,7 +5,8 @@ service Master { } service Worker { - rpc add_task (Task) returns (Reply) {} + rpc add_task (Task) returns (Reply) {} + rpc process_task (Task) returns (Reply) {} } message Reply { @@ -74,4 +75,4 @@ message Instance { int32 memory = 2; // 内存,单位 mb int32 disk = 3; // 磁盘,单位 mb int32 gpu = 4; // 显卡,单位 个 -} \ No newline at end of file +} diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py new file mode 100644 index 0000000..f0903b6 --- /dev/null +++ b/src/worker/taskcontroller.py @@ -0,0 +1,109 @@ +#!/usr/bin/python3 + +import xmlrpc.client +from log import logger +import env +import json,lxc,subprocess,threading,os +import imagemgr + +class TaskController(object): + + def __init__(self): + self.imgmgr = imagemgr.ImageMgr() + self.fspath = env.getenv('FS_PREFIX') + self.confpath = env.getenv('DOCKLET_CONF') + self.masterip = '162.105.88.190' + self.masterport = 9002 + self.masterrpc = xmlrpc.client.ServerProxy("http://%s:%s" % (self.masterip,self.masterport)) + logger.info('TaskController init success') + + def process_task(self, parameter): + logger.info('excute task with parameter: ' + parameter) + parameter = json.loads(parameter) + jobid = parameter['JobId'] + taskid = parameter['TaskId'] + taskno = parameter['TaskNo'] + username = parameter['UserName'] + lxcname = '%s-%s-%s-%s' % (username,jobid,taskid,taskno) + command = '/root/getenv.sh' #parameter['Parameters']['Command']['CommandLine'] + envs = {'MYENV1':'MYVAL1', 'MYENV2':'MYVAL2'} #parameters['Parameters']['Command']['EnvVars'] + envs['TASK_NO']=taskno + image = parameter['ImageId'] + instance_type = parameter['InstanceType'] + + status = self.imgmgr.prepareFS(username,image,lxcname,instance_type['disk']) + if not status: + return [False, "Create container for batch failed when preparing filesystem"] + + rootfs = "/var/lib/lxc/%s/rootfs" % lxcname + + if not os.path.isdir("%s/global/users/%s" % (self.fspath,username)): + path = env.getenv('DOCKLET_LIB') + 
subprocess.call([path+"/userinit.sh", username]) + logger.info("user %s directory not found, create it" % username) + sys_run("mkdir -p /var/lib/lxc/%s" % lxcname) + logger.info("generate config file for %s" % lxcname) + + def config_prepare(content): + content = content.replace("%ROOTFS%",rootfs) + content = content.replace("%CONTAINER_MEMORY%",str(instance_type['memory'])) + content = content.replace("%CONTAINER_CPU%",str(instance_type['cpu']*100000)) + content = content.replace("%FS_PREFIX%",self.fspath) + content = content.replace("%USERNAME%",username) + content = content.replace("%LXCNAME%",lxcname) + return content + + conffile = open(self.confpath+"/container.batch.conf", 'r') + conftext = conffile.read() + conffile.close() + + conftext = config_prepare(conftext) + + conffile = open("/var/lib/lxc/%s/config" % lxcname, 'w') + conffile.write(conftext) + conffile.close() + + container = lxc.Container(lxcname) + if not container.start(): + logger.error('start container %s failed' % lxcname) + return True + #return json.dumps({'success':'false','message': "start container failed"}) + else: + logger.info('start container %s success' % lxcname) + + #mount oss here + + thread = threading.Thread(target = self.excute_task, args=(jobid,taskid,envs,lxcname,command)) + thread.setDaemon(True) + thread.start() + + return True + #return json.dumps({'success':'true','message':'task is running'}) + + def excute_task(self,jobid,taskid,envs,lxcname,command): + cmd = "lxc-attach -n " + lxcname + for envkey,envval in envs.items(): + cmd = cmd + " -v %s=%s" % (envkey,envval) + cmd = cmd + " " + command + logger.info('run task with command - %s' % cmd) + Ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) + if Ret == 0: + #call master rpc function to tell the taskmgr + self.masterrpc.complete_task(jobid,taskid) + else: + self.masterrpc.fail_task(jobid,taskid) + #call master rpc function to tell the wrong + + #umount oss here + + container = lxc.Container(lxcname) + if container.stop(): + logger.info("stop container %s success" % lxcname) + else: + logger.error("stop container %s failed" % lxcname) + + logger.info("deleting container:%s" % lxcname) + if self.imgmgr.deleteFS(lxcname): + logger.info("delete container %s success" % lxcname) + else: + logger.error("delete container %s failed" % lxcname) From 2b1e59af8f6561e3a4f043ae25e974b06d6d0aa6 Mon Sep 17 00:00:00 2001 From: Firmlyzhu Date: Thu, 19 Jul 2018 14:40:26 +0800 Subject: [PATCH 11/75] Fix some bugs --- prepare.sh | 2 +- src/worker/worker.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/prepare.sh b/prepare.sh index 0025f5b..9c00643 100755 --- a/prepare.sh +++ b/prepare.sh @@ -16,7 +16,7 @@ fi # some packages' name maybe different in debian apt-get install -y cgmanager lxc lxcfs lxc-templates lvm2 bridge-utils curl exim4 openssh-server openvswitch-switch apt-get install -y python3 python3-netifaces python3-flask python3-flask-sqlalchemy python3-pampy python3-httplib2 python3-pip -apt-get install -y python3-psutil python3-flask-migrate +apt-get install -y python3-psutil python3-flask-migrate python3-paramiko apt-get install -y python3-lxc apt-get install -y python3-requests python3-suds apt-get install -y nodejs nodejs-legacy npm diff --git a/src/worker/worker.py b/src/worker/worker.py index 03da97a..88839c7 100755 --- a/src/worker/worker.py +++ b/src/worker/worker.py @@ -19,7 +19,6 @@ from socketserver import ThreadingMixIn import threading from utils import etcdlib, proxytool from worker 
import container, monitor -from worker.taskmgr import TaskMgr from utils.nettools import netcontrol,ovscontrol,portcontrol from utils.lvmtool import new_group, recover_group from master import network @@ -145,7 +144,6 @@ class Worker(object): self.rpcserver.register_function(proxytool.delete_route) self.rpcserver.register_function(portcontrol.acquire_port_mapping) self.rpcserver.register_function(portcontrol.release_port_mapping) - self.rpcserver.register_function(TaskMgr.execute_task) # register functions or instances to server for rpc #self.rpcserver.register_function(function_name) From 245fad1fad6772f6c40550ebc1174fa2de8c096f Mon Sep 17 00:00:00 2001 From: Gallen Date: Thu, 19 Jul 2018 14:55:27 +0800 Subject: [PATCH 12/75] basic implement for taskmgr --- src/master/taskmgr.py | 137 +++++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 35 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 6e23f3b..92b5575 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -1,6 +1,8 @@ import threading import time +import master.monitor + # must import logger after initlogging, ugly from utils.log import initlogging initlogging("docklet-taskmgr") @@ -26,19 +28,28 @@ class TaskMgr(threading.Thread): # load task information from etcd # initial a task queue and task schedueler # taskmgr: a taskmgr instance - def __init__(self): + def __init__(self, nodemgr): threading.Thread.__init__(self) self.thread_stop = False - self.taskQueue = [] + + # tasks + self.task_queue = [] + + # nodes + self.nodemgr = nodemgr + self.all_nodes = None + self.last_nodes_info_update_time = 0 + self.nodes_info_update_interval = 30 # (s) def run(self): self.serve() while not self.thread_stop: - task = self.task_scheduler() - if task is not None: - self.task_processor(task) - time.sleep(2) + task, instance_id, worker = self.task_scheduler() + if task is not None and worker is not None: + self.task_processor(task, instance_id, worker) + else: + time.sleep(2) def serve(self): @@ -46,11 +57,13 @@ class TaskMgr(threading.Thread): add_MasterServicer_to_server(TaskReporter(self), self.server) self.server.add_insecure_port('[::]:50051') self.server.start() + logger.info('[taskmgr_rpc] start rpc server') def stop(self): self.thread_stop = True self.server.stop(0) + logger.info('[taskmgr_rpc] stop rpc server') # this method is called when worker send heart-beat rpc request @@ -61,41 +74,93 @@ class TaskMgr(threading.Thread): logger.error('[on_task_report] task not found') return - task.status = report.status - if task.status == Task.RUNNING: + instance_id = report.parameters.command.envVars['INSTANCE_ID'] + instance = task.instance_list[instance_id] + + if report.status == Task.RUNNING: pass - elif task.status == Task.COMPLETED: - # tell jobmgr - pass - elif task.status == Task.FAILED || task.status == Task.TIMEOUT: - # retry - if task.maxRetryCount <= 0: - # tell jobmgr - pass - else: - # decrease max retry count & waiting for retry - task.maxRetryCount -= 1 - task.status = Task.WAITING + elif report.status == Task.COMPLETED: + instance['status'] = 'completed' + check_task_completed(task) + elif report.status == Task.FAILED || report.status == Task.TIMEOUT: + instance['status'] = 'failed' + if instance['try_count'] > task.maxRetryCount: + check_task_completed(task) else: logger.error('[on_task_report] receive report from waiting task') - # this is a thread to process task(or a instance) - def task_processor(self,task): - # call the rpc to call a function in worker - # create 
container -> execute task - # (one instance or multiple instances) - # retry when failed - print('processing %s' % task) + def check_task_completed(self, task): + if len(task.instance_list) < task.instanceCount: + return + failed = False + for instance in task.instance_list: + if instance['status'] == 'running': + return + if instance['status'] == 'failed': + if instance['try_count'] > task.maxRetryCount: + failed = True + else: + return + if failed: + # tell jobmgr task failed + task.status = Task.FAILED + else: + # tell jobmgr task completed + task.status = Task.COMPLETED + self.task_queue.remove(task) - # this is a thread to schdule the tasks + def task_processor(self, task, instance_id, worker): + task.status = Task.RUNNING + task.parameters.command.envVars['INSTANCE_ID'] = instance_id + # TODO call the rpc to call a function in worker + print('processing %s' % task.id) + + + # return task, worker def task_scheduler(self): - try: - task = self.taskQueue.pop(0) - except: - task = None - return task + # simple FIFO + for task in self.task_queue: + worker = self.find_proper_worker(task) + if worker is not None: + # find instance to retry + for instance, index in enumerate(task.instance_list): + if instance['status'] == 'failed' and instance['try_count'] <= task.maxRetryCount: + instance['try_count'] += 1 + return task, index, worker + + # start new instance + if len(task.instance_list) < task.instanceCount: + instance = {} + instance['status'] = 'running' + instance['try_count'] = 0 + task.instance_list.append(instance) + return task, len(task.instance_list) - 1, worker + return None + + + def find_proper_worker(self, task): + nodes = get_all_nodes() + if nodes is None or len(nodes) == 0: + logger.warning('[task_scheduler] running nodes not found') + return None + + # TODO + return nodes[0] + + + def get_all_nodes(self): + # cache running nodes + if self.all_nodes is not None and time.time() - self.last_nodes_info_update_time < self.nodes_info_update_interval: + return self.all_nodes + # get running nodes + node_ips = self.nodemgr.get_nodeips() + self.all_nodes = [] + for node_ip in node_ips: + fetcher = master.monitor.Fetcher(node_ip) + self.all_nodes.append(fetcher.info) + return self.all_nodes # user: username @@ -103,13 +168,15 @@ class TaskMgr(threading.Thread): # save the task information into database # called when jobmgr assign task to taskmgr def add_task(self, task): - pass + # decode json string to object defined in grpc + task.instance_list = [] + self.task_queue.append(task) # user: username # get the information of a task, including the status, task description and other information def get_task(self, taskid): - for task in self.taskQueue: + for task in self.task_queue: if task.id == taskid: return task return None From f13cf4c47d9e9b3ca78fff6967cb05c01375fce7 Mon Sep 17 00:00:00 2001 From: Firmlyzhu Date: Thu, 19 Jul 2018 15:26:52 +0800 Subject: [PATCH 13/75] update rpc protos --- prepare.sh | 1 + src/protos/rpc.proto | 2 +- src/protos/rpc_pb2.py | 84 ++++++++++++++++++------------------ src/protos/rpc_pb2_grpc.py | 32 +++++++------- src/worker/taskcontroller.py | 20 +++++---- 5 files changed, 71 insertions(+), 68 deletions(-) diff --git a/prepare.sh b/prepare.sh index 9c00643..3099607 100755 --- a/prepare.sh +++ b/prepare.sh @@ -23,6 +23,7 @@ apt-get install -y nodejs nodejs-legacy npm apt-get install -y etcd apt-get install -y glusterfs-client attr apt-get install -y nginx +pip3 install grpcio grpcio-tools googleapis-common-protos #add ip forward echo 
"net.ipv4.ip_forward=1" >>/etc/sysctl.conf diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index ee0d249..174e7a5 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -5,7 +5,7 @@ service Master { } service Worker { - rpc add_task (Task) returns (Reply) {} + //rpc add_task (Task) returns (Reply) {} rpc process_task (Task) returns (Reply) {} } diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 4cc4ceb..116e2a4 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -1,5 +1,5 @@ # Generated by the protocol buffer compiler. DO NOT EDIT! -# source: protos/rpc.proto +# source: rpc.proto import sys _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) @@ -16,10 +16,10 @@ _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor.FileDescriptor( - name='protos/rpc.proto', + name='rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\x10protos/rpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xff\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 \x01(\x05\"N\n\nTaskStatus\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32%\n\x06Worker\x12\x1b\n\x08\x61\x64\x64_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\trpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xff\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 
\x01(\x05\"N\n\nTaskStatus\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') ) @@ -41,8 +41,8 @@ _REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=66, - serialized_end=106, + serialized_start=59, + serialized_end=99, ) _sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) @@ -75,8 +75,8 @@ _TASK_TASKSTATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=286, - serialized_end=364, + serialized_start=279, + serialized_end=357, ) _sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) @@ -97,8 +97,8 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=761, - serialized_end=797, + serialized_start=754, + serialized_end=790, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -130,8 +130,8 @@ _REPLY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=20, - serialized_end=106, + serialized_start=13, + serialized_end=99, ) @@ -204,8 +204,8 @@ _TASK = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=109, - serialized_end=364, + serialized_start=102, + serialized_end=357, ) @@ -249,8 +249,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=366, - serialized_end=461, + serialized_start=359, + serialized_end=454, ) @@ -287,8 +287,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=557, - serialized_end=603, + serialized_start=550, + serialized_end=596, ) _COMMAND = _descriptor.Descriptor( @@ -331,8 +331,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=464, - serialized_end=603, + serialized_start=457, + serialized_end=596, ) @@ -376,8 +376,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=605, - serialized_end=689, + serialized_start=598, + serialized_end=682, ) @@ -422,8 +422,8 @@ _IMAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=691, - serialized_end=797, + serialized_start=684, + serialized_end=790, ) @@ -460,8 +460,8 @@ _MOUNT = _descriptor.Descriptor( 
extension_ranges=[], oneofs=[ ], - serialized_start=799, - serialized_end=845, + serialized_start=792, + serialized_end=838, ) @@ -512,8 +512,8 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=847, - serialized_end=913, + serialized_start=840, + serialized_end=906, ) _REPLY.fields_by_name['message'].enum_type = _REPLY_REPLYSTATUS @@ -542,21 +542,21 @@ _sym_db.RegisterFileDescriptor(DESCRIPTOR) Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( DESCRIPTOR = _REPLY, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Reply) )) _sym_db.RegisterMessage(Reply) Task = _reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( DESCRIPTOR = _TASK, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Task) )) _sym_db.RegisterMessage(Task) Parameters = _reflection.GeneratedProtocolMessageType('Parameters', (_message.Message,), dict( DESCRIPTOR = _PARAMETERS, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Parameters) )) _sym_db.RegisterMessage(Parameters) @@ -565,12 +565,12 @@ Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message, EnvVarsEntry = _reflection.GeneratedProtocolMessageType('EnvVarsEntry', (_message.Message,), dict( DESCRIPTOR = _COMMAND_ENVVARSENTRY, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Command.EnvVarsEntry) )) , DESCRIPTOR = _COMMAND, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Command) )) _sym_db.RegisterMessage(Command) @@ -578,28 +578,28 @@ _sym_db.RegisterMessage(Command.EnvVarsEntry) Cluster = _reflection.GeneratedProtocolMessageType('Cluster', (_message.Message,), dict( DESCRIPTOR = _CLUSTER, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Cluster) )) _sym_db.RegisterMessage(Cluster) Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict( DESCRIPTOR = _IMAGE, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Image) )) _sym_db.RegisterMessage(Image) Mount = _reflection.GeneratedProtocolMessageType('Mount', (_message.Message,), dict( DESCRIPTOR = _MOUNT, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Mount) )) _sym_db.RegisterMessage(Mount) Instance = _reflection.GeneratedProtocolMessageType('Instance', (_message.Message,), dict( DESCRIPTOR = _INSTANCE, - __module__ = 'protos.rpc_pb2' + __module__ = 'rpc_pb2' # @@protoc_insertion_point(class_scope:Instance) )) _sym_db.RegisterMessage(Instance) @@ -614,8 +614,8 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=915, - serialized_end=950, + serialized_start=908, + serialized_end=943, methods=[ _descriptor.MethodDescriptor( name='report', @@ -638,12 +638,12 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=952, - serialized_end=989, + serialized_start=945, + serialized_end=986, methods=[ _descriptor.MethodDescriptor( - name='add_task', - full_name='Worker.add_task', + name='process_task', + full_name='Worker.process_task', index=0, containing_service=None, input_type=_TASK, diff --git a/src/protos/rpc_pb2_grpc.py b/src/protos/rpc_pb2_grpc.py index e629219..dc3de46 100644 --- a/src/protos/rpc_pb2_grpc.py 
+++ b/src/protos/rpc_pb2_grpc.py @@ -1,7 +1,7 @@ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! import grpc -from protos import rpc_pb2 as protos_dot_rpc__pb2 +import rpc_pb2 as rpc__pb2 class MasterStub(object): @@ -16,8 +16,8 @@ class MasterStub(object): """ self.report = channel.unary_unary( '/Master/report', - request_serializer=protos_dot_rpc__pb2.Task.SerializeToString, - response_deserializer=protos_dot_rpc__pb2.Reply.FromString, + request_serializer=rpc__pb2.Task.SerializeToString, + response_deserializer=rpc__pb2.Reply.FromString, ) @@ -37,8 +37,8 @@ def add_MasterServicer_to_server(servicer, server): rpc_method_handlers = { 'report': grpc.unary_unary_rpc_method_handler( servicer.report, - request_deserializer=protos_dot_rpc__pb2.Task.FromString, - response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, + request_deserializer=rpc__pb2.Task.FromString, + response_serializer=rpc__pb2.Reply.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( @@ -56,10 +56,10 @@ class WorkerStub(object): Args: channel: A grpc.Channel. """ - self.add_task = channel.unary_unary( - '/Worker/add_task', - request_serializer=protos_dot_rpc__pb2.Task.SerializeToString, - response_deserializer=protos_dot_rpc__pb2.Reply.FromString, + self.process_task = channel.unary_unary( + '/Worker/process_task', + request_serializer=rpc__pb2.Task.SerializeToString, + response_deserializer=rpc__pb2.Reply.FromString, ) @@ -67,9 +67,9 @@ class WorkerServicer(object): # missing associated documentation comment in .proto file pass - def add_task(self, request, context): - # missing associated documentation comment in .proto file - pass + def process_task(self, request, context): + """rpc add_task (Task) returns (Reply) {} + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') @@ -77,10 +77,10 @@ class WorkerServicer(object): def add_WorkerServicer_to_server(servicer, server): rpc_method_handlers = { - 'add_task': grpc.unary_unary_rpc_method_handler( - servicer.add_task, - request_deserializer=protos_dot_rpc__pb2.Task.FromString, - response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, + 'process_task': grpc.unary_unary_rpc_method_handler( + servicer.process_task, + request_deserializer=rpc__pb2.Task.FromString, + response_serializer=rpc__pb2.Reply.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index f0903b6..e70ba3d 100644 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -1,23 +1,25 @@ #!/usr/bin/python3 -import xmlrpc.client -from log import logger -import env +from concurrent import futures +import grpc +from utils.log import logger +from utils import env import json,lxc,subprocess,threading,os -import imagemgr +from utils import imagemgr +from protos import rpc_pb2, rpc_pb2_grpc -class TaskController(object): +class TaskController(rpc_pb2_grpc.WorkerServicer): def __init__(self): self.imgmgr = imagemgr.ImageMgr() self.fspath = env.getenv('FS_PREFIX') self.confpath = env.getenv('DOCKLET_CONF') - self.masterip = '162.105.88.190' - self.masterport = 9002 - self.masterrpc = xmlrpc.client.ServerProxy("http://%s:%s" % (self.masterip,self.masterport)) + #self.masterip = '162.105.88.190' + #self.masterport = 9002 + #self.masterrpc = xmlrpc.client.ServerProxy("http://%s:%s" % (self.masterip,self.masterport)) logger.info('TaskController init 
success') - def process_task(self, parameter): + def process_task(self, request, context): logger.info('excute task with parameter: ' + parameter) parameter = json.loads(parameter) jobid = parameter['JobId'] From 6ecf54da745b5d02b2fbd65ba0db6ec1b4295db1 Mon Sep 17 00:00:00 2001 From: Gallen Date: Thu, 19 Jul 2018 17:24:16 +0800 Subject: [PATCH 14/75] update taskmgr --- src/master/taskmgr.py | 86 +++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 92b5575..470aefb 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -1,5 +1,7 @@ import threading import time +import string +import random import master.monitor @@ -11,8 +13,9 @@ from utils.log import logger # grpc from concurrent import futures import grpc -from protos.rpc_pb2 import Task, Reply -from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server +from protos.rpc_pb2 import Task, TaskMsg, Status, Reply +from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub + class TaskReporter(MasterServicer): @@ -23,6 +26,7 @@ class TaskReporter(MasterServicer): self.taskmgr.on_task_report(request) return Reply(message=Reply.ACCEPTED) + class TaskMgr(threading.Thread): # load task information from etcd @@ -34,6 +38,7 @@ class TaskMgr(threading.Thread): # tasks self.task_queue = [] + self.heart_beat_timeout = 60 # (s) # nodes self.nodemgr = nodemgr @@ -68,22 +73,22 @@ class TaskMgr(threading.Thread): # this method is called when worker send heart-beat rpc request def on_task_report(self, report): - logger.info('[on_task_report] receive task report: id %d, status %d' % (report.id, report.status)) - task = get_task(report.id) + logger.info('[on_task_report] receive task report: id %d, status %d' % (report.taskid, report.instanceStatus)) + task = get_task(report.taskid) if task == None: logger.error('[on_task_report] task not found') return - instance_id = report.parameters.command.envVars['INSTANCE_ID'] - instance = task.instance_list[instance_id] + instance = task.instance_list[report.instanceid] + if instance['token'] != report.token: + return - if report.status == Task.RUNNING: - pass - elif report.status == Task.COMPLETED: - instance['status'] = 'completed' + instance['status'] = report.instanceStatus + if report.instanceStatus == Status.RUNNING: + instance['last_update_time'] = time.time() + elif report.instanceStatus == Status.COMPLETED: check_task_completed(task) - elif report.status == Task.FAILED || report.status == Task.TIMEOUT: - instance['status'] = 'failed' + elif report.instanceStatus == Status.FAILED || report.instanceStatus == Status.TIMEOUT: if instance['try_count'] > task.maxRetryCount: check_task_completed(task) else: @@ -95,27 +100,45 @@ class TaskMgr(threading.Thread): return failed = False for instance in task.instance_list: - if instance['status'] == 'running': + if instance['status'] == Status.RUNNING || instance['status'] == Status.WAITING: return - if instance['status'] == 'failed': + if instance['status'] == Status.FAILED || instance['status'] == Status.TIMEOUT: if instance['try_count'] > task.maxRetryCount: failed = True else: return if failed: - # tell jobmgr task failed - task.status = Task.FAILED + # TODO tell jobmgr task failed + task.status = Status.FAILED else: - # tell jobmgr task completed - task.status = Task.COMPLETED + # TODO tell jobmgr task completed + task.status = Status.COMPLETED self.task_queue.remove(task) def task_processor(self, task, 
instance_id, worker): - task.status = Task.RUNNING - task.parameters.command.envVars['INSTANCE_ID'] = instance_id - # TODO call the rpc to call a function in worker - print('processing %s' % task.id) + task.status = Status.RUNNING + + # properties for transaction + task.instanceid = instance_id + task.token = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + + instance = task.instance_list[instance_id] + instance['status'] = Status.RUNNING + instance['last_update_time'] = time.time() + instance['try_count'] += 1 + instance['token'] = task.token + + try: + logger.info('[task_processor] processing %s' % task.id) + channel = grpc.insecure_channel('%s:50052' % worker) + stub = WorkerStub(channel) + response = stub.process_task(task) + logger.info('[task_processor] worker response: %d' response.message) + except Exception as e: + logger.error('[task_processor] rpc error message: %s' e) + instance['status'] = Status.FAILED + instance['try_count'] -= 1 # return task, worker @@ -126,14 +149,17 @@ class TaskMgr(threading.Thread): if worker is not None: # find instance to retry for instance, index in enumerate(task.instance_list): - if instance['status'] == 'failed' and instance['try_count'] <= task.maxRetryCount: - instance['try_count'] += 1 + if (instance['status'] == Status.FAILED || instance['status'] == Status.TIMEOUT) and instance['try_count'] <= task.maxRetryCount: return task, index, worker + elif instance['status'] == Status.RUNNING: + if time.time() - instance['last_update_time'] > self.heart_beat_timeout: + instance['status'] = Status.FAILED + instance['token'] = '' + return task, index, worker # start new instance if len(task.instance_list) < task.instanceCount: instance = {} - instance['status'] = 'running' instance['try_count'] = 0 task.instance_list.append(instance) return task, len(task.instance_list) - 1, worker @@ -146,8 +172,11 @@ class TaskMgr(threading.Thread): logger.warning('[task_scheduler] running nodes not found') return None - # TODO - return nodes[0] + for node in nodes: + # TODO + if True: + return node[0] + return None def get_all_nodes(self): @@ -159,7 +188,7 @@ class TaskMgr(threading.Thread): self.all_nodes = [] for node_ip in node_ips: fetcher = master.monitor.Fetcher(node_ip) - self.all_nodes.append(fetcher.info) + self.all_nodes.append((node_ip, fetcher.info)) return self.all_nodes @@ -170,6 +199,7 @@ class TaskMgr(threading.Thread): def add_task(self, task): # decode json string to object defined in grpc task.instance_list = [] + task.status = Status.WAITING self.task_queue.append(task) From c6707d406d8c23ea87002fc6850a68839ca0363e Mon Sep 17 00:00:00 2001 From: Gallen Date: Thu, 19 Jul 2018 18:16:48 +0800 Subject: [PATCH 15/75] update taskmgr --- src/master/httprest.py | 3 +++ src/master/taskmgr.py | 50 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/master/httprest.py b/src/master/httprest.py index 9250ed6..7ef349e 100755 --- a/src/master/httprest.py +++ b/src/master/httprest.py @@ -905,6 +905,9 @@ if __name__ == '__main__': G_cloudmgr = cloudmgr.CloudMgr() G_taskmgr = taskmgr.TaskMgr() G_jobmgr = jobmgr.JobMgr(taskmgr) + G_jobmgr.start() + G_taskmgr.set_jobmgr(G_jobmgr) + G_taskmgr.start() # start NodeMgr and NodeMgr will wait for all nodes to start ... 
G_nodemgr = nodemgr.NodeMgr(G_networkmgr, etcdclient, addr = ipaddr, mode=mode) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 470aefb..1b2e931 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -2,6 +2,7 @@ import threading import time import string import random +import json import master.monitor @@ -13,7 +14,7 @@ from utils.log import logger # grpc from concurrent import futures import grpc -from protos.rpc_pb2 import Task, TaskMsg, Status, Reply +from protos.rpc_pb2 import Task, TaskMsg, Status, Reply, Parameters, Cluster, Command, Image, Mount, Instance from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub @@ -35,8 +36,7 @@ class TaskMgr(threading.Thread): def __init__(self, nodemgr): threading.Thread.__init__(self) self.thread_stop = False - - # tasks + self.jobmgr = None self.task_queue = [] self.heart_beat_timeout = 60 # (s) @@ -73,7 +73,7 @@ class TaskMgr(threading.Thread): # this method is called when worker send heart-beat rpc request def on_task_report(self, report): - logger.info('[on_task_report] receive task report: id %d, status %d' % (report.taskid, report.instanceStatus)) + logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus)) task = get_task(report.taskid) if task == None: logger.error('[on_task_report] task not found') @@ -81,6 +81,7 @@ class TaskMgr(threading.Thread): instance = task.instance_list[report.instanceid] if instance['token'] != report.token: + logger.warning('[on_task_report] wrong token') return instance['status'] = report.instanceStatus @@ -107,12 +108,16 @@ class TaskMgr(threading.Thread): failed = True else: return + if self.jobmgr is None: + logger.error('[check_task_completed] jobmgr is None!') + return if failed: # TODO tell jobmgr task failed task.status = Status.FAILED else: # TODO tell jobmgr task completed task.status = Status.COMPLETED + logger.info('task %s completed' % task.id) self.task_queue.remove(task) @@ -192,14 +197,47 @@ class TaskMgr(threading.Thread): return self.all_nodes + def set_jobmgr(self, jobmgr): + self.jobmgr = jobmgr + + # user: username # task: a json string # save the task information into database # called when jobmgr assign task to taskmgr - def add_task(self, task): + def add_task(self, username, taskid, json_task): # decode json string to object defined in grpc - task.instance_list = [] + json_task = json.loads(json_task) + task = Task( + id = taskid, + username = username, + instanceCount = json_task['instanceCount'], + maxRetryCount = json_task['maxRetryCount'], + timeout = json_task['timeout'], + parameters = Parameters( + command = Command( + commandLine = json_task['parameters']['command']['commandLine'], + packagePath = json_task['parameters']['command']['packagePath'], + envVars = json_task['parameters']['command']['envVars']), + stderrRedirectPath = json_task['parameters']['stderrRedirectPath'], + stdoutRedirectPath = json_task['parameters']['stdoutRedirectPath']), + cluster = Cluster( + ,image = Image( + name = json_task['cluster']['image']['name'], + type = json_task['cluster']['image']['type'], + owner = json_task['cluster']['image']['owner']), + instance = Instance( + cpu = json_task['cluster']['instance']['cpu'], + memory = json_task['cluster']['instance']['memory'], + disk = json_task['cluster']['instance']['disk'], + gpu = json_task['cluster']['instance']['gpu']))) + task.cluster.mount = [] + for mount in json_task['cluster']['mount']: + 
task.cluster.mount.append(Mount(localPath=mount['localPath'], remotePath=mount['remotePath'])) + + # local properties task.status = Status.WAITING + task.instance_list = [] self.task_queue.append(task) From 7e0603b1fba8004bd93daae20ea8106017850cd9 Mon Sep 17 00:00:00 2001 From: Firmlyzhu Date: Thu, 19 Jul 2018 19:03:24 +0800 Subject: [PATCH 16/75] Update taskcontroller to create container by grpc --- conf/container.batch.conf | 50 ++++++++ src/master/testTaskCtrler.py | 28 +++++ src/protos/rpc.proto | 42 ++++--- src/protos/rpc_pb2.py | 215 +++++++++++++++++++++++------------ src/protos/rpc_pb2_grpc.py | 10 +- src/worker/taskcontroller.py | 81 +++++++++---- 6 files changed, 309 insertions(+), 117 deletions(-) create mode 100644 conf/container.batch.conf create mode 100644 src/master/testTaskCtrler.py diff --git a/conf/container.batch.conf b/conf/container.batch.conf new file mode 100644 index 0000000..96ef497 --- /dev/null +++ b/conf/container.batch.conf @@ -0,0 +1,50 @@ +# This is the common container.conf for all containers. +# If want set custom settings, you have two choices: +# 1. Directly modify this file, which is not recommend, because the +# setting will be overriden when new version container.conf released. +# 2. Use a custom config file in this conf directory: lxc.custom.conf, +# it uses the same grammer as container.conf, and will be merged +# with the default container.conf by docklet at runtime. +# +# The following is an example mounting user html directory +# lxc.mount.entry = /public/home/%USERNAME%/public_html %ROOTFS%/root/public_html none bind,rw,create=dir 0 0 +# + +#### include /usr/share/lxc/config/ubuntu.common.conf +lxc.include = /usr/share/lxc/config/ubuntu.common.conf + +############## DOCKLET CONFIG ############## + +# Setup 0 tty devices +lxc.tty = 0 + +lxc.rootfs = %ROOTFS% +lxc.utsname = %HOSTNAME% + +lxc.network.type = veth +lxc.network.name = eth0 +lxc.network.link = lxcbr0 +lxc.network.flags = up + +lxc.cgroup.pids.max = 2048 +lxc.cgroup.memory.limit_in_bytes = %CONTAINER_MEMORY%M +#lxc.cgroup.memory.kmem.limit_in_bytes = 512M +#lxc.cgroup.memory.soft_limit_in_bytes = 4294967296 +#lxc.cgroup.memory.memsw.limit_in_bytes = 8589934592 + +# lxc.cgroup.cpu.cfs_period_us : period time of cpu, default 100000, means 100ms +# lxc.cgroup.cpu.cfs_quota_us : quota time of this process +lxc.cgroup.cpu.cfs_quota_us = %CONTAINER_CPU% + +lxc.cap.drop = sys_admin net_admin mac_admin mac_override sys_time sys_module + +lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/data %ROOTFS%/root/nfs none bind,rw,create=dir 0 0 +#lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/hosts/%CLUSTERID%.hosts %ROOTFS%/etc/hosts none bind,ro,create=file 0 0 +lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/ssh %ROOTFS%/root/.ssh none bind,ro,create=dir 0 0 +lxc.mount.entry = %FS_PREFIX%/local/temp/%LXCNAME%/ %ROOTFS%/tmp none bind,rw,create=dir 0 0 + +# setting hostname +lxc.hook.pre-start = HNAME=%HOSTNAME% %LXCSCRIPT%/lxc-prestart + +# setting nfs softlink +#lxc.hook.mount = %LXCSCRIPT%/lxc-mount diff --git a/src/master/testTaskCtrler.py b/src/master/testTaskCtrler.py new file mode 100644 index 0000000..7d82650 --- /dev/null +++ b/src/master/testTaskCtrler.py @@ -0,0 +1,28 @@ +import sys +if sys.path[0].endswith("master"): + sys.path[0] = sys.path[0][:-6] + +import grpc + +from protos import rpc_pb2, rpc_pb2_grpc + +def run(): + channel = grpc.insecure_channel('localhost:50051') + stub = rpc_pb2_grpc.WorkerStub(channel) + + comm = rpc_pb2.Command(commandLine="echo hello_world > 
test.txt", packagePath=".", envVars={}) + paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="", stdoutRedirectPath="") + + img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.PUBLIC, owner="docklet") + inst = rpc_pb2.Instance(cpu=2, memory=2000, disk=500, gpu=0) + mnt = rpc_pb2.Mount(localPath="",remotePath="") + clu = rpc_pb2.Cluster(image=img, instance=inst, mount=[mnt]) + + task = rpc_pb2.Task(id="test",username="root",instanceid=0,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=10) + + response = stub.process_task(task) + print("Batch client received: " + str(response.status)+" "+response.message) + + +if __name__ == '__main__': + run() diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index 174e7a5..46c87bd 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -1,16 +1,16 @@ syntax = "proto3"; service Master { - rpc report (Task) returns (Reply) {}; + rpc report (TaskMsg) returns (Reply) {}; } service Worker { - //rpc add_task (Task) returns (Reply) {} rpc process_task (Task) returns (Reply) {} } message Reply { - ReplyStatus message = 1; // 返回值 + ReplyStatus status = 1; // 返回值 + string message = 2; enum ReplyStatus { ACCEPTED = 0; @@ -18,22 +18,30 @@ message Reply { } } +message TaskMsg { + string taskid = 1; + int32 instanceid = 2; + Status instanceStatus = 3; // 任务状态 + +} + +enum Status { + WAITING = 0; + RUNNING = 1; + COMPLETED = 2; + FAILED = 3; + TIMEOUT = 4; +} + message Task { string id = 1; - TaskStatus status = 2; // 任务状态 - int32 instanceCount = 3; // 实例个数 - int32 maxRetryCount = 4; // 最大重试次数 - Parameters parameters = 5; // 参数 - Cluster cluster = 6; // 集群配置 - int32 Timeout = 7; // 超时阈值 - - enum TaskStatus { - WAITING = 0; - RUNNING = 1; - COMPLETED = 2; - FAILED = 3; - TIMEOUT = 4; - } + string username = 2; + int32 instanceid = 3; + int32 instanceCount = 4; // 实例个数 + int32 maxRetryCount = 5; // 最大重试次数 + Parameters parameters = 6; // 参数 + Cluster cluster = 7; // 集群配置 + int32 timeout = 8; // 超时阈值 } message Parameters { diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 116e2a4..509d3c4 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -3,6 +3,7 @@ import sys _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection @@ -19,36 +20,12 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\trpc.proto\"V\n\x05Reply\x12#\n\x07message\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\xff\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12 \n\x06status\x18\x02 \x01(\x0e\x32\x10.Task.TaskStatus\x12\x15\n\rinstanceCount\x18\x03 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x04 \x01(\x05\x12\x1f\n\nparameters\x18\x05 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x06 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07Timeout\x18\x07 \x01(\x05\"N\n\nTaskStatus\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 
\x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05\x32#\n\x06Master\x12\x19\n\x06report\x12\x05.Task\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\trpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb3\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') ) - - -_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( - name='ReplyStatus', - full_name='Reply.ReplyStatus', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='ACCEPTED', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='REFUSED', index=1, number=1, - 
options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=59, - serialized_end=99, -) -_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) - -_TASK_TASKSTATUS = _descriptor.EnumDescriptor( - name='TaskStatus', - full_name='Task.TaskStatus', +_STATUS = _descriptor.EnumDescriptor( + name='Status', + full_name='Status', filename=None, file=DESCRIPTOR, values=[ @@ -75,10 +52,40 @@ _TASK_TASKSTATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=279, - serialized_end=357, + serialized_start=928, + serialized_end=1002, ) -_sym_db.RegisterEnumDescriptor(_TASK_TASKSTATUS) +_sym_db.RegisterEnumDescriptor(_STATUS) + +Status = enum_type_wrapper.EnumTypeWrapper(_STATUS) +WAITING = 0 +RUNNING = 1 +COMPLETED = 2 +FAILED = 3 +TIMEOUT = 4 + + +_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( + name='ReplyStatus', + full_name='Reply.ReplyStatus', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='ACCEPTED', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='REFUSED', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=75, + serialized_end=115, +) +_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( name='ImageType', @@ -97,8 +104,8 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=754, - serialized_end=790, + serialized_start=774, + serialized_end=810, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -111,12 +118,19 @@ _REPLY = _descriptor.Descriptor( containing_type=None, fields=[ _descriptor.FieldDescriptor( - name='message', full_name='Reply.message', index=0, + name='status', full_name='Reply.status', index=0, number=1, type=14, cpp_type=8, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='message', full_name='Reply.message', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), ], extensions=[ ], @@ -131,7 +145,52 @@ _REPLY = _descriptor.Descriptor( oneofs=[ ], serialized_start=13, - serialized_end=99, + serialized_end=115, +) + + +_TASKMSG = _descriptor.Descriptor( + name='TaskMsg', + full_name='TaskMsg', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='taskid', full_name='TaskMsg.taskid', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='instanceid', full_name='TaskMsg.instanceid', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='instanceStatus', full_name='TaskMsg.instanceStatus', index=2, + number=3, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, 
enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=117, + serialized_end=195, ) @@ -150,43 +209,50 @@ _TASK = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='status', full_name='Task.status', index=1, - number=2, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, + name='username', full_name='Task.username', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='instanceCount', full_name='Task.instanceCount', index=2, + name='instanceid', full_name='Task.instanceid', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='maxRetryCount', full_name='Task.maxRetryCount', index=3, + name='instanceCount', full_name='Task.instanceCount', index=3, number=4, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='parameters', full_name='Task.parameters', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, + name='maxRetryCount', full_name='Task.maxRetryCount', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='cluster', full_name='Task.cluster', index=5, + name='parameters', full_name='Task.parameters', index=5, number=6, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='Timeout', full_name='Task.Timeout', index=6, - number=7, type=5, cpp_type=1, label=1, + name='cluster', full_name='Task.cluster', index=6, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='timeout', full_name='Task.timeout', index=7, + number=8, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -196,7 +262,6 @@ _TASK = _descriptor.Descriptor( ], nested_types=[], enum_types=[ - _TASK_TASKSTATUS, ], options=None, is_extendable=False, @@ -204,8 +269,8 @@ _TASK = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=102, - serialized_end=357, + serialized_start=198, + serialized_end=377, ) @@ -249,8 +314,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - 
serialized_start=359, - serialized_end=454, + serialized_start=379, + serialized_end=474, ) @@ -287,8 +352,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=550, - serialized_end=596, + serialized_start=570, + serialized_end=616, ) _COMMAND = _descriptor.Descriptor( @@ -331,8 +396,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=457, - serialized_end=596, + serialized_start=477, + serialized_end=616, ) @@ -376,8 +441,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=598, - serialized_end=682, + serialized_start=618, + serialized_end=702, ) @@ -422,8 +487,8 @@ _IMAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=684, - serialized_end=790, + serialized_start=704, + serialized_end=810, ) @@ -460,8 +525,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=792, - serialized_end=838, + serialized_start=812, + serialized_end=858, ) @@ -512,16 +577,15 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=840, - serialized_end=906, + serialized_start=860, + serialized_end=926, ) -_REPLY.fields_by_name['message'].enum_type = _REPLY_REPLYSTATUS +_REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS _REPLY_REPLYSTATUS.containing_type = _REPLY -_TASK.fields_by_name['status'].enum_type = _TASK_TASKSTATUS +_TASKMSG.fields_by_name['instanceStatus'].enum_type = _STATUS _TASK.fields_by_name['parameters'].message_type = _PARAMETERS _TASK.fields_by_name['cluster'].message_type = _CLUSTER -_TASK_TASKSTATUS.containing_type = _TASK _PARAMETERS.fields_by_name['command'].message_type = _COMMAND _COMMAND_ENVVARSENTRY.containing_type = _COMMAND _COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY @@ -531,6 +595,7 @@ _CLUSTER.fields_by_name['mount'].message_type = _MOUNT _IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE _IMAGE_IMAGETYPE.containing_type = _IMAGE DESCRIPTOR.message_types_by_name['Reply'] = _REPLY +DESCRIPTOR.message_types_by_name['TaskMsg'] = _TASKMSG DESCRIPTOR.message_types_by_name['Task'] = _TASK DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS DESCRIPTOR.message_types_by_name['Command'] = _COMMAND @@ -538,6 +603,7 @@ DESCRIPTOR.message_types_by_name['Cluster'] = _CLUSTER DESCRIPTOR.message_types_by_name['Image'] = _IMAGE DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE +DESCRIPTOR.enum_types_by_name['Status'] = _STATUS _sym_db.RegisterFileDescriptor(DESCRIPTOR) Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( @@ -547,6 +613,13 @@ Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), d )) _sym_db.RegisterMessage(Reply) +TaskMsg = _reflection.GeneratedProtocolMessageType('TaskMsg', (_message.Message,), dict( + DESCRIPTOR = _TASKMSG, + __module__ = 'rpc_pb2' + # @@protoc_insertion_point(class_scope:TaskMsg) + )) +_sym_db.RegisterMessage(TaskMsg) + Task = _reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( DESCRIPTOR = _TASK, __module__ = 'rpc_pb2' @@ -614,15 +687,15 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=908, - serialized_end=943, + serialized_start=1004, + serialized_end=1042, methods=[ _descriptor.MethodDescriptor( name='report', full_name='Master.report', index=0, containing_service=None, - 
input_type=_TASK, + input_type=_TASKMSG, output_type=_REPLY, options=None, ), @@ -638,8 +711,8 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=945, - serialized_end=986, + serialized_start=1044, + serialized_end=1085, methods=[ _descriptor.MethodDescriptor( name='process_task', diff --git a/src/protos/rpc_pb2_grpc.py b/src/protos/rpc_pb2_grpc.py index dc3de46..8116682 100644 --- a/src/protos/rpc_pb2_grpc.py +++ b/src/protos/rpc_pb2_grpc.py @@ -1,7 +1,7 @@ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! import grpc -import rpc_pb2 as rpc__pb2 +from protos import rpc_pb2 as rpc__pb2 class MasterStub(object): @@ -16,7 +16,7 @@ class MasterStub(object): """ self.report = channel.unary_unary( '/Master/report', - request_serializer=rpc__pb2.Task.SerializeToString, + request_serializer=rpc__pb2.TaskMsg.SerializeToString, response_deserializer=rpc__pb2.Reply.FromString, ) @@ -37,7 +37,7 @@ def add_MasterServicer_to_server(servicer, server): rpc_method_handlers = { 'report': grpc.unary_unary_rpc_method_handler( servicer.report, - request_deserializer=rpc__pb2.Task.FromString, + request_deserializer=rpc__pb2.TaskMsg.FromString, response_serializer=rpc__pb2.Reply.SerializeToString, ), } @@ -68,8 +68,8 @@ class WorkerServicer(object): pass def process_task(self, request, context): - """rpc add_task (Task) returns (Reply) {} - """ + # missing associated documentation comment in .proto file + pass context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index e70ba3d..69f7b1c 100644 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -1,16 +1,27 @@ #!/usr/bin/python3 +import sys +if sys.path[0].endswith("worker"): + sys.path[0] = sys.path[0][:-6] +from utils import env, tools +#config = env.getenv("CONFIG") +config = "/opt/docklet/local/docklet-running.conf" +tools.loadenv(config) +from utils.log import initlogging +initlogging("docklet-worker") +from utils.log import logger from concurrent import futures import grpc -from utils.log import logger -from utils import env -import json,lxc,subprocess,threading,os +#from utils.log import logger +#from utils import env +import json,lxc,subprocess,threading,os,time from utils import imagemgr from protos import rpc_pb2, rpc_pb2_grpc class TaskController(rpc_pb2_grpc.WorkerServicer): def __init__(self): + rpc_pb2_grpc.WorkerServicer.__init__(self) self.imgmgr = imagemgr.ImageMgr() self.fspath = env.getenv('FS_PREFIX') self.confpath = env.getenv('DOCKLET_CONF') @@ -20,22 +31,24 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): logger.info('TaskController init success') def process_task(self, request, context): - logger.info('excute task with parameter: ' + parameter) - parameter = json.loads(parameter) - jobid = parameter['JobId'] - taskid = parameter['TaskId'] - taskno = parameter['TaskNo'] - username = parameter['UserName'] - lxcname = '%s-%s-%s-%s' % (username,jobid,taskid,taskno) - command = '/root/getenv.sh' #parameter['Parameters']['Command']['CommandLine'] - envs = {'MYENV1':'MYVAL1', 'MYENV2':'MYVAL2'} #parameters['Parameters']['Command']['EnvVars'] - envs['TASK_NO']=taskno - image = parameter['ImageId'] - instance_type = parameter['InstanceType'] + logger.info('excute task with parameter: ' + str(request)) + taskid = request.id + instanceid = request.instanceid - status = 
self.imgmgr.prepareFS(username,image,lxcname,instance_type['disk']) + command = request.parameters.command.commandLine #'/root/getenv.sh' #parameter['Parameters']['Command']['CommandLine'] + #envs = {'MYENV1':'MYVAL1', 'MYENV2':'MYVAL2'} #parameters['Parameters']['Command']['EnvVars'] + envs = request.parameters.command.envVars + image = {} + image['name'] = request.cluster.image.name + image['type'] = 'private' if request.cluster.image.type == rpc_pb2.Image.PRIVATE else 'public' + image['owner'] = request.cluster.image.owner + username = request.username + lxcname = '%s-batch-%s-%s' % (username,taskid,str(instanceid)) + instance_type = request.cluster.instance + + status = self.imgmgr.prepareFS(username,image,lxcname,str(instance_type.disk)) if not status: - return [False, "Create container for batch failed when preparing filesystem"] + return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Create container for batch failed when preparing filesystem") rootfs = "/var/lib/lxc/%s/rootfs" % lxcname @@ -48,13 +61,16 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): def config_prepare(content): content = content.replace("%ROOTFS%",rootfs) - content = content.replace("%CONTAINER_MEMORY%",str(instance_type['memory'])) - content = content.replace("%CONTAINER_CPU%",str(instance_type['cpu']*100000)) + content = content.replace("%HOSTNAME%","batch-%s" % instanceid) + content = content.replace("%CONTAINER_MEMORY%",str(instance_type.memory)) + content = content.replace("%CONTAINER_CPU%",str(instance_type.cpu*100000)) content = content.replace("%FS_PREFIX%",self.fspath) + content = content.replace("%LXCSCRIPT%",env.getenv("LXC_SCRIPT")) content = content.replace("%USERNAME%",username) content = content.replace("%LXCNAME%",lxcname) return content + logger.info(self.confpath) conffile = open(self.confpath+"/container.batch.conf", 'r') conftext = conffile.read() conffile.close() @@ -68,18 +84,18 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): container = lxc.Container(lxcname) if not container.start(): logger.error('start container %s failed' % lxcname) - return True + return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") #return json.dumps({'success':'false','message': "start container failed"}) else: logger.info('start container %s success' % lxcname) #mount oss here - thread = threading.Thread(target = self.excute_task, args=(jobid,taskid,envs,lxcname,command)) - thread.setDaemon(True) - thread.start() + #thread = threading.Thread(target = self.excute_task, args=(jobid,taskid,envs,lxcname,command)) + #thread.setDaemon(True) + #thread.start() - return True + return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") #return json.dumps({'success':'true','message':'task is running'}) def excute_task(self,jobid,taskid,envs,lxcname,command): @@ -109,3 +125,20 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): logger.info("delete container %s success" % lxcname) else: logger.error("delete container %s failed" % lxcname) + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +def TaskControllerServe(): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + rpc_pb2_grpc.add_WorkerServicer_to_server(TaskController(), server) + server.add_insecure_port('[::]:50051') + server.start() + logger.info("Start TaskController Servicer") + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + +if __name__ == "__main__": + TaskControllerServe() From 2c8216e1437cfb834f74dda12cf256dba24e678d Mon Sep 17 00:00:00 2001 From: Gallen Date: Thu, 19 Jul 
2018 20:47:49 +0800 Subject: [PATCH 17/75] add simple resource manage to taskmgr --- src/master/taskmgr.py | 82 +++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 1b2e931..d0abfd6 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -25,7 +25,7 @@ class TaskReporter(MasterServicer): def report(self, request, context): self.taskmgr.on_task_report(request) - return Reply(message=Reply.ACCEPTED) + return Reply(status=Reply.ACCEPTED, message='') class TaskMgr(threading.Thread): @@ -42,9 +42,10 @@ class TaskMgr(threading.Thread): # nodes self.nodemgr = nodemgr - self.all_nodes = None - self.last_nodes_info_update_time = 0 - self.nodes_info_update_interval = 30 # (s) + self.cpu_usage = {} + # self.all_nodes = None + # self.last_nodes_info_update_time = 0 + # self.nodes_info_update_interval = 30 # (s) def run(self): @@ -85,11 +86,15 @@ class TaskMgr(threading.Thread): return instance['status'] = report.instanceStatus + if report.instanceStatus == Status.RUNNING: instance['last_update_time'] = time.time() - elif report.instanceStatus == Status.COMPLETED: + else: + self.cpu_usage[instance['worker']] -= task.cluster.instance.cpu + + if report.instanceStatus == Status.COMPLETED: check_task_completed(task) - elif report.instanceStatus == Status.FAILED || report.instanceStatus == Status.TIMEOUT: + elif report.instanceStatus == Status.FAILED or report.instanceStatus == Status.TIMEOUT: if instance['try_count'] > task.maxRetryCount: check_task_completed(task) else: @@ -101,9 +106,9 @@ class TaskMgr(threading.Thread): return failed = False for instance in task.instance_list: - if instance['status'] == Status.RUNNING || instance['status'] == Status.WAITING: + if instance['status'] == Status.RUNNING or instance['status'] == Status.WAITING: return - if instance['status'] == Status.FAILED || instance['status'] == Status.TIMEOUT: + if instance['status'] == Status.FAILED or instance['status'] == Status.TIMEOUT: if instance['try_count'] > task.maxRetryCount: failed = True else: @@ -133,15 +138,19 @@ class TaskMgr(threading.Thread): instance['last_update_time'] = time.time() instance['try_count'] += 1 instance['token'] = task.token + instance['worker'] = worker + + self.cpu_usage[worker] += task.cluster.instance.cpu try: logger.info('[task_processor] processing %s' % task.id) channel = grpc.insecure_channel('%s:50052' % worker) stub = WorkerStub(channel) response = stub.process_task(task) - logger.info('[task_processor] worker response: %d' response.message) + if response.status != Reply.ACCEPTED: + raise Exception(response.message) except Exception as e: - logger.error('[task_processor] rpc error message: %s' e) + logger.error('[task_processor] rpc error message: %s' % e) instance['status'] = Status.FAILED instance['try_count'] -= 1 @@ -154,7 +163,7 @@ class TaskMgr(threading.Thread): if worker is not None: # find instance to retry for instance, index in enumerate(task.instance_list): - if (instance['status'] == Status.FAILED || instance['status'] == Status.TIMEOUT) and instance['try_count'] <= task.maxRetryCount: + if (instance['status'] == Status.FAILED or instance['status'] == Status.TIMEOUT) and instance['try_count'] <= task.maxRetryCount: return task, index, worker elif instance['status'] == Status.RUNNING: if time.time() - instance['last_update_time'] > self.heart_beat_timeout: @@ -177,24 +186,46 @@ class TaskMgr(threading.Thread): logger.warning('[task_scheduler] running nodes not 
found') return None - for node in nodes: - # TODO - if True: - return node[0] + for worker_ip, worker_info in nodes: + if task.cluster.instance.cpu + get_cpu_usage(worker_ip) > worker_info['cpu']: + continue + if task.cluster.instance.memory > worker_info['memory']: + continue + if task.cluster.instance.disk > worker_info['disk']: + continue + if task.cluster.instance.gpu > worker_info['gpu']: + continue + return worker_ip return None def get_all_nodes(self): # cache running nodes - if self.all_nodes is not None and time.time() - self.last_nodes_info_update_time < self.nodes_info_update_interval: - return self.all_nodes + # if self.all_nodes is not None and time.time() - self.last_nodes_info_update_time < self.nodes_info_update_interval: + # return self.all_nodes # get running nodes node_ips = self.nodemgr.get_nodeips() - self.all_nodes = [] - for node_ip in node_ips: - fetcher = master.monitor.Fetcher(node_ip) - self.all_nodes.append((node_ip, fetcher.info)) - return self.all_nodes + all_nodes = [(node_ip, get_worker_resource_info(node_ip)) for node_ip in node_ips] + return all_nodes + + + def get_worker_resource_info(self, worker_ip): + fetcher = master.monitor.Fetcher(worker_ip) + worker_info = fetcher.info + info = {} + info['cpu'] = len(worker_info['cpuconfig']) + info['memory'] = worker_info['meminfo']['free'] / 1024 # (Mb) + info['disk'] = sum([disk['free'] for disk in worker_info['diskinfo']]) / 1024 / 1024 # (Mb) + info['gpu'] = 0 # not support yet + return info + + + def get_cpu_usage(self, worker_ip): + try: + return self.cpu_usage[worker_ip] + except: + self.cpu_usage[worker_ip] = 0 + return 0 def set_jobmgr(self, jobmgr): @@ -222,7 +253,7 @@ class TaskMgr(threading.Thread): stderrRedirectPath = json_task['parameters']['stderrRedirectPath'], stdoutRedirectPath = json_task['parameters']['stdoutRedirectPath']), cluster = Cluster( - ,image = Image( + image = Image( name = json_task['cluster']['image']['name'], type = json_task['cluster']['image']['type'], owner = json_task['cluster']['image']['owner']), @@ -231,9 +262,8 @@ class TaskMgr(threading.Thread): memory = json_task['cluster']['instance']['memory'], disk = json_task['cluster']['instance']['disk'], gpu = json_task['cluster']['instance']['gpu']))) - task.cluster.mount = [] - for mount in json_task['cluster']['mount']: - task.cluster.mount.append(Mount(localPath=mount['localPath'], remotePath=mount['remotePath'])) + task.cluster.mount = [Mount(localPath=mount['localPath'], remotePath=mount['remotePath']) + for mount in json_task['cluster']['mount']] # local properties task.status = Status.WAITING From ef11ea24dbf6e94fe361e0d2676779cec9fcafc7 Mon Sep 17 00:00:00 2001 From: Gallen Date: Fri, 20 Jul 2018 14:07:22 +0800 Subject: [PATCH 18/75] fix bug --- src/master/taskmgr.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index d0abfd6..b78e3b4 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -85,13 +85,12 @@ class TaskMgr(threading.Thread): logger.warning('[on_task_report] wrong token') return - instance['status'] = report.instanceStatus - - if report.instanceStatus == Status.RUNNING: - instance['last_update_time'] = time.time() - else: + if instance['status'] == Status.RUNNING and report.instanceStatus != Status.RUNNING: self.cpu_usage[instance['worker']] -= task.cluster.instance.cpu + instance['status'] = report.instanceStatus + instance['last_update_time'] = time.time() + if report.instanceStatus == Status.COMPLETED: 
check_task_completed(task) elif report.instanceStatus == Status.FAILED or report.instanceStatus == Status.TIMEOUT: From c0d55267dd3be3a9ad3b658427f06a557fc71cb0 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Wed, 25 Jul 2018 17:39:55 +0800 Subject: [PATCH 19/75] add managing of ip addresses in taskcontroller. --- bin/docklet-worker | 3 ++ conf/container.batch.conf | 2 ++ conf/docklet.conf.template | 3 ++ src/master/httprest.py | 4 +-- src/master/testTaskCtrler.py | 2 +- src/protos/rpc.proto | 5 ++-- src/protos/rpc_pb2.py | 34 +++++++++++++---------- src/utils/env.py | 2 ++ src/worker/taskcontroller.py | 54 ++++++++++++++++++++++++++++++++---- 9 files changed, 84 insertions(+), 25 deletions(-) diff --git a/bin/docklet-worker b/bin/docklet-worker index 02f1a01..2d1f9c8 100755 --- a/bin/docklet-worker +++ b/bin/docklet-worker @@ -20,6 +20,8 @@ FS_PREFIX=/opt/docklet # cluster net ip range, default is 172.16.0.1/16 CLUSTER_NET="172.16.0.1/16" +# ip addresses range of containers for batch job, default is 10.0.3.0/24 +BATCH_NET="10.0.3.0/24" #configurable-http-proxy public port, default is 8000 PROXY_PORT=8000 #configurable-http-proxy api port, default is 8001 @@ -83,6 +85,7 @@ pre_start () { # iptables for NAT network for containers to access web iptables -t nat -F iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE + iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE if [ ! -d $FS_PREFIX/local/basefs ]; then log_daemon_msg "basefs does not exist, run prepare.sh first" && exit 1 diff --git a/conf/container.batch.conf b/conf/container.batch.conf index 96ef497..f91af20 100644 --- a/conf/container.batch.conf +++ b/conf/container.batch.conf @@ -25,6 +25,8 @@ lxc.network.type = veth lxc.network.name = eth0 lxc.network.link = lxcbr0 lxc.network.flags = up +lxc.network.ipv4 = %IP% +lxc.network.ipv4.gateway = %GATEWAY% lxc.cgroup.pids.max = 2048 lxc.cgroup.memory.limit_in_bytes = %CONTAINER_MEMORY%M diff --git a/conf/docklet.conf.template b/conf/docklet.conf.template index e9c838b..9488804 100644 --- a/conf/docklet.conf.template +++ b/conf/docklet.conf.template @@ -47,6 +47,9 @@ # CLUSTER_NET: cluster network ip address range, default is 172.16.0.1/16 # CLUSTER_NET=172.16.0.1/16 +# BATCH_NET: ip addresses range of containers for batch job, default is 10.0.3.0/24 +# BATCH_NET=10.0.3.0/24 + # Deprecated since v0.2.7. read from quota group set in web admin page # CONTAINER_CPU: CPU quota of container, default is 100000 # A single CPU core has total=100000 (100ms), so the default 100000 diff --git a/src/master/httprest.py b/src/master/httprest.py index 7ef349e..499b4c4 100755 --- a/src/master/httprest.py +++ b/src/master/httprest.py @@ -903,11 +903,11 @@ if __name__ == '__main__': G_networkmgr.printpools() G_cloudmgr = cloudmgr.CloudMgr() - G_taskmgr = taskmgr.TaskMgr() + '''G_taskmgr = taskmgr.TaskMgr() G_jobmgr = jobmgr.JobMgr(taskmgr) G_jobmgr.start() G_taskmgr.set_jobmgr(G_jobmgr) - G_taskmgr.start() + G_taskmgr.start()''' # start NodeMgr and NodeMgr will wait for all nodes to start ... 
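
The %IP% and %GATEWAY% placeholders added to conf/container.batch.conf above are filled by a per-worker address pool that this patch adds to src/worker/taskcontroller.py further down, carved out of BATCH_NET. A condensed, self-contained sketch of that address arithmetic under the default BATCH_NET=10.0.3.0/24 (variable names and the print are illustrative, not taken from the patch):

    def ip_to_int(addr):
        a, b, c, d = (int(x) for x in addr.split('.'))
        return (a << 24) + (b << 16) + (c << 8) + d

    def int_to_ip(num):
        return '.'.join(str((num >> s) & 255) for s in (24, 16, 8, 0))

    base, prefix = "10.0.3.0/24".split('/')
    host_bits = 32 - int(prefix)                  # 8 host bits in a /24
    ipbase = ip_to_int(base)                      # integer form of 10.0.3.0
    free = list(range(2, (1 << host_bits) - 1))   # host indices 2..254; 0, 1 (the gateway) and 255 are left out
    first = int_to_ip(ipbase + free[0])
    print(first + '/' + prefix)                   # 10.0.3.2/24, what acquire_ip() would hand out first
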
G_nodemgr = nodemgr.NodeMgr(G_networkmgr, etcdclient, addr = ipaddr, mode=mode) diff --git a/src/master/testTaskCtrler.py b/src/master/testTaskCtrler.py index 7d82650..f4b6f50 100644 --- a/src/master/testTaskCtrler.py +++ b/src/master/testTaskCtrler.py @@ -13,7 +13,7 @@ def run(): comm = rpc_pb2.Command(commandLine="echo hello_world > test.txt", packagePath=".", envVars={}) paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="", stdoutRedirectPath="") - img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.PUBLIC, owner="docklet") + img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet") inst = rpc_pb2.Instance(cpu=2, memory=2000, disk=500, gpu=0) mnt = rpc_pb2.Mount(localPath="",remotePath="") clu = rpc_pb2.Cluster(image=img, instance=inst, mount=[mnt]) diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index 46c87bd..530ccb7 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -68,8 +68,9 @@ message Image { string owner = 3; // 所有者 enum ImageType { - PUBLIC = 0; - PRIVATE = 1; + BASE = 0; + PUBLIC = 1; + PRIVATE = 2; } } diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 509d3c4..15fd3a2 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\trpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb3\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"j\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\"$\n\tImageType\x12\n\n\x06PUBLIC\x10\x00\x12\x0b\n\x07PRIVATE\x10\x01\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + 
serialized_pb=_b('\n\trpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb3\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') ) _STATUS = _descriptor.EnumDescriptor( @@ -52,8 +52,8 @@ _STATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=928, - serialized_end=1002, + serialized_start=938, + serialized_end=1012, ) _sym_db.RegisterEnumDescriptor(_STATUS) @@ -94,18 +94,22 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( file=DESCRIPTOR, values=[ _descriptor.EnumValueDescriptor( - name='PUBLIC', index=0, number=0, + name='BASE', index=0, number=0, options=None, type=None), _descriptor.EnumValueDescriptor( - name='PRIVATE', index=1, number=1, + name='PUBLIC', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PRIVATE', index=2, number=2, options=None, type=None), ], containing_type=None, options=None, serialized_start=774, - serialized_end=810, + serialized_end=820, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -488,7 +492,7 @@ _IMAGE = _descriptor.Descriptor( oneofs=[ ], serialized_start=704, - serialized_end=810, + serialized_end=820, ) @@ -525,8 +529,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=812, - serialized_end=858, + serialized_start=822, + serialized_end=868, ) @@ -577,8 +581,8 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=860, - serialized_end=926, + 
serialized_start=870, + serialized_end=936, ) _REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS @@ -687,8 +691,8 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=1004, - serialized_end=1042, + serialized_start=1014, + serialized_end=1052, methods=[ _descriptor.MethodDescriptor( name='report', @@ -711,8 +715,8 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=1044, - serialized_end=1085, + serialized_start=1054, + serialized_end=1095, methods=[ _descriptor.MethodDescriptor( name='process_task', diff --git a/src/utils/env.py b/src/utils/env.py index 3fcc8d1..458aff8 100755 --- a/src/utils/env.py +++ b/src/utils/env.py @@ -9,6 +9,8 @@ def getenv(key): return int(os.environ.get("CLUSTER_SIZE", 1)) elif key == "CLUSTER_NET": return os.environ.get("CLUSTER_NET", "172.16.0.1/16") + elif key == "BATCH_NET": + return os.environ.get("BATCH_NET","10.0.3.0/24") elif key == "CONTAINER_CPU": return int(os.environ.get("CONTAINER_CPU", 100000)) elif key == "CONTAINER_DISK": diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index 69f7b1c..ac3d4f9 100644 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -18,6 +18,13 @@ import json,lxc,subprocess,threading,os,time from utils import imagemgr from protos import rpc_pb2, rpc_pb2_grpc +def ip_to_int(addr): + [a, b, c, d] = addr.split('.') + return (int(a)<<24) + (int(b)<<16) + (int(c)<<8) + int(d) + +def int_to_ip(num): + return str((num>>24)&255)+"."+str((num>>16)&255)+"."+str((num>>8)&255)+"."+str(num&255) + class TaskController(rpc_pb2_grpc.WorkerServicer): def __init__(self): @@ -25,29 +32,63 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): self.imgmgr = imagemgr.ImageMgr() self.fspath = env.getenv('FS_PREFIX') self.confpath = env.getenv('DOCKLET_CONF') - #self.masterip = '162.105.88.190' - #self.masterport = 9002 - #self.masterrpc = xmlrpc.client.ServerProxy("http://%s:%s" % (self.masterip,self.masterport)) + self.cons_gateway = '10.0.3.1' + self.cons_ips = '10.0.3.0/24' + + self.cidr = 32 - int(self.cons_ips.split('/')[1]) + self.ipbase = ip_to_int(self.cons_ips.split('/')[0]) + self.free_ips = [] + for i in range(2, (1 << self.cidr) - 1): + self.free_ips.append(i) + logger.info('TaskController init success') + # Need Locks + def acquire_ip(self): + if len(self.free_ips) == 0: + return [False, "No free ips"] + ip = int_to_ip(self.ipbase + self.free_ips[0]) + self.free_ips.remove(self.free_ips[0]) + logger.info(str(self.free_ips)) + return [True, ip + "/" + str(32 - self.cidr)] + + # Need Locks + def release_ip(self,ipstr): + ipnum = ip_to_int(ipstr.split('/')[0]) - self.ipbase + self.free_ips.append(ipnum) + logger.info(str(self.free_ips)) + def process_task(self, request, context): logger.info('excute task with parameter: ' + str(request)) taskid = request.id instanceid = request.instanceid + # get config from request command = request.parameters.command.commandLine #'/root/getenv.sh' #parameter['Parameters']['Command']['CommandLine'] #envs = {'MYENV1':'MYVAL1', 'MYENV2':'MYVAL2'} #parameters['Parameters']['Command']['EnvVars'] envs = request.parameters.command.envVars image = {} image['name'] = request.cluster.image.name - image['type'] = 'private' if request.cluster.image.type == rpc_pb2.Image.PRIVATE else 'public' + if request.cluster.image.type == rpc_pb2.Image.PRIVATE: + image['type'] = 'private' + elif request.cluster.image.type == rpc_pb2.Image.PUBLIC: + image['type'] = 'public' + 
else: + image['type'] = 'base' image['owner'] = request.cluster.image.owner username = request.username lxcname = '%s-batch-%s-%s' % (username,taskid,str(instanceid)) instance_type = request.cluster.instance + # acquire ip + [status, ip] = self.acquire_ip() + if not status: + return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=ip) + + # prepare image and filesystem status = self.imgmgr.prepareFS(username,image,lxcname,str(instance_type.disk)) if not status: + self.release_ip(ip) return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Create container for batch failed when preparing filesystem") rootfs = "/var/lib/lxc/%s/rootfs" % lxcname @@ -68,6 +109,8 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): content = content.replace("%LXCSCRIPT%",env.getenv("LXC_SCRIPT")) content = content.replace("%USERNAME%",username) content = content.replace("%LXCNAME%",lxcname) + content = content.replace("%IP%",ip) + content = content.replace("%GATEWAY%",self.cons_gateway) return content logger.info(self.confpath) @@ -84,7 +127,8 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): container = lxc.Container(lxcname) if not container.start(): logger.error('start container %s failed' % lxcname) - return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") + self.release_ip(ip) + return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container") #return json.dumps({'success':'false','message': "start container failed"}) else: logger.info('start container %s success' % lxcname) From e75dbb928d692390d0ad795a75e5b4fa93b622f6 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Thu, 26 Jul 2018 17:49:10 +0800 Subject: [PATCH 20/75] add lock to ip mananging of task controller --- src/worker/taskcontroller.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index ac3d4f9..7b865b1 100644 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -32,6 +32,7 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): self.imgmgr = imagemgr.ImageMgr() self.fspath = env.getenv('FS_PREFIX') self.confpath = env.getenv('DOCKLET_CONF') + self.lock = threading.Lock() self.cons_gateway = '10.0.3.1' self.cons_ips = '10.0.3.0/24' @@ -45,18 +46,22 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): # Need Locks def acquire_ip(self): + self.lock.acquire() if len(self.free_ips) == 0: return [False, "No free ips"] ip = int_to_ip(self.ipbase + self.free_ips[0]) self.free_ips.remove(self.free_ips[0]) logger.info(str(self.free_ips)) + self.lock.release() return [True, ip + "/" + str(32 - self.cidr)] # Need Locks def release_ip(self,ipstr): + self.lock.acquire() ipnum = ip_to_int(ipstr.split('/')[0]) - self.ipbase self.free_ips.append(ipnum) logger.info(str(self.free_ips)) + self.lock.release() def process_task(self, request, context): logger.info('excute task with parameter: ' + str(request)) @@ -173,7 +178,7 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): _ONE_DAY_IN_SECONDS = 60 * 60 * 24 def TaskControllerServe(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=5)) rpc_pb2_grpc.add_WorkerServicer_to_server(TaskController(), server) server.add_insecure_port('[::]:50051') server.start() From 4c0891938dd7d70cc0bd1ce84032785b2fdd1d1d Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Fri, 27 Jul 2018 18:57:03 +0800 Subject: [PATCH 21/75] Enable taskcontroller to excecute command --- src/master/testTaskCtrler.py | 2 +- 
src/worker/taskcontroller.py | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/master/testTaskCtrler.py b/src/master/testTaskCtrler.py index f4b6f50..0ea7d81 100644 --- a/src/master/testTaskCtrler.py +++ b/src/master/testTaskCtrler.py @@ -10,7 +10,7 @@ def run(): channel = grpc.insecure_channel('localhost:50051') stub = rpc_pb2_grpc.WorkerStub(channel) - comm = rpc_pb2.Command(commandLine="echo hello_world > test.txt", packagePath=".", envVars={}) + comm = rpc_pb2.Command(commandLine=r"echo \"s\" | awk '{print \"test\n\\\"\"}' > test.txt;cat test.txt", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}' paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="", stdoutRedirectPath="") img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet") diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index 7b865b1..39c21d9 100644 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -71,7 +71,10 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): # get config from request command = request.parameters.command.commandLine #'/root/getenv.sh' #parameter['Parameters']['Command']['CommandLine'] #envs = {'MYENV1':'MYVAL1', 'MYENV2':'MYVAL2'} #parameters['Parameters']['Command']['EnvVars'] + pkgpath = request.parameters.command.packagePath envs = request.parameters.command.envVars + envs['taskid'] = str(taskid) + envs['instanceid'] = str(instanceid) image = {} image['name'] = request.cluster.image.name if request.cluster.image.type == rpc_pb2.Image.PRIVATE: @@ -134,32 +137,31 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): logger.error('start container %s failed' % lxcname) self.release_ip(ip) return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container") - #return json.dumps({'success':'false','message': "start container failed"}) else: logger.info('start container %s success' % lxcname) #mount oss here - #thread = threading.Thread(target = self.excute_task, args=(jobid,taskid,envs,lxcname,command)) - #thread.setDaemon(True) - #thread.start() + thread = threading.Thread(target = self.excute_task, args=(taskid,instanceid,envs,lxcname,pkgpath,command,ip)) + thread.setDaemon(True) + thread.start() return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") - #return json.dumps({'success':'true','message':'task is running'}) - def excute_task(self,jobid,taskid,envs,lxcname,command): + def excute_task(self,taskid,instanceid,envs,lxcname,pkgpath,command,ip): cmd = "lxc-attach -n " + lxcname for envkey,envval in envs.items(): cmd = cmd + " -v %s=%s" % (envkey,envval) - cmd = cmd + " " + command + cmd = cmd + " -- /bin/bash -c " + "\"cd " + pkgpath + ";" + command + "\"" logger.info('run task with command - %s' % cmd) - Ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) - if Ret == 0: + ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) + logger.info(ret) + if ret.returncode == 0: #call master rpc function to tell the taskmgr - self.masterrpc.complete_task(jobid,taskid) + pass else: - self.masterrpc.fail_task(jobid,taskid) #call master rpc function to tell the wrong + pass #umount oss here @@ -175,6 +177,10 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): else: logger.error("delete container %s failed" % lxcname) + logger.info("release ip address %s" % ip) + self.release_ip(ip) + + _ONE_DAY_IN_SECONDS = 60 * 60 * 24 def TaskControllerServe(): From 
f613591ffe48102b90c1d9fb4f528c589a78116b Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Sat, 28 Jul 2018 17:32:01 +0800 Subject: [PATCH 22/75] let taskcontroller be started by docklet-worker --- bin/docklet-worker | 44 ++++++++++++++++++++++++++++++++++++ conf/docklet.conf.template | 32 +++++++++++++++++++++++--- src/utils/env.py | 14 ++++++++++-- src/worker/taskcontroller.py | 22 +++++++++++------- 4 files changed, 99 insertions(+), 13 deletions(-) mode change 100644 => 100755 src/worker/taskcontroller.py diff --git a/bin/docklet-worker b/bin/docklet-worker index 2d1f9c8..55d6f92 100755 --- a/bin/docklet-worker +++ b/bin/docklet-worker @@ -45,6 +45,13 @@ DAEMON_OPTS= # The process ID of the script when it runs is stored here: PIDFILE=$RUN_DIR/$DAEMON_NAME.pid +# settings for docklet batch worker, which is required for batch job processing system +BATCH_ON=True +DAEMON_BATCH=$DOCKLET_LIB/worker/taskcontroller.py +DAEMON_NAME_BATCH=docklet-taskcontroller +PIDFILE_BATCH=$RUN_DIR/batch.pid +DAEMON_OPTS_BATCH= + # settings for docklet proxy, which is required for web access DAEMON_PROXY=`which configurable-http-proxy` DAEMON_NAME_PROXY=docklet-proxy @@ -104,6 +111,19 @@ do_start() { log_end_msg $? } +do_start_batch () { + if [ "$BATCH_ON" = "False" ] + then + return 1 + fi + log_daemon_msg "Starting $DAEMON_NAME_BATCH in $FS_PREFIX" + + DAEMON_OPTS_BATCH="" + + start-stop-daemon --start --background --pidfile $PIDFILE_BATCH --make-pidfile --user $DAEMON_USER --chuid $DAEMON_USER --startas $DAEMON_BATCH -- $DAEMON_OPTS_BATCH + log_end_msg $? +} + do_start_proxy () { if [ "$DISTRIBUTED_GATEWAY" = "False" ] then @@ -121,6 +141,16 @@ do_stop () { log_end_msg $? } +do_stop_batch () { + if [ "$BATCH_ON" = "False" ] + then + return 1 + fi + log_daemon_msg "Stopping $DAEMON_NAME_BATCH daemon" + start-stop-daemon --stop --quiet --oknodo --remove-pidfile --pidfile $PIDFILE_BATCH --retry 10 + log_end_msg $? +} + do_stop_proxy () { if [ "$DISTRIBUTED_GATEWAY" = "False" ] then @@ -149,11 +179,13 @@ do_stop_meter() { case "$1" in start) do_start + do_start_batch do_start_proxy ;; stop) do_stop + do_stop_batch do_stop_proxy ;; start-meter) @@ -164,6 +196,15 @@ case "$1" in do_stop_meter ;; + start_batch) + pre_start + do_start_batch + ;; + + stop_batch) + do_stop_batch + ;; + start_proxy) do_start_proxy ;; @@ -179,13 +220,16 @@ case "$1" in restart) do_stop + do_stop_batch do_stop_proxy do_start + do_start_batch do_start_proxy ;; status) status_of_proc -p $PIDFILE "$DAEMON" "$DAEMON_NAME" && exit 0 || exit $? + status_of_proc -p $PIDFILE_BATCH "$DAEMON_BATCH" "$DAEMON_NAME_BATCH" || status=$? status_of_proc -p $PIDFILE_PROXY "$DAEMON_PROXY" "$DAEMON_NAME_PROXY" || status=$? ;; *) diff --git a/conf/docklet.conf.template b/conf/docklet.conf.template index 9488804..912b6e5 100644 --- a/conf/docklet.conf.template +++ b/conf/docklet.conf.template @@ -47,9 +47,6 @@ # CLUSTER_NET: cluster network ip address range, default is 172.16.0.1/16 # CLUSTER_NET=172.16.0.1/16 -# BATCH_NET: ip addresses range of containers for batch job, default is 10.0.3.0/24 -# BATCH_NET=10.0.3.0/24 - # Deprecated since v0.2.7. 
read from quota group set in web admin page # CONTAINER_CPU: CPU quota of container, default is 100000 # A single CPU core has total=100000 (100ms), so the default 100000 @@ -185,3 +182,32 @@ # ALLOW_SCALE_OUT: allow docklet to rent server on the cloud to scale out # Only when you deploy docklet on the cloud can you set it to True # ALLOW_SCALE_OUT=False + +# ================================================== +# +# Batch Config +# +# ================================================== + +# BATCH_ON: whether to start batch job processing system when start +# the docklet. Default: True +# BATCH_ON=True + +# BATCH_MASTER_PORT: the rpc server port on master. +# default: 50050 +# BATCH_MASTER_PORT=50050 + +# BATCH_WORKER_PORT: the rpc server port on worker. +# default: 50051 +# BATCH_WORKER_PORT=50051 + +# BATCH_GATEWAY: the ip address of gateway for the containers processing +# batch jobs. default: 10.0.3.1 +# BATCH_GATEWAY=10.0.3.1 + +# BATCH_NET: ip addresses range of containers for batch job, default is 10.0.3.0/24 +# BATCH_NET=10.0.3.0/24 + +# BATCH_MAX_THREAD_WORKER: the maximun number of threads of the rpc server on +# the batch job worker. default:5 +# BATCH_MAX_THREAD_WORKER=5 diff --git a/src/utils/env.py b/src/utils/env.py index 458aff8..d999516 100755 --- a/src/utils/env.py +++ b/src/utils/env.py @@ -9,8 +9,6 @@ def getenv(key): return int(os.environ.get("CLUSTER_SIZE", 1)) elif key == "CLUSTER_NET": return os.environ.get("CLUSTER_NET", "172.16.0.1/16") - elif key == "BATCH_NET": - return os.environ.get("BATCH_NET","10.0.3.0/24") elif key == "CONTAINER_CPU": return int(os.environ.get("CONTAINER_CPU", 100000)) elif key == "CONTAINER_DISK": @@ -81,5 +79,17 @@ def getenv(key): return os.environ.get("ALLOCATED_PORTS","10000-65535") elif key =="ALLOW_SCALE_OUT": return os.environ.get("ALLOW_SCALE_OUT", "False") + elif key == "BATCH_ON": + return os.environ.get("BATCH_ON","True") + elif key == "BATCH_MASTER_PORT": + return os.environ.get("BATCH_MASTER_PORT","50050") + elif key == "BATCH_WORKER_PORT": + return os.environ.get("BATCH_WORKER_PORT","50051") + elif key == "BATCH_GATEWAY": + return os.environ.get("BATCH_GATEWAY","10.0.3.1") + elif key == "BATCH_NET": + return os.environ.get("BATCH_NET","10.0.3.0/24") + elif key == "BATCH_MAX_THREAD_WORKER": + return os.environ.get("BATCH_MAX_THREAD_WORKER","5") else: return os.environ.get(key,"") diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py old mode 100644 new mode 100755 index 39c21d9..a1bc028 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -3,11 +3,11 @@ import sys if sys.path[0].endswith("worker"): sys.path[0] = sys.path[0][:-6] from utils import env, tools -#config = env.getenv("CONFIG") -config = "/opt/docklet/local/docklet-running.conf" +config = env.getenv("CONFIG") +#config = "/opt/docklet/local/docklet-running.conf" tools.loadenv(config) from utils.log import initlogging -initlogging("docklet-worker") +initlogging("docklet-taskcontroller") from utils.log import logger from concurrent import futures @@ -33,14 +33,17 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): self.fspath = env.getenv('FS_PREFIX') self.confpath = env.getenv('DOCKLET_CONF') self.lock = threading.Lock() - self.cons_gateway = '10.0.3.1' - self.cons_ips = '10.0.3.0/24' + self.cons_gateway = env.getenv('BATCH_GATEWAY') + self.cons_ips = env.getenv('BATCH_NET') + logger.info("Batch gateway ip address %s" % self.cons_gateway) + logger.info("Batch ip pools %s" % self.cons_ips) self.cidr = 32 - 
int(self.cons_ips.split('/')[1]) self.ipbase = ip_to_int(self.cons_ips.split('/')[0]) self.free_ips = [] for i in range(2, (1 << self.cidr) - 1): self.free_ips.append(i) + logger.info("Free ip addresses pool %s" % str(self.free_ips)) logger.info('TaskController init success') @@ -184,11 +187,14 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): _ONE_DAY_IN_SECONDS = 60 * 60 * 24 def TaskControllerServe(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=5)) + max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER')) + worker_port = int(env.getenv('BATCH_WORKER_PORT')) + logger.info("Max Threads on a worker is %d" % max_threads) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_threads)) rpc_pb2_grpc.add_WorkerServicer_to_server(TaskController(), server) - server.add_insecure_port('[::]:50051') + server.add_insecure_port('[::]:'+str(worker_port)) server.start() - logger.info("Start TaskController Servicer") + logger.info("Start TaskController Servicer on port:%d" % worker_port) try: while True: time.sleep(_ONE_DAY_IN_SECONDS) From fb6046e34eb9ae837173b073a43cfd0d4f8414f8 Mon Sep 17 00:00:00 2001 From: Gallen Date: Sat, 28 Jul 2018 18:09:01 +0800 Subject: [PATCH 23/75] add simple taskmgr testing, test & fix bugs, modify rpc.proto --- src/master/taskmgr.py | 138 ++++++++++++++++++++----------------- src/master/testTaskMgr.py | 128 ++++++++++++++++++++++++++++++++++ src/protos/rpc.proto | 44 ++++++------ src/protos/rpc_pb2.py | 124 ++++++++++++++++----------------- src/protos/rpc_pb2_grpc.py | 18 ++--- 5 files changed, 297 insertions(+), 155 deletions(-) create mode 100644 src/master/testTaskMgr.py diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index b78e3b4..0bb8b0b 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -4,20 +4,26 @@ import string import random import json -import master.monitor - # must import logger after initlogging, ugly -from utils.log import initlogging -initlogging("docklet-taskmgr") -from utils.log import logger +# from utils.log import initlogging +# initlogging("docklet-taskmgr") +# from utils.log import logger # grpc from concurrent import futures import grpc -from protos.rpc_pb2 import Task, TaskMsg, Status, Reply, Parameters, Cluster, Command, Image, Mount, Instance +from protos.rpc_pb2 import * from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub +class Task(): + def __init__(self, info): + self.info = info + self.status = WAITING + self.instance_list = [] + self.token = '' + + class TaskReporter(MasterServicer): def __init__(self, taskmgr): @@ -33,15 +39,17 @@ class TaskMgr(threading.Thread): # load task information from etcd # initial a task queue and task schedueler # taskmgr: a taskmgr instance - def __init__(self, nodemgr): + def __init__(self, nodemgr, monitor_fetcher, logger): threading.Thread.__init__(self) self.thread_stop = False self.jobmgr = None self.task_queue = [] - self.heart_beat_timeout = 60 # (s) + self.heart_beat_timeout = 5 # (s) + self.logger = logger # nodes self.nodemgr = nodemgr + self.monitor_fetcher = monitor_fetcher self.cpu_usage = {} # self.all_nodes = None # self.last_nodes_info_update_time = 0 @@ -63,136 +71,146 @@ class TaskMgr(threading.Thread): add_MasterServicer_to_server(TaskReporter(self), self.server) self.server.add_insecure_port('[::]:50051') self.server.start() - logger.info('[taskmgr_rpc] start rpc server') + self.logger.info('[taskmgr_rpc] start rpc server') def stop(self): self.thread_stop = True self.server.stop(0) - 
logger.info('[taskmgr_rpc] stop rpc server') + self.logger.info('[taskmgr_rpc] stop rpc server') # this method is called when worker send heart-beat rpc request def on_task_report(self, report): - logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus)) + self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus)) task = get_task(report.taskid) if task == None: - logger.error('[on_task_report] task not found') + self.logger.error('[on_task_report] task not found') return instance = task.instance_list[report.instanceid] if instance['token'] != report.token: - logger.warning('[on_task_report] wrong token') + self.logger.warning('[on_task_report] wrong token') return - if instance['status'] == Status.RUNNING and report.instanceStatus != Status.RUNNING: - self.cpu_usage[instance['worker']] -= task.cluster.instance.cpu + if instance['status'] == RUNNING and report.instanceStatus != RUNNING: + self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu instance['status'] = report.instanceStatus instance['last_update_time'] = time.time() - if report.instanceStatus == Status.COMPLETED: + if report.instanceStatus == COMPLETED: check_task_completed(task) - elif report.instanceStatus == Status.FAILED or report.instanceStatus == Status.TIMEOUT: - if instance['try_count'] > task.maxRetryCount: + elif report.instanceStatus == FAILED or report.instanceStatus == TIMEOUT: + if instance['try_count'] > task.info.maxRetryCount: check_task_completed(task) else: - logger.error('[on_task_report] receive report from waiting task') + self.logger.error('[on_task_report] receive report from waiting task') def check_task_completed(self, task): - if len(task.instance_list) < task.instanceCount: + if len(task.instance_list) < task.info.instanceCount: return failed = False for instance in task.instance_list: - if instance['status'] == Status.RUNNING or instance['status'] == Status.WAITING: + if instance['status'] == RUNNING or instance['status'] == WAITING: return - if instance['status'] == Status.FAILED or instance['status'] == Status.TIMEOUT: - if instance['try_count'] > task.maxRetryCount: + if instance['status'] == FAILED or instance['status'] == TIMEOUT: + if instance['try_count'] > task.info.maxRetryCount: failed = True else: return if self.jobmgr is None: - logger.error('[check_task_completed] jobmgr is None!') + self.logger.error('[check_task_completed] jobmgr is None!') return if failed: # TODO tell jobmgr task failed - task.status = Status.FAILED + task.status = FAILED + self.jobmgr.report(task) else: # TODO tell jobmgr task completed - task.status = Status.COMPLETED - logger.info('task %s completed' % task.id) + task.status = COMPLETED + self.jobmgr.report(task) + self.logger.info('task %s completed' % task.info.id) self.task_queue.remove(task) def task_processor(self, task, instance_id, worker): - task.status = Status.RUNNING + task.status = RUNNING # properties for transaction - task.instanceid = instance_id + task.info.instanceid = instance_id task.token = ''.join(random.sample(string.ascii_letters + string.digits, 8)) instance = task.instance_list[instance_id] - instance['status'] = Status.RUNNING + instance['status'] = RUNNING instance['last_update_time'] = time.time() instance['try_count'] += 1 instance['token'] = task.token instance['worker'] = worker - self.cpu_usage[worker] += task.cluster.instance.cpu + self.cpu_usage[worker] += 
task.info.cluster.instance.cpu try: - logger.info('[task_processor] processing %s' % task.id) + self.logger.info('[task_processor] processing task [%s] instance [%d]' % (task.info.id, task.info.instanceid)) channel = grpc.insecure_channel('%s:50052' % worker) stub = WorkerStub(channel) - response = stub.process_task(task) + response = stub.process_task(task.info) if response.status != Reply.ACCEPTED: raise Exception(response.message) except Exception as e: - logger.error('[task_processor] rpc error message: %s' % e) - instance['status'] = Status.FAILED + self.logger.error('[task_processor] rpc error message: %s' % e) + instance['status'] = FAILED instance['try_count'] -= 1 # return task, worker def task_scheduler(self): # simple FIFO + self.logger.info('[task_scheduler] scheduling...') for task in self.task_queue: worker = self.find_proper_worker(task) - if worker is not None: + + for index, instance in enumerate(task.instance_list): # find instance to retry - for instance, index in enumerate(task.instance_list): - if (instance['status'] == Status.FAILED or instance['status'] == Status.TIMEOUT) and instance['try_count'] <= task.maxRetryCount: + if (instance['status'] == FAILED or instance['status'] == TIMEOUT) and instance['try_count'] <= task.info.maxRetryCount: + if worker is not None: + self.logger.info('[task_scheduler] retry') return task, index, worker - elif instance['status'] == Status.RUNNING: - if time.time() - instance['last_update_time'] > self.heart_beat_timeout: - instance['status'] = Status.FAILED - instance['token'] = '' + # find timeout instance + elif instance['status'] == RUNNING: + if time.time() - instance['last_update_time'] > self.heart_beat_timeout: + instance['status'] = FAILED + instance['token'] = '' + self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu + + self.logger.warning('[task_scheduler] worker timeout task [%s] instance [%d]' % (task.info.id, index)) + if worker is not None: return task, index, worker + if worker is not None: # start new instance - if len(task.instance_list) < task.instanceCount: + if len(task.instance_list) < task.info.instanceCount: instance = {} instance['try_count'] = 0 task.instance_list.append(instance) return task, len(task.instance_list) - 1, worker - return None - + return None, None, None def find_proper_worker(self, task): - nodes = get_all_nodes() + nodes = self.get_all_nodes() if nodes is None or len(nodes) == 0: - logger.warning('[task_scheduler] running nodes not found') + self.logger.warning('[task_scheduler] running nodes not found') return None for worker_ip, worker_info in nodes: - if task.cluster.instance.cpu + get_cpu_usage(worker_ip) > worker_info['cpu']: + if task.info.cluster.instance.cpu + self.get_cpu_usage(worker_ip) > worker_info['cpu']: continue - if task.cluster.instance.memory > worker_info['memory']: + if task.info.cluster.instance.memory > worker_info['memory']: continue - if task.cluster.instance.disk > worker_info['disk']: + if task.info.cluster.instance.disk > worker_info['disk']: continue - if task.cluster.instance.gpu > worker_info['gpu']: + if task.info.cluster.instance.gpu > worker_info['gpu']: continue return worker_ip return None @@ -204,12 +222,12 @@ class TaskMgr(threading.Thread): # return self.all_nodes # get running nodes node_ips = self.nodemgr.get_nodeips() - all_nodes = [(node_ip, get_worker_resource_info(node_ip)) for node_ip in node_ips] + all_nodes = [(node_ip, self.get_worker_resource_info(node_ip)) for node_ip in node_ips] return all_nodes def 
get_worker_resource_info(self, worker_ip): - fetcher = master.monitor.Fetcher(worker_ip) + fetcher = self.monitor_fetcher(worker_ip) worker_info = fetcher.info info = {} info['cpu'] = len(worker_info['cpuconfig']) @@ -238,7 +256,7 @@ class TaskMgr(threading.Thread): def add_task(self, username, taskid, json_task): # decode json string to object defined in grpc json_task = json.loads(json_task) - task = Task( + task = Task(TaskInfo( id = taskid, username = username, instanceCount = json_task['instanceCount'], @@ -260,13 +278,9 @@ class TaskMgr(threading.Thread): cpu = json_task['cluster']['instance']['cpu'], memory = json_task['cluster']['instance']['memory'], disk = json_task['cluster']['instance']['disk'], - gpu = json_task['cluster']['instance']['gpu']))) - task.cluster.mount = [Mount(localPath=mount['localPath'], remotePath=mount['remotePath']) - for mount in json_task['cluster']['mount']] - - # local properties - task.status = Status.WAITING - task.instance_list = [] + gpu = json_task['cluster']['instance']['gpu'])))) + task.info.cluster.mount.extend([Mount(localPath=mount['localPath'], remotePath=mount['remotePath']) + for mount in json_task['cluster']['mount']]) self.task_queue.append(task) @@ -274,6 +288,6 @@ class TaskMgr(threading.Thread): # get the information of a task, including the status, task description and other information def get_task(self, taskid): for task in self.task_queue: - if task.id == taskid: + if task.info.id == taskid: return task return None diff --git a/src/master/testTaskMgr.py b/src/master/testTaskMgr.py new file mode 100644 index 0000000..bfdad07 --- /dev/null +++ b/src/master/testTaskMgr.py @@ -0,0 +1,128 @@ +import master.taskmgr +from concurrent import futures +import grpc +from protos import rpc_pb2, rpc_pb2_grpc +import threading, json, time + + +class SimulatedNodeMgr(): + def get_nodeips(self): + return ['0.0.0.0'] + + +class SimulatedMonitorFetcher(): + def __init__(self, ip): + self.info = {} + self.info['cpuconfig'] = [1,1,1,1] + self.info['meminfo'] = {} + self.info['meminfo']['free'] = 4 * 1024 * 1024 # (kb) simulate 4 GB memory + self.info['diskinfo'] = [] + self.info['diskinfo'].append({}) + self.info['diskinfo'][0]['free'] = 8 * 1024 * 1024 * 1024 # (b) simulate 8 GB disk + + +class SimulatedTaskController(rpc_pb2_grpc.WorkerServicer): + def process_task(self, task, context): + print('[SimulatedTaskController] receive task [%s]' % task.id) + return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") + + +class SimulatedWorker(threading.Thread): + + def __init__(self): + threading.Thread.__init__(self) + self.thread_stop = False + + def run(self): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=5)) + rpc_pb2_grpc.add_WorkerServicer_to_server(SimulatedTaskController(), server) + server.add_insecure_port('[::]:50052') + server.start() + while not self.thread_stop: + time.sleep(5) + server.stop(0) + + def stop(self): + self.thread_stop = True + + +class SimulatedJobMgr(threading.Thread): + + def __init__(self): + threading.Thread.__init__(self) + self.thread_stop = False + + def run(self): + while not self.thread_stop: + time.sleep(5) + server.stop(0) + + def stop(self): + self.thread_stop = True + + def report(self, task): + print('[SimulatedJobMgr] task[%s] status %d' % (task.id, task.status)) + + def asignTask(self, taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk): + task = {} + task['instanceCount'] = instance_count + task['maxRetryCount'] = retry_count + task['timeout'] = timeout + 
task['parameters'] = {} + task['parameters']['command'] = {} + task['parameters']['command']['commandLine'] = '' + task['parameters']['command']['packagePath'] = '' + task['parameters']['command']['envVars'] = {'a':'1'} + task['parameters']['stderrRedirectPath'] = '' + task['parameters']['stdoutRedirectPath'] = '' + task['cluster'] = {} + task['cluster']['image'] = {} + task['cluster']['image']['name'] = '' + task['cluster']['image']['type'] = 1 + task['cluster']['image']['owner'] = '' + task['cluster']['instance'] = {} + task['cluster']['instance']['cpu'] = cpu + task['cluster']['instance']['memory'] = memory + task['cluster']['instance']['disk'] = disk + task['cluster']['instance']['gpu'] = 0 + task['cluster']['mount'] = [{'remotePath':'', 'localPath':''}] + + taskmgr.add_task('user', taskid, json.dumps(task)) + + +class SimulatedLogger(): + def info(self, msg): + print('[INFO] ' + msg) + + def warning(self, msg): + print('[WARNING] ' + msg) + + def error(self, msg): + print('[ERROR] ' + msg) + + +def test(): + global worker + global jobmgr + global taskmgr + + worker = SimulatedWorker() + worker.start() + jobmgr = SimulatedJobMgr() + jobmgr.start() + + taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, SimulatedLogger()) + taskmgr.set_jobmgr(jobmgr) + taskmgr.start() + + jobmgr.asignTask(taskmgr, 'task_0', 2, 2, 60, 2, 2048, 2048) + + +def stop(): + global worker + global jobmgr + global taskmgr + + worker.stop() + jobmgr.stop() + taskmgr.stop() diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index 530ccb7..b90c8c8 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -1,42 +1,42 @@ syntax = "proto3"; service Master { - rpc report (TaskMsg) returns (Reply) {}; + rpc report (TaskMsg) returns (Reply) {}; } service Worker { - rpc process_task (Task) returns (Reply) {} + rpc process_task (TaskInfo) returns (Reply) {} } message Reply { - ReplyStatus status = 1; // 返回值 - string message = 2; + ReplyStatus status = 1; // 返回值 + string message = 2; - enum ReplyStatus { - ACCEPTED = 0; - REFUSED = 1; - } + enum ReplyStatus { + ACCEPTED = 0; + REFUSED = 1; + } } message TaskMsg { - string taskid = 1; - int32 instanceid = 2; - Status instanceStatus = 3; // 任务状态 + string taskid = 1; + int32 instanceid = 2; + Status instanceStatus = 3; // 任务状态 } enum Status { - WAITING = 0; - RUNNING = 1; - COMPLETED = 2; - FAILED = 3; - TIMEOUT = 4; + WAITING = 0; + RUNNING = 1; + COMPLETED = 2; + FAILED = 3; + TIMEOUT = 4; } -message Task { +message TaskInfo { string id = 1; - string username = 2; - int32 instanceid = 3; + string username = 2; + int32 instanceid = 3; int32 instanceCount = 4; // 实例个数 int32 maxRetryCount = 5; // 最大重试次数 Parameters parameters = 6; // 参数 @@ -68,9 +68,9 @@ message Image { string owner = 3; // 所有者 enum ImageType { - BASE = 0; - PUBLIC = 1; - PRIVATE = 2; + BASE = 0; + PUBLIC = 1; + PRIVATE = 2; } } diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 15fd3a2..158d29b 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -1,5 +1,5 @@ # Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: rpc.proto +# source: protos/rpc.proto import sys _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) @@ -17,10 +17,10 @@ _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor.FileDescriptor( - name='rpc.proto', + name='protos/rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\trpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb3\x01\n\x04Task\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32)\n\x06Worker\x12\x1f\n\x0cprocess_task\x12\x05.Task\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\x10protos/rpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb7\x01\n\x08TaskInfo\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 
\x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32-\n\x06Worker\x12#\n\x0cprocess_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x62\x06proto3') ) _STATUS = _descriptor.EnumDescriptor( @@ -52,8 +52,8 @@ _STATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=938, - serialized_end=1012, + serialized_start=949, + serialized_end=1023, ) _sym_db.RegisterEnumDescriptor(_STATUS) @@ -82,8 +82,8 @@ _REPLY_REPLYSTATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=75, - serialized_end=115, + serialized_start=82, + serialized_end=122, ) _sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS) @@ -108,8 +108,8 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=774, - serialized_end=820, + serialized_start=785, + serialized_end=831, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -148,8 +148,8 @@ _REPLY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13, - serialized_end=115, + serialized_start=20, + serialized_end=122, ) @@ -193,69 +193,69 @@ _TASKMSG = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=117, - serialized_end=195, + serialized_start=124, + serialized_end=202, ) -_TASK = _descriptor.Descriptor( - name='Task', - full_name='Task', +_TASKINFO = _descriptor.Descriptor( + name='TaskInfo', + full_name='TaskInfo', filename=None, file=DESCRIPTOR, containing_type=None, fields=[ _descriptor.FieldDescriptor( - name='id', full_name='Task.id', index=0, + name='id', full_name='TaskInfo.id', index=0, number=1, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='username', full_name='Task.username', index=1, + name='username', full_name='TaskInfo.username', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='instanceid', full_name='Task.instanceid', index=2, + name='instanceid', full_name='TaskInfo.instanceid', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, 
enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='instanceCount', full_name='Task.instanceCount', index=3, + name='instanceCount', full_name='TaskInfo.instanceCount', index=3, number=4, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='maxRetryCount', full_name='Task.maxRetryCount', index=4, + name='maxRetryCount', full_name='TaskInfo.maxRetryCount', index=4, number=5, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='parameters', full_name='Task.parameters', index=5, + name='parameters', full_name='TaskInfo.parameters', index=5, number=6, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='cluster', full_name='Task.cluster', index=6, + name='cluster', full_name='TaskInfo.cluster', index=6, number=7, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( - name='timeout', full_name='Task.timeout', index=7, + name='timeout', full_name='TaskInfo.timeout', index=7, number=8, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, @@ -273,8 +273,8 @@ _TASK = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=198, - serialized_end=377, + serialized_start=205, + serialized_end=388, ) @@ -318,8 +318,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=379, - serialized_end=474, + serialized_start=390, + serialized_end=485, ) @@ -356,8 +356,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=570, - serialized_end=616, + serialized_start=581, + serialized_end=627, ) _COMMAND = _descriptor.Descriptor( @@ -400,8 +400,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=477, - serialized_end=616, + serialized_start=488, + serialized_end=627, ) @@ -445,8 +445,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=618, - serialized_end=702, + serialized_start=629, + serialized_end=713, ) @@ -491,8 +491,8 @@ _IMAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=704, - serialized_end=820, + serialized_start=715, + serialized_end=831, ) @@ -529,8 +529,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=822, - serialized_end=868, + serialized_start=833, + serialized_end=879, ) @@ -581,15 +581,15 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=870, - serialized_end=936, + serialized_start=881, + serialized_end=947, ) _REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS _REPLY_REPLYSTATUS.containing_type = _REPLY _TASKMSG.fields_by_name['instanceStatus'].enum_type = _STATUS 
-_TASK.fields_by_name['parameters'].message_type = _PARAMETERS -_TASK.fields_by_name['cluster'].message_type = _CLUSTER +_TASKINFO.fields_by_name['parameters'].message_type = _PARAMETERS +_TASKINFO.fields_by_name['cluster'].message_type = _CLUSTER _PARAMETERS.fields_by_name['command'].message_type = _COMMAND _COMMAND_ENVVARSENTRY.containing_type = _COMMAND _COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY @@ -600,7 +600,7 @@ _IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE _IMAGE_IMAGETYPE.containing_type = _IMAGE DESCRIPTOR.message_types_by_name['Reply'] = _REPLY DESCRIPTOR.message_types_by_name['TaskMsg'] = _TASKMSG -DESCRIPTOR.message_types_by_name['Task'] = _TASK +DESCRIPTOR.message_types_by_name['TaskInfo'] = _TASKINFO DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS DESCRIPTOR.message_types_by_name['Command'] = _COMMAND DESCRIPTOR.message_types_by_name['Cluster'] = _CLUSTER @@ -612,28 +612,28 @@ _sym_db.RegisterFileDescriptor(DESCRIPTOR) Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict( DESCRIPTOR = _REPLY, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Reply) )) _sym_db.RegisterMessage(Reply) TaskMsg = _reflection.GeneratedProtocolMessageType('TaskMsg', (_message.Message,), dict( DESCRIPTOR = _TASKMSG, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:TaskMsg) )) _sym_db.RegisterMessage(TaskMsg) -Task = _reflection.GeneratedProtocolMessageType('Task', (_message.Message,), dict( - DESCRIPTOR = _TASK, - __module__ = 'rpc_pb2' - # @@protoc_insertion_point(class_scope:Task) +TaskInfo = _reflection.GeneratedProtocolMessageType('TaskInfo', (_message.Message,), dict( + DESCRIPTOR = _TASKINFO, + __module__ = 'protos.rpc_pb2' + # @@protoc_insertion_point(class_scope:TaskInfo) )) -_sym_db.RegisterMessage(Task) +_sym_db.RegisterMessage(TaskInfo) Parameters = _reflection.GeneratedProtocolMessageType('Parameters', (_message.Message,), dict( DESCRIPTOR = _PARAMETERS, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Parameters) )) _sym_db.RegisterMessage(Parameters) @@ -642,12 +642,12 @@ Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message, EnvVarsEntry = _reflection.GeneratedProtocolMessageType('EnvVarsEntry', (_message.Message,), dict( DESCRIPTOR = _COMMAND_ENVVARSENTRY, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Command.EnvVarsEntry) )) , DESCRIPTOR = _COMMAND, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Command) )) _sym_db.RegisterMessage(Command) @@ -655,28 +655,28 @@ _sym_db.RegisterMessage(Command.EnvVarsEntry) Cluster = _reflection.GeneratedProtocolMessageType('Cluster', (_message.Message,), dict( DESCRIPTOR = _CLUSTER, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Cluster) )) _sym_db.RegisterMessage(Cluster) Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict( DESCRIPTOR = _IMAGE, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Image) )) _sym_db.RegisterMessage(Image) Mount = _reflection.GeneratedProtocolMessageType('Mount', (_message.Message,), dict( DESCRIPTOR = _MOUNT, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Mount) )) _sym_db.RegisterMessage(Mount) Instance 
= _reflection.GeneratedProtocolMessageType('Instance', (_message.Message,), dict( DESCRIPTOR = _INSTANCE, - __module__ = 'rpc_pb2' + __module__ = 'protos.rpc_pb2' # @@protoc_insertion_point(class_scope:Instance) )) _sym_db.RegisterMessage(Instance) @@ -691,8 +691,8 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=1014, - serialized_end=1052, + serialized_start=1025, + serialized_end=1063, methods=[ _descriptor.MethodDescriptor( name='report', @@ -715,15 +715,15 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=1054, - serialized_end=1095, + serialized_start=1065, + serialized_end=1110, methods=[ _descriptor.MethodDescriptor( name='process_task', full_name='Worker.process_task', index=0, containing_service=None, - input_type=_TASK, + input_type=_TASKINFO, output_type=_REPLY, options=None, ), diff --git a/src/protos/rpc_pb2_grpc.py b/src/protos/rpc_pb2_grpc.py index 8116682..e5a6690 100644 --- a/src/protos/rpc_pb2_grpc.py +++ b/src/protos/rpc_pb2_grpc.py @@ -1,7 +1,7 @@ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! import grpc -from protos import rpc_pb2 as rpc__pb2 +from protos import rpc_pb2 as protos_dot_rpc__pb2 class MasterStub(object): @@ -16,8 +16,8 @@ class MasterStub(object): """ self.report = channel.unary_unary( '/Master/report', - request_serializer=rpc__pb2.TaskMsg.SerializeToString, - response_deserializer=rpc__pb2.Reply.FromString, + request_serializer=protos_dot_rpc__pb2.TaskMsg.SerializeToString, + response_deserializer=protos_dot_rpc__pb2.Reply.FromString, ) @@ -37,8 +37,8 @@ def add_MasterServicer_to_server(servicer, server): rpc_method_handlers = { 'report': grpc.unary_unary_rpc_method_handler( servicer.report, - request_deserializer=rpc__pb2.TaskMsg.FromString, - response_serializer=rpc__pb2.Reply.SerializeToString, + request_deserializer=protos_dot_rpc__pb2.TaskMsg.FromString, + response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( @@ -58,8 +58,8 @@ class WorkerStub(object): """ self.process_task = channel.unary_unary( '/Worker/process_task', - request_serializer=rpc__pb2.Task.SerializeToString, - response_deserializer=rpc__pb2.Reply.FromString, + request_serializer=protos_dot_rpc__pb2.TaskInfo.SerializeToString, + response_deserializer=protos_dot_rpc__pb2.Reply.FromString, ) @@ -79,8 +79,8 @@ def add_WorkerServicer_to_server(servicer, server): rpc_method_handlers = { 'process_task': grpc.unary_unary_rpc_method_handler( servicer.process_task, - request_deserializer=rpc__pb2.Task.FromString, - response_serializer=rpc__pb2.Reply.SerializeToString, + request_deserializer=protos_dot_rpc__pb2.TaskInfo.FromString, + response_serializer=protos_dot_rpc__pb2.Reply.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( From 453f7fc7fae9619d175c7535da702e366bb49386 Mon Sep 17 00:00:00 2001 From: Gallen Date: Sat, 28 Jul 2018 18:22:50 +0800 Subject: [PATCH 24/75] [jobmgr] get rpc port from utils.env --- src/master/taskmgr.py | 15 ++++++++++----- src/master/testTaskMgr.py | 4 +++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 0bb8b0b..11fca3d 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -15,6 +15,8 @@ import grpc from protos.rpc_pb2 import * from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub +from utils import env + class Task(): def 
__init__(self, info): @@ -47,6 +49,9 @@ class TaskMgr(threading.Thread): self.heart_beat_timeout = 5 # (s) self.logger = logger + self.master_port = env.getenv('BATCH_MASTER_PORT') + self.worker_port = env.getenv('BATCH_WORKER_PORT') + # nodes self.nodemgr = nodemgr self.monitor_fetcher = monitor_fetcher @@ -69,7 +74,7 @@ class TaskMgr(threading.Thread): def serve(self): self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) add_MasterServicer_to_server(TaskReporter(self), self.server) - self.server.add_insecure_port('[::]:50051') + self.server.add_insecure_port('[::]:' + self.master_port) self.server.start() self.logger.info('[taskmgr_rpc] start rpc server') @@ -135,7 +140,7 @@ class TaskMgr(threading.Thread): self.task_queue.remove(task) - def task_processor(self, task, instance_id, worker): + def task_processor(self, task, instance_id, worker_ip): task.status = RUNNING # properties for transaction @@ -147,13 +152,13 @@ class TaskMgr(threading.Thread): instance['last_update_time'] = time.time() instance['try_count'] += 1 instance['token'] = task.token - instance['worker'] = worker + instance['worker'] = worker_ip - self.cpu_usage[worker] += task.info.cluster.instance.cpu + self.cpu_usage[worker_ip] += task.info.cluster.instance.cpu try: self.logger.info('[task_processor] processing task [%s] instance [%d]' % (task.info.id, task.info.instanceid)) - channel = grpc.insecure_channel('%s:50052' % worker) + channel = grpc.insecure_channel('%s:%s' % (worker_ip, self.worker_port)) stub = WorkerStub(channel) response = stub.process_task(task.info) if response.status != Reply.ACCEPTED: diff --git a/src/master/testTaskMgr.py b/src/master/testTaskMgr.py index bfdad07..b74dd71 100644 --- a/src/master/testTaskMgr.py +++ b/src/master/testTaskMgr.py @@ -3,6 +3,7 @@ from concurrent import futures import grpc from protos import rpc_pb2, rpc_pb2_grpc import threading, json, time +from utils import env class SimulatedNodeMgr(): @@ -34,9 +35,10 @@ class SimulatedWorker(threading.Thread): self.thread_stop = False def run(self): + worker_port = env.getenv('BATCH_WORKER_PORT') server = grpc.server(futures.ThreadPoolExecutor(max_workers=5)) rpc_pb2_grpc.add_WorkerServicer_to_server(SimulatedTaskController(), server) - server.add_insecure_port('[::]:50052') + server.add_insecure_port('[::]:' + worker_port) server.start() while not self.thread_stop: time.sleep(5) From 0c2196009b0e81cc0c26a90a4da77fa717f56c10 Mon Sep 17 00:00:00 2001 From: Gallen Date: Sat, 28 Jul 2018 21:34:57 +0800 Subject: [PATCH 25/75] add more tests & more bugs fixed, add token to rpc.proto --- src/master/taskmgr.py | 28 ++++++++++------- src/master/testTaskMgr.py | 59 +++++++++++++++++++++++++++------- src/protos/rpc.proto | 3 +- src/protos/rpc_pb2.py | 66 ++++++++++++++++++++++++--------------- 4 files changed, 107 insertions(+), 49 deletions(-) diff --git a/src/master/taskmgr.py b/src/master/taskmgr.py index 11fca3d..d4290fe 100644 --- a/src/master/taskmgr.py +++ b/src/master/taskmgr.py @@ -41,12 +41,14 @@ class TaskMgr(threading.Thread): # load task information from etcd # initial a task queue and task schedueler # taskmgr: a taskmgr instance - def __init__(self, nodemgr, monitor_fetcher, logger): + def __init__(self, nodemgr, monitor_fetcher, logger, worker_timeout=60, scheduler_interval=2): threading.Thread.__init__(self) self.thread_stop = False self.jobmgr = None self.task_queue = [] - self.heart_beat_timeout = 5 # (s) + + self.heart_beat_timeout = worker_timeout # (s) + self.scheduler_interval = scheduler_interval 
self.logger = logger self.master_port = env.getenv('BATCH_MASTER_PORT') @@ -68,7 +70,7 @@ class TaskMgr(threading.Thread): if task is not None and worker is not None: self.task_processor(task, instance_id, worker) else: - time.sleep(2) + time.sleep(self.scheduler_interval) def serve(self): @@ -88,7 +90,7 @@ class TaskMgr(threading.Thread): # this method is called when worker send heart-beat rpc request def on_task_report(self, report): self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus)) - task = get_task(report.taskid) + task = self.get_task(report.taskid) if task == None: self.logger.error('[on_task_report] task not found') return @@ -98,6 +100,9 @@ class TaskMgr(threading.Thread): self.logger.warning('[on_task_report] wrong token') return + if instance['status'] != RUNNING: + self.logger.error('[on_task_report] receive task report when instance is not running') + if instance['status'] == RUNNING and report.instanceStatus != RUNNING: self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu @@ -105,12 +110,10 @@ class TaskMgr(threading.Thread): instance['last_update_time'] = time.time() if report.instanceStatus == COMPLETED: - check_task_completed(task) + self.check_task_completed(task) elif report.instanceStatus == FAILED or report.instanceStatus == TIMEOUT: if instance['try_count'] > task.info.maxRetryCount: - check_task_completed(task) - else: - self.logger.error('[on_task_report] receive report from waiting task') + self.check_task_completed(task) def check_task_completed(self, task): @@ -145,13 +148,13 @@ class TaskMgr(threading.Thread): # properties for transaction task.info.instanceid = instance_id - task.token = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + task.info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8)) instance = task.instance_list[instance_id] instance['status'] = RUNNING instance['last_update_time'] = time.time() instance['try_count'] += 1 - instance['token'] = task.token + instance['token'] = task.info.token instance['worker'] = worker_ip self.cpu_usage[worker_ip] += task.info.cluster.instance.cpu @@ -172,7 +175,7 @@ class TaskMgr(threading.Thread): # return task, worker def task_scheduler(self): # simple FIFO - self.logger.info('[task_scheduler] scheduling...') + self.logger.info('[task_scheduler] scheduling... 
(%d tasks remains)' % len(self.task_queue)) for task in self.task_queue: worker = self.find_proper_worker(task) @@ -200,6 +203,9 @@ class TaskMgr(threading.Thread): instance['try_count'] = 0 task.instance_list.append(instance) return task, len(task.instance_list) - 1, worker + + self.check_task_completed(task) + return None, None, None def find_proper_worker(self, task): diff --git a/src/master/testTaskMgr.py b/src/master/testTaskMgr.py index b74dd71..fefb058 100644 --- a/src/master/testTaskMgr.py +++ b/src/master/testTaskMgr.py @@ -1,8 +1,9 @@ import master.taskmgr from concurrent import futures import grpc -from protos import rpc_pb2, rpc_pb2_grpc -import threading, json, time +from protos.rpc_pb2 import * +from protos.rpc_pb2_grpc import * +import threading, json, time, random from utils import env @@ -22,10 +23,15 @@ class SimulatedMonitorFetcher(): self.info['diskinfo'][0]['free'] = 8 * 1024 * 1024 * 1024 # (b) simulate 8 GB disk -class SimulatedTaskController(rpc_pb2_grpc.WorkerServicer): +class SimulatedTaskController(WorkerServicer): + + def __init__(self, worker): + self.worker = worker + def process_task(self, task, context): - print('[SimulatedTaskController] receive task [%s]' % task.id) - return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") + print('[SimulatedTaskController] receive task [%s] instanceid [%d] token [%s]' % (task.id, task.instanceid, task.token)) + worker.process(task) + return Reply(status=Reply.ACCEPTED,message="") class SimulatedWorker(threading.Thread): @@ -33,20 +39,36 @@ class SimulatedWorker(threading.Thread): def __init__(self): threading.Thread.__init__(self) self.thread_stop = False + self.tasks = [] def run(self): worker_port = env.getenv('BATCH_WORKER_PORT') server = grpc.server(futures.ThreadPoolExecutor(max_workers=5)) - rpc_pb2_grpc.add_WorkerServicer_to_server(SimulatedTaskController(), server) + add_WorkerServicer_to_server(SimulatedTaskController(self), server) server.add_insecure_port('[::]:' + worker_port) server.start() while not self.thread_stop: + for task in self.tasks: + seed = random.random() + if seed < 0.25: + report(task.id, task.instanceid, RUNNING, task.token) + elif seed < 0.5: + report(task.id, task.instanceid, COMPLETED, task.token) + self.tasks.remove(task) + elif seed < 0.75: + report(task.id, task.instanceid, FAILED, task.token) + self.tasks.remove(task) + else: + pass time.sleep(5) server.stop(0) def stop(self): self.thread_stop = True + def process(self, task): + self.tasks.append(task) + class SimulatedJobMgr(threading.Thread): @@ -63,9 +85,9 @@ class SimulatedJobMgr(threading.Thread): self.thread_stop = True def report(self, task): - print('[SimulatedJobMgr] task[%s] status %d' % (task.id, task.status)) + print('[SimulatedJobMgr] task[%s] status %d' % (task.info.id, task.status)) - def asignTask(self, taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk): + def assignTask(self, taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk): task = {} task['instanceCount'] = instance_count task['maxRetryCount'] = retry_count @@ -113,11 +135,26 @@ def test(): jobmgr = SimulatedJobMgr() jobmgr.start() - taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, SimulatedLogger()) + taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, SimulatedLogger(), worker_timeout=10, scheduler_interval=2) taskmgr.set_jobmgr(jobmgr) taskmgr.start() - jobmgr.asignTask(taskmgr, 'task_0', 2, 2, 60, 2, 2048, 2048) + add('task_0', instance_count=2, 
retry_count=2, timeout=60, cpu=2, memory=2048, disk=2048) + + +def add(taskid, instance_count, retry_count, timeout, cpu, memory, disk): + global jobmgr + global taskmgr + jobmgr.assignTask(taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk) + + +def report(taskid, instanceid, status, token): + global taskmgr + + master_port = env.getenv('BATCH_MASTER_PORT') + channel = grpc.insecure_channel('%s:%s' % ('0.0.0.0', master_port)) + stub = MasterStub(channel) + response = stub.report(TaskMsg(taskid=taskid, instanceid=instanceid, instanceStatus=status, token=token)) def stop(): @@ -127,4 +164,4 @@ def stop(): worker.stop() jobmgr.stop() - taskmgr.stop() + taskmgr.stop() \ No newline at end of file diff --git a/src/protos/rpc.proto b/src/protos/rpc.proto index b90c8c8..9472b37 100644 --- a/src/protos/rpc.proto +++ b/src/protos/rpc.proto @@ -22,7 +22,7 @@ message TaskMsg { string taskid = 1; int32 instanceid = 2; Status instanceStatus = 3; // 任务状态 - + string token = 4; } enum Status { @@ -42,6 +42,7 @@ message TaskInfo { Parameters parameters = 6; // 参数 Cluster cluster = 7; // 集群配置 int32 timeout = 8; // 超时阈值 + string token = 9; } message Parameters { diff --git a/src/protos/rpc_pb2.py b/src/protos/rpc_pb2.py index 158d29b..c46f25a 100644 --- a/src/protos/rpc_pb2.py +++ b/src/protos/rpc_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='protos/rpc.proto', package='', syntax='proto3', - serialized_pb=_b('\n\x10protos/rpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"N\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\"\xb7\x01\n\x08TaskInfo\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 
\x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32-\n\x06Worker\x12#\n\x0cprocess_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x62\x06proto3') + serialized_pb=_b('\n\x10protos/rpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"]\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x12\n\ninstanceid\x18\x02 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x03 \x01(\x0e\x32\x07.Status\x12\r\n\x05token\x18\x04 \x01(\t\"\xc6\x01\n\x08TaskInfo\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\x12\r\n\x05token\x18\t \x01(\t\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\".\n\x05Mount\x12\x11\n\tlocalPath\x18\x01 \x01(\t\x12\x12\n\nremotePath\x18\x02 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*J\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x32&\n\x06Master\x12\x1c\n\x06report\x12\x08.TaskMsg\x1a\x06.Reply\"\x00\x32-\n\x06Worker\x12#\n\x0cprocess_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x62\x06proto3') ) _STATUS = _descriptor.EnumDescriptor( @@ -52,8 +52,8 @@ _STATUS = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=949, - serialized_end=1023, + serialized_start=979, + serialized_end=1053, ) _sym_db.RegisterEnumDescriptor(_STATUS) @@ -108,8 +108,8 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=785, - serialized_end=831, + serialized_start=815, + serialized_end=861, ) _sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE) @@ -181,6 +181,13 @@ _TASKMSG = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='token', full_name='TaskMsg.token', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, 
enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), ], extensions=[ ], @@ -194,7 +201,7 @@ _TASKMSG = _descriptor.Descriptor( oneofs=[ ], serialized_start=124, - serialized_end=202, + serialized_end=217, ) @@ -261,6 +268,13 @@ _TASKINFO = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='token', full_name='TaskInfo.token', index=8, + number=9, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), ], extensions=[ ], @@ -273,8 +287,8 @@ _TASKINFO = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=205, - serialized_end=388, + serialized_start=220, + serialized_end=418, ) @@ -318,8 +332,8 @@ _PARAMETERS = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=390, - serialized_end=485, + serialized_start=420, + serialized_end=515, ) @@ -356,8 +370,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=581, - serialized_end=627, + serialized_start=611, + serialized_end=657, ) _COMMAND = _descriptor.Descriptor( @@ -400,8 +414,8 @@ _COMMAND = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=488, - serialized_end=627, + serialized_start=518, + serialized_end=657, ) @@ -445,8 +459,8 @@ _CLUSTER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=629, - serialized_end=713, + serialized_start=659, + serialized_end=743, ) @@ -491,8 +505,8 @@ _IMAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=715, - serialized_end=831, + serialized_start=745, + serialized_end=861, ) @@ -529,8 +543,8 @@ _MOUNT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=833, - serialized_end=879, + serialized_start=863, + serialized_end=909, ) @@ -581,8 +595,8 @@ _INSTANCE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=881, - serialized_end=947, + serialized_start=911, + serialized_end=977, ) _REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS @@ -691,8 +705,8 @@ _MASTER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, options=None, - serialized_start=1025, - serialized_end=1063, + serialized_start=1055, + serialized_end=1093, methods=[ _descriptor.MethodDescriptor( name='report', @@ -715,8 +729,8 @@ _WORKER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=1, options=None, - serialized_start=1065, - serialized_end=1110, + serialized_start=1095, + serialized_end=1140, methods=[ _descriptor.MethodDescriptor( name='process_task', From 420d5c033e4ba96cf1c9da78deb8a682d00104b9 Mon Sep 17 00:00:00 2001 From: iteratorlee <1400012951@pku.edu.cn> Date: Mon, 30 Jul 2018 12:28:22 +0800 Subject: [PATCH 26/75] add create batch page --- src/master/jobmgr.py | 12 ++- web/templates/base_AdminLTE.html | 3 + web/templates/batch/batch_create.html | 142 ++++++++++++++++++++++++++ web/templates/batch/batch_list.html | 63 ++++++++++++ web/templates/batch/batch_state.html | 0 web/web.py | 22 ++++ web/webViews/batch.py | 43 ++++++++ 7 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 web/templates/batch/batch_create.html create mode 100644 web/templates/batch/batch_list.html create mode 
100644 web/templates/batch/batch_state.html create mode 100644 web/webViews/batch.py diff --git a/src/master/jobmgr.py b/src/master/jobmgr.py index 32a9a81..613a7e1 100644 --- a/src/master/jobmgr.py +++ b/src/master/jobmgr.py @@ -1,5 +1,11 @@ +import time, threading +import master.monitor + +from utils.log import initlogging, logger +initlogging("docklet-jobmgr") + class JobMgr(object): - + # user: username # job: a json string # user submit a new job, add this job to queue and database @@ -18,12 +24,12 @@ class JobMgr(object): # call get_task to get the task information def get_job(self, user, jobid): pass - + # job: a json string # this is a thread to process a job def job_processor(self, job): # according the DAG of job, add task to taskmanager - # wait for all task completed and exit + # wait for all task completed and exit pass # this is a thread to schedule the jobs diff --git a/web/templates/base_AdminLTE.html b/web/templates/base_AdminLTE.html index ae6ff14..5fbe303 100644 --- a/web/templates/base_AdminLTE.html +++ b/web/templates/base_AdminLTE.html @@ -156,6 +156,9 @@ + {% if mysession['usergroup'] == 'root' or mysession['usergroup'] == 'admin'%} diff --git a/web/templates/batch/batch_create.html b/web/templates/batch/batch_create.html new file mode 100644 index 0000000..2d9b777 --- /dev/null +++ b/web/templates/batch/batch_create.html @@ -0,0 +1,142 @@ +{% extends 'base_AdminLTE.html' %} + +{% block title %}Docklet | Create Batch Job{% endblock %} + +{% block css_src %} + + + + + +{% endblock %} + +{% block panel_title %}Batch Job Info{% endblock %} + +{% block panel_list %} + +{% endblock %} + +
+{% block content %}
+    [form markup stripped during extraction: a "Batch Job Create" panel containing the batch job submission form]
+{% endblock %}
+
+{% block script_src %}
+    [script includes stripped during extraction]
+{% endblock %}
diff --git a/web/templates/batch/batch_list.html b/web/templates/batch/batch_list.html
new file mode 100644
index 0000000..9a1a20d
--- /dev/null
+++ b/web/templates/batch/batch_list.html
@@ -0,0 +1,63 @@
+{% extends "base_AdminLTE.html"%}
+{% block title %}Docklet | Batch Job{% endblock %}
+
+{% block panel_title %}Batch Job{% endblock %}
+
+{% block panel_list %}
+
+{% endblock %}
+{% block content %}
+    [table markup stripped during extraction: a "Batch Job List" table with columns ID, Name, Status, Tasks, Operations, Running Time]
+ +{% endblock %} +{% block script_src %} + + +{% endblock %} diff --git a/web/templates/batch/batch_state.html b/web/templates/batch/batch_state.html new file mode 100644 index 0000000..e69de29 diff --git a/web/web.py b/web/web.py index 0d195fe..56568bf 100755 --- a/web/web.py +++ b/web/web.py @@ -39,6 +39,7 @@ from webViews.cloud import * from webViews.authenticate.auth import login_required, administration_required,activated_required from webViews.authenticate.register import registerView from webViews.authenticate.login import loginView, logoutView +from webViews.batch import * import webViews.dockletrequest from webViews import cookie_tool import traceback @@ -119,6 +120,27 @@ def redirect_dochome(): def config(): return configView.as_view() +@app.route("/batch_jobs/", methods=['GET']) +@login_required +def batch_job(): + return batchJobListView().as_view() + +@app.route("/batch_job/create/", methods=['GET']) +@login_required +def create_batch_job(): + return createBatchJobView().as_view() + +@app.route("/batch_job/add/", methods=['POST']) +@login_required +def add_batch_job(): + #TODO get form parameters of a job description + addBatchJobView.job_name = request.form["job_name"] + return addBatchJobView().as_view() + +@app.route("/batch_job/state/", methods=['GET']) +@login_required +def state_batch_job(): + return stateBatchJobView().as_view() @app.route("/workspace/create/", methods=['GET']) @activated_required diff --git a/web/webViews/batch.py b/web/webViews/batch.py new file mode 100644 index 0000000..3ad3526 --- /dev/null +++ b/web/webViews/batch.py @@ -0,0 +1,43 @@ +from flask import session, redirect, request +from webViews.view import normalView +from webViews.checkname import checkname + +class batchJobListView(normalView): + template_path = "batch/batch_list.html" + + @classmethod + def get(self): + if True: + return self.render(self.template_path) + else: + return self.error() + +class createBatchJobView(normalView): + template_path = "batch/batch_create.html" + + @classmethod + def get(self): + if True: + return self.render(self.template_path) + else: + return self.error() + +class stateBatchJobView(normalView): + template_path = "batch/batch_state.html" + + @classmethod + def get(self): + if True: + return self.render(self.template_path) + else: + return self.error() + +class addBatchJobView(normalView): + template_path = "batch/batch_list.html" + + @classmethod + def post(self): + if True: + return self.render(self.template_path) + else: + return self.error() From eae99bf275677165dd400444723595b86575fbb9 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Mon, 30 Jul 2018 17:16:26 +0800 Subject: [PATCH 27/75] Use script to execute command --- src/master/testTaskCtrler.py | 4 ++-- src/worker/taskcontroller.py | 38 ++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/master/testTaskCtrler.py b/src/master/testTaskCtrler.py index 0ea7d81..5b52992 100644 --- a/src/master/testTaskCtrler.py +++ b/src/master/testTaskCtrler.py @@ -10,7 +10,7 @@ def run(): channel = grpc.insecure_channel('localhost:50051') stub = rpc_pb2_grpc.WorkerStub(channel) - comm = rpc_pb2.Command(commandLine=r"echo \"s\" | awk '{print \"test\n\\\"\"}' > test.txt;cat test.txt", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}' + comm = rpc_pb2.Command(commandLine="echo \"s\" | awk '{print \"test\\n\\\"\"}' > test.txt;cat test.txt", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}' 
paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="", stdoutRedirectPath="") img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet") @@ -18,7 +18,7 @@ def run(): mnt = rpc_pb2.Mount(localPath="",remotePath="") clu = rpc_pb2.Cluster(image=img, instance=inst, mount=[mnt]) - task = rpc_pb2.Task(id="test",username="root",instanceid=0,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=10) + task = rpc_pb2.TaskInfo(id="test",username="root",instanceid=0,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=10,token="test") response = stub.process_task(task) print("Batch client received: " + str(response.status)+" "+response.message) diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index a1bc028..74c5a65 100755 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -14,7 +14,7 @@ from concurrent import futures import grpc #from utils.log import logger #from utils import env -import json,lxc,subprocess,threading,os,time +import json,lxc,subprocess,threading,os,time,traceback from utils import imagemgr from protos import rpc_pb2, rpc_pb2_grpc @@ -152,19 +152,31 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") def excute_task(self,taskid,instanceid,envs,lxcname,pkgpath,command,ip): - cmd = "lxc-attach -n " + lxcname - for envkey,envval in envs.items(): - cmd = cmd + " -v %s=%s" % (envkey,envval) - cmd = cmd + " -- /bin/bash -c " + "\"cd " + pkgpath + ";" + command + "\"" - logger.info('run task with command - %s' % cmd) - ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) - logger.info(ret) - if ret.returncode == 0: - #call master rpc function to tell the taskmgr - pass + lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs" + scriptname = "batch_job.sh" + try: + scriptfile = open(lxcfspath+"/root/"+scriptname,"w") + scriptfile.write("#!/bin/bash\n") + scriptfile.write("cd "+str(pkgpath)+"\n") + scriptfile.write(command) + scriptfile.close() + except Exception as err: + logger.error(traceback.format_exc()) + logger.error("Fail to write script file with taskid(%s) instanceid(%s)" % (str(taskid),str(instanceid))) else: - #call master rpc function to tell the wrong - pass + cmd = "lxc-attach -n " + lxcname + for envkey,envval in envs.items(): + cmd = cmd + " -v %s=%s" % (envkey,envval) + cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\"" + logger.info('run task with command - %s' % cmd) + ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) + logger.info(ret) + if ret.returncode == 0: + #call master rpc function to tell the taskmgr + pass + else: + #call master rpc function to tell the wrong + pass #umount oss here From 060d9d49f616aaeff3ec5ebafccc9a62cdd0d790 Mon Sep 17 00:00:00 2001 From: zhuyj17 Date: Tue, 31 Jul 2018 17:23:15 +0800 Subject: [PATCH 28/75] add RedirectPath --- src/master/testTaskCtrler.py | 2 +- src/worker/taskcontroller.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/master/testTaskCtrler.py b/src/master/testTaskCtrler.py index 5b52992..907703b 100644 --- a/src/master/testTaskCtrler.py +++ b/src/master/testTaskCtrler.py @@ -11,7 +11,7 @@ def run(): stub = rpc_pb2_grpc.WorkerStub(channel) comm = rpc_pb2.Command(commandLine="echo \"s\" | awk '{print \"test\\n\\\"\"}' > test.txt;cat test.txt", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}' - 
paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="", stdoutRedirectPath="") + paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="tmp/", stdoutRedirectPath="tmp/") img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet") inst = rpc_pb2.Instance(cpu=2, memory=2000, disk=500, gpu=0) diff --git a/src/worker/taskcontroller.py b/src/worker/taskcontroller.py index 74c5a65..1716eb6 100755 --- a/src/worker/taskcontroller.py +++ b/src/worker/taskcontroller.py @@ -90,6 +90,7 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): username = request.username lxcname = '%s-batch-%s-%s' % (username,taskid,str(instanceid)) instance_type = request.cluster.instance + outpath = [request.parameters.stdoutRedirectPath,request.parameters.stderrRedirectPath] # acquire ip [status, ip] = self.acquire_ip() @@ -145,17 +146,32 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): #mount oss here - thread = threading.Thread(target = self.excute_task, args=(taskid,instanceid,envs,lxcname,pkgpath,command,ip)) + thread = threading.Thread(target = self.excute_task, args=(taskid,instanceid,envs,lxcname,pkgpath,command,outpath,ip)) thread.setDaemon(True) thread.start() return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="") - def excute_task(self,taskid,instanceid,envs,lxcname,pkgpath,command,ip): - lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs" + def write_output(self,content,path): + dirpath = path[:path.rfind("/")] + if not os.path.isdir(dirpath): + logger.info("Output directory doesn't exist. Create (%s)" % dirpath) + os.makedirs(dirpath) + try: + outfile = open(path,"w") + outfile.write(content.decode(encoding="utf-8")) + outfile.close() + except Exception as err: + logger.error(traceback.format_exc()) + logger.error("Fail to write to path(%s)" % path) + else: + logger.info("Succeed to writing to %s" % path) + + def excute_task(self,taskid,instanceid,envs,lxcname,pkgpath,command,outpath,ip): + lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs/" scriptname = "batch_job.sh" try: - scriptfile = open(lxcfspath+"/root/"+scriptname,"w") + scriptfile = open(lxcfspath+"root/"+scriptname,"w") scriptfile.write("#!/bin/bash\n") scriptfile.write("cd "+str(pkgpath)+"\n") scriptfile.write(command) @@ -169,8 +185,14 @@ class TaskController(rpc_pb2_grpc.WorkerServicer): cmd = cmd + " -v %s=%s" % (envkey,envval) cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\"" logger.info('run task with command - %s' % cmd) - ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True) + ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) logger.info(ret) + if outpath[0][-1] == "/": + outpath[0] += "stdout.txt" + if outpath[1][-1] == "/": + outpath[1] += "stderr.txt" + self.write_output(ret.stdout,lxcfspath+outpath[0]) + self.write_output(ret.stderr,lxcfspath+outpath[1]) if ret.returncode == 0: #call master rpc function to tell the taskmgr pass From b1b3478100e8e948194ca4ef371fb15d7d67338a Mon Sep 17 00:00:00 2001 From: iteratorlee <1400012951@pku.edu.cn> Date: Wed, 1 Aug 2018 09:57:33 +0800 Subject: [PATCH 29/75] update batch submit page --- web/templates/batch/batch_create.html | 169 ++++++++++++++++++++------ web/web.py | 3 +- web/webViews/batch.py | 5 +- 3 files changed, 140 insertions(+), 37 deletions(-) diff --git a/web/templates/batch/batch_create.html b/web/templates/batch/batch_create.html index 2d9b777..479e4ac 100644 --- a/web/templates/batch/batch_create.html +++ b/web/templates/batch/batch_create.html @@ -40,35 +40,35 @@
-    [previous form markup for this hunk stripped during extraction]
+    [revised form markup for this hunk stripped during extraction]
@@ -91,18 +91,46 @@
-    [previous markup for this hunk stripped during extraction]
+    [revised markup for this hunk stripped during extraction]
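
Stepping back from the template diffs, the taskcontroller.py changes above converge on a script-based execution flow: write the task command into a batch_job.sh inside the container rootfs, run it through lxc-attach with the task's environment variables, then save stdout/stderr under the redirect paths. Below is a condensed, illustrative sketch of that flow, not part of the patches themselves; the helper name run_in_container is invented for the sketch, and the logging and error handling from the real code are omitted.

# Illustrative sketch only: mirrors the flow built up in src/worker/taskcontroller.py.
import os, subprocess

def run_in_container(lxcname, pkgpath, command, envs, stdout_path, stderr_path):
    rootfs = "/var/lib/lxc/" + lxcname + "/rootfs/"
    # 1. Write the job script into the container's filesystem.
    with open(rootfs + "root/batch_job.sh", "w") as f:
        f.write("#!/bin/bash\n")
        f.write("cd " + str(pkgpath) + "\n")
        f.write(command)
    # 2. Run it through lxc-attach, forwarding environment variables with -v.
    cmd = "lxc-attach -n " + lxcname
    for key, val in envs.items():
        cmd += " -v %s=%s" % (key, val)
    cmd += " -- /bin/bash \"/root/batch_job.sh\""
    ret = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    # 3. Redirect paths ending in "/" default to stdout.txt / stderr.txt and are
    #    resolved relative to the container rootfs, as in the patch.
    if stdout_path.endswith("/"):
        stdout_path += "stdout.txt"
    if stderr_path.endswith("/"):
        stderr_path += "stderr.txt"
    for content, path in ((ret.stdout, rootfs + stdout_path), (ret.stderr, rootfs + stderr_path)):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as out:
            out.write(content.decode("utf-8"))
    return ret.returncode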
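
The report() helper and the new token field added to TaskMsg and TaskInfo in the earlier patches suggest how an instance's final status would be confirmed back to the master over gRPC once that script finishes. A minimal sketch under those assumptions follows; it relies on the generated protos/rpc_pb2 modules and the BATCH_MASTER_PORT setting shown above, while master_ip and report_instance_status are illustrative names, not code from the patches.

# Illustrative sketch only: reports one instance's status to the master's Master service.
import grpc
from protos import rpc_pb2, rpc_pb2_grpc

def report_instance_status(master_ip, master_port, taskid, instanceid, status, token):
    # Open an insecure channel to the master's batch RPC port.
    channel = grpc.insecure_channel('%s:%s' % (master_ip, master_port))
    stub = rpc_pb2_grpc.MasterStub(channel)
    # instanceStatus uses the Status enum (WAITING/RUNNING/COMPLETED/FAILED/TIMEOUT);
    # token lets the master match the report to the task it dispatched.
    msg = rpc_pb2.TaskMsg(taskid=taskid, instanceid=instanceid,
                          instanceStatus=status, token=token)
    reply = stub.report(msg)
    return reply.status == rpc_pb2.Reply.ACCEPTED

# Hypothetical usage:
# report_instance_status('0.0.0.0', '50050', 'test', 0, rpc_pb2.COMPLETED, 'test')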