New Batch Framework

This commit is contained in:
zhuyj17 2019-03-03 17:13:02 +08:00
parent 583bd42e90
commit e386131cff
5 changed files with 853 additions and 296 deletions

View File

@ -16,13 +16,14 @@ from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, Wo
from utils import env
class Task():
def __init__(self, info, priority):
self.info = info
self.status = WAITING
self.instance_list = []
self.subtask_list = []
self.token = ''
self.atSameTime = True
self.multicommand = True
# priority the bigger the better
# self.priority the smaller the better
self.priority = int(time.time()) / 60 / 60 - priority
@ -30,7 +31,6 @@ class Task():
def __lt__(self, other):
return self.priority < other.priority
class TaskReporter(MasterServicer):
def __init__(self, taskmgr):
@ -41,7 +41,6 @@ class TaskReporter(MasterServicer):
self.taskmgr.on_task_report(task_report)
return Reply(status=Reply.ACCEPTED, message='')
class TaskMgr(threading.Thread):
# load task information from etcd
@ -55,7 +54,7 @@ class TaskMgr(threading.Thread):
self.lazy_append_list = []
self.lazy_delete_list = []
self.task_queue_lock = threading.Lock()
self.user_containers = {}
#self.user_containers = {}
self.scheduler_interval = scheduler_interval
self.logger = logger
@ -87,13 +86,12 @@ class TaskMgr(threading.Thread):
self.serve()
while not self.thread_stop:
self.sort_out_task_queue()
task, instance_id, worker = self.task_scheduler()
if task is not None and worker is not None:
self.task_processor(task, instance_id, worker)
task, workers = self.task_scheduler()
if task is not None and workers is not None:
self.task_processor(task, workers)
else:
time.sleep(self.scheduler_interval)
def serve(self):
self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
add_MasterServicer_to_server(TaskReporter(self), self.server)
@ -101,104 +99,11 @@ class TaskMgr(threading.Thread):
self.server.start()
self.logger.info('[taskmgr_rpc] start rpc server')
# Ask the scheduler loop to exit and stop the rpc server.
def stop(self):
# run() re-checks this flag at the top of every scheduling round
self.thread_stop = True
# 0 is passed as the argument of the grpc server's stop()
self.server.stop(0)
self.logger.info('[taskmgr_rpc] stop rpc server')
# Called when a worker sends a heart-beat rpc report.
# Applies one report to the instance it names: the report is dropped when the
# task is unknown or the token differs from the one stored on the instance.
def on_task_report(self, report):
self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus))
task = self.get_task(report.taskid)
if task == None:
self.logger.error('[on_task_report] task not found')
return
# NOTE(review): instanceid is used as a list index without a bounds check
instance = task.instance_list[report.instanceid]
if instance['token'] != report.token:
self.logger.warning('[on_task_report] wrong token')
return
username = task.info.username
# same naming scheme as the one task_processor uses when it records the container
container_name = username + '-batch-' + task.info.id + '-' + str(report.instanceid) + '-' + report.token
# NOTE(review): executed for every accepted report, whatever status it carries
self.user_containers[username].remove(container_name)
if instance['status'] != RUNNING:
self.logger.error('[on_task_report] receive task report when instance is not running')
# the instance just left RUNNING: release the resources reserved on its worker
if instance['status'] == RUNNING and report.instanceStatus != RUNNING:
self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu
self.gpu_usage[instance['worker']] -= task.info.cluster.instance.gpu
instance['status'] = report.instanceStatus
instance['error_msg'] = report.errmsg
if report.instanceStatus == COMPLETED:
self.check_task_completed(task)
elif report.instanceStatus == FAILED or report.instanceStatus == TIMEOUT:
# retries exhausted -> evaluate the whole task, otherwise announce a retry
if instance['try_count'] > task.info.maxRetryCount:
self.check_task_completed(task)
else:
reason = 'FAILED' if report.instanceStatus == FAILED else 'TIMEOUT'
self.task_retrying(task, reason, instance['try_count'])
elif report.instanceStatus == OUTPUTERROR:
# an output error fails the task at once, without looking at other instances
self.task_failed(task,"OUTPUTERROR")
# Decide whether the task as a whole is finished and report it as completed or
# failed; returns without reporting while any instance is still pending.
def check_task_completed(self, task):
# not all instances have been created yet
if len(task.instance_list) < task.info.instanceCount:
return
failed = False
reason = "FAILED"
for instance in task.instance_list:
if instance['status'] == RUNNING or instance['status'] == WAITING:
return
if instance['status'] == FAILED or instance['status'] == TIMEOUT:
if instance['try_count'] > task.info.maxRetryCount:
failed = True
if instance['status'] == TIMEOUT:
reason = "TIMEOUT"
# NOTE(review): presumably paired with the try_count check (a retry is
# still possible) -- indentation is lost in this view, confirm
else:
return
if instance['status'] == OUTPUTERROR:
failed = True
break
if failed:
self.task_failed(task,reason)
else:
self.task_completed(task)
# Mark the task COMPLETED, report 'finished' to the job manager (if one is set)
# and queue the task for removal from the task queue.
def task_completed(self, task):
task.status = COMPLETED
if self.jobmgr is None:
self.logger.error('[task_completed] jobmgr is None!')
else:
self.jobmgr.report(task.info.id,'finished')
self.logger.info('task %s completed' % task.info.id)
# the actual removal happens later in sort_out_task_queue()
self.lazy_delete_list.append(task)
# Mark the task FAILED, report 'failed' with the given reason to the job manager
# (if one is set) and queue the task for removal from the task queue.
def task_failed(self, task, reason):
task.status = FAILED
if self.jobmgr is None:
self.logger.error('[task_failed] jobmgr is None!')
else:
# the last argument is maxRetryCount + 1
self.jobmgr.report(task.info.id,'failed', reason, task.info.maxRetryCount+1)
self.logger.info('task %s failed' % task.info.id)
# the actual removal happens later in sort_out_task_queue()
self.lazy_delete_list.append(task)
# Report 'retrying' for the task to the job manager together with the reason
# and the number of tries; only logs an error when no job manager is set.
def task_retrying(self, task, reason, tried_times):
if self.jobmgr is None:
self.logger.error('[task_retrying] jobmgr is None!')
else:
self.jobmgr.report(task.info.id,'retrying',reason,tried_times)
@queue_lock
def sort_out_task_queue(self):
while self.lazy_delete_list:
@ -210,63 +115,15 @@ class TaskMgr(threading.Thread):
self.task_queue.append(task)
self.task_queue = sorted(self.task_queue, key=lambda x: x.priority)
# Dispatch one instance of the task to the worker at worker_ip: record the
# bookkeeping for the instance, then send the task over rpc.
def task_processor(self, task, instance_id, worker_ip):
task.status = RUNNING
# NOTE(review): unlike task_completed/task_failed there is no jobmgr None check here
self.jobmgr.report(task.info.id,'running')
# properties for transaction
task.info.instanceid = instance_id
# fresh 8-character token; on_task_report compares it with the token of each report
task.info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8))
instance = task.instance_list[instance_id]
instance['status'] = RUNNING
instance['try_count'] += 1
instance['token'] = task.info.token
instance['worker'] = worker_ip
# reserve the instance's resources on the chosen worker
self.cpu_usage[worker_ip] += task.info.cluster.instance.cpu
self.gpu_usage[worker_ip] += task.info.cluster.instance.gpu
username = task.info.username
container_name = task.info.username + '-batch-' + task.info.id + '-' + str(instance_id) + '-' + task.info.token
if not username in self.user_containers.keys():
self.user_containers[username] = []
self.user_containers[username].append(container_name)
try:
self.logger.info('[task_processor] processing task [%s] instance [%d]' % (task.info.id, task.info.instanceid))
channel = grpc.insecure_channel('%s:%s' % (worker_ip, self.worker_port))
stub = WorkerStub(channel)
response = stub.process_task(task.info)
# a reply other than ACCEPTED is handled like an rpc error
if response.status != Reply.ACCEPTED:
raise Exception(response.message)
except Exception as e:
self.logger.error('[task_processor] rpc error message: %s' % e)
# undo the dispatch: the try does not count and the container name is dropped
# NOTE(review): cpu_usage/gpu_usage reserved above are not released on this path
instance['status'] = FAILED
instance['try_count'] -= 1
self.user_containers[username].remove(container_name)
# return task, worker
# return task, workers
def task_scheduler(self):
# simple FIFO with priority
self.logger.info('[task_scheduler] scheduling... (%d tasks remains)' % len(self.task_queue))
# nodes = self.get_all_nodes()
# if nodes is None or len(nodes) == 0:
# self.logger.info('[task_scheduler] no nodes found')
# else:
# for worker_ip, worker_info in nodes:
# self.logger.info('[task_scheduler] nodes %s' % worker_ip)
# for key in worker_info:
# if key == 'cpu':
# self.logger.info('[task_scheduler] %s: %d/%d' % (key, self.get_cpu_usage(worker_ip), worker_info[key]))
# else:
# self.logger.info('[task_scheduler] %s: %d' % (key, worker_info[key]))
for task in self.task_queue:
if task in self.lazy_delete_list:
continue
worker = self.find_proper_worker(task)
for index, instance in enumerate(task.instance_list):
@ -297,27 +154,35 @@ class TaskMgr(threading.Thread):
self.check_task_completed(task)
return None, None, None
return None, None
def find_proper_worker(self, task):
def find_proper_workers(self, vnodes_configs):
nodes = self.get_all_nodes()
if nodes is None or len(nodes) == 0:
self.logger.warning('[task_scheduler] running nodes not found')
return None
proper_workers = []
for needs in vnodes_configs:
for worker_ip, worker_info in nodes:
if task.info.cluster.instance.cpu + self.get_cpu_usage(worker_ip) > worker_info['cpu']:
if needs['cpu'] + self.get_cpu_usage(worker_ip) > worker_info['cpu']:
continue
if task.info.cluster.instance.memory > worker_info['memory']:
elif needs['memory'] > worker_info['memory']:
continue
# try not to assign non-gpu task to a worker with gpu
if task.info.cluster.instance.gpu == 0 and worker_info['gpu'] > 0:
#if needs['gpu'] == 0 and worker_info['gpu'] > 0:
#continue
elif needs['gpu'] + self.get_gpu_usage(worker_ip) > worker_info['gpu']:
continue
if task.info.cluster.instance.gpu + self.get_gpu_usage(worker_ip) > worker_info['gpu']:
continue
return worker_ip
else:
worker_info['cpu'] -= needs['cpu']
worker_info['memory'] -= needs['memory']
worker_info['gpu'] -= needs['gpu']
proper_workers.append(worker_ip)
break
else:
return None
return proper_workers
def get_all_nodes(self):
# cache running nodes
@ -328,12 +193,10 @@ class TaskMgr(threading.Thread):
all_nodes = [(node_ip, self.get_worker_resource_info(node_ip)) for node_ip in node_ips]
return all_nodes
# Return whether `worker` is among the batch node IPs reported by nodemgr.
def is_alive(self, worker):
nodes = self.nodemgr.get_batch_nodeips()
return worker in nodes
def get_worker_resource_info(self, worker_ip):
fetcher = self.monitor_fetcher(worker_ip)
worker_info = fetcher.info
@ -344,27 +207,6 @@ class TaskMgr(threading.Thread):
info['gpu'] = len(worker_info['gpuinfo'])
return info
# Return the CPU usage recorded for worker_ip; when the lookup fails, the worker
# is registered with a usage of 0 and 0 is returned.
def get_cpu_usage(self, worker_ip):
try:
return self.cpu_usage[worker_ip]
# NOTE(review): bare except -- catches every exception, not only the missing key
except:
self.cpu_usage[worker_ip] = 0
return 0
# Return the GPU usage recorded for worker_ip; when the lookup fails, the worker
# is registered with a usage of 0 and 0 is returned.
def get_gpu_usage(self, worker_ip):
try:
return self.gpu_usage[worker_ip]
# NOTE(review): bare except -- catches every exception, not only the missing key
except:
self.gpu_usage[worker_ip] = 0
return 0
# Store the job manager; the task_* methods call its report() with status changes.
def set_jobmgr(self, jobmgr):
self.jobmgr = jobmgr
# save the task information into database
# called when jobmgr assign task to taskmgr
def add_task(self, username, taskid, json_task, task_priority=1):

424
src/master/taskmgr_old.py Normal file
View File

@ -0,0 +1,424 @@
import threading
import time
import string
import random
import json
from functools import wraps
# must import logger after initlogging, ugly
from utils.log import logger
# grpc
from concurrent import futures
import grpc
from protos.rpc_pb2 import *
from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub
from utils import env
class Task():
    """A queued batch task together with its per-instance bookkeeping."""

    def __init__(self, info, priority):
        # The `priority` argument: the bigger the better.
        # `self.priority`: the smaller the better. It is the creation time in
        # hours minus the requested priority.
        created_hours = int(time.time()) / 60 / 60
        self.priority = created_hours - priority
        self.token = ''
        self.instance_list = []
        self.status = WAITING
        self.info = info

    def __lt__(self, other):
        """Compare two tasks by `self.priority`."""
        return self.priority < other.priority
class TaskReporter(MasterServicer):
    """Master-side rpc servicer that forwards worker reports to the task manager."""

    def __init__(self, taskmgr):
        self.taskmgr = taskmgr

    def report(self, request, context):
        """Pass every task message of `request` to the task manager, then accept."""
        for message in request.taskmsgs:
            self.taskmgr.on_task_report(message)
        reply = Reply(status=Reply.ACCEPTED, message='')
        return reply
class TaskMgr(threading.Thread):
# load task information from etcd
# initialize a task queue and task scheduler
# taskmgr: a taskmgr instance
def __init__(self, nodemgr, monitor_fetcher, scheduler_interval=2, external_logger=None):
    """Initialise the manager's state; nothing is started here.

    nodemgr            -- stored; asked for the batch node IPs elsewhere
    monitor_fetcher    -- stored; called with a worker IP elsewhere
    scheduler_interval -- seconds run() sleeps when nothing can be dispatched
    external_logger    -- accepted but unused; the module-level logger is used
    """
    threading.Thread.__init__(self)
    self.thread_stop = False
    self.scheduler_interval = scheduler_interval
    self.logger = logger
    self.jobmgr = None

    # task queue plus the buffers of deferred additions/removals
    self.task_queue_lock = threading.Lock()
    self.task_queue = []
    self.lazy_append_list = []
    self.lazy_delete_list = []

    # username -> list of batch container names
    self.user_containers = {}

    # rpc ports
    self.master_port = env.getenv('BATCH_MASTER_PORT')
    self.worker_port = env.getenv('BATCH_WORKER_PORT')

    # nodes
    self.monitor_fetcher = monitor_fetcher
    self.nodemgr = nodemgr
    self.gpu_usage = {}
    self.cpu_usage = {}
    # self.all_nodes = None
    # self.last_nodes_info_update_time = 0
    # self.nodes_info_update_interval = 30 # (s)
def queue_lock(f):
    """Decorator: run the wrapped method while holding `self.task_queue_lock`.

    The lock is released even when the wrapped method raises, so a failing
    call cannot leave the queue locked for every later caller.
    """
    @wraps(f)
    def new_f(self, *args, **kwargs):
        # `with` guarantees the release on both the normal and the error path
        with self.task_queue_lock:
            return f(self, *args, **kwargs)
    return new_f
def run(self):
    """Thread body: start the rpc server, then schedule tasks until stopped."""
    self.serve()
    while not self.thread_stop:
        self.sort_out_task_queue()
        scheduled = self.task_scheduler()
        task, instance_id, worker = scheduled
        if task is None or worker is None:
            # nothing to dispatch in this round; wait before trying again
            time.sleep(self.scheduler_interval)
            continue
        self.task_processor(task, instance_id, worker)
def serve(self):
    """Create the rpc server, register the report servicer and start it."""
    pool = futures.ThreadPoolExecutor(max_workers=10)
    self.server = grpc.server(pool)
    reporter = TaskReporter(self)
    add_MasterServicer_to_server(reporter, self.server)
    listen_addr = '[::]:' + self.master_port
    self.server.add_insecure_port(listen_addr)
    self.server.start()
    self.logger.info('[taskmgr_rpc] start rpc server')
def stop(self):
    """Ask the scheduler loop to exit and stop the rpc server."""
    # run() re-checks this flag at the top of every scheduling round
    self.thread_stop = True
    rpc_server = self.server
    rpc_server.stop(0)
    self.logger.info('[taskmgr_rpc] stop rpc server')
# called when a worker sends a heart-beat rpc report
def on_task_report(self, report):
    """Apply one worker report to the task instance it refers to.

    The report is logged and dropped when the task is unknown, when
    `report.instanceid` does not address an existing instance, or when its
    token differs from the one stored on the instance. Otherwise the
    container name is taken off the user's list (if it is still there), the
    resources of an instance that just left RUNNING are released, the
    instance's status and error message are updated and the task-level
    follow-up (completion check, retry notice or failure) is triggered.
    """
    self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.instanceid, report.instanceStatus))
    task = self.get_task(report.taskid)
    if task is None:
        self.logger.error('[on_task_report] task not found')
        return

    # A bad id must not raise inside the rpc handler, and a negative id must
    # not silently address an instance counted from the end of the list.
    if not 0 <= report.instanceid < len(task.instance_list):
        self.logger.error('[on_task_report] instance not found')
        return
    instance = task.instance_list[report.instanceid]

    # an instance that was never dispatched has no token yet
    if instance.get('token') != report.token:
        self.logger.warning('[on_task_report] wrong token')
        return

    username = task.info.username
    container_name = username + '-batch-' + task.info.id + '-' + str(report.instanceid) + '-' + report.token
    # A repeated report, or one arriving after a failed dispatch, finds the
    # name already removed; that must not abort the handling of the report.
    containers = self.user_containers.get(username, [])
    if container_name in containers:
        containers.remove(container_name)

    new_status = report.instanceStatus
    if instance.get('status') != RUNNING:
        self.logger.error('[on_task_report] receive task report when instance is not running')
    elif new_status != RUNNING:
        # the instance just left RUNNING: release what was reserved on its worker
        self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu
        self.gpu_usage[instance['worker']] -= task.info.cluster.instance.gpu

    instance['status'] = new_status
    instance['error_msg'] = report.errmsg

    if new_status == COMPLETED:
        self.check_task_completed(task)
    elif new_status == FAILED or new_status == TIMEOUT:
        if instance['try_count'] > task.info.maxRetryCount:
            # retries exhausted: evaluate the task as a whole
            self.check_task_completed(task)
        else:
            reason = 'FAILED' if new_status == FAILED else 'TIMEOUT'
            self.task_retrying(task, reason, instance['try_count'])
    elif new_status == OUTPUTERROR:
        # an output error fails the task without looking at other instances
        self.task_failed(task, "OUTPUTERROR")
def check_task_completed(self, task):
    """Report the task as completed or failed once all its instances settled.

    Returns without reporting while an instance is missing, waiting, running
    or still eligible for a retry.
    """
    if len(task.instance_list) < task.info.instanceCount:
        # not every instance has been created yet
        return

    failed = False
    reason = "FAILED"
    for instance in task.instance_list:
        status = instance['status']
        if status in (RUNNING, WAITING):
            return
        if status in (FAILED, TIMEOUT):
            if not instance['try_count'] > task.info.maxRetryCount:
                # a retry is still possible
                return
            failed = True
            if status == TIMEOUT:
                reason = "TIMEOUT"
        if status == OUTPUTERROR:
            failed = True
            break

    if not failed:
        self.task_completed(task)
    else:
        self.task_failed(task, reason)
def task_completed(self, task):
    """Mark the task COMPLETED, notify the job manager and queue its removal."""
    task.status = COMPLETED
    if self.jobmgr is not None:
        self.jobmgr.report(task.info.id, 'finished')
    else:
        self.logger.error('[task_completed] jobmgr is None!')
    self.logger.info('task %s completed' % task.info.id)
    # removed from the task queue later, in sort_out_task_queue()
    self.lazy_delete_list.append(task)
def task_failed(self, task, reason):
    """Mark the task FAILED, notify the job manager and queue its removal."""
    task.status = FAILED
    if self.jobmgr is not None:
        tried_times = task.info.maxRetryCount + 1
        self.jobmgr.report(task.info.id, 'failed', reason, tried_times)
    else:
        self.logger.error('[task_failed] jobmgr is None!')
    self.logger.info('task %s failed' % task.info.id)
    # removed from the task queue later, in sort_out_task_queue()
    self.lazy_delete_list.append(task)
def task_retrying(self, task, reason, tried_times):
    """Tell the job manager that the task is being retried.

    Only logs an error when no job manager has been set.
    """
    if self.jobmgr is not None:
        self.jobmgr.report(task.info.id, 'retrying', reason, tried_times)
        return
    self.logger.error('[task_retrying] jobmgr is None!')
@queue_lock
def sort_out_task_queue(self):
    """Apply the deferred removals and additions, then re-sort by priority.

    Entries are popped one at a time because the lazy lists are appended to
    by other methods (add_task, task_completed, task_failed) while this runs.
    """
    while self.lazy_delete_list:
        task = self.lazy_delete_list.pop(0)
        # A task can be queued for deletion more than once (task_failed is
        # called for every OUTPUTERROR report); removing it a second time
        # must not raise and end the scheduler thread.
        if task in self.task_queue:
            self.task_queue.remove(task)
    if self.lazy_append_list:
        while self.lazy_append_list:
            task = self.lazy_append_list.pop(0)
            self.task_queue.append(task)
        # smaller `priority` values are scheduled first
        self.task_queue = sorted(self.task_queue, key=lambda x: x.priority)
def task_processor(self, task, instance_id, worker_ip):
    """Dispatch one instance of `task` to the worker at `worker_ip`.

    Records the instance as RUNNING, reserves its cpu/gpu on the worker,
    registers the container name for the user and sends the task over rpc.
    When the rpc fails or is not accepted, the dispatch is undone: the
    instance becomes FAILED, the try is not counted, the container name is
    dropped and the reserved cpu/gpu are given back to the worker.
    """
    task.status = RUNNING
    # same guard as task_completed/task_failed/task_retrying
    if self.jobmgr is None:
        self.logger.error('[task_processor] jobmgr is None!')
    else:
        self.jobmgr.report(task.info.id, 'running')

    # properties for transaction
    task.info.instanceid = instance_id
    # fresh token; on_task_report only accepts reports that carry it
    task.info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8))

    instance = task.instance_list[instance_id]
    instance['status'] = RUNNING
    instance['try_count'] += 1
    instance['token'] = task.info.token
    instance['worker'] = worker_ip

    cpu = task.info.cluster.instance.cpu
    gpu = task.info.cluster.instance.gpu
    self.cpu_usage[worker_ip] += cpu
    self.gpu_usage[worker_ip] += gpu

    username = task.info.username
    container_name = task.info.username + '-batch-' + task.info.id + '-' + str(instance_id) + '-' + task.info.token
    self.user_containers.setdefault(username, []).append(container_name)

    try:
        self.logger.info('[task_processor] processing task [%s] instance [%d]' % (task.info.id, task.info.instanceid))
        channel = grpc.insecure_channel('%s:%s' % (worker_ip, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.process_task(task.info)
        # a reply other than ACCEPTED is handled like an rpc error
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        instance['status'] = FAILED
        instance['try_count'] -= 1
        # No other code path releases resources of an instance that is not
        # RUNNING, so give the reservation back here; otherwise every failed
        # dispatch permanently shrinks the worker's usable capacity.
        self.cpu_usage[worker_ip] -= cpu
        self.gpu_usage[worker_ip] -= gpu
        self.user_containers[username].remove(container_name)
# Pick the next instance to dispatch.
# Returns (task, instance_id, worker); all three are None when nothing can be scheduled.
# NOTE(review): indentation is lost in this view, so the nesting described in the
# comments below is inferred from the syntax -- confirm against the repository.
def task_scheduler(self):
# simple FIFO with priority
self.logger.info('[task_scheduler] scheduling... (%d tasks remains)' % len(self.task_queue))
# nodes = self.get_all_nodes()
# if nodes is None or len(nodes) == 0:
# self.logger.info('[task_scheduler] no nodes found')
# else:
# for worker_ip, worker_info in nodes:
# self.logger.info('[task_scheduler] nodes %s' % worker_ip)
# for key in worker_info:
# if key == 'cpu':
# self.logger.info('[task_scheduler] %s: %d/%d' % (key, self.get_cpu_usage(worker_ip), worker_info[key]))
# else:
# self.logger.info('[task_scheduler] %s: %d' % (key, worker_info[key]))
for task in self.task_queue:
# tasks already queued for removal are not scheduled again
if task in self.lazy_delete_list:
continue
# None when no running node can currently host an instance of this task
worker = self.find_proper_worker(task)
for index, instance in enumerate(task.instance_list):
# find instance to retry
if (instance['status'] == FAILED or instance['status'] == TIMEOUT) and instance['try_count'] <= task.info.maxRetryCount:
if worker is not None:
self.logger.info('[task_scheduler] retry')
return task, index, worker
# find timeout instance
elif instance['status'] == RUNNING:
if not self.is_alive(instance['worker']):
# the worker is gone: fail the instance, clear its token and
# release the resources that were reserved on that worker
instance['status'] = FAILED
instance['token'] = ''
self.cpu_usage[instance['worker']] -= task.info.cluster.instance.cpu
self.gpu_usage[instance['worker']] -= task.info.cluster.instance.gpu
self.logger.warning('[task_scheduler] worker dead, retry task [%s] instance [%d]' % (task.info.id, index))
if worker is not None:
return task, index, worker
if worker is not None:
# start new instance
if len(task.instance_list) < task.info.instanceCount:
# a new instance starts with only a try counter; task_processor fills in the rest
instance = {}
instance['try_count'] = 0
task.instance_list.append(instance)
return task, len(task.instance_list) - 1, worker
# nothing to dispatch for this task: check whether it is finished
self.check_task_completed(task)
return None, None, None
def find_proper_worker(self, task):
    """Return the IP of the first running node that can host one instance.

    Returns None when there are no running nodes or none of them fits.
    """
    nodes = self.get_all_nodes()
    if nodes is None or len(nodes) == 0:
        self.logger.warning('[task_scheduler] running nodes not found')
        return None

    for worker_ip, worker_info in nodes:
        wanted = task.info.cluster.instance
        # evaluated left to right and short-circuited, like a chain of checks
        rejected = (
            wanted.cpu + self.get_cpu_usage(worker_ip) > worker_info['cpu']
            or wanted.memory > worker_info['memory']
            # try not to assign non-gpu task to a worker with gpu
            or (wanted.gpu == 0 and worker_info['gpu'] > 0)
            or wanted.gpu + self.get_gpu_usage(worker_ip) > worker_info['gpu']
        )
        if not rejected:
            return worker_ip
    return None
def get_all_nodes(self):
    """Return a list of (node_ip, resource_info) pairs for the batch nodes."""
    # cache running nodes
    # if self.all_nodes is not None and time.time() - self.last_nodes_info_update_time < self.nodes_info_update_interval:
    #     return self.all_nodes
    # get running nodes
    all_nodes = []
    for node_ip in self.nodemgr.get_batch_nodeips():
        resources = self.get_worker_resource_info(node_ip)
        all_nodes.append((node_ip, resources))
    return all_nodes
def is_alive(self, worker):
    """Return whether `worker` is among the batch node IPs reported by nodemgr."""
    return worker in self.nodemgr.get_batch_nodeips()
def get_worker_resource_info(self, worker_ip):
    """Condense the monitor data of one worker into cpu/memory/disk/gpu figures."""
    worker_info = self.monitor_fetcher(worker_ip).info
    meminfo = worker_info['meminfo']
    return {
        # one entry per cpu / gpu
        'cpu': len(worker_info['cpuconfig']),
        'memory': (meminfo['buffers'] + meminfo['cached'] + meminfo['free']) / 1024,  # (Mb)
        'disk': sum([disk['free'] for disk in worker_info['diskinfo']]) / 1024 / 1024,  # (Mb)
        'gpu': len(worker_info['gpuinfo']),
    }
def get_cpu_usage(self, worker_ip):
    """Return the CPU amount currently reserved on `worker_ip`.

    A worker seen for the first time is registered with a usage of 0, so the
    later `+=` / `-=` updates of `self.cpu_usage` find an entry.
    """
    # setdefault replaces the former bare `except:`, which hid every error
    return self.cpu_usage.setdefault(worker_ip, 0)
def get_gpu_usage(self, worker_ip):
    """Return the GPU amount currently reserved on `worker_ip`.

    A worker seen for the first time is registered with a usage of 0, so the
    later `+=` / `-=` updates of `self.gpu_usage` find an entry.
    """
    # setdefault replaces the former bare `except:`, which hid every error
    return self.gpu_usage.setdefault(worker_ip, 0)
def set_jobmgr(self, jobmgr):
    """Store the job manager whose report() is called on task status changes."""
    self.jobmgr = jobmgr
# save the task information into database
# called when jobmgr assign task to taskmgr
def add_task(self, username, taskid, json_task, task_priority=1):
    """Build a Task from the job manager's description and queue it.

    `json_task` is read like a dict (the json decoding is commented out).
    The new task is put on `lazy_append_list`; it enters the task queue on
    the next call of sort_out_task_queue().
    """
    # decode json string to object defined in grpc
    self.logger.info('[taskmgr add_task] receive task %s' % taskid)
    image_dict = {
        "private": Image.PRIVATE,
        "base": Image.BASE,
        "public": Image.PUBLIC
    }
    # json_task = json.loads(json_task)

    instance_count = int(json_task['instCount'])
    max_retry_count = int(json_task['retryCount'])
    timeout = int(json_task['expTime'])

    command = Command(
        commandLine=json_task['command'],
        packagePath=json_task['srcAddr'],
        envVars={})
    parameters = Parameters(
        command=command,
        stderrRedirectPath=json_task.get('stdErrRedPth', ""),
        stdoutRedirectPath=json_task.get('stdOutRedPth', ""))

    # the image string is read as <name>_<owner>_<type>; an empty owner
    # falls back to the submitting user
    image_parts = json_task['image'].split('_')
    image = Image(
        name=image_parts[0],
        type=image_dict[image_parts[2]],
        owner=image_parts[1] if image_parts[1] else username)
    instance = Instance(
        cpu=int(json_task['cpuSetting']),
        memory=int(json_task['memorySetting']),
        disk=int(json_task['diskSetting']),
        gpu=int(json_task['gpuSetting']))

    info = TaskInfo(
        id=taskid,
        username=username,
        instanceCount=instance_count,
        maxRetryCount=max_retry_count,
        timeout=timeout,
        parameters=parameters,
        cluster=Cluster(image=image, instance=instance))
    task = Task(info, priority=task_priority)

    if 'mapping' in json_task:
        mapping = json_task['mapping']
        mounts = []
        for mapping_key in mapping:
            mounts.append(Mount(localPath=mapping[mapping_key]['mappingLocalDir'],
                                remotePath=mapping[mapping_key]['mappingRemoteDir']))
        task.info.cluster.mount.extend(mounts)

    self.lazy_append_list.append(task)
# user: username
# get the information of a task, including the status, task description and other information
@queue_lock
def get_task(self, taskid):
    """Return the first queued task whose `info.id` equals `taskid`, or None."""
    matches = (task for task in self.task_queue if task.info.id == taskid)
    return next(matches, None)
# get names of all the batch containers of the user
def get_user_batch_containers(self,username):
    """Return the batch container names recorded for `username` ([] if none)."""
    try:
        return self.user_containers[username]
    except KeyError:
        return []

View File

@ -5,8 +5,18 @@ service Master {
}
service Worker {
rpc process_task (TaskInfo) returns (Reply) {}
rpc stop_tasks (ReportMsg) returns (Reply) {}
rpc start_vnode (VNodeInfo) returns (Reply) {}
rpc start_task (TaskInfo) returns (Reply) {}
rpc stop_task (ReportMsg) returns (Reply) {}
rpc stop_vnode (VNodeInfo) returns (Reply) {}
}
message VNodeInfo {
string taskid = 1;
string username = 2;
int32 vnodeid = 3;
Parameters parameters = 4; //
VNode vnode = 5; //
}
message Reply {
@ -26,8 +36,8 @@ message ReportMsg {
message TaskMsg {
string taskid = 1;
string username = 2;
int32 instanceid = 3;
Status instanceStatus = 4; //
int32 vnodeid = 3;
Status subTaskStatus = 4; //
string token = 5;
string errmsg = 6;
}
@ -42,16 +52,12 @@ enum Status {
}
message TaskInfo {
string id = 1;
string taskid = 1;
string username = 2;
int32 instanceid = 3;
int32 instanceCount = 4; //
int32 maxRetryCount = 5; //
Parameters parameters = 6; //
Cluster cluster = 7; //
int32 timeout = 8; //
string token = 9;
bool reused = 10; //
int32 vnodeid = 3;
Parameters parameters = 4; //
int32 timeout = 5; //
string token = 6;
}
message Parameters {
@ -66,10 +72,18 @@ message Command {
map<string, string> envVars = 3; //
}
message Cluster {
message VNode {
Image image = 1; //
Instance instance = 2; //
repeated Mount mount = 3; //
Network network = 4; //
}
message Network {
string ipaddr = 1;
string gateway = 2;
string masterip = 3;
string brname = 4;
}
message Image {

View File

@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
name='rpc.proto',
package='',
syntax='proto3',
serialized_pb=_b('\n\trpc.proto\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\'\n\tReportMsg\x12\x1a\n\x08taskmsgs\x18\x01 \x03(\x0b\x32\x08.TaskMsg\"\x7f\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x1f\n\x0einstanceStatus\x18\x04 \x01(\x0e\x32\x07.Status\x12\r\n\x05token\x18\x05 \x01(\t\x12\x0e\n\x06\x65rrmsg\x18\x06 \x01(\t\"\xd6\x01\n\x08TaskInfo\x12\n\n\x02id\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x12\n\ninstanceid\x18\x03 \x01(\x05\x12\x15\n\rinstanceCount\x18\x04 \x01(\x05\x12\x15\n\rmaxRetryCount\x18\x05 \x01(\x05\x12\x1f\n\nparameters\x18\x06 \x01(\x0b\x32\x0b.Parameters\x12\x19\n\x07\x63luster\x18\x07 \x01(\x0b\x32\x08.Cluster\x12\x0f\n\x07timeout\x18\x08 \x01(\x05\x12\r\n\x05token\x18\t \x01(\t\x12\x0e\n\x06reused\x18\n \x01(\x08\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"T\n\x07\x43luster\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 \x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\"u\n\x05Mount\x12\x10\n\x08provider\x18\x01 \x01(\t\x12\x11\n\tlocalPath\x18\x02 
\x01(\t\x12\x12\n\nremotePath\x18\x03 \x01(\t\x12\x11\n\taccessKey\x18\x04 \x01(\t\x12\x11\n\tsecretKey\x18\x05 \x01(\t\x12\r\n\x05other\x18\x06 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*[\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x12\x0f\n\x0bOUTPUTERROR\x10\x05\x32(\n\x06Master\x12\x1e\n\x06report\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x32Q\n\x06Worker\x12#\n\x0cprocess_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12\"\n\nstop_tasks\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x62\x06proto3')
serialized_pb=_b('\n\trpc.proto\"v\n\tVNodeInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1f\n\nparameters\x18\x04 \x01(\x0b\x32\x0b.Parameters\x12\x15\n\x05vnode\x18\x05 \x01(\x0b\x32\x06.VNode\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\'\n\tReportMsg\x12\x1a\n\x08taskmsgs\x18\x01 \x03(\x0b\x32\x08.TaskMsg\"{\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1e\n\rsubTaskStatus\x18\x04 \x01(\x0e\x32\x07.Status\x12\r\n\x05token\x18\x05 \x01(\t\x12\x0e\n\x06\x65rrmsg\x18\x06 \x01(\t\"~\n\x08TaskInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1f\n\nparameters\x18\x04 \x01(\x0b\x32\x0b.Parameters\x12\x0f\n\x07timeout\x18\x05 \x01(\x05\x12\r\n\x05token\x18\x06 \x01(\t\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"m\n\x05VNode\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\x12\x19\n\x07network\x18\x04 \x01(\x0b\x32\x08.Network\"L\n\x07Network\x12\x0e\n\x06ipaddr\x18\x01 \x01(\t\x12\x0f\n\x07gateway\x18\x02 \x01(\t\x12\x10\n\x08masterip\x18\x03 \x01(\t\x12\x0e\n\x06\x62rname\x18\x04 \x01(\t\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 
\x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\"u\n\x05Mount\x12\x10\n\x08provider\x18\x01 \x01(\t\x12\x11\n\tlocalPath\x18\x02 \x01(\t\x12\x12\n\nremotePath\x18\x03 \x01(\t\x12\x11\n\taccessKey\x18\x04 \x01(\t\x12\x11\n\tsecretKey\x18\x05 \x01(\t\x12\r\n\x05other\x18\x06 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*[\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x12\x0f\n\x0bOUTPUTERROR\x10\x05\x32(\n\x06Master\x12\x1e\n\x06report\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x32\x97\x01\n\x06Worker\x12#\n\x0bstart_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x12!\n\nstart_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12!\n\tstop_task\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x12\"\n\nstop_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x62\x06proto3')
)
_STATUS = _descriptor.EnumDescriptor(
@ -56,8 +56,8 @@ _STATUS = _descriptor.EnumDescriptor(
],
containing_type=None,
options=None,
serialized_start=1134,
serialized_end=1225,
serialized_start=1264,
serialized_end=1355,
)
_sym_db.RegisterEnumDescriptor(_STATUS)
@ -87,8 +87,8 @@ _REPLY_REPLYSTATUS = _descriptor.EnumDescriptor(
],
containing_type=None,
options=None,
serialized_start=75,
serialized_end=115,
serialized_start=195,
serialized_end=235,
)
_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS)
@ -113,12 +113,71 @@ _IMAGE_IMAGETYPE = _descriptor.EnumDescriptor(
],
containing_type=None,
options=None,
serialized_start=899,
serialized_end=945,
serialized_start=1029,
serialized_end=1075,
)
_sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE)
_VNODEINFO = _descriptor.Descriptor(
name='VNodeInfo',
full_name='VNodeInfo',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='taskid', full_name='VNodeInfo.taskid', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='username', full_name='VNodeInfo.username', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnodeid', full_name='VNodeInfo.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='parameters', full_name='VNodeInfo.parameters', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnode', full_name='VNodeInfo.vnode', index=4,
number=5, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=13,
serialized_end=131,
)
_REPLY = _descriptor.Descriptor(
name='Reply',
full_name='Reply',
@ -153,8 +212,8 @@ _REPLY = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=13,
serialized_end=115,
serialized_start=133,
serialized_end=235,
)
@ -184,8 +243,8 @@ _REPORTMSG = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=117,
serialized_end=156,
serialized_start=237,
serialized_end=276,
)
@ -211,14 +270,14 @@ _TASKMSG = _descriptor.Descriptor(
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instanceid', full_name='TaskMsg.instanceid', index=2,
name='vnodeid', full_name='TaskMsg.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instanceStatus', full_name='TaskMsg.instanceStatus', index=3,
name='subTaskStatus', full_name='TaskMsg.subTaskStatus', index=3,
number=4, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
@ -250,8 +309,8 @@ _TASKMSG = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=158,
serialized_end=285,
serialized_start=278,
serialized_end=401,
)
@ -263,7 +322,7 @@ _TASKINFO = _descriptor.Descriptor(
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='id', full_name='TaskInfo.id', index=0,
name='taskid', full_name='TaskInfo.taskid', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
@ -277,61 +336,33 @@ _TASKINFO = _descriptor.Descriptor(
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instanceid', full_name='TaskInfo.instanceid', index=2,
name='vnodeid', full_name='TaskInfo.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instanceCount', full_name='TaskInfo.instanceCount', index=3,
number=4, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
name='parameters', full_name='TaskInfo.parameters', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='maxRetryCount', full_name='TaskInfo.maxRetryCount', index=4,
name='timeout', full_name='TaskInfo.timeout', index=4,
number=5, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='parameters', full_name='TaskInfo.parameters', index=5,
number=6, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='cluster', full_name='TaskInfo.cluster', index=6,
number=7, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='timeout', full_name='TaskInfo.timeout', index=7,
number=8, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='token', full_name='TaskInfo.token', index=8,
number=9, type=9, cpp_type=9, label=1,
name='token', full_name='TaskInfo.token', index=5,
number=6, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='reused', full_name='TaskInfo.reused', index=9,
number=10, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
@ -344,8 +375,8 @@ _TASKINFO = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=288,
serialized_end=502,
serialized_start=403,
serialized_end=529,
)
@ -389,8 +420,8 @@ _PARAMETERS = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=504,
serialized_end=599,
serialized_start=531,
serialized_end=626,
)
@ -427,8 +458,8 @@ _COMMAND_ENVVARSENTRY = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=695,
serialized_end=741,
serialized_start=722,
serialized_end=768,
)
_COMMAND = _descriptor.Descriptor(
@ -471,39 +502,46 @@ _COMMAND = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=602,
serialized_end=741,
serialized_start=629,
serialized_end=768,
)
_CLUSTER = _descriptor.Descriptor(
name='Cluster',
full_name='Cluster',
_VNODE = _descriptor.Descriptor(
name='VNode',
full_name='VNode',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='image', full_name='Cluster.image', index=0,
name='image', full_name='VNode.image', index=0,
number=1, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instance', full_name='Cluster.instance', index=1,
name='instance', full_name='VNode.instance', index=1,
number=2, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='mount', full_name='Cluster.mount', index=2,
name='mount', full_name='VNode.mount', index=2,
number=3, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='network', full_name='VNode.network', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
@ -516,8 +554,60 @@ _CLUSTER = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=743,
serialized_end=827,
serialized_start=770,
serialized_end=879,
)
_NETWORK = _descriptor.Descriptor(
name='Network',
full_name='Network',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='ipaddr', full_name='Network.ipaddr', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='gateway', full_name='Network.gateway', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='masterip', full_name='Network.masterip', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='brname', full_name='Network.brname', index=3,
number=4, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=881,
serialized_end=957,
)
@ -562,8 +652,8 @@ _IMAGE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=829,
serialized_end=945,
serialized_start=959,
serialized_end=1075,
)
@ -628,8 +718,8 @@ _MOUNT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=947,
serialized_end=1064,
serialized_start=1077,
serialized_end=1194,
)
@ -680,37 +770,48 @@ _INSTANCE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=1066,
serialized_end=1132,
serialized_start=1196,
serialized_end=1262,
)
_VNODEINFO.fields_by_name['parameters'].message_type = _PARAMETERS
_VNODEINFO.fields_by_name['vnode'].message_type = _VNODE
_REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS
_REPLY_REPLYSTATUS.containing_type = _REPLY
_REPORTMSG.fields_by_name['taskmsgs'].message_type = _TASKMSG
_TASKMSG.fields_by_name['instanceStatus'].enum_type = _STATUS
_TASKMSG.fields_by_name['subTaskStatus'].enum_type = _STATUS
_TASKINFO.fields_by_name['parameters'].message_type = _PARAMETERS
_TASKINFO.fields_by_name['cluster'].message_type = _CLUSTER
_PARAMETERS.fields_by_name['command'].message_type = _COMMAND
_COMMAND_ENVVARSENTRY.containing_type = _COMMAND
_COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY
_CLUSTER.fields_by_name['image'].message_type = _IMAGE
_CLUSTER.fields_by_name['instance'].message_type = _INSTANCE
_CLUSTER.fields_by_name['mount'].message_type = _MOUNT
_VNODE.fields_by_name['image'].message_type = _IMAGE
_VNODE.fields_by_name['instance'].message_type = _INSTANCE
_VNODE.fields_by_name['mount'].message_type = _MOUNT
_VNODE.fields_by_name['network'].message_type = _NETWORK
_IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE
_IMAGE_IMAGETYPE.containing_type = _IMAGE
DESCRIPTOR.message_types_by_name['VNodeInfo'] = _VNODEINFO
DESCRIPTOR.message_types_by_name['Reply'] = _REPLY
DESCRIPTOR.message_types_by_name['ReportMsg'] = _REPORTMSG
DESCRIPTOR.message_types_by_name['TaskMsg'] = _TASKMSG
DESCRIPTOR.message_types_by_name['TaskInfo'] = _TASKINFO
DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS
DESCRIPTOR.message_types_by_name['Command'] = _COMMAND
DESCRIPTOR.message_types_by_name['Cluster'] = _CLUSTER
DESCRIPTOR.message_types_by_name['VNode'] = _VNODE
DESCRIPTOR.message_types_by_name['Network'] = _NETWORK
DESCRIPTOR.message_types_by_name['Image'] = _IMAGE
DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT
DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE
DESCRIPTOR.enum_types_by_name['Status'] = _STATUS
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
VNodeInfo = _reflection.GeneratedProtocolMessageType('VNodeInfo', (_message.Message,), dict(
DESCRIPTOR = _VNODEINFO,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:VNodeInfo)
))
_sym_db.RegisterMessage(VNodeInfo)
Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict(
DESCRIPTOR = _REPLY,
__module__ = 'rpc_pb2'
@ -761,12 +862,19 @@ Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message,
_sym_db.RegisterMessage(Command)
_sym_db.RegisterMessage(Command.EnvVarsEntry)
Cluster = _reflection.GeneratedProtocolMessageType('Cluster', (_message.Message,), dict(
DESCRIPTOR = _CLUSTER,
VNode = _reflection.GeneratedProtocolMessageType('VNode', (_message.Message,), dict(
DESCRIPTOR = _VNODE,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Cluster)
# @@protoc_insertion_point(class_scope:VNode)
))
_sym_db.RegisterMessage(Cluster)
_sym_db.RegisterMessage(VNode)
Network = _reflection.GeneratedProtocolMessageType('Network', (_message.Message,), dict(
DESCRIPTOR = _NETWORK,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Network)
))
_sym_db.RegisterMessage(Network)
Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict(
DESCRIPTOR = _IMAGE,
@ -799,8 +907,8 @@ _MASTER = _descriptor.ServiceDescriptor(
file=DESCRIPTOR,
index=0,
options=None,
serialized_start=1227,
serialized_end=1267,
serialized_start=1357,
serialized_end=1397,
methods=[
_descriptor.MethodDescriptor(
name='report',
@ -823,27 +931,45 @@ _WORKER = _descriptor.ServiceDescriptor(
file=DESCRIPTOR,
index=1,
options=None,
serialized_start=1269,
serialized_end=1350,
serialized_start=1400,
serialized_end=1551,
methods=[
_descriptor.MethodDescriptor(
name='process_task',
full_name='Worker.process_task',
name='start_vnode',
full_name='Worker.start_vnode',
index=0,
containing_service=None,
input_type=_VNODEINFO,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='start_task',
full_name='Worker.start_task',
index=1,
containing_service=None,
input_type=_TASKINFO,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='stop_tasks',
full_name='Worker.stop_tasks',
index=1,
name='stop_task',
full_name='Worker.stop_task',
index=2,
containing_service=None,
input_type=_REPORTMSG,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='stop_vnode',
full_name='Worker.stop_vnode',
index=3,
containing_service=None,
input_type=_VNODEINFO,
output_type=_REPLY,
options=None,
),
])
_sym_db.RegisterServiceDescriptor(_WORKER)

151
src/worker/taskworker.py Normal file
View File

@ -0,0 +1,151 @@
import sys
if sys.path[0].endswith("worker"):
sys.path[0] = sys.path[0][:-6]
from utils import env, tools
config = env.getenv("CONFIG")
#config = "/opt/docklet/local/docklet-running.conf"
tools.loadenv(config)
from utils.log import initlogging
initlogging("docklet-taskworker")
from utils.log import logger
from concurrent import futures
import grpc
#from utils.log import logger
#from utils import env
import json,lxc,subprocess,threading,os,time,traceback
from utils import imagemgr,etcdlib,gputools
from utils.lvmtool import sys_run
from worker import ossmounter
from protos import rpc_pb2, rpc_pb2_grpc
# Number of seconds in one day; used as the sleep interval of the serve loop
# in TaskWorkerServe.
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# NOTE(review): not referenced in the visible code; presumably the upper bound
# (in seconds) on how long a task may run -- confirm against the callers.
MAX_RUNNING_TIME = _ONE_DAY_IN_SECONDS
class TaskWorker(rpc_pb2_grpc.WorkerServicer):
    """gRPC ``Worker`` servicer for the batch framework on a worker node.

    On construction it connects to etcd, reads the master address from the
    ``service/master`` key, records the GPUs returned by
    ``gputools.get_gpu_status()`` and starts a daemon thread that sends the
    queued task messages to the master every ``report_interval`` seconds.

    The process exits with status 1 if etcd cannot be reached or the master
    address cannot be read.
    """

    def __init__(self):
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info("using ETCD %s" % etcdaddr)

        clustername = env.getenv("CLUSTER_NAME")
        logger.info("using CLUSTER_NAME %s" % clustername)

        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix=clustername)
        except Exception:
            logger.error("connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")

        # get master ip and report port
        success, masterip = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        self.master_ip = masterip
        logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')

        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')

        # Messages waiting to be reported to the master; guarded by msgslock.
        self.taskmsgs = []
        self.msgslock = threading.Lock()
        self.report_interval = 2

        self.lock = threading.Lock()
        self.mount_lock = threading.Lock()
        self.gpu_lock = threading.Lock()
        # One entry per GPU id, initialised to the empty string.
        self.gpu_status = {gpu['id']: "" for gpu in gputools.get_gpu_status()}

        self.start_report()
        logger.info('TaskWorker init success')

    def _not_implemented(self, context, method):
        """Mark the current RPC as UNIMPLEMENTED and raise.

        Returning ``None`` from a handler cannot be serialized into a
        ``Reply``, so unfinished handlers report their state explicitly.

        Raises:
            NotImplementedError: always.
        """
        message = 'Worker.%s is not implemented yet' % method
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(message)
        raise NotImplementedError(message)

    def start_vnode(self, request, context):
        """Handle the ``start_vnode`` RPC (work in progress).

        Decodes the ``VNodeInfo`` request into local settings; the container
        start-up itself is not written yet, so the call currently ends with
        an UNIMPLEMENTED status.
        """
        logger.info('start vnode with config: ' + str(request))
        taskid = request.taskid
        vnodeid = request.vnodeid

        envs = {}
        envs['taskid'] = str(taskid)
        envs['vnodeid'] = str(vnodeid)

        image = {}
        image['name'] = request.vnode.image.name
        if request.vnode.image.type == rpc_pb2.Image.PRIVATE:
            image['type'] = 'private'
        elif request.vnode.image.type == rpc_pb2.Image.PUBLIC:
            image['type'] = 'public'
        else:
            image['type'] = 'base'
        image['owner'] = request.vnode.image.owner

        username = request.username
        lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))
        instance_type = request.vnode.instance
        mount_list = request.vnode.mount
        gpu_need = int(request.vnode.instance.gpu)

        # TODO: use the values decoded above to create and start the container.
        self._not_implemented(context, 'start_vnode')

    def start_task(self, request, context):
        """Handle the ``start_task`` RPC; not implemented yet."""
        self._not_implemented(context, 'start_task')

    def stop_task(self, request, context):
        """Handle the ``stop_task`` RPC; not implemented yet."""
        self._not_implemented(context, 'stop_task')

    def stop_vnode(self, request, context):
        """Handle the ``stop_vnode`` RPC; not implemented yet."""
        self._not_implemented(context, 'stop_vnode')

    def add_msg(self, taskid, username, vnodeid, status, token, errmsg):
        """Queue a ``TaskMsg`` for the next report to the master.

        A failure while building the message is logged and otherwise ignored.
        """
        with self.msgslock:
            try:
                self.taskmsgs.append(rpc_pb2.TaskMsg(
                    taskid=str(taskid), username=username,
                    vnodeid=int(vnodeid), subTaskStatus=status,
                    token=token, errmsg=errmsg))
            except Exception:
                logger.error(traceback.format_exc())

    def report_msg(self):
        """Send the queued messages to the master forever.

        Every ``report_interval`` seconds the queue is swapped for an empty
        list under the lock and the drained batch is sent (an empty batch is
        sent too). The lock is not held during the RPC, so ``add_msg`` never
        waits on the network. A batch whose report fails is logged and
        dropped, not retried.
        """
        channel = grpc.insecure_channel("%s:%s" % (self.master_ip, self.master_port))
        stub = rpc_pb2_grpc.MasterStub(channel)
        while True:
            with self.msgslock:
                pending, self.taskmsgs = self.taskmsgs, []
            try:
                response = stub.report(rpc_pb2.ReportMsg(taskmsgs=pending))
                logger.info("Response from master by reporting: "+str(response.status)+" "+response.message)
            except Exception:
                logger.error(traceback.format_exc())
            time.sleep(self.report_interval)

    def start_report(self):
        """Start the daemon thread that runs ``report_msg``."""
        thread = threading.Thread(target=self.report_msg, args=(), daemon=True)
        thread.start()
        logger.info("Start to report task messages to master every %d seconds." % self.report_interval)
def TaskWorkerServe():
    """Run the TaskWorker gRPC server until the process is interrupted.

    Reads the thread-pool size from ``BATCH_MAX_THREAD_WORKER`` and the
    listening port from ``BATCH_WORKER_PORT``, registers a ``TaskWorker``
    servicer on an insecure port bound to all interfaces, then sleeps in a
    loop. A ``KeyboardInterrupt`` stops the server immediately.
    """
    max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER'))
    worker_port = int(env.getenv('BATCH_WORKER_PORT'))
    logger.info("Max Threads on a worker is %d" % max_threads)

    pool = futures.ThreadPoolExecutor(max_workers=max_threads)
    server = grpc.server(pool)
    servicer = TaskWorker()
    rpc_pb2_grpc.add_WorkerServicer_to_server(servicer, server)
    listen_addr = '[::]:'+str(worker_port)
    server.add_insecure_port(listen_addr)
    server.start()
    logger.info("Start TaskWorker Servicer on port:%d" % worker_port)

    # server.start() does not block, so keep the main thread alive.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
if __name__ == "__main__":
    # The entry point defined in this module is TaskWorkerServe; the former
    # call to TaskControllerServe raised NameError at start-up.
    TaskWorkerServe()