Merge pull request #380 from unias/batch

Batch
This commit is contained in:
Yujian Zhu 2019-04-20 16:09:38 +08:00 committed by GitHub
commit 0deaa377c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
37 changed files with 5539 additions and 72 deletions

View File

@ -18,9 +18,6 @@ DOCKLET_USER=$DOCKLET_HOME/user
# default working directory, default to /opt/docklet
FS_PREFIX=/opt/docklet
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
#network interface , default is eth0
NETWORK_DEVICE=eth0
#etcd server address, default is localhost:2379
@ -32,6 +29,8 @@ WEB_PORT=8888
USER_PORT=9100
#cluster net, default is 172.16.0.1/16
CLUSTER_NET="172.16.0.1/16"
# ip addresses range of containers for batch job, default is 10.16.0.0/16
BATCH_NET="10.16.0.0/16"
#configurable-http-proxy public port, default is 8000
PROXY_PORT=8000
#configurable-http-proxy api port, default is 8001
@ -42,6 +41,9 @@ DISTRIBUTED_GATEWAY=False
export FS_PREFIX
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
# This next line determines what user the script runs as.
DAEMON_USER=root
@ -103,6 +105,7 @@ pre_start_master () {
# iptables for NAT network for containers to access web
iptables -t nat -F
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
}

View File

@ -18,9 +18,6 @@ DOCKLET_USER=$DOCKLET_HOME/user
# default working directory, default to /opt/docklet
FS_PREFIX=/opt/docklet
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
#configurable-http-proxy public port, default is 8000
PROXY_PORT=8000
#configurable-http-proxy api port, default is 8001
@ -36,11 +33,16 @@ WEB_PORT=8888
USER_PORT=9100
#cluster net, default is 172.16.0.1/16
CLUSTER_NET="172.16.0.1/16"
# ip addresses range of containers for batch job, default is 10.16.0.0/16
BATCH_NET="10.16.0.0/16"
. $DOCKLET_CONF/docklet.conf
export FS_PREFIX
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
# This next line determines what user the script runs as.
DAEMON_USER=root
@ -103,6 +105,7 @@ pre_start_master () {
# iptables for NAT network for containers to access web
iptables -t nat -F
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
}

View File

@ -20,19 +20,21 @@ FS_PREFIX=/opt/docklet
# cluster net ip range, default is 172.16.0.1/16
CLUSTER_NET="172.16.0.1/16"
# ip addresses range of containers for batch job, default is 10.16.0.0/16
BATCH_NET="10.16.0.0/16"
#configurable-http-proxy public port, default is 8000
PROXY_PORT=8000
#configurable-http-proxy api port, default is 8001
PROXY_API_PORT=8001
DISTRIBUTED_GATEWAY=False
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
. $DOCKLET_CONF/docklet.conf
export FS_PREFIX
RUN_DIR=$FS_PREFIX/local/run
LOG_DIR=$FS_PREFIX/local/log
# This next line determines what user the script runs as.
DAEMON_USER=root
@ -43,6 +45,13 @@ DAEMON_OPTS=
# The process ID of the script when it runs is stored here:
PIDFILE=$RUN_DIR/$DAEMON_NAME.pid
# settings for docklet batch worker, which is required for batch job processing system
BATCH_ON=True
DAEMON_BATCH=$DOCKLET_LIB/worker/taskworker.py
DAEMON_NAME_BATCH=docklet-taskworker
PIDFILE_BATCH=$RUN_DIR/batch.pid
DAEMON_OPTS_BATCH=
# settings for docklet proxy, which is required for web access
DAEMON_PROXY=`which configurable-http-proxy`
DAEMON_NAME_PROXY=docklet-proxy
@ -83,6 +92,7 @@ pre_start () {
# iptables for NAT network for containers to access web
iptables -t nat -F
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
if [ ! -d $FS_PREFIX/local/basefs ]; then
log_daemon_msg "basefs does not exist, run prepare.sh first" && exit 1
@ -95,12 +105,27 @@ pre_start () {
do_start() {
pre_start
DAEMON_OPTS=$1
log_daemon_msg "Starting $DAEMON_NAME in $FS_PREFIX"
#python3 $DAEMON
start-stop-daemon --start --oknodo --background --pidfile $PIDFILE --make-pidfile --user $DAEMON_USER --chuid $DAEMON_USER --startas $DAEMON -- $DAEMON_OPTS
log_end_msg $?
}
do_start_batch () {
    # Start the batch task worker daemon ($DAEMON_BATCH).
    # Skipped (returns 1) when BATCH_ON is "False" in docklet.conf.
    if [ "$BATCH_ON" = "False" ]
    then
        return 1
    fi
    log_daemon_msg "Starting $DAEMON_NAME_BATCH in $FS_PREFIX"
    # The batch worker currently takes no extra command-line options.
    DAEMON_OPTS_BATCH=""
    start-stop-daemon --start --background --pidfile $PIDFILE_BATCH --make-pidfile --user $DAEMON_USER --chuid $DAEMON_USER --startas $DAEMON_BATCH -- $DAEMON_OPTS_BATCH
    log_end_msg $?
}
do_start_proxy () {
if [ "$DISTRIBUTED_GATEWAY" = "False" ]
then
@ -118,6 +143,16 @@ do_stop () {
log_end_msg $?
}
do_stop_batch () {
    # Stop the batch task worker daemon via its pidfile.
    # Skipped (returns 1) when BATCH_ON is "False" in docklet.conf.
    if [ "$BATCH_ON" = "False" ]
    then
        return 1
    fi
    log_daemon_msg "Stopping $DAEMON_NAME_BATCH daemon"
    # --retry 10: wait up to 10s for the process to exit before giving up.
    start-stop-daemon --stop --quiet --oknodo --remove-pidfile --pidfile $PIDFILE_BATCH --retry 10
    log_end_msg $?
}
do_stop_proxy () {
if [ "$DISTRIBUTED_GATEWAY" = "False" ]
then
@ -145,12 +180,14 @@ do_stop_meter() {
case "$1" in
start)
do_start
do_start "normal-worker"
do_start_batch
do_start_proxy
;;
stop)
do_stop
do_stop_batch
do_stop_proxy
;;
start-meter)
@ -161,6 +198,16 @@ case "$1" in
do_stop_meter
;;
start_batch)
do_start "batch-worker"
do_start_batch
;;
stop_batch)
do_stop
do_stop_batch
;;
start_proxy)
do_start_proxy
;;
@ -176,13 +223,16 @@ case "$1" in
restart)
do_stop
do_stop_batch
do_stop_proxy
do_start
do_start "normal-worker"
do_start_batch
do_start_proxy
;;
status)
status_of_proc -p $PIDFILE "$DAEMON" "$DAEMON_NAME" && exit 0 || exit $?
status_of_proc -p $PIDFILE_BATCH "$DAEMON_BATCH" "$DAEMON_NAME_BATCH" || status=$?
status_of_proc -p $PIDFILE_PROXY "$DAEMON_PROXY" "$DAEMON_NAME_PROXY" || status=$?
;;
*)

56
conf/container.batch.conf Normal file
View File

@ -0,0 +1,56 @@
# This is the common container.conf for all containers.
# If you want to set custom settings, you have two choices:
# 1. Directly modify this file, which is not recommended, because the
# settings will be overridden when a new version of container.conf is released.
# 2. Use a custom config file in this conf directory: lxc.custom.conf,
# it uses the same grammar as container.conf, and will be merged
# with the default container.conf by docklet at runtime.
#
# The following is an example mounting user html directory
# lxc.mount.entry = /public/home/%USERNAME%/public_html %ROOTFS%/root/public_html none bind,rw,create=dir 0 0
#
#### include /usr/share/lxc/config/ubuntu.common.conf
lxc.include = /usr/share/lxc/config/ubuntu.common.conf
############## DOCKLET CONFIG ##############
# Setup 0 tty devices
lxc.tty = 0
lxc.rootfs = %ROOTFS%
lxc.utsname = %HOSTNAME%
lxc.network.type = veth
lxc.network.name = eth0
# veth.pair is limited in 16 bytes
lxc.network.veth.pair = %VETHPAIR%
lxc.network.script.up = Bridge=%BRNAME% %LXCSCRIPT%/lxc-ifup
lxc.network.script.down = Bridge=%BRNAME% %LXCSCRIPT%/lxc-ifdown
lxc.network.ipv4 = %IP%
lxc.network.ipv4.gateway = %GATEWAY%
lxc.network.flags = up
lxc.network.mtu = 1420
lxc.cgroup.pids.max = 2048
lxc.cgroup.memory.limit_in_bytes = %CONTAINER_MEMORY%M
#lxc.cgroup.memory.kmem.limit_in_bytes = 512M
#lxc.cgroup.memory.soft_limit_in_bytes = 4294967296
#lxc.cgroup.memory.memsw.limit_in_bytes = 8589934592
# lxc.cgroup.cpu.cfs_period_us : period time of cpu, default 100000, means 100ms
# lxc.cgroup.cpu.cfs_quota_us : quota time of this process
lxc.cgroup.cpu.cfs_quota_us = %CONTAINER_CPU%
lxc.cap.drop = sys_admin net_admin mac_admin mac_override sys_time sys_module
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/data %ROOTFS%/root/nfs none bind,rw,create=dir 0 0
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/hosts/batch-%TASKID%.hosts %ROOTFS%/etc/hosts none bind,ro,create=file 0 0
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/ssh %ROOTFS%/root/.ssh none bind,ro,create=dir 0 0
lxc.mount.entry = %FS_PREFIX%/local/temp/%LXCNAME%/ %ROOTFS%/tmp none bind,rw,create=dir 0 0
# setting hostname
lxc.hook.pre-start = HNAME=%HOSTNAME% %LXCSCRIPT%/lxc-prestart
# setting nfs softlink
#lxc.hook.mount = %LXCSCRIPT%/lxc-mount

View File

@ -182,3 +182,31 @@
# ALLOW_SCALE_OUT: allow docklet to rent server on the cloud to scale out
# Only when you deploy docklet on the cloud can you set it to True
# ALLOW_SCALE_OUT=False
# ==================================================
#
# Batch Config
#
# ==================================================
# BATCH_ON: whether to start batch job processing system when start
# the docklet. Default: True
# BATCH_ON=True
# BATCH_MASTER_PORT: the rpc server port on master.
# default: 50050
# BATCH_MASTER_PORT=50050
# BATCH_WORKER_PORT: the rpc server port on worker.
# default: 50051
# BATCH_WORKER_PORT=50051
# BATCH_NET: ip addresses range of containers for batch job, default is 10.16.0.0/16
# BATCH_NET=10.16.0.0/16
# BATCH_TASK_CIDR: 2^(BATCH_TASK_CIDR)-2 is the number of ip addresses for a task, default is 4
# BATCH_TASK_CIDR=4
# BATCH_MAX_THREAD_WORKER: the maximum number of threads of the rpc server on
# the batch job worker. default:5
# BATCH_MAX_THREAD_WORKER=5

View File

@ -9,7 +9,7 @@
ovs-vsctl --if-exists del-port $Bridge $5
cnt=$(ovs-vsctl list-ports ${Bridge} | wc -l)
if [ "$cnt" = "1" ]; then
greport=$(ovs-vsctl list-ports $(Bridge) | grep "^gre-[[:digit:]][[:digit:]]*-[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*$" | wc -l)
greport=$(ovs-vsctl list-ports ${Bridge} | grep "gre" | wc -l)
if [ "$greport" = "1" ]; then
ovs-vsctl del-br $Bridge
fi

View File

@ -16,7 +16,7 @@ fi
# some packages' name maybe different in debian
apt-get install -y cgmanager lxc lxcfs lxc-templates lvm2 bridge-utils curl exim4 openssh-server openvswitch-switch
apt-get install -y python3 python3-netifaces python3-flask python3-flask-sqlalchemy python3-pampy python3-httplib2 python3-pip
apt-get install -y python3-psutil python3-flask-migrate
apt-get install -y python3-psutil python3-flask-migrate python3-paramiko
apt-get install -y python3-lxc
apt-get install -y python3-requests python3-suds
apt-get install -y nodejs nodejs-legacy npm
@ -24,6 +24,9 @@ apt-get install -y etcd
apt-get install -y glusterfs-client attr
apt-get install -y nginx
pip3 install Flask-WTF
apt-get install -y gdebi-core
gdebi ossfs_1.80.5_ubuntu16.04_amd64.deb
pip3 install grpcio grpcio-tools googleapis-common-protos
#add ip forward
echo "net.ipv4.ip_forward=1" >>/etc/sysctl.conf
@ -51,15 +54,19 @@ echo ""
[[ -f conf/docklet.conf ]] || { echo "Generating docklet.conf from template" && cp conf/docklet.conf.template conf/docklet.conf; }
[[ -f web/templates/home.html ]] || { echo "Generating HomePage from home.template" && cp web/templates/home.template web/templates/home.html; }
mkdir -p /opt/docklet/global
mkdir -p /opt/docklet/local/
FS_PREFIX=/opt/docklet
. conf/docklet.conf
export FS_PREFIX
echo "directory /opt/docklet have been created"
mkdir -p $FS_PREFIX/global
mkdir -p $FS_PREFIX/local/
if [[ ! -d /opt/docklet/local/basefs && ! $1 = "withoutfs" ]]; then
mkdir -p /opt/docklet/local/basefs
echo "directory FS_PREFIX (${FS_PREFIX}) have been created"
if [[ ! -d $FS_PREFIX/local/basefs && ! $1 = "withoutfs" ]]; then
mkdir -p $FS_PREFIX/local/basefs
echo "Generating basefs"
wget -P /opt/docklet/local http://iwork.pku.edu.cn:1616/basefs-0.11.tar.bz2 && tar xvf /opt/docklet/local/basefs-0.11.tar.bz2 -C /opt/docklet/local/ > /dev/null
wget -P $FS_PREFIX/local http://iwork.pku.edu.cn:1616/basefs-0.11.tar.bz2 && tar xvf $FS_PREFIX/local/basefs-0.11.tar.bz2 -C $FS_PREFIX/local/ > /dev/null
[ $? != "0" ] && echo "Generate basefs failed, please download it from http://unias.github.io/docklet/download to FS_PREFIX/local and then extract it using root. (defalut FS_PRERIX is /opt/docklet)"
fi

View File

@ -27,7 +27,7 @@ import http.server, cgi, json, sys, shutil, traceback
import xmlrpc.client
from socketserver import ThreadingMixIn
from utils import etcdlib, imagemgr
from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr
from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr, jobmgr, taskmgr
from utils.logs import logs
from master import userManager, beansapplicationmgr, monitor, sysmgr, network
from worker.monitor import History_Manager
@ -790,6 +790,147 @@ def resetall_system(user, beans, form):
return json.dumps({'success':'false', 'message': message})
return json.dumps(result)
@app.route("/batch/job/add/", methods=['POST'])
@login_required
@beans_check
def add_job(user,beans,form):
    """Create a batch job from the submitted HTML form.

    Form keys are encoded as ``<field>``, ``<field>_<taskidx>`` or
    ``<field>_<taskidx>_<mappingidx>`` and are unpacked into a nested
    description: ``{'jobName': ..., 'tasks': {idx: {..., 'mapping': {...}}}}``.
    The description is handed to ``G_jobmgr.add_job`` and a JSON status
    object (``success``/``message``) is returned to the client.
    """
    global G_jobmgr
    job_data = form.to_dict()
    job_info = {
        'tasks': {}
    }
    message = {
        'success': 'true',
        'message': 'add batch job success'
    }
    for key in job_data:
        if key == 'csrf_token':
            continue
        key_arr = key.split('_')
        value = job_data[key]
        if key_arr[0] == 'srcAddr' and value == '':
            # An empty srcAddr defaults to /root.
            # Bugfix: task_idx was referenced before assignment here (the
            # assignment had been commented out -> NameError); derive it
            # from the key the same way the len==2 branch does.
            task_idx = key_arr[1]
            if task_idx in job_info['tasks']:
                job_info['tasks'][task_idx]['srcAddr'] = '/root'
            else:
                job_info['tasks'][task_idx] = {
                    'srcAddr': '/root'
                }
        elif key_arr[0] != 'dependency' and value == '':
            # Any other empty field is an error (dependency may be empty).
            # Bugfix: abort immediately; previously the loop continued and
            # the job was submitted anyway, overwriting this message.
            message['success'] = 'false'
            message['message'] = 'value of %s is null' % key
            return json.dumps(message)
        elif len(key_arr) == 1:
            # Job-level field, e.g. jobName, jobPriority.
            job_info[key_arr[0]] = value
        elif len(key_arr) == 2:
            # Task-level field: <field>_<taskidx>
            key_prefix, task_idx = key_arr[0], key_arr[1]
            job_info["tasks"].setdefault(task_idx, {})[key_prefix] = value
        elif len(key_arr) == 3:
            # Mapping-level field: <field>_<taskidx>_<mappingidx>
            key_prefix, task_idx, mapping_idx = key_arr[0], key_arr[1], key_arr[2]
            mapping_idx = 'mapping_' + mapping_idx
            task = job_info["tasks"].setdefault(task_idx, {})
            task.setdefault("mapping", {}).setdefault(mapping_idx, {})[key_prefix] = value
    logger.debug('batch job adding info %s' % json.dumps(job_info, indent=4))
    [status, msg] = G_jobmgr.add_job(user, job_info)
    if not status:
        logger.debug('fail to add batch job: %s' % msg)
        message["success"] = "false"
        message["message"] = msg
    return json.dumps(message)
@app.route("/batch/job/list/", methods=['POST'])
@login_required
def list_job(user,beans,form):
    """Return every batch job owned by *user* as a JSON object."""
    global G_jobmgr
    return json.dumps({
        'success': 'true',
        'data': G_jobmgr.list_jobs(user),
    })
@app.route("/batch/job/info/", methods=['POST'])
@login_required
def info_job(user,beans,form):
    """Return detailed information for one batch job (form field: jobid)."""
    global G_jobmgr
    jobid = form.get("jobid","")
    success, data = G_jobmgr.get_job(user, jobid)
    if not success:
        # On failure, data carries the error message.
        return json.dumps({'success':'false', 'message': data})
    return json.dumps({'success':'true', 'data': data})
@app.route("/batch/job/stop/", methods=['POST'])
@login_required
def stop_job(user,beans,form):
    """Request that a batch job be stopped (form field: jobid)."""
    global G_jobmgr
    jobid = form.get("jobid","")
    success, msg = G_jobmgr.stop_job(user, jobid)
    if not success:
        return json.dumps({'success':'false', 'message': msg})
    return json.dumps({'success':'true', 'action':'stop job'})
@app.route("/batch/job/output/", methods=['POST'])
@login_required
def get_output(user,beans,form):
    """Fetch the stdout/stderr of one vnode of a batch task.

    Form fields: jobid, taskid, vnodeid and issue (which stream to read).
    """
    global G_jobmgr
    jobid = form.get("jobid","")
    taskid = form.get("taskid","")
    vnodeid = form.get("vnodeid","")
    issue = form.get("issue","")
    output = G_jobmgr.get_output(user, jobid, taskid, vnodeid, issue)
    return json.dumps({'success': 'true', 'data': output})
@app.route("/batch/task/info/", methods=['POST'])
@login_required
def info_task(user,beans,form):
    # Placeholder endpoint: task-level info is not implemented yet and
    # currently returns an empty (None) body.
    pass
@app.route("/batch/vnodes/list/", methods=['POST'])
@login_required
def batch_vnodes_list(user,beans,form):
    """List the batch containers (vnodes) belonging to *user*."""
    global G_taskmgr
    return json.dumps({
        'success': 'true',
        'data': G_taskmgr.get_user_batch_containers(user),
    })
# @app.route("/inside/cluster/scaleout/", methods=['POST'])
# @inside_ip_required
# def inside_cluster_scalout(cur_user, cluster_info, form):
@ -857,6 +998,8 @@ if __name__ == '__main__':
global G_applicationmgr
global G_ulockmgr
global G_cloudmgr
global G_jobmgr
global G_taskmgr
# move 'tools.loadenv' to the beginning of this file
fs_path = env.getenv("FS_PREFIX")
@ -973,4 +1116,9 @@ if __name__ == '__main__':
# server = http.server.HTTPServer((masterip, masterport), DockletHttpHandler)
logger.info("starting master server")
G_taskmgr = taskmgr.TaskMgr(G_nodemgr, monitor.Fetcher, ipaddr)
G_jobmgr = jobmgr.JobMgr(G_taskmgr)
G_taskmgr.set_jobmgr(G_jobmgr)
G_taskmgr.start()
app.run(host = masterip, port = masterport, threaded=True)

493
src/master/jobmgr.py Normal file
View File

@ -0,0 +1,493 @@
import time, threading, random, string, os, traceback, requests
import master.monitor
import subprocess,json
from functools import wraps
from datetime import datetime
from utils.log import initlogging, logger
from utils.model import db, Batchjob, Batchtask
from utils import env
def db_commit():
    """Commit the pending SQLAlchemy session.

    On any failure the session is rolled back, the traceback is logged,
    and the exception is re-raised to the caller.
    """
    try:
        db.session.commit()
    except Exception:
        db.session.rollback()
        logger.error(traceback.format_exc())
        raise
class BatchJob(object):
    """In-memory state machine for one batch job.

    Wraps the persistent Batchjob/Batchtask rows and keeps a task
    dependency graph plus per-status counters. Every status mutation is
    serialized by a single per-job lock (see the data_lock decorator).
    Task statuses move through: pending -> scheduling -> running ->
    finished / retrying / failed / stopped.
    """

    def __init__(self, jobid, user, job_info, old_job_db=None):
        # jobid: unique job id string; user: owner's username.
        # job_info: {'jobName', 'jobPriority', 'tasks': {idx: config}} for a
        # new job. When recovering after a restart, old_job_db is the
        # existing Batchjob row and job_info is rebuilt from the database.
        if old_job_db is None:
            self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
        else:
            # Recovery path: reset the stored row and reconstruct the job
            # description from the persisted per-task JSON configs.
            self.job_db = old_job_db
            self.job_db.clear()
            job_info = {}
            job_info['jobName'] = self.job_db.name
            job_info['jobPriority'] = self.job_db.priority
            all_tasks = self.job_db.tasks.all()
            job_info['tasks'] = {}
            for t in all_tasks:
                job_info['tasks'][t.idx] = json.loads(t.config)
        self.user = user
        #self.raw_job_info = job_info
        self.job_id = jobid
        self.job_name = job_info['jobName']
        self.job_priority = int(job_info['jobPriority'])
        # Guards self.tasks / self.tasks_cnt / DB updates for this job.
        self.lock = threading.Lock()
        # tasks[idx] -> {'id','config','db','status','dependency'}
        self.tasks = {}
        # dependency_out[idx] -> list of task indices that wait on idx.
        self.dependency_out = {}
        self.tasks_cnt = {'pending':0, 'scheduling':0, 'running':0, 'retrying':0, 'failed':0, 'finished':0, 'stopped':0}
        #init self.tasks & self.dependency_out & self.tasks_cnt
        logger.debug("Init BatchJob user:%s job_name:%s create_time:%s" % (self.job_db.username, self.job_db.name, str(self.job_db.create_time)))
        raw_tasks = job_info["tasks"]
        self.tasks_cnt['pending'] = len(raw_tasks.keys())
        for task_idx in raw_tasks.keys():
            task_info = raw_tasks[task_idx]
            if old_job_db is None:
                # New job: create and attach a fresh Batchtask row.
                task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
                self.job_db.tasks.append(task_db)
            else:
                # Recovery: reuse the existing row, resetting its state.
                task_db = Batchtask.query.get(jobid+"_"+task_idx)
                task_db.clear()
            self.tasks[task_idx] = {}
            self.tasks[task_idx]['id'] = jobid+"_"+task_idx
            self.tasks[task_idx]['config'] = task_info
            self.tasks[task_idx]['db'] = task_db
            self.tasks[task_idx]['status'] = 'pending'
            self.tasks[task_idx]['dependency'] = []
            # 'dependency' is a comma-separated list of task indices.
            dependency = task_info['dependency'].strip().replace(' ', '').split(',')
            if len(dependency) == 1 and dependency[0] == '':
                continue
            for d in dependency:
                if not d in raw_tasks.keys():
                    raise ValueError('task %s is not defined in the dependency of task %s' % (d, task_idx))
                self.tasks[task_idx]['dependency'].append(d)
                if not d in self.dependency_out.keys():
                    self.dependency_out[d] = []
                self.dependency_out[d].append(task_idx)
        if old_job_db is None:
            db.session.add(self.job_db)
        db_commit()
        self.log_status()
        logger.debug("BatchJob(id:%s) dependency_out: %s" % (self.job_db.id, json.dumps(self.dependency_out, indent=3)))

    def data_lock(f):
        # Decorator: hold self.lock for the duration of f, releasing it
        # even when f raises (the exception is propagated).
        @wraps(f)
        def new_f(self, *args, **kwargs):
            self.lock.acquire()
            try:
                result = f(self, *args, **kwargs)
            except Exception as err:
                self.lock.release()
                raise err
            self.lock.release()
            return result
        return new_f

    # return the tasks without dependencies
    @data_lock
    def get_tasks_no_dependency(self,update_status=False):
        # Collect pending tasks whose dependency list is empty; when
        # update_status is True they are moved to 'scheduling' (counters
        # and DB rows updated). Returns [[task_name, config, priority], ...].
        logger.debug("Get tasks without dependencies of BatchJob(id:%s)" % self.job_db.id)
        ret_tasks = []
        for task_idx in self.tasks.keys():
            if (self.tasks[task_idx]['status'] == 'pending' and
                len(self.tasks[task_idx]['dependency']) == 0):
                if update_status:
                    self.tasks_cnt['pending'] -= 1
                    self.tasks_cnt['scheduling'] += 1
                    # Re-fetch the row in case the session was recycled.
                    self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
                    self.tasks[task_idx]['db'].status = 'scheduling'
                    self.tasks[task_idx]['status'] = 'scheduling'
                task_name = self.tasks[task_idx]['db'].id
                ret_tasks.append([task_name, self.tasks[task_idx]['config'], self.job_priority])
        self.log_status()
        db_commit()
        return ret_tasks

    @data_lock
    def stop_job(self):
        # Mark the job as 'stopping'; it becomes 'stopped' once no task is
        # running/scheduling/retrying (see _update_job_status).
        self.job_db = Batchjob.query.get(self.job_id)
        self.job_db.status = 'stopping'
        db_commit()

    # update status of this job based
    def _update_job_status(self):
        # Derive the job status from the task counters and commit it.
        # Callers must already hold self.lock (all are @data_lock methods).
        allcnt = len(self.tasks.keys())
        if self.tasks_cnt['failed'] != 0:
            self.job_db.status = 'failed'
            self.job_db.end_time = datetime.now()
        elif self.tasks_cnt['finished'] == allcnt:
            self.job_db.status = 'done'
            self.job_db.end_time = datetime.now()
        elif self.job_db.status == 'stopping':
            # A stopping job is fully stopped once nothing is in flight.
            if self.tasks_cnt['running'] == 0 and self.tasks_cnt['scheduling'] == 0 and self.tasks_cnt['retrying'] == 0:
                self.job_db.status = 'stopped'
                self.job_db.end_time = datetime.now()
        elif self.tasks_cnt['running'] != 0 or self.tasks_cnt['retrying'] != 0:
            self.job_db.status = 'running'
        else:
            self.job_db.status = 'pending'
        db_commit()

    # start run a task, update status
    @data_lock
    def update_task_running(self, task_idx):
        # Move a task to 'running' (no-op if it was already stopping).
        logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) running." % (task_idx, self.job_id))
        old_status = self.tasks[task_idx]['status']
        if old_status == 'stopping':
            logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
            return
        self.tasks_cnt[old_status] -= 1
        self.tasks[task_idx]['status'] = 'running'
        self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
        self.tasks[task_idx]['db'].status = 'running'
        self.tasks[task_idx]['db'].start_time = datetime.now()
        self.tasks_cnt['running'] += 1
        self.job_db = Batchjob.query.get(self.job_id)
        self._update_job_status()
        self.log_status()

    # a task has finished, update dependency and return tasks without dependencies
    @data_lock
    def finish_task(self, task_idx, running_time, billing):
        # Mark a task finished, bill it, release its dependents, and return
        # any dependents that are now runnable as [[name, config, prio], ...].
        if task_idx not in self.tasks.keys():
            logger.error('Task_idx %s not in job. user:%s job_name:%s job_id:%s'%(task_idx, self.user, self.job_name, self.job_id))
            return []
        logger.debug("Task(idx:%s) of BatchJob(id:%s) has finished(running_time=%d,billing=%d). Update dependency..." % (task_idx, self.job_id, running_time, billing))
        old_status = self.tasks[task_idx]['status']
        if old_status == 'stopping':
            logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
            # NOTE(review): this bare return yields None while every other
            # path returns a list; callers iterate the result — confirm.
            return
        self.tasks_cnt[old_status] -= 1
        self.tasks[task_idx]['status'] = 'finished'
        self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
        self.tasks[task_idx]['db'].status = 'finished'
        self.tasks[task_idx]['db'].tried_times += 1
        self.tasks[task_idx]['db'].running_time = running_time
        self.tasks[task_idx]['db'].end_time = datetime.now()
        self.tasks[task_idx]['db'].billing = billing
        self.tasks[task_idx]['db'].failed_reason = ""
        self.job_db = Batchjob.query.get(self.job_id)
        self.job_db.billing += billing
        self.tasks_cnt['finished'] += 1
        if task_idx not in self.dependency_out.keys():
            # Nothing depends on this task; just refresh the job status.
            self._update_job_status()
            self.log_status()
            return []
        ret_tasks = []
        for out_idx in self.dependency_out[task_idx]:
            try:
                self.tasks[out_idx]['dependency'].remove(task_idx)
            except Exception as err:
                logger.warning(traceback.format_exc())
                continue
            if (self.tasks[out_idx]['status'] == 'pending' and
                len(self.tasks[out_idx]['dependency']) == 0):
                # Dependent task has no remaining dependencies: schedule it.
                self.tasks_cnt['pending'] -= 1
                self.tasks_cnt['scheduling'] += 1
                self.tasks[out_idx]['status'] = 'scheduling'
                self.tasks[out_idx]['db'] = Batchtask.query.get(self.tasks[out_idx]['id'])
                self.tasks[out_idx]['db'].status = 'scheduling'
                task_name = self.job_id + '_' + out_idx
                ret_tasks.append([task_name, self.tasks[out_idx]['config'], self.job_priority])
        self._update_job_status()
        self.log_status()
        return ret_tasks

    # update retrying status of task
    @data_lock
    def update_task_retrying(self, task_idx, reason, tried_times):
        # Move a task to 'retrying', recording the failure reason.
        logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) retrying. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
        old_status = self.tasks[task_idx]['status']
        if old_status == 'stopping':
            logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
            return
        self.tasks_cnt[old_status] -= 1
        self.tasks_cnt['retrying'] += 1
        self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
        self.tasks[task_idx]['db'].status = 'retrying'
        self.tasks[task_idx]['db'].failed_reason = reason
        self.tasks[task_idx]['db'].tried_times += 1
        self.tasks[task_idx]['status'] = 'retrying'
        self.job_db = Batchjob.query.get(self.job_id)
        self._update_job_status()
        self.log_status()

    # update failed status of task
    @data_lock
    def update_task_failed(self, task_idx, reason, tried_times, running_time, billing):
        # Move a task to 'failed' and add its billing to the job total.
        logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) failed. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
        old_status = self.tasks[task_idx]['status']
        self.tasks_cnt[old_status] -= 1
        self.tasks_cnt['failed'] += 1
        self.tasks[task_idx]['status'] = 'failed'
        self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
        self.tasks[task_idx]['db'].status = 'failed'
        self.tasks[task_idx]['db'].failed_reason = reason
        self.tasks[task_idx]['db'].tried_times += 1
        self.tasks[task_idx]['db'].end_time = datetime.now()
        self.tasks[task_idx]['db'].running_time = running_time
        self.tasks[task_idx]['db'].billing = billing
        self.job_db = Batchjob.query.get(self.job_id)
        self.job_db.billing += billing
        self._update_job_status()
        self.log_status()

    @data_lock
    def update_task_stopped(self, task_idx, running_time, billing):
        # Move a task to 'stopped'. Returns True on success, False when the
        # task had already reached a terminal state (caller uses the return
        # value to decide whether to charge the user).
        logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) stopped.running_time:%d billing:%d" % (task_idx, self.job_id, int(running_time), billing))
        old_status = self.tasks[task_idx]['status']
        if old_status == 'failed' or old_status == 'finished' or old_status == 'stopped':
            logger.info("task(idx:%s) of BatchJob(id:%s) has been done."%(task_idx, self.job_id))
            return False
        self.tasks_cnt[old_status] -= 1
        self.tasks_cnt['stopped'] += 1
        self.tasks[task_idx]['status'] = 'stopped'
        self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
        self.tasks[task_idx]['db'].status = 'stopped'
        self.tasks[task_idx]['db'].end_time = datetime.now()
        self.tasks[task_idx]['db'].running_time = running_time
        self.tasks[task_idx]['db'].billing = billing
        self.job_db = Batchjob.query.get(self.job_id)
        self.job_db.billing += billing
        self._update_job_status()
        self.log_status()
        return True

    # print status for debuging
    def log_status(self):
        # Dump per-task status/dependency, the counters, and the job status
        # to the debug log.
        task_copy = {}
        for task_idx in self.tasks.keys():
            task_copy[task_idx] = {}
            task_copy[task_idx]['status'] = self.tasks[task_idx]['status']
            task_copy[task_idx]['dependency'] = self.tasks[task_idx]['dependency']
        logger.debug("BatchJob(id:%s) tasks status: %s" % (self.job_id, json.dumps(task_copy, indent=3)))
        logger.debug("BatchJob(id:%s) tasks_cnt: %s" % (self.job_id, self.tasks_cnt))
        logger.debug("BatchJob(id:%s) job_status: %s" %(self.job_id, self.job_db.status))
class JobMgr():
# load job information from etcd
# initial a job queue and job schedueler
def __init__(self, taskmgr):
logger.info("Init jobmgr...")
try:
Batchjob.query.all()
except:
db.create_all(bind='__all__')
self.job_map = {}
self.taskmgr = taskmgr
self.fspath = env.getenv('FS_PREFIX')
self.lock = threading.Lock()
self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT'))
self.auth_key = env.getenv('AUTH_KEY')
self.recover_jobs()
def recover_jobs(self):
logger.info("Rerun the unfailed and unfinished jobs...")
try:
rejobs = Batchjob.query.filter(~Batchjob.status.in_(['done','failed']))
rejobs = rejobs.order_by(Batchjob.create_time).all()
for rejob in rejobs:
logger.info("Rerun job: "+rejob.id)
logger.debug(str(rejob))
job = BatchJob(rejob.id, rejob.username, None, rejob)
self.job_map[job.job_id] = job
self.process_job(job)
except Exception as err:
logger.error(traceback.format_exc())
def charge_beans(self,username,billing):
logger.debug("Charge user(%s) for %d beans"%(username, billing))
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}
url = "/billing/beans/"
return requests.post(self.userpoint+url,data=data).json()
def add_lock(f):
@wraps(f)
def new_f(self, *args, **kwargs):
self.lock.acquire()
try:
result = f(self, *args, **kwargs)
except Exception as err:
self.lock.release()
raise err
self.lock.release()
return result
return new_f
@add_lock
def create_job(self, user, job_info):
jobid = self.gen_jobid()
job = BatchJob(jobid, user, job_info)
return job
# user: username
# job_info: a json string
# user submit a new job, add this job to queue and database
def add_job(self, user, job_info):
try:
job = self.create_job(user, job_info)
self.job_map[job.job_id] = job
self.process_job(job)
except ValueError as err:
logger.error(err)
return [False, err.args[0]]
except Exception as err:
logger.error(traceback.format_exc())
#logger.error(err)
return [False, err.args[0]]
return [True, "add batch job success"]
# user: username
# jobid: the id of job
def stop_job(self, user, job_id):
logger.info("[jobmgr] stop job(id:%s) user(%s)"%(job_id, user))
if job_id not in self.job_map.keys():
return [False,"Job id %s does not exists! Maybe it has been finished."%job_id]
try:
job = self.job_map[job_id]
if job.job_db.status == 'done' or job.job_db.status == 'failed':
return [True,""]
if job.user != user:
raise Exception("Wrong User.")
for task_idx in job.tasks.keys():
taskid = job_id + '_' + task_idx
self.taskmgr.lazy_stop_task(taskid)
job.stop_job()
except Exception as err:
logger.error(traceback.format_exc())
#logger.error(err)
return [False, err.args[0]]
return [True,""]
# user: username
# list a user's all job
def list_jobs(self,user):
alljobs = Batchjob.query.filter_by(username=user).all()
res = []
for job in alljobs:
jobdata = json.loads(str(job))
tasks = job.tasks.all()
jobdata['tasks'] = [t.idx for t in tasks]
tasks_vnodeCount = {}
for t in tasks:
tasks_vnodeCount[t.idx] = int(json.loads(t.config)['vnodeCount'])
jobdata['tasks_vnodeCount'] = tasks_vnodeCount
res.append(jobdata)
return res
# user: username
# jobid: the id of job
# get the information of a job, including the status, json description and other information
def get_job(self, user, job_id):
job = Batchjob.query.get(job_id)
if job is None:
return [False, "Jobid(%s) does not exist."%job_id]
if job.username != user:
return [False, "Wrong User!"]
jobdata = json.loads(str(job))
tasks = job.tasks.order_by(Batchtask.idx).all()
tasksdata = [json.loads(str(t)) for t in tasks]
jobdata['tasks'] = tasksdata
return [True, jobdata]
# check if a job exists
def is_job_exist(self, job_id):
return Batchjob.query.get(job_id) != None
# generate a random job id
def gen_jobid(self):
datestr = datetime.now().strftime("%y%m%d")
job_id = datestr+''.join(random.sample(string.ascii_letters + string.digits, 3))
while self.is_job_exist(job_id):
job_id = datestr+''.join(random.sample(string.ascii_letters + string.digits, 3))
return job_id
# add tasks into taskmgr's queue
def add_task_taskmgr(self, user, tasks):
for task_name, task_info, task_priority in tasks:
if not task_info:
logger.error("task_info does not exist! task_name(%s)" % task_name)
return False
else:
logger.debug("Add task(name:%s) with priority(%s) to taskmgr's queue." % (task_name, task_priority) )
self.taskmgr.add_task(user, task_name, task_info, task_priority)
return True
# to process a job, add tasks without dependencies of the job into taskmgr
def process_job(self, job):
tasks = job.get_tasks_no_dependency(True)
return self.add_task_taskmgr(job.user, tasks)
# report task status from taskmgr when running, failed and finished
# task_name: job_id + '_' + task_idx
# status: 'running', 'finished', 'retrying', 'failed', 'stopped'
# reason: reason for failure or retrying, such as "FAILED", "TIMEOUT", "OUTPUTERROR"
# tried_times: how many times the task has been tried.
def report(self, user, task_name, status, reason="", tried_times=1, running_time=0, billing=0):
    """Callback from taskmgr: update job/task bookkeeping for a status change."""
    split_task_name = task_name.split('_')
    if len(split_task_name) != 2:
        logger.error("[jobmgr report]Illegal task_name(%s) report from taskmgr" % task_name)
        return
    # charge the user as soon as a billed terminal state is reported
    if billing > 0 and (status == 'failed' or status == 'finished'):
        self.charge_beans(user, billing)
    job_id, task_idx = split_task_name
    if job_id not in self.job_map.keys():
        # job no longer tracked in memory (e.g. already finalized):
        # persist the report straight to the database and stop here
        logger.error("[jobmgr report]jobid(%s) does not exist. task_name(%s)" % (job_id,task_name))
        #update data in db
        taskdb = Batchtask.query.get(task_name)
        # ignore reports for unknown tasks or tasks already in a terminal state
        if (taskdb is None or taskdb.status == 'finished' or
            taskdb.status == 'failed' or taskdb.status == 'stopped'):
            return
        taskdb.status = status
        if status == 'failed':
            taskdb.failed_reason = reason
        if status == 'failed' or status == 'stopped' or status == 'finished':
            taskdb.end_time = datetime.now()
            if billing > 0:
                taskdb.running_time = running_time
                taskdb.billing = billing
        db_commit()
        return
    job = self.job_map[job_id]
    if status == "running":
        #logger.debug(str(job.job_db))
        job.update_task_running(task_idx)
        #logger.debug(str(job.job_db))
    elif status == "finished":
        #logger.debug(str(job.job_db))
        # a finished task may unblock dependent tasks; queue them on the taskmgr
        next_tasks = job.finish_task(task_idx, running_time, billing)
        ret = self.add_task_taskmgr(user, next_tasks)
        #logger.debug(str(job.job_db))
    elif status == "retrying":
        job.update_task_retrying(task_idx, reason, tried_times)
    elif status == "failed":
        job.update_task_failed(task_idx, reason, tried_times, running_time, billing)
    elif status == "stopped":
        # NOTE(review): update_task_stopped's return value presumably means
        # "stop was actually applied" — only then charge; verify in Batchjob model
        if job.update_task_stopped(task_idx, running_time, billing) and billing > 0:
            self.charge_beans(user, billing)
    # drop jobs that reached a terminal state from the in-memory map
    if job.job_db.status == 'done' or job.job_db.status == 'failed' or job.job_db.status == 'stopped':
        del self.job_map[job_id]
# Get Batch job stdout or stderr from its file
def get_output(self, username, jobid, taskid, vnodeid, issue):
    """Return the last 100 lines of a batch task's stdout/stderr file.

    issue: "stdout" or "stderr" (selects which capture file to read).
    Returns "" when the file cannot be read.
    """
    filename = jobid + "_" + taskid + "_" + vnodeid + "_" + issue + ".txt"
    fpath = "%s/global/users/%s/data/batch_%s/%s" % (self.fspath,username,jobid,filename)
    logger.info("Get output from:%s" % fpath)
    try:
        # security fix: use an argument list with shell=False instead of a
        # shell string — the ids come from request parameters and a shell
        # string was injectable via metacharacters in jobid/taskid/etc.
        ret = subprocess.run(['tail', '-n', '100', fpath],
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret.returncode != 0:
            raise IOError(ret.stdout.decode(encoding="utf-8"))
    except Exception as err:
        logger.error(traceback.format_exc())
        return ""
    else:
        return ret.stdout.decode(encoding="utf-8")

View File

@ -47,6 +47,8 @@ class NodeMgr(object):
# get allnodes
self.allnodes = self._nodelist_etcd("allnodes")
self.runnodes = []
self.batchnodes = []
self.allrunnodes = []
[status, runlist] = self.etcd.listdir("machines/runnodes")
for node in runlist:
nodeip = node['key'].rsplit('/',1)[1]
@ -140,6 +142,14 @@ class NodeMgr(object):
#print(etcd_runip)
#print(self.rpcs)
self.runnodes = etcd_runip
self.batchnodes = self.runnodes.copy()
self.allrunnodes = self.runnodes.copy()
[status, batchlist] = self.etcd.listdir("machines/batchnodes")
if status:
for node in batchlist:
nodeip = node['key'].rsplit('/', 1)[1]
self.batchnodes.append(nodeip)
self.allrunnodes.append(nodeip)
def recover_node(self,ip,tasks):
logger.info("now recover for worker:%s" % ip)
@ -152,14 +162,19 @@ class NodeMgr(object):
# get all run nodes' IP addr
def get_nodeips(self):
return self.runnodes
return self.allrunnodes
def get_batch_nodeips(self):
    """Return IPs of all nodes usable for batch jobs."""
    batch_ips = self.batchnodes
    return batch_ips
def get_base_nodeips(self):
    """Return IPs of the regular (non-batch-only) run nodes."""
    base_ips = self.runnodes
    return base_ips
def get_allnodes(self):
    """Return the list of every node known to the manager."""
    known = self.allnodes
    return known
def ip_to_rpc(self,ip):
if ip in self.runnodes:
if ip in self.allrunnodes:
return xmlrpc.client.ServerProxy("http://%s:%s" % (ip, env.getenv("WORKER_PORT")))
else:
logger.info('Worker %s is not connected, create rpc client failed, push task into queue')

55
src/master/parser.py Normal file
View File

@ -0,0 +1,55 @@
#!/user/bin/python3
import json
job_data = {'image_1': 'base_base_base', 'mappingRemoteDir_2_2': 'sss', 'dependency_1': 'aaa', 'mappingLocalDir_2_1': 'xxx', 'mappingLocalDir_1_2': 'aaa', 'mappingLocalDir_1_1': 'aaa', 'mappingLocalDir_2_3': 'fdsffdf', 'mappingRemoteDir_1_1': 'ddd', 'mappingRemoteDir_2_3': 'sss', 'srcAddr_1': 'aaa', 'mappingSource_2_1': 'Aliyun', 'cpuSetting_1': '1', 'mappingSource_2_2': 'Aliyun', 'retryCount_2': '1', 'mappingSource_1_1': 'Aliyun', 'expTime_1': '60', 'diskSetting_2': '1024', 'diskSetting_1': '1024', 'dependency_2': 'ddd', 'memorySetting_1': '1024', 'command_2': 'ccc', 'mappingRemoteDir_1_2': 'ddd', 'gpuSetting_2': '0', 'memorySetting_2': '1024', 'gpuSetting_1': '0', 'mappingLocalDir_2_2': 'bbb', 'mappingSource_1_2': 'Aliyun', 'expTime_2': '60', 'mappingRemoteDir_2_1': 'vvv', 'srcAddr_2': 'fff', 'cpuSetting_2': '1', 'instCount_1': '1', 'mappingSource_2_3': 'Aliyun', 'token': 'ZXlKaGJHY2lPaUpJVXpJMU5pSXNJbWxoZENJNk1UVXpNelE0TVRNMU5Td2laWGh3SWpveE5UTXpORGcwT1RVMWZRLmV5SnBaQ0k2TVgwLkF5UnRnaGJHZXhJY2lBSURZTUd5eXZIUVJnUGd1ZTA3OEtGWkVoejJVMkE=', 'instCount_2': '1', 'retryCount_1': '1', 'command_1': 'aaa', 'jobPriority': '0', 'image_2': 'base_base_base', 'jobName': 'aaa'}
def parse(job_data):
    """Fold flat form keys into a nested job description and print it as JSON.

    Key shapes:
      'name'            -> job_info['name']
      'prefix_T'        -> job_info['task_T']['prefix']
      'prefix_T_M'      -> job_info['task_T']['mapping']['mapping_M']['prefix']
    Keys with any other number of '_'-separated parts are ignored.
    """
    job_info = {}
    for key, value in job_data.items():
        parts = key.split('_')
        if len(parts) == 1:
            job_info[parts[0]] = value
        elif len(parts) == 2:
            prefix, task_no = parts
            job_info.setdefault('task_' + task_no, {})[prefix] = value
        elif len(parts) == 3:
            prefix, task_no, mapping_no = parts
            task = job_info.setdefault('task_' + task_no, {})
            mapping = task.setdefault('mapping', {})
            mapping.setdefault('mapping_' + mapping_no, {})[prefix] = value
    print(json.dumps(job_info, indent=4))
if __name__ == '__main__':
parse(job_data)

767
src/master/taskmgr.py Normal file
View File

@ -0,0 +1,767 @@
import threading
import time
import string
import os
import random, copy, subprocess
import json, math
from functools import wraps
# must import logger after initlogging, ugly
from utils.log import logger
# grpc
from concurrent import futures
import grpc
from protos.rpc_pb2 import *
from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub
from utils.nettools import netcontrol
from utils import env
def ip_to_int(addr):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value."""
    value = 0
    for octet in addr.split('.'):
        value = (value << 8) + int(octet)
    return value
def int_to_ip(num):
    """Convert a 32-bit integer to its dotted-quad IPv4 string."""
    return ".".join(str((num >> shift) & 255) for shift in (24, 16, 8, 0))
class Task():
    """One batch task: a group of vnodes (subtasks) sharing a private subnet.

    task_base_ip is a relative subnet offset handed out by TaskMgr; ips are
    only populated after gen_ips_from_base() is called.
    """
    def __init__(self, task_id, username, at_same_time, priority, max_size, task_infos):
        self.id = task_id
        self.username = username
        self.status = WAITING
        self.failed_reason = ""
        # if all the vnodes must be started at the same time
        self.at_same_time = at_same_time
        # priority the bigger the better
        # self.priority the smaller the better: submission time (hours) minus
        # the user priority, so higher priority and earlier submission sort first
        self.priority = int(time.time()) / 60 / 60 - priority
        # subnet offset assigned later by TaskMgr.acquire_task_ips(); None = unassigned
        self.task_base_ip = None
        self.ips = None
        self.max_size = max_size
        self.subtask_list = [SubTask(
            idx = index,
            root_task = self,
            vnode_info = task_info['vnode_info'],
            command_info = task_info['command_info'],
            max_retry_count = task_info['max_retry_count']
        ) for (index, task_info) in enumerate(task_infos)]

    def get_billing(self):
        """Return (total_running_time_seconds, total_beans) over all subtasks."""
        billing_beans = 0
        running_time = 0
        cpu_price = 1 / 3600.0 # /core*s
        mem_price = 1 / 3600.0 # /GB*s
        disk_price = 1 / 3600.0 # /GB*s
        gpu_price = 100 / 3600.0 # /core*s
        for subtask in self.subtask_list:
            tmp_time = subtask.running_time
            cpu_beans = subtask.vnode_info.vnode.instance.cpu * tmp_time * cpu_price
            mem_beans = subtask.vnode_info.vnode.instance.memory / 1024.0 * tmp_time * mem_price
            disk_beans = subtask.vnode_info.vnode.instance.disk / 1024.0 * tmp_time * disk_price
            gpu_beans = subtask.vnode_info.vnode.instance.gpu * tmp_time * gpu_price
            logger.info("subtask:%s running_time=%f beans for: cpu=%f mem_beans=%f disk_beans=%f gpu_beans=%f"
                        %(self.id, tmp_time, cpu_beans, mem_beans, disk_beans, gpu_beans ))
            # each subtask's cost is rounded up to whole beans
            beans = math.ceil(cpu_beans + mem_beans + disk_beans + gpu_beans)
            running_time += tmp_time
            billing_beans += beans
        return running_time, billing_beans

    def __lt__(self, other):
        # used when sorting the task queue: smaller priority value schedules first
        return self.priority < other.priority

    def gen_ips_from_base(self,base_ip):
        """Materialize vnode ip strings; the +2 leaves room for the subnet's
        first address and the gateway (base + offset + 1)."""
        if self.task_base_ip == None:
            return
        self.ips = []
        for i in range(self.max_size):
            self.ips.append(int_to_ip(base_ip + self.task_base_ip + i + 2))

    def gen_hosts(self):
        """Write an /etc/hosts-style file mapping batch-<idx> names to vnode ips."""
        username = self.username
        taskid = self.id
        logger.info("Generate hosts for user(%s) task(%s) base_ip(%s)"%(username,taskid,str(self.task_base_ip)))
        fspath = env.getenv('FS_PREFIX')
        # first batch task for this user: create their directory tree
        if not os.path.isdir("%s/global/users/%s" % (fspath,username)):
            path = env.getenv('DOCKLET_LIB')
            subprocess.call([path+"/master/userinit.sh", username])
            logger.info("user %s directory not found, create it" % username)
        hosts_file = open("%s/global/users/%s/hosts/%s.hosts" % (fspath,username,"batch-"+taskid),"w")
        hosts_file.write("127.0.0.1 localhost\n")
        i = 0
        for ip in self.ips:
            hosts_file.write(ip+" batch-"+str(i)+"\n")
            i += 1
        hosts_file.close()
class SubTask():
    """One vnode of a Task: its container spec, command and retry state."""

    def __init__(self, idx, root_task, vnode_info, command_info, max_retry_count):
        self.root_task = root_task
        self.vnode_info = vnode_info
        self.vnode_info.vnodeid = idx
        self.command_info = command_info
        if self.command_info != None:
            self.command_info.vnodeid = idx
        self.max_retry_count = max_retry_count
        # lifecycle flags
        self.vnode_started = False
        self.task_started = False
        # time accounting (epoch seconds)
        self.start_at = 0
        self.end_at = 0
        self.running_time = 0
        # scheduling state
        self.status = WAITING
        self.status_reason = ''
        self.try_count = 0
        self.worker = None
        self.lock = threading.Lock()

    def waiting_for_retry(self, reason=""):
        """Requeue for another attempt, or fail this subtask and its root task
        permanently once max_retry_count is exhausted."""
        self.try_count += 1
        if self.try_count <= self.max_retry_count:
            self.status = WAITING
        else:
            self.status = FAILED
            self.root_task.status = FAILED
            self.failed_reason = reason
            self.root_task.failed_reason = reason
class TaskReporter(MasterServicer):
    """gRPC servicer that forwards worker task reports to the TaskMgr."""

    def __init__(self, taskmgr):
        self.taskmgr = taskmgr

    def report(self, request, context):
        """Dispatch every task message in the request to the task manager."""
        for msg in request.taskmsgs:
            self.taskmgr.on_task_report(msg)
        return Reply(status=Reply.ACCEPTED, message='')
class TaskMgr(threading.Thread):
# load task information from etcd
# initial a task queue and task schedueler
# taskmgr: a taskmgr instance
def __init__(self, nodemgr, monitor_fetcher, master_ip, scheduler_interval=2, external_logger=None):
    """Scheduler thread for batch tasks.

    nodemgr: node manager used to discover batch-capable worker nodes.
    monitor_fetcher: callable that, given a worker ip, exposes .info resource data.
    master_ip: this master's ip, used when wiring per-task networks.
    scheduler_interval: seconds to sleep when nothing is schedulable.
    """
    threading.Thread.__init__(self)
    self.thread_stop = False
    self.jobmgr = None
    self.master_ip = master_ip
    self.task_queue = []
    # additions/deletions/stops are staged in these lists and folded into
    # task_queue by sort_out_task_queue() so the queue is not mutated mid-schedule
    self.lazy_append_list = []
    self.lazy_delete_list = []
    self.lazy_stop_list = []
    self.task_queue_lock = threading.Lock()
    self.stop_lock = threading.Lock()
    self.add_lock = threading.Lock()
    #self.user_containers = {}
    self.scheduler_interval = scheduler_interval
    self.logger = logger
    self.master_port = env.getenv('BATCH_MASTER_PORT')
    self.worker_port = env.getenv('BATCH_WORKER_PORT')
    # nodes
    self.nodemgr = nodemgr
    self.monitor_fetcher = monitor_fetcher
    # per-worker cpu/gpu cores currently reserved by started vnodes
    self.cpu_usage = {}
    self.gpu_usage = {}
    # self.all_nodes = None
    # self.last_nodes_info_update_time = 0
    # self.nodes_info_update_interval = 30 # (s)
    self.network_lock = threading.Lock()
    # carve BATCH_NET into equal per-task subnets of 2**task_cidr addresses;
    # task_cidr is clamped into [2, 31 - batch_cidr]
    batch_net = env.getenv('BATCH_NET')
    self.batch_cidr = int(batch_net.split('/')[1])
    batch_net = batch_net.split('/')[0]
    task_cidr = int(env.getenv('BATCH_TASK_CIDR'))
    task_cidr = min(task_cidr,31-self.batch_cidr)
    self.task_cidr = max(task_cidr,2)
    self.base_ip = ip_to_int(batch_net)
    # pool of free subnet offsets, relative to base_ip
    self.free_nets = []
    for i in range(0, (1 << (32-self.batch_cidr)) - 1, (1 << self.task_cidr)):
        self.free_nets.append(i)
    self.logger.info("Free nets addresses pool %s" % str(self.free_nets))
    self.logger.info("Each Batch Net CIDR:%s"%(str(self.task_cidr)))
def data_lock(lockname):
    """Decorator factory: run the wrapped method while holding self.<lockname>.

    The named lock is looked up on the instance at call time, so one factory
    serves every lock attribute of the class.
    """
    def lock(f):
        @wraps(f)
        def new_f(self, *args, **kwargs):
            lockobj = getattr(self, lockname)
            lockobj.acquire()
            try:
                return f(self, *args, **kwargs)
            finally:
                # released on both the normal and the exception path
                lockobj.release()
        return new_f
    return lock
def subtask_lock(f):
    """Decorator: run the wrapped method while holding subtask.lock."""
    @wraps(f)
    def new_f(self, subtask, *args, **kwargs):
        subtask.lock.acquire()
        try:
            return f(self, subtask, *args, **kwargs)
        finally:
            # released on both the normal and the exception path
            subtask.lock.release()
    return new_f
def run(self):
    """Thread main loop: start the rpc server, then schedule tasks until stopped."""
    self.serve()
    while not self.thread_stop:
        self.sort_out_task_queue()
        task, sub_task_list = self.task_scheduler()
        if task is not None and sub_task_list is not None:
            self.task_processor(task, sub_task_list)
        else:
            # nothing schedulable right now; back off before retrying
            time.sleep(self.scheduler_interval)
def serve(self):
    """Start the grpc server that receives task reports from workers."""
    self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    add_MasterServicer_to_server(TaskReporter(self), self.server)
    self.server.add_insecure_port('[::]:' + self.master_port)
    self.server.start()
    self.logger.info('[taskmgr_rpc] start rpc server')
def stop(self):
    """Shut down the rpc server and ask the scheduler loop to exit."""
    self.server.stop(0)
    self.thread_stop = True
    self.logger.info('[taskmgr_rpc] stop rpc server')
@data_lock('task_queue_lock')
@data_lock('add_lock')
@data_lock('stop_lock')
def sort_out_task_queue(self):
    """Fold the lazy stop/delete/append lists into task_queue, re-sorting by priority."""
    # stop queued tasks that were asked to stop, bill them, mark for deletion
    for task in self.task_queue:
        if task.id in self.lazy_stop_list:
            self.stop_remove_task(task)
            self.lazy_delete_list.append(task)
            running_time, billing = task.get_billing()
            self.logger.info('task %s stopped, running_time:%s billing:%d'%(task.id, str(running_time), billing))
            running_time = math.ceil(running_time)
            self.jobmgr.report(task.username, task.id,'stopped',running_time=running_time,billing=billing)
    # physically remove tasks marked for deletion
    while self.lazy_delete_list:
        task = self.lazy_delete_list.pop(0)
        try:
            self.task_queue.remove(task)
        except Exception as err:
            # task may have been removed already; log and continue
            self.logger.warning(str(err))
    # discard tasks that were stopped before ever entering the queue
    new_append_list = []
    for task in self.lazy_append_list:
        if task.id in self.lazy_stop_list:
            self.jobmgr.report(task.username, task.id, 'stopped')
        else:
            new_append_list.append(task)
    self.lazy_append_list = new_append_list
    self.lazy_stop_list.clear()
    if self.lazy_append_list:
        self.task_queue.extend(self.lazy_append_list)
        self.lazy_append_list.clear()
        # keep FIFO-by-priority order (smaller priority value first)
        self.task_queue = sorted(self.task_queue, key=lambda x: x.priority)
def start_vnode(self, subtask):
    """Ask subtask.worker over rpc to start the subtask's vnode (container).

    Returns [True, ''] on success, [False, exception] on rpc failure.
    On success, records the start time and reserves cpu/gpu on the worker.
    """
    try:
        self.logger.info('[task_processor] Starting vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.start_vnode(subtask.vnode_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.vnode_started = True
    subtask.start_at = time.time()
    # reserve resources on the chosen worker
    self.cpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.cpu
    self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
    return [True, '']
@subtask_lock
def stop_vnode(self, subtask):
    """Ask subtask.worker over rpc to stop the subtask's vnode.

    No-op when the vnode is not started. On success, accumulates running
    time and releases the cpu/gpu reserved on the worker.
    Returns [True, ''] or [False, exception].
    """
    if not subtask.vnode_started:
        return [True, ""]
    try:
        self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.stop_vnode(subtask.vnode_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.vnode_started = False
    subtask.end_at = time.time()
    # add this attempt's wall time to the subtask's billed running time
    subtask.running_time += subtask.end_at - subtask.start_at
    self.cpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.cpu
    self.gpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.gpu
    return [True, '']
def start_subtask(self, subtask):
    """Ask subtask.worker over rpc to start the subtask's command.

    Returns [True, ''] on success, [False, exception] on rpc failure.
    """
    try:
        self.logger.info('[task_processor] Starting task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.start_task(subtask.command_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.task_started = True
    return [True, '']
def stop_subtask(self, subtask):
    """Ask subtask.worker over rpc to stop the subtask's running command.

    Marks the subtask FAILED on rpc error.
    Returns [True, ''] or [False, exception].
    """
    try:
        self.logger.info('[task_processor] Stopping task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.stop_task(subtask.command_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status = FAILED
        subtask.status_reason = str(e)
        return [False, e]
    subtask.task_started = False
    return [True, '']
@data_lock('network_lock')
def acquire_task_ips(self, task):
    """Assign the task a free subnet offset (idempotent) and return it."""
    self.logger.info("[acquire_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
    if task.task_base_ip == None:
        task.task_base_ip = self.free_nets.pop(0)
    return task.task_base_ip
@data_lock('network_lock')
def release_task_ips(self, task):
    """Return the task's subnet offset to the free pool (no-op when unassigned)."""
    self.logger.info("[release_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
    if task.task_base_ip == None:
        return
    self.free_nets.append(task.task_base_ip)
    task.task_base_ip = None
    #self.logger.error('[release task_net] %s' % str(e))
def setup_tasknet(self, task, workers=None):
    """Create the per-task bridge and gateway, plus GRE tunnels to each worker.

    Returns [True, gateway_ip] or [False, errmsg].
    """
    taskid = task.id
    username = task.username
    brname = "docklet-batch-%s-%s"%(username, taskid)
    gwname = taskid
    if task.task_base_ip == None:
        return [False, "task.task_base_ip is None!"]
    # gateway takes the first usable address of the task's subnet
    gatewayip = int_to_ip(self.base_ip + task.task_base_ip + 1)
    gatewayipcidr = gatewayip + "/" + str(32-self.task_cidr)
    netcontrol.new_bridge(brname)
    netcontrol.setup_gw(brname,gwname,gatewayipcidr,0,0)
    # tunnel the bridge to every worker hosting a vnode (the master needs none)
    for wip in workers:
        if wip != self.master_ip:
            netcontrol.setup_gre(brname,wip)
    return [True, gatewayip]
def remove_tasknet(self, task):
    """Tear down the per-task bridge created by setup_tasknet."""
    netcontrol.del_bridge("docklet-batch-%s-%s" % (task.username, task.id))
def task_processor(self, task, sub_task_list):
    """Bring the scheduled subtasks up: network, then vnodes, then commands."""
    task.status = RUNNING
    self.jobmgr.report(task.username, task.id, 'running')
    # properties for transaction: subnet, per-vnode ips and the hosts file
    self.acquire_task_ips(task)
    task.gen_ips_from_base(self.base_ip)
    task.gen_hosts()
    #need to create hosts
    [success, gwip] = self.setup_tasknet(task, [sub_task.worker for sub_task in sub_task_list])
    if not success:
        self.release_task_ips(task)
        return [False, gwip]
    placed_workers = []
    start_all_vnode_success = True
    # start vc: create every vnode that is not already running
    for sub_task in sub_task_list:
        vnode_info = sub_task.vnode_info
        vnode_info.vnode.hostname = "batch-" + str(vnode_info.vnodeid % task.max_size)
        if sub_task.vnode_started:
            continue
        username = sub_task.root_task.username
        #container_name = task.info.username + '-batch-' + task.info.id + '-' + str(instance_id) + '-' + task.info.token
        #if not username in self.user_containers.keys():
        #self.user_containers[username] = []
        #self.user_containers[username].append(container_name)
        ipaddr = task.ips[vnode_info.vnodeid % task.max_size] + "/" + str(32-self.task_cidr)
        brname = "docklet-batch-%s-%s" % (username, sub_task.root_task.id)
        networkinfo = Network(ipaddr=ipaddr, gateway=gwip, masterip=self.master_ip, brname=brname)
        vnode_info.vnode.network.CopyFrom(networkinfo)
        placed_workers.append(sub_task.worker)
        [success, msg] = self.start_vnode(sub_task)
        if not success:
            # failed vnode goes back to WAITING (or FAILED when retries exhausted)
            sub_task.waiting_for_retry("Fail to start vnode.")
            if sub_task.status == WAITING:
                self.jobmgr.report(task.username, task.id, 'retrying', "Fail to start vnode.")
            sub_task.worker = None
            start_all_vnode_success = False
    if not start_all_vnode_success:
        return
    # start tasks: run the command in each vnode that has one
    for sub_task in sub_task_list:
        task_info = sub_task.command_info
        if task_info is None or sub_task.status == RUNNING:
            sub_task.status = RUNNING
            continue
        # fresh token ties worker reports to this particular attempt
        task_info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8))
        [success, msg] = self.start_subtask(sub_task)
        if success:
            sub_task.status = RUNNING
        else:
            sub_task.waiting_for_retry("Fail to start task.")
            if sub_task.status == WAITING:
                self.jobmgr.report(task.username, task.id, 'retrying', "Fail to start task.")
def clear_sub_tasks(self, sub_task_list):
    """Stop and clean up every subtask in the list."""
    for entry in sub_task_list:
        self.clear_sub_task(entry)
def clear_sub_task(self, sub_task):
    """Stop the subtask's running command (if any), then its vnode (if any)."""
    if sub_task.task_started:
        self.stop_subtask(sub_task)
    if sub_task.vnode_started:
        self.stop_vnode(sub_task)
@data_lock('stop_lock')
def lazy_stop_task(self, taskid):
    """Request an asynchronous stop; consumed later by sort_out_task_queue()."""
    self.lazy_stop_list.append(taskid)
def stop_remove_task(self, task):
    """Stop all of a task's subtasks and free its network resources."""
    if task is None:
        return
    self.logger.info("[taskmgr] stop and remove task(%s)"%task.id)
    self.clear_sub_tasks(task.subtask_list)
    # release ip addresses, then tear down the task's bridge
    self.release_task_ips(task)
    self.remove_tasknet(task)
def check_task_completed(self, task):
    """Finalize the task when no command-bearing subtask is still WAITING/RUNNING.

    Finalizing means: tear down vnodes/network, bill the task, report
    'failed' or 'finished' to jobmgr, and mark it for removal from the
    queue. Returns True when the task was finalized, False otherwise.
    """
    if task.status == RUNNING or task.status == WAITING:
        for sub_task in task.subtask_list:
            # subtasks without a command (command_info is None) never block completion
            if sub_task.command_info != None and (sub_task.status == RUNNING or sub_task.status == WAITING):
                return False
    self.logger.info('task %s finished, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
    self.stop_remove_task(task)
    self.lazy_delete_list.append(task)
    running_time, billing = task.get_billing()
    self.logger.info('task %s running_time:%s billing:%d'%(task.id, str(running_time), billing))
    running_time = math.ceil(running_time)
    if task.status == FAILED:
        self.jobmgr.report(task.username,task.id,"failed",task.failed_reason,task.subtask_list[0].max_retry_count+1, running_time, billing)
    else:
        self.jobmgr.report(task.username,task.id,'finished',running_time=running_time,billing=billing)
    return True
# this method is called when worker send heart-beat rpc request
def on_task_report(self, report):
    """Handle one per-vnode status report relayed from a worker."""
    self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.vnodeid, report.subTaskStatus))
    task = self.get_task(report.taskid)
    if task == None:
        self.logger.error('[on_task_report] task not found')
        return
    sub_task = task.subtask_list[report.vnodeid]
    # the token ties the report to the currently running attempt; stale
    # reports from earlier attempts are dropped
    if sub_task.command_info.token != report.token:
        self.logger.warning('[on_task_report] wrong token, %s %s' % (sub_task.command_info.token, report.token))
        return
    username = task.username
    # container_name = username + '-batch-' + task.info.id + '-' + str(report.instanceid) + '-' + report.token
    # self.user_containers[username].remove(container_name)
    if sub_task.status != RUNNING:
        self.logger.error('[on_task_report] receive task report when vnode is not running')
    #sub_task.status = report.subTaskStatus
    sub_task.status_reason = report.errmsg
    sub_task.task_started = False
    if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
        # tear down this attempt and requeue for retry (or fail permanently)
        self.clear_sub_task(sub_task)
        sub_task.waiting_for_retry(report.errmsg)
        self.logger.info('task %s report failed, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
        if sub_task.status == WAITING:
            self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
    elif report.subTaskStatus == OUTPUTERROR:
        # output could not be delivered: fail the whole task, no retry
        self.clear_sub_task(sub_task)
        sub_task.status = FAILED
        task.status = FAILED
        task.failed_reason = report.errmsg
    elif report.subTaskStatus == COMPLETED:
        sub_task.status = report.subTaskStatus
        self.clear_sub_task(sub_task)
# return task, workers
def task_scheduler(self):
    """Pick the next schedulable (task, subtask_list) pair, FIFO by priority.

    Returns (None, None) when nothing can be placed right now.
    """
    # simple FIFO with priority
    self.logger.info('[task_scheduler] scheduling... (%d tasks remains)' % len(self.task_queue))
    for task in self.task_queue:
        # skip tasks already marked for deletion or stop
        if task in self.lazy_delete_list or task.id in self.lazy_stop_list:
            continue
        self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
        if self.check_task_completed(task):
            continue
        self.logger.info('schedule task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
        if task.at_same_time:
            # parallel tasks: all subtasks must be placeable together
            if not self.has_waiting(task.subtask_list):
                continue
            workers = self.find_proper_workers(task.subtask_list)
            if len(workers) == 0:
                # cannot place the highest-priority task; don't skip ahead
                return None, None
            else:
                for i in range(len(workers)):
                    task.subtask_list[i].worker = workers[i]
                return task, task.subtask_list
        else:
            # traditional tasks: subtasks may start independently, one at a time
            has_waiting = False
            for sub_task in task.subtask_list:
                if sub_task.status == WAITING:
                    has_waiting = True
                    workers = self.find_proper_workers([sub_task])
                    if len(workers) > 0:
                        sub_task.worker = workers[0]
                        return task, [sub_task]
            if has_waiting:
                # a subtask is waiting but no worker fits; don't starve it by
                # scheduling later tasks first
                return None, None
    return None, None
def has_waiting(self, sub_task_list):
    """True iff any subtask in the list is still WAITING."""
    return any(sub_task.status == WAITING for sub_task in sub_task_list)
def find_proper_workers(self, sub_task_list, all_res=False):
    """Pick a worker ip for every subtask in sub_task_list.

    all_res: when True, check needs against the workers' full capacity
    instead of capacity minus currently reserved usage (used as an
    admission check in add_task).

    Returns one worker ip per subtask, or [] when any subtask cannot be
    placed or none of them is WAITING. Callers test `len(workers) == 0`.
    """
    nodes = self.get_all_nodes()
    if nodes is None or len(nodes) == 0:
        self.logger.warning('[task_scheduler] running nodes not found')
        # bug fix: was `return None`, which crashed every caller doing
        # `len(workers) == 0`; an empty list means "cannot place"
        return []
    proper_workers = []
    has_waiting = False
    for sub_task in sub_task_list:
        if sub_task.status == WAITING:
            has_waiting = True
        if sub_task.worker is not None and sub_task.vnode_started:
            # vnode already placed: keep it on its current worker
            proper_workers.append(sub_task.worker)
            continue
        needs = sub_task.vnode_info.vnode.instance
        self.logger.info('sub_task %s-%d' %(sub_task.root_task.id, sub_task.vnode_info.vnodeid))
        self.logger.info(str(needs))
        proper_worker = None
        for worker_ip, worker_info in nodes:
            self.logger.info('worker ip' + worker_ip)
            self.logger.info('cpu usage: ' + str(self.get_cpu_usage(worker_ip)))
            self.logger.info('gpu usage: ' + str(self.get_gpu_usage(worker_ip)))
            self.logger.info('worker_info: ' + str(worker_info))
            # `(not all_res)` is 0/1: current usage is ignored when all_res is True
            if needs.cpu + (not all_res) * self.get_cpu_usage(worker_ip) > worker_info['cpu']:
                continue
            elif needs.memory > worker_info['memory']:
                continue
            elif needs.disk > worker_info['disk']:
                continue
            # try not to assign non-gpu task to a worker with gpu
            #if needs['gpu'] == 0 and worker_info['gpu'] > 0:
                #continue
            elif needs.gpu + (not all_res) * self.get_gpu_usage(worker_ip) > worker_info['gpu']:
                continue
            else:
                # tentatively deduct from this snapshot so later subtasks of
                # the same call do not over-commit the worker
                worker_info['cpu'] -= needs.cpu
                worker_info['memory'] -= needs.memory
                worker_info['gpu'] -= needs.gpu
                worker_info['disk'] -= needs.disk
                proper_worker = worker_ip
                break
        if proper_worker is not None:
            proper_workers.append(proper_worker)
        else:
            return []
    if has_waiting:
        return proper_workers
    else:
        return []
def get_all_nodes(self):
    """Return [(ip, resource_info)] for every batch-capable worker node."""
    return [(ip, self.get_worker_resource_info(ip))
            for ip in self.nodemgr.get_batch_nodeips()]
def is_alive(self, worker):
    """True iff the worker ip is currently in the batch node list."""
    return worker in self.nodemgr.get_batch_nodeips()
def get_worker_resource_info(self, worker_ip):
    """Snapshot a worker's free resources: cpu cores, memory/disk in Mb, gpu count."""
    worker_info = self.monitor_fetcher(worker_ip).info
    meminfo = worker_info['meminfo']
    return {
        'cpu': len(worker_info['cpuconfig']),
        'memory': (meminfo['buffers'] + meminfo['cached'] + meminfo['free']) / 1024, # (Mb)
        'disk': sum(disk['free'] for disk in worker_info['diskinfo']) / 1024 / 1024, # (Mb)
        'gpu': len(worker_info['gpuinfo']),
    }
def get_cpu_usage(self, worker_ip):
    """Return cpu cores currently reserved on worker_ip (0 for unseen workers).

    Unseen workers are registered with usage 0, preserving the original
    try/except side effect; setdefault replaces the bare `except:` antipattern.
    """
    return self.cpu_usage.setdefault(worker_ip, 0)
def get_gpu_usage(self, worker_ip):
    """Return gpu cores currently reserved on worker_ip (0 for unseen workers).

    Unseen workers are registered with usage 0, preserving the original
    try/except side effect; setdefault replaces the bare `except:` antipattern.
    """
    return self.gpu_usage.setdefault(worker_ip, 0)
# save the task information into database
# called when jobmgr assign task to taskmgr
@data_lock('add_lock')
def add_task(self, username, taskid, json_task, task_priority=1):
    """Build a Task from the json job description and stage it for scheduling.

    Rejects (reporting 'failed' to jobmgr and returning False) when the
    vnode count exceeds the per-task subnet capacity or when the resource
    needs cannot fit the cluster even at full capacity. Returns True when
    the task was staged on lazy_append_list.
    """
    # decode json string to object defined in grpc
    self.logger.info('[taskmgr add_task] receive task %s' % taskid)
    image_dict = {
        "private": Image.PRIVATE,
        "base": Image.BASE,
        "public": Image.PUBLIC
    }
    # a task subnet has 2**task_cidr addresses; -2 reserves the first
    # address and the gateway (see Task.gen_ips_from_base / setup_tasknet)
    max_size = (1 << self.task_cidr) - 2
    if int(json_task['vnodeCount']) > max_size:
        # tell jobmgr
        self.jobmgr.report(username,taskid,"failed","vnodeCount exceed limits.")
        return False
    task = Task(
        task_id = taskid,
        username = username,
        # all vnode must be started at the same time
        at_same_time = 'atSameTime' in json_task.keys(),
        priority = task_priority,
        max_size = (1 << self.task_cidr) - 2,
        task_infos = [{
            'max_retry_count': int(json_task['retryCount']),
            'vnode_info': VNodeInfo(
                taskid = taskid,
                username = username,
                vnode = VNode(
                    # image field is encoded "name_owner_type"; empty owner means the requesting user
                    image = Image(
                        name = json_task['image'].split('_')[0], #json_task['cluster']['image']['name'],
                        type = image_dict[json_task['image'].split('_')[2]], #json_task['cluster']['image']['type'],
                        owner = username if not json_task['image'].split('_')[1] else json_task['image'].split('_')[1]), #json_task['cluster']['image']['owner']),
                    instance = Instance(
                        cpu = int(json_task['cpuSetting']),
                        memory = int(json_task['memorySetting']),
                        disk = int(json_task['diskSetting']),
                        gpu = int(json_task['gpuSetting'])),
                    mount = [Mount(
                        provider = json_task['mapping'][mapping_key]['mappingProvider'],
                        localPath = json_task['mapping'][mapping_key]['mappingMountpath'],
                        remotePath = json_task['mapping'][mapping_key]['mappingBucketName'],
                        accessKey = json_task['mapping'][mapping_key]['mappingAccessKey'],
                        secretKey = json_task['mapping'][mapping_key]['mappingSecretKey'],
                        other = json_task['mapping'][mapping_key]['mappingEndpoint']
                        )
                        for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
                    ),
                ),
            'command_info': TaskInfo(
                taskid = taskid,
                username = username,
                parameters = Parameters(
                    command = Command(
                        commandLine = json_task['command'],
                        packagePath = json_task['srcAddr'],
                        envVars = {}),
                    stderrRedirectPath = json_task.get('stdErrRedPth',""),
                    stdoutRedirectPath = json_task.get('stdOutRedPth',"")),
                timeout = int(json_task['expTime'])
            # commands are executed in all vnodes / only excuted in the first vnode
            # if in traditional mode, commands will be executed in all vnodes
            ) if (json_task['runon'] == 'all' or vnode_index == 0) else None
        } for vnode_index in range(int(json_task['vnodeCount']))])
    if task.at_same_time:
        # parallel task: all vnodes must fit the cluster simultaneously
        workers = self.find_proper_workers(task.subtask_list, all_res=True)
        if len(workers) == 0:
            task.status = FAILED
            # tell jobmgr
            self.jobmgr.report(username,taskid,"failed","Resources needs exceed limits")
            return False
    else:
        # traditional task: each vnode only needs to fit somewhere on its own
        for sub_task in task.subtask_list:
            workers = self.find_proper_workers([sub_task], all_res=True)
            if len(workers) == 0:
                task.status = FAILED
                # tell jobmgr
                self.jobmgr.report(username,taskid,"failed","Resources needs exceed limits")
                return False
    self.lazy_append_list.append(task)
    return True
@data_lock('task_queue_lock')
def get_task_list(self):
    """Return a shallow copy of the pending task queue."""
    return list(self.task_queue)
@data_lock('task_queue_lock')
def get_task(self, taskid):
    """Return the queued task with the given id, or None when absent."""
    return next((queued for queued in self.task_queue if queued.id == taskid), None)
def set_jobmgr(self, jobmgr):
    """Attach the job manager used for task status callbacks."""
    self.jobmgr = jobmgr
# get names of all the batch containers of the user
def get_user_batch_containers(self,username):
    """Names of the user's batch containers (container tracking is disabled,
    so this is currently always empty)."""
    return []

View File

@ -0,0 +1,41 @@
import sys
if sys.path[0].endswith("master"):
sys.path[0] = sys.path[0][:-6]
import grpc,time
from protos import rpc_pb2, rpc_pb2_grpc
import random, string
def run():
    """Manual smoke test: submit a single task to a worker at localhost:50051."""
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)
    comm = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}'
    paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")
    img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet")
    inst = rpc_pb2.Instance(cpu=1, memory=1000, disk=1000, gpu=0)
    # NOTE(review): real-looking cloud accessKey hard-coded below — even with an
    # empty secretKey this credential should be scrubbed from source and rotated
    mnt = rpc_pb2.Mount(localPath="",provider='aliyun',remotePath="test-for-docklet",other="oss-cn-beijing.aliyuncs.com",accessKey="LTAIdl7gmmIhfqA9",secretKey="")
    clu = rpc_pb2.Cluster(image=img, instance=inst, mount=[])
    # random 8-char token ties worker reports to this submission
    task = rpc_pb2.TaskInfo(id="test",username="root",instanceid=1,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=60000,token=''.join(random.sample(string.ascii_letters + string.digits, 8)))
    response = stub.process_task(task)
    print("Batch client received: " + str(response.status)+" "+response.message)
def stop_task():
    """Report the test task as COMPLETED through the worker's stop_tasks RPC."""
    stub = rpc_pb2_grpc.WorkerStub(grpc.insecure_channel('localhost:50051'))
    finished = rpc_pb2.TaskMsg(taskid="test", username="root", instanceid=1, instanceStatus=rpc_pb2.COMPLETED, token="test", errmsg="")
    reply = stub.stop_tasks(rpc_pb2.ReportMsg(taskmsgs=[finished]))
    print("Batch client received: " + str(reply.status)+" "+reply.message)
if __name__ == '__main__':
    # Manual smoke test: uncomment the loop / sleep / stop_task() lines to
    # exercise other flows against a locally running worker.
    #for i in range(10):
    run()
    #time.sleep(4)
    #stop_task()

193
src/master/testTaskMgr.py Normal file
View File

@ -0,0 +1,193 @@
import master.taskmgr
from concurrent import futures
import grpc
from protos.rpc_pb2 import *
from protos.rpc_pb2_grpc import *
import threading, json, time, random
from utils import env
class SimulatedNodeMgr():
    """Stub node manager: reports a single fake batch worker address."""

    def get_batch_nodeips(self):
        # One placeholder address is enough for scheduler tests.
        return ['0.0.0.0']
class SimulatedMonitorFetcher():
    """Stub monitor fetcher: a fixed resource report for any worker ip."""

    def __init__(self, ip):
        one_gb_in_kb = 1024 * 1024
        self.info = {
            'cpuconfig': [1] * 8,  # eight single-core entries
            'meminfo': {
                # (kb) simulate 8 GB of memory in each bucket
                'free': 8 * one_gb_in_kb,
                'buffers': 8 * one_gb_in_kb,
                'cached': 8 * one_gb_in_kb,
            },
            # (b) simulate one disk with 16 GB free
            'diskinfo': [{'free': 16 * 1024 * 1024 * 1024}],
            'gpuinfo': [1, 1],  # two fake GPUs
        }
class SimulatedTaskController(WorkerServicer):
    """Stub worker-side gRPC servicer.

    Accepts every vnode/task request; start_task additionally hands the task
    to the simulated worker so it can emit status reports.
    """

    def __init__(self, worker):
        self.worker = worker

    def start_vnode(self, vnodeinfo, context):
        print('[SimulatedTaskController] start vnode, taskid [%s] vnodeid [%d]' % (vnodeinfo.taskid, vnodeinfo.vnodeid))
        return Reply(status=Reply.ACCEPTED,message="")

    def stop_vnode(self, vnodeinfo, context):
        print('[SimulatedTaskController] stop vnode, taskid [%s] vnodeid [%d]' % (vnodeinfo.taskid, vnodeinfo.vnodeid))
        return Reply(status=Reply.ACCEPTED,message="")

    def start_task(self, taskinfo, context):
        print('[SimulatedTaskController] start task, taskid [%s] vnodeid [%d] token [%s]' % (taskinfo.taskid, taskinfo.vnodeid, taskinfo.token))
        # BUG FIX: use the worker injected via __init__ rather than the
        # module-level global `worker`, which only exists after test() ran.
        self.worker.process(taskinfo)
        return Reply(status=Reply.ACCEPTED,message="")

    def stop_task(self, taskinfo, context):
        print('[SimulatedTaskController] stop task, taskid [%s] vnodeid [%d] token [%s]' % (taskinfo.taskid, taskinfo.vnodeid, taskinfo.token))
        return Reply(status=Reply.ACCEPTED,message="")
class SimulatedWorker(threading.Thread):
    """Fake worker thread.

    Serves the Worker gRPC API on BATCH_WORKER_PORT and, every 5 seconds,
    randomly walks each queued task through RUNNING/COMPLETED/FAILED reports
    back to the master.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.thread_stop = False  # set by stop() to end the run loop
        self.tasks = []           # TaskInfo messages received via process()

    def run(self):
        worker_port = env.getenv('BATCH_WORKER_PORT')
        server = grpc.server(futures.ThreadPoolExecutor(max_workers=5))
        add_WorkerServicer_to_server(SimulatedTaskController(self), server)
        server.add_insecure_port('[::]:' + worker_port)
        server.start()
        while not self.thread_stop:
            for task in self.tasks:
                seed = random.random()
                if seed < 0.25:
                    # 25%: task still running -- report and keep it queued
                    report(task.taskid, task.vnodeid, RUNNING, task.token)
                elif seed < 0.5:
                    # 25%: completed -- the break is required because we just
                    # mutated the list being iterated; at most one task is
                    # removed per 5-second sweep
                    report(task.taskid, task.vnodeid, COMPLETED, task.token)
                    self.tasks.remove(task)
                    break
                elif seed < 0.75:
                    # 25%: failed -- same remove-and-break discipline as above
                    report(task.taskid, task.vnodeid, FAILED, task.token)
                    self.tasks.remove(task)
                    break
                else:
                    # 25%: report nothing this round
                    pass
            time.sleep(5)
        server.stop(0)

    def stop(self):
        # Cooperative shutdown; the run loop notices the flag within 5s.
        self.thread_stop = True

    def process(self, task):
        # Called by SimulatedTaskController when a start_task RPC arrives.
        self.tasks.append(task)
class SimulatedJobMgr(threading.Thread):
    """Fake job manager.

    Receives task status reports from the task manager and can enqueue a
    hard-coded test task into a TaskMgr via assignTask().
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.thread_stop = False  # set by stop() to end the run loop

    def run(self):
        # Idle loop: this thread exists only so the test mirrors the shape of
        # the real jobmgr; it does no work of its own.
        while not self.thread_stop:
            time.sleep(5)
        # BUG FIX: the original ended with server.stop(0), copied from
        # SimulatedWorker; no `server` exists in this scope, so stopping the
        # thread raised a NameError.

    def stop(self):
        # Cooperative shutdown; the run loop notices the flag within 5s.
        self.thread_stop = True

    def report(self, task):
        print('[SimulatedJobMgr] task[%s] status %d' % (task.info.id, task.status))

    def assignTask(self, taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu):
        """Build a minimal task description and queue it on *taskmgr* as root."""
        task = {}
        task['instCount'] = instance_count
        task['retryCount'] = retry_count
        task['expTime'] = timeout
        task['at_same_time'] = True
        task['multicommand'] = True
        task['command'] = 'ls'
        task['srcAddr'] = ''
        task['envVars'] = {'a': '1'}
        task['stdErrRedPth'] = ''
        task['stdOutRedPth'] = ''
        task['image'] = 'root_root_base'
        task['cpuSetting'] = cpu
        task['memorySetting'] = memory
        task['diskSetting'] = disk
        # BUG FIX: honour the caller's gpu argument (was hard-coded to 0).
        task['gpuSetting'] = gpu
        task['mapping'] = []
        taskmgr.add_task('root', taskid, task)
class SimulatedLogger():
    """Console-only stand-in for the project logger."""

    def _emit(self, prefix, msg):
        # Shared print path; prefix already carries the trailing space.
        print(prefix + msg)

    def info(self, msg):
        self._emit('[INFO] ', msg)

    def warning(self, msg):
        self._emit('[WARNING] ', msg)

    def error(self, msg):
        self._emit('[ERROR] ', msg)
def test():
    """Spin up a simulated worker and jobmgr around a real TaskMgr and
    enqueue one test task (jobmgr status reporting left disabled)."""
    global worker
    global jobmgr
    global taskmgr
    worker = SimulatedWorker()
    worker.start()
    jobmgr = SimulatedJobMgr()
    jobmgr.start()
    taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, master_ip='', scheduler_interval=2, external_logger=SimulatedLogger())
    # taskmgr.set_jobmgr(jobmgr)
    taskmgr.start()
    add('task_0', instance_count=2, retry_count=2, timeout=60, cpu=2, memory=2048, disk=2048, gpu=0)
def test2():
    """Like test(), but with jobmgr reporting wired in and no simulated
    worker serving the Worker RPC port."""
    global jobmgr
    global taskmgr
    jobmgr = SimulatedJobMgr()
    jobmgr.start()
    taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, master_ip='', scheduler_interval=2, external_logger=SimulatedLogger())
    taskmgr.set_jobmgr(jobmgr)
    taskmgr.start()
    add('task_0', instance_count=2, retry_count=2, timeout=60, cpu=2, memory=2048, disk=2048, gpu=0)
def add(taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu):
    """Forward a test-task description to the global jobmgr/taskmgr pair."""
    global jobmgr
    global taskmgr
    jobmgr.assignTask(taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu)
def report(taskid, instanceid, status, token):
    """Send one TaskMsg status report to the master's gRPC endpoint on
    BATCH_MASTER_PORT (reply is ignored)."""
    global taskmgr
    master_port = env.getenv('BATCH_MASTER_PORT')
    channel = grpc.insecure_channel('%s:%s' % ('0.0.0.0', master_port))
    stub = MasterStub(channel)
    response = stub.report(ReportMsg(taskmsgs=[TaskMsg(taskid=taskid, username='root', vnodeid=instanceid, subTaskStatus=status, token=token)]))
def stop():
    """Shut down every simulated component started by test()/test2()."""
    for component in (worker, jobmgr, taskmgr):
        component.stop()

View File

@ -0,0 +1,66 @@
import sys
# When launched from inside src/master/, trim the trailing "master" segment
# from sys.path[0] so sibling packages (e.g. protos) are importable.
if sys.path[0].endswith("master"):
    sys.path[0] = sys.path[0][:-6]
import grpc,time
from protos import rpc_pb2, rpc_pb2_grpc
import random, string
def run():
    """Ask a worker on localhost:50051 to start a test vnode and print the reply."""
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)
    #comm = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}'
    #paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")
    img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet")
    inst = rpc_pb2.Instance(cpu=1, memory=1000, disk=1000, gpu=0)
    # SECURITY(review): hard-coded aliyun accessKey committed to the repo --
    # rotate this credential and load it from configuration instead.
    mnt = rpc_pb2.Mount(localPath="",provider='aliyun',remotePath="test-for-docklet",other="oss-cn-beijing.aliyuncs.com",accessKey="LTAIdl7gmmIhfqA9",secretKey="")
    network = rpc_pb2.Network(ipaddr="10.0.4.2/24",gateway="10.0.4.1",masterip="192.168.0.1",brname="batch-root-test")
    vnode = rpc_pb2.VNode(image=img, instance=inst, mount=[],network=network,hostname="batch-5")
    vnodeinfo = rpc_pb2.VNodeInfo(taskid="test",username="root",vnodeid=1,vnode=vnode)
    #task = rpc_pb2.TaskInfo(id="test",username="root",instanceid=1,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=60000,token=''.join(random.sample(string.ascii_letters + string.digits, 8)))
    response = stub.start_vnode(vnodeinfo)
    print("Batch client received: " + str(response.status)+" "+response.message)
def stop_task():
    """Report the test task as COMPLETED via the worker's stop_tasks RPC.

    NOTE(review): rpc.proto now declares TaskMsg with vnodeid/subTaskStatus
    and the Worker service exposes stop_task, not stop_tasks -- this helper
    looks stale relative to the current schema; verify before use.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)
    taskmsg = rpc_pb2.TaskMsg(taskid="test",username="root",instanceid=1,instanceStatus=rpc_pb2.COMPLETED,token="test",errmsg="")
    reportmsg = rpc_pb2.ReportMsg(taskmsgs = [taskmsg])
    response = stub.stop_tasks(reportmsg)
    print("Batch client received: " + str(response.status)+" "+response.message)
def stop_vnode():
    """Ask the worker on localhost:50051 to tear down the test vnode."""
    stub = rpc_pb2_grpc.WorkerStub(grpc.insecure_channel('localhost:50051'))
    # Only the bridge name is needed to identify the vnode's network.
    vnode = rpc_pb2.VNode(network=rpc_pb2.Network(brname="batch-root-test"))
    vnodeinfo = rpc_pb2.VNodeInfo(taskid="test", username="root", vnodeid=1, vnode=vnode)
    reply = stub.stop_vnode(vnodeinfo)
    print("Batch client received: " + str(reply.status)+" "+reply.message)
def start_task():
    """Launch the test task on the worker at localhost:50051 and print the reply."""
    stub = rpc_pb2_grpc.WorkerStub(grpc.insecure_channel('localhost:50051'))
    command = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'})
    parameters = rpc_pb2.Parameters(command=command, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")
    taskinfo = rpc_pb2.TaskInfo(taskid="test", username="root", vnodeid=1, parameters=parameters, timeout=20, token="test")
    reply = stub.start_task(taskinfo)
    print("Batch client received: " + str(reply.status)+" "+reply.message)
if __name__ == '__main__':
    # Manual smoke test: enable exactly one of the helpers below against a
    # locally running worker.
    #for i in range(10):
    #run()
    #start_task()
    stop_vnode()
    #time.sleep(4)
    #stop_task()

View File

@ -120,7 +120,7 @@ class VclusterMgr(object):
return [False, "the size of disk is not big enough for the image"]
clustersize = int(self.defaultsize)
logger.info ("starting cluster %s with %d containers for %s" % (clustername, int(clustersize), username))
workers = self.nodemgr.get_nodeips()
workers = self.nodemgr.get_base_nodeips()
image_json = json.dumps(image)
groupname = json.loads(user_info)["data"]["group"]
groupquota = json.loads(user_info)["data"]["groupinfo"]
@ -206,7 +206,7 @@ class VclusterMgr(object):
return [False, "cluster:%s not found" % clustername]
if self.imgmgr.get_image_size(image) + 100 > int(setting["disk"]):
return [False, "the size of disk is not big enough for the image"]
workers = self.nodemgr.get_nodeips()
workers = self.nodemgr.get_base_nodeips()
if (len(workers) == 0):
logger.warning("no workers to start containers, scale out failed")
return [False, "no workers are running"]

115
src/protos/rpc.proto Normal file
View File

@ -0,0 +1,115 @@
syntax = "proto3";

// Master-side service: workers push sub-task status reports here.
service Master {
    rpc report (ReportMsg) returns (Reply) {}
}

// Worker-side service: the master drives vnode and task lifecycles.
service Worker {
    rpc start_vnode (VNodeInfo) returns (Reply) {}
    rpc start_task (TaskInfo) returns (Reply) {}
    rpc stop_task (TaskInfo) returns (Reply) {}
    rpc stop_vnode (VNodeInfo) returns (Reply) {}
}

// Identifies one virtual node of a task plus its full specification.
message VNodeInfo {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    VNode vnode = 4; // container (vnode) specification
}

// Generic accept/refuse answer for every RPC above.
message Reply {
    ReplyStatus status = 1; // whether the request was accepted
    string message = 2;
    enum ReplyStatus {
        ACCEPTED = 0;
        REFUSED = 1;
    }
}

// Batch of per-sub-task status updates sent to the master.
message ReportMsg {
    repeated TaskMsg taskmsgs = 1;
}

// One status update for a single sub-task (vnode) of a task.
message TaskMsg {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    Status subTaskStatus = 4; // current status of this sub-task
    string token = 5;
    string errmsg = 6;
}

// Lifecycle states of a sub-task.
enum Status {
    WAITING = 0;
    RUNNING = 1;
    COMPLETED = 2;
    FAILED = 3;
    TIMEOUT = 4;
    OUTPUTERROR = 5;
}

// What to run inside an already-started vnode.
message TaskInfo {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    Parameters parameters = 4; // command and output redirection settings
    int32 timeout = 5; // execution time limit; unit not specified here -- TODO confirm with taskmgr
    string token = 6;
}

// Execution parameters for a task.
message Parameters {
    Command command = 1; // command to execute
    string stderrRedirectPath = 2; // where stderr is written
    string stdoutRedirectPath = 3; // where stdout is written
}

// The command itself plus its environment.
message Command {
    string commandLine = 1; // shell command line to execute
    string packagePath = 2; // package path on the worker -- semantics defined by worker; TODO confirm
    map<string, string> envVars = 3; // extra environment variables
}

// Full specification of a virtual node (container).
message VNode {
    Image image = 1; // base image for the container
    Instance instance = 2; // resource quota
    repeated Mount mount = 3; // external storage mounts
    Network network = 4; // container network settings
    string hostname = 5; // hostname inside the container
}

// Container network configuration.
message Network {
    string ipaddr = 1;
    string gateway = 2;
    string masterip = 3;
    string brname = 4;
}

// Container image reference.
message Image {
    string name = 1; // image name
    ImageType type = 2; // base / public / private
    string owner = 3; // owner of the image
    enum ImageType {
        BASE = 0;
        PUBLIC = 1;
        PRIVATE = 2;
    }
}

// External (object) storage mount description.
message Mount {
    string provider = 1;
    string localPath = 2; // mount point inside the container
    string remotePath = 3; // bucket/path at the provider
    string accessKey = 4;
    string secretKey = 5;
    string other = 6;
}

// Resource quota for one vnode.
message Instance {
    int32 cpu = 1; // CPU cores
    int32 memory = 2; // mb
    int32 disk = 3; // mb
    int32 gpu = 4; // GPU count
}

977
src/protos/rpc_pb2.py Normal file
View File

@ -0,0 +1,977 @@
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: rpc.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf.internal import enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='rpc.proto',
package='',
syntax='proto3',
serialized_pb=_b('\n\trpc.proto\"U\n\tVNodeInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x15\n\x05vnode\x18\x04 \x01(\x0b\x32\x06.VNode\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\'\n\tReportMsg\x12\x1a\n\x08taskmsgs\x18\x01 \x03(\x0b\x32\x08.TaskMsg\"{\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1e\n\rsubTaskStatus\x18\x04 \x01(\x0e\x32\x07.Status\x12\r\n\x05token\x18\x05 \x01(\t\x12\x0e\n\x06\x65rrmsg\x18\x06 \x01(\t\"~\n\x08TaskInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1f\n\nparameters\x18\x04 \x01(\x0b\x32\x0b.Parameters\x12\x0f\n\x07timeout\x18\x05 \x01(\x05\x12\r\n\x05token\x18\x06 \x01(\t\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x7f\n\x05VNode\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\x12\x19\n\x07network\x18\x04 \x01(\x0b\x32\x08.Network\x12\x10\n\x08hostname\x18\x05 \x01(\t\"L\n\x07Network\x12\x0e\n\x06ipaddr\x18\x01 \x01(\t\x12\x0f\n\x07gateway\x18\x02 \x01(\t\x12\x10\n\x08masterip\x18\x03 \x01(\t\x12\x0e\n\x06\x62rname\x18\x04 \x01(\t\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 
\x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\"u\n\x05Mount\x12\x10\n\x08provider\x18\x01 \x01(\t\x12\x11\n\tlocalPath\x18\x02 \x01(\t\x12\x12\n\nremotePath\x18\x03 \x01(\t\x12\x11\n\taccessKey\x18\x04 \x01(\t\x12\x11\n\tsecretKey\x18\x05 \x01(\t\x12\r\n\x05other\x18\x06 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*[\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x12\x0f\n\x0bOUTPUTERROR\x10\x05\x32(\n\x06Master\x12\x1e\n\x06report\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x32\x96\x01\n\x06Worker\x12#\n\x0bstart_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x12!\n\nstart_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12 \n\tstop_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12\"\n\nstop_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x62\x06proto3')
)
_STATUS = _descriptor.EnumDescriptor(
name='Status',
full_name='Status',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='WAITING', index=0, number=0,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='RUNNING', index=1, number=1,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='COMPLETED', index=2, number=2,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='FAILED', index=3, number=3,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='TIMEOUT', index=4, number=4,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='OUTPUTERROR', index=5, number=5,
options=None,
type=None),
],
containing_type=None,
options=None,
serialized_start=1249,
serialized_end=1340,
)
_sym_db.RegisterEnumDescriptor(_STATUS)
Status = enum_type_wrapper.EnumTypeWrapper(_STATUS)
WAITING = 0
RUNNING = 1
COMPLETED = 2
FAILED = 3
TIMEOUT = 4
OUTPUTERROR = 5
_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor(
name='ReplyStatus',
full_name='Reply.ReplyStatus',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='ACCEPTED', index=0, number=0,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='REFUSED', index=1, number=1,
options=None,
type=None),
],
containing_type=None,
options=None,
serialized_start=162,
serialized_end=202,
)
_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS)
_IMAGE_IMAGETYPE = _descriptor.EnumDescriptor(
name='ImageType',
full_name='Image.ImageType',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='BASE', index=0, number=0,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PUBLIC', index=1, number=1,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='PRIVATE', index=2, number=2,
options=None,
type=None),
],
containing_type=None,
options=None,
serialized_start=1014,
serialized_end=1060,
)
_sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE)
_VNODEINFO = _descriptor.Descriptor(
name='VNodeInfo',
full_name='VNodeInfo',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='taskid', full_name='VNodeInfo.taskid', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='username', full_name='VNodeInfo.username', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnodeid', full_name='VNodeInfo.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnode', full_name='VNodeInfo.vnode', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=13,
serialized_end=98,
)
_REPLY = _descriptor.Descriptor(
name='Reply',
full_name='Reply',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='status', full_name='Reply.status', index=0,
number=1, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='message', full_name='Reply.message', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
_REPLY_REPLYSTATUS,
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=100,
serialized_end=202,
)
_REPORTMSG = _descriptor.Descriptor(
name='ReportMsg',
full_name='ReportMsg',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='taskmsgs', full_name='ReportMsg.taskmsgs', index=0,
number=1, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=204,
serialized_end=243,
)
_TASKMSG = _descriptor.Descriptor(
name='TaskMsg',
full_name='TaskMsg',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='taskid', full_name='TaskMsg.taskid', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='username', full_name='TaskMsg.username', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnodeid', full_name='TaskMsg.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='subTaskStatus', full_name='TaskMsg.subTaskStatus', index=3,
number=4, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='token', full_name='TaskMsg.token', index=4,
number=5, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='errmsg', full_name='TaskMsg.errmsg', index=5,
number=6, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=245,
serialized_end=368,
)
_TASKINFO = _descriptor.Descriptor(
name='TaskInfo',
full_name='TaskInfo',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='taskid', full_name='TaskInfo.taskid', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='username', full_name='TaskInfo.username', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='vnodeid', full_name='TaskInfo.vnodeid', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='parameters', full_name='TaskInfo.parameters', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='timeout', full_name='TaskInfo.timeout', index=4,
number=5, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='token', full_name='TaskInfo.token', index=5,
number=6, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=370,
serialized_end=496,
)
_PARAMETERS = _descriptor.Descriptor(
name='Parameters',
full_name='Parameters',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='command', full_name='Parameters.command', index=0,
number=1, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='stderrRedirectPath', full_name='Parameters.stderrRedirectPath', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='stdoutRedirectPath', full_name='Parameters.stdoutRedirectPath', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=498,
serialized_end=593,
)
_COMMAND_ENVVARSENTRY = _descriptor.Descriptor(
name='EnvVarsEntry',
full_name='Command.EnvVarsEntry',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='key', full_name='Command.EnvVarsEntry.key', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='value', full_name='Command.EnvVarsEntry.value', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')),
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=689,
serialized_end=735,
)
_COMMAND = _descriptor.Descriptor(
name='Command',
full_name='Command',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='commandLine', full_name='Command.commandLine', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='packagePath', full_name='Command.packagePath', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='envVars', full_name='Command.envVars', index=2,
number=3, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[_COMMAND_ENVVARSENTRY, ],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=596,
serialized_end=735,
)
_VNODE = _descriptor.Descriptor(
name='VNode',
full_name='VNode',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='image', full_name='VNode.image', index=0,
number=1, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='instance', full_name='VNode.instance', index=1,
number=2, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='mount', full_name='VNode.mount', index=2,
number=3, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='network', full_name='VNode.network', index=3,
number=4, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='hostname', full_name='VNode.hostname', index=4,
number=5, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=737,
serialized_end=864,
)
_NETWORK = _descriptor.Descriptor(
name='Network',
full_name='Network',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='ipaddr', full_name='Network.ipaddr', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='gateway', full_name='Network.gateway', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='masterip', full_name='Network.masterip', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='brname', full_name='Network.brname', index=3,
number=4, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=866,
serialized_end=942,
)
_IMAGE = _descriptor.Descriptor(
name='Image',
full_name='Image',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='name', full_name='Image.name', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='type', full_name='Image.type', index=1,
number=2, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='owner', full_name='Image.owner', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
_IMAGE_IMAGETYPE,
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=944,
serialized_end=1060,
)
_MOUNT = _descriptor.Descriptor(
name='Mount',
full_name='Mount',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='provider', full_name='Mount.provider', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='localPath', full_name='Mount.localPath', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='remotePath', full_name='Mount.remotePath', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='accessKey', full_name='Mount.accessKey', index=3,
number=4, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='secretKey', full_name='Mount.secretKey', index=4,
number=5, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='other', full_name='Mount.other', index=5,
number=6, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=1062,
serialized_end=1179,
)
_INSTANCE = _descriptor.Descriptor(
name='Instance',
full_name='Instance',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='cpu', full_name='Instance.cpu', index=0,
number=1, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='memory', full_name='Instance.memory', index=1,
number=2, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='disk', full_name='Instance.disk', index=2,
number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='gpu', full_name='Instance.gpu', index=3,
number=4, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=1181,
serialized_end=1247,
)
_VNODEINFO.fields_by_name['vnode'].message_type = _VNODE
_REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS
_REPLY_REPLYSTATUS.containing_type = _REPLY
_REPORTMSG.fields_by_name['taskmsgs'].message_type = _TASKMSG
_TASKMSG.fields_by_name['subTaskStatus'].enum_type = _STATUS
_TASKINFO.fields_by_name['parameters'].message_type = _PARAMETERS
_PARAMETERS.fields_by_name['command'].message_type = _COMMAND
_COMMAND_ENVVARSENTRY.containing_type = _COMMAND
_COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY
_VNODE.fields_by_name['image'].message_type = _IMAGE
_VNODE.fields_by_name['instance'].message_type = _INSTANCE
_VNODE.fields_by_name['mount'].message_type = _MOUNT
_VNODE.fields_by_name['network'].message_type = _NETWORK
_IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE
_IMAGE_IMAGETYPE.containing_type = _IMAGE
DESCRIPTOR.message_types_by_name['VNodeInfo'] = _VNODEINFO
DESCRIPTOR.message_types_by_name['Reply'] = _REPLY
DESCRIPTOR.message_types_by_name['ReportMsg'] = _REPORTMSG
DESCRIPTOR.message_types_by_name['TaskMsg'] = _TASKMSG
DESCRIPTOR.message_types_by_name['TaskInfo'] = _TASKINFO
DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS
DESCRIPTOR.message_types_by_name['Command'] = _COMMAND
DESCRIPTOR.message_types_by_name['VNode'] = _VNODE
DESCRIPTOR.message_types_by_name['Network'] = _NETWORK
DESCRIPTOR.message_types_by_name['Image'] = _IMAGE
DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT
DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE
DESCRIPTOR.enum_types_by_name['Status'] = _STATUS
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
VNodeInfo = _reflection.GeneratedProtocolMessageType('VNodeInfo', (_message.Message,), dict(
DESCRIPTOR = _VNODEINFO,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:VNodeInfo)
))
_sym_db.RegisterMessage(VNodeInfo)
Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict(
DESCRIPTOR = _REPLY,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Reply)
))
_sym_db.RegisterMessage(Reply)
ReportMsg = _reflection.GeneratedProtocolMessageType('ReportMsg', (_message.Message,), dict(
DESCRIPTOR = _REPORTMSG,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:ReportMsg)
))
_sym_db.RegisterMessage(ReportMsg)
TaskMsg = _reflection.GeneratedProtocolMessageType('TaskMsg', (_message.Message,), dict(
DESCRIPTOR = _TASKMSG,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:TaskMsg)
))
_sym_db.RegisterMessage(TaskMsg)
TaskInfo = _reflection.GeneratedProtocolMessageType('TaskInfo', (_message.Message,), dict(
DESCRIPTOR = _TASKINFO,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:TaskInfo)
))
_sym_db.RegisterMessage(TaskInfo)
Parameters = _reflection.GeneratedProtocolMessageType('Parameters', (_message.Message,), dict(
DESCRIPTOR = _PARAMETERS,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Parameters)
))
_sym_db.RegisterMessage(Parameters)
Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message,), dict(
EnvVarsEntry = _reflection.GeneratedProtocolMessageType('EnvVarsEntry', (_message.Message,), dict(
DESCRIPTOR = _COMMAND_ENVVARSENTRY,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Command.EnvVarsEntry)
))
,
DESCRIPTOR = _COMMAND,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Command)
))
_sym_db.RegisterMessage(Command)
_sym_db.RegisterMessage(Command.EnvVarsEntry)
VNode = _reflection.GeneratedProtocolMessageType('VNode', (_message.Message,), dict(
DESCRIPTOR = _VNODE,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:VNode)
))
_sym_db.RegisterMessage(VNode)
Network = _reflection.GeneratedProtocolMessageType('Network', (_message.Message,), dict(
DESCRIPTOR = _NETWORK,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Network)
))
_sym_db.RegisterMessage(Network)
Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict(
DESCRIPTOR = _IMAGE,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Image)
))
_sym_db.RegisterMessage(Image)
Mount = _reflection.GeneratedProtocolMessageType('Mount', (_message.Message,), dict(
DESCRIPTOR = _MOUNT,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Mount)
))
_sym_db.RegisterMessage(Mount)
Instance = _reflection.GeneratedProtocolMessageType('Instance', (_message.Message,), dict(
DESCRIPTOR = _INSTANCE,
__module__ = 'rpc_pb2'
# @@protoc_insertion_point(class_scope:Instance)
))
_sym_db.RegisterMessage(Instance)
_COMMAND_ENVVARSENTRY.has_options = True
_COMMAND_ENVVARSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001'))
_MASTER = _descriptor.ServiceDescriptor(
name='Master',
full_name='Master',
file=DESCRIPTOR,
index=0,
options=None,
serialized_start=1342,
serialized_end=1382,
methods=[
_descriptor.MethodDescriptor(
name='report',
full_name='Master.report',
index=0,
containing_service=None,
input_type=_REPORTMSG,
output_type=_REPLY,
options=None,
),
])
_sym_db.RegisterServiceDescriptor(_MASTER)
DESCRIPTOR.services_by_name['Master'] = _MASTER
_WORKER = _descriptor.ServiceDescriptor(
name='Worker',
full_name='Worker',
file=DESCRIPTOR,
index=1,
options=None,
serialized_start=1385,
serialized_end=1535,
methods=[
_descriptor.MethodDescriptor(
name='start_vnode',
full_name='Worker.start_vnode',
index=0,
containing_service=None,
input_type=_VNODEINFO,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='start_task',
full_name='Worker.start_task',
index=1,
containing_service=None,
input_type=_TASKINFO,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='stop_task',
full_name='Worker.stop_task',
index=2,
containing_service=None,
input_type=_TASKINFO,
output_type=_REPLY,
options=None,
),
_descriptor.MethodDescriptor(
name='stop_vnode',
full_name='Worker.stop_vnode',
index=3,
containing_service=None,
input_type=_VNODEINFO,
output_type=_REPLY,
options=None,
),
])
_sym_db.RegisterServiceDescriptor(_WORKER)
DESCRIPTOR.services_by_name['Worker'] = _WORKER
# @@protoc_insertion_point(module_scope)

139
src/protos/rpc_pb2_grpc.py Normal file
View File

@ -0,0 +1,139 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
import grpc
from protos import rpc_pb2 as rpc__pb2
class MasterStub(object):
  # Client-side stub for the Master service; generated by the gRPC Python
  # compiler plugin (do not hand-edit logic -- regenerate from rpc.proto).
  pass

  def __init__(self, channel):
    """Constructor.

    Args:
      channel: A grpc.Channel.
    """
    # One callable per RPC: report() sends a ReportMsg and receives a Reply.
    self.report = channel.unary_unary(
        '/Master/report',
        request_serializer=rpc__pb2.ReportMsg.SerializeToString,
        response_deserializer=rpc__pb2.Reply.FromString,
        )
class MasterServicer(object):
  # Server-side interface for the Master service; generated by the gRPC
  # Python compiler plugin. Concrete servicers subclass this and override
  # each handler.
  pass

  def report(self, request, context):
    # Abstract handler (ReportMsg -> Reply); the default replies UNIMPLEMENTED.
    pass
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details('Method not implemented!')
    raise NotImplementedError('Method not implemented!')
def add_MasterServicer_to_server(servicer, server):
  # Register *servicer*'s handlers for the Master service on *server*.
  # Generated by the gRPC Python compiler plugin.
  rpc_method_handlers = {
      'report': grpc.unary_unary_rpc_method_handler(
          servicer.report,
          request_deserializer=rpc__pb2.ReportMsg.FromString,
          response_serializer=rpc__pb2.Reply.SerializeToString,
      ),
  }
  generic_handler = grpc.method_handlers_generic_handler(
      'Master', rpc_method_handlers)
  server.add_generic_rpc_handlers((generic_handler,))
class WorkerStub(object):
  # Client-side stub for the Worker service; generated by the gRPC Python
  # compiler plugin (do not hand-edit logic -- regenerate from rpc.proto).
  pass

  def __init__(self, channel):
    """Constructor.

    Args:
      channel: A grpc.Channel.
    """
    # One unary-unary callable per Worker RPC; every call returns a Reply.
    self.start_vnode = channel.unary_unary(
        '/Worker/start_vnode',
        request_serializer=rpc__pb2.VNodeInfo.SerializeToString,
        response_deserializer=rpc__pb2.Reply.FromString,
        )
    self.start_task = channel.unary_unary(
        '/Worker/start_task',
        request_serializer=rpc__pb2.TaskInfo.SerializeToString,
        response_deserializer=rpc__pb2.Reply.FromString,
        )
    self.stop_task = channel.unary_unary(
        '/Worker/stop_task',
        request_serializer=rpc__pb2.TaskInfo.SerializeToString,
        response_deserializer=rpc__pb2.Reply.FromString,
        )
    self.stop_vnode = channel.unary_unary(
        '/Worker/stop_vnode',
        request_serializer=rpc__pb2.VNodeInfo.SerializeToString,
        response_deserializer=rpc__pb2.Reply.FromString,
        )
class WorkerServicer(object):
  # Server-side interface for the Worker service; generated by the gRPC
  # Python compiler plugin. The worker's TaskController subclasses this.
  pass

  def start_vnode(self, request, context):
    # Abstract handler (VNodeInfo -> Reply); default replies UNIMPLEMENTED.
    pass
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details('Method not implemented!')
    raise NotImplementedError('Method not implemented!')

  def start_task(self, request, context):
    # Abstract handler (TaskInfo -> Reply); default replies UNIMPLEMENTED.
    pass
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details('Method not implemented!')
    raise NotImplementedError('Method not implemented!')

  def stop_task(self, request, context):
    # Abstract handler (TaskInfo -> Reply); default replies UNIMPLEMENTED.
    pass
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details('Method not implemented!')
    raise NotImplementedError('Method not implemented!')

  def stop_vnode(self, request, context):
    # Abstract handler (VNodeInfo -> Reply); default replies UNIMPLEMENTED.
    pass
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details('Method not implemented!')
    raise NotImplementedError('Method not implemented!')
def add_WorkerServicer_to_server(servicer, server):
  # Register *servicer*'s handlers for the Worker service on *server*.
  # Generated by the gRPC Python compiler plugin.
  rpc_method_handlers = {
      'start_vnode': grpc.unary_unary_rpc_method_handler(
          servicer.start_vnode,
          request_deserializer=rpc__pb2.VNodeInfo.FromString,
          response_serializer=rpc__pb2.Reply.SerializeToString,
      ),
      'start_task': grpc.unary_unary_rpc_method_handler(
          servicer.start_task,
          request_deserializer=rpc__pb2.TaskInfo.FromString,
          response_serializer=rpc__pb2.Reply.SerializeToString,
      ),
      'stop_task': grpc.unary_unary_rpc_method_handler(
          servicer.stop_task,
          request_deserializer=rpc__pb2.TaskInfo.FromString,
          response_serializer=rpc__pb2.Reply.SerializeToString,
      ),
      'stop_vnode': grpc.unary_unary_rpc_method_handler(
          servicer.stop_vnode,
          request_deserializer=rpc__pb2.VNodeInfo.FromString,
          response_serializer=rpc__pb2.Reply.SerializeToString,
      ),
  }
  generic_handler = grpc.method_handlers_generic_handler(
      'Worker', rpc_method_handlers)
  server.add_generic_rpc_handlers((generic_handler,))

View File

@ -79,5 +79,17 @@ def getenv(key):
return os.environ.get("ALLOCATED_PORTS","10000-65535")
elif key =="ALLOW_SCALE_OUT":
return os.environ.get("ALLOW_SCALE_OUT", "False")
elif key == "BATCH_ON":
return os.environ.get("BATCH_ON","True")
elif key == "BATCH_MASTER_PORT":
return os.environ.get("BATCH_MASTER_PORT","50050")
elif key == "BATCH_WORKER_PORT":
return os.environ.get("BATCH_WORKER_PORT","50051")
elif key == "BATCH_TASK_CIDR":
return os.environ.get("BATCH_TASK_CIDR","4")
elif key == "BATCH_NET":
return os.environ.get("BATCH_NET","10.16.0.0/16")
elif key == "BATCH_MAX_THREAD_WORKER":
return os.environ.get("BATCH_MAX_THREAD_WORKER","5")
else:
return os.environ.get(key,"")

120
src/utils/gputools.py Normal file
View File

@ -0,0 +1,120 @@
import lxc
import subprocess
import os
import signal
from utils.log import logger
# Note: keep physical device id always the same as the virtual device id
# device_path e.g. /dev/nvidia0
def add_device(container_name, device_path):
    """Bind the host device node at *device_path* into the named LXC container.

    The in-container path mirrors the host path, keeping physical and
    virtual device ids identical (e.g. /dev/nvidia0 -> /dev/nvidia0).
    """
    container = lxc.Container(container_name)
    return container.add_device_node(device_path, device_path)
def remove_device(container_name, device_path):
    """Detach the device node at *device_path* from the named LXC container."""
    container = lxc.Container(container_name)
    return container.remove_device_node('', device_path)
# Mon May 21 10:51:45 2018
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 381.22 Driver Version: 381.22 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# |===============================+======================+======================|
# | 0 GeForce GTX 108... Off | 0000:02:00.0 Off | N/A |
# | 33% 53C P2 59W / 250W | 295MiB / 11172MiB | 2% Default |
# +-------------------------------+----------------------+----------------------+
# | 1 GeForce GTX 108... Off | 0000:84:00.0 Off | N/A |
# | 21% 35C P8 10W / 250W | 161MiB / 11172MiB | 0% Default |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: GPU Memory |
# | GPU PID Type Process name Usage |
# |=============================================================================|
# | 0 111893 C python3 285MiB |
# | 1 111893 C python3 151MiB |
# +-----------------------------------------------------------------------------+
#
def nvidia_smi():
    """Run ``nvidia-smi`` and return its output as a list of lines.

    Returns:
        list[str] of output lines on success, or None when the binary is
        missing or exits non-zero (no driver / no GPU on this host), so
        callers can treat a None result as "no GPU information".
    """
    try:
        ret = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=False, check=True)
        return ret.stdout.decode('utf-8').split('\n')
    except Exception:
        # Original had two except clauses that both just returned None
        # (one binding an unused variable); a single clause is equivalent.
        return None
def get_gpu_driver_version():
    """Return the NVIDIA driver version string, or None when nvidia-smi fails.

    Parses the banner line of nvidia-smi, whose second-to-last token is the
    version, e.g. "| NVIDIA-SMI 381.22  Driver Version: 381.22 |".
    """
    lines = nvidia_smi()
    if lines:
        return lines[2].split()[-2]
    return None
def get_gpu_status():
    """Parse nvidia-smi output into a list of per-GPU status dicts.

    Each dict carries the string columns as printed by nvidia-smi:
    'id', 'fan', 'memory', 'memory_max', 'util'. Returns [] when
    nvidia-smi is unavailable.
    """
    lines = nvidia_smi()
    if not lines:
        return []
    # The first blank line separates the GPU table from the process table.
    blank_at = next(i for i, line in enumerate(lines) if not line.strip())
    statuses = []
    # GPU entries start at line 7 of the banner and span 3 lines each.
    for start in range(7, blank_at, 3):
        detail = lines[start + 1].split()
        statuses.append({
            'id': lines[start].split()[1],
            'fan': detail[1],
            'memory': detail[8],
            'memory_max': detail[10],
            'util': detail[12],
        })
    return statuses
def get_gpu_processes():
    """Parse the nvidia-smi process table into a list of dicts.

    Each dict has string fields 'gpu', 'pid', 'name', 'memory' plus
    'container' (resolved via get_container_name_by_pid). Returns []
    when nvidia-smi is unavailable.
    """
    lines = nvidia_smi()
    if not lines:
        return []
    # The first blank line separates the GPU table from the process table.
    blank_at = next(i for i, line in enumerate(lines) if not line.strip())
    processes = []
    # Process rows begin 5 lines below the separator; a row that does not
    # split into exactly 7 columns marks the end of the table.
    for row in range(blank_at + 5, len(lines)):
        cols = lines[row].split()
        if len(cols) != 7:
            break
        processes.append({
            'gpu': cols[1],
            'pid': cols[2],
            'name': cols[4],
            'memory': cols[5],
            'container': get_container_name_by_pid(cols[2]),
        })
    return processes
def get_container_name_by_pid(pid):
    """Map a host PID to the LXC container it runs in.

    Reads /proc/<pid>/cgroup: processes outside any 'lxc' cgroup belong to
    the host.

    Returns:
        'host', the container name, or None when the process no longer
        exists (it may have exited between the nvidia-smi listing and this
        lookup -- the original crashed with FileNotFoundError in that race).
    """
    try:
        with open('/proc/%s/cgroup' % pid) as f:
            parts = f.readlines()[0].strip().split('/')
    except OSError:
        return None
    if parts[1] != 'lxc':
        return 'host'
    return parts[2]
def clean_up_processes_in_gpu(gpu_id):
    """Kill every container-owned process currently using GPU *gpu_id*.

    Host processes on the GPU are logged but left alone. *gpu_id* may be an
    int or a string: nvidia-smi reports ids as strings, so the comparison
    is done on str(gpu_id) (the original compared str == int and so never
    matched when given an int). PIDs parsed from nvidia-smi are strings,
    so they are converted to int before os.kill (the original passed the
    string, which raises TypeError).
    """
    logger.info('[gputools] start clean up processes in gpu %s' % gpu_id)
    processes = get_gpu_processes()
    for process in [p for p in processes if p['gpu'] == str(gpu_id)]:
        logger.info('[gputools] find process %s running in gpu %s' % (process['pid'], process['gpu']))
        if process['container'] == 'host':
            logger.warning('[gputools] find process of host, ignored')
        else:
            logger.warning('[gputools] find process of container [%s], killed' % process['container'])
            try:
                os.kill(int(process['pid']), signal.SIGKILL)
            except OSError:
                # Process already gone; nothing to clean up.
                continue

View File

@ -44,6 +44,7 @@ app.config['SQLALCHEMY_BINDS'] = {
'history': 'sqlite:///'+fsdir+'/global/sys/HistoryTable.db',
'beansapplication': 'sqlite:///'+fsdir+'/global/sys/BeansApplication.db',
'system': 'sqlite:///'+fsdir+'/global/sys/System.db',
'batch':'sqlite:///'+fsdir+'/global/sys/Batch.db?check_same_thread=False',
'login': 'sqlite:///'+fsdir+'/global/sys/Login.db'
}
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
@ -435,3 +436,105 @@ class Image(db.Model):
def __repr__(self):
return "{\"id\":\"%d\",\"imagename\":\"%s\",\"hasPrivate\":\"%s\",\"hasPublic\":\"%s\",\"ownername\":\"%s\",\"updatetime\":\"%s\",\"description\":\"%s\"}" % (self.id,self.imagename,str(self.hasPrivate),str(self.hasPublic),self.create_time.strftime("%Y-%m-%d %H:%M:%S"),self.ownername,self.description)
class Batchjob(db.Model):
    """A user-submitted batch job; its tasks live in Batchtask rows.

    Stored in the separate 'batch' SQLite bind. A job starts in the
    "pending" state; billing accumulates as the job runs.
    """
    __bind_key__ = 'batch'
    # Short unique job id (primary key) -- format assigned by the caller; confirm against jobmgr.
    id = db.Column(db.String(9), primary_key=True)
    # Owning user's name.
    username = db.Column(db.String(10))
    # Human-readable job name.
    name = db.Column(db.String(30))
    # Scheduling priority (integer; semantics defined by the scheduler).
    priority = db.Column(db.Integer)
    # Lifecycle state; starts at "pending" -- full value set defined by the job manager.
    status = db.Column(db.String(10))
    failed_reason = db.Column(db.Text)
    create_time = db.Column(db.DateTime)
    # None until the job finishes.
    end_time = db.Column(db.DateTime)
    # Accumulated billing units.
    billing = db.Column(db.Integer)
    # One-to-many link to this job's tasks.
    tasks = db.relationship('Batchtask', backref='batchjob', lazy='dynamic')

    def __init__(self,id,username,name,priority):
        """Create a fresh job in the initial "pending" state."""
        self.id = id
        self.username = username
        self.name = name
        self.priority = priority
        self.status = "pending"
        self.failed_reason = ""
        self.create_time = datetime.now()
        self.end_time = None
        self.billing = 0

    def clear(self):
        """Reset run-state fields so the job can be re-scheduled from scratch."""
        self.status = "pending"
        self.failed_reason = ""
        self.end_time = None
        self.billing = 0

    def __repr__(self):
        """Return the job serialized as a JSON object string."""
        info = {}
        info['job_id'] = self.id
        info['username'] = self.username
        info['job_name'] = self.name
        info['priority'] = self.priority
        info['status'] = self.status
        info['failed_reason'] = self.failed_reason
        info['create_time'] = self.create_time.strftime("%Y-%m-%d %H:%M:%S")
        if self.end_time is None:
            # Placeholder keeps the column non-empty for unfinished jobs.
            info['end_time'] = "------"
        else:
            info['end_time'] = self.end_time.strftime("%Y-%m-%d %H:%M:%S")
        info['billing'] = self.billing
        return json.dumps(info)
class Batchtask(db.Model):
    """One task of a Batchjob, stored in the 'batch' SQLite bind.

    Tracks the task's lifecycle state, timing, billing, retry count and
    its JSON-encoded configuration.
    """
    __bind_key__ = 'batch'
    id = db.Column(db.String(15), primary_key=True)
    idx = db.Column(db.String(10))
    jobid = db.Column(db.String(9), db.ForeignKey('batchjob.id'))
    status = db.Column(db.String(15))
    failed_reason = db.Column(db.Text)
    start_time = db.Column(db.DateTime)
    end_time = db.Column(db.DateTime)
    running_time = db.Column(db.Integer)
    billing = db.Column(db.Integer)
    config = db.Column(db.Text)
    tried_times = db.Column(db.Integer)

    def __init__(self, id, idx, config):
        """Create a task in the initial "pending" state with the given config."""
        self.id = id
        self.idx = idx
        # All run-state fields share their initial values with clear().
        self.clear()
        self.config = json.dumps(config)

    def clear(self):
        """Reset run-state fields so the task can be retried from scratch."""
        self.status = "pending"
        self.failed_reason = ""
        self.start_time = None
        self.end_time = None
        self.running_time = 0
        self.billing = 0
        self.tried_times = 0

    def __repr__(self):
        """Return the task serialized as a JSON object string."""
        timefmt = "%Y-%m-%d %H:%M:%S"
        info = {
            'id': self.id,
            'idx': self.idx,
            'jobid': self.jobid,
            'status': self.status,
            'failed_reason': self.failed_reason,
            'start_time': "------" if self.start_time is None else self.start_time.strftime(timefmt),
            'end_time': "------" if self.end_time is None else self.end_time.strftime(timefmt),
            'running_time': self.running_time,
            'billing': self.billing,
            'config': json.loads(self.config),
            'tried_times': self.tried_times,
        }
        return json.dumps(info)

View File

@ -195,7 +195,7 @@ class ovscontrol(object):
@staticmethod
def add_port_internal(bridge, port):
try:
subprocess.run(['ovs-vsctl', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=internal'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
subprocess.run(['ovs-vsctl', '--may-exist', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=internal'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
return [True, str(port)]
except subprocess.CalledProcessError as suberror:
return [False, "add port failed : %s" % suberror.stdout.decode('utf-8')]
@ -211,7 +211,7 @@ class ovscontrol(object):
@staticmethod
def add_port_gre(bridge, port, remote):
try:
subprocess.run(['ovs-vsctl', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=gre', 'options:remote_ip='+str(remote)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
subprocess.run(['ovs-vsctl', '--may-exist', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=gre', 'options:remote_ip='+str(remote)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
return [True, str(port)]
except subprocess.CalledProcessError as suberror:
return [False, "add port failed : %s" % suberror.stdout.decode('utf-8')]

View File

@ -19,7 +19,7 @@ Design:Monitor mainly consists of three parts: Collectors, Master_Collector and
import subprocess,re,os,psutil,math,sys
import time,threading,json,traceback,platform
from utils import env, etcdlib
from utils import env, etcdlib, gputools
import lxc
import xmlrpc.client
from datetime import datetime
@ -262,6 +262,7 @@ class Container_Collector(threading.Thread):
global pid2name
global laststopcpuval
global laststopruntime
is_batch = container_name.split('-')[1] == 'batch'
# collect basic information, such as running time,state,pid,ip,name
container = lxc.Container(container_name)
basic_info = {}
@ -286,7 +287,8 @@ class Container_Collector(threading.Thread):
containerpids.append(container_pid_str)
pid2name[container_pid_str] = container_name
running_time = self.get_proc_etime(container.init_pid)
running_time += laststopruntime[container_name]
if not is_batch:
running_time += laststopruntime[container_name]
basic_info['PID'] = container_pid_str
basic_info['IP'] = container.get_ips()[0]
basic_info['RunningTime'] = running_time
@ -326,7 +328,8 @@ class Container_Collector(threading.Thread):
cpu_use = {}
lastval = 0
try:
lastval = laststopcpuval[container_name]
if not is_batch:
lastval = laststopcpuval[container_name]
except:
logger.warning(traceback.format_exc())
cpu_val += lastval
@ -369,7 +372,7 @@ class Container_Collector(threading.Thread):
# deal with network used data
containerids = re.split("-",container_name)
if len(containerids) >= 3:
if not is_batch and len(containerids) >= 3:
workercinfo[container_name]['net_stats'] = self.net_stats[containerids[1] + '-' + containerids[2]]
#logger.info(workercinfo[container_name]['net_stats'])
@ -378,7 +381,7 @@ class Container_Collector(threading.Thread):
lasttime = lastbillingtime[container_name]
#logger.info(lasttime)
# process real billing if running time reach an hour
if not int(running_time/self.billingtime) == lasttime:
if not is_batch and not int(running_time/self.billingtime) == lasttime:
#logger.info("billing:"+str(float(cpu_val)))
lastbillingtime[container_name] = int(running_time/self.billingtime)
self.billing_increment(container_name)
@ -478,6 +481,10 @@ class Collector(threading.Thread):
info[idx][key] = val
return [cpuset, info]
# collect gpu used information
def collect_gpuinfo(self):
return gputools.get_gpu_status()
# collect disk used information
def collect_diskinfo(self):
global workercinfo
@ -534,9 +541,10 @@ class Collector(threading.Thread):
[cpuinfo,cpuconfig] = self.collect_cpuinfo()
workerinfo['cpuinfo'] = cpuinfo
workerinfo['cpuconfig'] = cpuconfig
workerinfo['gpuinfo'] = self.collect_gpuinfo()
workerinfo['diskinfo'] = self.collect_diskinfo()
workerinfo['running'] = True
#time.sleep(self.interval)
time.sleep(self.interval)
if self.test:
break
# print(self.etcdser.getkey('/meminfo/total'))

72
src/worker/ossmounter.py Normal file
View File

@ -0,0 +1,72 @@
import abc
import subprocess, os
from utils.log import logger
class OssMounter(object):
    """Abstract base for object-storage mounters (one subclass per provider)."""
    __metaclass__ = abc.ABCMeta

    @staticmethod
    def execute_cmd(cmd):
        """Run *cmd* through the shell.

        Returns [True, ""] on exit code 0, otherwise [False, combined
        stdout+stderr] after logging the output.
        """
        completed = subprocess.run(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT, shell=True)
        if completed.returncode == 0:
            return [True, ""]
        message = completed.stdout.decode(encoding="utf-8")
        logger.error(message)
        return [False, message]

    @staticmethod
    @abc.abstractmethod
    def mount_oss(datapath, mount_info):
        """Mount the remote storage described by *mount_info* under *datapath*."""
        pass

    @staticmethod
    @abc.abstractmethod
    def umount_oss(datapath, mount_info):
        """Unmount the remote storage described by *mount_info* from *datapath*."""
        pass
class AliyunOssMounter(OssMounter):
    """Mount/umount Aliyun OSS buckets via ossfs and fusermount."""

    @staticmethod
    def mount_oss(datapath, mount_info):
        """Mount the OSS bucket described by *mount_info* under *datapath*.

        Writes the ossfs credential file, then mounts the bucket at
        <datapath>/Aliyun/<remotePath>. Returns [success, message].
        The original except block referenced an undefined 'msg' and called
        traceback.format_exc() without importing traceback, both fixed here.
        """
        try:
            # ossfs reads credentials from /etc/passwd-ossfs (bucket:ak:sk);
            # 'with' guarantees the file is closed even if the write fails.
            with open("/etc/passwd-ossfs", "w") as pwdfile:
                pwdfile.write(mount_info.remotePath + ":" + mount_info.accessKey
                              + ":" + mount_info.secretKey + "\n")
        except Exception as err:
            logger.error(str(err))
            return [False, str(err)]
        # ossfs refuses credential files readable by others.
        [success, msg] = OssMounter.execute_cmd("chmod 640 /etc/passwd-ossfs")
        if not success:
            logger.error("Aliyun OSS mount chmod err:%s" % msg)
            return [False, msg]
        mountpath = datapath + "/Aliyun/" + mount_info.remotePath
        logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath))
        if not os.path.isdir(mountpath):
            os.makedirs(mountpath)
        cmd = "ossfs %s %s -ourl=%s" % (mount_info.remotePath, mountpath, mount_info.other)
        [success, msg] = OssMounter.execute_cmd(cmd)
        if not success:
            logger.error("Aliyun OSS mount err:%s" % msg)
            return [False, msg]
        return [True, ""]

    @staticmethod
    def umount_oss(datapath, mount_info):
        """Unmount the OSS bucket and remove its (now empty) mount point.

        Returns [success, message].
        """
        mountpath = datapath + "/Aliyun/" + mount_info.remotePath
        logger.info("UMount oss %s %s" % (mount_info.remotePath, mountpath))
        [success, msg] = OssMounter.execute_cmd("fusermount -u %s" % (mountpath))
        if not success:
            logger.error("Aliyun OSS umount err:%s" % msg)
            return [False, msg]
        [success, msg] = OssMounter.execute_cmd("rm -rf %s" % mountpath)
        if not success:
            logger.error("Aliyun OSS umount err:%s" % msg)
            return [False, msg]
        return [True, ""]

458
src/worker/taskcontroller.py Executable file
View File

@ -0,0 +1,458 @@
#!/usr/bin/python3
# Worker-side batch task controller: implements the gRPC Worker service and
# reports task status back to the batch master.
import sys
if sys.path[0].endswith("worker"):
    # Launched from src/worker/: trim the trailing "worker" so sibling
    # packages (utils, protos, worker, ...) resolve from src/.
    sys.path[0] = sys.path[0][:-6]
from utils import env, tools
config = env.getenv("CONFIG")
#config = "/opt/docklet/local/docklet-running.conf"
# Load the docklet config into the environment before anything else reads it.
tools.loadenv(config)

# Logging must be initialised before utils.log.logger is used.
from utils.log import initlogging
initlogging("docklet-taskcontroller")
from utils.log import logger

from concurrent import futures
import grpc
#from utils.log import logger
#from utils import env
import json,lxc,subprocess,threading,os,time,traceback
from utils import imagemgr,etcdlib,gputools
from utils.lvmtool import sys_run
from worker import ossmounter
from protos import rpc_pb2, rpc_pb2_grpc

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# Upper bound on how long a task may run (one day).
MAX_RUNNING_TIME = _ONE_DAY_IN_SECONDS
def ip_to_int(addr):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value."""
    value = 0
    for octet in addr.split('.'):
        value = (value << 8) + int(octet)
    return value
def int_to_ip(num):
    """Convert a 32-bit integer to dotted-quad IPv4 notation."""
    return ".".join(str((num >> shift) & 255) for shift in (24, 16, 8, 0))
class TaskController(rpc_pb2_grpc.WorkerServicer):
    def __init__(self):
        """Set up the worker-side task controller.

        Discovers the master via etcd, prepares the batch-container IP pool
        from BATCH_NET, records GPU availability, and starts the periodic
        status-report loop. Exits the process if etcd or the master address
        is unavailable.
        """
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info ("using ETCD %s" % etcdaddr )
        clustername = env.getenv("CLUSTER_NAME")
        logger.info ("using CLUSTER_NAME %s" % clustername )
        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix = clustername)
        except Exception:
            logger.error ("connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")
        # get master ip and report port
        [success,masterip] = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        else:
            self.master_ip = masterip
            logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')
        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')
        # Queue of task status messages, drained by the report thread;
        # guarded by msgslock.
        self.taskmsgs = []
        self.msgslock = threading.Lock()
        self.report_interval = 2
        # self.lock guards the free_ips pool; mount_lock serializes OSS mounts.
        self.lock = threading.Lock()
        self.mount_lock = threading.Lock()
        self.cons_gateway = env.getenv('BATCH_GATEWAY')
        self.cons_ips = env.getenv('BATCH_NET')
        logger.info("Batch gateway ip address %s" % self.cons_gateway)
        logger.info("Batch ip pools %s" % self.cons_ips)
        # cidr here is the number of HOST bits, e.g. 16 for 10.16.0.0/16.
        self.cidr = 32 - int(self.cons_ips.split('/')[1])
        self.ipbase = ip_to_int(self.cons_ips.split('/')[0])
        # Free pool holds host offsets 2 .. 2^cidr-2: offset 0 (network),
        # offset 1 (presumably the gateway -- confirm) and the broadcast
        # address are excluded.
        self.free_ips = []
        for i in range(2, (1 << self.cidr) - 1):
            self.free_ips.append(i)
        logger.info("Free ip addresses pool %s" % str(self.free_ips))
        self.gpu_lock = threading.Lock()
        # gpu id -> owning lxc container name; "" means the GPU is free.
        self.gpu_status = {}
        gpus = gputools.get_gpu_status()
        for gpu in gpus:
            self.gpu_status[gpu['id']] = ""
        # start_report is defined later in the class; presumably spawns the
        # background reporter thread.
        self.start_report()
        logger.info('TaskController init success')
# Need Locks
def acquire_ip(self):
    """Allocate one free container IP.

    Returns [True, "a.b.c.d/prefixlen"] on success or [False, errmsg] when
    the pool is exhausted.  Thread-safe via self.lock.
    """
    self.lock.acquire()
    try:
        if len(self.free_ips) == 0:
            # Bug fix: the original returned here WITHOUT releasing
            # self.lock, deadlocking every later acquire_ip/release_ip call
            # once the pool ran dry.
            return [False, "No free ips"]
        ip = int_to_ip(self.ipbase + self.free_ips[0])
        self.free_ips.remove(self.free_ips[0])
        logger.info(str(self.free_ips))
        return [True, ip + "/" + str(32 - self.cidr)]
    finally:
        self.lock.release()
# Need Locks
def release_ip(self, ipstr):
    """Return a previously acquired IP (given as "addr/prefixlen") to the free pool."""
    offset = ip_to_int(ipstr.split('/')[0]) - self.ipbase
    with self.lock:
        self.free_ips.append(offset)
        logger.info(str(self.free_ips))
def add_gpu_device(self, lxcname, gpu_need):
    """Reserve gpu_need free GPUs and attach them (plus the NVIDIA control
    devices) to container lxcname.

    Returns [True, ""] on success (or when no GPU is needed) and
    [False, errmsg] when there are not enough free GPUs or attaching fails.
    On failure the reservations are rolled back.
    """
    if gpu_need < 1:
        return [True, ""]
    self.gpu_lock.acquire()
    chosen = []
    for gpuid, owner in self.gpu_status.items():
        if owner == "" and len(chosen) < gpu_need:
            chosen.append(gpuid)
    if len(chosen) < gpu_need:
        self.gpu_lock.release()
        return [False, "No free GPUs"]
    for gpuid in chosen:
        self.gpu_status[gpuid] = lxcname
    try:
        gputools.add_device(lxcname, "/dev/nvidiactl")
        gputools.add_device(lxcname, "/dev/nvidia-uvm")
        for gpuid in chosen:
            gputools.add_device(lxcname, "/dev/nvidia" + str(gpuid))
            logger.info("Add gpu:" + str(gpuid) + " to lxc:" + str(lxcname))
    except Exception:
        logger.error(traceback.format_exc())
        # roll back the reservations made above
        for gpuid in chosen:
            self.gpu_status[gpuid] = ""
        self.gpu_lock.release()
        return [False, "Error occurs when adding gpu device."]
    self.gpu_lock.release()
    return [True, ""]
def release_gpu_device(self, lxcname):
    """Mark every GPU currently assigned to container lxcname as free again."""
    with self.gpu_lock:
        for gpuid, owner in self.gpu_status.items():
            if owner == lxcname:
                self.gpu_status[gpuid] = ""
# mount OSS buckets
def mount_oss(self, datapath, mount_info):
    """Mount each OSS bucket described in mount_info under datapath, using
    the "<provider>OssMounter" class from the ossmounter module.

    Returns [True, ""] when every mount succeeds, otherwise [False, errmsg].
    Serialized by self.mount_lock.
    """
    self.mount_lock.acquire()
    try:
        for mount in mount_info:
            mounter = getattr(ossmounter, mount.provider + "OssMounter", None)
            if mounter is None:
                return [False, mount.provider + " doesn't exist!"]
            [success, msg] = mounter.mount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    finally:
        # lock is released on every exit path
        self.mount_lock.release()
    return [True, ""]
# umount OSS buckets
def umount_oss(self, datapath, mount_info):
    """Unmount each OSS bucket described in mount_info from datapath.

    Returns [True, ""] when every unmount succeeds, otherwise
    [False, errmsg].
    """
    try:
        for mount in mount_info:
            provider = mount.provider
            mounter = getattr(ossmounter, provider + "OssMounter", None)
            if mounter is None:
                return [False, provider + " doesn't exist!"]
            [success, msg] = mounter.umount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception as err:
        logger.error(traceback.format_exc())
        return [False, ""]
    # Bug fix: the original fell off the end and implicitly returned None on
    # the success path; return [True, ""] to honor the [ok, msg] contract
    # that mount_oss follows.
    return [True, ""]
# acquire an ip and create a container
def create_container(self,instanceid,username,image,lxcname,quota):
    """Allocate an IP and prepare an lxc container (rootfs + config) for a batch task.

    Returns [True, ip] on success, [False, errmsg] on failure.  The
    container is fully configured but NOT started; on filesystem failure
    the acquired ip is released again.
    """
    # acquire ip
    [status, ip] = self.acquire_ip()
    if not status:
        return [False, ip]
    # prepare image and filesystem
    status = self.imgmgr.prepareFS(username,image,lxcname,str(quota.disk))
    if not status:
        self.release_ip(ip)
        return [False, "Create container for batch failed when preparing filesystem"]
    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
    # first batch task of this user on this host: create the user directory
    if not os.path.isdir("%s/global/users/%s" % (self.fspath,username)):
        path = env.getenv('DOCKLET_LIB')
        subprocess.call([path+"/master/userinit.sh", username])
        logger.info("user %s directory not found, create it" % username)
    sys_run("mkdir -p /var/lib/lxc/%s" % lxcname)
    logger.info("generate config file for %s" % lxcname)

    def config_prepare(content):
        # Fill the %PLACEHOLDER% slots of the lxc config template with the
        # values of this task instance.
        content = content.replace("%ROOTFS%",rootfs)
        content = content.replace("%HOSTNAME%","batch-%s" % str(instanceid))
        content = content.replace("%CONTAINER_MEMORY%",str(quota.memory))
        # quota.cpu cores -> cfs quota in microseconds (period 100000us)
        content = content.replace("%CONTAINER_CPU%",str(quota.cpu*100000))
        content = content.replace("%FS_PREFIX%",self.fspath)
        content = content.replace("%LXCSCRIPT%",env.getenv("LXC_SCRIPT"))
        content = content.replace("%USERNAME%",username)
        content = content.replace("%LXCNAME%",lxcname)
        content = content.replace("%IP%",ip)
        content = content.replace("%GATEWAY%",self.cons_gateway)
        return content

    logger.info(self.confpath)
    # instantiate the template into this container's lxc config
    conffile = open(self.confpath+"/container.batch.conf", 'r')
    conftext = conffile.read()
    conffile.close()
    conftext = config_prepare(conftext)
    conffile = open("/var/lib/lxc/%s/config" % lxcname, 'w')
    conffile.write(conftext)
    conffile.close()
    return [True, ip]
def process_task(self, request, context):
    """gRPC handler: create, start and run one batch task instance.

    Creates the container, binds OSS mounts into it, starts it, attaches
    GPUs, then launches the task command in a background daemon thread.
    Returns ACCEPTED once the thread is launched, or REFUSED with an error
    message if any setup step fails (rolling back ip/filesystem).
    """
    logger.info('excute task with parameter: ' + str(request))
    taskid = request.id
    instanceid = request.instanceid
    # get config from request
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['instanceid'] = str(instanceid)
    image = {}
    image['name'] = request.cluster.image.name
    if request.cluster.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.cluster.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.cluster.image.owner
    username = request.username
    token = request.token
    lxcname = '%s-batch-%s-%s-%s' % (username,taskid,str(instanceid),token)
    instance_type = request.cluster.instance
    mount_list = request.cluster.mount
    outpath = [request.parameters.stdoutRedirectPath,request.parameters.stderrRedirectPath]
    timeout = request.timeout
    gpu_need = int(request.cluster.instance.gpu)
    reused = request.reused
    # create container (acquires an ip and prepares rootfs + config)
    [success, ip] = self.create_container(instanceid, username, image, lxcname, instance_type)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=ip)
    # Bug fix: rootfs was referenced below but never defined in this scope
    # (it was only a local inside create_container), raising NameError.
    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
    # mount oss buckets and bind them into the container at /root/oss/<bucket>
    self.mount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_list)
    conffile = open("/var/lib/lxc/%s/config" % lxcname, 'a+')
    mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    for mount in mount_list:
        conffile.write("\n"+ mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath))
    conffile.close()
    logger.info("Start container %s..." % lxcname)
    # Bug fix: this line was commented out in the original, but `container`
    # is used in the gpu-failure path below (NameError there).
    container = lxc.Container(lxcname)
    ret = subprocess.run('lxc-start -n %s'%lxcname,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        logger.error('start container %s failed' % lxcname)
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container")
    logger.info('start container %s success' % lxcname)
    # add GPU devices
    [success, msg] = self.add_gpu_device(lxcname,gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        container.stop()
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to add gpu device. " + msg)
    # run the task asynchronously; completion is reported via add_msg
    thread = threading.Thread(target = self.execute_task, args=(username,taskid,instanceid,envs,lxcname,pkgpath,command,timeout,outpath,ip,token,mount_list))
    thread.setDaemon(True)
    thread.start()
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
def write_output(self, lxcname, tmplogpath, filepath):
    """Move a temporary log from /root/nfs/<tmplogpath> inside the container
    to the user-requested filepath (also inside the container).

    A no-op returning [True, ""] when filepath is empty, still the unfilled
    default template, or already the same path as the temp log.
    """
    no_move_needed = (
        filepath == ""
        or filepath == "/root/nfs/batch_{jobid}/"
        or os.path.abspath("/root/nfs/" + tmplogpath) == os.path.abspath(filepath)
    )
    if no_move_needed:
        return [True, ""]
    move_cmd = "lxc-attach -n " + lxcname + " -- mv %s %s"
    ret = subprocess.run(move_cmd % ("/root/nfs/" + tmplogpath, filepath),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        msg = ret.stdout.decode(encoding="utf-8")
        logger.error(msg)
        return [False, msg]
    logger.info("Succeed to moving nfs/%s to %s" % (tmplogpath, filepath))
    return [True, ""]
def execute_task(self,username,taskid,instanceid,envs,lxcname,pkgpath,command,timeout,outpath,ip,token,mount_info):
    """Run the task command inside the already-started container, then clean up.

    Writes batch_job.sh into the container's rootfs, runs it via lxc-attach
    with the given env vars, enforces the timeout, moves stdout/stderr to
    the requested redirect paths, reports the final status via add_msg, and
    finally stops/deletes the container and releases its ip/gpus/oss mounts.
    Runs in a daemon thread started by process_task.
    """
    lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs/"
    scriptname = "batch_job.sh"
    try:
        # write the job script into the container's rootfs
        scriptfile = open(lxcfspath+"root/"+scriptname,"w")
        scriptfile.write("#!/bin/bash\n")
        scriptfile.write("cd "+str(pkgpath)+"\n")
        scriptfile.write(command)
        scriptfile.close()
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error("Fail to write script file with taskid(%s) instanceid(%s)" % (str(taskid),str(instanceid)))
    else:
        # taskid is expected to look like "<x>_<jobid>" — TODO confirm with master
        try:
            job_id = taskid.split('_')[1]
        except Exception as e:
            logger.error(traceback.format_exc())
            job_id = "_none"
        jobdir = "batch_" + job_id
        logdir = "%s/global/users/%s/data/" % (self.fspath,username) + jobdir
        if not os.path.exists(logdir):
            logger.info("Directory:%s not exists, create it." % logdir)
            # NOTE(review): check-then-mkdir can race when several instances
            # of the same job start at once; the taskworker variant guards
            # mkdir with try/except instead — consider aligning.
            os.mkdir(logdir)
        stdoutname = str(taskid)+"_"+str(instanceid)+"_stdout.txt"
        stderrname = str(taskid)+"_"+str(instanceid)+"_stderr.txt"
        try:
            stdoutfile = open(logdir+"/"+stdoutname,"w")
            stderrfile = open(logdir+"/"+stderrname,"w")
            logger.info("Create stdout(%s) and stderr(%s) file to log" % (stdoutname, stderrname))
        except Exception as e:
            logger.error(traceback.format_exc())
            # fall back to inheriting this process's stdio
            stdoutfile = None
            stderrfile = None
        # run the script through lxc-attach, passing envs via -v
        cmd = "lxc-attach -n " + lxcname
        for envkey,envval in envs.items():
            cmd = cmd + " -v %s=%s" % (envkey,envval)
        cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\""
        logger.info('run task with command - %s' % cmd)
        p = subprocess.Popen(cmd,stdout=stdoutfile,stderr=stderrfile, shell=True)
        #logger.info(p)
        # poll every ~2s until the task exits or its time budget runs out;
        # timeout == 0 means "use the global cap" (MAX_RUNNING_TIME)
        if timeout == 0:
            to = MAX_RUNNING_TIME
        else:
            to = timeout
        while p.poll() is None and to > 0:
            time.sleep(min(2,to))
            to -= 2
        if p.poll() is None:
            # still running after the budget: kill and report TIMEOUT
            p.kill()
            logger.info("Running time(%d) is out. Task(%s-%s-%s) will be killed." % (timeout,str(taskid),str(instanceid),token))
            self.add_msg(taskid,username,instanceid,rpc_pb2.TIMEOUT,token,"Running time is out.")
        else:
            # move the logs to the user-requested redirect paths (if any)
            [success1,msg1] = self.write_output(lxcname,jobdir+"/"+stdoutname,outpath[0])
            [success2,msg2] = self.write_output(lxcname,jobdir+"/"+stderrname,outpath[1])
            if not success1 or not success2:
                if not success1:
                    msg = msg1
                else:
                    msg = msg2
                logger.info("Output error on Task(%s-%s-%s)." % (str(taskid),str(instanceid),token))
                self.add_msg(taskid,username,instanceid,rpc_pb2.OUTPUTERROR,token,msg)
            else:
                # exit status 0 -> COMPLETED, anything else -> FAILED
                if p.poll() == 0:
                    logger.info("Task(%s-%s-%s) completed." % (str(taskid),str(instanceid),token))
                    self.add_msg(taskid,username,instanceid,rpc_pb2.COMPLETED,token,"")
                else:
                    logger.info("Task(%s-%s-%s) failed." % (str(taskid),str(instanceid),token))
                    self.add_msg(taskid,username,instanceid,rpc_pb2.FAILED,token,"")
    # cleanup: stop and delete the container, release its ip and gpus,
    # unmount its oss buckets
    container = lxc.Container(lxcname)
    if container.stop():
        logger.info("stop container %s success" % lxcname)
    else:
        logger.error("stop container %s failed" % lxcname)
    logger.info("deleting container:%s" % lxcname)
    if self.imgmgr.deleteFS(lxcname):
        logger.info("delete container %s success" % lxcname)
    else:
        logger.error("delete container %s failed" % lxcname)
    logger.info("release ip address %s" % ip)
    self.release_ip(ip)
    self.release_gpu_device(lxcname)
    #umount oss
    self.umount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_info)
def stop_tasks(self, request, context):
    """gRPC handler: force-kill the container of every task in the request."""
    for msg in request.taskmsgs:
        name = '%s-batch-%s-%s-%s' % (msg.username, msg.taskid, str(msg.instanceid), msg.token)
        logger.info("Stop the task with lxc:" + name)
        subprocess.run("lxc-stop -k -n %s" % name,
                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def add_msg(self, taskid, username, instanceid, status, token, errmsg):
    """Queue one TaskMsg for the next report to the master (thread-safe)."""
    with self.msgslock:
        try:
            self.taskmsgs.append(rpc_pb2.TaskMsg(
                taskid=str(taskid),
                username=username,
                instanceid=int(instanceid),
                instanceStatus=status,
                token=token,
                errmsg=errmsg))
        except Exception:
            logger.error(traceback.format_exc())
def report_msg(self):
    """Forever loop: every report_interval seconds, push the queued task
    messages to the master over gRPC.

    NOTE(review): self.taskmsgs is cleared even when stub.report raised, so
    messages produced while the master is unreachable are silently dropped —
    confirm this best-effort behavior is intended.
    """
    channel = grpc.insecure_channel(self.master_ip+":"+self.master_port)
    stub = rpc_pb2_grpc.MasterStub(channel)
    while True:
        self.msgslock.acquire()
        reportmsg = rpc_pb2.ReportMsg(taskmsgs = self.taskmsgs)
        try:
            response = stub.report(reportmsg)
            logger.info("Response from master by reporting: "+str(response.status)+" "+response.message)
        except Exception as err:
            logger.error(traceback.format_exc())
        # queue is cleared after every attempt, successful or not
        self.taskmsgs = []
        self.msgslock.release()
        time.sleep(self.report_interval)
def start_report(self):
    """Spawn the daemon thread that periodically reports task messages to the master."""
    reporter = threading.Thread(target=self.report_msg)
    reporter.setDaemon(True)
    reporter.start()
    logger.info("Start to report task messages to master every %d seconds." % self.report_interval)
def TaskControllerServe():
    """Start the gRPC Worker service on BATCH_WORKER_PORT and block until Ctrl-C."""
    max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER'))
    worker_port = int(env.getenv('BATCH_WORKER_PORT'))
    logger.info("Max Threads on a worker is %d" % max_threads)
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_threads))
    rpc_pb2_grpc.add_WorkerServicer_to_server(TaskController(), server)
    server.add_insecure_port('[::]:'+str(worker_port))
    server.start()
    logger.info("Start TaskController Servicer on port:%d" % worker_port)
    # server.start() does not block; keep the main thread alive until interrupted
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
# Entry point: run the batch task controller service when invoked as a script.
if __name__ == "__main__":
    TaskControllerServe()

500
src/worker/taskworker.py Executable file
View File

@ -0,0 +1,500 @@
#!/usr/bin/python3
import sys
if sys.path[0].endswith("worker"):
sys.path[0] = sys.path[0][:-6]
from utils import env, tools
config = env.getenv("CONFIG")
#config = "/opt/docklet/local/docklet-running.conf"
tools.loadenv(config)
from utils.log import initlogging
initlogging("docklet-taskworker")
from utils.log import logger
from concurrent import futures
import grpc
#from utils.log import logger
#from utils import env
import json,lxc,subprocess,threading,os,time,traceback
from utils import imagemgr,etcdlib,gputools
from utils.lvmtool import sys_run
from worker import ossmounter
from protos import rpc_pb2, rpc_pb2_grpc
from utils.nettools import netcontrol
from master.network import getip
# Cap applied to a sub-task's run time when its request carries timeout == 0.
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
MAX_RUNNING_TIME = _ONE_DAY_IN_SECONDS
class TaskWorker(rpc_pb2_grpc.WorkerServicer):
    """gRPC Worker servicer that hosts batch vnode containers and runs sub-tasks in them."""

    def __init__(self):
        # Connect to etcd, locate the master, detect this worker's ip,
        # remove leftover batch containers, and build the GPU pool; exits
        # the process if etcd/master cannot be reached.
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info ("using ETCD %s" % etcdaddr )
        clustername = env.getenv("CLUSTER_NAME")
        logger.info ("using CLUSTER_NAME %s" % clustername )
        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix = clustername)
        except Exception:
            logger.error ("connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")
        # get master ip and report port
        [success,masterip] = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        else:
            self.master_ip = masterip
            logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')
        # get worker ip (used to decide whether a GRE tunnel is needed)
        self.worker_ip = getip(env.getenv('NETWORK_DEVICE'))
        logger.info("Worker ip is :%s"%self.worker_ip)
        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')
        # clean up any batch containers left over from a previous run
        self.rm_all_batch_containers()
        self.taskmsgs = []  # queued TaskMsg protos awaiting the next report to master
        self.msgslock = threading.Lock()  # guards self.taskmsgs
        self.report_interval = 2  # seconds between report_msg pushes
        self.lock = threading.Lock()
        self.mount_lock = threading.Lock()  # serializes OSS mount operations
        self.gpu_lock = threading.Lock()  # guards self.gpu_status
        # gpu id -> name of the lxc using it; "" means free
        self.gpu_status = {}
        gpus = gputools.get_gpu_status()
        for gpu in gpus:
            self.gpu_status[gpu['id']] = ""
        self.start_report()
        logger.info('TaskWorker init success')
def stop_and_rm_containers(self, lxcname):
    """Kill container lxcname, unmount its OSS buckets and delete its filesystem.

    Returns the result of imgmgr.deleteFS (truthy on success).
    """
    logger.info("Stop the container with name:" + lxcname)
    subprocess.run("lxc-stop -k -n %s" % lxcname,
                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    osspath = "/var/lib/lxc/%s/oss" % lxcname
    try:
        # reconstruct the mount list from the on-disk oss/<provider>/<bucket> layout
        mounts = [rpc_pb2.Mount(provider=provider, remotePath=bkname)
                  for provider in os.listdir(osspath)
                  for bkname in os.listdir(os.path.join(osspath, provider))]
        self.umount_oss(osspath, mounts)
    except Exception as err:
        # best effort: the oss directory may simply not exist
        logger.info(err)
    return self.imgmgr.deleteFS(lxcname)
def rm_all_batch_containers(self):
    """At startup, remove every leftover '<user>-batch-...' container on this host."""
    for name in lxc.list_containers():
        parts = name.split('-')
        # only containers whose second dash-field is 'batch' belong to us
        if len(parts) < 2 or parts[1] != 'batch':
            continue
        if self.stop_and_rm_containers(name):
            logger.info("delete container %s success" % name)
        else:
            logger.error("delete container %s failed" % name)
def add_gpu_device(self, lxcname, gpu_need):
    """Reserve gpu_need free GPUs and attach them (plus the NVIDIA control
    devices) to container lxcname.

    Returns [True, ""] on success (or when no GPU is needed) and
    [False, errmsg] when there are not enough free GPUs or attaching fails.
    On failure the reservations are rolled back.
    """
    if gpu_need < 1:
        return [True, ""]
    self.gpu_lock.acquire()
    chosen = []
    for gpuid, owner in self.gpu_status.items():
        if owner == "" and len(chosen) < gpu_need:
            chosen.append(gpuid)
    if len(chosen) < gpu_need:
        self.gpu_lock.release()
        return [False, "No free GPUs"]
    for gpuid in chosen:
        self.gpu_status[gpuid] = lxcname
    try:
        gputools.add_device(lxcname, "/dev/nvidiactl")
        gputools.add_device(lxcname, "/dev/nvidia-uvm")
        for gpuid in chosen:
            gputools.add_device(lxcname, "/dev/nvidia" + str(gpuid))
            logger.info("Add gpu:" + str(gpuid) + " to lxc:" + str(lxcname))
    except Exception:
        logger.error(traceback.format_exc())
        # roll back the reservations made above
        for gpuid in chosen:
            self.gpu_status[gpuid] = ""
        self.gpu_lock.release()
        return [False, "Error occurs when adding gpu device."]
    self.gpu_lock.release()
    return [True, ""]
def release_gpu_device(self, lxcname):
    """Mark every GPU currently assigned to container lxcname as free again."""
    with self.gpu_lock:
        for gpuid, owner in self.gpu_status.items():
            if owner == lxcname:
                self.gpu_status[gpuid] = ""
# mount OSS buckets
def mount_oss(self, datapath, mount_info):
    """Mount each OSS bucket described in mount_info under datapath, using
    the "<provider>OssMounter" class from the ossmounter module.

    Returns [True, ""] when every mount succeeds, otherwise [False, errmsg].
    Serialized by self.mount_lock.
    """
    self.mount_lock.acquire()
    try:
        for mount in mount_info:
            mounter = getattr(ossmounter, mount.provider + "OssMounter", None)
            if mounter is None:
                return [False, mount.provider + " doesn't exist!"]
            [success, msg] = mounter.mount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    finally:
        # lock is released on every exit path
        self.mount_lock.release()
    return [True, ""]
# umount OSS buckets
def umount_oss(self, datapath, mount_info):
    """Unmount each OSS bucket described in mount_info from datapath.

    Returns [True, ""] when every unmount succeeds, otherwise
    [False, errmsg].
    """
    try:
        for mount in mount_info:
            provider = mount.provider
            mounter = getattr(ossmounter, provider + "OssMounter", None)
            if mounter is None:
                return [False, provider + " doesn't exist!"]
            [success, msg] = mounter.umount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception as err:
        logger.error(traceback.format_exc())
        return [False, ""]
    # Bug fix: the original fell off the end and implicitly returned None on
    # the success path; return [True, ""] to honor the [ok, msg] contract
    # that mount_oss follows.
    return [True, ""]
def start_vnode(self, request, context):
    """gRPC handler: create and boot one vnode container for a task.

    Prepares the container (rootfs, lxc config, network), mounts OSS
    buckets, starts it, sets up a GRE tunnel to the master's bridge when
    this worker is remote, attaches GPUs and starts sshd inside.  Returns
    ACCEPTED on success or REFUSED with a message after rolling back.
    """
    logger.info('start vnode with config: ' + str(request))
    taskid = request.taskid
    vnodeid = request.vnodeid
    # NOTE(review): envs is built here but never used in this method —
    # confirm whether it was meant to be passed somewhere.
    envs = {}
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    image = {}
    image['name'] = request.vnode.image.name
    if request.vnode.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.vnode.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.vnode.image.owner
    username = request.username
    lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))
    instance_type = request.vnode.instance
    mount_list = request.vnode.mount
    gpu_need = int(request.vnode.instance.gpu)
    ipaddr = request.vnode.network.ipaddr
    gateway = request.vnode.network.gateway
    brname = request.vnode.network.brname
    masterip = request.vnode.network.masterip
    hostname = request.vnode.hostname
    #create container
    [success, msg] = self.create_container(taskid, vnodeid, username, image, lxcname, instance_type, ipaddr, gateway, brname, hostname)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)
    #mount oss
    lxcpath = "/var/lib/lxc/%s" % lxcname
    rootfs = lxcpath + "/rootfs"
    self.mount_oss(lxcpath + "/oss", mount_list)
    # bind-mount each mounted bucket into the container at /root/oss/<bucket>
    conffile = open(lxcpath + "/config", 'a+')
    mount_str = "lxc.mount.entry = "+ lxcpath +"/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    for mount in mount_list:
        conffile.write("\n"+ mount_str % (mount.provider, mount.remotePath, rootfs, mount.remotePath))
    conffile.close()
    logger.info("Start container %s..." % lxcname)
    container = lxc.Container(lxcname)
    ret = subprocess.run('lxc-start -n %s'%lxcname,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        # rollback: unmount oss and delete the prepared filesystem
        logger.error('start container %s failed' % lxcname)
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container(%s)"%lxcname)
    logger.info('start container %s success' % lxcname)
    # container on a remote worker needs a GRE tunnel back to the master
    if masterip != self.worker_ip:
        netcontrol.setup_gre(brname, masterip)
    #add GPU
    [success, msg] = self.add_gpu_device(lxcname,gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        container.stop()
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to add gpu device. " + msg)
    #start ssh service
    cmd = "lxc-attach -n %s -- service ssh start" % lxcname
    ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        logger.error('Fail to start ssh service of container %s' % lxcname)
        container.stop()
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to start ssh service. lxc(%s)"%lxcname)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
def start_task(self, request, context):
    """gRPC handler: launch a sub-task command asynchronously inside its
    (already running) vnode container; completion is reported via add_msg."""
    logger.info('start task with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid
    # unpack the command description from the request
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    timeout = request.timeout
    token = request.token
    outpath = [request.parameters.stdoutRedirectPath, request.parameters.stderrRedirectPath]
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))
    runner = threading.Thread(
        target=self.execute_task,
        args=(username, taskid, vnodeid, envs, lxcname, pkgpath, command, timeout, outpath, token))
    runner.setDaemon(True)
    runner.start()
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
def stop_task(self, request, context):
    """gRPC handler: force-kill the container running the given sub-task."""
    logger.info('stop task with config: ' + str(request))
    lxcname = '%s-batch-%s-%s' % (request.username, request.taskid, str(request.vnodeid))
    logger.info("Stop the task with lxc:" + lxcname)
    subprocess.run("lxc-stop -k -n %s" % lxcname,
                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
# stop and remove a vnode container
def stop_vnode(self, request, context):
    """gRPC handler: stop a vnode container, unmount its OSS buckets,
    delete its filesystem, tear down its ovs bridge and free its GPUs."""
    logger.info('stop vnode with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid
    brname = request.vnode.network.brname
    mount_list = request.vnode.mount
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))
    logger.info("Stop the task with lxc:" + lxcname)
    if lxc.Container(lxcname).stop():
        logger.info("stop container %s success" % lxcname)
    else:
        logger.error("stop container %s failed" % lxcname)
    # unmount oss buckets
    self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
    logger.info("deleting container:%s" % lxcname)
    if self.imgmgr.deleteFS(lxcname):
        logger.info("delete container %s success" % lxcname)
    else:
        logger.error("delete container %s failed" % lxcname)
    # delete the ovs bridge created for this vnode, if any
    if brname is not None:
        netcontrol.del_bridge(brname)
    # release the gpus this container was using
    self.release_gpu_device(lxcname)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
# prepare filesystem and lxc config for a vnode container
def create_container(self,taskid,vnodeid,username,image,lxcname,quota,ipaddr,gateway,brname,hostname):
    """Prepare an lxc container (rootfs + config) for a vnode.

    Unlike the controller variant, the network parameters (ip, gateway,
    bridge) are supplied by the master rather than allocated locally.
    Returns [True, ""] on success, [False, errmsg] on failure.  The
    container is configured but NOT started.
    """
    # prepare image and filesystem
    status = self.imgmgr.prepareFS(username,image,lxcname,str(quota.disk))
    if not status:
        return [False, "Create container for batch failed when preparing filesystem"]
    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
    # first batch task of this user on this host: create the user directory
    if not os.path.isdir("%s/global/users/%s" % (self.fspath,username)):
        path = env.getenv('DOCKLET_LIB')
        subprocess.call([path+"/master/userinit.sh", username])
        logger.info("user %s directory not found, create it" % username)
    sys_run("mkdir -p /var/lib/lxc/%s" % lxcname)
    logger.info("generate config file for %s" % lxcname)

    def config_prepare(content):
        # Fill the %PLACEHOLDER% slots of the lxc config template with the
        # values of this vnode.
        content = content.replace("%ROOTFS%",rootfs)
        content = content.replace("%HOSTNAME%",hostname)
        content = content.replace("%TASKID%",taskid)
        content = content.replace("%CONTAINER_MEMORY%",str(quota.memory))
        # quota.cpu cores -> cfs quota in microseconds (period 100000us)
        content = content.replace("%CONTAINER_CPU%",str(quota.cpu*100000))
        content = content.replace("%FS_PREFIX%",self.fspath)
        content = content.replace("%LXCSCRIPT%",env.getenv("LXC_SCRIPT"))
        content = content.replace("%USERNAME%",username)
        content = content.replace("%LXCNAME%",lxcname)
        content = content.replace("%VETHPAIR%",str(taskid)+"-"+str(vnodeid))
        content = content.replace("%IP%",ipaddr)
        content = content.replace("%BRNAME%",brname)
        content = content.replace("%GATEWAY%",gateway)
        return content

    logger.info(self.confpath)
    # instantiate the template into this container's lxc config
    conffile = open(self.confpath+"/container.batch.conf", 'r')
    conftext = conffile.read()
    conffile.close()
    conftext = config_prepare(conftext)
    conffile = open("/var/lib/lxc/%s/config" % lxcname, 'w')
    conffile.write(conftext)
    conffile.close()
    return [True, ""]
def write_output(self, lxcname, tmplogpath, filepath):
    """Move a temporary log from /root/nfs/<tmplogpath> inside the container
    to the user-requested filepath (also inside the container).

    A no-op returning [True, ""] when filepath is empty, still the unfilled
    default template, or already the same path as the temp log.
    """
    no_move_needed = (
        filepath == ""
        or filepath == "/root/nfs/batch_{jobid}/"
        or os.path.abspath("/root/nfs/" + tmplogpath) == os.path.abspath(filepath)
    )
    if no_move_needed:
        return [True, ""]
    move_cmd = "lxc-attach -n " + lxcname + " -- mv %s %s"
    ret = subprocess.run(move_cmd % ("/root/nfs/" + tmplogpath, filepath),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        msg = ret.stdout.decode(encoding="utf-8")
        logger.error(msg)
        return [False, msg]
    logger.info("Succeed to moving nfs/%s to %s" % (tmplogpath, filepath))
    return [True, ""]
def execute_task(self,username,taskid,vnodeid,envs,lxcname,pkgpath,command,timeout,outpath,token):
    """Run a sub-task command inside the already-running vnode container.

    Writes batch_job.sh into the container's rootfs, runs it via lxc-attach
    with the given env vars, enforces the timeout, moves stdout/stderr to
    the requested redirect paths and reports the final status via add_msg.
    Unlike the controller variant, no container teardown happens here —
    stop_vnode handles that.  Runs in a daemon thread started by start_task.
    """
    lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs/"
    scriptname = "batch_job.sh"
    try:
        # write the job script into the container's rootfs
        scriptfile = open(lxcfspath+"root/"+scriptname,"w")
        scriptfile.write("#!/bin/bash\n")
        scriptfile.write("cd "+str(pkgpath)+"\n")
        scriptfile.write(command)
        scriptfile.close()
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error("Fail to write script file with taskid(%s) vnodeid(%s)" % (str(taskid),str(vnodeid)))
    else:
        # taskid is expected to look like "<jobid>_<x>" here — note the
        # controller variant splits on index 1 instead; TODO confirm format
        try:
            job_id = taskid.split('_')[0]
        except Exception as e:
            logger.error(traceback.format_exc())
            job_id = "_none"
        jobdir = "batch_" + job_id
        logdir = "%s/global/users/%s/data/" % (self.fspath,username) + jobdir
        try:
            os.mkdir(logdir)
        except Exception as e:
            # directory usually exists already; just log and continue
            logger.info("Error when creating logdir :%s "+str(e))
        stdoutname = str(taskid)+"_"+str(vnodeid)+"_stdout.txt"
        stderrname = str(taskid)+"_"+str(vnodeid)+"_stderr.txt"
        try:
            stdoutfile = open(logdir+"/"+stdoutname,"w")
            stderrfile = open(logdir+"/"+stderrname,"w")
            logger.info("Create stdout(%s) and stderr(%s) file to log" % (stdoutname, stderrname))
        except Exception as e:
            logger.error(traceback.format_exc())
            # fall back to inheriting this process's stdio
            stdoutfile = None
            stderrfile = None
        # run the script through lxc-attach, passing envs via -v
        cmd = "lxc-attach -n " + lxcname
        for envkey,envval in envs.items():
            cmd = cmd + " -v %s=%s" % (envkey,envval)
        cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\""
        logger.info('run task with command - %s' % cmd)
        p = subprocess.Popen(cmd,stdout=stdoutfile,stderr=stderrfile, shell=True)
        #logger.info(p)
        # poll every ~2s until the task exits or its time budget runs out;
        # timeout == 0 means "use the global cap" (MAX_RUNNING_TIME)
        if timeout == 0:
            to = MAX_RUNNING_TIME
        else:
            to = timeout
        while p.poll() is None and to > 0:
            time.sleep(min(2,to))
            to -= 2
        if p.poll() is None:
            # still running after the budget: kill and report TIMEOUT
            p.kill()
            logger.info("Running time(%d) is out. Task(%s-%s-%s) will be killed." % (timeout,str(taskid),str(vnodeid),token))
            self.add_msg(taskid,username,vnodeid,rpc_pb2.TIMEOUT,token,"Running time is out.")
        else:
            # move the logs to the user-requested redirect paths (if any)
            [success1,msg1] = self.write_output(lxcname,jobdir+"/"+stdoutname,outpath[0])
            [success2,msg2] = self.write_output(lxcname,jobdir+"/"+stderrname,outpath[1])
            if not success1 or not success2:
                if not success1:
                    msg = msg1
                else:
                    msg = msg2
                logger.info("Output error on Task(%s-%s-%s)." % (str(taskid),str(vnodeid),token))
                self.add_msg(taskid,username,vnodeid,rpc_pb2.OUTPUTERROR,token,msg)
            else:
                # exit status 0 -> COMPLETED, anything else -> FAILED
                if p.poll() == 0:
                    logger.info("Task(%s-%s-%s) completed." % (str(taskid),str(vnodeid),token))
                    self.add_msg(taskid,username,vnodeid,rpc_pb2.COMPLETED,token,"")
                else:
                    logger.info("Task(%s-%s-%s) failed." % (str(taskid),str(vnodeid),token))
                    self.add_msg(taskid,username,vnodeid,rpc_pb2.FAILED,token,"Runtime Error. More information in stderr log.")
def add_msg(self, taskid, username, vnodeid, status, token, errmsg):
    """Queue one TaskMsg for the next report to the master (thread-safe)."""
    with self.msgslock:
        try:
            self.taskmsgs.append(rpc_pb2.TaskMsg(
                taskid=str(taskid),
                username=username,
                vnodeid=int(vnodeid),
                subTaskStatus=status,
                token=token,
                errmsg=errmsg))
        except Exception:
            logger.error(traceback.format_exc())
def report_msg(self):
    """Forever loop: every report_interval seconds, push the queued task
    messages to the master over gRPC.

    NOTE(review): self.taskmsgs is cleared even when stub.report raised, so
    messages produced while the master is unreachable are silently dropped —
    confirm this best-effort behavior is intended.
    """
    channel = grpc.insecure_channel(self.master_ip+":"+self.master_port)
    stub = rpc_pb2_grpc.MasterStub(channel)
    while True:
        self.msgslock.acquire()
        reportmsg = rpc_pb2.ReportMsg(taskmsgs = self.taskmsgs)
        try:
            response = stub.report(reportmsg)
            logger.info("Response from master by reporting: "+str(response.status)+" "+response.message)
        except Exception as err:
            logger.error(traceback.format_exc())
        # queue is cleared after every attempt, successful or not
        self.taskmsgs = []
        self.msgslock.release()
        time.sleep(self.report_interval)
def start_report(self):
    """Spawn the daemon thread that periodically reports task messages to the master."""
    reporter = threading.Thread(target=self.report_msg)
    reporter.setDaemon(True)
    reporter.start()
    logger.info("Start to report task messages to master every %d seconds." % self.report_interval)
def TaskWorkerServe():
    """Start the gRPC Worker service on BATCH_WORKER_PORT and block until Ctrl-C."""
    max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER'))
    worker_port = int(env.getenv('BATCH_WORKER_PORT'))
    logger.info("Max Threads on a worker is %d" % max_threads)
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_threads))
    rpc_pb2_grpc.add_WorkerServicer_to_server(TaskWorker(), server)
    server.add_insecure_port('[::]:'+str(worker_port))
    server.start()
    logger.info("Start TaskWorker Servicer on port:%d" % worker_port)
    # server.start() does not block; keep the main thread alive until interrupted
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
# Entry point: run the batch task worker service when invoked as a script.
if __name__ == "__main__":
    TaskWorkerServe()

View File

@ -57,17 +57,23 @@ class Worker(object):
self.etcd = etcdclient
self.master = self.etcd.getkey("service/master")[1]
self.mode=None
self.mode = None
self.workertype = "normal"
self.key=""
# waiting state is preserved for compatible.
self.etcd.setkey("machines/runnodes/"+self.addr, "waiting")
# get this node's key to judge how to init.
[status, key] = self.etcd.getkey("machines/runnodes/"+self.addr)
if status:
self.key = generatekey("machines/allnodes/"+self.addr)
else:
logger.error("get key failed. %s" % 'machines/runnodes/'+self.addr)
sys.exit(1)
if len(sys.argv) > 1 and sys.argv[1] == "batch-worker":
self.workertype = "batch"
if self.workertype == "normal":
# waiting state is preserved for compatible.
self.etcd.setkey("machines/runnodes/"+self.addr, "waiting")
# get this node's key to judge how to init.
[status, key] = self.etcd.getkey("machines/runnodes/"+self.addr)
if status:
self.key = generatekey("machines/allnodes/"+self.addr)
else:
logger.error("get key failed. %s" % 'machines/runnodes/'+self.addr)
sys.exit(1)
# check token to check global directory
[status, token_1] = self.etcd.getkey("token")
@ -87,7 +93,8 @@ class Worker(object):
if node['key'] == self.key:
value = 'init-recovery'
break
logger.info("worker start in "+value+" mode")
logger.info("worker start in "+value+" mode, worker type is"+self.workertype)
Containers = container.Container(self.addr, etcdclient)
if value == 'init-new':
@ -193,7 +200,8 @@ class Worker(object):
self.hosts_collector.start()
logger.info("Monitor Collector has been started.")
# worker change it state itself. Independedntly from master.
self.etcd.setkey("machines/runnodes/"+self.addr, "work")
if self.workertype == "normal":
self.etcd.setkey("machines/runnodes/"+self.addr, "work")
publicIP = env.getenv("PUBLIC_IP")
self.etcd.setkey("machines/publicIP/"+self.addr,publicIP)
self.thread_sendheartbeat = threading.Thread(target=self.sendheartbeat)
@ -204,17 +212,22 @@ class Worker(object):
# send heardbeat package to keep alive in etcd, ttl=2s
def sendheartbeat(self):
while(True):
# check send heartbeat package every 1s
time.sleep(2)
[status, value] = self.etcd.getkey("machines/runnodes/"+self.addr)
if status:
# master has know the worker so we start send heartbeat package
if value=='ok':
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 3)
else:
logger.error("get key %s failed, master may be crashed" % self.addr)
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
if self.workertype == "normal":
while(True):
# send a heartbeat package every 2 seconds
time.sleep(2)
[status, value] = self.etcd.getkey("machines/runnodes/"+self.addr)
if status:
# the master knows this worker, so keep sending heartbeat packages
if value=='ok':
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
else:
logger.error("get key %s failed, master may be crashed" % self.addr)
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
elif self.workertype == "batch":
while(True):
time.sleep(2)
self.etcd.setkey("machines/batchnodes/"+self.addr, "ok", ttl = 60)
if __name__ == '__main__':

View File

@ -240,21 +240,38 @@ function processInfo()
$("#con_disk").html(usedp+"%<br/>"+detail);
//processNetStats
var net_stats = data.monitor.net_stats;
var in_rate = parseInt(net_stats.bytes_recv_per_sec);
var out_rate = parseInt(net_stats.bytes_sent_per_sec);
ingress_rate = in_rate;
egress_rate = out_rate;
$("#net_in_rate").html(num2human(in_rate)+"Bps");
$("#net_out_rate").html(num2human(out_rate)+"Bps");
$("#net_in_bytes").html(num2human(net_stats.bytes_recv)+"B");
$("#net_out_bytes").html(num2human(net_stats.bytes_sent)+"B");
$("#net_in_packs").html(net_stats.packets_recv);
$("#net_out_packs").html(net_stats.packets_sent);
$("#net_in_err").html(net_stats.errout);
$("#net_out_err").html(net_stats.errin);
$("#net_in_drop").html(net_stats.dropout);
$("#net_out_drop").html(net_stats.dropin);
var net_stats = data.monitor.net_stats;
if(!$.isEmptyObject(net_stats))
{
var in_rate = parseInt(net_stats.bytes_recv_per_sec);
var out_rate = parseInt(net_stats.bytes_sent_per_sec);
ingress_rate = in_rate;
egress_rate = out_rate;
$("#net_in_rate").html(num2human(in_rate)+"Bps");
$("#net_out_rate").html(num2human(out_rate)+"Bps");
$("#net_in_bytes").html(num2human(net_stats.bytes_recv)+"B");
$("#net_out_bytes").html(num2human(net_stats.bytes_sent)+"B");
$("#net_in_packs").html(net_stats.packets_recv);
$("#net_out_packs").html(net_stats.packets_sent);
$("#net_in_err").html(net_stats.errout);
$("#net_out_err").html(net_stats.errin);
$("#net_in_drop").html(net_stats.dropout);
$("#net_out_drop").html(net_stats.dropin);
}
else {
ingress_rate = 0;
egress_rate = 0;
$("#net_in_rate").html("--");
$("#net_out_rate").html("--");
$("#net_in_bytes").html("--");
$("#net_out_bytes").html("--");
$("#net_in_packs").html("--");
$("#net_out_packs").html("--");
$("#net_in_err").html("--");
$("#net_out_err").html("--");
$("#net_in_drop").html("--");
$("#net_out_drop").html("--");
}
},"json");
}

View File

@ -174,6 +174,9 @@
<li id="nav_History">
<a href='/history/'><i class="fa fa-history"></i> <span class="nav-label">History</span></a>
</li>
<li id="nav_Batch">
<a href='/batch_jobs/'><i class="fa fa-tasks"></i> <span class="nav-label">Batch</span></a>
</li>
{% if mysession['usergroup'] == 'root' or mysession['usergroup'] == 'admin'%}
@ -241,7 +244,7 @@
<i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i>
</div>
<!-- Default to the left -->
<strong>Copyright</strong>&copy;&nbsp;2017 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
<strong>Copyright</strong>&copy;&nbsp;2019 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
</footer>

View File

@ -0,0 +1,359 @@
{% extends 'base_AdminLTE.html' %}
{% block title %}Docklet | Create Batch Job{% endblock %}
{% block css_src %}
<!--<style>
.divcontent { overflow-y:scroll; height:200px;}
</style>-->
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
{% endblock %}
{% block panel_title %}Batch Job Info{% endblock %}
{% block panel_list %}
<ol class="breadcrumb">
<li>
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
</li>
</ol>
{% endblock %}
<div>
{% block content %}
<div class="row">
<div class="col-lg-12">
<div class="box box-info">
<div class="box-header with-border">
<h3 class="box-title">Batch Job Create</h3>
<div class="box-tools pull-right">
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
</button>
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
</div>
</div>
<div class="box-body">
<form id="form" class="form-horizontal" action="/batch_job/{{masterips[0].split("@")[0]}}/add/" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<div class="form-group"><label class="col-sm-2 control-label">Job Name</label>
<div class="col-sm-10"><input type="text" class="form-control" name="jobName" id="job_name" required></div>
</div>
<br/>
<div class="form-group"><label class="col-sm-2 control-label">Location</label>
<div class="col-sm-10"><select id="masterselector" class="form-control">
{% for master in masterips %}
<option value="{{master.split("@")[0]}}">{{master.split("@")[1]}}</option>
{% endfor %}
</select></div>
</div>
<div class="hr-line-dashed"></div>
<br/>
<div class="form-group"><label class="col-sm-2 control-label">Priority</label>
<div class="col-sm-10"><select id="priority_selector" class="form-control" name="jobPriority">
{% for priority in range(10) %}
<option value="{{priority}}">{{priority}}</option>
{% endfor %}
</select></div>
</div>
<br/>
<div class="hr-line-dashed"></div>
<div class="panel-group" id="accordion">
<!-- Tasks -->
</div>
<br/>
<div class="hr-line-dashed"></div>
<div class="row">
<div class="form-group">
<div class="col-sm-4 col-sm-offset-2">
<button class="btn btn-primary" type="button" id="add_task" class="btn btn-box-tool" title="add a task">Add Task <i class="fa fa-plus"></i></button>
<button class="btn btn-primary" type="submit">Create Job</button>
</div>
</div>
</div>
</form>
</div>
</div>
</div>
</div>
</div>
{% endblock %}
{% block script_src %}
<script src="//cdn.bootcss.com/pace/1.0.2/pace.min.js"></script>
<!-- Steps -->
<script src="//cdn.bootcss.com/jquery-steps/1.1.0/jquery.steps.min.js"></script>
<!-- Jquery Validate -->
<script src="//cdn.bootcss.com/jquery-validate/1.15.0/jquery.validate.min.js"></script>
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
<script src="//cdn.bootcss.com/datatables-tabletools/2.1.5/js/TableTools.min.js"></script>
<script src="//cdn.bootcss.com/jquery-validate/1.17.0/jquery.validate.js"></script>
<script type="text/javascript">
var task_number = 0;
var mapping_number = 0;
var images_text = "{{ images }}";
images_text = images_text.replace(/&#39;/g,"\"");
console.log(images_text);
var images_info = JSON.parse(images_text);
console.log(images_info);
$().ready(function() {
$("#form").validate();
});
function removeTask(obj) {
$("#task_pannel_" + obj.id).remove();
}
function unfoldTask(obj){
$("#collapse" + obj.id).collapse('hide');
}
function chmountPath(obj,task_num,mapping_num) {
cellid = 'mapping_mountpath_' + task_num + '_' + mapping_num;
$('#'+cellid).val("/root/oss/"+obj.value);
}
function removeMapping(obj) {
$("#mapping_" + obj.id).remove();
}
function addMapping(obj,task_num) {
mapping_number += 1;
var table = $("#storage_mapping_" + obj.id)[0];
var new_mapping = table.insertRow();
new_mapping.id = "mapping_" + task_num + "_" + mapping_number;
var provider = new_mapping.insertCell();
var bucket_name = new_mapping.insertCell();
var accessKey = new_mapping.insertCell();
var secretKey = new_mapping.insertCell();
var endpoint = new_mapping.insertCell();
var mountpath = new_mapping.insertCell();
var remove = new_mapping.insertCell();
bucket_name.innerHTML = '<input type="text" class="form-control" name="mappingBucketName_' + task_num + '_' + mapping_number + '" id="mapping_bucketname_'
+ task_num + '_' + mapping_number + '" onKeyUp="chmountPath(this,'+task_num+','+mapping_number+');" required/>';
accessKey.innerHTML = '<input type="text" class="form-control" name="mappingAccessKey_' + task_num + '_' + mapping_number + '" id="mapping_accessKey_'
+ task_num + '_' + mapping_number + '" required/>';
secretKey.innerHTML = '<input type="text" class="form-control" name="mappingSecretKey_' + task_num + '_' + mapping_number + '" id="mapping_secretKey_'
+ task_num + '_' + mapping_number + '" required/>';
endpoint.innerHTML = 'http://<input type="text" class="form-control" name="mappingEndpoint_' + task_num + '_' + mapping_number + '" id="mapping_endpoint_'
+ task_num + '_' + mapping_number + '" required/>';
mountpath.innerHTML = '<input type="text" class="form-control" name="mappingMountpath_' + task_num + '_' + mapping_number + '" id="mapping_mountpath_'
+ task_num + '_' + mapping_number + '" readonly="true" required/>';
provider.innerHTML = '<select class="form-control" name="mappingProvider_' + task_num + '_' + mapping_number + '" id="mapping_provider_'
+ task_num + '_' + mapping_number + '">'
+'<option>Aliyun</option></select>';
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_num + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
+'Remove</button></div>';
}
$("select#masterselector").change(function() {
var masterip=$(this).children('option:selected').val();
$("#form").attr("action","/batch_job/"+ masterip +"/add/");
var mastername=$(this).children('option:selected').html();
console.log(masterip);
var host = window.location.host;
var images = images_info;
for(var tnum = 1; tnum<=task_number; ++tnum)
{
var imagehtml =
"<thead>"
+"<tr>"
+"<th>ImageName</th>"
+"<th>Type</th>"
+"<th>Owner</th>"
+"<th>Size</th>"
+"<th>Description</th>"
+"<th>Choose</th>"
+"</tr>"
+"</thead>"
+"<tbody>"
+"<tr>"
+"<td>base</td>"
+"<td>public</td>"
+"<td>docklet</td>"
+"<td>--</td>"
+"<td>A base image for you</td>"
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="base_base_base" checked="checked"></label></div></td>'
+"</tr>";
for(var index in images[masterip].private) {
var image = images[masterip].private[index];
imagehtml +=
"<tr>"
+"<td>"+image.name+"</td>"
+"<td>private</td>"
+"<td>{{user}}</td>"
+"<td>"+image.size_format+"</td>"
+'<td><a href="/image/' + masterip + '/description/' + image.name + '_' + '{{user}}' + '_private/" target="_blank">' + image.description + '</a></td>'
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="'+image.name+'_{{user}}_private"></label></div></td>'
+"</tr>";
}
for(var p_user in images[masterip].public) {
for(var index in images[masterip].public[p_user]) {
image=images[masterip].public[p_user][index];
imagehtml +=
"<tr>"
+"<td>"+image.name+"</td>"
+"<td>public</td>"
+"<td>" + p_user + "</td>"
+"<td>"+image.size_format+"</td>"
+'<td><a href="/image/' + masterip + '/description/' + image.name + "_" + p_user + '_public/" target="_blank">' + image.description + '</a></td>'
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="'+image.name+'_'+p_user+'_public"></label></div></td>'
+"</tr>";
}
}
imagehtml += "</tbody>";
$("#imagetable"+tnum).html(imagehtml);
}
});
function addTask() {
task_number += 1;
var masterip=$("select#masterselector").children('option:selected').val();
//mapping_number = 0;
var task_html = '';
task_html +=
'<div class="panel panel-default" id="task_pannel_' + task_number + '">'
+'<div class="panel-heading">'
+'<h4 class="panel-title">'
+'<a data-toggle="collapse" data-panel="#accordion" href="#collapse' + task_number + '">'
+'Task #' + task_number
+'</a><div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="removeTask(this)" class="btn btn-box-tool"><i class="fa fa-times"></i></button></div>'
+'</h4></div>'
+'<div id="collapse' + task_number + '" class="panel-collapse collapse in">'
+'<div class="panel-body">'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">CPU</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="cpuSetting_' + task_number + '" id="cpuSetting_' + task_number + '" value = 1 min="1" max="8" required/>'
+'</div>'
+'<label class="col-sm-2 control-label">Memory</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="memorySetting_' + task_number + '" id="memorySetting_' + task_number + '" value = 1024 min="100" max="8196" required/>'
+'</div>MB</div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">GPU</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="gpuSetting_' + task_number + '" id="gpuSetting_' + task_number + '" value= 0 min="0" max="2" required/>'
+'</div>'
+'<label class="col-sm-2 control-label">Disk</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="diskSetting_' + task_number + '" id="diskSetting_' + task_number + '" value= 1024 min="128" max="10000" required/>'
+'</div>MB</div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">VNode Number</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="vnodeCount_' + task_number + '" id="vnodeCount_' + task_number + '" value= 1 min="1" max="14" required/>'
+'</div>'
+'<label class="col-sm-2 control-label">Max Retry Times</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="retryCount_' + task_number + '" id="retryCount_' + task_number + '" value= 1 min="0" max="5" required/>'
+'</div></div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">Running Path</label>'
+'<div class="col-sm-3"><input type="text" class="form-control" name="srcAddr_' + task_number + '" id="srcAddr_' + task_number + '" value="/root" required/>'
+'</div>'
+'<label class="col-sm-2 control-label">Expire Time</label>'
+'<div class="col-sm-3"><input type="number" class="form-control" name="expTime_' + task_number + '" id="expTime_' + task_number + '" value= 60 min="10" max="86400" required/>'
+'</div>Seconds</div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">Stderr Redirect Path</label>'
+'<div class="col-sm-3"><input type="text" class="form-control" placeholder="/path/to/file or /path/" name="stdErrRedPth_' + task_number + '" id="stdErrRedPth_' + task_number + '" value="/root/nfs/batch_{jobid}/" required/>'
+'</div>'
+'<label class="col-sm-2 control-label">Stdout Redirect Path</label>'
+'<div class="col-sm-3"><input type="text" class="form-control" placeholder="/path/to/file or /path/" name="stdOutRedPth_' + task_number + '" id="stdOutRedPth_' + task_number + '" value="/root/nfs/batch_{jobid}/" required/>'
+'</div></div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">Dependency&nbsp;<i class="fa fa-question-circle" title="The task ids that this task depends on, separate them with commas, eg: 1, 2"></i></label>'
+'<div class="col-sm-3"><input type="text" class="form-control" name="dependency_' + task_number + '" id="dependency_' + task_number + '" />'
+'</div>'
+'<label class="col-sm-2 control-label">Command</label>'
+'<div class="col-sm-3"><input type="text" class="form-control" name="command_' + task_number + '" id="command_' + task_number + '" required/>'
+'</div></div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">Run on: </label>'
+'<div class="col-sm-3"><input type="radio" name="runon_' + task_number + '" value="all" checked="checked"/>All vnodes &nbsp;'
+' <input type="radio" name="runon_' + task_number + '" value="master" />One vnode(master)</div>'
+'<label class="col-sm-2 control-label">Start at the Same Time</label>'
+'<div class="col-sm-3"><input type="checkbox" name="atSameTime_' + task_number + '" checked="checked"/>'
+'</div></div>'
var images = images_info
task_html +=
'<div class="form-group"><label class="col-sm-2 control-label">Image Choose</label>'
+'<div class="col-sm-10">'
+'<table id="imagetable' + task_number +'" class="table table-striped table-bordered table-hover table-image" >'
+"<thead>"
+"<tr>"
+"<th>ImageName</th>"
+"<th>Type</th>"
+"<th>Owner</th>"
+"<th>Size</th>"
+"<th>Description</th>"
+"<th>Choose</th>"
+"</tr>"
+"</thead>"
+"<tbody>"
+"<tr>"
+"<td>base</td>"
+"<td>public</td>"
+"<td>docklet</td>"
+"<td>--</td>"
+"<td>A base image for you</td>"
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="base_base_base" checked="checked"></label></div></td>'
+"</tr>";
for(var index in images[masterip].private) {
var image = images[masterip].private[index];
task_html +=
"<tr>"
+"<td>"+image.name+"</td>"
+"<td>private</td>"
+"<td>{{user}}</td>"
+"<td>"+image.size_format+"</td>"
+'<td><a href="/image/' + masterip + '/description/' + image.name + '_' + '{{user}}' + '_private/" target="_blank">' + image.description + '</a></td>'
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="'+image.name+'_{{user}}_private"></label></div></td>'
+"</tr>";
}
for(var p_user in images[masterip].public) {
for(var index in images[masterip].public[p_user]) {
image=images[masterip].public[p_user][index];
task_html +=
"<tr>"
+"<td>"+image.name+"</td>"
+"<td>public</td>"
+"<td>" + p_user + "</td>"
+"<td>"+image.size_format+"</td>"
+'<td><a href="/image/' + masterip + '/description/' + image.name + "_" + p_user + '_public/" target="_blank">' + image.description + '</a></td>'
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="'+image.name+'_'+p_user+'_public"></label></div></td>'
+"</tr>";
}
}
task_html +=
'</tbody></table>'
+'</div>'
+'</div>'
+'<div class="form-group">'
+'<label class="col-sm-2 control-label">Object Storage Mapping<br/>'
+'<button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this,'+task_number+')">'
+'Add<i class="fa fa-plus"></i></button></label>'
+'<div class="col-sm-10"><table class="table table-bordered" id="storage_mapping_' + task_number + '">'
+'<thead>'
+'<tr><th>Provider</th><th>Bucket Name</th><th>AccessKey ID</th><th>AccessKey Secret</th><th>Endpoint</th><th>Mount Path</th><th>Remove</th></tr>'
+'</thead>'
+'<tbody>'
+'</tbody>'
+'</table></div>'
+'</div>'
+'<div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
+'</div></div></div>'
$(task_html).appendTo("#accordion");
}
addTask();
$("#add_task").click(addTask);
</script>
{% endblock %}

View File

@ -0,0 +1,264 @@
{% extends 'base_AdminLTE.html' %}
{% block title %}Docklet | Batch Job Info{% endblock %}
{% block panel_title %}Info for {{ jobinfo['job_id'] }}{% endblock %}
{% block css_src %}
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
{% endblock %}
{% block panel_list %}
<ol class="breadcrumb">
<li>
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
</li>
<li>
<a href='/batch_jobs/'>Batch Job</a>
</li>
<li class='active'>
<strong>Info</strong>
</li>
</ol>
{% endblock %}
{% block content %}
<div class="row">
<div class="col-md-12">
<div class="box box-info">
<div class="box-header with-border">
<h3 class="box-title">Overview</h3>
<div class="box-tools pull-right">
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
</button>
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
</div>
</div>
<div class="box-body table-responsive">
<table class="table table-bordered">
<thead>
<tr>
<th>Job ID</th>
<th>Name</th>
<th>Priority</th>
<th>Status</th>
<th>Create Time</th>
<th>End Time</th>
<th>Billing</th>
</tr>
</thead>
<tbody>
<tr>
<td>{{ jobinfo['job_id'] }}</td>
<td>{{ jobinfo['job_name'] }}</td>
<td>{{ jobinfo['priority'] }}</td>
<td>{{ jobinfo['status'] }}</td>
<td>{{ jobinfo['create_time'] }}</td>
<td>{{ jobinfo['end_time'] }}</td>
<td>{{ jobinfo['billing'] }} <img src='/static/img/bean.png' /></td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
<div class="row">
<div class="col-md-12">
<div class="box box-info">
<div class="box-header with-border">
<h3 class="box-title">Tasks Overview</h3>
<div class="box-tools pull-right">
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
</button>
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
</div>
</div>
<div class="box-body table-responsive">
<table width="100%" cellspacing="0" style="margin: 0 auto;" id="table-tasks" class="table table-striped table-bordered table-hover">
<thead>
<tr>
<th>Task Index</th>
<th>Status</th>
<th>Failed Reason(if fails)</th>
<th>Tried Times</th>
<th>Start Time</th>
<th>End Time</th>
<th>Total Running Time</th>
<th>Billing</th>
</tr>
</thead>
<tbody>
{% for task in jobinfo['tasks'] %}
<tr>
<td>{{ task['idx'] }}</td>
<td>{{ task['status'] }}</td>
<td>{{ task['failed_reason'] }}</td>
<td>{{ task['tried_times'] }}</td>
<td>{{ task['start_time'] }}</td>
<td>{{ task['end_time'] }}</td>
<td>{{ task['running_time'] }} s</td>
<td>{{ task['billing'] }} <img src='/static/img/bean.png' /></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>
<div class="row">
<div class="col-md-12">
<div class="box box-info">
<div class="box-header with-border">
<h3 class="box-title">Tasks Configs</h3>
<div class="box-tools pull-right">
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
</button>
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
</div>
</div>
<div class="box-body">
{% for task in jobinfo['tasks'] %}
<div class="panel panel-default" id="task_pannel_{{ task['idx'] }}">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" data-panel="#accordion" href="#collapse{{ task['idx'] }}">
Task #{{ task['idx'] }}
</a>
</h4>
</div>
<div id="collapse{{ task['idx'] }}" class="panel-collapse collapse in">
<div class="panel-body">
<div class="table-responsive">
<table class="table table-bordered table-hover">
<thead>
<tr>
<th>CPU Cores</th>
<th>Memory</th>
<th>GPU</th>
<th>Disk</th>
<th>VNode Number</th>
<th>Max Retry Times</th>
</tr>
</thead>
<tbody>
<tr>
<td>{{ task['config']['cpuSetting'] }}</td>
<td>{{ task['config']['memorySetting'] }} MB</td>
<td>{{ task['config']['gpuSetting'] }}</td>
<td>{{ task['config']['diskSetting'] }} MB</td>
<td>{{ task['config']['vnodeCount'] }}</td>
<td>{{ task['config']['retryCount'] }}</td>
</tr>
</tbody>
<thead>
<tr>
<th>Running Path</th>
<th>Expire Time</th>
<th>Stdout Redirect Path</th>
<th>Stderr Redirect Path</th>
<th>Dependency</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>{{ task['config']['srcAddr'] }}</td>
<td>{{ task['config']['expTime'] }} seconds</td>
<td>{{ task['config']['stdOutRedPth'] }}</td>
<td>{{ task['config']['stdErrRedPth'] }}</td>
<td>{{ task['config']['dependency'] }}</td>
<td>{{ task['config']['command'] }}</td>
</tr>
</tbody>
<thead>
<tr>
<th>Run on</th>
<th>Start at the Same Time</th>
<th>Image Name</th>
<th>Image Owner</th>
<th>Image Type</th>
</tr>
</thead>
<tbody>
<tr>
{% if task['config']['runon'] == 'all' %}
<td>all vnodes</td>
{% else %}
<td>master vnode</td>
{% endif %}
{% if 'atSameTime' in task['config'].keys() %}
<td>True</td>
{% else %}
<td>False</td>
{% endif %}
{% if task['config']['image'] == 'base_base_base' %}
<td>base</td>
<td>docklet</td>
<td>public</td>
{% else %}
<td>{{ task['config']['image'].split('_')[0] }}</td>
<td>{{ task['config']['image'].split('_')[1] }}</td>
<td>{{ task['config']['image'].split('_')[2] }}</td>
{% endif %}
</tr>
</tbody>
</table>
</div>
{% if 'mapping' in task['config'].keys() %}
<div class="table-responsive">
<table class="table table-bordered table-hover">
<thead>
<tr>
<th>Provider</th>
<th>Bucket Name</th>
<th>AccessKey ID</th>
<th>Endpoint</th>
<th>Mount Path</th>
</tr>
</thead>
<tbody>
{% for key in task['config']['mapping'].keys() %}
<tr>
<td>{{ task['config']['mapping'][key]['mappingProvider'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingBucketName'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingAccessKey'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingEndpoint'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingMountpath'] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endif %}
</div>
</div>
</div>
{% endfor %}
</div>
</div>
</div>
</div>
{% endblock %}
{% block script_src %}
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$("#table-tasks").DataTable({"scrollX":true,"order":[[ 0, "asc" ]]});
});
</script>
{% endblock %}

View File

@ -0,0 +1,147 @@
{% extends "base_AdminLTE.html"%}
{% block title %}Docklet | Batch Job{% endblock %}
{% block panel_title %}Batch Job{% endblock %}
{% block css_src %}
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
{% endblock %}
{% block panel_list %}
<ol class="breadcrumb">
<li>
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
</li>
<li class="active">
<strong>Batch Job</strong>
</li>
</ol>
{% endblock %}
{% block content %}
<div class="row">
<div class="col-lg-12">
<div class="box box-info">
<div class="box-header with-border">
<h3 class="box-title">Batch Job List</h3>
<div class="box-tools pull-right">
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
</button>
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
</div>
</div>
<div class="box-body">
<p>
<a href="/batch_job/create/"><button type="button" class="btn btn-primary btn-sm"><i class="fa fa-plus"></i> Create Batch Job</button></a>
</p>
{% for master in masterips %}
{% for job_info in job_list[master.split('@')[0]] %}
<div class="modal inmodal" id='OutputModal_{{ master.split('@')[1] }}_{{ job_info['job_id'] }}' tabindex="-1" role="dialog" aria-hidden="true">
<div class="modal-dialog">
<div class="modal-content animated fadeIn">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal"><span aria-hidden="true">&times;</span><span class="sr-only">Close</span></button>
<h4 class="modal-title">Job:{{ job_info['job_name'] }}({{ job_info['job_id'] }}) Stdout and Stderr of tasks</h4>
</div>
<div class="modal-body">
<table width="100%" cellspacing="0" class="table table-bordered table-striped table-hover table-output">
<thead>
<tr>
<th>Task ID</th>
<th>Vnode ID</th>
<th>Stdout</th>
<th>Stderr</th>
</tr>
</thead>
<tbody>
{% for taskid in job_info['tasks'] %}
{% for vnodeid in range(job_info['tasks_vnodeCount'][taskid]) %}
<tr>
<td>{{ taskid }}</td>
<td>{{ vnodeid }}</td>
<td><a class="btn btn-info btn-xs" href='/batch_job/output/{{ master.split('@')[0] }}/{{ job_info["job_id"] }}/{{ taskid }}/{{ vnodeid }}/stdout/' target="_blank">Stdout</a></td>
<td><a class="btn btn-info btn-xs" href='/batch_job/output/{{ master.split('@')[0] }}/{{ job_info["job_id"] }}/{{ taskid }}/{{ vnodeid }}/stderr/' target="_blank">Stderr</a></td>
</tr>
{% endfor %}
{% endfor %}
</tbody>
</table>
<div class="modal-footer">
<button type="button" class="btn btn-white" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
{% endfor %}
{% endfor %}
<div class="table">
<table width="100%" cellspacing="0" style="margin: 0 auto;" class="table table-striped table-bordered table-hover table-batch">
<thead>
<tr>
<th>Location</th>
<th>ID</th>
<th>Name</th>
<th>Status</th>
<th>Operations</th>
<th>Create Time</th>
<th>End Time</th>
<th>Billing</th>
<th>Stdout and Stderr</th>
<th>Detailed Info</th>
</tr>
</thead>
<tbody>
{% for master in masterips %}
{% for job_info in job_list[master.split('@')[0]] %}
<tr>
<td>{{ master.split('@')[1] }}</td>
<td>{{ job_info['job_id'] }}</td>
<td>{{ job_info['job_name'] }}</td>
<td>
{{ job_info['status'] }}
</td>
{% if job_info['status'] == 'done' or job_info['status'] == 'failed' or job_info['status'] == 'stopping' or job_info['status'] == 'stopped'%}
<td><button type="button" class="btn btn-xs btn-default"> &nbsp;Stop&nbsp;&nbsp; </button></td>
{% else %}
<td><a href="/batch_job/{{master.split("@")[0]}}/stop/{{ job_info['job_id'] }}/"><button type="button" class="btn btn-xs btn-danger"> &nbsp;Stop&nbsp; </button></a></td>
{% endif %}
<td>{{ job_info['create_time'] }}</td>
<td>{{ job_info['end_time'] }}</td>
<td>{{ job_info['billing'] }} <img src='/static/img/bean.png' /></td>
<td><a role="button" class="btn btn-info btn-xs" id='{{ master }}_{{ job_info['job_id'] }}_output' data-toggle="modal" data-target='#OutputModal_{{ master.split('@')[1] }}_{{ job_info['job_id'] }}'>Get Output</a></td>
<td><a href="/batch_job/{{master.split("@")[0]}}/info/{{ job_info['job_id'] }}/"><button type="button" class="btn btn-xs btn-info"> &nbsp;Info&nbsp; </button></a></td>
</tr>
{% endfor %}
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
{% endblock %}
{% block script_src %}
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$(".table-batch").DataTable({"scrollX":true,"order":[[ 5, "desc" ]]});
$(".table-output").DataTable({
"lengthChange":false});
});
function sendAdd(){
document.getElementById("addForm").submit();
}
function sendDel(){
document.getElementById("delForm").submit();
}
</script>
{% endblock %}

View File

@ -0,0 +1,62 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Docklet | Batch {{ issue }}: {{ jobid }}/{{ taskid }}/{{ vnodeid }}</title>
<!-- Tell the browser to be responsive to screen width -->
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport">
<link rel="shortcut icon" href="/static/img/favicon.ico">
<link href="//cdn.bootcss.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet">
<!-- Font Awesome -->
<link href="//cdn.bootcss.com/font-awesome/4.3.0/css/font-awesome.min.css" rel="stylesheet">
<!-- Ionicons -->
<link href="//cdn.bootcss.com/ionicons/2.0.1/css/ionicons.min.css" rel="stylesheet">
<link href="//cdn.bootcss.com/animate.css/3.5.1/animate.min.css" rel="stylesheet">
<link href="//cdn.bootcss.com/toastr.js/latest/css/toastr.min.css" rel="stylesheet">
<!-- Theme style -->
<link rel="stylesheet" href="/static/dist/css/AdminLTE.min.css">
<link rel="stylesheet" href="/static/dist/css/skins/skin-blue.min.css">
</head>
<body>
<h3>Jobid: {{ jobid }}</h3>
<h3>Taskid: {{ taskid }}</h3>
<h3>VNodeid: {{ vnodeid }}</h3>
<h4><small>The output of {{ issue }} is refreshed every 2 seconds.</small></h4>
<hr>
<pre id="output">{{ output }}</pre>
<!-- jQuery 2.2.1 -->
<script src="//cdn.bootcss.com/jquery/2.2.1/jquery.min.js"></script>
<!-- Bootstrap 3.3.5 -->
<script src="//cdn.bootcss.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
<!-- AdminLTE App -->
<script src="/static/dist/js/app.min.js"></script>
<script src="//cdn.bootcss.com/fastclick/1.0.6/fastclick.min.js"></script>
<script src="//cdn.bootcss.com/jQuery-slimScroll/1.3.7/jquery.slimscroll.min.js"></script>
<script src="//cdn.bootcss.com/toastr.js/latest/js/toastr.min.js"></script>
<script type="text/javascript">
$.ajaxSetup({
headers: {'X-CSRFToken':'{{ csrf_token() }}'},
});
function updateOutput()
{
var host = window.location.host;
url = "//" + host + "/batch/job/output/" + "{{ masterip }}" + "/" + "{{ jobid }}" + "/" + "{{ taskid }}" + "/" + "{{ vnodeid }}" + "/" + "{{ issue }}" + "/";
$.post(url,{},function(data){
$("#output").text(String(data.data));
},"json");
}
setInterval(updateOutput,2000);
</script>
</body>
</html>

View File

@ -41,6 +41,7 @@ from webViews.reportbug import *
from webViews.authenticate.auth import login_required, administration_required,activated_required
from webViews.authenticate.register import registerView
from webViews.authenticate.login import loginView, logoutView
from webViews.batch import *
import webViews.dockletrequest
from webViews import cookie_tool
import traceback
@ -127,6 +128,59 @@ def reportBug():
reportBugView.bugmessage = request.form['bugmessage']
return reportBugView.as_view()
@app.route("/batch_jobs/", methods=['GET'])
@login_required
def batch_job():
    """Render the batch-job list page aggregated over all masters."""
    view = batchJobListView()
    return view.as_view()
@app.route("/batch_job/create/", methods=['GET'])
@login_required
def create_batch_job():
    """Render the form for submitting a new batch job."""
    view = createBatchJobView()
    return view.as_view()
@app.route("/batch_job/<masterip>/add/", methods=['POST'])
@login_required
def add_batch_job(masterip):
    """Forward the submitted job form to the given master and show the list."""
    view = addBatchJobView
    view.masterip = masterip
    view.job_data = request.form
    return view().as_view()
@app.route("/batch_job/<masterip>/stop/<jobid>/", methods=['GET'])
@login_required
def stop_batch_job(masterip, jobid):
    """Ask the given master to stop a job, then return to the job list.

    NOTE(review): a state-changing action served over GET — consider POST.
    """
    view = stopBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    return view().as_view()
@app.route("/batch_job/<masterip>/info/<jobid>/", methods=['GET'])
@login_required
def info_batch_job(masterip, jobid):
    """Render the detail page of one batch job on one master."""
    view = infoBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    return view().as_view()
@app.route("/batch_job/output/<masterip>/<jobid>/<taskid>/<vnodeid>/<issue>/", methods=['GET'])
@login_required
def output_batch_job(masterip, jobid, taskid, vnodeid, issue):
    """Render the live-output page for one task instance.

    'issue' selects which output to show — presumably stdout/stderr;
    confirm against the master-side /batch/job/output/ handler.
    """
    view = outputBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    view.taskid = taskid
    view.vnodeid = vnodeid
    view.issue = issue
    return view().as_view()
@app.route("/batch/job/output/<masterip>/<jobid>/<taskid>/<vnodeid>/<issue>/", methods=['POST'])
@login_required
def output_batch_job_request(masterip, jobid, taskid, vnodeid, issue):
    """AJAX endpoint polled by the output page: proxy the master's reply as JSON."""
    payload = {'jobid': jobid, 'taskid': taskid, 'vnodeid': vnodeid, 'issue': issue}
    result = dockletRequest.post("/batch/job/output/", payload, masterip)
    return json.dumps(result)
@app.route("/workspace/create/", methods=['GET'])
#@activated_required
def addCluster():

108
web/webViews/batch.py Normal file
View File

@ -0,0 +1,108 @@
from flask import session, redirect, request
from webViews.view import normalView
from webViews.log import logger
from webViews.checkname import checkname
from webViews.dockletrequest import dockletRequest
import json
class batchJobListView(normalView):
    """List batch jobs of every docklet master, keyed by master IP."""
    template_path = "batch/batch_list.html"

    @classmethod
    def get(self):
        # post_to_all() returns master identifiers of the form "ip@name";
        # each master is queried individually for its job list.
        masterips = dockletRequest.post_to_all()
        job_list = {}
        for ipname in masterips:
            ip = ipname.split("@")[0]
            result = dockletRequest.post("/batch/job/list/", {}, ip)
            job_list[ip] = result.get("data")
            logger.debug("job_list[%s]: %s" % (ip, job_list[ip]))
        # Fix: the original wrapped this in 'if True:' with an unreachable
        # 'else: return self.error()' branch — dead code removed.
        return self.render(self.template_path, masterips=masterips, job_list=job_list)
class createBatchJobView(normalView):
    """Render the job-creation form with each master's available images."""
    template_path = "batch/batch_create.html"

    @classmethod
    def get(self):
        masterips = dockletRequest.post_to_all()
        images = {}
        for master in masterips:
            # master identifiers look like "ip@name"; key the result by ip.
            ip = master.split("@")[0]
            images[ip] = dockletRequest.post("/image/list/", {}, ip).get("images")
        logger.info(images)
        return self.render(self.template_path, masterips=masterips, images=images)
class infoBatchJobView(normalView):
    """Show the detail page of one batch job on one master."""
    template_path = "batch/batch_info.html"
    error_path = "error.html"
    # Set by the route handler before as_view() is called.
    masterip = ""
    jobid = ""

    @classmethod
    def get(self):
        payload = {'jobid': self.jobid}
        result = dockletRequest.post("/batch/job/info/", payload, self.masterip)
        jobinfo = result.get("data")
        logger.info(str(jobinfo))
        if result.get('success', "") != "true":
            return self.render(self.error_path, message=result.get('message'))
        return self.render(self.template_path, masterip=self.masterip, jobinfo=jobinfo)
class addBatchJobView(normalView):
    """Submit a new batch job to one master, then return to the job list."""
    template_path = "batch/batch_list.html"
    error_path = "error.html"

    @classmethod
    def post(self):
        # masterip and job_data are set by the route handler.
        result = dockletRequest.post("/batch/job/add/", self.job_data, self.masterip)
        if result.get('success', None) != "true":
            return self.render(self.error_path, message=result.get('message'))
        return redirect('/batch_jobs/')
class stopBatchJobView(normalView):
    """Ask one master to stop a job, then return to the job list."""
    template_path = "batch/batch_list.html"
    error_path = "error.html"

    @classmethod
    def get(self):
        # masterip and jobid are set by the route handler.
        result = dockletRequest.post("/batch/job/stop/", {'jobid': self.jobid}, self.masterip)
        if result.get('success', None) != "true":
            return self.render(self.error_path, message=result.get('message'))
        return redirect('/batch_jobs/')
class outputBatchJobView(normalView):
    """Show the current output of one task instance of a batch job."""
    template_path = "batch/batch_output.html"
    error_path = "error.html"  # added for consistency with the sibling batch views
    # Set by the route handler before as_view() is called.
    masterip = ""
    jobid = ""
    taskid = ""
    vnodeid = ""
    issue = ""

    @classmethod
    def get(self):
        data = {
            'jobid': self.jobid,
            'taskid': self.taskid,
            'vnodeid': self.vnodeid,
            'issue': self.issue
        }
        result = dockletRequest.post("/batch/job/output/", data, self.masterip)
        output = result.get("data")
        if result.get('success', "") == "true":
            return self.render(self.template_path, masterip=self.masterip, jobid=self.jobid,
                taskid=self.taskid, vnodeid=self.vnodeid, issue=self.issue, output=output)
        else:
            # Fix: surface the master's error message like every other batch
            # view, instead of the generic self.error() page.
            return self.render(self.error_path, message=result.get('message'))

View File

@ -21,7 +21,6 @@ class statusView(normalView):
print(quotainfo)'''
allcontainers = {}
if (result):
containers = {}
for master in allclusters:
allcontainers[master] = {}
for cluster in allclusters[master]:
@ -32,6 +31,18 @@ class statusView(normalView):
else:
self.error()
allcontainers[master][cluster] = message
message = dockletRequest.post('/batch/vnodes/list/', data, master.split("@")[0])
message = message.get('data')
containers = []
for m in message:
container = {}
container['containername'] = m
container['ip'] = '--'
containers.append(container)
tmp = {}
tmp['containers'] = containers
tmp['status'] = 'running'
allcontainers[master]['Batch_Job'] = tmp
return self.render(self.template_path, quotas = quotas, quotanames = quotanames, allcontainers = allcontainers, user = session['username'])
else:
self.error()