commit
0deaa377c6
|
@ -18,9 +18,6 @@ DOCKLET_USER=$DOCKLET_HOME/user
|
|||
# default working directory, default to /opt/docklet
|
||||
FS_PREFIX=/opt/docklet
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
#network interface , default is eth0
|
||||
NETWORK_DEVICE=eth0
|
||||
#etcd server address, default is localhost:2379
|
||||
|
@ -32,6 +29,8 @@ WEB_PORT=8888
|
|||
USER_PORT=9100
|
||||
#cluster net, default is 172.16.0.1/16
|
||||
CLUSTER_NET="172.16.0.1/16"
|
||||
# ip addresses range of containers for batch job, default is 10.16.0.0/16
|
||||
BATCH_NET="10.16.0.0/16"
|
||||
#configurable-http-proxy public port, default is 8000
|
||||
PROXY_PORT=8000
|
||||
#configurable-http-proxy api port, default is 8001
|
||||
|
@ -42,6 +41,9 @@ DISTRIBUTED_GATEWAY=False
|
|||
|
||||
export FS_PREFIX
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
# This next line determines what user the script runs as.
|
||||
DAEMON_USER=root
|
||||
|
||||
|
@ -103,6 +105,7 @@ pre_start_master () {
|
|||
# iptables for NAT network for containers to access web
|
||||
iptables -t nat -F
|
||||
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
|
||||
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -18,9 +18,6 @@ DOCKLET_USER=$DOCKLET_HOME/user
|
|||
# default working directory, default to /opt/docklet
|
||||
FS_PREFIX=/opt/docklet
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
#configurable-http-proxy public port, default is 8000
|
||||
PROXY_PORT=8000
|
||||
#configurable-http-proxy api port, default is 8001
|
||||
|
@ -36,11 +33,16 @@ WEB_PORT=8888
|
|||
USER_PORT=9100
|
||||
#cluster net, default is 172.16.0.1/16
|
||||
CLUSTER_NET="172.16.0.1/16"
|
||||
# ip addresses range of containers for batch job, default is 10.16.0.0/16
|
||||
BATCH_NET="10.16.0.0/16"
|
||||
|
||||
. $DOCKLET_CONF/docklet.conf
|
||||
|
||||
export FS_PREFIX
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
# This next line determines what user the script runs as.
|
||||
DAEMON_USER=root
|
||||
|
||||
|
@ -103,6 +105,7 @@ pre_start_master () {
|
|||
# iptables for NAT network for containers to access web
|
||||
iptables -t nat -F
|
||||
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
|
||||
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -20,19 +20,21 @@ FS_PREFIX=/opt/docklet
|
|||
|
||||
# cluster net ip range, default is 172.16.0.1/16
|
||||
CLUSTER_NET="172.16.0.1/16"
|
||||
# ip addresses range of containers for batch job, default is 10.16.0.0/16
|
||||
BATCH_NET="10.16.0.0/16"
|
||||
#configurable-http-proxy public port, default is 8000
|
||||
PROXY_PORT=8000
|
||||
#configurable-http-proxy api port, default is 8001
|
||||
PROXY_API_PORT=8001
|
||||
DISTRIBUTED_GATEWAY=False
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
. $DOCKLET_CONF/docklet.conf
|
||||
|
||||
export FS_PREFIX
|
||||
|
||||
RUN_DIR=$FS_PREFIX/local/run
|
||||
LOG_DIR=$FS_PREFIX/local/log
|
||||
|
||||
# This next line determines what user the script runs as.
|
||||
DAEMON_USER=root
|
||||
|
||||
|
@ -43,6 +45,13 @@ DAEMON_OPTS=
|
|||
# The process ID of the script when it runs is stored here:
|
||||
PIDFILE=$RUN_DIR/$DAEMON_NAME.pid
|
||||
|
||||
# settings for docklet batch worker, which is required for batch job processing system
|
||||
BATCH_ON=True
|
||||
DAEMON_BATCH=$DOCKLET_LIB/worker/taskworker.py
|
||||
DAEMON_NAME_BATCH=docklet-taskworker
|
||||
PIDFILE_BATCH=$RUN_DIR/batch.pid
|
||||
DAEMON_OPTS_BATCH=
|
||||
|
||||
# settings for docklet proxy, which is required for web access
|
||||
DAEMON_PROXY=`which configurable-http-proxy`
|
||||
DAEMON_NAME_PROXY=docklet-proxy
|
||||
|
@ -83,6 +92,7 @@ pre_start () {
|
|||
# iptables for NAT network for containers to access web
|
||||
iptables -t nat -F
|
||||
iptables -t nat -A POSTROUTING -s $CLUSTER_NET -j MASQUERADE
|
||||
iptables -t nat -A POSTROUTING -s $BATCH_NET -j MASQUERADE
|
||||
|
||||
if [ ! -d $FS_PREFIX/local/basefs ]; then
|
||||
log_daemon_msg "basefs does not exist, run prepare.sh first" && exit 1
|
||||
|
@ -95,12 +105,27 @@ pre_start () {
|
|||
|
||||
do_start() {
|
||||
pre_start
|
||||
|
||||
DAEMON_OPTS=$1
|
||||
log_daemon_msg "Starting $DAEMON_NAME in $FS_PREFIX"
|
||||
#python3 $DAEMON
|
||||
start-stop-daemon --start --oknodo --background --pidfile $PIDFILE --make-pidfile --user $DAEMON_USER --chuid $DAEMON_USER --startas $DAEMON -- $DAEMON_OPTS
|
||||
log_end_msg $?
|
||||
}
|
||||
|
||||
do_start_batch () {
|
||||
if [ "$BATCH_ON" = "False" ]
|
||||
then
|
||||
return 1
|
||||
fi
|
||||
log_daemon_msg "Starting $DAEMON_NAME_BATCH in $FS_PREFIX"
|
||||
|
||||
DAEMON_OPTS_BATCH=""
|
||||
|
||||
start-stop-daemon --start --background --pidfile $PIDFILE_BATCH --make-pidfile --user $DAEMON_USER --chuid $DAEMON_USER --startas $DAEMON_BATCH -- $DAEMON_OPTS_BATCH
|
||||
log_end_msg $?
|
||||
}
|
||||
|
||||
do_start_proxy () {
|
||||
if [ "$DISTRIBUTED_GATEWAY" = "False" ]
|
||||
then
|
||||
|
@ -118,6 +143,16 @@ do_stop () {
|
|||
log_end_msg $?
|
||||
}
|
||||
|
||||
do_stop_batch () {
|
||||
if [ "$BATCH_ON" = "False" ]
|
||||
then
|
||||
return 1
|
||||
fi
|
||||
log_daemon_msg "Stopping $DAEMON_NAME_BATCH daemon"
|
||||
start-stop-daemon --stop --quiet --oknodo --remove-pidfile --pidfile $PIDFILE_BATCH --retry 10
|
||||
log_end_msg $?
|
||||
}
|
||||
|
||||
do_stop_proxy () {
|
||||
if [ "$DISTRIBUTED_GATEWAY" = "False" ]
|
||||
then
|
||||
|
@ -145,12 +180,14 @@ do_stop_meter() {
|
|||
|
||||
case "$1" in
|
||||
start)
|
||||
do_start
|
||||
do_start "normal-worker"
|
||||
do_start_batch
|
||||
do_start_proxy
|
||||
;;
|
||||
|
||||
stop)
|
||||
do_stop
|
||||
do_stop_batch
|
||||
do_stop_proxy
|
||||
;;
|
||||
start-meter)
|
||||
|
@ -161,6 +198,16 @@ case "$1" in
|
|||
do_stop_meter
|
||||
;;
|
||||
|
||||
start_batch)
|
||||
do_start "batch-worker"
|
||||
do_start_batch
|
||||
;;
|
||||
|
||||
stop_batch)
|
||||
do_stop
|
||||
do_stop_batch
|
||||
;;
|
||||
|
||||
start_proxy)
|
||||
do_start_proxy
|
||||
;;
|
||||
|
@ -176,13 +223,16 @@ case "$1" in
|
|||
|
||||
restart)
|
||||
do_stop
|
||||
do_stop_batch
|
||||
do_stop_proxy
|
||||
do_start
|
||||
do_start "normal-worker"
|
||||
do_start_batch
|
||||
do_start_proxy
|
||||
;;
|
||||
|
||||
status)
|
||||
status_of_proc -p $PIDFILE "$DAEMON" "$DAEMON_NAME" && exit 0 || exit $?
|
||||
status_of_proc -p $PIDFILE_BATCH "$DAEMON_BATCH" "$DAEMON_NAME_BATCH" || status=$?
|
||||
status_of_proc -p $PIDFILE_PROXY "$DAEMON_PROXY" "$DAEMON_NAME_PROXY" || status=$?
|
||||
;;
|
||||
*)
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
# This is the common container.conf for all containers.
|
||||
# If want set custom settings, you have two choices:
|
||||
# 1. Directly modify this file, which is not recommend, because the
|
||||
# setting will be overriden when new version container.conf released.
|
||||
# 2. Use a custom config file in this conf directory: lxc.custom.conf,
|
||||
# it uses the same grammer as container.conf, and will be merged
|
||||
# with the default container.conf by docklet at runtime.
|
||||
#
|
||||
# The following is an example mounting user html directory
|
||||
# lxc.mount.entry = /public/home/%USERNAME%/public_html %ROOTFS%/root/public_html none bind,rw,create=dir 0 0
|
||||
#
|
||||
|
||||
#### include /usr/share/lxc/config/ubuntu.common.conf
|
||||
lxc.include = /usr/share/lxc/config/ubuntu.common.conf
|
||||
|
||||
############## DOCKLET CONFIG ##############
|
||||
|
||||
# Setup 0 tty devices
|
||||
lxc.tty = 0
|
||||
|
||||
lxc.rootfs = %ROOTFS%
|
||||
lxc.utsname = %HOSTNAME%
|
||||
|
||||
lxc.network.type = veth
|
||||
lxc.network.name = eth0
|
||||
# veth.pair is limited in 16 bytes
|
||||
lxc.network.veth.pair = %VETHPAIR%
|
||||
lxc.network.script.up = Bridge=%BRNAME% %LXCSCRIPT%/lxc-ifup
|
||||
lxc.network.script.down = Bridge=%BRNAME% %LXCSCRIPT%/lxc-ifdown
|
||||
lxc.network.ipv4 = %IP%
|
||||
lxc.network.ipv4.gateway = %GATEWAY%
|
||||
lxc.network.flags = up
|
||||
lxc.network.mtu = 1420
|
||||
|
||||
lxc.cgroup.pids.max = 2048
|
||||
lxc.cgroup.memory.limit_in_bytes = %CONTAINER_MEMORY%M
|
||||
#lxc.cgroup.memory.kmem.limit_in_bytes = 512M
|
||||
#lxc.cgroup.memory.soft_limit_in_bytes = 4294967296
|
||||
#lxc.cgroup.memory.memsw.limit_in_bytes = 8589934592
|
||||
|
||||
# lxc.cgroup.cpu.cfs_period_us : period time of cpu, default 100000, means 100ms
|
||||
# lxc.cgroup.cpu.cfs_quota_us : quota time of this process
|
||||
lxc.cgroup.cpu.cfs_quota_us = %CONTAINER_CPU%
|
||||
|
||||
lxc.cap.drop = sys_admin net_admin mac_admin mac_override sys_time sys_module
|
||||
|
||||
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/data %ROOTFS%/root/nfs none bind,rw,create=dir 0 0
|
||||
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/hosts/batch-%TASKID%.hosts %ROOTFS%/etc/hosts none bind,ro,create=file 0 0
|
||||
lxc.mount.entry = %FS_PREFIX%/global/users/%USERNAME%/ssh %ROOTFS%/root/.ssh none bind,ro,create=dir 0 0
|
||||
lxc.mount.entry = %FS_PREFIX%/local/temp/%LXCNAME%/ %ROOTFS%/tmp none bind,rw,create=dir 0 0
|
||||
|
||||
# setting hostname
|
||||
lxc.hook.pre-start = HNAME=%HOSTNAME% %LXCSCRIPT%/lxc-prestart
|
||||
|
||||
# setting nfs softlink
|
||||
#lxc.hook.mount = %LXCSCRIPT%/lxc-mount
|
|
@ -182,3 +182,31 @@
|
|||
# ALLOW_SCALE_OUT: allow docklet to rent server on the cloud to scale out
|
||||
# Only when you deploy docklet on the cloud can you set it to True
|
||||
# ALLOW_SCALE_OUT=False
|
||||
|
||||
# ==================================================
|
||||
#
|
||||
# Batch Config
|
||||
#
|
||||
# ==================================================
|
||||
|
||||
# BATCH_ON: whether to start batch job processing system when start
|
||||
# the docklet. Default: True
|
||||
# BATCH_ON=True
|
||||
|
||||
# BATCH_MASTER_PORT: the rpc server port on master.
|
||||
# default: 50050
|
||||
# BATCH_MASTER_PORT=50050
|
||||
|
||||
# BATCH_WORKER_PORT: the rpc server port on worker.
|
||||
# default: 50051
|
||||
# BATCH_WORKER_PORT=50051
|
||||
|
||||
# BATCH_NET: ip addresses range of containers for batch job, default is 10.16.0.0/16
|
||||
# BATCH_NET=10.16.0.0/16
|
||||
|
||||
# BATCH_TASK_CIDR: 2^(BATCH_TASK_CIDR)-2 is the number of ip addresses for a task, default is 4
|
||||
# BATCH_TASK_CIDR=4
|
||||
|
||||
# BATCH_MAX_THREAD_WORKER: the maximun number of threads of the rpc server on
|
||||
# the batch job worker. default:5
|
||||
# BATCH_MAX_THREAD_WORKER=5
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
ovs-vsctl --if-exists del-port $Bridge $5
|
||||
cnt=$(ovs-vsctl list-ports ${Bridge} | wc -l)
|
||||
if [ "$cnt" = "1" ]; then
|
||||
greport=$(ovs-vsctl list-ports $(Bridge) | grep "^gre-[[:digit:]][[:digit:]]*-[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*\.[[:digit:]][[:digit:]]*$" | wc -l)
|
||||
greport=$(ovs-vsctl list-ports ${Bridge} | grep "gre" | wc -l)
|
||||
if [ "$greport" = "1" ]; then
|
||||
ovs-vsctl del-br $Bridge
|
||||
fi
|
||||
|
|
21
prepare.sh
21
prepare.sh
|
@ -16,7 +16,7 @@ fi
|
|||
# some packages' name maybe different in debian
|
||||
apt-get install -y cgmanager lxc lxcfs lxc-templates lvm2 bridge-utils curl exim4 openssh-server openvswitch-switch
|
||||
apt-get install -y python3 python3-netifaces python3-flask python3-flask-sqlalchemy python3-pampy python3-httplib2 python3-pip
|
||||
apt-get install -y python3-psutil python3-flask-migrate
|
||||
apt-get install -y python3-psutil python3-flask-migrate python3-paramiko
|
||||
apt-get install -y python3-lxc
|
||||
apt-get install -y python3-requests python3-suds
|
||||
apt-get install -y nodejs nodejs-legacy npm
|
||||
|
@ -24,6 +24,9 @@ apt-get install -y etcd
|
|||
apt-get install -y glusterfs-client attr
|
||||
apt-get install -y nginx
|
||||
pip3 install Flask-WTF
|
||||
apt-get install -y gdebi-core
|
||||
gdebi ossfs_1.80.5_ubuntu16.04_amd64.deb
|
||||
pip3 install grpcio grpcio-tools googleapis-common-protos
|
||||
|
||||
#add ip forward
|
||||
echo "net.ipv4.ip_forward=1" >>/etc/sysctl.conf
|
||||
|
@ -51,15 +54,19 @@ echo ""
|
|||
[[ -f conf/docklet.conf ]] || { echo "Generating docklet.conf from template" && cp conf/docklet.conf.template conf/docklet.conf; }
|
||||
[[ -f web/templates/home.html ]] || { echo "Generating HomePage from home.template" && cp web/templates/home.template web/templates/home.html; }
|
||||
|
||||
mkdir -p /opt/docklet/global
|
||||
mkdir -p /opt/docklet/local/
|
||||
FS_PREFIX=/opt/docklet
|
||||
. conf/docklet.conf
|
||||
export FS_PREFIX
|
||||
|
||||
echo "directory /opt/docklet have been created"
|
||||
mkdir -p $FS_PREFIX/global
|
||||
mkdir -p $FS_PREFIX/local/
|
||||
|
||||
if [[ ! -d /opt/docklet/local/basefs && ! $1 = "withoutfs" ]]; then
|
||||
mkdir -p /opt/docklet/local/basefs
|
||||
echo "directory FS_PREFIX (${FS_PREFIX}) have been created"
|
||||
|
||||
if [[ ! -d $FS_PREFIX/local/basefs && ! $1 = "withoutfs" ]]; then
|
||||
mkdir -p $FS_PREFIX/local/basefs
|
||||
echo "Generating basefs"
|
||||
wget -P /opt/docklet/local http://iwork.pku.edu.cn:1616/basefs-0.11.tar.bz2 && tar xvf /opt/docklet/local/basefs-0.11.tar.bz2 -C /opt/docklet/local/ > /dev/null
|
||||
wget -P $FS_PREFIX/local http://iwork.pku.edu.cn:1616/basefs-0.11.tar.bz2 && tar xvf $FS_PREFIX/local/basefs-0.11.tar.bz2 -C $FS_PREFIX/local/ > /dev/null
|
||||
[ $? != "0" ] && echo "Generate basefs failed, please download it from http://unias.github.io/docklet/download to FS_PREFIX/local and then extract it using root. (defalut FS_PRERIX is /opt/docklet)"
|
||||
fi
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ import http.server, cgi, json, sys, shutil, traceback
|
|||
import xmlrpc.client
|
||||
from socketserver import ThreadingMixIn
|
||||
from utils import etcdlib, imagemgr
|
||||
from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr
|
||||
from master import nodemgr, vclustermgr, notificationmgr, lockmgr, cloudmgr, jobmgr, taskmgr
|
||||
from utils.logs import logs
|
||||
from master import userManager, beansapplicationmgr, monitor, sysmgr, network
|
||||
from worker.monitor import History_Manager
|
||||
|
@ -790,6 +790,147 @@ def resetall_system(user, beans, form):
|
|||
return json.dumps({'success':'false', 'message': message})
|
||||
return json.dumps(result)
|
||||
|
||||
@app.route("/batch/job/add/", methods=['POST'])
|
||||
@login_required
|
||||
@beans_check
|
||||
def add_job(user,beans,form):
|
||||
global G_jobmgr
|
||||
job_data = form.to_dict()
|
||||
job_info = {
|
||||
'tasks': {}
|
||||
}
|
||||
message = {
|
||||
'success': 'true',
|
||||
'message': 'add batch job success'
|
||||
}
|
||||
for key in job_data:
|
||||
if key == 'csrf_token':
|
||||
continue
|
||||
key_arr = key.split('_')
|
||||
value = job_data[key]
|
||||
if key_arr[0] == 'srcAddr' and value == '':
|
||||
#task_idx = 'task_' + key_arr[1]
|
||||
if task_idx in job_info['tasks']:
|
||||
job_info['tasks'][task_idx]['srcAddr'] = '/root'
|
||||
else:
|
||||
job_info['tasks'][task_idx] = {
|
||||
'srcAddr': '/root'
|
||||
}
|
||||
elif key_arr[0] != 'dependency'and value == '':
|
||||
message['success'] = 'false'
|
||||
message['message'] = 'value of %s is null' % key
|
||||
elif len(key_arr) == 1:
|
||||
job_info[key_arr[0]] = value
|
||||
elif len(key_arr) == 2:
|
||||
key_prefix, task_idx = key_arr[0], key_arr[1]
|
||||
#task_idx = 'task_' + task_idx
|
||||
if task_idx in job_info["tasks"]:
|
||||
job_info["tasks"][task_idx][key_prefix] = value
|
||||
else:
|
||||
tmp_dict = {
|
||||
key_prefix: value
|
||||
}
|
||||
job_info["tasks"][task_idx] = tmp_dict
|
||||
elif len(key_arr) == 3:
|
||||
key_prefix, task_idx, mapping_idx = key_arr[0], key_arr[1], key_arr[2]
|
||||
#task_idx = 'task_' + task_idx
|
||||
mapping_idx = 'mapping_' + mapping_idx
|
||||
if task_idx in job_info["tasks"]:
|
||||
if "mapping" in job_info["tasks"][task_idx]:
|
||||
if mapping_idx in job_info["tasks"][task_idx]["mapping"]:
|
||||
job_info["tasks"][task_idx]["mapping"][mapping_idx][key_prefix] = value
|
||||
else:
|
||||
tmp_dict = {
|
||||
key_prefix: value
|
||||
}
|
||||
job_info["tasks"][task_idx]["mapping"][mapping_idx] = tmp_dict
|
||||
else:
|
||||
job_info["tasks"][task_idx]["mapping"] = {
|
||||
mapping_idx: {
|
||||
key_prefix: value
|
||||
}
|
||||
}
|
||||
else:
|
||||
tmp_dict = {
|
||||
"mapping":{
|
||||
mapping_idx: {
|
||||
key_prefix: value
|
||||
}
|
||||
}
|
||||
}
|
||||
job_info["tasks"][task_idx] = tmp_dict
|
||||
logger.debug('batch job adding info %s' % json.dumps(job_info, indent=4))
|
||||
[status, msg] = G_jobmgr.add_job(user, job_info)
|
||||
if status:
|
||||
return json.dumps(message)
|
||||
else:
|
||||
logger.debug('fail to add batch job: %s' % msg)
|
||||
message["success"] = "false"
|
||||
message["message"] = msg
|
||||
return json.dumps(message)
|
||||
return json.dumps(message)
|
||||
|
||||
@app.route("/batch/job/list/", methods=['POST'])
|
||||
@login_required
|
||||
def list_job(user,beans,form):
|
||||
global G_jobmgr
|
||||
result = {
|
||||
'success': 'true',
|
||||
'data': G_jobmgr.list_jobs(user)
|
||||
}
|
||||
return json.dumps(result)
|
||||
|
||||
@app.route("/batch/job/info/", methods=['POST'])
|
||||
@login_required
|
||||
def info_job(user,beans,form):
|
||||
global G_jobmgr
|
||||
jobid = form.get("jobid","")
|
||||
[success, data] = G_jobmgr.get_job(user, jobid)
|
||||
if success:
|
||||
return json.dumps({'success':'true', 'data':data})
|
||||
else:
|
||||
return json.dumps({'success':'false', 'message': data})
|
||||
|
||||
@app.route("/batch/job/stop/", methods=['POST'])
|
||||
@login_required
|
||||
def stop_job(user,beans,form):
|
||||
global G_jobmgr
|
||||
jobid = form.get("jobid","")
|
||||
[success,msg] = G_jobmgr.stop_job(user,jobid)
|
||||
if success:
|
||||
return json.dumps({'success':'true', 'action':'stop job'})
|
||||
else:
|
||||
return json.dumps({'success':'false', 'message': msg})
|
||||
|
||||
@app.route("/batch/job/output/", methods=['POST'])
|
||||
@login_required
|
||||
def get_output(user,beans,form):
|
||||
global G_jobmgr
|
||||
jobid = form.get("jobid","")
|
||||
taskid = form.get("taskid","")
|
||||
vnodeid = form.get("vnodeid","")
|
||||
issue = form.get("issue","")
|
||||
result = {
|
||||
'success': 'true',
|
||||
'data': G_jobmgr.get_output(user,jobid,taskid,vnodeid,issue)
|
||||
}
|
||||
return json.dumps(result)
|
||||
|
||||
@app.route("/batch/task/info/", methods=['POST'])
|
||||
@login_required
|
||||
def info_task(user,beans,form):
|
||||
pass
|
||||
|
||||
@app.route("/batch/vnodes/list/", methods=['POST'])
|
||||
@login_required
|
||||
def batch_vnodes_list(user,beans,form):
|
||||
global G_taskmgr
|
||||
result = {
|
||||
'success': 'true',
|
||||
'data': G_taskmgr.get_user_batch_containers(user)
|
||||
}
|
||||
return json.dumps(result)
|
||||
|
||||
# @app.route("/inside/cluster/scaleout/", methods=['POST'])
|
||||
# @inside_ip_required
|
||||
# def inside_cluster_scalout(cur_user, cluster_info, form):
|
||||
|
@ -857,6 +998,8 @@ if __name__ == '__main__':
|
|||
global G_applicationmgr
|
||||
global G_ulockmgr
|
||||
global G_cloudmgr
|
||||
global G_jobmgr
|
||||
global G_taskmgr
|
||||
# move 'tools.loadenv' to the beginning of this file
|
||||
|
||||
fs_path = env.getenv("FS_PREFIX")
|
||||
|
@ -973,4 +1116,9 @@ if __name__ == '__main__':
|
|||
# server = http.server.HTTPServer((masterip, masterport), DockletHttpHandler)
|
||||
logger.info("starting master server")
|
||||
|
||||
G_taskmgr = taskmgr.TaskMgr(G_nodemgr, monitor.Fetcher, ipaddr)
|
||||
G_jobmgr = jobmgr.JobMgr(G_taskmgr)
|
||||
G_taskmgr.set_jobmgr(G_jobmgr)
|
||||
G_taskmgr.start()
|
||||
|
||||
app.run(host = masterip, port = masterport, threaded=True)
|
||||
|
|
|
@ -0,0 +1,493 @@
|
|||
import time, threading, random, string, os, traceback, requests
|
||||
import master.monitor
|
||||
import subprocess,json
|
||||
from functools import wraps
|
||||
from datetime import datetime
|
||||
|
||||
from utils.log import initlogging, logger
|
||||
from utils.model import db, Batchjob, Batchtask
|
||||
from utils import env
|
||||
|
||||
def db_commit():
|
||||
try:
|
||||
db.session.commit()
|
||||
except Exception as err:
|
||||
db.session.rollback()
|
||||
logger.error(traceback.format_exc())
|
||||
raise
|
||||
|
||||
class BatchJob(object):
|
||||
def __init__(self, jobid, user, job_info, old_job_db=None):
|
||||
if old_job_db is None:
|
||||
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
|
||||
else:
|
||||
self.job_db = old_job_db
|
||||
self.job_db.clear()
|
||||
job_info = {}
|
||||
job_info['jobName'] = self.job_db.name
|
||||
job_info['jobPriority'] = self.job_db.priority
|
||||
all_tasks = self.job_db.tasks.all()
|
||||
job_info['tasks'] = {}
|
||||
for t in all_tasks:
|
||||
job_info['tasks'][t.idx] = json.loads(t.config)
|
||||
self.user = user
|
||||
#self.raw_job_info = job_info
|
||||
self.job_id = jobid
|
||||
self.job_name = job_info['jobName']
|
||||
self.job_priority = int(job_info['jobPriority'])
|
||||
self.lock = threading.Lock()
|
||||
self.tasks = {}
|
||||
self.dependency_out = {}
|
||||
self.tasks_cnt = {'pending':0, 'scheduling':0, 'running':0, 'retrying':0, 'failed':0, 'finished':0, 'stopped':0}
|
||||
|
||||
#init self.tasks & self.dependency_out & self.tasks_cnt
|
||||
logger.debug("Init BatchJob user:%s job_name:%s create_time:%s" % (self.job_db.username, self.job_db.name, str(self.job_db.create_time)))
|
||||
raw_tasks = job_info["tasks"]
|
||||
self.tasks_cnt['pending'] = len(raw_tasks.keys())
|
||||
for task_idx in raw_tasks.keys():
|
||||
task_info = raw_tasks[task_idx]
|
||||
if old_job_db is None:
|
||||
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
|
||||
self.job_db.tasks.append(task_db)
|
||||
else:
|
||||
task_db = Batchtask.query.get(jobid+"_"+task_idx)
|
||||
task_db.clear()
|
||||
self.tasks[task_idx] = {}
|
||||
self.tasks[task_idx]['id'] = jobid+"_"+task_idx
|
||||
self.tasks[task_idx]['config'] = task_info
|
||||
self.tasks[task_idx]['db'] = task_db
|
||||
self.tasks[task_idx]['status'] = 'pending'
|
||||
self.tasks[task_idx]['dependency'] = []
|
||||
dependency = task_info['dependency'].strip().replace(' ', '').split(',')
|
||||
if len(dependency) == 1 and dependency[0] == '':
|
||||
continue
|
||||
for d in dependency:
|
||||
if not d in raw_tasks.keys():
|
||||
raise ValueError('task %s is not defined in the dependency of task %s' % (d, task_idx))
|
||||
self.tasks[task_idx]['dependency'].append(d)
|
||||
if not d in self.dependency_out.keys():
|
||||
self.dependency_out[d] = []
|
||||
self.dependency_out[d].append(task_idx)
|
||||
|
||||
if old_job_db is None:
|
||||
db.session.add(self.job_db)
|
||||
db_commit()
|
||||
|
||||
self.log_status()
|
||||
logger.debug("BatchJob(id:%s) dependency_out: %s" % (self.job_db.id, json.dumps(self.dependency_out, indent=3)))
|
||||
|
||||
def data_lock(f):
|
||||
@wraps(f)
|
||||
def new_f(self, *args, **kwargs):
|
||||
self.lock.acquire()
|
||||
try:
|
||||
result = f(self, *args, **kwargs)
|
||||
except Exception as err:
|
||||
self.lock.release()
|
||||
raise err
|
||||
self.lock.release()
|
||||
return result
|
||||
return new_f
|
||||
|
||||
# return the tasks without dependencies
|
||||
@data_lock
|
||||
def get_tasks_no_dependency(self,update_status=False):
|
||||
logger.debug("Get tasks without dependencies of BatchJob(id:%s)" % self.job_db.id)
|
||||
ret_tasks = []
|
||||
for task_idx in self.tasks.keys():
|
||||
if (self.tasks[task_idx]['status'] == 'pending' and
|
||||
len(self.tasks[task_idx]['dependency']) == 0):
|
||||
if update_status:
|
||||
self.tasks_cnt['pending'] -= 1
|
||||
self.tasks_cnt['scheduling'] += 1
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'scheduling'
|
||||
self.tasks[task_idx]['status'] = 'scheduling'
|
||||
task_name = self.tasks[task_idx]['db'].id
|
||||
ret_tasks.append([task_name, self.tasks[task_idx]['config'], self.job_priority])
|
||||
self.log_status()
|
||||
db_commit()
|
||||
return ret_tasks
|
||||
|
||||
@data_lock
|
||||
def stop_job(self):
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self.job_db.status = 'stopping'
|
||||
db_commit()
|
||||
|
||||
# update status of this job based
|
||||
def _update_job_status(self):
|
||||
allcnt = len(self.tasks.keys())
|
||||
if self.tasks_cnt['failed'] != 0:
|
||||
self.job_db.status = 'failed'
|
||||
self.job_db.end_time = datetime.now()
|
||||
elif self.tasks_cnt['finished'] == allcnt:
|
||||
self.job_db.status = 'done'
|
||||
self.job_db.end_time = datetime.now()
|
||||
elif self.job_db.status == 'stopping':
|
||||
if self.tasks_cnt['running'] == 0 and self.tasks_cnt['scheduling'] == 0 and self.tasks_cnt['retrying'] == 0:
|
||||
self.job_db.status = 'stopped'
|
||||
self.job_db.end_time = datetime.now()
|
||||
elif self.tasks_cnt['running'] != 0 or self.tasks_cnt['retrying'] != 0:
|
||||
self.job_db.status = 'running'
|
||||
else:
|
||||
self.job_db.status = 'pending'
|
||||
db_commit()
|
||||
|
||||
# start run a task, update status
|
||||
@data_lock
|
||||
def update_task_running(self, task_idx):
|
||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) running." % (task_idx, self.job_id))
|
||||
old_status = self.tasks[task_idx]['status']
|
||||
if old_status == 'stopping':
|
||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||
return
|
||||
self.tasks_cnt[old_status] -= 1
|
||||
self.tasks[task_idx]['status'] = 'running'
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'running'
|
||||
self.tasks[task_idx]['db'].start_time = datetime.now()
|
||||
self.tasks_cnt['running'] += 1
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
|
||||
# a task has finished, update dependency and return tasks without dependencies
|
||||
@data_lock
|
||||
def finish_task(self, task_idx, running_time, billing):
|
||||
if task_idx not in self.tasks.keys():
|
||||
logger.error('Task_idx %s not in job. user:%s job_name:%s job_id:%s'%(task_idx, self.user, self.job_name, self.job_id))
|
||||
return []
|
||||
logger.debug("Task(idx:%s) of BatchJob(id:%s) has finished(running_time=%d,billing=%d). Update dependency..." % (task_idx, self.job_id, running_time, billing))
|
||||
old_status = self.tasks[task_idx]['status']
|
||||
if old_status == 'stopping':
|
||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||
return
|
||||
self.tasks_cnt[old_status] -= 1
|
||||
self.tasks[task_idx]['status'] = 'finished'
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'finished'
|
||||
self.tasks[task_idx]['db'].tried_times += 1
|
||||
self.tasks[task_idx]['db'].running_time = running_time
|
||||
self.tasks[task_idx]['db'].end_time = datetime.now()
|
||||
self.tasks[task_idx]['db'].billing = billing
|
||||
self.tasks[task_idx]['db'].failed_reason = ""
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self.job_db.billing += billing
|
||||
self.tasks_cnt['finished'] += 1
|
||||
|
||||
if task_idx not in self.dependency_out.keys():
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
return []
|
||||
ret_tasks = []
|
||||
for out_idx in self.dependency_out[task_idx]:
|
||||
try:
|
||||
self.tasks[out_idx]['dependency'].remove(task_idx)
|
||||
except Exception as err:
|
||||
logger.warning(traceback.format_exc())
|
||||
continue
|
||||
if (self.tasks[out_idx]['status'] == 'pending' and
|
||||
len(self.tasks[out_idx]['dependency']) == 0):
|
||||
self.tasks_cnt['pending'] -= 1
|
||||
self.tasks_cnt['scheduling'] += 1
|
||||
self.tasks[out_idx]['status'] = 'scheduling'
|
||||
self.tasks[out_idx]['db'] = Batchtask.query.get(self.tasks[out_idx]['id'])
|
||||
self.tasks[out_idx]['db'].status = 'scheduling'
|
||||
task_name = self.job_id + '_' + out_idx
|
||||
ret_tasks.append([task_name, self.tasks[out_idx]['config'], self.job_priority])
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
return ret_tasks
|
||||
|
||||
# update retrying status of task
|
||||
@data_lock
|
||||
def update_task_retrying(self, task_idx, reason, tried_times):
|
||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) retrying. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
||||
old_status = self.tasks[task_idx]['status']
|
||||
if old_status == 'stopping':
|
||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||
return
|
||||
self.tasks_cnt[old_status] -= 1
|
||||
self.tasks_cnt['retrying'] += 1
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'retrying'
|
||||
self.tasks[task_idx]['db'].failed_reason = reason
|
||||
self.tasks[task_idx]['db'].tried_times += 1
|
||||
self.tasks[task_idx]['status'] = 'retrying'
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
|
||||
# update failed status of task
|
||||
@data_lock
|
||||
def update_task_failed(self, task_idx, reason, tried_times, running_time, billing):
|
||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) failed. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
||||
old_status = self.tasks[task_idx]['status']
|
||||
self.tasks_cnt[old_status] -= 1
|
||||
self.tasks_cnt['failed'] += 1
|
||||
self.tasks[task_idx]['status'] = 'failed'
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'failed'
|
||||
self.tasks[task_idx]['db'].failed_reason = reason
|
||||
self.tasks[task_idx]['db'].tried_times += 1
|
||||
self.tasks[task_idx]['db'].end_time = datetime.now()
|
||||
self.tasks[task_idx]['db'].running_time = running_time
|
||||
self.tasks[task_idx]['db'].billing = billing
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self.job_db.billing += billing
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
|
||||
@data_lock
|
||||
def update_task_stopped(self, task_idx, running_time, billing):
|
||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) stopped.running_time:%d billing:%d" % (task_idx, self.job_id, int(running_time), billing))
|
||||
old_status = self.tasks[task_idx]['status']
|
||||
if old_status == 'failed' or old_status == 'finished' or old_status == 'stopped':
|
||||
logger.info("task(idx:%s) of BatchJob(id:%s) has been done."%(task_idx, self.job_id))
|
||||
return False
|
||||
self.tasks_cnt[old_status] -= 1
|
||||
self.tasks_cnt['stopped'] += 1
|
||||
self.tasks[task_idx]['status'] = 'stopped'
|
||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||
self.tasks[task_idx]['db'].status = 'stopped'
|
||||
self.tasks[task_idx]['db'].end_time = datetime.now()
|
||||
self.tasks[task_idx]['db'].running_time = running_time
|
||||
self.tasks[task_idx]['db'].billing = billing
|
||||
self.job_db = Batchjob.query.get(self.job_id)
|
||||
self.job_db.billing += billing
|
||||
self._update_job_status()
|
||||
self.log_status()
|
||||
return True
|
||||
|
||||
# print status for debuging
|
||||
def log_status(self):
|
||||
task_copy = {}
|
||||
for task_idx in self.tasks.keys():
|
||||
task_copy[task_idx] = {}
|
||||
task_copy[task_idx]['status'] = self.tasks[task_idx]['status']
|
||||
task_copy[task_idx]['dependency'] = self.tasks[task_idx]['dependency']
|
||||
logger.debug("BatchJob(id:%s) tasks status: %s" % (self.job_id, json.dumps(task_copy, indent=3)))
|
||||
logger.debug("BatchJob(id:%s) tasks_cnt: %s" % (self.job_id, self.tasks_cnt))
|
||||
logger.debug("BatchJob(id:%s) job_status: %s" %(self.job_id, self.job_db.status))
|
||||
|
||||
|
||||
class JobMgr():
    """Manage batch jobs: persistence, submission, stopping, billing and
    status reporting from the TaskMgr.

    Holds an in-memory map (job_id -> BatchJob) of unfinished jobs and
    mirrors state into the Batchjob/Batchtask tables.
    """

    # load job information from etcd
    # initial a job queue and job schedueler
    def __init__(self, taskmgr):
        logger.info("Init jobmgr...")
        try:
            # probe the table; create the schema on first run
            Batchjob.query.all()
        except Exception:
            db.create_all(bind='__all__')
        self.job_map = {}
        self.taskmgr = taskmgr
        self.fspath = env.getenv('FS_PREFIX')
        self.lock = threading.Lock()
        self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT'))
        self.auth_key = env.getenv('AUTH_KEY')

        self.recover_jobs()

    def recover_jobs(self):
        """Reload every job that is neither 'done' nor 'failed' from the DB
        and resubmit its runnable tasks."""
        logger.info("Rerun the unfailed and unfinished jobs...")
        try:
            rejobs = Batchjob.query.filter(~Batchjob.status.in_(['done','failed']))
            rejobs = rejobs.order_by(Batchjob.create_time).all()
            for rejob in rejobs:
                logger.info("Rerun job: "+rejob.id)
                logger.debug(str(rejob))
                job = BatchJob(rejob.id, rejob.username, None, rejob)
                self.job_map[job.job_id] = job
                self.process_job(job)
        except Exception:
            logger.error(traceback.format_exc())

    def charge_beans(self, username, billing):
        """Charge `billing` beans to `username` via the user service; returns the JSON reply."""
        logger.debug("Charge user(%s) for %d beans"%(username, billing))
        data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}
        url = "/billing/beans/"
        return requests.post(self.userpoint+url,data=data).json()

    def add_lock(f):
        """Decorator: serialize calls through self.lock.

        try/finally (instead of release-then-reraise) guarantees the lock
        is released even on BaseException such as KeyboardInterrupt.
        """
        @wraps(f)
        def new_f(self, *args, **kwargs):
            self.lock.acquire()
            try:
                return f(self, *args, **kwargs)
            finally:
                self.lock.release()
        return new_f

    @add_lock
    def create_job(self, user, job_info):
        """Allocate a fresh job id and build a BatchJob (locked: id must stay unique)."""
        jobid = self.gen_jobid()
        job = BatchJob(jobid, user, job_info)
        return job

    # user: username
    # job_info: a json string
    # user submit a new job, add this job to queue and database
    def add_job(self, user, job_info):
        """Create and start processing a new job; returns [ok, message]."""
        try:
            job = self.create_job(user, job_info)
            self.job_map[job.job_id] = job
            self.process_job(job)
        except ValueError as err:
            logger.error(err)
            return [False, err.args[0]]
        except Exception as err:
            logger.error(traceback.format_exc())
            return [False, err.args[0]]
        return [True, "add batch job success"]

    # user: username
    # jobid: the id of job
    def stop_job(self, user, job_id):
        """Lazily stop every task of the job; returns [ok, message]."""
        logger.info("[jobmgr] stop job(id:%s) user(%s)"%(job_id, user))
        if job_id not in self.job_map.keys():
            return [False,"Job id %s does not exists! Maybe it has been finished."%job_id]
        try:
            job = self.job_map[job_id]
            if job.job_db.status == 'done' or job.job_db.status == 'failed':
                return [True,""]
            if job.user != user:
                raise Exception("Wrong User.")
            for task_idx in job.tasks.keys():
                taskid = job_id + '_' + task_idx
                self.taskmgr.lazy_stop_task(taskid)
            job.stop_job()
        except Exception as err:
            logger.error(traceback.format_exc())
            return [False, err.args[0]]
        return [True,""]

    # user: username
    # list a user's all job
    def list_jobs(self, user):
        """Return a list of job dicts for `user`, each with task idx list and vnode counts."""
        alljobs = Batchjob.query.filter_by(username=user).all()
        res = []
        for job in alljobs:
            jobdata = json.loads(str(job))
            tasks = job.tasks.all()
            jobdata['tasks'] = [t.idx for t in tasks]
            tasks_vnodeCount = {}
            for t in tasks:
                tasks_vnodeCount[t.idx] = int(json.loads(t.config)['vnodeCount'])
            jobdata['tasks_vnodeCount'] = tasks_vnodeCount
            res.append(jobdata)
        return res

    # user: username
    # jobid: the id of job
    # get the information of a job, including the status, json description and other information
    def get_job(self, user, job_id):
        """Return [True, jobdata] (tasks ordered by idx) or [False, reason]."""
        job = Batchjob.query.get(job_id)
        if job is None:
            return [False, "Jobid(%s) does not exist."%job_id]
        if job.username != user:
            return [False, "Wrong User!"]
        jobdata = json.loads(str(job))
        tasks = job.tasks.order_by(Batchtask.idx).all()
        tasksdata = [json.loads(str(t)) for t in tasks]
        jobdata['tasks'] = tasksdata
        return [True, jobdata]

    # check if a job exists
    def is_job_exist(self, job_id):
        return Batchjob.query.get(job_id) != None

    # generate a random job id
    def gen_jobid(self):
        """Return a unique id of the form YYMMDD + 3 random alphanumerics."""
        datestr = datetime.now().strftime("%y%m%d")
        job_id = datestr+''.join(random.sample(string.ascii_letters + string.digits, 3))
        while self.is_job_exist(job_id):
            job_id = datestr+''.join(random.sample(string.ascii_letters + string.digits, 3))
        return job_id

    # add tasks into taskmgr's queue
    def add_task_taskmgr(self, user, tasks):
        """Push (name, info, priority) triples into the TaskMgr queue; False if any info is missing."""
        for task_name, task_info, task_priority in tasks:
            if not task_info:
                logger.error("task_info does not exist! task_name(%s)" % task_name)
                return False
            else:
                logger.debug("Add task(name:%s) with priority(%s) to taskmgr's queue." % (task_name, task_priority) )
                self.taskmgr.add_task(user, task_name, task_info, task_priority)
        return True

    # to process a job, add tasks without dependencies of the job into taskmgr
    def process_job(self, job):
        tasks = job.get_tasks_no_dependency(True)
        return self.add_task_taskmgr(job.user, tasks)

    # report task status from taskmgr when running, failed and finished
    # task_name: job_id + '_' + task_idx
    # status: 'running', 'finished', 'retrying', 'failed', 'stopped'
    # reason: reason for failure or retrying, such as "FAILED", "TIMEOUT", "OUTPUTERROR"
    # tried_times: how many times the task has been tried.
    def report(self, user, task_name, status, reason="", tried_times=1, running_time=0, billing=0):
        """Handle a task status report; update billing, DB rows and in-memory job state."""
        split_task_name = task_name.split('_')
        if len(split_task_name) != 2:
            logger.error("[jobmgr report]Illegal task_name(%s) report from taskmgr" % task_name)
            return
        # failed/finished tasks are charged immediately; 'stopped' is charged
        # below only when the in-memory update confirms it was still live
        if billing > 0 and (status == 'failed' or status == 'finished'):
            self.charge_beans(user, billing)
        job_id, task_idx = split_task_name
        if job_id not in self.job_map.keys():
            logger.error("[jobmgr report]jobid(%s) does not exist. task_name(%s)" % (job_id,task_name))
            #update data in db
            taskdb = Batchtask.query.get(task_name)
            if (taskdb is None or taskdb.status == 'finished' or
                taskdb.status == 'failed' or taskdb.status == 'stopped'):
                return
            taskdb.status = status
            if status == 'failed':
                taskdb.failed_reason = reason
            if status == 'failed' or status == 'stopped' or status == 'finished':
                taskdb.end_time = datetime.now()
            if billing > 0:
                taskdb.running_time = running_time
                taskdb.billing = billing
            db_commit()
            return
        job = self.job_map[job_id]
        if status == "running":
            job.update_task_running(task_idx)
        elif status == "finished":
            # a finished task may unblock dependent tasks
            next_tasks = job.finish_task(task_idx, running_time, billing)
            self.add_task_taskmgr(user, next_tasks)
        elif status == "retrying":
            job.update_task_retrying(task_idx, reason, tried_times)
        elif status == "failed":
            job.update_task_failed(task_idx, reason, tried_times, running_time, billing)
        elif status == "stopped":
            if job.update_task_stopped(task_idx, running_time, billing) and billing > 0:
                self.charge_beans(user, billing)
        # drop finished jobs from the in-memory map
        if job.job_db.status == 'done' or job.job_db.status == 'failed' or job.job_db.status == 'stopped':
            del self.job_map[job_id]

    # Get Batch job stdout or stderr from its file
    def get_output(self, username, jobid, taskid, vnodeid, issue):
        """Return the last 100 lines of the task's stdout/stderr file ("" on error).

        Uses list-form subprocess.run (shell=False): the path is built from
        request-supplied values, so a shell string would be injectable.
        """
        filename = jobid + "_" + taskid + "_" + vnodeid + "_" + issue + ".txt"
        fpath = "%s/global/users/%s/data/batch_%s/%s" % (self.fspath,username,jobid,filename)
        logger.info("Get output from:%s" % fpath)
        try:
            ret = subprocess.run(['tail', '-n', '100', fpath],
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            if ret.returncode != 0:
                raise IOError(ret.stdout.decode(encoding="utf-8"))
        except Exception:
            logger.error(traceback.format_exc())
            return ""
        else:
            return ret.stdout.decode(encoding="utf-8")
|
@ -47,6 +47,8 @@ class NodeMgr(object):
|
|||
# get allnodes
|
||||
self.allnodes = self._nodelist_etcd("allnodes")
|
||||
self.runnodes = []
|
||||
self.batchnodes = []
|
||||
self.allrunnodes = []
|
||||
[status, runlist] = self.etcd.listdir("machines/runnodes")
|
||||
for node in runlist:
|
||||
nodeip = node['key'].rsplit('/',1)[1]
|
||||
|
@ -140,6 +142,14 @@ class NodeMgr(object):
|
|||
#print(etcd_runip)
|
||||
#print(self.rpcs)
|
||||
self.runnodes = etcd_runip
|
||||
self.batchnodes = self.runnodes.copy()
|
||||
self.allrunnodes = self.runnodes.copy()
|
||||
[status, batchlist] = self.etcd.listdir("machines/batchnodes")
|
||||
if status:
|
||||
for node in batchlist:
|
||||
nodeip = node['key'].rsplit('/', 1)[1]
|
||||
self.batchnodes.append(nodeip)
|
||||
self.allrunnodes.append(nodeip)
|
||||
|
||||
def recover_node(self,ip,tasks):
|
||||
logger.info("now recover for worker:%s" % ip)
|
||||
|
@ -152,14 +162,19 @@ class NodeMgr(object):
|
|||
|
||||
# get all run nodes' IP addr
|
||||
def get_nodeips(self):
|
||||
return self.runnodes
|
||||
return self.allrunnodes
|
||||
|
||||
def get_batch_nodeips(self):
|
||||
return self.batchnodes
|
||||
|
||||
def get_base_nodeips(self):
|
||||
return self.runnodes
|
||||
|
||||
def get_allnodes(self):
|
||||
return self.allnodes
|
||||
|
||||
def ip_to_rpc(self,ip):
|
||||
if ip in self.runnodes:
|
||||
if ip in self.allrunnodes:
|
||||
return xmlrpc.client.ServerProxy("http://%s:%s" % (ip, env.getenv("WORKER_PORT")))
|
||||
else:
|
||||
logger.info('Worker %s is not connected, create rpc client failed, push task into queue')
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/python3
# Parse a flat HTML-form-style job description (keys like "image_1",
# "mappingLocalDir_2_1") into a nested structure:
#   <key>               -> job_info[key]
#   <key>_<task>        -> job_info['task_<task>'][key]
#   <key>_<task>_<map>  -> job_info['task_<task>']['mapping']['mapping_<map>'][key]
# (Fixes: shebang was "#!/user/bin/python3"; unused local removed;
#  deep nesting replaced by dict.setdefault; parse now returns the result.)
import json

job_data = {'image_1': 'base_base_base', 'mappingRemoteDir_2_2': 'sss', 'dependency_1': 'aaa', 'mappingLocalDir_2_1': 'xxx', 'mappingLocalDir_1_2': 'aaa', 'mappingLocalDir_1_1': 'aaa', 'mappingLocalDir_2_3': 'fdsffdf', 'mappingRemoteDir_1_1': 'ddd', 'mappingRemoteDir_2_3': 'sss', 'srcAddr_1': 'aaa', 'mappingSource_2_1': 'Aliyun', 'cpuSetting_1': '1', 'mappingSource_2_2': 'Aliyun', 'retryCount_2': '1', 'mappingSource_1_1': 'Aliyun', 'expTime_1': '60', 'diskSetting_2': '1024', 'diskSetting_1': '1024', 'dependency_2': 'ddd', 'memorySetting_1': '1024', 'command_2': 'ccc', 'mappingRemoteDir_1_2': 'ddd', 'gpuSetting_2': '0', 'memorySetting_2': '1024', 'gpuSetting_1': '0', 'mappingLocalDir_2_2': 'bbb', 'mappingSource_1_2': 'Aliyun', 'expTime_2': '60', 'mappingRemoteDir_2_1': 'vvv', 'srcAddr_2': 'fff', 'cpuSetting_2': '1', 'instCount_1': '1', 'mappingSource_2_3': 'Aliyun', 'token': 'ZXlKaGJHY2lPaUpJVXpJMU5pSXNJbWxoZENJNk1UVXpNelE0TVRNMU5Td2laWGh3SWpveE5UTXpORGcwT1RVMWZRLmV5SnBaQ0k2TVgwLkF5UnRnaGJHZXhJY2lBSURZTUd5eXZIUVJnUGd1ZTA3OEtGWkVoejJVMkE=', 'instCount_2': '1', 'retryCount_1': '1', 'command_1': 'aaa', 'jobPriority': '0', 'image_2': 'base_base_base', 'jobName': 'aaa'}

def parse(job_data):
    """Convert the flat form dict into a nested job description.

    Prints the result as indented JSON (original behavior) and also
    returns it so callers and tests can inspect the structure.
    """
    job_info = {}
    for key, value in job_data.items():
        key_arr = key.split('_')
        if len(key_arr) == 1:
            # job-level attribute (jobName, token, jobPriority, ...)
            job_info[key_arr[0]] = value
        elif len(key_arr) == 2:
            key_prefix, task_idx = key_arr
            job_info.setdefault('task_' + task_idx, {})[key_prefix] = value
        elif len(key_arr) == 3:
            key_prefix, task_idx, mapping_idx = key_arr
            task = job_info.setdefault('task_' + task_idx, {})
            mapping = task.setdefault("mapping", {}).setdefault('mapping_' + mapping_idx, {})
            mapping[key_prefix] = value
    print(json.dumps(job_info, indent=4))
    return job_info

if __name__ == '__main__':
    parse(job_data)
|
|
@ -0,0 +1,767 @@
|
|||
import threading
|
||||
import time
|
||||
import string
|
||||
import os
|
||||
import random, copy, subprocess
|
||||
import json, math
|
||||
from functools import wraps
|
||||
|
||||
# must import logger after initlogging, ugly
|
||||
from utils.log import logger
|
||||
|
||||
# grpc
|
||||
from concurrent import futures
|
||||
import grpc
|
||||
from protos.rpc_pb2 import *
|
||||
from protos.rpc_pb2_grpc import MasterServicer, add_MasterServicer_to_server, WorkerStub
|
||||
from utils.nettools import netcontrol
|
||||
from utils import env
|
||||
|
||||
def ip_to_int(addr):
    """Pack a dotted-quad IPv4 address string into its 32-bit integer value."""
    a, b, c, d = (int(part) for part in addr.split('.'))
    return (((a * 256 + b) * 256 + c) * 256) + d
||||
def int_to_ip(num):
    """Format a 32-bit integer as a dotted-quad IPv4 address string."""
    octets = [(num >> shift) & 255 for shift in (24, 16, 8, 0)]
    return ".".join(str(octet) for octet in octets)
||||
class Task():
    """One schedulable batch task: a group of vnodes (SubTasks) sharing a
    private /task_cidr subnet and a generated hosts file."""

    def __init__(self, task_id, username, at_same_time, priority, max_size, task_infos):
        self.id = task_id
        self.username = username
        self.status = WAITING
        self.failed_reason = ""
        # if all the vnodes must be started at the same time
        self.at_same_time = at_same_time
        # priority the bigger the better
        # self.priority the smaller the better
        # submission time (in hours) minus user priority: older + higher-priority
        # tasks sort first (see __lt__)
        self.priority = int(time.time()) / 60 / 60 - priority
        # subnet offset allocated by TaskMgr.acquire_task_ips; None until placed
        self.task_base_ip = None
        self.ips = None
        self.max_size = max_size

        # one SubTask per vnode description
        self.subtask_list = [SubTask(
            idx = index,
            root_task = self,
            vnode_info = task_info['vnode_info'],
            command_info = task_info['command_info'],
            max_retry_count = task_info['max_retry_count']
            ) for (index, task_info) in enumerate(task_infos)]

    def get_billing(self):
        """Return (total_running_time, billing_beans) summed over all subtasks.

        Prices are beans per resource-unit-second; each subtask's cost is
        rounded up to a whole bean before summing.
        """
        billing_beans = 0
        running_time = 0
        cpu_price = 1 / 3600.0 # /core*s
        mem_price = 1 / 3600.0 # /GB*s
        disk_price = 1 / 3600.0 # /GB*s
        gpu_price = 100 / 3600.0 # /core*s
        for subtask in self.subtask_list:
            tmp_time = subtask.running_time
            cpu_beans = subtask.vnode_info.vnode.instance.cpu * tmp_time * cpu_price
            mem_beans = subtask.vnode_info.vnode.instance.memory / 1024.0 * tmp_time * mem_price
            disk_beans = subtask.vnode_info.vnode.instance.disk / 1024.0 * tmp_time * disk_price
            gpu_beans = subtask.vnode_info.vnode.instance.gpu * tmp_time * gpu_price
            logger.info("subtask:%s running_time=%f beans for: cpu=%f mem_beans=%f disk_beans=%f gpu_beans=%f"
                        %(self.id, tmp_time, cpu_beans, mem_beans, disk_beans, gpu_beans ))
            beans = math.ceil(cpu_beans + mem_beans + disk_beans + gpu_beans)
            running_time += tmp_time
            billing_beans += beans
        return running_time, billing_beans

    def __lt__(self, other):
        # used by the scheduler's sort: smaller priority value runs first
        return self.priority < other.priority

    def gen_ips_from_base(self, base_ip):
        """Fill self.ips with max_size addresses inside the allocated subnet.

        Addresses start at base + task_base_ip + 2 (offset +1 is the gateway,
        see TaskMgr.setup_tasknet). No-op when no subnet is allocated yet.
        """
        if self.task_base_ip == None:
            return
        self.ips = []
        for i in range(self.max_size):
            self.ips.append(int_to_ip(base_ip + self.task_base_ip + i + 2))

    def gen_hosts(self):
        """Write the task's hosts file (batch-0..batch-N names) under the
        user's global hosts directory, creating the user dir if needed."""
        username = self.username
        taskid = self.id
        logger.info("Generate hosts for user(%s) task(%s) base_ip(%s)"%(username,taskid,str(self.task_base_ip)))
        fspath = env.getenv('FS_PREFIX')
        if not os.path.isdir("%s/global/users/%s" % (fspath,username)):
            # first job for this user: run the init script to create the tree
            path = env.getenv('DOCKLET_LIB')
            subprocess.call([path+"/master/userinit.sh", username])
            logger.info("user %s directory not found, create it" % username)

        hosts_file = open("%s/global/users/%s/hosts/%s.hosts" % (fspath,username,"batch-"+taskid),"w")
        hosts_file.write("127.0.0.1 localhost\n")
        i = 0
        for ip in self.ips:
            # vnode i is reachable as "batch-i" inside the task network
            hosts_file.write(ip+" batch-"+str(i)+"\n")
            i += 1
        hosts_file.close()
||||
class SubTask():
    """One vnode of a Task: the container description (vnode_info), the
    command to run in it (command_info, may be None), and retry state."""

    def __init__(self, idx, root_task, vnode_info, command_info, max_retry_count):
        self.root_task = root_task
        self.vnode_info = vnode_info
        # stamp this subtask's index into the shared protobuf messages
        self.vnode_info.vnodeid = idx
        self.command_info = command_info
        if self.command_info != None:
            self.command_info.vnodeid = idx
        self.max_retry_count = max_retry_count
        # lifecycle flags maintained by TaskMgr.start_/stop_vnode and _subtask
        self.vnode_started = False
        self.task_started = False
        self.start_at = 0
        self.end_at = 0
        # accumulated vnode uptime, used for billing (Task.get_billing)
        self.running_time = 0
        self.status = WAITING
        self.status_reason = ''
        self.try_count = 0
        # IP of the worker this subtask is placed on; None when unplaced
        self.worker = None
        self.lock = threading.Lock()

    def waiting_for_retry(self, reason=""):
        """Count a failed attempt: requeue as WAITING while retries remain,
        otherwise mark both this subtask and the whole root task FAILED."""
        self.try_count += 1
        self.status = WAITING if self.try_count <= self.max_retry_count else FAILED
        if self.status == FAILED:
            self.root_task.status = FAILED
            self.failed_reason = reason
            self.root_task.failed_reason = reason
||||
class TaskReporter(MasterServicer):
    """gRPC servicer that forwards worker task reports to the TaskMgr."""

    def __init__(self, taskmgr):
        self.taskmgr = taskmgr

    def report(self, request, context):
        """Dispatch every task message in the request, then acknowledge."""
        handle = self.taskmgr.on_task_report
        for msg in request.taskmsgs:
            handle(msg)
        return Reply(status=Reply.ACCEPTED, message='')
||||
class TaskMgr(threading.Thread):
|
||||
|
||||
# load task information from etcd
|
||||
# initial a task queue and task schedueler
|
||||
# taskmgr: a taskmgr instance
|
||||
def __init__(self, nodemgr, monitor_fetcher, master_ip, scheduler_interval=2, external_logger=None):
    """Set up queues, locks, RPC ports and the pool of free task subnets.

    nodemgr/monitor_fetcher: node list and resource-usage providers.
    master_ip: this master's address, used for GRE tunnels and vnode config.
    scheduler_interval: sleep (s) between scheduling rounds when idle.
    """
    threading.Thread.__init__(self)
    self.thread_stop = False
    # wired up by the caller after construction (JobMgr needs a TaskMgr too)
    self.jobmgr = None
    self.master_ip = master_ip
    self.task_queue = []
    # tasks/ids staged for the next sort_out_task_queue pass; each list is
    # guarded by its own lock below
    self.lazy_append_list = []
    self.lazy_delete_list = []
    self.lazy_stop_list = []
    self.task_queue_lock = threading.Lock()
    self.stop_lock = threading.Lock()
    self.add_lock = threading.Lock()
    #self.user_containers = {}

    self.scheduler_interval = scheduler_interval
    self.logger = logger

    self.master_port = env.getenv('BATCH_MASTER_PORT')
    self.worker_port = env.getenv('BATCH_WORKER_PORT')

    # nodes
    self.nodemgr = nodemgr
    self.monitor_fetcher = monitor_fetcher
    # per-worker resource bookkeeping, updated in start_/stop_vnode
    self.cpu_usage = {}
    self.gpu_usage = {}

    self.network_lock = threading.Lock()
    # carve BATCH_NET into fixed-size per-task subnets:
    # each task gets 2^task_cidr addresses; task_cidr is clamped to
    # [2, 31-batch_cidr] so a subnet is neither too small nor larger
    # than the batch network itself
    batch_net = env.getenv('BATCH_NET')
    self.batch_cidr = int(batch_net.split('/')[1])
    batch_net = batch_net.split('/')[0]
    task_cidr = int(env.getenv('BATCH_TASK_CIDR'))
    task_cidr = min(task_cidr,31-self.batch_cidr)
    self.task_cidr = max(task_cidr,2)
    self.base_ip = ip_to_int(batch_net)
    # free_nets holds the starting offset of every unallocated subnet
    self.free_nets = []
    for i in range(0, (1 << (32-self.batch_cidr)) - 1, (1 << self.task_cidr)):
        self.free_nets.append(i)
    self.logger.info("Free nets addresses pool %s" % str(self.free_nets))
    self.logger.info("Each Batch Net CIDR:%s"%(str(self.task_cidr)))
|
||||
def data_lock(lockname):
    """Decorator factory: run the wrapped method while holding self.<lockname>.

    The original acquire / release-in-except / release dance leaked the
    lock on BaseException (e.g. KeyboardInterrupt); the `with` statement
    releases it on every exit path and propagates exceptions unchanged.
    """
    def lock(f):
        @wraps(f)
        def new_f(self, *args, **kwargs):
            with getattr(self, lockname):
                return f(self, *args, **kwargs)
        return new_f
    return lock
||||
def subtask_lock(f):
    """Decorator: run the wrapped method while holding subtask.lock.

    Same fix as data_lock: the context-manager form guarantees release
    on every exit path (the original leaked the lock on BaseException).
    """
    @wraps(f)
    def new_f(self, subtask, *args, **kwargs):
        with subtask.lock:
            return f(self, subtask, *args, **kwargs)
    return new_f
||||
def run(self):
    """Thread main loop: start the RPC server, then repeatedly fold the
    lazy lists into the queue and schedule; sleep when nothing is runnable."""
    self.serve()
    while not self.thread_stop:
        self.sort_out_task_queue()
        task, sub_task_list = self.task_scheduler()
        if task is not None and sub_task_list is not None:
            self.task_processor(task, sub_task_list)
        else:
            # nothing schedulable this round
            time.sleep(self.scheduler_interval)
|
||||
def serve(self):
    """Start the gRPC server that receives task reports from workers."""
    self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    add_MasterServicer_to_server(TaskReporter(self), self.server)
    # listen on all interfaces at the configured master port
    self.server.add_insecure_port('[::]:' + self.master_port)
    self.server.start()
    self.logger.info('[taskmgr_rpc] start rpc server')
|
||||
def stop(self):
    """Signal the scheduler loop to exit and shut down the RPC server immediately."""
    self.thread_stop = True
    self.server.stop(0)
    self.logger.info('[taskmgr_rpc] stop rpc server')
|
||||
@data_lock('task_queue_lock')
@data_lock('add_lock')
@data_lock('stop_lock')
def sort_out_task_queue(self):
    """Fold the lazy stop/delete/append lists into the task queue.

    Runs once per scheduler round while holding all three list locks:
    1. queued tasks whose id is in lazy_stop_list are torn down,
       billed, and reported 'stopped';
    2. lazy_delete_list entries are removed from the queue;
    3. lazy-appended tasks that were stopped before ever running are
       reported 'stopped' instead of being queued;
    4. the queue is re-sorted by priority (smaller value first).
    """

    for task in self.task_queue:
        if task.id in self.lazy_stop_list:
            self.stop_remove_task(task)
            self.lazy_delete_list.append(task)
            running_time, billing = task.get_billing()
            self.logger.info('task %s stopped, running_time:%s billing:%d'%(task.id, str(running_time), billing))
            running_time = math.ceil(running_time)
            self.jobmgr.report(task.username, task.id,'stopped',running_time=running_time,billing=billing)

    while self.lazy_delete_list:
        task = self.lazy_delete_list.pop(0)
        try:
            self.task_queue.remove(task)
        except Exception as err:
            # task may already be gone from the queue; not fatal
            self.logger.warning(str(err))

    new_append_list = []
    for task in self.lazy_append_list:
        if task.id in self.lazy_stop_list:
            # stopped before it was ever queued: report, don't queue
            self.jobmgr.report(task.username, task.id, 'stopped')
        else:
            new_append_list.append(task)

    self.lazy_append_list = new_append_list
    self.lazy_stop_list.clear()
    if self.lazy_append_list:
        self.task_queue.extend(self.lazy_append_list)
        self.lazy_append_list.clear()
    self.task_queue = sorted(self.task_queue, key=lambda x: x.priority)
|
||||
def start_vnode(self, subtask):
    """Ask the subtask's worker (via gRPC) to start its vnode.

    On success marks the vnode started, records the start timestamp for
    billing, and adds its cpu/gpu to the worker's usage counters.
    Returns [True, ''] or [False, exception].
    """
    try:
        self.logger.info('[task_processor] Starting vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.start_vnode(subtask.vnode_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.vnode_started = True
    subtask.start_at = time.time()
    # account the vnode's resources against its worker
    self.cpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.cpu
    self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
    return [True, '']
|
||||
@subtask_lock
def stop_vnode(self, subtask):
    """Ask the subtask's worker (via gRPC) to stop its vnode.

    Idempotent: returns immediately when the vnode is not started.
    On success accumulates the vnode's uptime into running_time (billing)
    and releases its cpu/gpu from the worker's usage counters.
    Returns [True, ''] or [False, exception]. Serialized per-subtask.
    """
    if not subtask.vnode_started:
        return [True, ""]
    try:
        self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.stop_vnode(subtask.vnode_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.vnode_started = False
    subtask.end_at = time.time()
    # billable uptime for this run of the vnode
    subtask.running_time += subtask.end_at - subtask.start_at
    self.cpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.cpu
    self.gpu_usage[subtask.worker] -= subtask.vnode_info.vnode.instance.gpu
    return [True, '']
|
||||
def start_subtask(self, subtask):
    """Ask the subtask's worker (via gRPC) to launch the task command
    inside its vnode. Returns [True, ''] or [False, exception]."""
    try:
        self.logger.info('[task_processor] Starting task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.start_task(subtask.command_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status_reason = str(e)
        return [False, e]
    subtask.task_started = True
    return [True, '']
|
||||
def stop_subtask(self, subtask):
    """Ask the subtask's worker (via gRPC) to stop the running task command.

    On RPC failure the subtask is marked FAILED (unlike start_subtask,
    which leaves retry handling to the caller).
    Returns [True, ''] or [False, exception].
    """
    try:
        self.logger.info('[task_processor] Stopping task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
        channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
        stub = WorkerStub(channel)
        response = stub.stop_task(subtask.command_info)
        if response.status != Reply.ACCEPTED:
            raise Exception(response.message)
    except Exception as e:
        self.logger.error('[task_processor] rpc error message: %s' % e)
        subtask.status = FAILED
        subtask.status_reason = str(e)
        return [False, e]
    subtask.task_started = False
    return [True, '']
|
||||
@data_lock('network_lock')
def acquire_task_ips(self, task):
    """Allocate a free subnet offset for the task (idempotent) and return it."""
    self.logger.info("[acquire_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
    if task.task_base_ip == None:
        task.task_base_ip = self.free_nets.pop(0)
    return task.task_base_ip
||||
@data_lock('network_lock')
def release_task_ips(self, task):
    """Return the task's subnet offset to the free pool (no-op if unallocated)."""
    self.logger.info("[release_task_ips] user(%s) task(%s) net(%s)" % (task.username, task.id, str(task.task_base_ip)))
    if task.task_base_ip == None:
        return
    self.free_nets.append(task.task_base_ip)
    task.task_base_ip = None
|
||||
def setup_tasknet(self, task, workers=None):
    """Create the per-task bridge + gateway and GRE tunnels to each worker.

    The gateway takes the first address of the task's subnet (base+1);
    vnode addresses start at base+2 (see Task.gen_ips_from_base).
    Returns [True, gateway_ip] or [False, reason].
    """
    taskid = task.id
    username = task.username
    brname = "docklet-batch-%s-%s"%(username, taskid)
    gwname = taskid
    if task.task_base_ip == None:
        return [False, "task.task_base_ip is None!"]
    gatewayip = int_to_ip(self.base_ip + task.task_base_ip + 1)
    gatewayipcidr = gatewayip + "/" + str(32-self.task_cidr)
    netcontrol.new_bridge(brname)
    netcontrol.setup_gw(brname,gwname,gatewayipcidr,0,0)

    # tunnel the bridge to every remote worker hosting a subtask
    for wip in workers:
        if wip != self.master_ip:
            netcontrol.setup_gre(brname,wip)
    return [True, gatewayip]
|
||||
def remove_tasknet(self, task):
    """Tear down the per-task bridge created by setup_tasknet."""
    bridge = "docklet-batch-%s-%s" % (task.username, task.id)
    netcontrol.del_bridge(bridge)
|
||||
def task_processor(self, task, sub_task_list):
    """Bring a scheduled task up: allocate its network, start every vnode,
    then launch the command in each vnode.

    Failures at any stage go through SubTask.waiting_for_retry, which
    either requeues the subtask (reported 'retrying') or fails the task.
    """
    task.status = RUNNING
    self.jobmgr.report(task.username, task.id, 'running')

    # allocate subnet, per-vnode IPs and the shared hosts file
    self.acquire_task_ips(task)
    task.gen_ips_from_base(self.base_ip)
    task.gen_hosts()
    [success, gwip] = self.setup_tasknet(task, [sub_task.worker for sub_task in sub_task_list])
    if not success:
        self.release_task_ips(task)
        return [False, gwip]

    placed_workers = []

    start_all_vnode_success = True
    # start vnodes (phase 1)
    for sub_task in sub_task_list:
        vnode_info = sub_task.vnode_info
        vnode_info.vnode.hostname = "batch-" + str(vnode_info.vnodeid % task.max_size)
        if sub_task.vnode_started:
            # already running from a previous (partially successful) attempt
            continue

        username = sub_task.root_task.username
        ipaddr = task.ips[vnode_info.vnodeid % task.max_size] + "/" + str(32-self.task_cidr)
        brname = "docklet-batch-%s-%s" % (username, sub_task.root_task.id)
        networkinfo = Network(ipaddr=ipaddr, gateway=gwip, masterip=self.master_ip, brname=brname)
        vnode_info.vnode.network.CopyFrom(networkinfo)

        placed_workers.append(sub_task.worker)
        [success, msg] = self.start_vnode(sub_task)
        if not success:
            sub_task.waiting_for_retry("Fail to start vnode.")
            if sub_task.status == WAITING:
                self.jobmgr.report(task.username, task.id, 'retrying', "Fail to start vnode.")
            # force re-placement on the next attempt
            sub_task.worker = None
            start_all_vnode_success = False

    if not start_all_vnode_success:
        return

    # start the commands (phase 2) — only once every vnode is up
    for sub_task in sub_task_list:
        task_info = sub_task.command_info
        if task_info is None or sub_task.status == RUNNING:
            # nothing to run in this vnode, or already running
            sub_task.status = RUNNING
            continue
        # fresh per-attempt token so stale worker reports are rejected
        task_info.token = ''.join(random.sample(string.ascii_letters + string.digits, 8))

        [success, msg] = self.start_subtask(sub_task)
        if success:
            sub_task.status = RUNNING
        else:
            sub_task.waiting_for_retry("Fail to start task.")
            if sub_task.status == WAITING:
                self.jobmgr.report(task.username, task.id, 'retrying', "Fail to start task.")
||||
def clear_sub_tasks(self, sub_task_list):
    """Tear down every subtask in the list (running command, then vnode)."""
    for entry in sub_task_list:
        self.clear_sub_task(entry)
||||
def clear_sub_task(self, sub_task):
    """Stop the subtask's command (if running) and then its vnode (if started)."""
    if sub_task.task_started:
        self.stop_subtask(sub_task)
    if sub_task.vnode_started:
        self.stop_vnode(sub_task)
|
||||
@data_lock('stop_lock')
def lazy_stop_task(self, taskid):
    """Queue a task id to be stopped on the next sort_out_task_queue pass."""
    self.lazy_stop_list.append(taskid)
|
||||
def stop_remove_task(self, task):
    """Tear down a task completely: subtasks, subnet allocation, and bridge."""
    if task is None:
        return
    self.logger.info("[taskmgr] stop and remove task(%s)"%task.id)
    self.clear_sub_tasks(task.subtask_list)
    self.release_task_ips(task)
    self.remove_tasknet(task)
|
||||
def check_task_completed(self, task):
    """If no subtask with a command is still RUNNING/WAITING, finalize the task.

    Finalizing means: tear it down, schedule removal from the queue,
    compute billing, and report 'failed' or 'finished' to the JobMgr.
    Returns False while any subtask is still live, True after finalizing.
    """
    if task.status == RUNNING or task.status == WAITING:
        for sub_task in task.subtask_list:
            # command-less subtasks never "complete"; skip them
            if sub_task.command_info != None and (sub_task.status == RUNNING or sub_task.status == WAITING):
                return False
    self.logger.info('task %s finished, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
    self.stop_remove_task(task)
    self.lazy_delete_list.append(task)
    running_time, billing = task.get_billing()
    self.logger.info('task %s running_time:%s billing:%d'%(task.id, str(running_time), billing))
    running_time = math.ceil(running_time)
    if task.status == FAILED:
        self.jobmgr.report(task.username,task.id,"failed",task.failed_reason,task.subtask_list[0].max_retry_count+1, running_time, billing)
    else:
        self.jobmgr.report(task.username,task.id,'finished',running_time=running_time,billing=billing)
    return True
|
||||
# this method is called when worker send heart-beat rpc request
def on_task_report(self, report):
    """Apply a per-vnode status report from a worker.

    Validates the task and the per-attempt token, then:
    FAILED/TIMEOUT -> tear down and retry (or fail the task);
    OUTPUTERROR    -> tear down and fail the task immediately;
    COMPLETED      -> mark completed and tear down.
    """
    self.logger.info('[on_task_report] receive task report: id %s-%d, status %d' % (report.taskid, report.vnodeid, report.subTaskStatus))
    task = self.get_task(report.taskid)
    if task == None:
        self.logger.error('[on_task_report] task not found')
        return

    sub_task = task.subtask_list[report.vnodeid]
    # token mismatch means the report belongs to a previous attempt
    if sub_task.command_info.token != report.token:
        self.logger.warning('[on_task_report] wrong token, %s %s' % (sub_task.command_info.token, report.token))
        return
    username = task.username

    if sub_task.status != RUNNING:
        # unexpected, but the report is still processed below
        self.logger.error('[on_task_report] receive task report when vnode is not running')

    sub_task.status_reason = report.errmsg
    sub_task.task_started = False

    if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
        self.clear_sub_task(sub_task)
        sub_task.waiting_for_retry(report.errmsg)
        self.logger.info('task %s report failed, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
        if sub_task.status == WAITING:
            self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
    elif report.subTaskStatus == OUTPUTERROR:
        # output errors are not retried: fail the whole task
        self.clear_sub_task(sub_task)
        sub_task.status = FAILED
        task.status = FAILED
        task.failed_reason = report.errmsg
    elif report.subTaskStatus == COMPLETED:
        sub_task.status = report.subTaskStatus
        self.clear_sub_task(sub_task)
|
||||
# return task, workers
def task_scheduler(self):
    """Pick the next schedulable (task, sub_task_list) pair, FIFO with priority.

    Skips tasks queued for deletion/stop, reaps completed tasks, then:
    - at_same_time (gang) tasks need workers for ALL sub-tasks at once;
    - traditional tasks are scheduled one WAITING sub-task at a time.
    Returns (None, None) when nothing can be placed right now.
    NOTE(review): when a task has waiting sub-tasks but no capacity, the
    scheduler returns instead of trying later tasks -- this preserves FIFO
    order at the cost of head-of-line blocking; presumably intended.
    """
    self.logger.info('[task_scheduler] scheduling... (%d tasks remains)' % len(self.task_queue))

    for task in self.task_queue:
        if task in self.lazy_delete_list or task.id in self.lazy_stop_list:
            continue
        self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
        if self.check_task_completed(task):
            continue
        self.logger.info('schedule task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))

        if task.at_same_time:
            # parallel tasks: all-or-nothing placement
            if not self.has_waiting(task.subtask_list):
                continue
            workers = self.find_proper_workers(task.subtask_list)
            if len(workers) == 0:
                return None, None
            else:
                for i in range(len(workers)):
                    task.subtask_list[i].worker = workers[i]
                return task, task.subtask_list
        else:
            # traditional tasks: place sub-tasks independently
            has_waiting = False
            for sub_task in task.subtask_list:
                if sub_task.status == WAITING:
                    has_waiting = True
                    workers = self.find_proper_workers([sub_task])
                    if len(workers) > 0:
                        sub_task.worker = workers[0]
                        return task, [sub_task]
            if has_waiting:
                return None, None

    return None, None
|
||||
|
||||
def has_waiting(self, sub_task_list):
    """Return True when at least one sub-task is still in WAITING state."""
    return any(st.status == WAITING for st in sub_task_list)
|
||||
|
||||
def find_proper_workers(self, sub_task_list, all_res=False):
    """Choose a worker ip for every WAITING sub-task in sub_task_list.

    Resource needs (cpu/memory/disk/gpu) are checked against each node's
    free resources; with all_res=True current batch usage is ignored and
    only the node's total capacity is checked (capacity pre-check at
    add_task time). Placement is greedy first-fit, and the per-call
    `nodes` snapshot is decremented so sub-tasks in the same call do not
    double-book a node.

    Returns the list of chosen worker ips (parallel to the waiting
    sub-tasks), or [] when any sub-task cannot be placed or nothing is
    waiting.
    """
    nodes = self.get_all_nodes()
    if nodes is None or len(nodes) == 0:
        self.logger.warning('[task_scheduler] running nodes not found')
        # BUGFIX: this used to `return None`, but every caller checks
        # `len(workers) == 0`, which raised TypeError on None.
        return []

    proper_workers = []
    has_waiting = False
    for sub_task in sub_task_list:
        if sub_task.status == WAITING:
            has_waiting = True
        # a sub-task whose vnode is already up stays on its current worker
        if sub_task.worker is not None and sub_task.vnode_started:
            proper_workers.append(sub_task.worker)
            continue
        needs = sub_task.vnode_info.vnode.instance
        self.logger.info('sub_task %s-%d' %(sub_task.root_task.id, sub_task.vnode_info.vnodeid))
        self.logger.info(str(needs))
        proper_worker = None
        for worker_ip, worker_info in nodes:
            self.logger.info('worker ip' + worker_ip)
            self.logger.info('cpu usage: ' + str(self.get_cpu_usage(worker_ip)))
            self.logger.info('gpu usage: ' + str(self.get_gpu_usage(worker_ip)))
            self.logger.info('worker_info: ' + str(worker_info))
            # `(not all_res)` is 0/1: usage counts only when all_res is False
            if needs.cpu + (not all_res) * self.get_cpu_usage(worker_ip) > worker_info['cpu']:
                continue
            elif needs.memory > worker_info['memory']:
                continue
            elif needs.disk > worker_info['disk']:
                continue
            # try not to assign non-gpu task to a worker with gpu
            #if needs['gpu'] == 0 and worker_info['gpu'] > 0:
                #continue
            elif needs.gpu + (not all_res) * self.get_gpu_usage(worker_ip) > worker_info['gpu']:
                continue
            else:
                # reserve the resources in this call's snapshot
                worker_info['cpu'] -= needs.cpu
                worker_info['memory'] -= needs.memory
                worker_info['gpu'] -= needs.gpu
                worker_info['disk'] -= needs.disk
                proper_worker = worker_ip
                break
        if proper_worker is not None:
            proper_workers.append(proper_worker)
        else:
            # one unplaceable sub-task fails the whole request
            return []
    if has_waiting:
        return proper_workers
    else:
        return []
|
||||
|
||||
def get_all_nodes(self):
    """Return [(ip, resource_info_dict), ...] for every running batch worker.

    Resource information is fetched fresh on every call (a node-info cache
    existed here once but was disabled upstream).
    """
    ips = self.nodemgr.get_batch_nodeips()
    return [(ip, self.get_worker_resource_info(ip)) for ip in ips]
|
||||
|
||||
def is_alive(self, worker):
    """True iff `worker` (an ip string) is among the running batch nodes."""
    return worker in self.nodemgr.get_batch_nodeips()
|
||||
|
||||
def get_worker_resource_info(self, worker_ip):
    """Fetch a worker's monitor data and normalise its free resources.

    Returns a dict with 'cpu' (core count), 'memory' (MiB, from free +
    buffers + cached KiB), 'disk' (MiB, summed over all volumes) and
    'gpu' (card count).
    """
    raw = self.monitor_fetcher(worker_ip).info
    mem = raw['meminfo']
    free_mem_kib = mem['buffers'] + mem['cached'] + mem['free']
    free_disk_bytes = sum(d['free'] for d in raw['diskinfo'])
    return {
        'cpu': len(raw['cpuconfig']),
        'memory': free_mem_kib / 1024,
        'disk': free_disk_bytes / 1024 / 1024,
        'gpu': len(raw['gpuinfo']),
    }
|
||||
|
||||
def get_cpu_usage(self, worker_ip):
    """Return the CPU amount currently booked on `worker_ip` (0 if unseen).

    An unseen worker is recorded with usage 0 as a side effect, matching
    the original behaviour.
    """
    # setdefault replaces the former bare `except:`, which silently
    # swallowed every error rather than just the missing-key case
    return self.cpu_usage.setdefault(worker_ip, 0)
|
||||
|
||||
|
||||
def get_gpu_usage(self, worker_ip):
    """Return the GPU count currently booked on `worker_ip` (0 if unseen).

    An unseen worker is recorded with usage 0 as a side effect, matching
    the original behaviour.
    """
    # setdefault replaces the former bare `except:`, which silently
    # swallowed every error rather than just the missing-key case
    return self.gpu_usage.setdefault(worker_ip, 0)
|
||||
|
||||
# save the task information into database
# called when jobmgr assign task to taskmgr
@data_lock('add_lock')
def add_task(self, username, taskid, json_task, task_priority=1):
    """Decode a json task description into a Task of gRPC messages and queue it.

    Rejects (reporting 'failed' to jobmgr and returning False) when the
    requested vnode count exceeds what the task subnet can address, or when
    no worker has enough total capacity (all_res=True pre-check). On success
    the task is appended to lazy_append_list and True is returned.
    """
    self.logger.info('[taskmgr add_task] receive task %s' % taskid)

    # maps the image-type fragment of json_task['image'] to the proto enum
    image_dict = {
        "private": Image.PRIVATE,
        "base": Image.BASE,
        "public": Image.PUBLIC
    }
    # a /task_cidr subnet minus network & broadcast addresses
    max_size = (1 << self.task_cidr) - 2
    if int(json_task['vnodeCount']) > max_size:
        # tell jobmgr
        self.jobmgr.report(username,taskid,"failed","vnodeCount exceed limits.")
        return False
    task = Task(
        task_id = taskid,
        username = username,
        # all vnode must be started at the same time
        at_same_time = 'atSameTime' in json_task.keys(),
        priority = task_priority,
        max_size = (1 << self.task_cidr) - 2,
        task_infos = [{
            'max_retry_count': int(json_task['retryCount']),
            'vnode_info': VNodeInfo(
                taskid = taskid,
                username = username,
                vnode = VNode(
                    # 'image' is encoded as "<name>_<owner>_<type>"
                    image = Image(
                        name = json_task['image'].split('_')[0],
                        type = image_dict[json_task['image'].split('_')[2]],
                        owner = username if not json_task['image'].split('_')[1] else json_task['image'].split('_')[1]),
                    instance = Instance(
                        cpu = int(json_task['cpuSetting']),
                        memory = int(json_task['memorySetting']),
                        disk = int(json_task['diskSetting']),
                        gpu = int(json_task['gpuSetting'])),
                    mount = [Mount(
                        provider = json_task['mapping'][mapping_key]['mappingProvider'],
                        localPath = json_task['mapping'][mapping_key]['mappingMountpath'],
                        remotePath = json_task['mapping'][mapping_key]['mappingBucketName'],
                        accessKey = json_task['mapping'][mapping_key]['mappingAccessKey'],
                        secretKey = json_task['mapping'][mapping_key]['mappingSecretKey'],
                        other = json_task['mapping'][mapping_key]['mappingEndpoint']
                        )
                        for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
                    ),
                ),
            'command_info': TaskInfo(
                taskid = taskid,
                username = username,
                parameters = Parameters(
                    command = Command(
                        commandLine = json_task['command'],
                        packagePath = json_task['srcAddr'],
                        envVars = {}),
                    stderrRedirectPath = json_task.get('stdErrRedPth',""),
                    stdoutRedirectPath = json_task.get('stdOutRedPth',"")),
                timeout = int(json_task['expTime'])
                # commands are executed in all vnodes / only excuted in the first vnode
                # if in traditional mode, commands will be executed in all vnodes
                ) if (json_task['runon'] == 'all' or vnode_index == 0) else None
            } for vnode_index in range(int(json_task['vnodeCount']))])

    if task.at_same_time:
        # gang task: all sub-tasks must fit simultaneously
        workers = self.find_proper_workers(task.subtask_list, all_res=True)
        if len(workers) == 0:
            task.status = FAILED
            # tell jobmgr
            self.jobmgr.report(username,taskid,"failed","Resources needs exceed limits")
            return False
    else:
        # traditional task: each sub-task must fit somewhere on its own
        for sub_task in task.subtask_list:
            workers = self.find_proper_workers([sub_task], all_res=True)
            if len(workers) == 0:
                task.status = FAILED
                # tell jobmgr
                self.jobmgr.report(username,taskid,"failed","Resources needs exceed limits")
                return False
    self.lazy_append_list.append(task)
    return True
|
||||
|
||||
|
||||
@data_lock('task_queue_lock')
def get_task_list(self):
    """Return a shallow snapshot of the task queue, taken under the lock."""
    snapshot = self.task_queue.copy()
    return snapshot
|
||||
|
||||
|
||||
@data_lock('task_queue_lock')
def get_task(self, taskid):
    """Linear scan of the queue; returns the Task with `taskid`, or None."""
    return next((t for t in self.task_queue if t.id == taskid), None)
|
||||
|
||||
|
||||
def set_jobmgr(self, jobmgr):
    """Late-bind the job manager used for task status reporting."""
    self.jobmgr = jobmgr
|
||||
|
||||
|
||||
# get names of all the batch containers of the user
def get_user_batch_containers(self, username):
    """Always empty: per-user batch container tracking is currently disabled."""
    return []
|
|
@ -0,0 +1,41 @@
|
|||
import sys
|
||||
if sys.path[0].endswith("master"):
|
||||
sys.path[0] = sys.path[0][:-6]
|
||||
|
||||
import grpc,time
|
||||
|
||||
from protos import rpc_pb2, rpc_pb2_grpc
|
||||
import random, string
|
||||
|
||||
def run():
    """Manually send one test task to a local worker over gRPC and print the reply.

    NOTE(review): uses the OLD worker API (process_task, TaskInfo with
    id/instanceid/cluster fields) which does not match the rpc.proto in this
    tree (start_task, taskid/vnodeid) -- presumably a stale client; confirm.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)

    comm = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}'
    paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")

    img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet")
    inst = rpc_pb2.Instance(cpu=1, memory=1000, disk=1000, gpu=0)
    # NOTE(review): hardcoded cloud-storage access key committed in test code --
    # rotate this credential and load it from config/env instead.
    mnt = rpc_pb2.Mount(localPath="",provider='aliyun',remotePath="test-for-docklet",other="oss-cn-beijing.aliyuncs.com",accessKey="LTAIdl7gmmIhfqA9",secretKey="")
    clu = rpc_pb2.Cluster(image=img, instance=inst, mount=[])

    # random 8-char token identifies this run of the task
    task = rpc_pb2.TaskInfo(id="test",username="root",instanceid=1,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=60000,token=''.join(random.sample(string.ascii_letters + string.digits, 8)))

    response = stub.process_task(task)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
def stop_task():
    """Manually ask a local worker to stop the 'test' task over gRPC.

    NOTE(review): TaskMsg here uses instanceid/instanceStatus, but the
    rpc.proto in this tree defines vnodeid/subTaskStatus -- stale client;
    confirm before use.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)

    taskmsg = rpc_pb2.TaskMsg(taskid="test",username="root",instanceid=1,instanceStatus=rpc_pb2.COMPLETED,token="test",errmsg="")
    reportmsg = rpc_pb2.ReportMsg(taskmsgs = [taskmsg])

    response = stub.stop_tasks(reportmsg)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
if __name__ == '__main__':
|
||||
#for i in range(10):
|
||||
run()
|
||||
#time.sleep(4)
|
||||
#stop_task()
|
|
@ -0,0 +1,193 @@
|
|||
import master.taskmgr
|
||||
from concurrent import futures
|
||||
import grpc
|
||||
from protos.rpc_pb2 import *
|
||||
from protos.rpc_pb2_grpc import *
|
||||
import threading, json, time, random
|
||||
from utils import env
|
||||
|
||||
|
||||
class SimulatedNodeMgr():
    """Fake node manager: reports a single local batch worker."""

    _NODE_IPS = ['0.0.0.0']

    def get_batch_nodeips(self):
        # fresh list on every call, like the original literal
        return list(self._NODE_IPS)
|
||||
|
||||
|
||||
class SimulatedMonitorFetcher():
    """Fake monitor fetcher: fixed resource figures for any worker ip."""

    def __init__(self, ip):
        gib_in_kib = 1024 * 1024  # one GiB expressed in KiB
        self.info = {
            'cpuconfig': [1] * 8,                 # 8 cores
            'meminfo': {                          # (kb) simulate 8 GB per field
                'free': 8 * gib_in_kib,
                'buffers': 8 * gib_in_kib,
                'cached': 8 * gib_in_kib,
            },
            'diskinfo': [
                {'free': 16 * 1024 * 1024 * 1024},  # (b) simulate 16 GB disk
            ],
            'gpuinfo': [1, 1],                    # two GPUs
        }
|
||||
|
||||
|
||||
class SimulatedTaskController(WorkerServicer):
    """In-process fake of the worker-side gRPC servicer.

    Accepts every vnode/task request; started tasks are handed to the
    SimulatedWorker given at construction time so it can fake progress
    reports.
    """

    def __init__(self, worker):
        self.worker = worker

    def start_vnode(self, vnodeinfo, context):
        print('[SimulatedTaskController] start vnode, taskid [%s] vnodeid [%d]' % (vnodeinfo.taskid, vnodeinfo.vnodeid))
        return Reply(status=Reply.ACCEPTED,message="")

    def stop_vnode(self, vnodeinfo, context):
        print('[SimulatedTaskController] stop vnode, taskid [%s] vnodeid [%d]' % (vnodeinfo.taskid, vnodeinfo.vnodeid))
        return Reply(status=Reply.ACCEPTED,message="")

    def start_task(self, taskinfo, context):
        print('[SimulatedTaskController] start task, taskid [%s] vnodeid [%d] token [%s]' % (taskinfo.taskid, taskinfo.vnodeid, taskinfo.token))
        # BUGFIX: previously used the module-level global `worker` instead of
        # the instance stored by __init__ (which existed for exactly this)
        self.worker.process(taskinfo)
        return Reply(status=Reply.ACCEPTED,message="")

    def stop_task(self, taskinfo, context):
        print('[SimulatedTaskController] stop task, taskid [%s] vnodeid [%d] token [%s]' % (taskinfo.taskid, taskinfo.vnodeid, taskinfo.token))
        return Reply(status=Reply.ACCEPTED,message="")
|
||||
|
||||
|
||||
class SimulatedWorker(threading.Thread):
    """Thread that fakes a docklet batch worker.

    Serves the Worker gRPC interface on BATCH_WORKER_PORT and, every 5
    seconds, randomly reports each queued task back to the master as
    RUNNING (25%), COMPLETED (25%), FAILED (25%) or stays silent (25%).
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.thread_stop = False  # set by stop() to end the run loop
        self.tasks = []           # tasks handed over via process()

    def run(self):
        worker_port = env.getenv('BATCH_WORKER_PORT')
        server = grpc.server(futures.ThreadPoolExecutor(max_workers=5))
        add_WorkerServicer_to_server(SimulatedTaskController(self), server)
        server.add_insecure_port('[::]:' + worker_port)
        server.start()
        while not self.thread_stop:
            for task in self.tasks:
                seed = random.random()
                if seed < 0.25:
                    # still running: progress report only
                    report(task.taskid, task.vnodeid, RUNNING, task.token)
                elif seed < 0.5:
                    report(task.taskid, task.vnodeid, COMPLETED, task.token)
                    # removing from the list we are iterating is only safe
                    # because we break out of the loop right after
                    self.tasks.remove(task)
                    break
                elif seed < 0.75:
                    report(task.taskid, task.vnodeid, FAILED, task.token)
                    self.tasks.remove(task)
                    break
                else:
                    pass  # 25% chance: report nothing this round
            time.sleep(5)
        server.stop(0)

    def stop(self):
        # ask run() to exit; takes effect within one 5s poll period
        self.thread_stop = True

    def process(self, task):
        # called by SimulatedTaskController when the master starts a task
        self.tasks.append(task)
|
||||
|
||||
|
||||
class SimulatedJobMgr(threading.Thread):
    """Thread that fakes the docklet job manager for taskmgr tests.

    Idles until stop() is called, prints task status reports, and can hand
    a synthetic task description to a TaskMgr via assignTask().
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.thread_stop = False  # set by stop() to end the run loop

    def run(self):
        # BUGFIX: this loop used to end with `server.stop(0)` copied from
        # SimulatedWorker; `server` was never defined here and raised
        # NameError when the thread was stopped.
        while not self.thread_stop:
            time.sleep(5)

    def stop(self):
        self.thread_stop = True

    def report(self, username, taskid, status, reason="", tried_times=1, running_time=0, billing=0):
        # BUGFIX: the old signature report(self, task) did not match any
        # call site -- TaskMgr always calls
        # jobmgr.report(username, taskid, status, reason/kwargs...), which
        # raised TypeError as soon as a task was reported.
        print('[SimulatedJobMgr] task[%s] status %s %s' % (taskid, status, reason))

    def assignTask(self, taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu):
        """Build a json-style task dict and queue it on `taskmgr`."""
        task = {}
        task['instCount'] = instance_count
        # BUGFIX: TaskMgr.add_task reads 'vnodeCount' and 'runon'; without
        # these keys every assigned task died with KeyError.
        task['vnodeCount'] = instance_count
        task['runon'] = 'all'
        task['retryCount'] = retry_count
        task['expTime'] = timeout
        task['at_same_time'] = True
        task['multicommand'] = True
        task['command'] = 'ls'
        task['srcAddr'] = ''
        task['envVars'] = {'a':'1'}
        task['stdErrRedPth'] = ''
        task['stdOutRedPth'] = ''
        task['image'] = 'root_root_base'
        task['cpuSetting'] = cpu
        task['memorySetting'] = memory
        task['diskSetting'] = disk
        # BUGFIX: the gpu parameter was ignored (hard-coded 0)
        task['gpuSetting'] = gpu
        task['mapping'] = []

        taskmgr.add_task('root', taskid, task)
|
||||
|
||||
|
||||
class SimulatedLogger():
    """Console logger standing in for the real docklet logger."""

    def _emit(self, tag, msg):
        # single place that formats the level prefix
        print(tag + msg)

    def info(self, msg):
        self._emit('[INFO] ', msg)

    def warning(self, msg):
        self._emit('[WARNING] ', msg)

    def error(self, msg):
        self._emit('[ERROR] ', msg)
|
||||
|
||||
|
||||
def test():
    """Smoke test: start a fake worker, a fake jobmgr (left unattached) and a
    real TaskMgr, then queue one 2-vnode task.

    Leaves worker/jobmgr/taskmgr in module globals so report()/stop() can
    reach them.
    """
    global worker
    global jobmgr
    global taskmgr

    worker = SimulatedWorker()
    worker.start()
    jobmgr = SimulatedJobMgr()
    jobmgr.start()

    taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, master_ip='', scheduler_interval=2, external_logger=SimulatedLogger())
    # taskmgr.set_jobmgr(jobmgr)
    taskmgr.start()

    add('task_0', instance_count=2, retry_count=2, timeout=60, cpu=2, memory=2048, disk=2048, gpu=0)
|
||||
|
||||
|
||||
def test2():
    """Variant of test() without a worker: fake jobmgr attached to a real
    TaskMgr, then one 2-vnode task queued."""
    global jobmgr
    global taskmgr
    jobmgr = SimulatedJobMgr()
    jobmgr.start()

    taskmgr = master.taskmgr.TaskMgr(SimulatedNodeMgr(), SimulatedMonitorFetcher, master_ip='', scheduler_interval=2, external_logger=SimulatedLogger())
    taskmgr.set_jobmgr(jobmgr)
    taskmgr.start()

    add('task_0', instance_count=2, retry_count=2, timeout=60, cpu=2, memory=2048, disk=2048, gpu=0)
|
||||
|
||||
|
||||
|
||||
def add(taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu):
    # Convenience wrapper: route a synthetic task through the global
    # jobmgr into the global taskmgr (both set up by test()/test2()).
    global jobmgr
    global taskmgr
    jobmgr.assignTask(taskmgr, taskid, instance_count, retry_count, timeout, cpu, memory, disk, gpu)
|
||||
|
||||
|
||||
def report(taskid, instanceid, status, token):
    """Deliver one sub-task status report to the master over gRPC, exactly as
    a real worker would (called by SimulatedWorker.run)."""
    global taskmgr

    master_port = env.getenv('BATCH_MASTER_PORT')
    channel = grpc.insecure_channel('%s:%s' % ('0.0.0.0', master_port))
    stub = MasterStub(channel)
    response = stub.report(ReportMsg(taskmsgs=[TaskMsg(taskid=taskid, username='root', vnodeid=instanceid, subTaskStatus=status, token=token)]))
|
||||
|
||||
|
||||
def stop():
    # Shut down all three global actors started by test(); each stop()
    # only sets a flag, so threads exit within one poll period.
    global worker
    global jobmgr
    global taskmgr

    worker.stop()
    jobmgr.stop()
    taskmgr.stop()
|
|
@ -0,0 +1,66 @@
|
|||
import sys
|
||||
if sys.path[0].endswith("master"):
|
||||
sys.path[0] = sys.path[0][:-6]
|
||||
|
||||
import grpc,time
|
||||
|
||||
from protos import rpc_pb2, rpc_pb2_grpc
|
||||
import random, string
|
||||
|
||||
def run():
    """Manually ask a local worker to start one test vnode over gRPC.

    The task-start part of the old flow is kept commented out; only
    start_vnode is exercised here.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)

    #comm = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}'
    #paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")

    img = rpc_pb2.Image(name="base", type=rpc_pb2.Image.BASE, owner="docklet")
    inst = rpc_pb2.Instance(cpu=1, memory=1000, disk=1000, gpu=0)
    # NOTE(review): hardcoded cloud-storage access key committed in test code --
    # rotate this credential and load it from config/env instead.
    mnt = rpc_pb2.Mount(localPath="",provider='aliyun',remotePath="test-for-docklet",other="oss-cn-beijing.aliyuncs.com",accessKey="LTAIdl7gmmIhfqA9",secretKey="")
    network = rpc_pb2.Network(ipaddr="10.0.4.2/24",gateway="10.0.4.1",masterip="192.168.0.1",brname="batch-root-test")
    vnode = rpc_pb2.VNode(image=img, instance=inst, mount=[],network=network,hostname="batch-5")
    vnodeinfo = rpc_pb2.VNodeInfo(taskid="test",username="root",vnodeid=1,vnode=vnode)

    #task = rpc_pb2.TaskInfo(id="test",username="root",instanceid=1,instanceCount=1,maxRetryCount=1,parameters=paras,cluster=clu,timeout=60000,token=''.join(random.sample(string.ascii_letters + string.digits, 8)))

    response = stub.start_vnode(vnodeinfo)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
def stop_task():
    """Manually ask a local worker to stop the 'test' task over gRPC.

    NOTE(review): TaskMsg here uses instanceid/instanceStatus and the call
    targets stop_tasks, but the rpc.proto in this tree defines
    vnodeid/subTaskStatus and a stop_task rpc -- stale client; confirm.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)

    taskmsg = rpc_pb2.TaskMsg(taskid="test",username="root",instanceid=1,instanceStatus=rpc_pb2.COMPLETED,token="test",errmsg="")
    reportmsg = rpc_pb2.ReportMsg(taskmsgs = [taskmsg])

    response = stub.stop_tasks(reportmsg)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
def stop_vnode():
    """Manually ask a local worker to stop the test vnode over gRPC.

    Only the bridge name is filled in; the worker presumably needs just the
    network info to tear the vnode down -- confirm against the worker side.
    """
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)
    network = rpc_pb2.Network(brname="batch-root-test")
    vnodeinfo = rpc_pb2.VNodeInfo(taskid="test",username="root",vnodeid=1,vnode=rpc_pb2.VNode(network=network))

    response = stub.stop_vnode(vnodeinfo)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
def start_task():
    """Manually ask a local worker to run the test command over gRPC,
    matching the start_task rpc of rpc.proto (taskid/vnodeid/token)."""
    channel = grpc.insecure_channel('localhost:50051')
    stub = rpc_pb2_grpc.WorkerStub(channel)

    comm = rpc_pb2.Command(commandLine="ls /root;sleep 5;ls /root", packagePath="/root", envVars={'test1':'10','test2':'20'}) # | awk '{print \"test\\\"\\n\"}'
    paras = rpc_pb2.Parameters(command=comm, stderrRedirectPath="/root/nfs/batch_{jobid}/", stdoutRedirectPath="/root/nfs/batch_{jobid}/")
    taskinfo = rpc_pb2.TaskInfo(taskid="test",username="root",vnodeid=1,parameters=paras,timeout=20,token="test")

    response = stub.start_task(taskinfo)
    print("Batch client received: " + str(response.status)+" "+response.message)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#for i in range(10):
|
||||
#run()
|
||||
#start_task()
|
||||
stop_vnode()
|
||||
#time.sleep(4)
|
||||
#stop_task()
|
|
@ -120,7 +120,7 @@ class VclusterMgr(object):
|
|||
return [False, "the size of disk is not big enough for the image"]
|
||||
clustersize = int(self.defaultsize)
|
||||
logger.info ("starting cluster %s with %d containers for %s" % (clustername, int(clustersize), username))
|
||||
workers = self.nodemgr.get_nodeips()
|
||||
workers = self.nodemgr.get_base_nodeips()
|
||||
image_json = json.dumps(image)
|
||||
groupname = json.loads(user_info)["data"]["group"]
|
||||
groupquota = json.loads(user_info)["data"]["groupinfo"]
|
||||
|
@ -206,7 +206,7 @@ class VclusterMgr(object):
|
|||
return [False, "cluster:%s not found" % clustername]
|
||||
if self.imgmgr.get_image_size(image) + 100 > int(setting["disk"]):
|
||||
return [False, "the size of disk is not big enough for the image"]
|
||||
workers = self.nodemgr.get_nodeips()
|
||||
workers = self.nodemgr.get_base_nodeips()
|
||||
if (len(workers) == 0):
|
||||
logger.warning("no workers to start containers, scale out failed")
|
||||
return [False, "no workers are running"]
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
syntax = "proto3";

// Master-side service: workers report sub-task status here.
service Master {
    rpc report (ReportMsg) returns (Reply) {}
}

// Worker-side service: the master drives vnode and task lifecycle.
service Worker {
    rpc start_vnode (VNodeInfo) returns (Reply) {}
    rpc start_task (TaskInfo) returns (Reply) {}
    rpc stop_task (TaskInfo) returns (Reply) {}
    rpc stop_vnode (VNodeInfo) returns (Reply) {}
}

message VNodeInfo {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    VNode vnode = 4; // vnode (cluster) configuration
}

message Reply {
    ReplyStatus status = 1; // result of the request
    string message = 2;

    enum ReplyStatus {
        ACCEPTED = 0;
        REFUSED = 1;
    }
}

message ReportMsg {
    repeated TaskMsg taskmsgs = 1;
}

message TaskMsg {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    Status subTaskStatus = 4; // sub-task status
    string token = 5;
    string errmsg = 6;
}

enum Status {
    WAITING = 0;
    RUNNING = 1;
    COMPLETED = 2;
    FAILED = 3;
    TIMEOUT = 4;
    OUTPUTERROR = 5;
}

message TaskInfo {
    string taskid = 1;
    string username = 2;
    int32 vnodeid = 3;
    Parameters parameters = 4; // command parameters
    int32 timeout = 5; // timeout threshold
    string token = 6;
}

message Parameters {
    Command command = 1; // command configuration
    string stderrRedirectPath = 2; // stderr redirect path
    string stdoutRedirectPath = 3; // stdout redirect path
}

message Command {
    string commandLine = 1; // command line to run
    string packagePath = 2; // working directory
    map<string, string> envVars = 3; // custom environment variables
}

message VNode {
    Image image = 1; // image configuration
    Instance instance = 2; // instance (resource) configuration
    repeated Mount mount = 3; // mount configuration
    Network network = 4; // network configuration
    string hostname = 5; // host name
}

message Network {
    string ipaddr = 1;
    string gateway = 2;
    string masterip = 3;
    string brname = 4;
}

message Image {
    string name = 1; // image name
    ImageType type = 2; // image type (base/public/private)
    string owner = 3; // image owner

    enum ImageType {
        BASE = 0;
        PUBLIC = 1;
        PRIVATE = 2;
    }
}

message Mount {
    string provider = 1;
    string localPath = 2; // local mount path
    string remotePath = 3; // remote path (bucket)
    string accessKey = 4;
    string secretKey = 5;
    string other = 6;
}

message Instance {
    int32 cpu = 1; // CPU, unit: cores (?)
    int32 memory = 2; // memory, unit: MB
    int32 disk = 3; // disk, unit: MB
    int32 gpu = 4; // GPUs, unit: cards
}
|
|
@ -0,0 +1,977 @@
|
|||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: rpc.proto
|
||||
|
||||
import sys
|
||||
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
|
||||
from google.protobuf.internal import enum_type_wrapper
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from google.protobuf import reflection as _reflection
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf import descriptor_pb2
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
name='rpc.proto',
|
||||
package='',
|
||||
syntax='proto3',
|
||||
serialized_pb=_b('\n\trpc.proto\"U\n\tVNodeInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x15\n\x05vnode\x18\x04 \x01(\x0b\x32\x06.VNode\"f\n\x05Reply\x12\"\n\x06status\x18\x01 \x01(\x0e\x32\x12.Reply.ReplyStatus\x12\x0f\n\x07message\x18\x02 \x01(\t\"(\n\x0bReplyStatus\x12\x0c\n\x08\x41\x43\x43\x45PTED\x10\x00\x12\x0b\n\x07REFUSED\x10\x01\"\'\n\tReportMsg\x12\x1a\n\x08taskmsgs\x18\x01 \x03(\x0b\x32\x08.TaskMsg\"{\n\x07TaskMsg\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1e\n\rsubTaskStatus\x18\x04 \x01(\x0e\x32\x07.Status\x12\r\n\x05token\x18\x05 \x01(\t\x12\x0e\n\x06\x65rrmsg\x18\x06 \x01(\t\"~\n\x08TaskInfo\x12\x0e\n\x06taskid\x18\x01 \x01(\t\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x0f\n\x07vnodeid\x18\x03 \x01(\x05\x12\x1f\n\nparameters\x18\x04 \x01(\x0b\x32\x0b.Parameters\x12\x0f\n\x07timeout\x18\x05 \x01(\x05\x12\r\n\x05token\x18\x06 \x01(\t\"_\n\nParameters\x12\x19\n\x07\x63ommand\x18\x01 \x01(\x0b\x32\x08.Command\x12\x1a\n\x12stderrRedirectPath\x18\x02 \x01(\t\x12\x1a\n\x12stdoutRedirectPath\x18\x03 \x01(\t\"\x8b\x01\n\x07\x43ommand\x12\x13\n\x0b\x63ommandLine\x18\x01 \x01(\t\x12\x13\n\x0bpackagePath\x18\x02 \x01(\t\x12&\n\x07\x65nvVars\x18\x03 \x03(\x0b\x32\x15.Command.EnvVarsEntry\x1a.\n\x0c\x45nvVarsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x7f\n\x05VNode\x12\x15\n\x05image\x18\x01 \x01(\x0b\x32\x06.Image\x12\x1b\n\x08instance\x18\x02 \x01(\x0b\x32\t.Instance\x12\x15\n\x05mount\x18\x03 \x03(\x0b\x32\x06.Mount\x12\x19\n\x07network\x18\x04 \x01(\x0b\x32\x08.Network\x12\x10\n\x08hostname\x18\x05 \x01(\t\"L\n\x07Network\x12\x0e\n\x06ipaddr\x18\x01 \x01(\t\x12\x0f\n\x07gateway\x18\x02 \x01(\t\x12\x10\n\x08masterip\x18\x03 \x01(\t\x12\x0e\n\x06\x62rname\x18\x04 \x01(\t\"t\n\x05Image\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1e\n\x04type\x18\x02 
\x01(\x0e\x32\x10.Image.ImageType\x12\r\n\x05owner\x18\x03 \x01(\t\".\n\tImageType\x12\x08\n\x04\x42\x41SE\x10\x00\x12\n\n\x06PUBLIC\x10\x01\x12\x0b\n\x07PRIVATE\x10\x02\"u\n\x05Mount\x12\x10\n\x08provider\x18\x01 \x01(\t\x12\x11\n\tlocalPath\x18\x02 \x01(\t\x12\x12\n\nremotePath\x18\x03 \x01(\t\x12\x11\n\taccessKey\x18\x04 \x01(\t\x12\x11\n\tsecretKey\x18\x05 \x01(\t\x12\r\n\x05other\x18\x06 \x01(\t\"B\n\x08Instance\x12\x0b\n\x03\x63pu\x18\x01 \x01(\x05\x12\x0e\n\x06memory\x18\x02 \x01(\x05\x12\x0c\n\x04\x64isk\x18\x03 \x01(\x05\x12\x0b\n\x03gpu\x18\x04 \x01(\x05*[\n\x06Status\x12\x0b\n\x07WAITING\x10\x00\x12\x0b\n\x07RUNNING\x10\x01\x12\r\n\tCOMPLETED\x10\x02\x12\n\n\x06\x46\x41ILED\x10\x03\x12\x0b\n\x07TIMEOUT\x10\x04\x12\x0f\n\x0bOUTPUTERROR\x10\x05\x32(\n\x06Master\x12\x1e\n\x06report\x12\n.ReportMsg\x1a\x06.Reply\"\x00\x32\x96\x01\n\x06Worker\x12#\n\x0bstart_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x12!\n\nstart_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12 \n\tstop_task\x12\t.TaskInfo\x1a\x06.Reply\"\x00\x12\"\n\nstop_vnode\x12\n.VNodeInfo\x1a\x06.Reply\"\x00\x62\x06proto3')
|
||||
)
|
||||
|
||||
_STATUS = _descriptor.EnumDescriptor(
|
||||
name='Status',
|
||||
full_name='Status',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
values=[
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='WAITING', index=0, number=0,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='RUNNING', index=1, number=1,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='COMPLETED', index=2, number=2,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='FAILED', index=3, number=3,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='TIMEOUT', index=4, number=4,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='OUTPUTERROR', index=5, number=5,
|
||||
options=None,
|
||||
type=None),
|
||||
],
|
||||
containing_type=None,
|
||||
options=None,
|
||||
serialized_start=1249,
|
||||
serialized_end=1340,
|
||||
)
|
||||
_sym_db.RegisterEnumDescriptor(_STATUS)
|
||||
|
||||
Status = enum_type_wrapper.EnumTypeWrapper(_STATUS)
|
||||
WAITING = 0
|
||||
RUNNING = 1
|
||||
COMPLETED = 2
|
||||
FAILED = 3
|
||||
TIMEOUT = 4
|
||||
OUTPUTERROR = 5
|
||||
|
||||
|
||||
_REPLY_REPLYSTATUS = _descriptor.EnumDescriptor(
|
||||
name='ReplyStatus',
|
||||
full_name='Reply.ReplyStatus',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
values=[
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='ACCEPTED', index=0, number=0,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='REFUSED', index=1, number=1,
|
||||
options=None,
|
||||
type=None),
|
||||
],
|
||||
containing_type=None,
|
||||
options=None,
|
||||
serialized_start=162,
|
||||
serialized_end=202,
|
||||
)
|
||||
_sym_db.RegisterEnumDescriptor(_REPLY_REPLYSTATUS)
|
||||
|
||||
_IMAGE_IMAGETYPE = _descriptor.EnumDescriptor(
|
||||
name='ImageType',
|
||||
full_name='Image.ImageType',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
values=[
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='BASE', index=0, number=0,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='PUBLIC', index=1, number=1,
|
||||
options=None,
|
||||
type=None),
|
||||
_descriptor.EnumValueDescriptor(
|
||||
name='PRIVATE', index=2, number=2,
|
||||
options=None,
|
||||
type=None),
|
||||
],
|
||||
containing_type=None,
|
||||
options=None,
|
||||
serialized_start=1014,
|
||||
serialized_end=1060,
|
||||
)
|
||||
_sym_db.RegisterEnumDescriptor(_IMAGE_IMAGETYPE)
|
||||
|
||||
|
||||
_VNODEINFO = _descriptor.Descriptor(
|
||||
name='VNodeInfo',
|
||||
full_name='VNodeInfo',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='taskid', full_name='VNodeInfo.taskid', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='username', full_name='VNodeInfo.username', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='vnodeid', full_name='VNodeInfo.vnodeid', index=2,
|
||||
number=3, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='vnode', full_name='VNodeInfo.vnode', index=3,
|
||||
number=4, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=13,
|
||||
serialized_end=98,
|
||||
)
|
||||
|
||||
|
||||
_REPLY = _descriptor.Descriptor(
|
||||
name='Reply',
|
||||
full_name='Reply',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='status', full_name='Reply.status', index=0,
|
||||
number=1, type=14, cpp_type=8, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='message', full_name='Reply.message', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
_REPLY_REPLYSTATUS,
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=100,
|
||||
serialized_end=202,
|
||||
)
|
||||
|
||||
|
||||
_REPORTMSG = _descriptor.Descriptor(
|
||||
name='ReportMsg',
|
||||
full_name='ReportMsg',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='taskmsgs', full_name='ReportMsg.taskmsgs', index=0,
|
||||
number=1, type=11, cpp_type=10, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=204,
|
||||
serialized_end=243,
|
||||
)
|
||||
|
||||
|
||||
_TASKMSG = _descriptor.Descriptor(
|
||||
name='TaskMsg',
|
||||
full_name='TaskMsg',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='taskid', full_name='TaskMsg.taskid', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='username', full_name='TaskMsg.username', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='vnodeid', full_name='TaskMsg.vnodeid', index=2,
|
||||
number=3, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='subTaskStatus', full_name='TaskMsg.subTaskStatus', index=3,
|
||||
number=4, type=14, cpp_type=8, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='token', full_name='TaskMsg.token', index=4,
|
||||
number=5, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='errmsg', full_name='TaskMsg.errmsg', index=5,
|
||||
number=6, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=245,
|
||||
serialized_end=368,
|
||||
)
|
||||
|
||||
|
||||
_TASKINFO = _descriptor.Descriptor(
|
||||
name='TaskInfo',
|
||||
full_name='TaskInfo',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='taskid', full_name='TaskInfo.taskid', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='username', full_name='TaskInfo.username', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='vnodeid', full_name='TaskInfo.vnodeid', index=2,
|
||||
number=3, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='parameters', full_name='TaskInfo.parameters', index=3,
|
||||
number=4, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='timeout', full_name='TaskInfo.timeout', index=4,
|
||||
number=5, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='token', full_name='TaskInfo.token', index=5,
|
||||
number=6, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=370,
|
||||
serialized_end=496,
|
||||
)
|
||||
|
||||
|
||||
_PARAMETERS = _descriptor.Descriptor(
|
||||
name='Parameters',
|
||||
full_name='Parameters',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='command', full_name='Parameters.command', index=0,
|
||||
number=1, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='stderrRedirectPath', full_name='Parameters.stderrRedirectPath', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='stdoutRedirectPath', full_name='Parameters.stdoutRedirectPath', index=2,
|
||||
number=3, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=498,
|
||||
serialized_end=593,
|
||||
)
|
||||
|
||||
|
||||
_COMMAND_ENVVARSENTRY = _descriptor.Descriptor(
|
||||
name='EnvVarsEntry',
|
||||
full_name='Command.EnvVarsEntry',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='key', full_name='Command.EnvVarsEntry.key', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='value', full_name='Command.EnvVarsEntry.value', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')),
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=689,
|
||||
serialized_end=735,
|
||||
)
|
||||
|
||||
_COMMAND = _descriptor.Descriptor(
|
||||
name='Command',
|
||||
full_name='Command',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='commandLine', full_name='Command.commandLine', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='packagePath', full_name='Command.packagePath', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='envVars', full_name='Command.envVars', index=2,
|
||||
number=3, type=11, cpp_type=10, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[_COMMAND_ENVVARSENTRY, ],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=596,
|
||||
serialized_end=735,
|
||||
)
|
||||
|
||||
|
||||
_VNODE = _descriptor.Descriptor(
|
||||
name='VNode',
|
||||
full_name='VNode',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='image', full_name='VNode.image', index=0,
|
||||
number=1, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='instance', full_name='VNode.instance', index=1,
|
||||
number=2, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='mount', full_name='VNode.mount', index=2,
|
||||
number=3, type=11, cpp_type=10, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='network', full_name='VNode.network', index=3,
|
||||
number=4, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='hostname', full_name='VNode.hostname', index=4,
|
||||
number=5, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=737,
|
||||
serialized_end=864,
|
||||
)
|
||||
|
||||
|
||||
_NETWORK = _descriptor.Descriptor(
|
||||
name='Network',
|
||||
full_name='Network',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='ipaddr', full_name='Network.ipaddr', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='gateway', full_name='Network.gateway', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='masterip', full_name='Network.masterip', index=2,
|
||||
number=3, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='brname', full_name='Network.brname', index=3,
|
||||
number=4, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=866,
|
||||
serialized_end=942,
|
||||
)
|
||||
|
||||
|
||||
_IMAGE = _descriptor.Descriptor(
|
||||
name='Image',
|
||||
full_name='Image',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='name', full_name='Image.name', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='type', full_name='Image.type', index=1,
|
||||
number=2, type=14, cpp_type=8, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='owner', full_name='Image.owner', index=2,
|
||||
number=3, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
_IMAGE_IMAGETYPE,
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=944,
|
||||
serialized_end=1060,
|
||||
)
|
||||
|
||||
|
||||
_MOUNT = _descriptor.Descriptor(
|
||||
name='Mount',
|
||||
full_name='Mount',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='provider', full_name='Mount.provider', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='localPath', full_name='Mount.localPath', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='remotePath', full_name='Mount.remotePath', index=2,
|
||||
number=3, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='accessKey', full_name='Mount.accessKey', index=3,
|
||||
number=4, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='secretKey', full_name='Mount.secretKey', index=4,
|
||||
number=5, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='other', full_name='Mount.other', index=5,
|
||||
number=6, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=_b("").decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1062,
|
||||
serialized_end=1179,
|
||||
)
|
||||
|
||||
|
||||
_INSTANCE = _descriptor.Descriptor(
|
||||
name='Instance',
|
||||
full_name='Instance',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='cpu', full_name='Instance.cpu', index=0,
|
||||
number=1, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='memory', full_name='Instance.memory', index=1,
|
||||
number=2, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='disk', full_name='Instance.disk', index=2,
|
||||
number=3, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='gpu', full_name='Instance.gpu', index=3,
|
||||
number=4, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
options=None, file=DESCRIPTOR),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=1181,
|
||||
serialized_end=1247,
|
||||
)
|
||||
|
||||
_VNODEINFO.fields_by_name['vnode'].message_type = _VNODE
|
||||
_REPLY.fields_by_name['status'].enum_type = _REPLY_REPLYSTATUS
|
||||
_REPLY_REPLYSTATUS.containing_type = _REPLY
|
||||
_REPORTMSG.fields_by_name['taskmsgs'].message_type = _TASKMSG
|
||||
_TASKMSG.fields_by_name['subTaskStatus'].enum_type = _STATUS
|
||||
_TASKINFO.fields_by_name['parameters'].message_type = _PARAMETERS
|
||||
_PARAMETERS.fields_by_name['command'].message_type = _COMMAND
|
||||
_COMMAND_ENVVARSENTRY.containing_type = _COMMAND
|
||||
_COMMAND.fields_by_name['envVars'].message_type = _COMMAND_ENVVARSENTRY
|
||||
_VNODE.fields_by_name['image'].message_type = _IMAGE
|
||||
_VNODE.fields_by_name['instance'].message_type = _INSTANCE
|
||||
_VNODE.fields_by_name['mount'].message_type = _MOUNT
|
||||
_VNODE.fields_by_name['network'].message_type = _NETWORK
|
||||
_IMAGE.fields_by_name['type'].enum_type = _IMAGE_IMAGETYPE
|
||||
_IMAGE_IMAGETYPE.containing_type = _IMAGE
|
||||
DESCRIPTOR.message_types_by_name['VNodeInfo'] = _VNODEINFO
|
||||
DESCRIPTOR.message_types_by_name['Reply'] = _REPLY
|
||||
DESCRIPTOR.message_types_by_name['ReportMsg'] = _REPORTMSG
|
||||
DESCRIPTOR.message_types_by_name['TaskMsg'] = _TASKMSG
|
||||
DESCRIPTOR.message_types_by_name['TaskInfo'] = _TASKINFO
|
||||
DESCRIPTOR.message_types_by_name['Parameters'] = _PARAMETERS
|
||||
DESCRIPTOR.message_types_by_name['Command'] = _COMMAND
|
||||
DESCRIPTOR.message_types_by_name['VNode'] = _VNODE
|
||||
DESCRIPTOR.message_types_by_name['Network'] = _NETWORK
|
||||
DESCRIPTOR.message_types_by_name['Image'] = _IMAGE
|
||||
DESCRIPTOR.message_types_by_name['Mount'] = _MOUNT
|
||||
DESCRIPTOR.message_types_by_name['Instance'] = _INSTANCE
|
||||
DESCRIPTOR.enum_types_by_name['Status'] = _STATUS
|
||||
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
|
||||
|
||||
VNodeInfo = _reflection.GeneratedProtocolMessageType('VNodeInfo', (_message.Message,), dict(
|
||||
DESCRIPTOR = _VNODEINFO,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:VNodeInfo)
|
||||
))
|
||||
_sym_db.RegisterMessage(VNodeInfo)
|
||||
|
||||
Reply = _reflection.GeneratedProtocolMessageType('Reply', (_message.Message,), dict(
|
||||
DESCRIPTOR = _REPLY,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Reply)
|
||||
))
|
||||
_sym_db.RegisterMessage(Reply)
|
||||
|
||||
ReportMsg = _reflection.GeneratedProtocolMessageType('ReportMsg', (_message.Message,), dict(
|
||||
DESCRIPTOR = _REPORTMSG,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:ReportMsg)
|
||||
))
|
||||
_sym_db.RegisterMessage(ReportMsg)
|
||||
|
||||
TaskMsg = _reflection.GeneratedProtocolMessageType('TaskMsg', (_message.Message,), dict(
|
||||
DESCRIPTOR = _TASKMSG,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:TaskMsg)
|
||||
))
|
||||
_sym_db.RegisterMessage(TaskMsg)
|
||||
|
||||
TaskInfo = _reflection.GeneratedProtocolMessageType('TaskInfo', (_message.Message,), dict(
|
||||
DESCRIPTOR = _TASKINFO,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:TaskInfo)
|
||||
))
|
||||
_sym_db.RegisterMessage(TaskInfo)
|
||||
|
||||
Parameters = _reflection.GeneratedProtocolMessageType('Parameters', (_message.Message,), dict(
|
||||
DESCRIPTOR = _PARAMETERS,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Parameters)
|
||||
))
|
||||
_sym_db.RegisterMessage(Parameters)
|
||||
|
||||
Command = _reflection.GeneratedProtocolMessageType('Command', (_message.Message,), dict(
|
||||
|
||||
EnvVarsEntry = _reflection.GeneratedProtocolMessageType('EnvVarsEntry', (_message.Message,), dict(
|
||||
DESCRIPTOR = _COMMAND_ENVVARSENTRY,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Command.EnvVarsEntry)
|
||||
))
|
||||
,
|
||||
DESCRIPTOR = _COMMAND,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Command)
|
||||
))
|
||||
_sym_db.RegisterMessage(Command)
|
||||
_sym_db.RegisterMessage(Command.EnvVarsEntry)
|
||||
|
||||
VNode = _reflection.GeneratedProtocolMessageType('VNode', (_message.Message,), dict(
|
||||
DESCRIPTOR = _VNODE,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:VNode)
|
||||
))
|
||||
_sym_db.RegisterMessage(VNode)
|
||||
|
||||
Network = _reflection.GeneratedProtocolMessageType('Network', (_message.Message,), dict(
|
||||
DESCRIPTOR = _NETWORK,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Network)
|
||||
))
|
||||
_sym_db.RegisterMessage(Network)
|
||||
|
||||
Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict(
|
||||
DESCRIPTOR = _IMAGE,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Image)
|
||||
))
|
||||
_sym_db.RegisterMessage(Image)
|
||||
|
||||
Mount = _reflection.GeneratedProtocolMessageType('Mount', (_message.Message,), dict(
|
||||
DESCRIPTOR = _MOUNT,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Mount)
|
||||
))
|
||||
_sym_db.RegisterMessage(Mount)
|
||||
|
||||
Instance = _reflection.GeneratedProtocolMessageType('Instance', (_message.Message,), dict(
|
||||
DESCRIPTOR = _INSTANCE,
|
||||
__module__ = 'rpc_pb2'
|
||||
# @@protoc_insertion_point(class_scope:Instance)
|
||||
))
|
||||
_sym_db.RegisterMessage(Instance)
|
||||
|
||||
|
||||
_COMMAND_ENVVARSENTRY.has_options = True
|
||||
_COMMAND_ENVVARSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001'))
|
||||
|
||||
_MASTER = _descriptor.ServiceDescriptor(
|
||||
name='Master',
|
||||
full_name='Master',
|
||||
file=DESCRIPTOR,
|
||||
index=0,
|
||||
options=None,
|
||||
serialized_start=1342,
|
||||
serialized_end=1382,
|
||||
methods=[
|
||||
_descriptor.MethodDescriptor(
|
||||
name='report',
|
||||
full_name='Master.report',
|
||||
index=0,
|
||||
containing_service=None,
|
||||
input_type=_REPORTMSG,
|
||||
output_type=_REPLY,
|
||||
options=None,
|
||||
),
|
||||
])
|
||||
_sym_db.RegisterServiceDescriptor(_MASTER)
|
||||
|
||||
DESCRIPTOR.services_by_name['Master'] = _MASTER
|
||||
|
||||
|
||||
_WORKER = _descriptor.ServiceDescriptor(
|
||||
name='Worker',
|
||||
full_name='Worker',
|
||||
file=DESCRIPTOR,
|
||||
index=1,
|
||||
options=None,
|
||||
serialized_start=1385,
|
||||
serialized_end=1535,
|
||||
methods=[
|
||||
_descriptor.MethodDescriptor(
|
||||
name='start_vnode',
|
||||
full_name='Worker.start_vnode',
|
||||
index=0,
|
||||
containing_service=None,
|
||||
input_type=_VNODEINFO,
|
||||
output_type=_REPLY,
|
||||
options=None,
|
||||
),
|
||||
_descriptor.MethodDescriptor(
|
||||
name='start_task',
|
||||
full_name='Worker.start_task',
|
||||
index=1,
|
||||
containing_service=None,
|
||||
input_type=_TASKINFO,
|
||||
output_type=_REPLY,
|
||||
options=None,
|
||||
),
|
||||
_descriptor.MethodDescriptor(
|
||||
name='stop_task',
|
||||
full_name='Worker.stop_task',
|
||||
index=2,
|
||||
containing_service=None,
|
||||
input_type=_TASKINFO,
|
||||
output_type=_REPLY,
|
||||
options=None,
|
||||
),
|
||||
_descriptor.MethodDescriptor(
|
||||
name='stop_vnode',
|
||||
full_name='Worker.stop_vnode',
|
||||
index=3,
|
||||
containing_service=None,
|
||||
input_type=_VNODEINFO,
|
||||
output_type=_REPLY,
|
||||
options=None,
|
||||
),
|
||||
])
|
||||
_sym_db.RegisterServiceDescriptor(_WORKER)
|
||||
|
||||
DESCRIPTOR.services_by_name['Worker'] = _WORKER
|
||||
|
||||
# @@protoc_insertion_point(module_scope)
|
|
@ -0,0 +1,139 @@
|
|||
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||
import grpc
|
||||
|
||||
from protos import rpc_pb2 as rpc__pb2
|
||||
|
||||
|
||||
class MasterStub(object):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
|
||||
def __init__(self, channel):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
channel: A grpc.Channel.
|
||||
"""
|
||||
self.report = channel.unary_unary(
|
||||
'/Master/report',
|
||||
request_serializer=rpc__pb2.ReportMsg.SerializeToString,
|
||||
response_deserializer=rpc__pb2.Reply.FromString,
|
||||
)
|
||||
|
||||
|
||||
class MasterServicer(object):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
|
||||
def report(self, request, context):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
|
||||
def add_MasterServicer_to_server(servicer, server):
|
||||
rpc_method_handlers = {
|
||||
'report': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.report,
|
||||
request_deserializer=rpc__pb2.ReportMsg.FromString,
|
||||
response_serializer=rpc__pb2.Reply.SerializeToString,
|
||||
),
|
||||
}
|
||||
generic_handler = grpc.method_handlers_generic_handler(
|
||||
'Master', rpc_method_handlers)
|
||||
server.add_generic_rpc_handlers((generic_handler,))
|
||||
|
||||
|
||||
class WorkerStub(object):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
|
||||
def __init__(self, channel):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
channel: A grpc.Channel.
|
||||
"""
|
||||
self.start_vnode = channel.unary_unary(
|
||||
'/Worker/start_vnode',
|
||||
request_serializer=rpc__pb2.VNodeInfo.SerializeToString,
|
||||
response_deserializer=rpc__pb2.Reply.FromString,
|
||||
)
|
||||
self.start_task = channel.unary_unary(
|
||||
'/Worker/start_task',
|
||||
request_serializer=rpc__pb2.TaskInfo.SerializeToString,
|
||||
response_deserializer=rpc__pb2.Reply.FromString,
|
||||
)
|
||||
self.stop_task = channel.unary_unary(
|
||||
'/Worker/stop_task',
|
||||
request_serializer=rpc__pb2.TaskInfo.SerializeToString,
|
||||
response_deserializer=rpc__pb2.Reply.FromString,
|
||||
)
|
||||
self.stop_vnode = channel.unary_unary(
|
||||
'/Worker/stop_vnode',
|
||||
request_serializer=rpc__pb2.VNodeInfo.SerializeToString,
|
||||
response_deserializer=rpc__pb2.Reply.FromString,
|
||||
)
|
||||
|
||||
|
||||
class WorkerServicer(object):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
|
||||
def start_vnode(self, request, context):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def start_task(self, request, context):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def stop_task(self, request, context):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def stop_vnode(self, request, context):
|
||||
# missing associated documentation comment in .proto file
|
||||
pass
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
|
||||
def add_WorkerServicer_to_server(servicer, server):
|
||||
rpc_method_handlers = {
|
||||
'start_vnode': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.start_vnode,
|
||||
request_deserializer=rpc__pb2.VNodeInfo.FromString,
|
||||
response_serializer=rpc__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'start_task': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.start_task,
|
||||
request_deserializer=rpc__pb2.TaskInfo.FromString,
|
||||
response_serializer=rpc__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'stop_task': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.stop_task,
|
||||
request_deserializer=rpc__pb2.TaskInfo.FromString,
|
||||
response_serializer=rpc__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'stop_vnode': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.stop_vnode,
|
||||
request_deserializer=rpc__pb2.VNodeInfo.FromString,
|
||||
response_serializer=rpc__pb2.Reply.SerializeToString,
|
||||
),
|
||||
}
|
||||
generic_handler = grpc.method_handlers_generic_handler(
|
||||
'Worker', rpc_method_handlers)
|
||||
server.add_generic_rpc_handlers((generic_handler,))
|
|
@ -79,5 +79,17 @@ def getenv(key):
|
|||
return os.environ.get("ALLOCATED_PORTS","10000-65535")
|
||||
elif key =="ALLOW_SCALE_OUT":
|
||||
return os.environ.get("ALLOW_SCALE_OUT", "False")
|
||||
elif key == "BATCH_ON":
|
||||
return os.environ.get("BATCH_ON","True")
|
||||
elif key == "BATCH_MASTER_PORT":
|
||||
return os.environ.get("BATCH_MASTER_PORT","50050")
|
||||
elif key == "BATCH_WORKER_PORT":
|
||||
return os.environ.get("BATCH_WORKER_PORT","50051")
|
||||
elif key == "BATCH_TASK_CIDR":
|
||||
return os.environ.get("BATCH_TASK_CIDR","4")
|
||||
elif key == "BATCH_NET":
|
||||
return os.environ.get("BATCH_NET","10.16.0.0/16")
|
||||
elif key == "BATCH_MAX_THREAD_WORKER":
|
||||
return os.environ.get("BATCH_MAX_THREAD_WORKER","5")
|
||||
else:
|
||||
return os.environ.get(key,"")
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
import lxc
|
||||
import subprocess
|
||||
import os
|
||||
import signal
|
||||
from utils.log import logger
|
||||
|
||||
|
||||
# Note: keep physical device id always the same as the virtual device id
|
||||
# device_path e.g. /dev/nvidia0
|
||||
def add_device(container_name, device_path):
    """Expose the host device node at *device_path* inside the container.

    The node keeps the same path inside the container as on the host
    (see the note above: physical id == virtual id). Returns whatever
    lxc reports (truthy on success).
    """
    container = lxc.Container(container_name)
    return container.add_device_node(device_path, device_path)
|
||||
|
||||
|
||||
def remove_device(container_name, device_path):
    """Remove the device node at *device_path* from the container.

    Returns whatever lxc reports (truthy on success).
    """
    container = lxc.Container(container_name)
    return container.remove_device_node('', device_path)
|
||||
|
||||
|
||||
# Mon May 21 10:51:45 2018
|
||||
# +-----------------------------------------------------------------------------+
|
||||
# | NVIDIA-SMI 381.22 Driver Version: 381.22 |
|
||||
# |-------------------------------+----------------------+----------------------+
|
||||
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|
||||
# |===============================+======================+======================|
|
||||
# | 0 GeForce GTX 108... Off | 0000:02:00.0 Off | N/A |
|
||||
# | 33% 53C P2 59W / 250W | 295MiB / 11172MiB | 2% Default |
|
||||
# +-------------------------------+----------------------+----------------------+
|
||||
# | 1 GeForce GTX 108... Off | 0000:84:00.0 Off | N/A |
|
||||
# | 21% 35C P8 10W / 250W | 161MiB / 11172MiB | 0% Default |
|
||||
# +-------------------------------+----------------------+----------------------+
|
||||
#
|
||||
# +-----------------------------------------------------------------------------+
|
||||
# | Processes: GPU Memory |
|
||||
# | GPU PID Type Process name Usage |
|
||||
# |=============================================================================|
|
||||
# | 0 111893 C python3 285MiB |
|
||||
# | 1 111893 C python3 151MiB |
|
||||
# +-----------------------------------------------------------------------------+
|
||||
#
|
||||
def nvidia_smi():
    """Run ``nvidia-smi`` and return its output as a list of lines.

    Returns None when the tool exits non-zero or cannot be executed at
    all (e.g. no NVIDIA driver installed). The previous version had two
    handlers (CalledProcessError, then Exception) that both returned
    None, making the first redundant; they are collapsed into one.
    """
    try:
        ret = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=False, check=True)
    except Exception:
        # covers CalledProcessError (non-zero exit) as well as
        # FileNotFoundError (nvidia-smi not on PATH)
        return None
    return ret.stdout.decode('utf-8').split('\n')
|
||||
|
||||
|
||||
def get_gpu_driver_version():
    """Return the NVIDIA driver version string, or None if unavailable.

    The version is the second-to-last token of the banner line, e.g.
    "| NVIDIA-SMI 381.22  Driver Version: 381.22 |" -> "381.22".
    """
    lines = nvidia_smi()
    if not lines:
        return None
    return lines[2].split()[-2]
|
||||
|
||||
|
||||
def get_gpu_status():
    """Parse ``nvidia-smi`` output into one status dict per GPU.

    Each dict carries 'id', 'fan', 'memory', 'memory_max' and 'util',
    all as raw strings taken from the table columns. Returns [] when
    no GPU information is available.
    """
    lines = nvidia_smi()
    if not lines:
        return []
    # the first blank line separates the GPU table from the process table
    blank_at = [i for i, line in enumerate(lines) if not line.strip()][0]
    statuses = []
    # GPU entries start at line 7 and occupy 3 lines each
    for row in range(7, blank_at, 3):
        detail = lines[row + 1].split()
        statuses.append({
            'id': lines[row].split()[1],
            'fan': detail[1],
            'memory': detail[8],
            'memory_max': detail[10],
            'util': detail[12],
        })
    return statuses
|
||||
|
||||
|
||||
def get_gpu_processes():
    """Parse ``nvidia-smi`` output into one dict per GPU-using process.

    Each dict carries 'gpu', 'pid', 'name', 'memory' (raw strings from
    the table) and 'container' (the owning lxc container, or 'host').
    Returns [] when no GPU information is available.
    """
    lines = nvidia_smi()
    if not lines:
        return []
    # the first blank line separates the GPU table from the process table
    blank_at = [i for i, line in enumerate(lines) if not line.strip()][0]
    procs = []
    # process rows begin 5 lines below the blank separator
    for row in range(blank_at + 5, len(lines)):
        fields = lines[row].split()
        if len(fields) != 7:
            # table footer reached
            break
        procs.append({
            'gpu': fields[1],
            'pid': fields[2],
            'name': fields[4],
            'memory': fields[5],
            'container': get_container_name_by_pid(fields[2]),
        })
    return procs
|
||||
|
||||
|
||||
def get_container_name_by_pid(pid):
    """Return the lxc container name owning *pid*, or 'host'.

    Inspects /proc/<pid>/cgroup: processes inside an lxc container have
    cgroup paths of the form ``.../lxc/<container-name>/...``.
    (Removed an unreachable ``return None`` that followed the exhaustive
    if/else.)
    """
    with open('/proc/%s/cgroup' % pid) as f:
        parts = f.readlines()[0].strip().split('/')
    if parts[1] != 'lxc':
        return 'host'
    return parts[2]
|
||||
|
||||
|
||||
def clean_up_processes_in_gpu(gpu_id):
    """Kill every container-owned process currently using GPU *gpu_id*.

    Host-owned processes are logged and left alone. *gpu_id* may be an
    int or a numeric string.

    Bug fixes: nvidia-smi fields are parsed as strings, so the old
    ``p['gpu'] == gpu_id`` comparison against an int never matched,
    ``os.kill`` was handed a str pid (TypeError), and ``%d`` formats
    were applied to str fields.
    """
    logger.info('[gputools] start clean up processes in gpu %s' % gpu_id)
    processes = get_gpu_processes()
    # normalize both sides to str before comparing
    for process in [p for p in processes if p['gpu'] == str(gpu_id)]:
        logger.info('[gputools] find process %s running in gpu %s' % (process['pid'], process['gpu']))
        if process['container'] == 'host':
            logger.warning('[gputools] find process of host, ignored')
        else:
            logger.warning('[gputools] find process of container [%s], killed' % process['container'])
            try:
                # pid comes from text output; convert before killing
                os.kill(int(process['pid']), signal.SIGKILL)
            except OSError:
                continue
|
|
@ -44,6 +44,7 @@ app.config['SQLALCHEMY_BINDS'] = {
|
|||
'history': 'sqlite:///'+fsdir+'/global/sys/HistoryTable.db',
|
||||
'beansapplication': 'sqlite:///'+fsdir+'/global/sys/BeansApplication.db',
|
||||
'system': 'sqlite:///'+fsdir+'/global/sys/System.db',
|
||||
'batch':'sqlite:///'+fsdir+'/global/sys/Batch.db?check_same_thread=False',
|
||||
'login': 'sqlite:///'+fsdir+'/global/sys/Login.db'
|
||||
}
|
||||
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
|
||||
|
@ -435,3 +436,105 @@ class Image(db.Model):
|
|||
|
||||
def __repr__(self):
|
||||
return "{\"id\":\"%d\",\"imagename\":\"%s\",\"hasPrivate\":\"%s\",\"hasPublic\":\"%s\",\"ownername\":\"%s\",\"updatetime\":\"%s\",\"description\":\"%s\"}" % (self.id,self.imagename,str(self.hasPrivate),str(self.hasPublic),self.create_time.strftime("%Y-%m-%d %H:%M:%S"),self.ownername,self.description)
|
||||
|
||||
class Batchjob(db.Model):
    """A user-submitted batch job; owns a set of Batchtask rows."""
    __bind_key__ = 'batch'
    # short unique job identifier
    id = db.Column(db.String(9), primary_key=True)
    username = db.Column(db.String(10))
    name = db.Column(db.String(30))
    priority = db.Column(db.Integer)
    # lifecycle state; new jobs start as "pending"
    status = db.Column(db.String(10))
    failed_reason = db.Column(db.Text)
    create_time = db.Column(db.DateTime)
    # None while the job is still running (rendered as "------")
    end_time = db.Column(db.DateTime)
    # accumulated billing units — presumably beans; verify against billing code
    billing = db.Column(db.Integer)
    tasks = db.relationship('Batchtask', backref='batchjob', lazy='dynamic')

    def __init__(self,id,username,name,priority):
        """Create a fresh pending job owned by *username*."""
        self.id = id
        self.username = username
        self.name = name
        self.priority = priority
        self.status = "pending"
        self.failed_reason = ""
        self.create_time = datetime.now()
        self.end_time = None
        self.billing = 0

    def clear(self):
        """Reset the job to a pending state (create_time is kept)."""
        self.status = "pending"
        self.failed_reason = ""
        self.end_time = None
        self.billing = 0

    def __repr__(self):
        """Serialise the job as a JSON object string."""
        info = {}
        info['job_id'] = self.id
        info['username'] = self.username
        info['job_name'] = self.name
        info['priority'] = self.priority
        info['status'] = self.status
        info['failed_reason'] = self.failed_reason
        info['create_time'] = self.create_time.strftime("%Y-%m-%d %H:%M:%S")
        if self.end_time is None:
            # placeholder for jobs that have not finished yet
            info['end_time'] = "------"
        else:
            info['end_time'] = self.end_time.strftime("%Y-%m-%d %H:%M:%S")
        info['billing'] = self.billing
        return json.dumps(info)
|
||||
|
||||
class Batchtask(db.Model):
    """One task (sub-unit) of a Batchjob."""
    __bind_key__ = 'batch'
    # task identifier; presumably "<jobid>_<idx>" — verify against creator
    id = db.Column(db.String(15), primary_key=True)
    idx = db.Column(db.String(10))
    jobid = db.Column(db.String(9), db.ForeignKey('batchjob.id'))
    status = db.Column(db.String(15))
    failed_reason = db.Column(db.Text)
    # None until the task is scheduled / finished (rendered as "------")
    start_time = db.Column(db.DateTime)
    end_time = db.Column(db.DateTime)
    running_time = db.Column(db.Integer)
    billing = db.Column(db.Integer)
    # task configuration stored as a JSON string
    config = db.Column(db.Text)
    # number of scheduling attempts made so far
    tried_times = db.Column(db.Integer)

    def __init__(self, id, idx, config):
        """Create a fresh pending task; *config* is JSON-serialised."""
        self.id = id
        self.idx = idx
        self.status = "pending"
        self.failed_reason = ""
        self.start_time = None
        self.end_time = None
        self.running_time = 0
        self.billing = 0
        self.config = json.dumps(config)
        self.tried_times = 0

    def clear(self):
        """Reset the task to a pending state (config is kept)."""
        self.status = "pending"
        self.failed_reason = ""
        self.start_time = None
        self.end_time = None
        self.running_time = 0
        self.billing = 0
        self.tried_times = 0

    def __repr__(self):
        """Serialise the task as a JSON object string (config is re-inflated)."""
        info = {}
        info['id'] = self.id
        info['idx'] = self.idx
        info['jobid'] = self.jobid
        info['status'] = self.status
        info['failed_reason'] = self.failed_reason
        if self.start_time is None:
            info['start_time'] = "------"
        else:
            info['start_time'] = self.start_time.strftime("%Y-%m-%d %H:%M:%S")
        if self.end_time is None:
            info['end_time'] = "------"
        else:
            info['end_time'] = self.end_time.strftime("%Y-%m-%d %H:%M:%S")
        info['running_time'] = self.running_time
        info['billing'] = self.billing
        info['config'] = json.loads(self.config)
        info['tried_times'] = self.tried_times
        return json.dumps(info)
|
||||
|
|
|
@ -195,7 +195,7 @@ class ovscontrol(object):
|
|||
@staticmethod
|
||||
def add_port_internal(bridge, port):
|
||||
try:
|
||||
subprocess.run(['ovs-vsctl', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=internal'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
|
||||
subprocess.run(['ovs-vsctl', '--may-exist', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=internal'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
|
||||
return [True, str(port)]
|
||||
except subprocess.CalledProcessError as suberror:
|
||||
return [False, "add port failed : %s" % suberror.stdout.decode('utf-8')]
|
||||
|
@ -211,7 +211,7 @@ class ovscontrol(object):
|
|||
@staticmethod
|
||||
def add_port_gre(bridge, port, remote):
|
||||
try:
|
||||
subprocess.run(['ovs-vsctl', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=gre', 'options:remote_ip='+str(remote)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
|
||||
subprocess.run(['ovs-vsctl', '--may-exist', 'add-port', str(bridge), str(port), '--', 'set', 'interface', str(port), 'type=gre', 'options:remote_ip='+str(remote)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, check=True)
|
||||
return [True, str(port)]
|
||||
except subprocess.CalledProcessError as suberror:
|
||||
return [False, "add port failed : %s" % suberror.stdout.decode('utf-8')]
|
||||
|
|
|
@ -19,7 +19,7 @@ Design:Monitor mainly consists of three parts: Collectors, Master_Collector and
|
|||
|
||||
import subprocess,re,os,psutil,math,sys
|
||||
import time,threading,json,traceback,platform
|
||||
from utils import env, etcdlib
|
||||
from utils import env, etcdlib, gputools
|
||||
import lxc
|
||||
import xmlrpc.client
|
||||
from datetime import datetime
|
||||
|
@ -262,6 +262,7 @@ class Container_Collector(threading.Thread):
|
|||
global pid2name
|
||||
global laststopcpuval
|
||||
global laststopruntime
|
||||
is_batch = container_name.split('-')[1] == 'batch'
|
||||
# collect basic information, such as running time,state,pid,ip,name
|
||||
container = lxc.Container(container_name)
|
||||
basic_info = {}
|
||||
|
@ -286,7 +287,8 @@ class Container_Collector(threading.Thread):
|
|||
containerpids.append(container_pid_str)
|
||||
pid2name[container_pid_str] = container_name
|
||||
running_time = self.get_proc_etime(container.init_pid)
|
||||
running_time += laststopruntime[container_name]
|
||||
if not is_batch:
|
||||
running_time += laststopruntime[container_name]
|
||||
basic_info['PID'] = container_pid_str
|
||||
basic_info['IP'] = container.get_ips()[0]
|
||||
basic_info['RunningTime'] = running_time
|
||||
|
@ -326,7 +328,8 @@ class Container_Collector(threading.Thread):
|
|||
cpu_use = {}
|
||||
lastval = 0
|
||||
try:
|
||||
lastval = laststopcpuval[container_name]
|
||||
if not is_batch:
|
||||
lastval = laststopcpuval[container_name]
|
||||
except:
|
||||
logger.warning(traceback.format_exc())
|
||||
cpu_val += lastval
|
||||
|
@ -369,7 +372,7 @@ class Container_Collector(threading.Thread):
|
|||
|
||||
# deal with network used data
|
||||
containerids = re.split("-",container_name)
|
||||
if len(containerids) >= 3:
|
||||
if not is_batch and len(containerids) >= 3:
|
||||
workercinfo[container_name]['net_stats'] = self.net_stats[containerids[1] + '-' + containerids[2]]
|
||||
#logger.info(workercinfo[container_name]['net_stats'])
|
||||
|
||||
|
@ -378,7 +381,7 @@ class Container_Collector(threading.Thread):
|
|||
lasttime = lastbillingtime[container_name]
|
||||
#logger.info(lasttime)
|
||||
# process real billing if running time reach an hour
|
||||
if not int(running_time/self.billingtime) == lasttime:
|
||||
if not is_batch and not int(running_time/self.billingtime) == lasttime:
|
||||
#logger.info("billing:"+str(float(cpu_val)))
|
||||
lastbillingtime[container_name] = int(running_time/self.billingtime)
|
||||
self.billing_increment(container_name)
|
||||
|
@ -478,6 +481,10 @@ class Collector(threading.Thread):
|
|||
info[idx][key] = val
|
||||
return [cpuset, info]
|
||||
|
||||
    # collect gpu used information
    def collect_gpuinfo(self):
        # delegates to gputools, which parses `nvidia-smi` output;
        # returns a list of per-GPU status dicts ([] when no GPU present)
        return gputools.get_gpu_status()
|
||||
|
||||
# collect disk used information
|
||||
def collect_diskinfo(self):
|
||||
global workercinfo
|
||||
|
@ -534,9 +541,10 @@ class Collector(threading.Thread):
|
|||
[cpuinfo,cpuconfig] = self.collect_cpuinfo()
|
||||
workerinfo['cpuinfo'] = cpuinfo
|
||||
workerinfo['cpuconfig'] = cpuconfig
|
||||
workerinfo['gpuinfo'] = self.collect_gpuinfo()
|
||||
workerinfo['diskinfo'] = self.collect_diskinfo()
|
||||
workerinfo['running'] = True
|
||||
#time.sleep(self.interval)
|
||||
time.sleep(self.interval)
|
||||
if self.test:
|
||||
break
|
||||
# print(self.etcdser.getkey('/meminfo/total'))
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
import abc
|
||||
import subprocess, os
|
||||
from utils.log import logger
|
||||
|
||||
class OssMounter(object):
    """Abstract base for object-storage mounters (one subclass per provider)."""
    __metaclass__ = abc.ABCMeta

    @staticmethod
    def execute_cmd(cmd):
        """Run *cmd* through the shell.

        Returns [True, ""] on success, or [False, combined-output] on a
        non-zero exit (the output is also logged).
        """
        result = subprocess.run(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT, shell=True)
        if result.returncode == 0:
            return [True, ""]
        message = result.stdout.decode(encoding="utf-8")
        logger.error(message)
        return [False, message]

    @staticmethod
    @abc.abstractmethod
    def mount_oss(datapath, mount_info):
        """Mount the remote bucket described by *mount_info* under *datapath*."""
        pass

    @staticmethod
    @abc.abstractmethod
    def umount_oss(datapath, mount_info):
        """Unmount the bucket described by *mount_info* from *datapath*."""
        pass
|
||||
|
||||
class AliyunOssMounter(OssMounter):
    """Mount/unmount Aliyun OSS buckets via the ossfs FUSE client."""

    @staticmethod
    def mount_oss(datapath, mount_info):
        """Mount the OSS bucket described by *mount_info* under
        ``<datapath>/Aliyun/<remotePath>``.

        Writes the credentials file ossfs expects, then runs ossfs.
        Returns [True, ""] on success, [False, message] on failure.

        Bug fixes: the except branch returned an unbound ``msg``
        (NameError) and called ``traceback.format_exc()`` although
        ``traceback`` was never imported; the credentials file is now
        closed even when the write fails.
        """
        try:
            # ossfs reads credentials from /etc/passwd-ossfs
            with open("/etc/passwd-ossfs", "w") as pwdfile:
                pwdfile.write(mount_info.remotePath + ":" + mount_info.accessKey + ":" + mount_info.secretKey + "\n")
        except Exception as err:
            # logger.exception records the traceback without needing the
            # (previously missing) traceback import
            logger.exception("Fail to write /etc/passwd-ossfs")
            return [False, str(err)]

        [success1, msg] = OssMounter.execute_cmd("chmod 640 /etc/passwd-ossfs")
        if not success1:
            logger.error("Aliyun OSS mount chmod err:%s" % msg)
            return [False, msg]
        mountpath = datapath + "/Aliyun/" + mount_info.remotePath
        logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath))
        if not os.path.isdir(mountpath):
            os.makedirs(mountpath)
        cmd = "ossfs %s %s -ourl=%s" % (mount_info.remotePath, mountpath, mount_info.other)
        [success, msg] = OssMounter.execute_cmd(cmd)
        if not success:
            logger.error("Aliyun OSS mount err:%s" % msg)
            return [False, msg]
        return [True, ""]

    @staticmethod
    def umount_oss(datapath, mount_info):
        """Unmount the bucket from ``<datapath>/Aliyun/<remotePath>`` and
        remove the mount directory.

        Returns [True, ""] on success, [False, message] on failure.
        """
        mountpath = datapath + "/Aliyun/" + mount_info.remotePath
        logger.info("UMount oss %s %s" % (mount_info.remotePath, mountpath))
        [success, msg] = OssMounter.execute_cmd("fusermount -u %s" % (mountpath))
        if not success:
            logger.error("Aliyun OSS umount err:%s" % msg)
            return [False, msg]
        [success, msg] = OssMounter.execute_cmd("rm -rf %s" % mountpath)
        if not success:
            logger.error("Aliyun OSS umount err:%s" % msg)
            return [False, msg]
        return [True, ""]
|
|
@ -0,0 +1,458 @@
|
|||
#!/usr/bin/python3
|
||||
import sys
|
||||
if sys.path[0].endswith("worker"):
|
||||
sys.path[0] = sys.path[0][:-6]
|
||||
from utils import env, tools
|
||||
config = env.getenv("CONFIG")
|
||||
#config = "/opt/docklet/local/docklet-running.conf"
|
||||
tools.loadenv(config)
|
||||
from utils.log import initlogging
|
||||
initlogging("docklet-taskcontroller")
|
||||
from utils.log import logger
|
||||
|
||||
from concurrent import futures
|
||||
import grpc
|
||||
#from utils.log import logger
|
||||
#from utils import env
|
||||
import json,lxc,subprocess,threading,os,time,traceback
|
||||
from utils import imagemgr,etcdlib,gputools
|
||||
from utils.lvmtool import sys_run
|
||||
from worker import ossmounter
|
||||
from protos import rpc_pb2, rpc_pb2_grpc
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
MAX_RUNNING_TIME = _ONE_DAY_IN_SECONDS
|
||||
|
||||
def ip_to_int(addr):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value."""
    value = 0
    for octet in addr.split('.'):
        value = (value << 8) + int(octet)
    return value
|
||||
|
||||
def int_to_ip(num):
    """Convert a 32-bit integer to its dotted-quad IPv4 string."""
    octets = [str((num >> shift) & 255) for shift in (24, 16, 8, 0)]
    return ".".join(octets)
|
||||
|
||||
class TaskController(rpc_pb2_grpc.WorkerServicer):
|
||||
|
||||
    def __init__(self):
        """Connect to etcd, discover the master, and initialise the
        worker-side batch-task state: image manager, ip pool carved from
        BATCH_NET, gpu reservation table, and the report thread.

        Exits the process (sys.exit(1)) when etcd is unreachable or the
        master address cannot be read.
        """
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info ("using ETCD %s" % etcdaddr )

        clustername = env.getenv("CLUSTER_NAME")
        logger.info ("using CLUSTER_NAME %s" % clustername )

        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix = clustername)
        except Exception:
            logger.error ("connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")

        # get master ip and report port
        [success,masterip] = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        else:
            self.master_ip = masterip
            logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')

        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')

        # buffered task status messages, flushed to the master periodically
        self.taskmsgs = []
        self.msgslock = threading.Lock()
        self.report_interval = 2

        self.lock = threading.Lock()        # guards free_ips
        self.mount_lock = threading.Lock()  # serialises oss mount/umount
        self.cons_gateway = env.getenv('BATCH_GATEWAY')
        self.cons_ips = env.getenv('BATCH_NET')
        logger.info("Batch gateway ip address %s" % self.cons_gateway)
        logger.info("Batch ip pools %s" % self.cons_ips)

        # carve BATCH_NET into host offsets; offsets 0 and 1 (network and
        # gateway) and the broadcast address are excluded from the pool
        self.cidr = 32 - int(self.cons_ips.split('/')[1])
        self.ipbase = ip_to_int(self.cons_ips.split('/')[0])
        self.free_ips = []
        for i in range(2, (1 << self.cidr) - 1):
            self.free_ips.append(i)
        logger.info("Free ip addresses pool %s" % str(self.free_ips))

        # gpu id -> name of the lxc container using it ("" means free)
        self.gpu_lock = threading.Lock()
        self.gpu_status = {}
        gpus = gputools.get_gpu_status()
        for gpu in gpus:
            self.gpu_status[gpu['id']] = ""

        self.start_report()
        logger.info('TaskController init success')
|
||||
|
||||
# Need Locks
|
||||
def acquire_ip(self):
|
||||
self.lock.acquire()
|
||||
if len(self.free_ips) == 0:
|
||||
return [False, "No free ips"]
|
||||
ip = int_to_ip(self.ipbase + self.free_ips[0])
|
||||
self.free_ips.remove(self.free_ips[0])
|
||||
logger.info(str(self.free_ips))
|
||||
self.lock.release()
|
||||
return [True, ip + "/" + str(32 - self.cidr)]
|
||||
|
||||
# Need Locks
|
||||
def release_ip(self,ipstr):
|
||||
self.lock.acquire()
|
||||
ipnum = ip_to_int(ipstr.split('/')[0]) - self.ipbase
|
||||
self.free_ips.append(ipnum)
|
||||
logger.info(str(self.free_ips))
|
||||
self.lock.release()
|
||||
|
||||
def add_gpu_device(self, lxcname, gpu_need):
    """Attach gpu_need currently-free GPU devices to container lxcname.

    Returns [True, ""] on success, [False, errmsg] otherwise. Ownership
    is tracked in self.gpu_status under self.gpu_lock.
    """
    if gpu_need < 1:
        return [True, ""]
    self.gpu_lock.acquire()
    free_list = [gid for gid, owner in self.gpu_status.items() if owner == ""]
    if len(free_list) < gpu_need:
        self.gpu_lock.release()
        return [False, "No free GPUs"]
    chosen = free_list[:gpu_need]
    # mark ownership before touching the devices so other threads skip them
    for gid in chosen:
        self.gpu_status[gid] = lxcname
    try:
        # control device nodes must be present before any per-GPU node
        gputools.add_device(lxcname, "/dev/nvidiactl")
        gputools.add_device(lxcname, "/dev/nvidia-uvm")
        for gid in chosen:
            gputools.add_device(lxcname, "/dev/nvidia" + str(gid))
            logger.info("Add gpu:" + str(gid) + " to lxc:" + str(lxcname))
    except Exception:
        logger.error(traceback.format_exc())
        # roll back the ownership marks on failure
        for gid in chosen:
            self.gpu_status[gid] = ""
        self.gpu_lock.release()
        return [False, "Error occurs when adding gpu device."]

    self.gpu_lock.release()
    return [True, ""]
||||
|
||||
def release_gpu_device(self, lxcname):
    """Mark every GPU currently owned by lxcname as free again."""
    with self.gpu_lock:
        for gid, owner in self.gpu_status.items():
            if owner == lxcname:
                self.gpu_status[gid] = ""
||||
|
||||
#mount_oss
|
||||
#mount_oss
def mount_oss(self, datapath, mount_info):
    """Mount every OSS bucket in mount_info under datapath.

    Each entry's provider selects a <Provider>OssMounter class from the
    ossmounter module. Returns [True, ""] or [False, errmsg].
    """
    self.mount_lock.acquire()
    try:
        for mount in mount_info:
            mounter = getattr(ossmounter, mount.provider + "OssMounter", None)
            if mounter is None:
                return [False, mount.provider + " doesn't exist!"]
            [success, msg] = mounter.mount_oss(datapath, mount)
            if not success:
                return [False, msg]
        return [True, ""]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    finally:
        # the original released the lock on every exit path by hand;
        # finally guarantees exactly the same thing
        self.mount_lock.release()
||||
|
||||
#umount oss
|
||||
#umount oss
def umount_oss(self, datapath, mount_info):
    """Unmount every OSS bucket in mount_info from datapath.

    Returns [True, ""] on success, [False, errmsg] on failure.
    """
    try:
        for mount in mount_info:
            provider = mount.provider
            mounter = getattr(ossmounter, provider + "OssMounter", None)
            if mounter is None:
                return [False, provider + " doesn't exist!"]
            [success, msg] = mounter.umount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    # Bug fix: the original fell off the end and returned None on success,
    # unlike mount_oss which returns [True, ""]; make the contract uniform.
    return [True, ""]
||||
#accquire ip and create a container
|
||||
#accquire ip and create a container
def create_container(self, instanceid, username, image, lxcname, quota):
    """Acquire an IP and build an LXC container for one batch instance.

    Prepares the image filesystem, makes sure the user's host directory
    exists, and renders container.batch.conf into
    /var/lib/lxc/<lxcname>/config. Returns [True, ip] on success or
    [False, errmsg] (any acquired IP is released on failure).
    """
    # acquire ip
    [status, ip] = self.acquire_ip()
    if not status:
        return [False, ip]

    # prepare image and filesystem
    status = self.imgmgr.prepareFS(username, image, lxcname, str(quota.disk))
    if not status:
        self.release_ip(ip)
        return [False, "Create container for batch failed when preparing filesystem"]

    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname

    if not os.path.isdir("%s/global/users/%s" % (self.fspath, username)):
        path = env.getenv('DOCKLET_LIB')
        subprocess.call([path + "/master/userinit.sh", username])
        logger.info("user %s directory not found, create it" % username)
    sys_run("mkdir -p /var/lib/lxc/%s" % lxcname)
    logger.info("generate config file for %s" % lxcname)

    def config_prepare(content):
        # substitute the template placeholders with this container's values
        content = content.replace("%ROOTFS%", rootfs)
        content = content.replace("%HOSTNAME%", "batch-%s" % str(instanceid))
        content = content.replace("%CONTAINER_MEMORY%", str(quota.memory))
        content = content.replace("%CONTAINER_CPU%", str(quota.cpu * 100000))
        content = content.replace("%FS_PREFIX%", self.fspath)
        content = content.replace("%LXCSCRIPT%", env.getenv("LXC_SCRIPT"))
        content = content.replace("%USERNAME%", username)
        content = content.replace("%LXCNAME%", lxcname)
        content = content.replace("%IP%", ip)
        content = content.replace("%GATEWAY%", self.cons_gateway)
        return content

    logger.info(self.confpath)
    # Idiom fix: context managers close the handles even if read/write
    # raises (the original left files open on error).
    with open(self.confpath + "/container.batch.conf", 'r') as conffile:
        conftext = conffile.read()

    conftext = config_prepare(conftext)

    with open("/var/lib/lxc/%s/config" % lxcname, 'w') as conffile:
        conffile.write(conftext)
    return [True, ip]
||||
|
||||
def process_task(self, request, context):
    """gRPC entry: create a container for one task instance and launch it.

    Builds the container (IP, filesystem, config), mounts OSS buckets,
    starts the LXC, attaches GPUs if requested, then spawns a daemon
    thread running execute_task. Returns an ACCEPTED/REFUSED rpc Reply.
    """
    logger.info('excute task with parameter: ' + str(request))
    taskid = request.id
    instanceid = request.instanceid

    # get config from request
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['instanceid'] = str(instanceid)
    image = {}
    image['name'] = request.cluster.image.name
    if request.cluster.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.cluster.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.cluster.image.owner
    username = request.username
    token = request.token
    lxcname = '%s-batch-%s-%s-%s' % (username, taskid, str(instanceid), token)
    instance_type = request.cluster.instance
    mount_list = request.cluster.mount
    outpath = [request.parameters.stdoutRedirectPath, request.parameters.stderrRedirectPath]
    timeout = request.timeout
    gpu_need = int(request.cluster.instance.gpu)
    # NOTE(review): 'reused' is read from the request but never used below.
    reused = request.reused

    #create container
    [success, ip] = self.create_container(instanceid, username, image, lxcname, instance_type)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=ip)

    #mount oss
    self.mount_oss("%s/global/users/%s/oss" % (self.fspath, username), mount_list)
    # Bug fix: 'rootfs' was referenced below but never defined in this
    # method (it only existed inside create_container), raising NameError
    # whenever a task had OSS mounts.
    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
    mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    with open("/var/lib/lxc/%s/config" % lxcname, 'a+') as conffile:
        for mount in mount_list:
            conffile.write("\n" + mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath))

    logger.info("Start container %s..." % lxcname)
    # Bug fix: this definition was commented out in the original, so the
    # 'container.stop()' call on the GPU-failure path crashed with
    # NameError instead of cleaning up.
    container = lxc.Container(lxcname)
    ret = subprocess.run('lxc-start -n %s' % lxcname, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        logger.error('start container %s failed' % lxcname)
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Can't start the container")

    logger.info('start container %s success' % lxcname)

    #add GPU
    [success, msg] = self.add_gpu_device(lxcname, gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        container.stop()
        self.release_ip(ip)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message="Fail to add gpu device. " + msg)

    thread = threading.Thread(target=self.execute_task,
                              args=(username, taskid, instanceid, envs, lxcname,
                                    pkgpath, command, timeout, outpath, ip, token, mount_list))
    # Thread.setDaemon() is deprecated; assign the attribute instead.
    thread.daemon = True
    thread.start()

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
||||
|
||||
def write_output(self, lxcname, tmplogpath, filepath):
    """Move the temporary nfs log tmplogpath to filepath inside lxcname.

    A blank path, the untouched default "/root/nfs/batch_{jobid}/", or a
    destination equal to the source means there is nothing to do.
    Returns [True, ""] or [False, errmsg].
    """
    nothing_to_do = (
        filepath == ""
        or filepath == "/root/nfs/batch_{jobid}/"
        or os.path.abspath("/root/nfs/" + tmplogpath) == os.path.abspath(filepath)
    )
    if nothing_to_do:
        return [True, ""]
    move_cmd = "lxc-attach -n " + lxcname + " -- mv %s %s"
    ret = subprocess.run(move_cmd % ("/root/nfs/" + tmplogpath, filepath),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        msg = ret.stdout.decode(encoding="utf-8")
        logger.error(msg)
        return [False, msg]
    logger.info("Succeed to moving nfs/%s to %s" % (tmplogpath, filepath))
    return [True, ""]
||||
|
||||
def execute_task(self, username, taskid, instanceid, envs, lxcname, pkgpath, command, timeout, outpath, ip, token, mount_info):
    """Run the task command inside the container and report the result.

    Writes a wrapper script into the container rootfs, runs it through
    lxc-attach with the task environment, enforces the timeout, moves the
    stdout/stderr logs to the requested output paths, and queues a status
    message for the master. Whatever happened, it then destroys the
    container and releases its IP, GPUs and OSS mounts.
    """
    lxcfspath = "/var/lib/lxc/" + lxcname + "/rootfs/"
    scriptname = "batch_job.sh"
    stdoutfile = None
    stderrfile = None
    try:
        # write the wrapper script the task will execute
        with open(lxcfspath + "root/" + scriptname, "w") as scriptfile:
            scriptfile.write("#!/bin/bash\n")
            scriptfile.write("cd " + str(pkgpath) + "\n")
            scriptfile.write(command)
    except Exception:
        logger.error(traceback.format_exc())
        logger.error("Fail to write script file with taskid(%s) instanceid(%s)" % (str(taskid), str(instanceid)))
    else:
        try:
            # task ids look like "<jobid>_<index>" — TODO confirm format
            job_id = taskid.split('_')[1]
        except Exception:
            logger.error(traceback.format_exc())
            job_id = "_none"
        jobdir = "batch_" + job_id
        logdir = "%s/global/users/%s/data/" % (self.fspath, username) + jobdir
        if not os.path.exists(logdir):
            logger.info("Directory:%s not exists, create it." % logdir)
            os.mkdir(logdir)
        stdoutname = str(taskid) + "_" + str(instanceid) + "_stdout.txt"
        stderrname = str(taskid) + "_" + str(instanceid) + "_stderr.txt"
        try:
            stdoutfile = open(logdir + "/" + stdoutname, "w")
            stderrfile = open(logdir + "/" + stderrname, "w")
            logger.info("Create stdout(%s) and stderr(%s) file to log" % (stdoutname, stderrname))
        except Exception:
            logger.error(traceback.format_exc())
            stdoutfile = None
            stderrfile = None

        cmd = "lxc-attach -n " + lxcname
        for envkey, envval in envs.items():
            cmd = cmd + " -v %s=%s" % (envkey, envval)
        cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\""
        logger.info('run task with command - %s' % cmd)
        p = subprocess.Popen(cmd, stdout=stdoutfile, stderr=stderrfile, shell=True)
        if timeout == 0:
            to = MAX_RUNNING_TIME
        else:
            to = timeout
        # poll every <=2s until the process exits or the budget runs out
        while p.poll() is None and to > 0:
            time.sleep(min(2, to))
            to -= 2
        if p.poll() is None:
            p.kill()
            logger.info("Running time(%d) is out. Task(%s-%s-%s) will be killed." % (timeout, str(taskid), str(instanceid), token))
            self.add_msg(taskid, username, instanceid, rpc_pb2.TIMEOUT, token, "Running time is out.")
        else:
            [success1, msg1] = self.write_output(lxcname, jobdir + "/" + stdoutname, outpath[0])
            [success2, msg2] = self.write_output(lxcname, jobdir + "/" + stderrname, outpath[1])
            if not success1 or not success2:
                if not success1:
                    msg = msg1
                else:
                    msg = msg2
                logger.info("Output error on Task(%s-%s-%s)." % (str(taskid), str(instanceid), token))
                self.add_msg(taskid, username, instanceid, rpc_pb2.OUTPUTERROR, token, msg)
            else:
                if p.poll() == 0:
                    logger.info("Task(%s-%s-%s) completed." % (str(taskid), str(instanceid), token))
                    self.add_msg(taskid, username, instanceid, rpc_pb2.COMPLETED, token, "")
                else:
                    logger.info("Task(%s-%s-%s) failed." % (str(taskid), str(instanceid), token))
                    self.add_msg(taskid, username, instanceid, rpc_pb2.FAILED, token, "")
        # Bug fix: the log file handles were never closed in the original,
        # leaking two descriptors per executed task.
        if stdoutfile is not None:
            stdoutfile.close()
        if stderrfile is not None:
            stderrfile.close()

    # cleanup runs regardless of how the task went
    container = lxc.Container(lxcname)
    if container.stop():
        logger.info("stop container %s success" % lxcname)
    else:
        logger.error("stop container %s failed" % lxcname)

    logger.info("deleting container:%s" % lxcname)
    if self.imgmgr.deleteFS(lxcname):
        logger.info("delete container %s success" % lxcname)
    else:
        logger.error("delete container %s failed" % lxcname)

    logger.info("release ip address %s" % ip)
    self.release_ip(ip)
    self.release_gpu_device(lxcname)

    #umount oss
    self.umount_oss("%s/global/users/%s/oss" % (self.fspath, username), mount_info)
||||
|
||||
def stop_tasks(self, request, context):
    """gRPC entry: kill the container of every task listed in the request."""
    for task in request.taskmsgs:
        lxcname = '%s-batch-%s-%s-%s' % (task.username, task.taskid, str(task.instanceid), task.token)
        logger.info("Stop the task with lxc:" + lxcname)
        subprocess.run("lxc-stop -k -n %s" % lxcname,
                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
||||
|
||||
def add_msg(self, taskid, username, instanceid, status, token, errmsg):
    """Queue one TaskMsg for the report thread to push to the master."""
    with self.msgslock:
        try:
            new_msg = rpc_pb2.TaskMsg(taskid=str(taskid), username=username,
                                      instanceid=int(instanceid), instanceStatus=status,
                                      token=token, errmsg=errmsg)
            self.taskmsgs.append(new_msg)
        except Exception:
            logger.error(traceback.format_exc())
||||
|
||||
def report_msg(self):
    """Daemon loop: periodically flush queued task messages to the master."""
    channel = grpc.insecure_channel(self.master_ip + ":" + self.master_port)
    stub = rpc_pb2_grpc.MasterStub(channel)
    while True:
        with self.msgslock:
            reportmsg = rpc_pb2.ReportMsg(taskmsgs=self.taskmsgs)
            try:
                response = stub.report(reportmsg)
                logger.info("Response from master by reporting: " + str(response.status) + " " + response.message)
            except Exception:
                logger.error(traceback.format_exc())
            # the queue is cleared even if the report failed, matching the
            # original behaviour (messages are not retried)
            self.taskmsgs = []
        time.sleep(self.report_interval)
||||
|
||||
def start_report(self):
    """Spawn the daemon thread that periodically reports to the master."""
    thread = threading.Thread(target=self.report_msg, args=())
    # Thread.setDaemon() is deprecated; assign the daemon attribute instead.
    thread.daemon = True
    thread.start()
    logger.info("Start to report task messages to master every %d seconds." % self.report_interval)
||||
|
||||
|
||||
def TaskControllerServe():
    """Start the gRPC WorkerServicer for this node and block forever.

    Reads the thread-pool size and listen port from the environment and
    shuts the server down cleanly on KeyboardInterrupt.
    """
    max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER'))
    worker_port = int(env.getenv('BATCH_WORKER_PORT'))
    logger.info("Max Threads on a worker is %d" % max_threads)
    pool = futures.ThreadPoolExecutor(max_workers=max_threads)
    server = grpc.server(pool)
    rpc_pb2_grpc.add_WorkerServicer_to_server(TaskController(), server)
    server.add_insecure_port('[::]:' + str(worker_port))
    server.start()
    logger.info("Start TaskController Servicer on port:%d" % worker_port)
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
||||
|
||||
# Script entry point: run the gRPC task-controller server until interrupted.
if __name__ == "__main__":
    TaskControllerServe()
|
@ -0,0 +1,500 @@
|
|||
#!/usr/bin/python3
|
||||
import sys
|
||||
if sys.path[0].endswith("worker"):
|
||||
sys.path[0] = sys.path[0][:-6]
|
||||
from utils import env, tools
|
||||
config = env.getenv("CONFIG")
|
||||
#config = "/opt/docklet/local/docklet-running.conf"
|
||||
tools.loadenv(config)
|
||||
from utils.log import initlogging
|
||||
initlogging("docklet-taskworker")
|
||||
from utils.log import logger
|
||||
|
||||
from concurrent import futures
|
||||
import grpc
|
||||
#from utils.log import logger
|
||||
#from utils import env
|
||||
import json,lxc,subprocess,threading,os,time,traceback
|
||||
from utils import imagemgr,etcdlib,gputools
|
||||
from utils.lvmtool import sys_run
|
||||
from worker import ossmounter
|
||||
from protos import rpc_pb2, rpc_pb2_grpc
|
||||
from utils.nettools import netcontrol
|
||||
from master.network import getip
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
MAX_RUNNING_TIME = _ONE_DAY_IN_SECONDS
|
||||
|
||||
class TaskWorker(rpc_pb2_grpc.WorkerServicer):
|
||||
|
||||
def __init__(self):
    """Connect to etcd, discover the master, and initialise worker state.

    Exits the process (sys.exit(1)) if etcd is unreachable or the master
    address cannot be read — this servicer cannot run without them.
    """
    rpc_pb2_grpc.WorkerServicer.__init__(self)
    etcdaddr = env.getenv("ETCD")
    logger.info ("using ETCD %s" % etcdaddr )

    clustername = env.getenv("CLUSTER_NAME")
    logger.info ("using CLUSTER_NAME %s" % clustername )

    # init etcdlib client
    try:
        self.etcdclient = etcdlib.Client(etcdaddr, prefix = clustername)
    except Exception:
        logger.error ("connect etcd failed, maybe etcd address not correct...")
        sys.exit(1)
    else:
        logger.info("etcd connected")

    # get master ip and report port
    [success,masterip] = self.etcdclient.getkey("service/master")
    if not success:
        logger.error("Fail to get master ip address.")
        sys.exit(1)
    else:
        self.master_ip = masterip
        logger.info("Get master ip address: %s" % (self.master_ip))
    self.master_port = env.getenv('BATCH_MASTER_PORT')

    # get worker ip
    self.worker_ip = getip(env.getenv('NETWORK_DEVICE'))
    logger.info("Worker ip is :%s"%self.worker_ip)

    self.imgmgr = imagemgr.ImageMgr()
    self.fspath = env.getenv('FS_PREFIX')          # host working directory prefix
    self.confpath = env.getenv('DOCKLET_CONF')     # directory holding container.batch.conf
    # clean up any batch containers left over from a previous run
    self.rm_all_batch_containers()

    self.taskmsgs = []                    # outgoing status messages for the master
    self.msgslock = threading.Lock()      # guards self.taskmsgs
    self.report_interval = 2              # seconds between reports to the master

    self.lock = threading.Lock()
    self.mount_lock = threading.Lock()    # serialises OSS mount operations

    self.gpu_lock = threading.Lock()      # guards self.gpu_status
    # gpu id -> owning lxc name ("" means free)
    self.gpu_status = {}
    gpus = gputools.get_gpu_status()
    for gpu in gpus:
        self.gpu_status[gpu['id']] = ""

    self.start_report()
    logger.info('TaskWorker init success')
||||
|
||||
def stop_and_rm_containers(self, lxcname):
    """Kill container lxcname, unmount its OSS buckets and delete its FS.

    Returns the result of imgmgr.deleteFS (truthy on success).
    """
    logger.info("Stop the container with name:" + lxcname)
    subprocess.run("lxc-stop -k -n %s" % lxcname,
                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    lxcpath = "/var/lib/lxc/%s" % lxcname
    try:
        # rebuild the mount list from the on-disk oss/<provider>/<bucket> layout
        mounts = [
            rpc_pb2.Mount(provider=provider, remotePath=bucket)
            for provider in os.listdir(lxcpath + "/oss")
            for bucket in os.listdir(lxcpath + "/oss/" + provider)
        ]
        self.umount_oss(lxcpath + "/oss", mounts)
    except Exception as err:
        # best effort: a missing oss directory just means nothing to unmount
        logger.info(err)
    return self.imgmgr.deleteFS(lxcname)
||||
|
||||
def rm_all_batch_containers(self):
    """Remove every leftover batch container on this host at startup."""
    for name in lxc.list_containers():
        parts = name.split('-')
        # batch containers are named <user>-batch-...; skip everything else
        if len(parts) < 2 or parts[1] != 'batch':
            continue
        if self.stop_and_rm_containers(name):
            logger.info("delete container %s success" % name)
        else:
            logger.error("delete container %s failed" % name)
||||
|
||||
def add_gpu_device(self, lxcname, gpu_need):
    """Attach gpu_need currently-free GPU devices to container lxcname.

    Returns [True, ""] on success, [False, errmsg] otherwise. Ownership
    is tracked in self.gpu_status under self.gpu_lock.
    """
    if gpu_need < 1:
        return [True, ""]
    self.gpu_lock.acquire()
    free_list = [gid for gid, owner in self.gpu_status.items() if owner == ""]
    if len(free_list) < gpu_need:
        self.gpu_lock.release()
        return [False, "No free GPUs"]
    chosen = free_list[:gpu_need]
    # mark ownership before touching the devices so other threads skip them
    for gid in chosen:
        self.gpu_status[gid] = lxcname
    try:
        # control device nodes must be present before any per-GPU node
        gputools.add_device(lxcname, "/dev/nvidiactl")
        gputools.add_device(lxcname, "/dev/nvidia-uvm")
        for gid in chosen:
            gputools.add_device(lxcname, "/dev/nvidia" + str(gid))
            logger.info("Add gpu:" + str(gid) + " to lxc:" + str(lxcname))
    except Exception:
        logger.error(traceback.format_exc())
        # roll back the ownership marks on failure
        for gid in chosen:
            self.gpu_status[gid] = ""
        self.gpu_lock.release()
        return [False, "Error occurs when adding gpu device."]

    self.gpu_lock.release()
    return [True, ""]
||||
|
||||
def release_gpu_device(self, lxcname):
    """Mark every GPU currently owned by lxcname as free again."""
    with self.gpu_lock:
        for gid, owner in self.gpu_status.items():
            if owner == lxcname:
                self.gpu_status[gid] = ""
||||
|
||||
#mount_oss
|
||||
#mount_oss
def mount_oss(self, datapath, mount_info):
    """Mount every OSS bucket in mount_info under datapath.

    Each entry's provider selects a <Provider>OssMounter class from the
    ossmounter module. Returns [True, ""] or [False, errmsg].
    """
    self.mount_lock.acquire()
    try:
        for mount in mount_info:
            mounter = getattr(ossmounter, mount.provider + "OssMounter", None)
            if mounter is None:
                return [False, mount.provider + " doesn't exist!"]
            [success, msg] = mounter.mount_oss(datapath, mount)
            if not success:
                return [False, msg]
        return [True, ""]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    finally:
        # the original released the lock on every exit path by hand;
        # finally guarantees exactly the same thing
        self.mount_lock.release()
||||
|
||||
#umount oss
|
||||
#umount oss
def umount_oss(self, datapath, mount_info):
    """Unmount every OSS bucket in mount_info from datapath.

    Returns [True, ""] on success, [False, errmsg] on failure.
    """
    try:
        for mount in mount_info:
            provider = mount.provider
            mounter = getattr(ossmounter, provider + "OssMounter", None)
            if mounter is None:
                return [False, provider + " doesn't exist!"]
            [success, msg] = mounter.umount_oss(datapath, mount)
            if not success:
                return [False, msg]
    except Exception:
        logger.error(traceback.format_exc())
        return [False, ""]
    # Bug fix: the original fell off the end and returned None on success,
    # unlike mount_oss which returns [True, ""]; make the contract uniform.
    return [True, ""]
||||
|
||||
def start_vnode(self, request, context):
    """gRPC entry: create and start one vnode container for a batch task.

    Builds the container filesystem and config, mounts OSS buckets,
    starts the LXC, wires the GRE tunnel when the master is remote,
    attaches GPUs and starts sshd inside the container. Each failure
    path tears down what was built and replies REFUSED.
    """
    logger.info('start vnode with config: ' + str(request))
    taskid = request.taskid
    vnodeid = request.vnodeid

    # environment exported into the container
    envs = {}
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    # translate the protobuf image enum into the dict imagemgr expects
    image = {}
    image['name'] = request.vnode.image.name
    if request.vnode.image.type == rpc_pb2.Image.PRIVATE:
        image['type'] = 'private'
    elif request.vnode.image.type == rpc_pb2.Image.PUBLIC:
        image['type'] = 'public'
    else:
        image['type'] = 'base'
    image['owner'] = request.vnode.image.owner
    username = request.username
    lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))
    instance_type = request.vnode.instance
    mount_list = request.vnode.mount
    gpu_need = int(request.vnode.instance.gpu)
    ipaddr = request.vnode.network.ipaddr
    gateway = request.vnode.network.gateway
    brname = request.vnode.network.brname
    masterip = request.vnode.network.masterip
    hostname = request.vnode.hostname

    #create container
    [success, msg] = self.create_container(taskid, vnodeid, username, image, lxcname, instance_type, ipaddr, gateway, brname, hostname)
    if not success:
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)

    #mount oss
    lxcpath = "/var/lib/lxc/%s" % lxcname
    rootfs = lxcpath + "/rootfs"
    self.mount_oss(lxcpath + "/oss", mount_list)
    # append one bind-mount entry per bucket to the LXC config
    conffile = open(lxcpath + "/config", 'a+')
    mount_str = "lxc.mount.entry = "+ lxcpath +"/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
    for mount in mount_list:
        conffile.write("\n"+ mount_str % (mount.provider, mount.remotePath, rootfs, mount.remotePath))
    conffile.close()

    logger.info("Start container %s..." % lxcname)
    container = lxc.Container(lxcname)
    ret = subprocess.run('lxc-start -n %s'%lxcname,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        logger.error('start container %s failed' % lxcname)
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container(%s)"%lxcname)

    logger.info('start container %s success' % lxcname)

    # cross-host networking: tunnel this bridge to the master node
    if masterip != self.worker_ip:
        netcontrol.setup_gre(brname, masterip)

    #add GPU
    [success, msg] = self.add_gpu_device(lxcname,gpu_need)
    if not success:
        logger.error("Fail to add gpu device. " + msg)
        container.stop()
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to add gpu device. " + msg)

    #start ssh service
    # NOTE(review): the GPU-failure path above rolls back, but this path
    # does not release GPUs already attached — confirm whether
    # release_gpu_device should be called here too.
    cmd = "lxc-attach -n %s -- service ssh start" % lxcname
    ret = subprocess.run(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        logger.error('Fail to start ssh service of container %s' % lxcname)
        container.stop()
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)
        self.imgmgr.deleteFS(lxcname)
        return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to start ssh service. lxc(%s)"%lxcname)

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
||||
|
||||
def start_task(self, request, context):
    """gRPC entry: launch a task command asynchronously in its vnode."""
    logger.info('start task with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid
    # get config from request
    command = request.parameters.command.commandLine
    pkgpath = request.parameters.command.packagePath
    envs = request.parameters.command.envVars
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    timeout = request.timeout
    token = request.token
    outpath = [request.parameters.stdoutRedirectPath, request.parameters.stderrRedirectPath]
    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))

    # run the command in the background so the rpc can return immediately
    worker = threading.Thread(target=self.execute_task,
                              args=(username, taskid, vnodeid, envs, lxcname,
                                    pkgpath, command, timeout, outpath, token))
    worker.daemon = True
    worker.start()

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
||||
|
||||
def stop_task(self, request, context):
    """gRPC entry: force-kill the container running the given sub-task."""
    logger.info('stop task with config: ' + str(request))
    lxcname = '%s-batch-%s-%s' % (request.username, request.taskid, str(request.vnodeid))
    logger.info("Stop the task with lxc:" + lxcname)
    subprocess.run("lxc-stop -k -n %s" % lxcname,
                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
||||
|
||||
# stop and remove container
|
||||
def stop_vnode(self, request, context):
    """gRPC entry: stop a vnode container and free all its resources.

    Stops the LXC, unmounts its OSS buckets, deletes its filesystem,
    removes the OVS bridge and releases any GPUs it held. Always replies
    ACCEPTED; individual cleanup failures are only logged.
    """
    logger.info('stop vnode with config: ' + str(request))
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid
    brname = request.vnode.network.brname
    mount_list = request.vnode.mount
    lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))

    logger.info("Stop the task with lxc:"+lxcname)
    container = lxc.Container(lxcname)
    if container.stop():
        logger.info("stop container %s success" % lxcname)
    else:
        logger.error("stop container %s failed" % lxcname)

    #umount oss
    self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mount_list)

    logger.info("deleting container:%s" % lxcname)
    if self.imgmgr.deleteFS(lxcname):
        logger.info("delete container %s success" % lxcname)
    else:
        logger.error("delete container %s failed" % lxcname)

    #del ovs bridge
    # NOTE(review): brname comes from a protobuf string field, which is ""
    # (not None) when unset — confirm this guard can ever skip the delete.
    if brname is not None:
        netcontrol.del_bridge(brname)

    #release gpu
    self.release_gpu_device(lxcname)

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
||||
|
||||
|
||||
#accquire ip and create a container
|
||||
#accquire ip and create a container
def create_container(self, taskid, vnodeid, username, image, lxcname, quota, ipaddr, gateway, brname, hostname):
    """Build the LXC filesystem and config for one batch vnode.

    Prepares the image filesystem, makes sure the user's host directory
    exists, and renders container.batch.conf with the vnode's network
    and quota values. Returns [True, ""] on success or [False, errmsg].
    """
    # prepare image and filesystem
    status = self.imgmgr.prepareFS(username, image, lxcname, str(quota.disk))
    if not status:
        return [False, "Create container for batch failed when preparing filesystem"]

    rootfs = "/var/lib/lxc/%s/rootfs" % lxcname

    if not os.path.isdir("%s/global/users/%s" % (self.fspath, username)):
        path = env.getenv('DOCKLET_LIB')
        subprocess.call([path + "/master/userinit.sh", username])
        logger.info("user %s directory not found, create it" % username)
    sys_run("mkdir -p /var/lib/lxc/%s" % lxcname)
    logger.info("generate config file for %s" % lxcname)

    def config_prepare(content):
        # substitute the template placeholders with this vnode's values
        content = content.replace("%ROOTFS%", rootfs)
        content = content.replace("%HOSTNAME%", hostname)
        content = content.replace("%TASKID%", taskid)
        content = content.replace("%CONTAINER_MEMORY%", str(quota.memory))
        content = content.replace("%CONTAINER_CPU%", str(quota.cpu * 100000))
        content = content.replace("%FS_PREFIX%", self.fspath)
        content = content.replace("%LXCSCRIPT%", env.getenv("LXC_SCRIPT"))
        content = content.replace("%USERNAME%", username)
        content = content.replace("%LXCNAME%", lxcname)
        content = content.replace("%VETHPAIR%", str(taskid) + "-" + str(vnodeid))
        content = content.replace("%IP%", ipaddr)
        content = content.replace("%BRNAME%", brname)
        content = content.replace("%GATEWAY%", gateway)
        return content

    logger.info(self.confpath)
    # Idiom fix: context managers close the handles even if read/write
    # raises (the original left files open on error).
    with open(self.confpath + "/container.batch.conf", 'r') as conffile:
        conftext = conffile.read()

    conftext = config_prepare(conftext)

    with open("/var/lib/lxc/%s/config" % lxcname, 'w') as conffile:
        conffile.write(conftext)
    return [True, ""]
||||
|
||||
def write_output(self, lxcname, tmplogpath, filepath):
    """Move the temporary nfs log tmplogpath to filepath inside lxcname.

    A blank path, the untouched default "/root/nfs/batch_{jobid}/", or a
    destination equal to the source means there is nothing to do.
    Returns [True, ""] or [False, errmsg].
    """
    nothing_to_do = (
        filepath == ""
        or filepath == "/root/nfs/batch_{jobid}/"
        or os.path.abspath("/root/nfs/" + tmplogpath) == os.path.abspath(filepath)
    )
    if nothing_to_do:
        return [True, ""]
    move_cmd = "lxc-attach -n " + lxcname + " -- mv %s %s"
    ret = subprocess.run(move_cmd % ("/root/nfs/" + tmplogpath, filepath),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    if ret.returncode != 0:
        msg = ret.stdout.decode(encoding="utf-8")
        logger.error(msg)
        return [False, msg]
    logger.info("Succeed to moving nfs/%s to %s" % (tmplogpath, filepath))
    return [True, ""]
||||
|
||||
def execute_task(self, username, taskid, vnodeid, envs, lxcname, pkgpath, command, timeout, outpath, token):
    """Run the task command inside the vnode container and report status.

    Writes a wrapper script into the container rootfs, runs it through
    lxc-attach with the task environment, enforces the timeout, moves the
    stdout/stderr logs to the requested output paths and queues a status
    message for the master. Unlike the controller variant, the container
    itself is torn down later by stop_vnode, not here.
    """
    lxcfspath = "/var/lib/lxc/" + lxcname + "/rootfs/"
    scriptname = "batch_job.sh"
    stdoutfile = None
    stderrfile = None
    try:
        # write the wrapper script the task will execute
        with open(lxcfspath + "root/" + scriptname, "w") as scriptfile:
            scriptfile.write("#!/bin/bash\n")
            scriptfile.write("cd " + str(pkgpath) + "\n")
            scriptfile.write(command)
    except Exception:
        logger.error(traceback.format_exc())
        logger.error("Fail to write script file with taskid(%s) vnodeid(%s)" % (str(taskid), str(vnodeid)))
    else:
        try:
            # task ids look like "<jobid>_..." — TODO confirm format
            job_id = taskid.split('_')[0]
        except Exception:
            logger.error(traceback.format_exc())
            job_id = "_none"
        jobdir = "batch_" + job_id
        logdir = "%s/global/users/%s/data/" % (self.fspath, username) + jobdir
        try:
            os.mkdir(logdir)
        except Exception as e:
            # directory usually exists already; log and carry on
            logger.info("Error when creating logdir :%s " + str(e))
        stdoutname = str(taskid) + "_" + str(vnodeid) + "_stdout.txt"
        stderrname = str(taskid) + "_" + str(vnodeid) + "_stderr.txt"
        try:
            stdoutfile = open(logdir + "/" + stdoutname, "w")
            stderrfile = open(logdir + "/" + stderrname, "w")
            logger.info("Create stdout(%s) and stderr(%s) file to log" % (stdoutname, stderrname))
        except Exception:
            logger.error(traceback.format_exc())
            stdoutfile = None
            stderrfile = None

        cmd = "lxc-attach -n " + lxcname
        for envkey, envval in envs.items():
            cmd = cmd + " -v %s=%s" % (envkey, envval)
        cmd = cmd + " -- /bin/bash \"" + "/root/" + scriptname + "\""
        logger.info('run task with command - %s' % cmd)
        p = subprocess.Popen(cmd, stdout=stdoutfile, stderr=stderrfile, shell=True)
        if timeout == 0:
            to = MAX_RUNNING_TIME
        else:
            to = timeout
        # poll every <=2s until the process exits or the budget runs out
        while p.poll() is None and to > 0:
            time.sleep(min(2, to))
            to -= 2
        if p.poll() is None:
            p.kill()
            logger.info("Running time(%d) is out. Task(%s-%s-%s) will be killed." % (timeout, str(taskid), str(vnodeid), token))
            self.add_msg(taskid, username, vnodeid, rpc_pb2.TIMEOUT, token, "Running time is out.")
        else:
            [success1, msg1] = self.write_output(lxcname, jobdir + "/" + stdoutname, outpath[0])
            [success2, msg2] = self.write_output(lxcname, jobdir + "/" + stderrname, outpath[1])
            if not success1 or not success2:
                if not success1:
                    msg = msg1
                else:
                    msg = msg2
                logger.info("Output error on Task(%s-%s-%s)." % (str(taskid), str(vnodeid), token))
                self.add_msg(taskid, username, vnodeid, rpc_pb2.OUTPUTERROR, token, msg)
            else:
                if p.poll() == 0:
                    logger.info("Task(%s-%s-%s) completed." % (str(taskid), str(vnodeid), token))
                    self.add_msg(taskid, username, vnodeid, rpc_pb2.COMPLETED, token, "")
                else:
                    logger.info("Task(%s-%s-%s) failed." % (str(taskid), str(vnodeid), token))
                    self.add_msg(taskid, username, vnodeid, rpc_pb2.FAILED, token, "Runtime Error. More information in stderr log.")
        # Bug fix: the log file handles were never closed in the original,
        # leaking two descriptors per executed task.
        if stdoutfile is not None:
            stdoutfile.close()
        if stderrfile is not None:
            stderrfile.close()
||||
|
||||
def add_msg(self,taskid,username,vnodeid,status,token,errmsg):
|
||||
self.msgslock.acquire()
|
||||
try:
|
||||
self.taskmsgs.append(rpc_pb2.TaskMsg(taskid=str(taskid),username=username,vnodeid=int(vnodeid),subTaskStatus=status,token=token,errmsg=errmsg))
|
||||
except Exception as err:
|
||||
logger.error(traceback.format_exc())
|
||||
self.msgslock.release()
|
||||
|
||||
def report_msg(self):
|
||||
channel = grpc.insecure_channel(self.master_ip+":"+self.master_port)
|
||||
stub = rpc_pb2_grpc.MasterStub(channel)
|
||||
while True:
|
||||
self.msgslock.acquire()
|
||||
reportmsg = rpc_pb2.ReportMsg(taskmsgs = self.taskmsgs)
|
||||
try:
|
||||
response = stub.report(reportmsg)
|
||||
logger.info("Response from master by reporting: "+str(response.status)+" "+response.message)
|
||||
except Exception as err:
|
||||
logger.error(traceback.format_exc())
|
||||
self.taskmsgs = []
|
||||
self.msgslock.release()
|
||||
time.sleep(self.report_interval)
|
||||
|
||||
def start_report(self):
|
||||
thread = threading.Thread(target = self.report_msg, args=())
|
||||
thread.setDaemon(True)
|
||||
thread.start()
|
||||
logger.info("Start to report task messages to master every %d seconds." % self.report_interval)
|
||||
|
||||
def TaskWorkerServe():
|
||||
max_threads = int(env.getenv('BATCH_MAX_THREAD_WORKER'))
|
||||
worker_port = int(env.getenv('BATCH_WORKER_PORT'))
|
||||
logger.info("Max Threads on a worker is %d" % max_threads)
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_threads))
|
||||
rpc_pb2_grpc.add_WorkerServicer_to_server(TaskWorker(), server)
|
||||
server.add_insecure_port('[::]:'+str(worker_port))
|
||||
server.start()
|
||||
logger.info("Start TaskWorker Servicer on port:%d" % worker_port)
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
TaskWorkerServe()
|
|
@ -57,17 +57,23 @@ class Worker(object):
|
|||
|
||||
self.etcd = etcdclient
|
||||
self.master = self.etcd.getkey("service/master")[1]
|
||||
self.mode=None
|
||||
self.mode = None
|
||||
self.workertype = "normal"
|
||||
self.key=""
|
||||
|
||||
# waiting state is preserved for compatible.
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "waiting")
|
||||
# get this node's key to judge how to init.
|
||||
[status, key] = self.etcd.getkey("machines/runnodes/"+self.addr)
|
||||
if status:
|
||||
self.key = generatekey("machines/allnodes/"+self.addr)
|
||||
else:
|
||||
logger.error("get key failed. %s" % 'machines/runnodes/'+self.addr)
|
||||
sys.exit(1)
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "batch-worker":
|
||||
self.workertype = "batch"
|
||||
|
||||
if self.workertype == "normal":
|
||||
# waiting state is preserved for compatible.
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "waiting")
|
||||
# get this node's key to judge how to init.
|
||||
[status, key] = self.etcd.getkey("machines/runnodes/"+self.addr)
|
||||
if status:
|
||||
self.key = generatekey("machines/allnodes/"+self.addr)
|
||||
else:
|
||||
logger.error("get key failed. %s" % 'machines/runnodes/'+self.addr)
|
||||
sys.exit(1)
|
||||
|
||||
# check token to check global directory
|
||||
[status, token_1] = self.etcd.getkey("token")
|
||||
|
@ -87,7 +93,8 @@ class Worker(object):
|
|||
if node['key'] == self.key:
|
||||
value = 'init-recovery'
|
||||
break
|
||||
logger.info("worker start in "+value+" mode")
|
||||
|
||||
logger.info("worker start in "+value+" mode, worker type is"+self.workertype)
|
||||
|
||||
Containers = container.Container(self.addr, etcdclient)
|
||||
if value == 'init-new':
|
||||
|
@ -193,7 +200,8 @@ class Worker(object):
|
|||
self.hosts_collector.start()
|
||||
logger.info("Monitor Collector has been started.")
|
||||
# worker change it state itself. Independedntly from master.
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "work")
|
||||
if self.workertype == "normal":
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "work")
|
||||
publicIP = env.getenv("PUBLIC_IP")
|
||||
self.etcd.setkey("machines/publicIP/"+self.addr,publicIP)
|
||||
self.thread_sendheartbeat = threading.Thread(target=self.sendheartbeat)
|
||||
|
@ -204,17 +212,22 @@ class Worker(object):
|
|||
|
||||
# send heardbeat package to keep alive in etcd, ttl=2s
|
||||
def sendheartbeat(self):
|
||||
while(True):
|
||||
# check send heartbeat package every 1s
|
||||
time.sleep(2)
|
||||
[status, value] = self.etcd.getkey("machines/runnodes/"+self.addr)
|
||||
if status:
|
||||
# master has know the worker so we start send heartbeat package
|
||||
if value=='ok':
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 3)
|
||||
else:
|
||||
logger.error("get key %s failed, master may be crashed" % self.addr)
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
|
||||
if self.workertype == "normal":
|
||||
while(True):
|
||||
# check send heartbeat package every 1s
|
||||
time.sleep(2)
|
||||
[status, value] = self.etcd.getkey("machines/runnodes/"+self.addr)
|
||||
if status:
|
||||
# master has know the worker so we start send heartbeat package
|
||||
if value=='ok':
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
|
||||
else:
|
||||
logger.error("get key %s failed, master may be crashed" % self.addr)
|
||||
self.etcd.setkey("machines/runnodes/"+self.addr, "ok", ttl = 60)
|
||||
elif self.workertype == "batch":
|
||||
while(True):
|
||||
time.sleep(2)
|
||||
self.etcd.setkey("machines/batchnodes/"+self.addr, "ok", ttl = 60)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -240,21 +240,38 @@ function processInfo()
|
|||
$("#con_disk").html(usedp+"%<br/>"+detail);
|
||||
|
||||
//processNetStats
|
||||
var net_stats = data.monitor.net_stats;
|
||||
var in_rate = parseInt(net_stats.bytes_recv_per_sec);
|
||||
var out_rate = parseInt(net_stats.bytes_sent_per_sec);
|
||||
ingress_rate = in_rate;
|
||||
egress_rate = out_rate;
|
||||
$("#net_in_rate").html(num2human(in_rate)+"Bps");
|
||||
$("#net_out_rate").html(num2human(out_rate)+"Bps");
|
||||
$("#net_in_bytes").html(num2human(net_stats.bytes_recv)+"B");
|
||||
$("#net_out_bytes").html(num2human(net_stats.bytes_sent)+"B");
|
||||
$("#net_in_packs").html(net_stats.packets_recv);
|
||||
$("#net_out_packs").html(net_stats.packets_sent);
|
||||
$("#net_in_err").html(net_stats.errout);
|
||||
$("#net_out_err").html(net_stats.errin);
|
||||
$("#net_in_drop").html(net_stats.dropout);
|
||||
$("#net_out_drop").html(net_stats.dropin);
|
||||
var net_stats = data.monitor.net_stats;
|
||||
if(!$.isEmptyObject(net_stats))
|
||||
{
|
||||
var in_rate = parseInt(net_stats.bytes_recv_per_sec);
|
||||
var out_rate = parseInt(net_stats.bytes_sent_per_sec);
|
||||
ingress_rate = in_rate;
|
||||
egress_rate = out_rate;
|
||||
$("#net_in_rate").html(num2human(in_rate)+"Bps");
|
||||
$("#net_out_rate").html(num2human(out_rate)+"Bps");
|
||||
$("#net_in_bytes").html(num2human(net_stats.bytes_recv)+"B");
|
||||
$("#net_out_bytes").html(num2human(net_stats.bytes_sent)+"B");
|
||||
$("#net_in_packs").html(net_stats.packets_recv);
|
||||
$("#net_out_packs").html(net_stats.packets_sent);
|
||||
$("#net_in_err").html(net_stats.errout);
|
||||
$("#net_out_err").html(net_stats.errin);
|
||||
$("#net_in_drop").html(net_stats.dropout);
|
||||
$("#net_out_drop").html(net_stats.dropin);
|
||||
}
|
||||
else {
|
||||
ingress_rate = 0;
|
||||
egress_rate = 0;
|
||||
$("#net_in_rate").html("--");
|
||||
$("#net_out_rate").html("--");
|
||||
$("#net_in_bytes").html("--");
|
||||
$("#net_out_bytes").html("--");
|
||||
$("#net_in_packs").html("--");
|
||||
$("#net_out_packs").html("--");
|
||||
$("#net_in_err").html("--");
|
||||
$("#net_out_err").html("--");
|
||||
$("#net_in_drop").html("--");
|
||||
$("#net_out_drop").html("--");
|
||||
}
|
||||
},"json");
|
||||
}
|
||||
|
||||
|
|
|
@ -174,6 +174,9 @@
|
|||
<li id="nav_History">
|
||||
<a href='/history/'><i class="fa fa-history"></i> <span class="nav-label">History</span></a>
|
||||
</li>
|
||||
<li id="nav_Batch">
|
||||
<a href='/batch_jobs/'><i class="fa fa-tasks"></i> <span class="nav-label">Batch</span></a>
|
||||
</li>
|
||||
|
||||
|
||||
{% if mysession['usergroup'] == 'root' or mysession['usergroup'] == 'admin'%}
|
||||
|
@ -241,7 +244,7 @@
|
|||
<i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i>
|
||||
</div>
|
||||
<!-- Default to the left -->
|
||||
<strong>Copyright</strong>© 2017 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
|
||||
<strong>Copyright</strong>© 2019 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
|
||||
|
||||
</footer>
|
||||
|
||||
|
|
|
@ -0,0 +1,359 @@
|
|||
{% extends 'base_AdminLTE.html' %}
|
||||
|
||||
{% block title %}Docklet | Create Batch Job{% endblock %}
|
||||
|
||||
{% block css_src %}
|
||||
<!--<style>
|
||||
.divcontent { overflow-y:scroll; height:200px;}
|
||||
</style>-->
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
|
||||
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
|
||||
|
||||
{% endblock %}
|
||||
|
||||
{% block panel_title %}Batch Job Info{% endblock %}
|
||||
|
||||
{% block panel_list %}
|
||||
<ol class="breadcrumb">
|
||||
<li>
|
||||
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
|
||||
</li>
|
||||
</ol>
|
||||
{% endblock %}
|
||||
|
||||
<div>
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col-lg-12">
|
||||
<div class="box box-info">
|
||||
<div class="box-header with-border">
|
||||
<h3 class="box-title">Batch Job Create</h3>
|
||||
|
||||
<div class="box-tools pull-right">
|
||||
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
|
||||
</button>
|
||||
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="box-body">
|
||||
<form id="form" class="form-horizontal" action="/batch_job/{{masterips[0].split("@")[0]}}/add/" method="POST">
|
||||
|
||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
||||
<div class="form-group"><label class="col-sm-2 control-label">Job Name</label>
|
||||
<div class="col-sm-10"><input type="text" class="form-control" name="jobName" id="job_name" required></div>
|
||||
</div>
|
||||
<br/>
|
||||
<div class="form-group"><label class="col-sm-2 control-label">Location</label>
|
||||
<div class="col-sm-10"><select id="masterselector" class="form-control">
|
||||
{% for master in masterips %}
|
||||
<option value="{{master.split("@")[0]}}">{{master.split("@")[1]}}</option>
|
||||
{% endfor %}
|
||||
</select></div>
|
||||
</div>
|
||||
<div class="hr-line-dashed"></div>
|
||||
<br/>
|
||||
<div class="form-group"><label class="col-sm-2 control-label">Priority</label>
|
||||
<div class="col-sm-10"><select id="priority_selector" class="form-control" name="jobPriority">
|
||||
{% for priority in range(10) %}
|
||||
<option value="{{priority}}">{{priority}}</option>
|
||||
{% endfor %}
|
||||
</select></div>
|
||||
</div>
|
||||
<br/>
|
||||
|
||||
<div class="hr-line-dashed"></div>
|
||||
<div class="panel-group" id="accordion">
|
||||
<!-- Tasks -->
|
||||
</div>
|
||||
<br/>
|
||||
<div class="hr-line-dashed"></div>
|
||||
<div class="row">
|
||||
<div class="form-group">
|
||||
<div class="col-sm-4 col-sm-offset-2">
|
||||
<button class="btn btn-primary" type="button" id="add_task" class="btn btn-box-tool" title="add a task">Add Task <i class="fa fa-plus"></i></button>
|
||||
<button class="btn btn-primary" type="submit">Create Job</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block script_src %}
|
||||
|
||||
<script src="//cdn.bootcss.com/pace/1.0.2/pace.min.js"></script>
|
||||
|
||||
<!-- Steps -->
|
||||
<script src="//cdn.bootcss.com/jquery-steps/1.1.0/jquery.steps.min.js"></script>
|
||||
|
||||
<!-- Jquery Validate -->
|
||||
<script src="//cdn.bootcss.com/jquery-validate/1.15.0/jquery.validate.min.js"></script>
|
||||
|
||||
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/datatables-tabletools/2.1.5/js/TableTools.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/jquery-validate/1.17.0/jquery.validate.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
var task_number = 0;
|
||||
var mapping_number = 0;
|
||||
var images_text = "{{ images }}";
|
||||
images_text = images_text.replace(/'/g,"\"");
|
||||
console.log(images_text);
|
||||
var images_info = JSON.parse(images_text);
|
||||
console.log(images_info);
|
||||
$().ready(function() {
|
||||
$("#form").validate();
|
||||
});
|
||||
|
||||
function removeTask(obj) {
|
||||
$("#task_pannel_" + obj.id).remove();
|
||||
}
|
||||
|
||||
function unfoldTask(obj){
|
||||
$("#collapse" + obj.id).collapse('hide');
|
||||
}
|
||||
|
||||
function chmountPath(obj,task_num,mapping_num) {
|
||||
cellid = 'mapping_mountpath_' + task_num + '_' + mapping_num;
|
||||
$('#'+cellid).val("/root/oss/"+obj.value);
|
||||
}
|
||||
|
||||
function removeMapping(obj) {
|
||||
$("#mapping_" + obj.id).remove();
|
||||
}
|
||||
|
||||
function addMapping(obj,task_num) {
|
||||
mapping_number += 1;
|
||||
var table = $("#storage_mapping_" + obj.id)[0];
|
||||
var new_mapping = table.insertRow();
|
||||
new_mapping.id = "mapping_" + task_num + "_" + mapping_number;
|
||||
var provider = new_mapping.insertCell();
|
||||
var bucket_name = new_mapping.insertCell();
|
||||
var accessKey = new_mapping.insertCell();
|
||||
var secretKey = new_mapping.insertCell();
|
||||
var endpoint = new_mapping.insertCell();
|
||||
var mountpath = new_mapping.insertCell();
|
||||
var remove = new_mapping.insertCell();
|
||||
bucket_name.innerHTML = '<input type="text" class="form-control" name="mappingBucketName_' + task_num + '_' + mapping_number + '" id="mapping_bucketname_'
|
||||
+ task_num + '_' + mapping_number + '" onKeyUp="chmountPath(this,'+task_num+','+mapping_number+');" required/>';
|
||||
accessKey.innerHTML = '<input type="text" class="form-control" name="mappingAccessKey_' + task_num + '_' + mapping_number + '" id="mapping_accessKey_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
secretKey.innerHTML = '<input type="text" class="form-control" name="mappingSecretKey_' + task_num + '_' + mapping_number + '" id="mapping_secretKey_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
endpoint.innerHTML = 'http://<input type="text" class="form-control" name="mappingEndpoint_' + task_num + '_' + mapping_number + '" id="mapping_endpoint_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
mountpath.innerHTML = '<input type="text" class="form-control" name="mappingMountpath_' + task_num + '_' + mapping_number + '" id="mapping_mountpath_'
|
||||
+ task_num + '_' + mapping_number + '" readonly="true" required/>';
|
||||
provider.innerHTML = '<select class="form-control" name="mappingProvider_' + task_num + '_' + mapping_number + '" id="mapping_provider_'
|
||||
+ task_num + '_' + mapping_number + '">'
|
||||
+'<option>Aliyun</option></select>';
|
||||
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_num + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
|
||||
+'Remove</button></div>';
|
||||
}
|
||||
|
||||
$("select#masterselector").change(function() {
|
||||
var masterip=$(this).children('option:selected').val();
|
||||
$("#form").attr("action","/batch_job/"+ masterip +"/add/");
|
||||
var mastername=$(this).children('option:selected').html();
|
||||
console.log(masterip);
|
||||
var host = window.location.host;
|
||||
var images = images_info;
|
||||
for(var tnum = 1; tnum<=task_number; ++tnum)
|
||||
{
|
||||
var imagehtml =
|
||||
"<thead>"
|
||||
+"<tr>"
|
||||
+"<th>ImageName</th>"
|
||||
+"<th>Type</th>"
|
||||
+"<th>Owner</th>"
|
||||
+"<th>Size</th>"
|
||||
+"<th>Description</th>"
|
||||
+"<th>Choose</th>"
|
||||
+"</tr>"
|
||||
+"</thead>"
|
||||
+"<tbody>"
|
||||
+"<tr>"
|
||||
+"<td>base</td>"
|
||||
+"<td>public</td>"
|
||||
+"<td>docklet</td>"
|
||||
+"<td>--</td>"
|
||||
+"<td>A base image for you</td>"
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="base_base_base" checked="checked"></label></div></td>'
|
||||
+"</tr>";
|
||||
for(var index in images[masterip].private) {
|
||||
var image = images[masterip].private[index];
|
||||
imagehtml +=
|
||||
"<tr>"
|
||||
+"<td>"+image.name+"</td>"
|
||||
+"<td>private</td>"
|
||||
+"<td>{{user}}</td>"
|
||||
+"<td>"+image.size_format+"</td>"
|
||||
+'<td><a href="/image/' + masterip + '/description/' + image.name + '_' + '{{user}}' + '_private/" target="_blank">' + image.description + '</a></td>'
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="'+image.name+'_{{user}}_private"><label></div></td>'
|
||||
+"</tr>";
|
||||
}
|
||||
for(var p_user in images[masterip].public) {
|
||||
for(var index in images[masterip].public[p_user]) {
|
||||
image=images[masterip].public[p_user][index];
|
||||
imagehtml +=
|
||||
"<tr>"
|
||||
+"<td>"+image.name+"</td>"
|
||||
+"<td>public</td>"
|
||||
+"<td>" + p_user + "</td>"
|
||||
+"<td>"+image.size_format+"</td>"
|
||||
+'<td><a href="/image/' + masterip + '/description/' + image.name + "_" + p_user + '_public/" target="_blank">' + image.description + '</a></td>'
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + tnum + '" value="'+image.name+'_{{p_user}}_public"><label></div></td>'
|
||||
+"</tr>";
|
||||
}
|
||||
}
|
||||
imagehtml += "</tbody>";
|
||||
$("#imagetable"+tnum).html(imagehtml);
|
||||
}
|
||||
});
|
||||
|
||||
function addTask() {
|
||||
task_number += 1;
|
||||
var masterip=$("select#masterselector").children('option:selected').val();
|
||||
//mapping_number = 0;
|
||||
var task_html = '';
|
||||
task_html +=
|
||||
'<div class="panel panel-default" id="task_pannel_' + task_number + '">'
|
||||
+'<div class="panel-heading">'
|
||||
+'<h4 class="panel-title">'
|
||||
+'<a data-toggle="collapse" data-panel="#accordion" href="#collapse' + task_number + '">'
|
||||
+'Task #' + task_number
|
||||
+'</a><div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="removeTask(this)" class="btn btn-box-tool"><i class="fa fa-times"></i></button></div>'
|
||||
+'</h4></div>'
|
||||
+'<div id="collapse' + task_number + '" class="panel-collapse collapse in">'
|
||||
+'<div class="panel-body">'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">CPU</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="cpuSetting_' + task_number + '" id="cpuSetting_' + task_number + '" value = 1 min="1" max="8" required/>'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Memory</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="memorySetting_' + task_number + '" id="memorySetting_' + task_number + '" value = 1024 min="100" max="8196" required/>'
|
||||
+'</div>MB</div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">GPU</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="gpuSetting_' + task_number + '" id="gpuSetting_' + task_number + '" value= 0 min="0" max="2" required/>'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Disk</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="diskSetting_' + task_number + '" id="diskSetting_' + task_number + '" value= 1024 min="128" max="10000" required/>'
|
||||
+'</div>MB</div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">VNode Number</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="vnodeCount_' + task_number + '" id="vnodeCount_' + task_number + '" value= 1 min="1" max="14" required/>'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Max Retry Times</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="retryCount_' + task_number + '" id="retryCount_' + task_number + '" value= 1 min="0" max="5" required/>'
|
||||
+'</div></div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">Running Path</label>'
|
||||
+'<div class="col-sm-3"><input type="text" class="form-control" name="srcAddr_' + task_number + '" id="srcAddr_' + task_number + '" value="/root" required/>'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Expire Time</label>'
|
||||
+'<div class="col-sm-3"><input type="number" class="form-control" name="expTime_' + task_number + '" id="expTime_' + task_number + '" value= 60 min="10" max="86400" required/>'
|
||||
+'</div>Seconds</div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">Stderr Redirect Path</label>'
|
||||
+'<div class="col-sm-3"><input type="text" class="form-control" placeholder="/path/to/file or /path/" name="stdErrRedPth_' + task_number + '" id="stdErrRedPth_' + task_number + '" value="/root/nfs/batch_{jobid}/" required/>'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Stdout Redirect Path</label>'
|
||||
+'<div class="col-sm-3"><input type="text" class="form-control" placeholder="/path/to/file or /path/" name="stdOutRedPth_' + task_number + '" id="stdOutRedPth_' + task_number + '" value="/root/nfs/batch_{jobid}/" required/>'
|
||||
+'</div></div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">Dependency <i class="fa fa-question-circle" title="The tasks ids that this task depends on, seperate them with commas, eg: 1, 2"></i></label>'
|
||||
+'<div class="col-sm-3"><input type="text" class="form-control" name="dependency_' + task_number + '" id="dependency_' + task_number + '" />'
|
||||
+'</div>'
|
||||
+'<label class="col-sm-2 control-label">Command</label>'
|
||||
+'<div class="col-sm-3"><input type="text" class="form-control" name="command_' + task_number + '" id="command_' + task_number + '" required/>'
|
||||
+'</div></div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">Run on: </label>'
|
||||
+'<div class="col-sm-3"><input type="radio" name="runon_' + task_number + '" value="all" checked="checked"/>All vnodes  '
|
||||
+' <input type="radio" name="runon_' + task_number + '" value="master" />One vnode(master)</div>'
|
||||
+'<label class="col-sm-2 control-label">Start at the Same Time</label>'
|
||||
+'<div class="col-sm-3"><input type="checkbox" name="atSameTime_' + task_number + '" checked="checked"/>'
|
||||
+'</div></div>'
|
||||
var images = images_info
|
||||
task_html +=
|
||||
'<div class="form-group"><label class="col-sm-2 control-label">Image Choose</label>'
|
||||
+'<div class="col-sm-10">'
|
||||
+'<table id="imagetable' + task_number +'" class="table table-striped table-bordered table-hover table-image" >'
|
||||
+"<thead>"
|
||||
+"<tr>"
|
||||
+"<th>ImageName</th>"
|
||||
+"<th>Type</th>"
|
||||
+"<th>Owner</th>"
|
||||
+"<th>Size</th>"
|
||||
+"<th>Description</th>"
|
||||
+"<th>Choose</th>"
|
||||
+"</tr>"
|
||||
+"</thead>"
|
||||
+"<tbody>"
|
||||
+"<tr>"
|
||||
+"<td>base</td>"
|
||||
+"<td>public</td>"
|
||||
+"<td>docklet</td>"
|
||||
+"<td>--</td>"
|
||||
+"<td>A base image for you</td>"
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="base_base_base" checked="checked"></label></div></td>'
|
||||
+"</tr>";
|
||||
for(var index in images[masterip].private) {
|
||||
var image = images[masterip].private[index];
|
||||
task_html +=
|
||||
"<tr>"
|
||||
+"<td>"+image.name+"</td>"
|
||||
+"<td>private</td>"
|
||||
+"<td>{{user}}</td>"
|
||||
+"<td>"+image.size_format+"</td>"
|
||||
+'<td><a href="/image/' + masterip + '/description/' + image.name + '_' + '{{user}}' + '_private/" target="_blank">' + image.description + '</a></td>'
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="'+image.name+'_{{user}}_private"><label></div></td>'
|
||||
+"</tr>";
|
||||
}
|
||||
for(var p_user in images[masterip].public) {
|
||||
for(var index in images[masterip].public[p_user]) {
|
||||
image=images[masterip].public[p_user][index];
|
||||
task_html +=
|
||||
"<tr>"
|
||||
+"<td>"+image.name+"</td>"
|
||||
+"<td>public</td>"
|
||||
+"<td>" + p_user + "</td>"
|
||||
+"<td>"+image.size_format+"</td>"
|
||||
+'<td><a href="/image/' + masterip + '/description/' + image.name + "_" + p_user + '_public/" target="_blank">' + image.description + '</a></td>'
|
||||
+'<td><div class="i-checks"><label><input type="radio" name="image_' + task_number + '" value="'+image.name+'_{{p_user}}_public"><label></div></td>'
|
||||
+"</tr>";
|
||||
}
|
||||
}
|
||||
task_html +=
|
||||
'</tbody></table>'
|
||||
+'</div>'
|
||||
+'</div>'
|
||||
+'<div class="form-group">'
|
||||
+'<label class="col-sm-2 control-label">Object Storage Mapping<br/>'
|
||||
+'<button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this,'+task_number+')">'
|
||||
+'Add<i class="fa fa-plus"></i></button></label>'
|
||||
+'<div class="col-sm-10"><table class="table table-bordered" id="storage_mapping_' + task_number + '">'
|
||||
+'<thead>'
|
||||
+'<tr><th>Provider</th><th>Bucket Name</th><th>AccessKey ID</th><th>AccessKey Secret</th><th>Endpoint</th><th>Mount Path</th><th>Remove</th></tr>'
|
||||
+'</thead>'
|
||||
+'<tbody>'
|
||||
+'</tbody>'
|
||||
+'</table></div>'
|
||||
+'</div>'
|
||||
+'<div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
|
||||
+'</div></div></div>'
|
||||
$(task_html).appendTo("#accordion");
|
||||
}
|
||||
addTask();
|
||||
$("#add_task").click(addTask);
|
||||
</script>
|
||||
{% endblock %}
|
|
@ -0,0 +1,264 @@
|
|||
{% extends 'base_AdminLTE.html' %}
|
||||
|
||||
{% block title %}Docklet | Batch Job Info{% endblock %}
|
||||
|
||||
{% block panel_title %}Info for {{ jobinfo['job_id'] }}{% endblock %}
|
||||
|
||||
{% block css_src %}
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
|
||||
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
|
||||
|
||||
{% endblock %}
|
||||
|
||||
{% block panel_list %}
|
||||
<ol class="breadcrumb">
|
||||
<li>
|
||||
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href='/batch_jobs/'>Batch Job</a>
|
||||
</li>
|
||||
<li class='active'>
|
||||
<strong>Info</strong>
|
||||
</li>
|
||||
</ol>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col-md-12">
|
||||
<div class="box box-info">
|
||||
<div class="box-header with-border">
|
||||
<h3 class="box-title">Overview</h3>
|
||||
|
||||
<div class="box-tools pull-right">
|
||||
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
|
||||
</button>
|
||||
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="box-body table-responsive">
|
||||
<table class="table table-bordered">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Job ID</th>
|
||||
<th>Name</th>
|
||||
<th>Priority</th>
|
||||
<th>Status</th>
|
||||
<th>Create Time</th>
|
||||
<th>End Time</th>
|
||||
<th>Billing</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>{{ jobinfo['job_id'] }}</td>
|
||||
<td>{{ jobinfo['job_name'] }}</td>
|
||||
<td>{{ jobinfo['priority'] }}</td>
|
||||
<td>{{ jobinfo['status'] }}</td>
|
||||
<td>{{ jobinfo['create_time'] }}</td>
|
||||
<td>{{ jobinfo['end_time'] }}</td>
|
||||
<td>{{ jobinfo['billing'] }} <img src='/static/img/bean.png' /></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-12">
|
||||
<div class="box box-info">
|
||||
<div class="box-header with-border">
|
||||
<h3 class="box-title">Tasks Overview</h3>
|
||||
|
||||
<div class="box-tools pull-right">
|
||||
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
|
||||
</button>
|
||||
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="box-body table-responsive">
|
||||
<table width="100%" cellspacing="0" style="margin: 0 auto;" id="table-tasks" class="table table-striped table-bordered table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Task Index</th>
|
||||
<th>Status</th>
|
||||
<th>Failed Reason(if fails)</th>
|
||||
<th>Tried Times</th>
|
||||
<th>Start Time</th>
|
||||
<th>End Time</th>
|
||||
<th>Total Running Time</th>
|
||||
<th>Billing</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for task in jobinfo['tasks'] %}
|
||||
<tr>
|
||||
<td>{{ task['idx'] }}</td>
|
||||
<td>{{ task['status'] }}</td>
|
||||
<td>{{ task['failed_reason'] }}</td>
|
||||
<td>{{ task['tried_times'] }}</td>
|
||||
<td>{{ task['start_time'] }}</td>
|
||||
<td>{{ task['end_time'] }}</td>
|
||||
<td>{{ task['running_time'] }} s</td>
|
||||
<td>{{ task['billing'] }} <img src='/static/img/bean.png' /></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-12">
|
||||
<div class="box box-info">
|
||||
<div class="box-header with-border">
|
||||
<h3 class="box-title">Tasks Configs</h3>
|
||||
|
||||
<div class="box-tools pull-right">
|
||||
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
|
||||
</button>
|
||||
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="box-body">
|
||||
{% for task in jobinfo['tasks'] %}
|
||||
<div class="panel panel-default" id="task_pannel_{{ task['idx'] }}">
|
||||
<div class="panel-heading">
|
||||
<h4 class="panel-title">
|
||||
<a data-toggle="collapse" data-panel="#accordion" href="#collapse{{ task['idx'] }}">
|
||||
Task #{{ task['idx'] }}
|
||||
</a>
|
||||
</h4>
|
||||
</div>
|
||||
<div id="collapse{{ task['idx'] }}" class="panel-collapse collapse in">
|
||||
<div class="panel-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-bordered table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>CPU Cores</th>
|
||||
<th>Memory</th>
|
||||
<th>GPU</th>
|
||||
<th>Disk</th>
|
||||
<th>VNode Number</th>
|
||||
<th>Max Retry Times</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>{{ task['config']['cpuSetting'] }}</td>
|
||||
<td>{{ task['config']['memorySetting'] }} MB</td>
|
||||
<td>{{ task['config']['gpuSetting'] }}</td>
|
||||
<td>{{ task['config']['diskSetting'] }} MB</td>
|
||||
<td>{{ task['config']['vnodeCount'] }}</td>
|
||||
<td>{{ task['config']['retryCount'] }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Running Path</th>
|
||||
<th>Expire Time</th>
|
||||
<th>Stdout Redirect Path</th>
|
||||
<th>Stderr Redirect Path</th>
|
||||
<th>Dependency</th>
|
||||
<th>Command</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>{{ task['config']['srcAddr'] }}</td>
|
||||
<td>{{ task['config']['expTime'] }} seconds</td>
|
||||
<td>{{ task['config']['stdOutRedPth'] }}</td>
|
||||
<td>{{ task['config']['stdErrRedPth'] }}</td>
|
||||
<td>{{ task['config']['dependency'] }}</td>
|
||||
<td>{{ task['config']['command'] }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Run on</th>
|
||||
<th>Start at the Same Time</th>
|
||||
<th>Image Name</th>
|
||||
<th>Image Owner</th>
|
||||
<th>Image Type</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
{% if task['config']['runon'] == 'all' %}
|
||||
<td>all vnodes</td>
|
||||
{% else %}
|
||||
<td>master vnode</td>
|
||||
{% endif %}
|
||||
{% if 'atSameTime' in task['config'].keys() %}
|
||||
<td>True</td>
|
||||
{% else %}
|
||||
<td>False</td>
|
||||
{% endif %}
|
||||
{% if task['config']['image'] == 'base_base_base' %}
|
||||
<td>base</td>
|
||||
<td>docklet</td>
|
||||
<td>public</td>
|
||||
{% else %}
|
||||
<td>{{ task['config']['image'].split('_')[0] }}</td>
|
||||
<td>{{ task['config']['image'].split('_')[1] }}</td>
|
||||
<td>{{ task['config']['image'].split('_')[2] }}</td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% if 'mapping' in task['config'].keys() %}
|
||||
<div class="table-responsive">
|
||||
<table class="table table-bordered table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Provider</th>
|
||||
<th>Bucket Name</th>
|
||||
<th>AccessKey ID</th>
|
||||
<th>Endpoint</th>
|
||||
<th>Mount Path</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for key in task['config']['mapping'].keys() %}
|
||||
<tr>
|
||||
<td>{{ task['config']['mapping'][key]['mappingProvider'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingBucketName'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingAccessKey'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingEndpoint'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingMountpath'] }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block script_src %}
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
$("#table-tasks").DataTable({"scrollX":true,"order":[[ 0, "asc" ]]});
|
||||
});
|
||||
</script>
|
||||
{% endblock %}
|
|
@ -0,0 +1,147 @@
|
|||
{% extends "base_AdminLTE.html"%}
|
||||
{% block title %}Docklet | Batch Job{% endblock %}
|
||||
|
||||
{% block panel_title %}Batch Job{% endblock %}
|
||||
|
||||
{% block css_src %}
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/dataTables.bootstrap.min.css" rel="stylesheet">
|
||||
<link href="//cdn.bootcss.com/datatables/1.10.11/css/jquery.dataTables_themeroller.css" rel="stylesheet">
|
||||
<link href="/static/dist/css/modalconfig.css" rel="stylesheet">
|
||||
|
||||
{% endblock %}
|
||||
|
||||
{% block panel_list %}
|
||||
<ol class="breadcrumb">
|
||||
<li>
|
||||
<a href="/dashboard/"><i class="fa fa-dashboard"></i>Home</a>
|
||||
</li>
|
||||
<li class="active">
|
||||
<strong>Batch Job</strong>
|
||||
</li>
|
||||
</ol>
|
||||
{% endblock %}
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col-lg-12">
|
||||
<div class="box box-info">
|
||||
<div class="box-header with-border">
|
||||
<h3 class="box-title">Batch Job List</h3>
|
||||
|
||||
<div class="box-tools pull-right">
|
||||
<button type="button" class="btn btn-box-tool" data-widget="collapse"><i class="fa fa-minus"></i>
|
||||
</button>
|
||||
<button type="button" class="btn btn-box-tool" data-widget="remove"><i class="fa fa-times"></i></button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="box-body">
|
||||
|
||||
<p>
|
||||
<a href="/batch_job/create/"><button type="button" class="btn btn-primary btn-sm"><i class="fa fa-plus"></i> Create Batch Job</button></a>
|
||||
</p>
|
||||
{% for master in masterips %}
|
||||
{% for job_info in job_list[master.split('@')[0]] %}
|
||||
<div class="modal inmodal" id='OutputModal_{{ master.split('@')[1] }}_{{ job_info['job_id'] }}' tabindex="-1" role="dialog" aria-hidden="true">
|
||||
<div class="modal-dialog">
|
||||
<div class="modal-content animated fadeIn">
|
||||
<div class="modal-header">
|
||||
<button type="button" class="close" data-dismiss="modal"><span aria-hidden="true">×</span><span class="sr-only">Close</span></button>
|
||||
<h4 class="modal-title">Job:{{ job_info['job_name'] }}({{ job_info['job_id'] }}) Stdout and Stderr of tasks</h4>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<table width="100%" cellspacing="0" class="table table-bordered table-striped table-hover table-output">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Task ID</th>
|
||||
<th>Vnode ID</th>
|
||||
<th>Stdout</th>
|
||||
<th>Stderr</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for taskid in job_info['tasks'] %}
|
||||
{% for vnodeid in range(job_info['tasks_vnodeCount'][taskid]) %}
|
||||
<tr>
|
||||
<td>{{ taskid }}</td>
|
||||
<td>{{ vnodeid }}</td>
|
||||
<td><a class="btn btn-info btn-xs" href='/batch_job/output/{{ master.split('@')[0] }}/{{ job_info["job_id"] }}/{{ taskid }}/{{ vnodeid }}/stdout/' target="_blank">Stdout</a></td>
|
||||
<td><a class="btn btn-info btn-xs" href='/batch_job/output/{{ master.split('@')[0] }}/{{ job_info["job_id"] }}/{{ taskid }}/{{ vnodeid }}/stderr/' target="_blank">Stderr</a></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
<div class="modal-footer">
|
||||
<button type="button" class="btn btn-white" data-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
<div class="table">
|
||||
<table width="100%" cellspacing="0" style="margin: 0 auto;" class="table table-striped table-bordered table-hover table-batch">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Location</th>
|
||||
<th>ID</th>
|
||||
<th>Name</th>
|
||||
<th>Status</th>
|
||||
<th>Operations</th>
|
||||
<th>Create Time</th>
|
||||
<th>End Time</th>
|
||||
<th>Billing</th>
|
||||
<th>Stdout and Stderr</th>
|
||||
<th>Detailed Info</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for master in masterips %}
|
||||
{% for job_info in job_list[master.split('@')[0]] %}
|
||||
<tr>
|
||||
<td>{{ master.split('@')[1] }}</td>
|
||||
<td>{{ job_info['job_id'] }}</td>
|
||||
<td>{{ job_info['job_name'] }}</td>
|
||||
<td>
|
||||
{{ job_info['status'] }}
|
||||
</td>
|
||||
{% if job_info['status'] == 'done' or job_info['status'] == 'failed' or job_info['status'] == 'stopping' or job_info['status'] == 'stopped'%}
|
||||
<td><button type="button" class="btn btn-xs btn-default"> Stop </button></td>
|
||||
{% else %}
|
||||
<td><a href="/batch_job/{{master.split("@")[0]}}/stop/{{ job_info['job_id'] }}/"><button type="button" class="btn btn-xs btn-danger"> Stop </button></a></td>
|
||||
{% endif %}
|
||||
<td>{{ job_info['create_time'] }}</td>
|
||||
<td>{{ job_info['end_time'] }}</td>
|
||||
<td>{{ job_info['billing'] }} <img src='/static/img/bean.png' /></td>
|
||||
<td><a role="button" class="btn btn-info btn-xs" id='{{ master }}_{{ job_info['job_id'] }}_output' data-toggle="modal" data-target='#OutputModal_{{ master.split('@')[1] }}_{{ job_info['job_id'] }}'>Get Output</a></td>
|
||||
<td><a href="/batch_job/{{master.split("@")[0]}}/info/{{ job_info['job_id'] }}/"><button type="button" class="btn btn-xs btn-info"> Info </button></a></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
{% block script_src %}
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/jquery.dataTables.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/datatables/1.10.11/js/dataTables.bootstrap.min.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
$(".table-batch").DataTable({"scrollX":true,"order":[[ 5, "desc" ]]});
|
||||
$(".table-output").DataTable({
|
||||
"lengthChange":false});
|
||||
});
|
||||
function sendAdd(){
|
||||
document.getElementById("addForm").submit();
|
||||
}
|
||||
function sendDel(){
|
||||
document.getElementById("delForm").submit();
|
||||
}
|
||||
</script>
|
||||
{% endblock %}
|
|
@ -0,0 +1,62 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<title>Docklet | Batch {{ issue }}: {{ jobid }}/{{ taskid }}/{{ vnodeid }}</title>
|
||||
<!-- Tell the browser to be responsive to screen width -->
|
||||
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport">
|
||||
<link rel="shortcut icon" href="/static/img/favicon.ico">
|
||||
|
||||
<link href="//cdn.bootcss.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet">
|
||||
|
||||
<!-- Font Awesome -->
|
||||
<link href="//cdn.bootcss.com/font-awesome/4.3.0/css/font-awesome.min.css" rel="stylesheet">
|
||||
|
||||
<!-- Ionicons -->
|
||||
<link href="//cdn.bootcss.com/ionicons/2.0.1/css/ionicons.min.css" rel="stylesheet">
|
||||
|
||||
<link href="//cdn.bootcss.com/animate.css/3.5.1/animate.min.css" rel="stylesheet">
|
||||
<link href="//cdn.bootcss.com/toastr.js/latest/css/toastr.min.css" rel="stylesheet">
|
||||
|
||||
<!-- Theme style -->
|
||||
|
||||
<link rel="stylesheet" href="/static/dist/css/AdminLTE.min.css">
|
||||
|
||||
<link rel="stylesheet" href="/static/dist/css/skins/skin-blue.min.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h3>Jobid: {{ jobid }}</h3>
|
||||
<h3>Taskid: {{ taskid }}</h3>
|
||||
<h3>VNodeid: {{ vnodeid }}</h3>
|
||||
<h4><small>The output of {{ issue }} will be updated in every 2 seconds.</small></h4>
|
||||
<hr>
|
||||
<pre id="output">{{ output }}</pre>
|
||||
<!-- jQuery 2.2.1 -->
|
||||
<script src="//cdn.bootcss.com/jquery/2.2.1/jquery.min.js"></script>
|
||||
<!-- Bootstrap 3.3.5 -->
|
||||
<script src="//cdn.bootcss.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
|
||||
<!-- AdminLTE App -->
|
||||
<script src="/static/dist/js/app.min.js"></script>
|
||||
|
||||
<script src="//cdn.bootcss.com/fastclick/1.0.6/fastclick.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/jQuery-slimScroll/1.3.7/jquery.slimscroll.min.js"></script>
|
||||
<script src="//cdn.bootcss.com/toastr.js/latest/js/toastr.min.js"></script>
|
||||
|
||||
<script type="text/javascript">
|
||||
$.ajaxSetup({
|
||||
headers: {'X-CSRFToken':'{{ csrf_token() }}'},
|
||||
});
|
||||
function updateOutput()
|
||||
{
|
||||
var host = window.location.host;
|
||||
url = "//" + host + "/batch/job/output/" + "{{ masterip }}" + "/" + "{{ jobid }}" + "/" + "{{ taskid }}" + "/" + "{{ vnodeid }}" + "/" + "{{ issue }}" + "/";
|
||||
$.post(url,{},function(data){
|
||||
$("#output").text(String(data.data));
|
||||
},"json");
|
||||
}
|
||||
setInterval(updateOutput,2000);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
54
web/web.py
54
web/web.py
|
@ -41,6 +41,7 @@ from webViews.reportbug import *
|
|||
from webViews.authenticate.auth import login_required, administration_required,activated_required
|
||||
from webViews.authenticate.register import registerView
|
||||
from webViews.authenticate.login import loginView, logoutView
|
||||
from webViews.batch import *
|
||||
import webViews.dockletrequest
|
||||
from webViews import cookie_tool
|
||||
import traceback
|
||||
|
@ -127,6 +128,59 @@ def reportBug():
|
|||
reportBugView.bugmessage = request.form['bugmessage']
|
||||
return reportBugView.as_view()
|
||||
|
||||
@app.route("/batch_jobs/", methods=['GET'])
@login_required
def batch_job():
    """Render the batch-job list page for the logged-in user."""
    return batchJobListView().as_view()
|
||||
|
||||
@app.route("/batch_job/create/", methods=['GET'])
@login_required
def create_batch_job():
    """Render the batch-job creation form."""
    return createBatchJobView().as_view()
|
||||
|
||||
@app.route("/batch_job/<masterip>/add/", methods=['POST'])
@login_required
def add_batch_job(masterip):
    """Submit a new batch job (form POST) to the given master."""
    view = addBatchJobView
    view.masterip = masterip
    view.job_data = request.form
    return view().as_view()
|
||||
|
||||
@app.route("/batch_job/<masterip>/stop/<jobid>/", methods=['GET'])
@login_required
def stop_batch_job(masterip, jobid):
    """Ask the given master to stop job `jobid`."""
    view = stopBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    return view().as_view()
|
||||
|
||||
@app.route("/batch_job/<masterip>/info/<jobid>/", methods=['GET'])
@login_required
def info_batch_job(masterip, jobid):
    """Render the detail page of job `jobid` on the given master."""
    view = infoBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    return view().as_view()
|
||||
|
||||
@app.route("/batch_job/output/<masterip>/<jobid>/<taskid>/<vnodeid>/<issue>/", methods=['GET'])
@login_required
def output_batch_job(masterip, jobid, taskid, vnodeid, issue):
    """Render the stdout/stderr page of one vnode of one task of a job.

    `issue` selects which stream to show ("stdout" or "stderr").
    """
    view = outputBatchJobView
    view.masterip = masterip
    view.jobid = jobid
    view.taskid = taskid
    view.vnodeid = vnodeid
    view.issue = issue
    return view().as_view()
|
||||
|
||||
@app.route("/batch/job/output/<masterip>/<jobid>/<taskid>/<vnodeid>/<issue>/", methods=['POST'])
@login_required
def output_batch_job_request(masterip, jobid, taskid, vnodeid, issue):
    """AJAX endpoint polled by the output page; proxies the request
    to the master and returns its response as JSON."""
    payload = {'jobid': jobid, 'taskid': taskid, 'vnodeid': vnodeid, 'issue': issue}
    result = dockletRequest.post("/batch/job/output/", payload, masterip)
    return json.dumps(result)
|
||||
|
||||
@app.route("/workspace/create/", methods=['GET'])
|
||||
#@activated_required
|
||||
def addCluster():
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
from flask import session, redirect, request
|
||||
from webViews.view import normalView
|
||||
from webViews.log import logger
|
||||
from webViews.checkname import checkname
|
||||
from webViews.dockletrequest import dockletRequest
|
||||
import json
|
||||
|
||||
class batchJobListView(normalView):
    """List every batch job of the current user, grouped by master."""
    template_path = "batch/batch_list.html"

    @classmethod
    def get(self):
        """Collect the job list from each docklet master and render it.

        dockletRequest.post_to_all() returns entries of the form
        "<ip>@<name>"; the job list of each master is fetched with a
        POST to /batch/job/list/ and keyed by the bare ip.
        """
        masterips = dockletRequest.post_to_all()
        job_list = {}
        for ipname in masterips:
            ip = ipname.split("@")[0]
            result = dockletRequest.post("/batch/job/list/", {}, ip)
            job_list[ip] = result.get("data")
            logger.debug("job_list[%s]: %s" % (ip, job_list[ip]))
        # The original wrapped this render in a constant `if True:` with
        # an unreachable `self.error()` branch; render unconditionally.
        return self.render(self.template_path, masterips=masterips, job_list=job_list)
|
||||
|
||||
class createBatchJobView(normalView):
    """Render the batch-job creation form."""
    template_path = "batch/batch_create.html"

    @classmethod
    def get(self):
        """Fetch the image list of every master and render the form."""
        masterips = dockletRequest.post_to_all()
        images = {
            master.split("@")[0]:
                dockletRequest.post("/image/list/", {}, master.split("@")[0]).get("images")
            for master in masterips
        }
        logger.info(images)
        return self.render(self.template_path, masterips=masterips, images=images)
|
||||
|
||||
|
||||
class infoBatchJobView(normalView):
    """Detail page for a single batch job."""
    template_path = "batch/batch_info.html"
    error_path = "error.html"
    masterip = ""   # set by the route handler before as_view()
    jobid = ""      # set by the route handler before as_view()

    @classmethod
    def get(self):
        """Ask the master for the job's info and render it."""
        payload = {'jobid': self.jobid}
        result = dockletRequest.post("/batch/job/info/", payload, self.masterip)
        jobinfo = result.get("data")
        logger.info(str(jobinfo))
        if result.get('success', "") == "true":
            return self.render(self.template_path, masterip=self.masterip, jobinfo=jobinfo)
        return self.render(self.error_path, message=result.get('message'))
|
||||
|
||||
class addBatchJobView(normalView):
    """Submit a new batch job to a master, then return to the list page."""
    template_path = "batch/batch_list.html"
    error_path = "error.html"

    @classmethod
    def post(self):
        """Forward the submitted form (self.job_data) to the master."""
        result = dockletRequest.post("/batch/job/add/", self.job_data, self.masterip)
        if result.get('success', None) != "true":
            return self.render(self.error_path, message=result.get('message'))
        return redirect('/batch_jobs/')
|
||||
|
||||
class stopBatchJobView(normalView):
    """Stop a running batch job, then return to the list page."""
    template_path = "batch/batch_list.html"
    error_path = "error.html"

    @classmethod
    def get(self):
        """Ask the master to stop the job identified by self.jobid."""
        result = dockletRequest.post("/batch/job/stop/", {'jobid': self.jobid}, self.masterip)
        if result.get('success', None) != "true":
            return self.render(self.error_path, message=result.get('message'))
        return redirect('/batch_jobs/')
|
||||
|
||||
class outputBatchJobView(normalView):
    """Show the stdout/stderr of one vnode of one task of a batch job."""
    template_path = "batch/batch_output.html"
    error_path = "error.html"
    masterip = ""   # all five fields are set by the route handler
    jobid = ""
    taskid = ""
    vnodeid = ""
    issue = ""      # "stdout" or "stderr"

    @classmethod
    def get(self):
        """Fetch the requested output stream from the master and render it."""
        data = {
            'jobid': self.jobid,
            'taskid': self.taskid,
            'vnodeid': self.vnodeid,
            'issue': self.issue
        }
        result = dockletRequest.post("/batch/job/output/", data, self.masterip)
        output = result.get("data")
        if result.get('success', "") == "true":
            return self.render(self.template_path, masterip=self.masterip, jobid=self.jobid,
                taskid=self.taskid, vnodeid=self.vnodeid, issue=self.issue, output=output)
        else:
            # Render the standard error page with the master's message,
            # matching the other batch views (was a bare self.error()).
            return self.render(self.error_path, message=result.get('message'))
|
|
@ -21,7 +21,6 @@ class statusView(normalView):
|
|||
print(quotainfo)'''
|
||||
allcontainers = {}
|
||||
if (result):
|
||||
containers = {}
|
||||
for master in allclusters:
|
||||
allcontainers[master] = {}
|
||||
for cluster in allclusters[master]:
|
||||
|
@ -32,6 +31,18 @@ class statusView(normalView):
|
|||
else:
|
||||
self.error()
|
||||
allcontainers[master][cluster] = message
|
||||
message = dockletRequest.post('/batch/vnodes/list/', data, master.split("@")[0])
|
||||
message = message.get('data')
|
||||
containers = []
|
||||
for m in message:
|
||||
container = {}
|
||||
container['containername'] = m
|
||||
container['ip'] = '--'
|
||||
containers.append(container)
|
||||
tmp = {}
|
||||
tmp['containers'] = containers
|
||||
tmp['status'] = 'running'
|
||||
allcontainers[master]['Batch_Job'] = tmp
|
||||
return self.render(self.template_path, quotas = quotas, quotanames = quotanames, allcontainers = allcontainers, user = session['username'])
|
||||
else:
|
||||
self.error()
|
||||
|
|
Loading…
Reference in New Issue