update taskworker.py

This commit is contained in:
zhuyj17 2019-03-04 11:50:50 +08:00
parent bd732e679f
commit 4e222d46be
2 changed files with 118 additions and 4 deletions

View File

@@ -44,9 +44,22 @@ def stop_vnode():
response = stub.stop_vnode(vnodeinfo)
print("Batch client received: " + str(response.status)+" "+response.message)
def start_task():
    """Send one hard-coded test task to the worker at localhost:50051 and print its reply."""
    stub = rpc_pb2_grpc.WorkerStub(grpc.insecure_channel('localhost:50051'))

    # Command run by the task, its working directory and extra environment.
    command = rpc_pb2.Command(
        commandLine="ls /root;sleep 5;ls /root",
        packagePath="/root",
        envVars={'test1': '10', 'test2': '20'},
    )
    parameters = rpc_pb2.Parameters(
        command=command,
        stderrRedirectPath="/root/nfs/batch_{jobid}/",
        stdoutRedirectPath="/root/nfs/batch_{jobid}/",
    )
    task = rpc_pb2.TaskInfo(
        taskid="test",
        username="root",
        vnodeid=1,
        parameters=parameters,
        timeout=20,
        token="test",
    )

    reply = stub.start_task(task)
    print("Batch client received: " + str(reply.status)+" "+reply.message)
if __name__ == '__main__':
    # Manual client driver: the calls below are toggled by hand while testing.
    # NOTE(review): an active run() directly followed by a commented-out
    # "#run()" looks like a pre-change line left over from the diff view;
    # confirm whether run() is still meant to execute.
    # (A "for i in range(10):" loop header was also left commented out here.)
    run()
    start_task()
    # Other client calls, currently disabled:
    #   stop_vnode()
    #   time.sleep(4)
    #   stop_task()

View File

@@ -70,7 +70,7 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
#self.start_report()
logger.info('TaskWorker init success')
def add_gpu_device(self, lxcname, gpu_need):
if gpu_need < 1:
return [True, ""]
@@ -205,11 +205,30 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
def start_task(self, request, context):
    """Accept a task request and run it on a background thread.

    Reads the task description from ``request``, adds ``taskid`` and
    ``vnodeid`` to the task's environment variables, and hands everything to
    ``self.execute_task`` on a daemon thread so the RPC returns immediately.

    Returns:
        An ``rpc_pb2.Reply`` with status ``ACCEPTED`` and an empty message.
        The reply is sent before the task has run; it does not indicate the
        task's outcome.
    """
    taskid = request.taskid
    username = request.username
    vnodeid = request.vnodeid

    # Task configuration carried by the request.
    command_msg = request.parameters.command
    command = command_msg.commandLine
    pkgpath = command_msg.packagePath
    # Copy the map so the extra entries are not written back into the
    # request message that is shared with the RPC layer.
    envs = dict(command_msg.envVars)
    envs['taskid'] = str(taskid)
    envs['vnodeid'] = str(vnodeid)
    timeout = request.timeout
    token = request.token
    # Index 0 is the stdout destination, index 1 the stderr destination.
    outpath = [request.parameters.stdoutRedirectPath, request.parameters.stderrRedirectPath]

    lxcname = '%s-batch-%s-%s' % (username, taskid, str(vnodeid))

    # daemon=True replaces the deprecated Thread.setDaemon(True).
    thread = threading.Thread(
        target=self.execute_task,
        args=(username, taskid, vnodeid, envs, lxcname, pkgpath, command, timeout, outpath, token),
        daemon=True,
    )
    thread.start()

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
def stop_task(self, request, context):
    """Kill the container of every task listed in ``request.taskmsgs``.

    For each message the container name is derived from the message's
    username, taskid, vnodeid and token, and ``lxc-stop -k`` is run on it.
    The exit status of ``lxc-stop`` is not inspected.

    Returns:
        An ``rpc_pb2.Reply`` with status ``ACCEPTED`` and an empty message.
    """
    for msg in request.taskmsgs:
        # NOTE(review): start_task in this class builds the container name
        # without the trailing token component -- confirm which form matches
        # the containers that actually exist.
        lxcname = '%s-batch-%s-%s-%s' % (msg.username,msg.taskid,str(msg.vnodeid),msg.token)
        logger.info("Stop the task with lxc:"+lxcname)
        try:
            # Argument list instead of a shell string: the name is built from
            # request fields and must not be interpreted by a shell.
            subprocess.run(["lxc-stop", "-k", "-n", lxcname], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        except OSError:
            # lxc-stop could not be launched; keep going with the other tasks.
            logger.error(traceback.format_exc())

    return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
@@ -287,6 +306,88 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
conffile.close()
return [True, ""]
def write_output(self, lxcname, tmplogpath, filepath):
    """Move a task log from its temporary location to the requested path.

    The move is performed with ``mv`` inside the container ``lxcname``; the
    source is ``/root/nfs/`` + ``tmplogpath``.

    Nothing is moved when ``filepath`` is empty, equals the placeholder
    ``/root/nfs/batch_{jobid}/``, or resolves to the source path itself.

    Returns:
        ``[True, ""]`` on success or when no move is needed, otherwise
        ``[False, <error text>]``.
    """
    source = "/root/nfs/" + tmplogpath
    if filepath == "" or filepath == "/root/nfs/batch_{jobid}/":
        return [True, ""]
    if os.path.abspath(source) == os.path.abspath(filepath):
        return [True, ""]

    try:
        # Argument list instead of a shell string: filepath comes from the
        # task request and must not be interpreted by a shell.
        ret = subprocess.run(
            ["lxc-attach", "-n", lxcname, "--", "mv", source, filepath],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError as err:
        # lxc-attach itself could not be launched.
        msg = str(err)
        logger.error(msg)
        return [False, msg]

    if ret.returncode != 0:
        # errors="replace" keeps undecodable output from raising here.
        msg = ret.stdout.decode(encoding="utf-8", errors="replace")
        logger.error(msg)
        return [False, msg]

    logger.info("Succeed to moving nfs/%s to %s" % (tmplogpath,filepath))
    return [True, ""]
def execute_task(self,username,taskid,vnodeid,envs,lxcname,pkgpath,command,timeout,outpath,token):
    """Run one task inside its container and report the outcome.

    Steps:
      1. Write ``batch_job.sh`` (``cd pkgpath`` followed by ``command``) into
         ``/root`` of the container's rootfs.
      2. Open per-task stdout/stderr log files under
         ``<self.fspath>/global/users/<username>/data/batch_<jobid>``.
      3. Run the script with ``lxc-attach``, passing ``envs`` via ``-v``.
      4. Poll until the process exits or the time budget is used up.
      5. Move the logs to ``outpath[0]`` / ``outpath[1]`` through
         ``self.write_output`` and report a status through ``self.add_msg``:
         TIMEOUT, OUTPUTERROR, COMPLETED or FAILED.

    ``timeout == 0`` selects ``MAX_RUNNING_TIME`` (a name defined outside this
    method) as the time budget.

    If the script file cannot be written the error is logged and the method
    returns without reporting a status.
    """
    lxcfspath = "/var/lib/lxc/"+lxcname+"/rootfs/"
    scriptname = "batch_job.sh"

    # 1. Write the job script into the container's root filesystem.
    try:
        with open(lxcfspath+"root/"+scriptname,"w") as scriptfile:
            scriptfile.write("#!/bin/bash\n")
            scriptfile.write("cd "+str(pkgpath)+"\n")
            scriptfile.write(command)
    except Exception:
        logger.error(traceback.format_exc())
        logger.error("Fail to write script file with taskid(%s) vnodeid(%s)" % (str(taskid),str(vnodeid)))
        return

    # 2. Prepare the log directory; the job id is the second '_'-separated
    #    field of the task id.
    try:
        job_id = taskid.split('_')[1]
    except Exception:
        logger.error(traceback.format_exc())
        job_id = "_none"
    jobdir = "batch_" + job_id
    logdir = "%s/global/users/%s/data/" % (self.fspath,username) + jobdir
    if not os.path.exists(logdir):
        logger.info("Directory:%s not exists, create it." % logdir)
        try:
            os.mkdir(logdir)
        except FileExistsError:
            # Another task thread created it between the check and mkdir.
            pass

    stdoutname = str(taskid)+"_"+str(vnodeid)+"_stdout.txt"
    stderrname = str(taskid)+"_"+str(vnodeid)+"_stderr.txt"
    stdoutfile = None
    stderrfile = None
    try:
        stdoutfile = open(logdir+"/"+stdoutname,"w")
        stderrfile = open(logdir+"/"+stderrname,"w")
        logger.info("Create stdout(%s) and stderr(%s) file to log" % (stdoutname, stderrname))
    except Exception:
        logger.error(traceback.format_exc())
        # Run without redirection if either log file is unavailable, and do
        # not leak a handle that was opened before the failure.
        if stdoutfile is not None:
            stdoutfile.close()
        stdoutfile = None
        stderrfile = None

    # 3./4. Launch the script and wait for it, bounded by the time budget.
    try:
        # Argument list instead of a shell string: env values come from the
        # task request, and without the shell wrapper kill() reaches
        # lxc-attach directly.
        cmd = ["lxc-attach", "-n", lxcname]
        for envkey,envval in envs.items():
            cmd += ["-v", "%s=%s" % (envkey,envval)]
        cmd += ["--", "/bin/bash", "/root/" + scriptname]
        logger.info('run task with command - %s' % " ".join(cmd))
        try:
            p = subprocess.Popen(cmd,stdout=stdoutfile,stderr=stderrfile)
        except OSError:
            # lxc-attach could not be launched; report it like a failed run.
            logger.error(traceback.format_exc())
            logger.info("Task(%s-%s-%s) failed." % (str(taskid),str(vnodeid),token))
            self.add_msg(taskid,username,vnodeid,rpc_pb2.FAILED,token,"")
            return

        to = MAX_RUNNING_TIME if timeout == 0 else timeout
        while p.poll() is None and to > 0:
            time.sleep(min(2,to))
            to -= 2

        timed_out = p.poll() is None
        if timed_out:
            p.kill()
            p.wait()  # reap the killed process
    finally:
        # The log files are no longer needed by this process.
        for logfile in (stdoutfile, stderrfile):
            if logfile is not None:
                logfile.close()

    # 5. Report the outcome.
    if timed_out:
        logger.info("Running time(%d) is out. Task(%s-%s-%s) will be killed." % (timeout,str(taskid),str(vnodeid),token))
        self.add_msg(taskid,username,vnodeid,rpc_pb2.TIMEOUT,token,"Running time is out.")
        return

    [success1,msg1] = self.write_output(lxcname,jobdir+"/"+stdoutname,outpath[0])
    [success2,msg2] = self.write_output(lxcname,jobdir+"/"+stderrname,outpath[1])
    if not success1 or not success2:
        # Report the stdout error first if both moves failed.
        msg = msg1 if not success1 else msg2
        logger.info("Output error on Task(%s-%s-%s)." % (str(taskid),str(vnodeid),token))
        self.add_msg(taskid,username,vnodeid,rpc_pb2.OUTPUTERROR,token,msg)
    elif p.poll() == 0:
        logger.info("Task(%s-%s-%s) completed." % (str(taskid),str(vnodeid),token))
        self.add_msg(taskid,username,vnodeid,rpc_pb2.COMPLETED,token,"")
    else:
        logger.info("Task(%s-%s-%s) failed." % (str(taskid),str(vnodeid),token))
        self.add_msg(taskid,username,vnodeid,rpc_pb2.FAILED,token,"")
def add_msg(self,taskid,username,vnodeid,status,token,errmsg):
self.msgslock.acquire()
try: