Merge pull request #379 from FirmlyReality/batch

Batch
This commit is contained in:
Yujian Zhu 2019-04-04 01:12:36 +08:00 committed by GitHub
commit 70979e1b35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 155 additions and 39 deletions

View File

@ -17,8 +17,19 @@ def db_commit():
raise raise
class BatchJob(object): class BatchJob(object):
def __init__(self, jobid, user, job_info): def __init__(self, jobid, user, job_info, old_job_db=None):
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority'])) if old_job_db is None:
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
else:
self.job_db = old_job_db
self.job_db.clear()
job_info = {}
job_info['jobName'] = self.job_db.name
job_info['jobPriority'] = self.job_db.priority
all_tasks = self.job_db.tasks.all()
job_info['tasks'] = {}
for t in all_tasks:
job_info['tasks'][t.idx] = json.loads(t.config)
self.user = user self.user = user
#self.raw_job_info = job_info #self.raw_job_info = job_info
self.job_id = jobid self.job_id = jobid
@ -35,8 +46,12 @@ class BatchJob(object):
self.tasks_cnt['pending'] = len(raw_tasks.keys()) self.tasks_cnt['pending'] = len(raw_tasks.keys())
for task_idx in raw_tasks.keys(): for task_idx in raw_tasks.keys():
task_info = raw_tasks[task_idx] task_info = raw_tasks[task_idx]
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info) if old_job_db is None:
self.job_db.tasks.append(task_db) task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
self.job_db.tasks.append(task_db)
else:
task_db = Batchtask.query.get(jobid+"_"+task_idx)
task_db.clear()
self.tasks[task_idx] = {} self.tasks[task_idx] = {}
self.tasks[task_idx]['id'] = jobid+"_"+task_idx self.tasks[task_idx]['id'] = jobid+"_"+task_idx
self.tasks[task_idx]['config'] = task_info self.tasks[task_idx]['config'] = task_info
@ -54,7 +69,8 @@ class BatchJob(object):
self.dependency_out[d] = [] self.dependency_out[d] = []
self.dependency_out[d].append(task_idx) self.dependency_out[d].append(task_idx)
db.session.add(self.job_db) if old_job_db is None:
db.session.add(self.job_db)
db_commit() db_commit()
self.log_status() self.log_status()
@ -259,6 +275,7 @@ class JobMgr():
# load job information from etcd # load job information from etcd
# initial a job queue and job schedueler # initial a job queue and job schedueler
def __init__(self, taskmgr): def __init__(self, taskmgr):
logger.info("Init jobmgr...")
try: try:
Batchjob.query.all() Batchjob.query.all()
except: except:
@ -270,6 +287,22 @@ class JobMgr():
self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT')) self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT'))
self.auth_key = env.getenv('AUTH_KEY') self.auth_key = env.getenv('AUTH_KEY')
self.recover_jobs()
def recover_jobs(self):
logger.info("Rerun the unfailed and unfinished jobs...")
try:
rejobs = Batchjob.query.filter(~Batchjob.status.in_(['done','failed']))
rejobs = rejobs.order_by(Batchjob.create_time).all()
for rejob in rejobs:
logger.info("Rerun job: "+rejob.id)
logger.debug(str(rejob))
job = BatchJob(rejob.id, rejob.username, None, rejob)
self.job_map[job.job_id] = job
self.process_job(job)
except Exception as err:
logger.error(traceback.format_exc())
def charge_beans(self,username,billing): def charge_beans(self,username,billing):
logger.debug("Charge user(%s) for %d beans"%(username, billing)) logger.debug("Charge user(%s) for %d beans"%(username, billing))
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key} data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}

View File

@ -113,6 +113,7 @@ class SubTask():
self.status_reason = '' self.status_reason = ''
self.try_count = 0 self.try_count = 0
self.worker = None self.worker = None
self.lock = threading.Lock()
def waiting_for_retry(self,reason=""): def waiting_for_retry(self,reason=""):
self.try_count += 1 self.try_count += 1
@ -120,7 +121,7 @@ class SubTask():
if self.status == FAILED: if self.status == FAILED:
self.root_task.status = FAILED self.root_task.status = FAILED
self.failed_reason = reason self.failed_reason = reason
self.root_task.failed_reason = reason
class TaskReporter(MasterServicer): class TaskReporter(MasterServicer):
@ -197,6 +198,19 @@ class TaskMgr(threading.Thread):
return new_f return new_f
return lock return lock
def subtask_lock(f):
@wraps(f)
def new_f(self, subtask, *args, **kwargs):
subtask.lock.acquire()
try:
result = f(self, subtask, *args, **kwargs)
except Exception as err:
subtask.lock.release()
raise err
subtask.lock.release()
return result
return new_f
def run(self): def run(self):
self.serve() self.serve()
while not self.thread_stop: while not self.thread_stop:
@ -272,7 +286,10 @@ class TaskMgr(threading.Thread):
self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
return [True, ''] return [True, '']
@subtask_lock
def stop_vnode(self, subtask): def stop_vnode(self, subtask):
if not subtask.vnode_started:
return [True, ""]
try: try:
self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid)) self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port)) channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
@ -482,13 +499,18 @@ class TaskMgr(threading.Thread):
if sub_task.status != RUNNING: if sub_task.status != RUNNING:
self.logger.error('[on_task_report] receive task report when vnode is not running') self.logger.error('[on_task_report] receive task report when vnode is not running')
sub_task.status = report.subTaskStatus #sub_task.status = report.subTaskStatus
sub_task.status_reason = report.errmsg sub_task.status_reason = report.errmsg
sub_task.task_started = False
if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT: if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
self.clear_sub_task(sub_task)
sub_task.waiting_for_retry(report.errmsg) sub_task.waiting_for_retry(report.errmsg)
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg) self.logger.info('task %s report failed, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
if sub_task.status == WAITING:
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
elif report.subTaskStatus == OUTPUTERROR: elif report.subTaskStatus == OUTPUTERROR:
self.clear_sub_task(sub_task)
sub_task.status = FAILED sub_task.status = FAILED
task.status = FAILED task.status = FAILED
task.failed_reason = report.errmsg task.failed_reason = report.errmsg
@ -506,7 +528,7 @@ class TaskMgr(threading.Thread):
self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list]))) self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
if self.check_task_completed(task): if self.check_task_completed(task):
continue continue
self.logger.info("test") self.logger.info('schedule task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
if task.at_same_time: if task.at_same_time:
# parallel tasks # parallel tasks
@ -671,8 +693,13 @@ class TaskMgr(threading.Thread):
disk = int(json_task['diskSetting']), disk = int(json_task['diskSetting']),
gpu = int(json_task['gpuSetting'])), gpu = int(json_task['gpuSetting'])),
mount = [Mount( mount = [Mount(
localPath = json_task['mapping'][mapping_key]['mappingLocalDir'], provider = json_task['mapping'][mapping_key]['mappingProvider'],
remotePath=json_task['mapping'][mapping_key]['mappingRemoteDir']) localPath = json_task['mapping'][mapping_key]['mappingMountpath'],
remotePath = json_task['mapping'][mapping_key]['mappingBucketName'],
accessKey = json_task['mapping'][mapping_key]['mappingAccessKey'],
secretKey = json_task['mapping'][mapping_key]['mappingSecretKey'],
other = json_task['mapping'][mapping_key]['mappingEndpoint']
)
for mapping_key in json_task['mapping']] if 'mapping' in json_task else [] for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
), ),
), ),
@ -689,7 +716,7 @@ class TaskMgr(threading.Thread):
timeout = int(json_task['expTime']) timeout = int(json_task['expTime'])
# commands are executed in all vnodes / only excuted in the first vnode # commands are executed in all vnodes / only excuted in the first vnode
# if in traditional mode, commands will be executed in all vnodes # if in traditional mode, commands will be executed in all vnodes
) if (not 'atSameTime' in json_task.keys() or json_task['runon'] == 'all' or vnode_index == 0) else None ) if (json_task['runon'] == 'all' or vnode_index == 0) else None
} for vnode_index in range(int(json_task['vnodeCount']))]) } for vnode_index in range(int(json_task['vnodeCount']))])
if task.at_same_time: if task.at_same_time:

View File

@ -461,6 +461,12 @@ class Batchjob(db.Model):
self.end_time = None self.end_time = None
self.billing = 0 self.billing = 0
def clear(self):
self.status = "pending"
self.failed_reason = ""
self.end_time = None
self.billing = 0
def __repr__(self): def __repr__(self):
info = {} info = {}
info['job_id'] = self.id info['job_id'] = self.id
@ -503,6 +509,15 @@ class Batchtask(db.Model):
self.config = json.dumps(config) self.config = json.dumps(config)
self.tried_times = 0 self.tried_times = 0
def clear(self):
self.status = "pending"
self.failed_reason = ""
self.start_time = None
self.end_time = None
self.running_time = 0
self.billing = 0
self.tried_times = 0
def __repr__(self): def __repr__(self):
info = {} info = {}
info['id'] = self.id info['id'] = self.id

View File

@ -27,7 +27,7 @@ class OssMounter(object):
# umount oss # umount oss
pass pass
class aliyunOssMounter(OssMounter): class AliyunOssMounter(OssMounter):
@staticmethod @staticmethod
def mount_oss(datapath, mount_info): def mount_oss(datapath, mount_info):
@ -42,7 +42,7 @@ class aliyunOssMounter(OssMounter):
cmd = "chmod 640 /etc/passwd-ossfs" cmd = "chmod 640 /etc/passwd-ossfs"
[success1, msg] = OssMounter.execute_cmd(cmd) [success1, msg] = OssMounter.execute_cmd(cmd)
mountpath = datapath+"/"+mount_info.remotePath mountpath = datapath+"/Aliyun/"+mount_info.remotePath
logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath)) logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath))
if not os.path.isdir(mountpath): if not os.path.isdir(mountpath):
os.makedirs(mountpath) os.makedirs(mountpath)

View File

@ -202,11 +202,12 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg) return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)
#mount oss #mount oss
rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
self.mount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_list) self.mount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_list)
conffile = open("/var/lib/lxc/%s/config" % lxcname, 'a+') conffile = open("/var/lib/lxc/%s/config" % lxcname, 'a+')
mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0" mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
for mount in mount_list: for mount in mount_list:
conffile.write("\n"+ mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath)) conffile.write("\n"+ mount_str % (self.fspath, username, mount.provider, mount.remotePath, rootfs, mount.remotePath))
conffile.close() conffile.close()
logger.info("Start container %s..." % lxcname) logger.info("Start container %s..." % lxcname)

View File

@ -244,7 +244,7 @@
<i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i> <i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i>
</div> </div>
<!-- Default to the left --> <!-- Default to the left -->
<strong>Copyright</strong>&copy;&nbsp;2017 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a> <strong>Copyright</strong>&copy;&nbsp;2019 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
</footer> </footer>

View File

@ -121,27 +121,41 @@
$("#collapse" + obj.id).collapse('hide'); $("#collapse" + obj.id).collapse('hide');
} }
function chmountPath(obj,task_num,mapping_num) {
cellid = 'mapping_mountpath_' + task_num + '_' + mapping_num;
$('#'+cellid).val("/root/oss/"+obj.value);
}
function removeMapping(obj) { function removeMapping(obj) {
$("#mapping_" + obj.id).remove(); $("#mapping_" + obj.id).remove();
} }
function addMapping(obj) { function addMapping(obj,task_num) {
mapping_number += 1; mapping_number += 1;
var table = $("#storage_mapping_" + obj.id)[0]; var table = $("#storage_mapping_" + obj.id)[0];
var new_mapping = table.insertRow(); var new_mapping = table.insertRow();
new_mapping.id = "mapping_" + task_number + "_" + mapping_number; new_mapping.id = "mapping_" + task_num + "_" + mapping_number;
var local_dir = new_mapping.insertCell(); var provider = new_mapping.insertCell();
var remote_dir = new_mapping.insertCell(); var bucket_name = new_mapping.insertCell();
var source = new_mapping.insertCell(); var accessKey = new_mapping.insertCell();
var secretKey = new_mapping.insertCell();
var endpoint = new_mapping.insertCell();
var mountpath = new_mapping.insertCell();
var remove = new_mapping.insertCell(); var remove = new_mapping.insertCell();
local_dir.innerHTML = '<input type="text" class="form-control" name="mappingLocalDir_' + task_number + '_' + mapping_number + '" id="mapping_local_dir_' bucket_name.innerHTML = '<input type="text" class="form-control" name="mappingBucketName_' + task_num + '_' + mapping_number + '" id="mapping_bucketname_'
+ task_number + '_' + mapping_number + '" />'; + task_num + '_' + mapping_number + '" onKeyUp="chmountPath(this,'+task_num+','+mapping_number+');" required/>';
remote_dir.innerHTML = '<input type="text" class="form-control" name="mappingRemoteDir_' + task_number + '_' + mapping_number + '" id="mapping_remote_dir_' accessKey.innerHTML = '<input type="text" class="form-control" name="mappingAccessKey_' + task_num + '_' + mapping_number + '" id="mapping_accessKey_'
+ task_number + '_' + mapping_number + '" />'; + task_num + '_' + mapping_number + '" required/>';
source.innerHTML = '<select class="form-control" name="mappingSource_' + task_number + '_' + mapping_number + '" id="mapping_source_' secretKey.innerHTML = '<input type="text" class="form-control" name="mappingSecretKey_' + task_num + '_' + mapping_number + '" id="mapping_secretKey_'
+ task_number + '_' + mapping_number + '">' + task_num + '_' + mapping_number + '" required/>';
+'<option>Aliyun</option><option>AWS</option></select>'; endpoint.innerHTML = 'http://<input type="text" class="form-control" name="mappingEndpoint_' + task_num + '_' + mapping_number + '" id="mapping_endpoint_'
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_number + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">' + task_num + '_' + mapping_number + '" required/>';
mountpath.innerHTML = '<input type="text" class="form-control" name="mappingMountpath_' + task_num + '_' + mapping_number + '" id="mapping_mountpath_'
+ task_num + '_' + mapping_number + '" readonly="true" required/>';
provider.innerHTML = '<select class="form-control" name="mappingProvider_' + task_num + '_' + mapping_number + '" id="mapping_provider_'
+ task_num + '_' + mapping_number + '">'
+'<option>Aliyun</option></select>';
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_num + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
+'Remove</button></div>'; +'Remove</button></div>';
} }
@ -208,7 +222,7 @@
function addTask() { function addTask() {
task_number += 1; task_number += 1;
var masterip=$("select#masterselector").children('option:selected').val(); var masterip=$("select#masterselector").children('option:selected').val();
mapping_number = 0; //mapping_number = 0;
var task_html = ''; var task_html = '';
task_html += task_html +=
'<div class="panel panel-default" id="task_pannel_' + task_number + '">' '<div class="panel panel-default" id="task_pannel_' + task_number + '">'
@ -324,18 +338,18 @@
+'</div>' +'</div>'
+'</div>' +'</div>'
+'<div class="form-group">' +'<div class="form-group">'
+'<span>' +'<label class="col-sm-2 control-label">Object Storage Mapping<br/>'
+'<label class="col-sm-2 contril-label">Exteranl Storage Mapping</label>' +'<button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this,'+task_number+')">'
+'<table class="table table-bordered" id="storage_mapping_' + task_number + '" style="display:inline;">' +'Add<i class="fa fa-plus"></i></button></label>'
+'<div class="col-sm-10"><table class="table table-bordered" id="storage_mapping_' + task_number + '">'
+'<thead>' +'<thead>'
+'<tr><td><button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this)">' +'<tr><th>Provider</th><th>Bucket Name</th><th>AccessKey ID</th><th>AccessKey Secret</th><th>Endpoint</th><th>Mount Path</th><th>Remove</th></tr>'
+'<i class="fa fa-plus"></i></button></td></tr>'
+'<tr><th style="width:217px">Local Dir</th><th style="width:217px">Remote Dir</th><th style="width:217px">source</th><th style="width:217px">Operation</th></tr>'
+'</thead>' +'</thead>'
+'<tbody>' +'<tbody>'
+'</tbody>' +'</tbody>'
+'</table>' +'</table></div>'
+'</span></div><div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>' +'</div>'
+'<div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
+'</div></div></div>' +'</div></div></div>'
$(task_html).appendTo("#accordion"); $(task_html).appendTo("#accordion");
} }

View File

@ -216,6 +216,32 @@
</tbody> </tbody>
</table> </table>
</div> </div>
{% if 'mapping' in task['config'].keys() %}
<div class="table-responsive">
<table class="table table-bordered table-hover">
<thead>
<tr>
<th>Provider</th>
<th>Bucket Name</th>
<th>AccessKey ID</th>
<th>Endpoint</th>
<th>Mount Path</th>
</tr>
</thead>
<tbody>
{% for key in task['config']['mapping'].keys() %}
<tr>
<td>{{ task['config']['mapping'][key]['mappingProvider'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingBucketName'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingAccessKey'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingEndpoint'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingMountpath'] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endif %}
</div> </div>
</div> </div>
</div> </div>