Merge pull request #379 from FirmlyReality/batch

Batch
This commit is contained in:
Yujian Zhu 2019-04-04 01:12:36 +08:00 committed by GitHub
commit 70979e1b35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 155 additions and 39 deletions

View File

@ -17,8 +17,19 @@ def db_commit():
raise
class BatchJob(object):
def __init__(self, jobid, user, job_info):
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
def __init__(self, jobid, user, job_info, old_job_db=None):
if old_job_db is None:
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
else:
self.job_db = old_job_db
self.job_db.clear()
job_info = {}
job_info['jobName'] = self.job_db.name
job_info['jobPriority'] = self.job_db.priority
all_tasks = self.job_db.tasks.all()
job_info['tasks'] = {}
for t in all_tasks:
job_info['tasks'][t.idx] = json.loads(t.config)
self.user = user
#self.raw_job_info = job_info
self.job_id = jobid
@ -35,8 +46,12 @@ class BatchJob(object):
self.tasks_cnt['pending'] = len(raw_tasks.keys())
for task_idx in raw_tasks.keys():
task_info = raw_tasks[task_idx]
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
self.job_db.tasks.append(task_db)
if old_job_db is None:
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
self.job_db.tasks.append(task_db)
else:
task_db = Batchtask.query.get(jobid+"_"+task_idx)
task_db.clear()
self.tasks[task_idx] = {}
self.tasks[task_idx]['id'] = jobid+"_"+task_idx
self.tasks[task_idx]['config'] = task_info
@ -54,7 +69,8 @@ class BatchJob(object):
self.dependency_out[d] = []
self.dependency_out[d].append(task_idx)
db.session.add(self.job_db)
if old_job_db is None:
db.session.add(self.job_db)
db_commit()
self.log_status()
@ -259,6 +275,7 @@ class JobMgr():
# load job information from etcd
# initial a job queue and job schedueler
def __init__(self, taskmgr):
logger.info("Init jobmgr...")
try:
Batchjob.query.all()
except:
@ -270,6 +287,22 @@ class JobMgr():
self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT'))
self.auth_key = env.getenv('AUTH_KEY')
self.recover_jobs()
def recover_jobs(self):
logger.info("Rerun the unfailed and unfinished jobs...")
try:
rejobs = Batchjob.query.filter(~Batchjob.status.in_(['done','failed']))
rejobs = rejobs.order_by(Batchjob.create_time).all()
for rejob in rejobs:
logger.info("Rerun job: "+rejob.id)
logger.debug(str(rejob))
job = BatchJob(rejob.id, rejob.username, None, rejob)
self.job_map[job.job_id] = job
self.process_job(job)
except Exception as err:
logger.error(traceback.format_exc())
def charge_beans(self,username,billing):
logger.debug("Charge user(%s) for %d beans"%(username, billing))
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}

View File

@ -113,6 +113,7 @@ class SubTask():
self.status_reason = ''
self.try_count = 0
self.worker = None
self.lock = threading.Lock()
def waiting_for_retry(self,reason=""):
self.try_count += 1
@ -120,7 +121,7 @@ class SubTask():
if self.status == FAILED:
self.root_task.status = FAILED
self.failed_reason = reason
self.root_task.failed_reason = reason
class TaskReporter(MasterServicer):
@ -197,6 +198,19 @@ class TaskMgr(threading.Thread):
return new_f
return lock
def subtask_lock(f):
@wraps(f)
def new_f(self, subtask, *args, **kwargs):
subtask.lock.acquire()
try:
result = f(self, subtask, *args, **kwargs)
except Exception as err:
subtask.lock.release()
raise err
subtask.lock.release()
return result
return new_f
def run(self):
self.serve()
while not self.thread_stop:
@ -272,7 +286,10 @@ class TaskMgr(threading.Thread):
self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
return [True, '']
@subtask_lock
def stop_vnode(self, subtask):
if not subtask.vnode_started:
return [True, ""]
try:
self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
@ -482,13 +499,18 @@ class TaskMgr(threading.Thread):
if sub_task.status != RUNNING:
self.logger.error('[on_task_report] receive task report when vnode is not running')
sub_task.status = report.subTaskStatus
#sub_task.status = report.subTaskStatus
sub_task.status_reason = report.errmsg
sub_task.task_started = False
if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
self.clear_sub_task(sub_task)
sub_task.waiting_for_retry(report.errmsg)
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
self.logger.info('task %s report failed, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
if sub_task.status == WAITING:
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
elif report.subTaskStatus == OUTPUTERROR:
self.clear_sub_task(sub_task)
sub_task.status = FAILED
task.status = FAILED
task.failed_reason = report.errmsg
@ -506,7 +528,7 @@ class TaskMgr(threading.Thread):
self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
if self.check_task_completed(task):
continue
self.logger.info("test")
self.logger.info('schedule task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
if task.at_same_time:
# parallel tasks
@ -671,8 +693,13 @@ class TaskMgr(threading.Thread):
disk = int(json_task['diskSetting']),
gpu = int(json_task['gpuSetting'])),
mount = [Mount(
localPath = json_task['mapping'][mapping_key]['mappingLocalDir'],
remotePath=json_task['mapping'][mapping_key]['mappingRemoteDir'])
provider = json_task['mapping'][mapping_key]['mappingProvider'],
localPath = json_task['mapping'][mapping_key]['mappingMountpath'],
remotePath = json_task['mapping'][mapping_key]['mappingBucketName'],
accessKey = json_task['mapping'][mapping_key]['mappingAccessKey'],
secretKey = json_task['mapping'][mapping_key]['mappingSecretKey'],
other = json_task['mapping'][mapping_key]['mappingEndpoint']
)
for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
),
),
@ -689,7 +716,7 @@ class TaskMgr(threading.Thread):
timeout = int(json_task['expTime'])
# commands are executed in all vnodes / only excuted in the first vnode
# if in traditional mode, commands will be executed in all vnodes
) if (not 'atSameTime' in json_task.keys() or json_task['runon'] == 'all' or vnode_index == 0) else None
) if (json_task['runon'] == 'all' or vnode_index == 0) else None
} for vnode_index in range(int(json_task['vnodeCount']))])
if task.at_same_time:

View File

@ -461,6 +461,12 @@ class Batchjob(db.Model):
self.end_time = None
self.billing = 0
def clear(self):
self.status = "pending"
self.failed_reason = ""
self.end_time = None
self.billing = 0
def __repr__(self):
info = {}
info['job_id'] = self.id
@ -503,6 +509,15 @@ class Batchtask(db.Model):
self.config = json.dumps(config)
self.tried_times = 0
def clear(self):
self.status = "pending"
self.failed_reason = ""
self.start_time = None
self.end_time = None
self.running_time = 0
self.billing = 0
self.tried_times = 0
def __repr__(self):
info = {}
info['id'] = self.id

View File

@ -27,7 +27,7 @@ class OssMounter(object):
# umount oss
pass
class aliyunOssMounter(OssMounter):
class AliyunOssMounter(OssMounter):
@staticmethod
def mount_oss(datapath, mount_info):
@ -42,7 +42,7 @@ class aliyunOssMounter(OssMounter):
cmd = "chmod 640 /etc/passwd-ossfs"
[success1, msg] = OssMounter.execute_cmd(cmd)
mountpath = datapath+"/"+mount_info.remotePath
mountpath = datapath+"/Aliyun/"+mount_info.remotePath
logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath))
if not os.path.isdir(mountpath):
os.makedirs(mountpath)

View File

@ -202,11 +202,12 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)
#mount oss
rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
self.mount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_list)
conffile = open("/var/lib/lxc/%s/config" % lxcname, 'a+')
mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
for mount in mount_list:
conffile.write("\n"+ mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath))
conffile.write("\n"+ mount_str % (self.fspath, username, mount.provider, mount.remotePath, rootfs, mount.remotePath))
conffile.close()
logger.info("Start container %s..." % lxcname)

View File

@ -244,7 +244,7 @@
<i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i>
</div>
<!-- Default to the left -->
<strong>Copyright</strong>&copy;&nbsp;2017 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
<strong>Copyright</strong>&copy;&nbsp;2019 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
</footer>

View File

@ -121,27 +121,41 @@
$("#collapse" + obj.id).collapse('hide');
}
function chmountPath(obj,task_num,mapping_num) {
cellid = 'mapping_mountpath_' + task_num + '_' + mapping_num;
$('#'+cellid).val("/root/oss/"+obj.value);
}
function removeMapping(obj) {
$("#mapping_" + obj.id).remove();
}
function addMapping(obj) {
function addMapping(obj,task_num) {
mapping_number += 1;
var table = $("#storage_mapping_" + obj.id)[0];
var new_mapping = table.insertRow();
new_mapping.id = "mapping_" + task_number + "_" + mapping_number;
var local_dir = new_mapping.insertCell();
var remote_dir = new_mapping.insertCell();
var source = new_mapping.insertCell();
new_mapping.id = "mapping_" + task_num + "_" + mapping_number;
var provider = new_mapping.insertCell();
var bucket_name = new_mapping.insertCell();
var accessKey = new_mapping.insertCell();
var secretKey = new_mapping.insertCell();
var endpoint = new_mapping.insertCell();
var mountpath = new_mapping.insertCell();
var remove = new_mapping.insertCell();
local_dir.innerHTML = '<input type="text" class="form-control" name="mappingLocalDir_' + task_number + '_' + mapping_number + '" id="mapping_local_dir_'
+ task_number + '_' + mapping_number + '" />';
remote_dir.innerHTML = '<input type="text" class="form-control" name="mappingRemoteDir_' + task_number + '_' + mapping_number + '" id="mapping_remote_dir_'
+ task_number + '_' + mapping_number + '" />';
source.innerHTML = '<select class="form-control" name="mappingSource_' + task_number + '_' + mapping_number + '" id="mapping_source_'
+ task_number + '_' + mapping_number + '">'
+'<option>Aliyun</option><option>AWS</option></select>';
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_number + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
bucket_name.innerHTML = '<input type="text" class="form-control" name="mappingBucketName_' + task_num + '_' + mapping_number + '" id="mapping_bucketname_'
+ task_num + '_' + mapping_number + '" onKeyUp="chmountPath(this,'+task_num+','+mapping_number+');" required/>';
accessKey.innerHTML = '<input type="text" class="form-control" name="mappingAccessKey_' + task_num + '_' + mapping_number + '" id="mapping_accessKey_'
+ task_num + '_' + mapping_number + '" required/>';
secretKey.innerHTML = '<input type="text" class="form-control" name="mappingSecretKey_' + task_num + '_' + mapping_number + '" id="mapping_secretKey_'
+ task_num + '_' + mapping_number + '" required/>';
endpoint.innerHTML = 'http://<input type="text" class="form-control" name="mappingEndpoint_' + task_num + '_' + mapping_number + '" id="mapping_endpoint_'
+ task_num + '_' + mapping_number + '" required/>';
mountpath.innerHTML = '<input type="text" class="form-control" name="mappingMountpath_' + task_num + '_' + mapping_number + '" id="mapping_mountpath_'
+ task_num + '_' + mapping_number + '" readonly="true" required/>';
provider.innerHTML = '<select class="form-control" name="mappingProvider_' + task_num + '_' + mapping_number + '" id="mapping_provider_'
+ task_num + '_' + mapping_number + '">'
+'<option>Aliyun</option></select>';
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_num + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
+'Remove</button></div>';
}
@ -208,7 +222,7 @@
function addTask() {
task_number += 1;
var masterip=$("select#masterselector").children('option:selected').val();
mapping_number = 0;
//mapping_number = 0;
var task_html = '';
task_html +=
'<div class="panel panel-default" id="task_pannel_' + task_number + '">'
@ -324,18 +338,18 @@
+'</div>'
+'</div>'
+'<div class="form-group">'
+'<span>'
+'<label class="col-sm-2 contril-label">Exteranl Storage Mapping</label>'
+'<table class="table table-bordered" id="storage_mapping_' + task_number + '" style="display:inline;">'
+'<label class="col-sm-2 control-label">Object Storage Mapping<br/>'
+'<button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this,'+task_number+')">'
+'Add<i class="fa fa-plus"></i></button></label>'
+'<div class="col-sm-10"><table class="table table-bordered" id="storage_mapping_' + task_number + '">'
+'<thead>'
+'<tr><td><button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this)">'
+'<i class="fa fa-plus"></i></button></td></tr>'
+'<tr><th style="width:217px">Local Dir</th><th style="width:217px">Remote Dir</th><th style="width:217px">source</th><th style="width:217px">Operation</th></tr>'
+'<tr><th>Provider</th><th>Bucket Name</th><th>AccessKey ID</th><th>AccessKey Secret</th><th>Endpoint</th><th>Mount Path</th><th>Remove</th></tr>'
+'</thead>'
+'<tbody>'
+'</tbody>'
+'</table>'
+'</span></div><div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
+'</table></div>'
+'</div>'
+'<div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
+'</div></div></div>'
$(task_html).appendTo("#accordion");
}

View File

@ -216,6 +216,32 @@
</tbody>
</table>
</div>
{% if 'mapping' in task['config'].keys() %}
<div class="table-responsive">
<table class="table table-bordered table-hover">
<thead>
<tr>
<th>Provider</th>
<th>Bucket Name</th>
<th>AccessKey ID</th>
<th>Endpoint</th>
<th>Mount Path</th>
</tr>
</thead>
<tbody>
{% for key in task['config']['mapping'].keys() %}
<tr>
<td>{{ task['config']['mapping'][key]['mappingProvider'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingBucketName'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingAccessKey'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingEndpoint'] }}</td>
<td>{{ task['config']['mapping'][key]['mappingMountpath'] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endif %}
</div>
</div>
</div>