commit
70979e1b35
|
@ -17,8 +17,19 @@ def db_commit():
|
|||
raise
|
||||
|
||||
class BatchJob(object):
|
||||
def __init__(self, jobid, user, job_info):
|
||||
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
|
||||
def __init__(self, jobid, user, job_info, old_job_db=None):
|
||||
if old_job_db is None:
|
||||
self.job_db = Batchjob(jobid,user,job_info['jobName'],int(job_info['jobPriority']))
|
||||
else:
|
||||
self.job_db = old_job_db
|
||||
self.job_db.clear()
|
||||
job_info = {}
|
||||
job_info['jobName'] = self.job_db.name
|
||||
job_info['jobPriority'] = self.job_db.priority
|
||||
all_tasks = self.job_db.tasks.all()
|
||||
job_info['tasks'] = {}
|
||||
for t in all_tasks:
|
||||
job_info['tasks'][t.idx] = json.loads(t.config)
|
||||
self.user = user
|
||||
#self.raw_job_info = job_info
|
||||
self.job_id = jobid
|
||||
|
@ -35,8 +46,12 @@ class BatchJob(object):
|
|||
self.tasks_cnt['pending'] = len(raw_tasks.keys())
|
||||
for task_idx in raw_tasks.keys():
|
||||
task_info = raw_tasks[task_idx]
|
||||
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
|
||||
self.job_db.tasks.append(task_db)
|
||||
if old_job_db is None:
|
||||
task_db = Batchtask(jobid+"_"+task_idx, task_idx, task_info)
|
||||
self.job_db.tasks.append(task_db)
|
||||
else:
|
||||
task_db = Batchtask.query.get(jobid+"_"+task_idx)
|
||||
task_db.clear()
|
||||
self.tasks[task_idx] = {}
|
||||
self.tasks[task_idx]['id'] = jobid+"_"+task_idx
|
||||
self.tasks[task_idx]['config'] = task_info
|
||||
|
@ -54,7 +69,8 @@ class BatchJob(object):
|
|||
self.dependency_out[d] = []
|
||||
self.dependency_out[d].append(task_idx)
|
||||
|
||||
db.session.add(self.job_db)
|
||||
if old_job_db is None:
|
||||
db.session.add(self.job_db)
|
||||
db_commit()
|
||||
|
||||
self.log_status()
|
||||
|
@ -259,6 +275,7 @@ class JobMgr():
|
|||
# load job information from etcd
|
||||
# initialize a job queue and job scheduler
|
||||
def __init__(self, taskmgr):
|
||||
logger.info("Init jobmgr...")
|
||||
try:
|
||||
Batchjob.query.all()
|
||||
except:
|
||||
|
@ -270,6 +287,22 @@ class JobMgr():
|
|||
self.userpoint = "http://" + env.getenv('USER_IP') + ":" + str(env.getenv('USER_PORT'))
|
||||
self.auth_key = env.getenv('AUTH_KEY')
|
||||
|
||||
self.recover_jobs()
|
||||
|
||||
def recover_jobs(self):
    """Re-submit every stored job that is neither 'done' nor 'failed'.

    Matching Batchjob rows are loaded ordered by ``create_time``. Each row
    is wrapped in a BatchJob built around the existing row (passed as
    ``old_job_db``, with ``job_info`` left as None), registered in
    ``self.job_map`` under its job id, and handed to ``self.process_job``.

    Any exception raised along the way is logged with its traceback and
    ends the recovery pass; jobs after the failing one are not processed.
    """
    logger.info("Rerun the unfailed and unfinished jobs...")
    try:
        # Only jobs that have not reached a terminal status are re-run.
        not_terminal = ~Batchjob.status.in_(['done','failed'])
        stored_jobs = Batchjob.query.filter(not_terminal).order_by(Batchjob.create_time).all()
        for stored in stored_jobs:
            logger.info("Rerun job: "+stored.id)
            logger.debug(str(stored))
            revived = BatchJob(stored.id, stored.username, None, stored)
            self.job_map[revived.job_id] = revived
            self.process_job(revived)
    except Exception:
        logger.error(traceback.format_exc())
|
||||
|
||||
def charge_beans(self,username,billing):
|
||||
logger.debug("Charge user(%s) for %d beans"%(username, billing))
|
||||
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}
|
||||
|
|
|
@ -113,6 +113,7 @@ class SubTask():
|
|||
self.status_reason = ''
|
||||
self.try_count = 0
|
||||
self.worker = None
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def waiting_for_retry(self,reason=""):
|
||||
self.try_count += 1
|
||||
|
@ -120,7 +121,7 @@ class SubTask():
|
|||
if self.status == FAILED:
|
||||
self.root_task.status = FAILED
|
||||
self.failed_reason = reason
|
||||
|
||||
self.root_task.failed_reason = reason
|
||||
|
||||
class TaskReporter(MasterServicer):
|
||||
|
||||
|
@ -197,6 +198,19 @@ class TaskMgr(threading.Thread):
|
|||
return new_f
|
||||
return lock
|
||||
|
||||
def subtask_lock(f):
    """Decorator that runs ``f(self, subtask, ...)`` while holding ``subtask.lock``.

    The wrapped callable must take the subtask as its first argument after
    ``self``; that object must expose a ``lock`` attribute usable as a
    context manager (e.g. a ``threading.Lock``).

    Returns whatever ``f`` returns and propagates whatever ``f`` raises.
    The lock is released on every exit path, including exceptions that do
    not derive from ``Exception`` (``SystemExit``, ``KeyboardInterrupt``),
    so a failing call can never leave the subtask permanently locked.

    NOTE(review): if ``subtask.lock`` is a plain, non-reentrant lock, a
    decorated method must not call another decorated method on the same
    subtask, or it will deadlock -- confirm against callers.
    """
    @wraps(f)
    def new_f(self, subtask, *args, **kwargs):
        # ``with`` pairs acquire/release in one place instead of releasing
        # separately on the success and failure paths.
        with subtask.lock:
            return f(self, subtask, *args, **kwargs)
    return new_f
|
||||
|
||||
def run(self):
|
||||
self.serve()
|
||||
while not self.thread_stop:
|
||||
|
@ -272,7 +286,10 @@ class TaskMgr(threading.Thread):
|
|||
self.gpu_usage[subtask.worker] += subtask.vnode_info.vnode.instance.gpu
|
||||
return [True, '']
|
||||
|
||||
@subtask_lock
|
||||
def stop_vnode(self, subtask):
|
||||
if not subtask.vnode_started:
|
||||
return [True, ""]
|
||||
try:
|
||||
self.logger.info('[task_processor] Stopping vnode for task [%s] vnode [%d]' % (subtask.vnode_info.taskid, subtask.vnode_info.vnodeid))
|
||||
channel = grpc.insecure_channel('%s:%s' % (subtask.worker, self.worker_port))
|
||||
|
@ -482,13 +499,18 @@ class TaskMgr(threading.Thread):
|
|||
if sub_task.status != RUNNING:
|
||||
self.logger.error('[on_task_report] receive task report when vnode is not running')
|
||||
|
||||
sub_task.status = report.subTaskStatus
|
||||
#sub_task.status = report.subTaskStatus
|
||||
sub_task.status_reason = report.errmsg
|
||||
sub_task.task_started = False
|
||||
|
||||
if report.subTaskStatus == FAILED or report.subTaskStatus == TIMEOUT:
|
||||
self.clear_sub_task(sub_task)
|
||||
sub_task.waiting_for_retry(report.errmsg)
|
||||
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
|
||||
self.logger.info('task %s report failed, status %d, subtasks: %s' % (task.id, task.status, str([sub_task.status for sub_task in task.subtask_list])))
|
||||
if sub_task.status == WAITING:
|
||||
self.jobmgr.report(task.username, task.id, 'retrying', report.errmsg)
|
||||
elif report.subTaskStatus == OUTPUTERROR:
|
||||
self.clear_sub_task(sub_task)
|
||||
sub_task.status = FAILED
|
||||
task.status = FAILED
|
||||
task.failed_reason = report.errmsg
|
||||
|
@ -506,7 +528,7 @@ class TaskMgr(threading.Thread):
|
|||
self.logger.info('task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
|
||||
if self.check_task_completed(task):
|
||||
continue
|
||||
self.logger.info("test")
|
||||
self.logger.info('schedule task %s sub_tasks %s' % (task.id, str([sub_task.status for sub_task in task.subtask_list])))
|
||||
|
||||
if task.at_same_time:
|
||||
# parallel tasks
|
||||
|
@ -671,8 +693,13 @@ class TaskMgr(threading.Thread):
|
|||
disk = int(json_task['diskSetting']),
|
||||
gpu = int(json_task['gpuSetting'])),
|
||||
mount = [Mount(
|
||||
localPath = json_task['mapping'][mapping_key]['mappingLocalDir'],
|
||||
remotePath=json_task['mapping'][mapping_key]['mappingRemoteDir'])
|
||||
provider = json_task['mapping'][mapping_key]['mappingProvider'],
|
||||
localPath = json_task['mapping'][mapping_key]['mappingMountpath'],
|
||||
remotePath = json_task['mapping'][mapping_key]['mappingBucketName'],
|
||||
accessKey = json_task['mapping'][mapping_key]['mappingAccessKey'],
|
||||
secretKey = json_task['mapping'][mapping_key]['mappingSecretKey'],
|
||||
other = json_task['mapping'][mapping_key]['mappingEndpoint']
|
||||
)
|
||||
for mapping_key in json_task['mapping']] if 'mapping' in json_task else []
|
||||
),
|
||||
),
|
||||
|
@ -689,7 +716,7 @@ class TaskMgr(threading.Thread):
|
|||
timeout = int(json_task['expTime'])
|
||||
# commands are executed in all vnodes / only executed in the first vnode
|
||||
# if in traditional mode, commands will be executed in all vnodes
|
||||
) if (not 'atSameTime' in json_task.keys() or json_task['runon'] == 'all' or vnode_index == 0) else None
|
||||
) if (json_task['runon'] == 'all' or vnode_index == 0) else None
|
||||
} for vnode_index in range(int(json_task['vnodeCount']))])
|
||||
|
||||
if task.at_same_time:
|
||||
|
|
|
@ -461,6 +461,12 @@ class Batchjob(db.Model):
|
|||
self.end_time = None
|
||||
self.billing = 0
|
||||
|
||||
def clear(self):
    """Reset this job row's run-state fields to their initial values.

    Sets ``status`` to "pending", empties ``failed_reason``, clears
    ``end_time`` and zeroes ``billing``. No other attribute is touched and
    nothing is committed here.
    """
    self.billing = 0
    self.end_time = None
    self.failed_reason = ""
    self.status = "pending"
|
||||
|
||||
def __repr__(self):
|
||||
info = {}
|
||||
info['job_id'] = self.id
|
||||
|
@ -503,6 +509,15 @@ class Batchtask(db.Model):
|
|||
self.config = json.dumps(config)
|
||||
self.tried_times = 0
|
||||
|
||||
def clear(self):
    """Reset this task row's run-state fields to their initial values.

    Sets ``status`` to "pending", empties ``failed_reason``, clears both
    timestamps, and zeroes ``running_time``, ``billing`` and
    ``tried_times``. No other attribute is touched and nothing is
    committed here.
    """
    self.status = "pending"
    self.failed_reason = ""
    # Timestamps are unset together; counters restart from zero together.
    self.start_time = self.end_time = None
    self.running_time = self.billing = self.tried_times = 0
|
||||
|
||||
def __repr__(self):
|
||||
info = {}
|
||||
info['id'] = self.id
|
||||
|
|
|
@ -27,7 +27,7 @@ class OssMounter(object):
|
|||
# umount oss
|
||||
pass
|
||||
|
||||
class aliyunOssMounter(OssMounter):
|
||||
class AliyunOssMounter(OssMounter):
|
||||
|
||||
@staticmethod
|
||||
def mount_oss(datapath, mount_info):
|
||||
|
@ -42,7 +42,7 @@ class aliyunOssMounter(OssMounter):
|
|||
|
||||
cmd = "chmod 640 /etc/passwd-ossfs"
|
||||
[success1, msg] = OssMounter.execute_cmd(cmd)
|
||||
mountpath = datapath+"/"+mount_info.remotePath
|
||||
mountpath = datapath+"/Aliyun/"+mount_info.remotePath
|
||||
logger.info("Mount oss %s %s" % (mount_info.remotePath, mountpath))
|
||||
if not os.path.isdir(mountpath):
|
||||
os.makedirs(mountpath)
|
||||
|
|
|
@ -202,11 +202,12 @@ class TaskWorker(rpc_pb2_grpc.WorkerServicer):
|
|||
return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)
|
||||
|
||||
#mount oss
|
||||
rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
|
||||
self.mount_oss("%s/global/users/%s/oss" % (self.fspath,username), mount_list)
|
||||
conffile = open("/var/lib/lxc/%s/config" % lxcname, 'a+')
|
||||
mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
|
||||
mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
|
||||
for mount in mount_list:
|
||||
conffile.write("\n"+ mount_str % (self.fspath, username, mount.remotePath, rootfs, mount.remotePath))
|
||||
conffile.write("\n"+ mount_str % (self.fspath, username, mount.provider, mount.remotePath, rootfs, mount.remotePath))
|
||||
conffile.close()
|
||||
|
||||
logger.info("Start container %s..." % lxcname)
|
||||
|
|
|
@ -244,7 +244,7 @@
|
|||
<i><a href="https://github.com/unias/docklet">Docklet {{ version }}</a></i>
|
||||
</div>
|
||||
<!-- Default to the left -->
|
||||
<strong>Copyright</strong>© 2017 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
|
||||
<strong>Copyright</strong>© 2019 <a href="https://unias.github.io/docklet">UniAS</a>@<a href="http://www.sei.pku.edu.cn"> SEI, PKU</a>
|
||||
|
||||
</footer>
|
||||
|
||||
|
|
|
@ -121,27 +121,41 @@
|
|||
$("#collapse" + obj.id).collapse('hide');
|
||||
}
|
||||
|
||||
/**
 * Mirror the value of a bucket-name input into the matching mount-path field.
 *
 * Writes "/root/oss/" + obj.value into the element whose id is
 * "mapping_mountpath_<task_num>_<mapping_num>".
 *
 * @param obj          element whose current `value` is the bucket name
 * @param task_num     task index used in the target element id
 * @param mapping_num  mapping index used in the target element id
 */
function chmountPath(obj,task_num,mapping_num) {
    // Declared with `var`: an undeclared assignment would create an implicit
    // global (and throws a ReferenceError in strict mode).
    var cellid = 'mapping_mountpath_' + task_num + '_' + mapping_num;
    $('#'+cellid).val("/root/oss/"+obj.value);
}
|
||||
|
||||
/**
 * Remove the element whose id is "mapping_" followed by the id of `obj`.
 *
 * @param obj  element (e.g. the clicked Remove button) whose `id` names the
 *             mapping row to delete
 */
function removeMapping(obj) {
    var rowSelector = "#mapping_" + obj.id;
    $(rowSelector).remove();
}
|
||||
|
||||
function addMapping(obj) {
|
||||
function addMapping(obj,task_num) {
|
||||
mapping_number += 1;
|
||||
var table = $("#storage_mapping_" + obj.id)[0];
|
||||
var new_mapping = table.insertRow();
|
||||
new_mapping.id = "mapping_" + task_number + "_" + mapping_number;
|
||||
var local_dir = new_mapping.insertCell();
|
||||
var remote_dir = new_mapping.insertCell();
|
||||
var source = new_mapping.insertCell();
|
||||
new_mapping.id = "mapping_" + task_num + "_" + mapping_number;
|
||||
var provider = new_mapping.insertCell();
|
||||
var bucket_name = new_mapping.insertCell();
|
||||
var accessKey = new_mapping.insertCell();
|
||||
var secretKey = new_mapping.insertCell();
|
||||
var endpoint = new_mapping.insertCell();
|
||||
var mountpath = new_mapping.insertCell();
|
||||
var remove = new_mapping.insertCell();
|
||||
local_dir.innerHTML = '<input type="text" class="form-control" name="mappingLocalDir_' + task_number + '_' + mapping_number + '" id="mapping_local_dir_'
|
||||
+ task_number + '_' + mapping_number + '" />';
|
||||
remote_dir.innerHTML = '<input type="text" class="form-control" name="mappingRemoteDir_' + task_number + '_' + mapping_number + '" id="mapping_remote_dir_'
|
||||
+ task_number + '_' + mapping_number + '" />';
|
||||
source.innerHTML = '<select class="form-control" name="mappingSource_' + task_number + '_' + mapping_number + '" id="mapping_source_'
|
||||
+ task_number + '_' + mapping_number + '">'
|
||||
+'<option>Aliyun</option><option>AWS</option></select>';
|
||||
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_number + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
|
||||
bucket_name.innerHTML = '<input type="text" class="form-control" name="mappingBucketName_' + task_num + '_' + mapping_number + '" id="mapping_bucketname_'
|
||||
+ task_num + '_' + mapping_number + '" onKeyUp="chmountPath(this,'+task_num+','+mapping_number+');" required/>';
|
||||
accessKey.innerHTML = '<input type="text" class="form-control" name="mappingAccessKey_' + task_num + '_' + mapping_number + '" id="mapping_accessKey_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
secretKey.innerHTML = '<input type="text" class="form-control" name="mappingSecretKey_' + task_num + '_' + mapping_number + '" id="mapping_secretKey_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
endpoint.innerHTML = 'http://<input type="text" class="form-control" name="mappingEndpoint_' + task_num + '_' + mapping_number + '" id="mapping_endpoint_'
|
||||
+ task_num + '_' + mapping_number + '" required/>';
|
||||
mountpath.innerHTML = '<input type="text" class="form-control" name="mappingMountpath_' + task_num + '_' + mapping_number + '" id="mapping_mountpath_'
|
||||
+ task_num + '_' + mapping_number + '" readonly="true" required/>';
|
||||
provider.innerHTML = '<select class="form-control" name="mappingProvider_' + task_num + '_' + mapping_number + '" id="mapping_provider_'
|
||||
+ task_num + '_' + mapping_number + '">'
|
||||
+'<option>Aliyun</option></select>';
|
||||
remove.innerHTML = '<div class="box-tool pull-left"><button type="button" id="' + task_num + '_' + mapping_number +'" onclick="removeMapping(this)" class="btn btn-xs btn-danger">'
|
||||
+'Remove</button></div>';
|
||||
}
|
||||
|
||||
|
@ -208,7 +222,7 @@
|
|||
function addTask() {
|
||||
task_number += 1;
|
||||
var masterip=$("select#masterselector").children('option:selected').val();
|
||||
mapping_number = 0;
|
||||
//mapping_number = 0;
|
||||
var task_html = '';
|
||||
task_html +=
|
||||
'<div class="panel panel-default" id="task_pannel_' + task_number + '">'
|
||||
|
@ -324,18 +338,18 @@
|
|||
+'</div>'
|
||||
+'</div>'
|
||||
+'<div class="form-group">'
|
||||
+'<span>'
|
||||
+'<label class="col-sm-2 contril-label">Exteranl Storage Mapping</label>'
|
||||
+'<table class="table table-bordered" id="storage_mapping_' + task_number + '" style="display:inline;">'
|
||||
+'<label class="col-sm-2 control-label">Object Storage Mapping<br/>'
|
||||
+'<button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this,'+task_number+')">'
|
||||
+'Add<i class="fa fa-plus"></i></button></label>'
|
||||
+'<div class="col-sm-10"><table class="table table-bordered" id="storage_mapping_' + task_number + '">'
|
||||
+'<thead>'
|
||||
+'<tr><td><button type="button" id="' + task_number + '" class="btn btn-primary btn-xs" title="add an external storage mapping" onclick="addMapping(this)">'
|
||||
+'<i class="fa fa-plus"></i></button></td></tr>'
|
||||
+'<tr><th style="width:217px">Local Dir</th><th style="width:217px">Remote Dir</th><th style="width:217px">source</th><th style="width:217px">Operation</th></tr>'
|
||||
+'<tr><th>Provider</th><th>Bucket Name</th><th>AccessKey ID</th><th>AccessKey Secret</th><th>Endpoint</th><th>Mount Path</th><th>Remove</th></tr>'
|
||||
+'</thead>'
|
||||
+'<tbody>'
|
||||
+'</tbody>'
|
||||
+'</table>'
|
||||
+'</span></div><div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
|
||||
+'</table></div>'
|
||||
+'</div>'
|
||||
+'<div class="box-tools pull-right"><button type="button" id="' + task_number + '" onclick="unfoldTask(this)" class="btn btn-primary">Confirm</button></div>'
|
||||
+'</div></div></div>'
|
||||
$(task_html).appendTo("#accordion");
|
||||
}
|
||||
|
|
|
@ -216,6 +216,32 @@
|
|||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% if 'mapping' in task['config'].keys() %}
|
||||
<div class="table-responsive">
|
||||
<table class="table table-bordered table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Provider</th>
|
||||
<th>Bucket Name</th>
|
||||
<th>AccessKey ID</th>
|
||||
<th>Endpoint</th>
|
||||
<th>Mount Path</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for key in task['config']['mapping'].keys() %}
|
||||
<tr>
|
||||
<td>{{ task['config']['mapping'][key]['mappingProvider'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingBucketName'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingAccessKey'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingEndpoint'] }}</td>
|
||||
<td>{{ task['config']['mapping'][key]['mappingMountpath'] }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
Loading…
Reference in New Issue