update process of stop job action
This commit is contained in:
parent
ce295cb041
commit
cee574fd5c
|
@ -27,7 +27,7 @@ class BatchJob(object):
|
||||||
self.lock = threading.Lock()
|
self.lock = threading.Lock()
|
||||||
self.tasks = {}
|
self.tasks = {}
|
||||||
self.dependency_out = {}
|
self.dependency_out = {}
|
||||||
self.tasks_cnt = {'pending':0, 'scheduling':0, 'running':0, 'retrying':0, 'failed':0, 'finished':0}
|
self.tasks_cnt = {'pending':0, 'scheduling':0, 'running':0, 'retrying':0, 'failed':0, 'finished':0, 'stopped':0}
|
||||||
|
|
||||||
#init self.tasks & self.dependency_out & self.tasks_cnt
|
#init self.tasks & self.dependency_out & self.tasks_cnt
|
||||||
logger.debug("Init BatchJob user:%s job_name:%s create_time:%s" % (self.job_db.username, self.job_db.name, str(self.job_db.create_time)))
|
logger.debug("Init BatchJob user:%s job_name:%s create_time:%s" % (self.job_db.username, self.job_db.name, str(self.job_db.create_time)))
|
||||||
|
@ -95,12 +95,8 @@ class BatchJob(object):
|
||||||
|
|
||||||
@data_lock
|
@data_lock
|
||||||
def stop_job(self):
|
def stop_job(self):
|
||||||
for task_idx in self.tasks.keys():
|
|
||||||
self.tasks[task_idx]['status'] = 'stopped'
|
|
||||||
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
|
||||||
self.tasks[task_idx]['db'].status = 'stopped'
|
|
||||||
self.job_db = Batchjob.query.get(self.job_id)
|
self.job_db = Batchjob.query.get(self.job_id)
|
||||||
self.job_db.status = 'stopped'
|
self.job_db.status = 'stopping'
|
||||||
db_commit()
|
db_commit()
|
||||||
|
|
||||||
# update status of this job based
|
# update status of this job based
|
||||||
|
@ -108,10 +104,13 @@ class BatchJob(object):
|
||||||
allcnt = len(self.tasks.keys())
|
allcnt = len(self.tasks.keys())
|
||||||
if self.tasks_cnt['failed'] != 0:
|
if self.tasks_cnt['failed'] != 0:
|
||||||
self.job_db.status = 'failed'
|
self.job_db.status = 'failed'
|
||||||
elif self.tasks_cnt['running'] != 0:
|
|
||||||
self.job_db.status = 'running'
|
|
||||||
elif self.tasks_cnt['finished'] == allcnt:
|
elif self.tasks_cnt['finished'] == allcnt:
|
||||||
self.job_db.status = 'done'
|
self.job_db.status = 'done'
|
||||||
|
elif self.job_db.status == 'stopping':
|
||||||
|
if self.tasks_cnt['running'] == 0 and self.tasks_cnt['scheduling'] == 0 and self.tasks_cnt['retrying'] == 0:
|
||||||
|
self.job_db.status = 'stopped'
|
||||||
|
elif self.tasks_cnt['running'] != 0 or self.tasks_cnt['retrying'] != 0:
|
||||||
|
self.job_db.status = 'running'
|
||||||
else:
|
else:
|
||||||
self.job_db.status = 'pending'
|
self.job_db.status = 'pending'
|
||||||
db_commit()
|
db_commit()
|
||||||
|
@ -121,7 +120,7 @@ class BatchJob(object):
|
||||||
def update_task_running(self, task_idx):
|
def update_task_running(self, task_idx):
|
||||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) running." % (task_idx, self.job_id))
|
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) running." % (task_idx, self.job_id))
|
||||||
old_status = self.tasks[task_idx]['status']
|
old_status = self.tasks[task_idx]['status']
|
||||||
if old_status == 'stopped':
|
if old_status == 'stopping':
|
||||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||||
return
|
return
|
||||||
self.tasks_cnt[old_status] -= 1
|
self.tasks_cnt[old_status] -= 1
|
||||||
|
@ -141,7 +140,7 @@ class BatchJob(object):
|
||||||
return []
|
return []
|
||||||
logger.debug("Task(idx:%s) of BatchJob(id:%s) has finished(running_time=%d,billing=%d). Update dependency..." % (task_idx, self.job_id, running_time, billing))
|
logger.debug("Task(idx:%s) of BatchJob(id:%s) has finished(running_time=%d,billing=%d). Update dependency..." % (task_idx, self.job_id, running_time, billing))
|
||||||
old_status = self.tasks[task_idx]['status']
|
old_status = self.tasks[task_idx]['status']
|
||||||
if old_status == 'stopped':
|
if old_status == 'stopping':
|
||||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||||
return
|
return
|
||||||
self.tasks_cnt[old_status] -= 1
|
self.tasks_cnt[old_status] -= 1
|
||||||
|
@ -184,7 +183,7 @@ class BatchJob(object):
|
||||||
def update_task_retrying(self, task_idx, reason, tried_times):
|
def update_task_retrying(self, task_idx, reason, tried_times):
|
||||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) retrying. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) retrying. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
||||||
old_status = self.tasks[task_idx]['status']
|
old_status = self.tasks[task_idx]['status']
|
||||||
if old_status == 'stopped':
|
if old_status == 'stopping':
|
||||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
||||||
return
|
return
|
||||||
self.tasks_cnt[old_status] -= 1
|
self.tasks_cnt[old_status] -= 1
|
||||||
|
@ -203,9 +202,6 @@ class BatchJob(object):
|
||||||
def update_task_failed(self, task_idx, reason, tried_times, running_time, billing):
|
def update_task_failed(self, task_idx, reason, tried_times, running_time, billing):
|
||||||
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) failed. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) failed. reason:%s tried_times:%d" % (task_idx, self.job_id, reason, int(tried_times)))
|
||||||
old_status = self.tasks[task_idx]['status']
|
old_status = self.tasks[task_idx]['status']
|
||||||
if old_status == 'stopped':
|
|
||||||
logger.info("Task(idx:%s) of BatchJob(id:%s) has been stopped."% (task_idx, self.job_id))
|
|
||||||
return
|
|
||||||
self.tasks_cnt[old_status] -= 1
|
self.tasks_cnt[old_status] -= 1
|
||||||
self.tasks_cnt['failed'] += 1
|
self.tasks_cnt['failed'] += 1
|
||||||
self.tasks[task_idx]['status'] = 'failed'
|
self.tasks[task_idx]['status'] = 'failed'
|
||||||
|
@ -220,6 +216,26 @@ class BatchJob(object):
|
||||||
self._update_job_status()
|
self._update_job_status()
|
||||||
self.log_status()
|
self.log_status()
|
||||||
|
|
||||||
|
@data_lock
|
||||||
|
def update_task_stopped(self, task_idx, running_time, billing):
|
||||||
|
logger.debug("Update status of task(idx:%s) of BatchJob(id:%s) stopped.running_time:%d billing:%d" % (task_idx, self.job_id, int(running_time), billing))
|
||||||
|
old_status = self.tasks[task_idx]['status']
|
||||||
|
if old_status == 'failed' or old_status == 'finished' or old_status == 'stopped':
|
||||||
|
logger.info("task(idx:%s) of BatchJob(id:%s) has been done."%(task_idx, self.job_id))
|
||||||
|
return False
|
||||||
|
self.tasks_cnt[old_status] -= 1
|
||||||
|
self.tasks_cnt['stopped'] += 1
|
||||||
|
self.tasks[task_idx]['status'] = 'stopped'
|
||||||
|
self.tasks[task_idx]['db'] = Batchtask.query.get(self.tasks[task_idx]['id'])
|
||||||
|
self.tasks[task_idx]['db'].status = 'stopped'
|
||||||
|
self.tasks[task_idx]['db'].running_time = running_time
|
||||||
|
self.tasks[task_idx]['db'].billing = billing
|
||||||
|
self.job_db = Batchjob.query.get(self.job_id)
|
||||||
|
self.job_db.billing += billing
|
||||||
|
self._update_job_status()
|
||||||
|
self.log_status()
|
||||||
|
return True
|
||||||
|
|
||||||
# print status for debuging
|
# print status for debuging
|
||||||
def log_status(self):
|
def log_status(self):
|
||||||
task_copy = {}
|
task_copy = {}
|
||||||
|
@ -248,6 +264,7 @@ class JobMgr():
|
||||||
self.auth_key = env.getenv('AUTH_KEY')
|
self.auth_key = env.getenv('AUTH_KEY')
|
||||||
|
|
||||||
def charge_beans(self,username,billing):
|
def charge_beans(self,username,billing):
|
||||||
|
logger.debug("Charge user(%s) for %d beans"%(username, billing))
|
||||||
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}
|
data = {"owner_name":username,"billing":billing, "auth_key":self.auth_key}
|
||||||
url = "/billing/beans/"
|
url = "/billing/beans/"
|
||||||
return requests.post(self.userpoint+url,data=data).json()
|
return requests.post(self.userpoint+url,data=data).json()
|
||||||
|
@ -304,7 +321,6 @@ class JobMgr():
|
||||||
taskid = job_id + '_' + task_idx
|
taskid = job_id + '_' + task_idx
|
||||||
self.taskmgr.lazy_stop_task(taskid)
|
self.taskmgr.lazy_stop_task(taskid)
|
||||||
job.stop_job()
|
job.stop_job()
|
||||||
del self.job_map[job_id]
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
logger.error(traceback.format_exc())
|
logger.error(traceback.format_exc())
|
||||||
#logger.error(err)
|
#logger.error(err)
|
||||||
|
@ -314,11 +330,11 @@ class JobMgr():
|
||||||
# user: username
|
# user: username
|
||||||
# list a user's all job
|
# list a user's all job
|
||||||
def list_jobs(self,user):
|
def list_jobs(self,user):
|
||||||
alljobs = Batchjob.query.filter_by(username=user)
|
alljobs = Batchjob.query.filter_by(username=user).all()
|
||||||
res = []
|
res = []
|
||||||
for job in alljobs:
|
for job in alljobs:
|
||||||
jobdata = json.loads(str(job))
|
jobdata = json.loads(str(job))
|
||||||
tasks = job.tasks
|
tasks = job.tasks.all()
|
||||||
jobdata['tasks'] = [t.idx for t in tasks]
|
jobdata['tasks'] = [t.idx for t in tasks]
|
||||||
tasks_vnodeCount = {}
|
tasks_vnodeCount = {}
|
||||||
for t in tasks:
|
for t in tasks:
|
||||||
|
@ -372,9 +388,12 @@ class JobMgr():
|
||||||
if len(split_task_name) != 2:
|
if len(split_task_name) != 2:
|
||||||
logger.error("[jobmgr report]Illegal task_name(%s) report from taskmgr" % task_name)
|
logger.error("[jobmgr report]Illegal task_name(%s) report from taskmgr" % task_name)
|
||||||
return
|
return
|
||||||
|
if billing > 0 and (status == 'failed' or status == 'finished'):
|
||||||
|
self.charge_beans(user, billing)
|
||||||
job_id, task_idx = split_task_name
|
job_id, task_idx = split_task_name
|
||||||
if job_id not in self.job_map.keys():
|
if job_id not in self.job_map.keys():
|
||||||
logger.error("[jobmgr report]jobid(%s) does not exist. task_name(%s)" % (job_id,task_name))
|
logger.error("[jobmgr report]jobid(%s) does not exist. task_name(%s)" % (job_id,task_name))
|
||||||
|
#TODO: update data in db
|
||||||
return
|
return
|
||||||
job = self.job_map[job_id]
|
job = self.job_map[job_id]
|
||||||
if status == "running":
|
if status == "running":
|
||||||
|
@ -383,16 +402,17 @@ class JobMgr():
|
||||||
#logger.debug(str(job.job_db))
|
#logger.debug(str(job.job_db))
|
||||||
elif status == "finished":
|
elif status == "finished":
|
||||||
#logger.debug(str(job.job_db))
|
#logger.debug(str(job.job_db))
|
||||||
self.charge_beans(user, billing)
|
|
||||||
next_tasks = job.finish_task(task_idx, running_time, billing)
|
next_tasks = job.finish_task(task_idx, running_time, billing)
|
||||||
ret = self.add_task_taskmgr(user, next_tasks)
|
ret = self.add_task_taskmgr(user, next_tasks)
|
||||||
#logger.debug(str(job.job_db))
|
#logger.debug(str(job.job_db))
|
||||||
elif status == "retrying":
|
elif status == "retrying":
|
||||||
job.update_task_retrying(task_idx, reason, tried_times)
|
job.update_task_retrying(task_idx, reason, tried_times)
|
||||||
elif status == "failed":
|
elif status == "failed":
|
||||||
self.charge_beans(user, billing)
|
|
||||||
job.update_task_failed(task_idx, reason, tried_times, running_time, billing)
|
job.update_task_failed(task_idx, reason, tried_times, running_time, billing)
|
||||||
if job.job_db.status == 'done' or job.job_db.status == 'failed':
|
elif status == "stopped":
|
||||||
|
if job.update_task_stopped(task_idx, running_time, billing) and billing > 0:
|
||||||
|
self.charge_beans(user, billing)
|
||||||
|
if job.job_db.status == 'done' or job.job_db.status == 'failed' or job.job_db.status == 'stopped':
|
||||||
del self.job_map[job_id]
|
del self.job_map[job_id]
|
||||||
|
|
||||||
# Get Batch job stdout or stderr from its file
|
# Get Batch job stdout or stderr from its file
|
||||||
|
|
|
@ -228,6 +228,10 @@ class TaskMgr(threading.Thread):
|
||||||
if task.id in self.lazy_stop_list:
|
if task.id in self.lazy_stop_list:
|
||||||
self.stop_remove_task(task)
|
self.stop_remove_task(task)
|
||||||
self.lazy_delete_list.append(task)
|
self.lazy_delete_list.append(task)
|
||||||
|
running_time, billing = task.get_billing()
|
||||||
|
self.logger.info('task %s stopped, running_time:%s billing:%d'%(task.id, str(running_time), billing))
|
||||||
|
running_time = math.ceil(running_time)
|
||||||
|
self.jobmgr.report(task.username, task.id,'stopped',running_time=running_time,billing=billing)
|
||||||
|
|
||||||
while self.lazy_delete_list:
|
while self.lazy_delete_list:
|
||||||
task = self.lazy_delete_list.pop(0)
|
task = self.lazy_delete_list.pop(0)
|
||||||
|
@ -236,7 +240,14 @@ class TaskMgr(threading.Thread):
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.logger.warning(str(err))
|
self.logger.warning(str(err))
|
||||||
|
|
||||||
self.lazy_append_list = [t for t in self.lazy_append_list if t.id not in self.lazy_stop_list]
|
new_append_list = []
|
||||||
|
for task in self.lazy_append_list:
|
||||||
|
if task.id in self.lazy_stop_list:
|
||||||
|
self.jobmgr.report(task.username, task.id, 'stopped')
|
||||||
|
else:
|
||||||
|
new_append_list.append(task)
|
||||||
|
|
||||||
|
self.lazy_append_list = new_append_list
|
||||||
self.lazy_stop_list.clear()
|
self.lazy_stop_list.clear()
|
||||||
if self.lazy_append_list:
|
if self.lazy_append_list:
|
||||||
self.task_queue.extend(self.lazy_append_list)
|
self.task_queue.extend(self.lazy_append_list)
|
||||||
|
|
|
@ -44,7 +44,7 @@ app.config['SQLALCHEMY_BINDS'] = {
|
||||||
'history': 'sqlite:///'+fsdir+'/global/sys/HistoryTable.db',
|
'history': 'sqlite:///'+fsdir+'/global/sys/HistoryTable.db',
|
||||||
'beansapplication': 'sqlite:///'+fsdir+'/global/sys/BeansApplication.db',
|
'beansapplication': 'sqlite:///'+fsdir+'/global/sys/BeansApplication.db',
|
||||||
'system': 'sqlite:///'+fsdir+'/global/sys/System.db',
|
'system': 'sqlite:///'+fsdir+'/global/sys/System.db',
|
||||||
'batch':'sqlite:///'+fsdir+'/global/sys/Batch.db',
|
'batch':'sqlite:///'+fsdir+'/global/sys/Batch.db?check_same_thread=False',
|
||||||
'login': 'sqlite:///'+fsdir+'/global/sys/Login.db'
|
'login': 'sqlite:///'+fsdir+'/global/sys/Login.db'
|
||||||
}
|
}
|
||||||
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
|
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
|
||||||
|
|
Loading…
Reference in New Issue