Skip to content

Commit 2b8f423

Browse files
authored
report failing reason to log and console. FIx #5342 (#5343)
* better reporting, log and doc. Fix #5342 * cleanup * fix except order
1 parent d145700 commit 2b8f423

File tree

2 files changed

+87
-33
lines changed

2 files changed

+87
-33
lines changed

bin/crab.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,13 @@ def log_exception(exc_type, exc_value, tback):
132132
print("'" + str(args[0]) + "' is not a valid command.")
133133
self.parser.print_help()
134134
sys.exit(-1)
135-
self.cmd = sub_cmd(self.logger, args[1:])
135+
self.cmd = sub_cmd(self.logger, args[1:]) # the crab command to be executed
136136

137+
# Every command returns a dictionary which MUST contain the "commandStatus" key.
138+
# Any value other then "SUCCESS" for this key indicates command failure
139+
# and will result in 'crab' terminating with non-zero exit code
140+
# Additional keys may be present in the dictionary, depending on the specific command,
141+
# which are used to pass information to caller when CRABAPI is used
137142
returnDict = self.cmd()
138143
if returnDict['commandStatus'] != 'SUCCESS':
139144
raise CommandFailedException("Command %s failed" % str(args[0]))
@@ -172,14 +177,14 @@ def log_exception(exc_type, exc_value, tback):
172177
errorId = re.search(r'(?<=X-Error-Id:\s)[^\n]*', err).group(0)
173178
client.logger.info('Error Id: %s', errorId)
174179
logging.getLogger('CRAB3').exception('Caught RESTInterfaceException exception')
175-
except ClientException as ce:
176-
client.logger.error(ce)
177-
logging.getLogger('CRAB3').exception('Caught ClientException exception')
178-
exitcode = ce.exitcode
179180
except CommandFailedException as ce:
180181
client.logger.warning(ce)
181182
logging.getLogger('CRAB3').exception('Caught CommandFailedException exception')
182183
exitcode = 1
184+
except ClientException as ce:
185+
client.logger.error(ce)
186+
logging.getLogger('CRAB3').exception('Caught ClientException exception')
187+
exitcode = ce.exitcode
183188
except KeyboardInterrupt:
184189
client.logger.error('Keyboard Interrupted')
185190
exitcode = 1

src/python/CRABClient/Commands/recover.py

+77-28
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
# step submit
2929
from CRABClient.Commands.submit import submit
3030
from CRABClient.UserUtilities import getColumn
31-
from CRABClient.ClientUtilities import colors
3231
from ServerUtilities import SERVICE_INSTANCES
3332

3433
SPLITTING_RECOVER_LUMIBASED = set(("LumiBased", "Automatic", "EventAwareLumiBased"))
@@ -44,6 +43,17 @@ class recover(SubCommand):
4443
shortnames = ["rec"]
4544

4645
def __call__(self):
46+
"""
47+
Code is organized as a series of steps, if any step fails, command exits
48+
Each step returns a "retval" dictionary which always contains keys: 'commandStatus' and 'step'
49+
'step' value is the name of the step
50+
'commandStatus' can be: SUCCESS, FAILED, NothingToDo
51+
Only the first two can be returned by this method to crab.py, the latter "NothingToDo"
52+
is used as a "break" to exit the chain of steps early and will be converted to SUCCES before
53+
this method exits
54+
Other keys may be present as present in the return dict of subcommands used in here
55+
if a 'msg' key is present, stepExit will log that message
56+
"""
4757

4858
retval = self.stepInit()
4959
if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
@@ -62,6 +72,7 @@ def __call__(self):
6272
self.logger.debug("no need to run crab remake - self.restHostCommonname %s", self.restHostCommonname)
6373
self.crabProjDir = self.requestarea
6474

75+
6576
retval = self.stepValidate()
6677
if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
6778

@@ -83,7 +94,7 @@ def __call__(self):
8394
if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
8495

8596
if "recoverLumimaskPath" not in retval:
86-
return retval
97+
return self.stepExit(retval)
8798

8899
retval = self.stepSubmitLumiBased(retval["recoverLumimaskPath"])
89100
if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
@@ -110,6 +121,11 @@ def stepExit(self, retval):
110121
> retval = self.stepYYY()
111122
> if retval["commandStatus"] != "SUCCESS": return self.stepExit(retval)
112123
"""
124+
if 'msg' in retval:
125+
self.logger.info("recover process prematurely exited during %s step", retval['step'])
126+
self.logger.info(retval['msg'])
127+
if retval['commandStatus'] == 'NothingToDo':
128+
retval['commandStatus'] = "SUCCESS" # tell crab.py to exit cleanly with no error
113129
return retval
114130

115131
def stepInit(self):
@@ -132,7 +148,7 @@ def stepInit(self):
132148
self.failingTaskInfo = {}
133149
self.failedJobs = []
134150

135-
return {"commandStatus": "SUCCESS", "init": None }
151+
return {"commandStatus": "SUCCESS", "step": "init"}
136152

137153
def stepRemake(self):
138154
"""
@@ -169,6 +185,9 @@ def stepRemake(self):
169185
retval = remakeCmd.remakecache(self.failingTaskName)
170186
self.logger.debug("stepRemakeAndValidate() - remake, retval: %s", retval)
171187
self.logger.debug("stepRemakeAndValidate() - remake, after, self.configuration: %s", self.configuration)
188+
retval['step'] = "remake"
189+
if retval['commandStatus'] != "SUCCESS":
190+
retval['msg'] = "Could not remake the task project directory"
172191
return retval
173192

174193
def stepValidate(self):
@@ -202,6 +221,7 @@ def stepValidate(self):
202221
splitalgo = getColumn(self.failingCrabDBInfo, 'tm_split_algo')
203222
if not splitalgo in SPLITTING_RECOVER_LUMIBASED.union(SPLITTING_RECOVER_FILEBASED):
204223
msg = 'crab recover supports only tasks with LumiBased and FileBased splitting, you have {}'.format(splitalgo)
224+
self.logger.info(msg)
205225
return {"commandStatus": "FAILED", "step": "RemakeAndValidate" , "msg": msg }
206226

207227
self.failingTaskInfo["splitalgo"] = splitalgo
@@ -210,7 +230,7 @@ def stepValidate(self):
210230

211231
self.logger.debug("stepRemakeAndValidate() - failingtaskinfo - %s", self.failingTaskInfo)
212232

213-
return {"commandStatus": "SUCCESS", "validate": None }
233+
return {"commandStatus": "SUCCESS", "step": "validate"}
214234

215235
def stepStatus(self):
216236
"""
@@ -261,6 +281,9 @@ def stepStatus(self):
261281
# [1, 2, 4]
262282
self.failedJobs = [job[1] for job in retval["jobList"] if job[0] == "failed"]
263283
self.logger.debug("stepStatus() - status, failedJobs: %s", self.failedJobs)
284+
retval['step'] = "status"
285+
if retval['commandStatus'] != "SUCCESS":
286+
retval['msg'] = "Could not retrieve task status"
264287

265288
return retval
266289

@@ -270,22 +293,23 @@ def stepKill(self):
270293
- kills the original failing task
271294
"""
272295
## step2: kill
296+
retval = {"step": "kill"}
273297

274298
# if the task is already killed or about to be killed, do not kill again
275299
if self.failingTaskStatus["dbStatus"] == "KILLED" or \
276300
(self.failingTaskStatus["dbStatus"] in ("NEW", "QUEUED") and self.failingTaskStatus["command"] == "KILL"):
277-
returnDict = {'kill' : 'already killed', 'commandStatus': 'SUCCESS'}
278-
self.logger.info("step kill - task already killed")
279-
return returnDict
301+
retval['commandStatus'] = "SUCCESS"
302+
self.logger.debug("step kill - task already killed")
303+
return retval
280304

281305
# avoid that crab operators kill users tasks by mistake.
282306
# if the user who is running crab recover differs from the one who submitted the original task,
283307
# then kill the task only if the option "--forcekill" is used.
284308
username = getUsername(self.proxyfilename, logger=self.logger)
285309
if self.failingTaskInfo["username"] != username and not self.options.__dict__["forceKill"]:
286-
returnDict = {'kill' : 'do not kill task submitted by another user', 'commandStatus': 'FAILED'}
287-
self.logger.info("step kill - task submitted by another user, will not kill it")
288-
return returnDict
310+
retval['commandStatus'] = "FAILED"
311+
retval['msg'] = "task submitted by another user, will not kill it"
312+
return retval
289313

290314
cmdargs = []
291315
cmdargs.append("-d")
@@ -301,7 +325,7 @@ def stepKill(self):
301325
self.logger.debug("stepKill() - cmdargs: %s", cmdargs)
302326
killCmd = kill(logger=self.logger, cmdargs=cmdargs)
303327
with SubcommandExecution(self.logger, "kill") as _:
304-
retval = killCmd()
328+
retval.update(killCmd())
305329

306330
self.logger.debug("stepKill() - retval: %s", retval)
307331
self.logger.debug("stepKill() - after, self.configuration: %s", self.configuration)
@@ -369,23 +393,28 @@ def stepCheckKill(self):
369393
self.logger.debug("stepCheckKill() - dagStatus %s", self.failingTaskStatus["dagStatus"])
370394
self.logger.debug("stepCheckKill() - dbStatus %s", self.failingTaskStatus["dbStatus"])
371395

396+
retval = {'step': "checkKill"}
397+
372398
# check the task status.
373399
# it does not make sense to recover a task in COMPLETED
374400
if not self.failingTaskStatus["status"] in ("SUBMITTED", "FAILED", "FAILED (KILLED)"):
375-
msg = "In order to recover a task, the combined status of the task needs can not be {}".format(self.failingTaskStatus["status"])
376-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
401+
msg = "Tasks in status {} can not be recovered".format(self.failingTaskStatus["status"])
402+
retval.update({"commandStatus": "NothingToDo", "msg": msg})
403+
return retval
377404

378405
# the status on the db should be submitted or killed. or about to be killed
379406
if self.failingTaskStatus["dbStatus"] in ("NEW", "QUEUED"):
380407
if not self.failingTaskStatus["command"] in ("KILL"):
381408
msg = "In order to recover a task, when the status of the task in the oracle DB is {}, the task command can not be {}"\
382409
.format(self.failingTaskStatus["dbStatus"], self.failingTaskStatus["command"])
383-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
410+
retval.update({"commandStatus": "FAILED", "msg": msg})
411+
return retval
384412
else:
385413
if not self.failingTaskStatus["dbStatus"] in ("SUBMITTED", "KILLED"):
386414
msg = "In order to recover a task, the status of the task in the oracle DB can not be {}"\
387415
.format(self.failingTaskStatus["dbStatus"])
388-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
416+
retval.update({"commandStatus": "FAILED", "msg": msg})
417+
return retval
389418

390419
# make sure that the jobs ad publications are in a final state.
391420
# - [x] make sure that there are no ongoing transfers
@@ -396,23 +425,27 @@ def stepCheckKill(self):
396425
if not set(self.failingTaskStatus["jobsPerStatus"].keys()).issubset(terminalStates):
397426
msg = "In order to recover a task, all the jobs need to be in a terminal state ({}). You have {}"\
398427
.format(terminalStates, self.failingTaskStatus["jobsPerStatus"].keys())
399-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
428+
retval.update({"commandStatus": "FAILED", "msg": msg})
429+
return retval
400430

401431
# - [x] make sure that there are no ongoing publications
402432
self.logger.debug("stepCheckKill - publication %s", self.failingTaskStatus["publication"] )
403433
terminalStatesPub = set(("failed", "done", "not_required", "disabled"))
404434
if not set(self.failingTaskStatus["publication"].keys()).issubset(terminalStatesPub):
405435
msg = "In order to recover a task, publication for all the jobs need to be in a terminal state ({}). You have {}"\
406436
.format(terminalStatesPub, self.failingTaskStatus["publication"].keys())
407-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
437+
retval.update({"commandStatus": "FAILED", "msg": msg})
438+
return retval
408439

409440
# - [x] if all jobs failed, then exit. it is better to submit again the task than using crab recover :)
410441
# check that "failed" is the only key of the jobsPerStatus dictionary
411442
if set(self.failingTaskStatus["jobsPerStatus"].keys()) == set(("failed",)):
412-
msg = "All the jobs of the original task failed. better submitting it again from scratch than recovering it."
413-
return {"commandStatus": "FAILED", "step": "checkKill" , "msg": msg }
443+
msg = "All the jobs of the original task failed. Better investigate and submit it again than recover."
444+
retval.update({"commandStatus": "FAILED", "msg": msg})
445+
return retval
414446

415-
return {"commandStatus": "SUCCESS", "checkkill": "task can be recovered"}
447+
retval.update({"commandStatus": "SUCCESS", "msg": "task can be recovered"})
448+
return retval
416449

417450
def stepReport(self):
418451
"""
@@ -425,6 +458,8 @@ def stepReport(self):
425458
with the output of crab report
426459
"""
427460

461+
retval = {"step": "report"}
462+
428463
failingTaskPublish = getColumn(self.failingCrabDBInfo, 'tm_publication')
429464
self.logger.debug("stepReport() - tm_publication: %s %s", type(failingTaskPublish), failingTaskPublish)
430465
# - if the user specified --strategy=notPublished but the original failing task
@@ -459,7 +494,7 @@ def stepReport(self):
459494
reportCmd = report(logger=self.logger, cmdargs=cmdargs)
460495
with SubcommandExecution(self.logger, "report") as _:
461496
# FIXME - stays noisy because interference with getMutedStatusInfo()
462-
retval = reportCmd()
497+
retval.update(reportCmd())
463498
self.logger.debug("stepReport() - report, after, self.configuration: %s", self.configuration)
464499
self.logger.debug("stepReport() - report, retval: %s", retval)
465500

@@ -486,24 +521,29 @@ def stepReport(self):
486521
# we will likely never reach this if, because in this case the status on the schedd
487522
# should be COMPLETED, which is not accepted by stepCheckKill
488523
self.logger.info("stepReport() - all lumis have been processed by original task. crab recover will exit")
524+
retval.update({'commandStatus' : 'SUCCESS'})
525+
return retval
489526

490527
self.logger.debug("crab report - recovery task will process lumis contained in file %s", recoverLumimaskPath)
491528

492529

493530
if os.path.exists(recoverLumimaskPath):
494-
returnDict = {'commandStatus' : 'SUCCESS', 'recoverLumimaskPath': recoverLumimaskPath}
531+
retval.update({'commandStatus' : 'SUCCESS', 'recoverLumimaskPath': recoverLumimaskPath})
495532
else:
496-
msg = 'the file {} does not exist. crab report could not produce it, the task can not be recovered'.format(recoverLumimaskPath)
497-
returnDict = {'commandStatus' : 'FAILED', 'msg': msg}
533+
msg = 'File {} does not exist. crab report could not produce it, the task can not be recovered'.format(recoverLumimaskPath)
534+
self.logger.info(msg)
535+
retval.update({'commandStatus' : 'FAILED', 'msg': msg})
498536

499-
return returnDict
537+
return retval
500538

501539
def stepGetsandbox(self):
502540
"""
503541
side effects:
504542
- download the user_ and debug_sandbox from s3 or from the schedd
505543
"""
506544

545+
retval = {"step": "getSandbox"}
546+
507547
cmdargs = []
508548
cmdargs.append("-d")
509549
cmdargs.append(str(self.crabProjDir))
@@ -516,7 +556,7 @@ def stepGetsandbox(self):
516556
self.logger.debug("stepGetsandbox() - cmdargs: %s", cmdargs)
517557
getsandboxCmd = getsandbox(logger=self.logger, cmdargs=cmdargs)
518558
with SubcommandExecution(self.logger, "getsandbox") as _:
519-
retval = getsandboxCmd()
559+
retval.update(getsandboxCmd())
520560
self.logger.debug("stepGetsandbox() - retval: %s", retval)
521561
return retval
522562

@@ -528,6 +568,9 @@ def stepExtractSandbox(self, sandbox_paths):
528568
- extracts the user_ and debug_sandbox, so that the files that they contain
529569
can be used by crab submit at a later step
530570
"""
571+
572+
retval = {"step": "extractSandbox"}
573+
531574
debug_sandbox = tarfile.open(sandbox_paths[0])
532575
debug_sandbox.extractall(path=os.path.join(self.crabProjDir, "user_sandbox"))
533576
debug_sandbox.close()
@@ -539,7 +582,8 @@ def stepExtractSandbox(self, sandbox_paths):
539582
self.recoverconfig = os.path.join(self.crabProjDir, "debug_sandbox",
540583
"debug" , "crabConfig.py")
541584

542-
return {"commandStatus": "SUCCESS", }
585+
retval.update({"commandStatus": "SUCCESS"})
586+
return retval
543587

544588
def stepSubmitLumiBased(self, notFinishedJsonPath):
545589
"""
@@ -551,6 +595,8 @@ def stepSubmitLumiBased(self, notFinishedJsonPath):
551595
- submits a new task
552596
"""
553597

598+
retval = {"step": "submitLumiBased"}
599+
554600
cmdargs = []
555601
cmdargs.append("-c")
556602
cmdargs.append(self.recoverconfig)
@@ -588,7 +634,7 @@ def stepSubmitLumiBased(self, notFinishedJsonPath):
588634
submitCmd = submit(logger=self.logger, cmdargs=cmdargs)
589635

590636
# with SubcommandExecution(self.logger, "submit") as _:
591-
retval = submitCmd()
637+
retval.update(submitCmd())
592638
self.logger.debug("stepSubmit() - retval %s", retval)
593639
return retval
594640

@@ -602,8 +648,11 @@ def stepSubmitFileBased(self):
602648
- [ ] if the input is from DBS, then write info to runs_and_lumis.tar.gz
603649
"""
604650

651+
retval = {"step": "submitFileBased"}
652+
605653
# TODO
606654
# I will need to implement this!
655+
raise NotImplementedError
607656
return {'commandStatus': 'FAILED', 'error': 'not implemented yet'}
608657

609658
def setOptions(self):

0 commit comments

Comments
 (0)