Repository: airavata Updated Branches: refs/heads/master 1ba83f11f -> 144bb8f6d
Merge changes have done for release branch to master Project: http://git-wip-us.apache.org/repos/asf/airavata/repo Commit: http://git-wip-us.apache.org/repos/asf/airavata/commit/144bb8f6 Tree: http://git-wip-us.apache.org/repos/asf/airavata/tree/144bb8f6 Diff: http://git-wip-us.apache.org/repos/asf/airavata/diff/144bb8f6 Branch: refs/heads/master Commit: 144bb8f6d4ab1b7af305877c77ac8d6054473ae8 Parents: 1ba83f1 Author: Shameera Rathanyaka <[email protected]> Authored: Mon Jun 8 11:42:43 2015 -0400 Committer: Shameera Rathanyaka <[email protected]> Committed: Mon Jun 8 11:42:43 2015 -0400 ---------------------------------------------------------------------- .../gsi/ssh/api/job/PBSJobConfiguration.java | 2 +- .../gfac/gsi/ssh/api/job/PBSOutputParser.java | 6 +++- .../gsi/ssh/impl/GSISSHAbstractCluster.java | 1 + .../gfac/monitor/email/EmailBasedMonitor.java | 9 ++++++ .../monitor/email/parser/PBSEmailParser.java | 12 ++++---- .../gfac/ssh/provider/impl/SSHProvider.java | 29 +++++++++++++------- 6 files changed, 41 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSJobConfiguration.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSJobConfiguration.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSJobConfiguration.java index d3f6c9c..c5be412 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSJobConfiguration.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSJobConfiguration.java @@ -101,7 +101,7 @@ public class PBSJobConfiguration implements JobManagerConfiguration { @Override public RawCommandInfo getJobIdMonitorCommand(String jobName, String userName) { // For PBS there is no option to get jobDetails by JobName, so we search with userName - return new RawCommandInfo(this.installedPath + "qstat -u " + userName); + return new RawCommandInfo(this.installedPath + "qstat -u " + userName + " -f | grep \"Job_Name = " + jobName + "\" -B1"); } @Override http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSOutputParser.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSOutputParser.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSOutputParser.java index a86d7f0..15e2405 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSOutputParser.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/api/job/PBSOutputParser.java @@ -194,8 +194,12 @@ public class PBSOutputParser implements OutputParser { @Override public String parseJobId(String jobName, String rawOutput) throws SSHApiException { + /* output will look like + Job Id: 2080802.gordon-fe2.local + Job_Name = A312402627 + */ String regJobId = "jobId"; - Pattern pattern = Pattern.compile("\\s*(?<" + regJobId + ">[^\\s]*).* " + jobName + " "); // regex , JOB_ID will come as first column + Pattern pattern = Pattern.compile("(?<" + regJobId + ">[^\\s]*)\\s*.* " + jobName); if (rawOutput != null) { Matcher matcher = pattern.matcher(rawOutput); if (matcher.find()) { http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/impl/GSISSHAbstractCluster.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/impl/GSISSHAbstractCluster.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/impl/GSISSHAbstractCluster.java index 04241c8..113e4ec 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/impl/GSISSHAbstractCluster.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/gsi/ssh/impl/GSISSHAbstractCluster.java @@ -631,6 +631,7 @@ public class GSISSHAbstractCluster implements RemoteCluster { return files; } + @Deprecated public synchronized void getJobStatuses(String userName, Map<String,JobStatus> jobIDs)throws SSHApiException { int retry = 3; RawCommandInfo rawCommandInfo = jobManagerConfiguration.getUserBasedMonitorCommand(userName); http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/EmailBasedMonitor.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/EmailBasedMonitor.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/EmailBasedMonitor.java index eea6ef6..992317d 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/EmailBasedMonitor.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/EmailBasedMonitor.java @@ -24,6 +24,8 @@ import org.apache.airavata.common.exception.AiravataException; import org.apache.airavata.common.logger.AiravataLogger; import org.apache.airavata.common.logger.AiravataLoggerFactory; import org.apache.airavata.common.utils.ServerSettings; +import org.apache.airavata.gfac.core.GFacException; +import org.apache.airavata.gfac.core.GFacUtils; import org.apache.airavata.gfac.core.context.JobExecutionContext; import org.apache.airavata.gfac.core.GFacThreadPoolExecutor; import org.apache.airavata.gfac.core.monitor.JobStatusResult; @@ -36,6 +38,8 @@ import org.apache.airavata.gfac.monitor.email.parser.UGEEmailParser; import org.apache.airavata.model.appcatalog.computeresource.ResourceJobManagerType; import org.apache.airavata.model.messaging.event.JobIdentifier; import org.apache.airavata.model.messaging.event.JobStatusChangeRequestEvent; +import org.apache.airavata.model.workspace.experiment.CorrectiveAction; +import org.apache.airavata.model.workspace.experiment.ErrorCategory; import org.apache.airavata.model.workspace.experiment.JobState; import org.apache.airavata.model.workspace.experiment.JobStatus; @@ -284,6 +288,11 @@ public class EmailBasedMonitor implements Runnable{ jobMonitorMap.remove(jobStatusResult.getJobId()); runOutHandlers = true; log.info("[EJM]: Job failed email received , removed job from job monitoring. " + jobDetails); + try { + GFacUtils.saveErrorDetails(jEC, "Job runs on remote compute resource failed", CorrectiveAction.RETRY_SUBMISSION, ErrorCategory.APPLICATION_FAILURE); + } catch (GFacException e) { + log.info("[EJM]: Error while saving error details for jobId:{}, expId: {}", jEC.getJobDetails().getJobID(), jEC.getExperimentID()); + } }else if (resultState == JobState.CANCELED) { jobMonitorMap.remove(jobStatusResult.getJobId()); runOutHandlers = false; // Do we need to run out handlers in canceled case? http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/parser/PBSEmailParser.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/parser/PBSEmailParser.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/parser/PBSEmailParser.java index 4a3c88b..8474d62 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/parser/PBSEmailParser.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/monitor/email/parser/PBSEmailParser.java @@ -34,17 +34,17 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; public class PBSEmailParser implements EmailParser { - private static final Logger log = LoggerFactory.getLogger(PBSEmailParser.class); - - - private static final String REGEX = "[a-zA-Z ]*:[ ]*(?<" + JOBID + ">[a-zA-Z0-9-\\.]*)\\s+[a-zA-Z ]*:[ ]*(?<"+ - JOBNAME + ">[a-zA-Z0-9-\\.]*)\\s+.*\\s+(?<" + STATUS + ">[a-zA-Z\\ ]*)"; - private static final String REGEX_EXIT_STATUS = "Exit_status=(?<" + EXIT_STATUS + ">[\\d]+)"; public static final String BEGUN_EXECUTION = "Begun execution"; public static final String EXECUTION_TERMINATED = "Execution terminated"; public static final String ABORTED_BY_PBS_SERVER = "Aborted by PBS Server"; + static final String REGEX = "[a-zA-Z ]*:[ ]*(?<" + JOBID + ">[a-zA-Z0-9-\\.]*)\\s+[a-zA-Z ]*:[ ]*(?<" + + JOBNAME + ">[a-zA-Z0-9-\\.]*)\\s[\\S|\\s]*(?<" + STATUS + ">" + BEGUN_EXECUTION + "|" + + EXECUTION_TERMINATED + "|" + ABORTED_BY_PBS_SERVER + ")"; + + private static final String REGEX_EXIT_STATUS = "Exit_status=(?<" + EXIT_STATUS + ">[\\d]+)"; + @Override public JobStatusResult parseEmail(Message message) throws MessagingException, AiravataException { JobStatusResult jobStatusResult = new JobStatusResult(); http://git-wip-us.apache.org/repos/asf/airavata/blob/144bb8f6/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/ssh/provider/impl/SSHProvider.java ---------------------------------------------------------------------- diff --git a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/ssh/provider/impl/SSHProvider.java b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/ssh/provider/impl/SSHProvider.java index d1b35cd..d00c698 100644 --- a/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/ssh/provider/impl/SSHProvider.java +++ b/modules/gfac/gfac-impl/src/main/java/org/apache/airavata/gfac/ssh/provider/impl/SSHProvider.java @@ -22,6 +22,7 @@ package org.apache.airavata.gfac.ssh.provider.impl; import org.apache.airavata.gfac.core.cluster.RemoteCluster; +import org.apache.airavata.model.workspace.experiment.TaskState; import org.apache.airavata.registry.cpi.AppCatalogException; import org.apache.airavata.common.exception.AiravataException; import org.apache.airavata.common.exception.ApplicationSettingsException; @@ -172,20 +173,28 @@ public class SSHProvider extends AbstractProvider { } } else { jobExecutionContext.setJobDetails(jobDetails); - String verifyJobId = verifyJobSubmission(remoteCluster, jobDetails); - if (verifyJobId != null && !verifyJobId.isEmpty()) { - // JobStatus either changed from SUBMITTED to QUEUED or directly to QUEUED - jobID = verifyJobId; - jobDetails.setJobID(jobID); - monitorPublisher.publish(new GfacExperimentStateChangeRequest(new MonitorID(jobExecutionContext) - , GfacExperimentState.JOBSUBMITTED)); - GFacUtils.saveJobStatus(jobExecutionContext, jobDetails, JobState.QUEUED); + int verificationTryCount = 0; + while (verificationTryCount++ < 3) { + String verifyJobId = verifyJobSubmission(remoteCluster, jobDetails); + if (verifyJobId != null && !verifyJobId.isEmpty()) { + // JobStatus either changed from SUBMITTED to QUEUED or directly to QUEUED + jobID = verifyJobId; + jobDetails.setJobID(jobID); + monitorPublisher.publish(new GfacExperimentStateChangeRequest(new MonitorID(jobExecutionContext) + , GfacExperimentState.JOBSUBMITTED)); + GFacUtils.saveJobStatus(jobExecutionContext, jobDetails, JobState.QUEUED); + break; + } + Thread.sleep(verificationTryCount * 1000); } } if (jobID == null || jobID.isEmpty()) { - log.error("Couldn't find remote jobId for JobName:" + jobDetails.getJobName() + ", ExperimentId:" + jobExecutionContext.getExperimentID()); - GFacUtils.updateExperimentStatus(jobExecutionContext.getExperimentID(), ExperimentState.FAILED); + String msg = "expId:" + jobExecutionContext.getExperimentID() + " Couldn't find remote jobId for JobName:" + + jobDetails.getJobName() + ", both submit and verify steps doesn't return a valid JobId. Hence changing experiment state to Failed"; + log.error(msg); + GFacUtils.saveErrorDetails(jobExecutionContext, msg, CorrectiveAction.CONTACT_SUPPORT, ErrorCategory.AIRAVATA_INTERNAL_ERROR); + GFacUtils.publishTaskStatus(jobExecutionContext, monitorPublisher, TaskState.FAILED); return; } data.append("jobDesc=").append(jobDescriptor.toXML());
