From: Jason Guiditta <[email protected]> This patch teaches dbomatic how to get error messages into the database. I added a 'last_error' field to the instance model so that we can stick errors from condor in there.
Note that this is really not the right way to do this. We should be using the event logging system and tying the errors back to the instance from that. However this will work for now and give the users something useful in case of an error. Note that I haven't had a chance to test this patch live yet since my setup is not working end to end yet. Signed-off-by: Ian Main <[email protected]> --- src/app/util/condormatic.rb | 9 +++++++++ src/db/migrate/20090804142049_create_instances.rb | 1 + src/dbomatic/dbomatic | 12 ++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/app/util/condormatic.rb b/src/app/util/condormatic.rb index 9b678ee..b003c57 100644 --- a/src/app/util/condormatic.rb +++ b/src/app/util/condormatic.rb @@ -128,17 +128,26 @@ def condormatic_instances_sync_states doc = Nokogiri::XML(xml) jobs_state = {} + jobs_error_msg = {} doc.xpath('/classads/c').each do |jobs| + job_hold_reason = (v = jobs.at_xpath('./a[@n="HoldReason"]/s')) ? v.text : nil job_name = (v = jobs.at_xpath('./a[@n="Cmd"]/s')) ? v.text : nil job_state= (v = jobs.at_xpath('./a[@n="JobStatus"]/i')) ? v.text : nil + puts "job name is #{job_name}" + puts "job state is #{job_state}" + puts "hold reason is #{job_hold_reason}" + Rails.logger.info "job name is #{job_name}" Rails.logger.info "job state is #{job_state}" + Rails.logger.info "hold reason is #{job_hold_reason}" jobs_state[job_name] = condor_to_instance_state(job_state) if job_name + jobs_error_msg[job_name] = job_hold_reason if job_hold_reason end Instance.find(:all).each do |instance| + instance.last_error = jobs_error_msg[instance.condor_job_id] if jobs_error_msg.has_key?(instance.condor_job_id) instance.state = jobs_state[instance.condor_job_id] || Instance::STATE_STOPPED instance.save! 
Rails.logger.info "Instance state updated to #{instance.state}" diff --git a/src/db/migrate/20090804142049_create_instances.rb b/src/db/migrate/20090804142049_create_instances.rb index b485031..e6ba245 100644 --- a/src/db/migrate/20090804142049_create_instances.rb +++ b/src/db/migrate/20090804142049_create_instances.rb @@ -34,6 +34,7 @@ class CreateInstances < ActiveRecord::Migration t.string :private_address t.string :state t.string :condor_job_id + t.string :last_error t.integer :instance_key_id t.integer :lock_version, :default => 0 t.integer :acc_pending_time, :default => 0 diff --git a/src/dbomatic/dbomatic b/src/dbomatic/dbomatic index 083d294..2a5211f 100755 --- a/src/dbomatic/dbomatic +++ b/src/dbomatic/dbomatic @@ -102,7 +102,7 @@ end # Handle the event log's xml class CondorEventLog < Nokogiri::XML::SAX::Document - attr_accessor :tag, :event_type, :event_cmd, :event_time, :trigger_type, :grid_resource, :execute_host + attr_accessor :tag, :event_type, :event_cmd, :event_time, :trigger_type, :grid_resource, :execute_host, :hold_reason def initialize(logger) @logger = logger @@ -128,6 +128,8 @@ class CondorEventLog < Nokogiri::XML::SAX::Document @grid_resource = string elsif @tag == "ExecuteHost" @execute_host = string + elsif @tag == "HoldReason" + @hold_reason = string end end end @@ -146,8 +148,6 @@ class CondorEventLog < Nokogiri::XML::SAX::Document elsif @trigger_type == "ULOG_JOB_HELD" # The job has some error condition. # - # FIXME: if this happens, we probably want to add the HoldReason field - # to the database so we can display it to the user # FIXME: we also may want to delete this job from condor, depending # on the error. 
For instance, if you are trying to start an instance # with a mismatched image and hardwareprofile architecture, the only @@ -158,7 +158,11 @@ class CondorEventLog < Nokogiri::XML::SAX::Document # FIXME: Right now we don't parse out the HoldReason (or HoldReasonCode) # so for now I'm going to set this to STATE_ERROR as there are multiple # possible reasons for going into the 'held' state. - + # + # FIXME: This only adds the error to the instance 'last_error' field. We + # should really be logging this information into the event log but that is not + # set up at this time so for now this will do. + inst.last_error = @hold_reason inst.state = Instance::STATE_ERROR else @logger.info "Unexpected trigger type #{@trigger_type}, not updating instance state" -- 1.7.2.3 _______________________________________________ deltacloud-devel mailing list [email protected] https://fedorahosted.org/mailman/listinfo/deltacloud-devel
