On Wed, 2012-09-05 at 14:49 +0200, [email protected] wrote: > From: Jan Provaznik <[email protected]> > > RHEVM instances goes to stopped state after creating and they need > to be started explicitly. This patch improves check if explicit > start request should be sent. > > It also extends stopping of RHEVM instances - it's possible that > RHEVM isntances freeze in shutting_down state on provider side > if development tools are not installed inside VM. In such case, > another 'stop' request is sent to the instance. > --- > src/app/controllers/instances_controller.rb | 2 +- > src/app/models/instance.rb | 38 > +++++++++++++++++++++++++++++ > src/app/models/provider_type.rb | 9 +++++++ > src/app/util/taskomatic.rb | 33 ++++++++++++++----------- > src/dbomatic/dbomatic | 29 +++++++++++----------- > 5 files changed, 82 insertions(+), 29 deletions(-) > > diff --git a/src/app/controllers/instances_controller.rb > b/src/app/controllers/instances_controller.rb > index f34a58e..31d70a4 100644 > --- a/src/app/controllers/instances_controller.rb > +++ b/src/app/controllers/instances_controller.rb > @@ -271,7 +271,7 @@ class InstancesController < ApplicationController > > def check_inaccessible_instances > # @instance is set only on stop action > - @instances_to_stop = @instance ? @instance.to_a : > Instance.find(params[:instance_selected].to_a) > + @instances_to_stop = @instance ? Array(@instance) : > Instance.find(Array(params[:instance_selected])) > @instances_to_stop.reject! { |inst| !check_privilege(Privilege::USE, > inst) } > @inaccessible_instances = > Instance.stoppable_inaccessible_instances(@instances_to_stop) > if params[:terminate].blank? and @inaccessible_instances.any? > diff --git a/src/app/models/instance.rb b/src/app/models/instance.rb > index d354f23..4f03633 100644 > --- a/src/app/models/instance.rb > +++ b/src/app/models/instance.rb > @@ -447,6 +447,11 @@ class Instance < ActiveRecord::Base > do_operation(user, 'stop') > end > > + > + def start(user) > + do_operation(user, 'start') > + end > + > def stop_with_event(user) > stop(user) > true > @@ -491,6 +496,27 @@ class Instance < ActiveRecord::Base > deployed? ? (Time.now - time_last_running) : 0 > end > > + def stopped_after_creation? > + state == Instance::STATE_STOPPED && > + time_last_pending.to_i > time_last_running.to_i && > + provider_account && > + provider_account.provider.provider_type.goes_to_stop_after_creation? > + end > + > + def requires_explicit_start? > + # this is for RHEVM/VSPHERE instances where instance goes to 'stopped' > state > + # after creation - we check if it wasn't running before this stopped > state > + # and if we already did send start request to it > + stopped_after_creation? && !pending_or_successful_start? > + end > + > + def stuck_in_stopping? > + state == Instance::STATE_SHUTTING_DOWN && > + Time.now - time_last_shutting_down > 120 && > + provider_account && > + provider_account.provider.provider_type.goes_to_stop_after_creation? > + end > + > PRESET_FILTERS_OPTIONS = [ > {:title => "instances.preset_filters.other_than_stopped", :id => > "other_than_stopped", :query => where("instances.state != ?", "stopped")}, > {:title => "instances.preset_filters.new", :id => "new", :query => > where("instances.state" => "new")}, > @@ -567,4 +593,16 @@ class Instance < ActiveRecord::Base > Taskomatic.send("#{operation}_instance", task) > end > > + def pending_or_successful_start? > + task = tasks.find_last_by_action('start') > + return false unless task > + return true if task.state == Task::STATE_FINISHED > + # it's possible that start request takes more than 30 secs on rhevm, > + # but dbomatic kills child process after 30sec by default, so > + # task may stay in 'pending' state. If task is in pending state for > + # more than 2 mins, consider previous start request as failed. > + return true if task.state == Task::STATE_PENDING && > + Time.now - task.created_at < 120 > + false > + end > end > diff --git a/src/app/models/provider_type.rb b/src/app/models/provider_type.rb > index 205752d..4557d62 100644 > --- a/src/app/models/provider_type.rb > +++ b/src/app/models/provider_type.rb > @@ -37,4 +37,13 @@ class ProviderType < ActiveRecord::Base > validates_uniqueness_of :name > validates_presence_of :deltacloud_driver > validates_uniqueness_of :deltacloud_driver > + > + # TODO: this is little bit hacky, it would be cleaner to ask directly > + # dc-api in what state instance will be after creation, but > + # this method is called for each instance in periodic instance check from > + # dbomatic so it should be fast, we might keep this info as a provider > + # attribute in future > + def goes_to_stop_after_creation? > + %w(rhevm vsphere).include?(deltacloud_driver) > + end > end > diff --git a/src/app/util/taskomatic.rb b/src/app/util/taskomatic.rb > index 1708970..28faf35 100644 > --- a/src/app/util/taskomatic.rb > +++ b/src/app/util/taskomatic.rb > @@ -56,6 +56,7 @@ module Taskomatic > task.time_started = Time.now > > begin > + raise "asdfddddd" > client = task.instance.provider_account.connect > dcloud_instance = client.instance(task.instance.external_key) > > @@ -81,6 +82,7 @@ module Taskomatic > # we also have to handle create_failed state events separately > if task.instance.state == Instance::STATE_STOPPED && task.action == > InstanceTask::ACTION_START && > > task.instance.provider_account.provider.provider_type.deltacloud_driver == > 'rhevm' > + create_failure_events(task.instance, ex) > task.instance.update_attributes(:state => > Instance::STATE_CREATE_FAILED) > end > ensure > @@ -135,20 +137,7 @@ module Taskomatic > Rails.logger.error ex.backtrace.join("\n") > task.state = Task::STATE_FAILED > task.instance.state = Instance::STATE_CREATE_FAILED > - > - task.instance.events << Event.create( > - :source => task.instance, > - :event_time => DateTime.now, > - :status_code => 'instance_launch_failed', > - :summary => "#{task.instance.name}: > #{ex.message.to_s.split("\n").first}" > - ) > - > - task.instance.provider_account.events << Event.create( > - :source => task.instance.provider_account, > - :event_time => DateTime.now, > - :status_code => 'provider_account_failure', > - :summary => "#{task.instance.name}: > #{ex.message.to_s.split("\n").first}" > - ) > + create_failure_events(task.instance, ex) > > raise ex > end > @@ -170,4 +159,20 @@ module Taskomatic > client_args.merge!({:user_data => instance.user_data}) if > instance.user_data.present? > client.create_instance(client_args) > end > + > + def self.create_failure_events(instance, ex) > + instance.events << Event.create( > + :source => instance, > + :event_time => DateTime.now, > + :status_code => 'instance_launch_failed', > + :summary => "#{instance.name}: #{ex.message.to_s.split("\n").first}" > + ) > + > + instance.provider_account.events << Event.create( > + :source => instance.provider_account, > + :event_time => DateTime.now, > + :status_code => 'provider_account_failure', > + :summary => "#{instance.name}: #{ex.message.to_s.split("\n").first}" > + ) > + end > end > diff --git a/src/dbomatic/dbomatic b/src/dbomatic/dbomatic > index a8633a0..c2fb384 100755 > --- a/src/dbomatic/dbomatic > +++ b/src/dbomatic/dbomatic > @@ -188,11 +188,12 @@ end > > def check_one_account(account) > connection = account.connect > + ignored_states = [Instance::STATE_NEW, Instance::STATE_STOPPED, > Instance::STATE_CREATE_FAILED] > > account.instances.order("checked_at ASC").each do |instance| > # optimization; right now we ignore instances that are in the STOPPED, > NEW, or CREATE_FAILED states. > # when we get to stateful instances, this will need to change > - unless [Instance::STATE_NEW, Instance::STATE_STOPPED, > Instance::STATE_CREATE_FAILED].include?(instance.state) > + if !ignored_states.include?(instance.state) or > instance.stopped_after_creation? > instance.update_attribute(:checked_at,Time.now) > > begin > @@ -203,7 +204,6 @@ def check_one_account(account) > end > > if api_instance > - DBomaticLogger.instance.info("updating instance state for > #{instance.name}: #{instance.external_key}") > instance.state = > Taskomatic.dcloud_to_instance_state(api_instance.state) > > # only update the public and private addresses if they are not nil. > @@ -225,18 +225,19 @@ def check_one_account(account) > > # For RHEV, we need to start up the instance after the vm has been > created > # and state changes from PENDING to STOPPED > - if instance.state == Instance::STATE_STOPPED and > instance.total_state_time(Instance::STATE_RUNNING) == 0 > - api_instance = connection.instance(instance.external_key) > - if api_instance > - DBomaticLogger.instance.info("starting instance #{instance.name}: > #{instance.external_key}") > - # TODO: do we need to set an actual user here instead of nil? > - task = instance.queue_action(nil, 'start') > - unless task > - raise ActionError.new("start cannot be performed on this > instance.") > - end > - Taskomatic.start_instance(task) > - instance.state = > Taskomatic.dcloud_to_instance_state(api_instance.state) > - instance.save! > + if instance.requires_explicit_start? > + DBomaticLogger.instance.info("sending explicit start request to > #{instance.name}") > + begin > + instance.start(nil) > + rescue > + DBomaticLogger.instance.info("failed to start instance > #{instance.name}: #{$!.message}") > + end > + elsif instance.stuck_in_stopping? > + DBomaticLogger.instance.info("sending second stop request to > #{instance.name}, instance is stuck in stopping state") > + begin > + instance.stop_with_event(nil) > + rescue > + DBomaticLogger.instance.info("failed to stop instance > #{instance.name}: #{$!.message}") > end > end > end
Patch mostly works, but there is problem saving Event in case you have multiple instances in deployment and all have errors because Event.summary is string(255) so dbomatic raise PGError: ERROR: value too long for type character varying(255). Another minor problem is predefined error text asdfff displayed in conductor. After this is fixed patch can be pushed.
