From 199c39bce9394c7ecffbf0a0b0c003ba452cd921 Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 17:56:33 -0700 Subject: [PATCH 1/8] suspend processes that interfere with cycling servers --- aws-ha-release/aws-ha-release.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index 139ac2d..d15398f 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -86,11 +86,14 @@ fi if [[ `echo -e "$asg_result" | grep -c "^AUTO-SCALING-GROUP"` < 1 ]] then echo "No Auto Scaling Group was found. Because no Auto Scaling Group has been found, $app_name does not know which Auto Scaling Group should have Instances terminated." 1>&2 ; exit 64 fi -#confirms that the selected Auto Scaling Group is not currently in state "Suspended Processing" - the "Suspending Processing" state prevents the termination of Auto Scaling Group instances and thus prevents aws-ha-release from running properly -if [[ `echo -e "$asg_result" | grep -c "SUSPENDED-PROCESS"` > 1 ]] - then echo "Scaling Processes for the Auto Scaling Group $asg_group_name are currently suspended. $app_name will now exit as Scaling Processes are required for $app_name to run properly." 1>&2 ; exit 77 -fi - +#confirms that certain Auto Scaling processes are not suspended. For certain processes, the "Suspending Processing" state prevents the termination of Auto Scaling Group instances and thus prevents aws-ha-release from running properly. +necessary_processes=(RemoveFromLoadBalancerLowPriority Terminate Launch ReplaceUnhealthy HealthCheck) +for process in "${necessary_processes[@]}" +do + if [[ `echo -e "$asg_result" | grep -c "SUSPENDED-PROCESS$delimiter$process"` > 0 ]] + then echo "Scaling Process $process for the Auto Scaling Group $asg_group_name is currently suspended. $app_name will now exit as Scaling Processes ${necessary_processes[@]} are required for $app_name to run properly." 1>&2 ; exit 77 + fi +done #gets Auto Scaling Group max-size asg_initial_max_size=`echo $asg_result | grep ^AUTO-SCALING-GROUP | cut -d "$delimiter" -f 9` @@ -108,6 +111,8 @@ fi #echo a list of Instances that are slated for termination echo -e "The list of Instances in Auto Scaling Group $asg_group_name that will be terminated is below:\n$asg_instance_list" +as-suspend-processes $asg_group_name --processes ReplaceUnhealthy,AlarmNotification,ScheduledActions,AZRebalance + #if the desired-capacity of an Auto Scaling Group group is greater than or equal to the max-size of an Auto Scaling Group, the max-size must be increased by 1 to cycle instances while maintaining desired-capacity. This is particularly true of groups of 1 instance (where we'd be removing all instances if we cycled). if [[ $asg_initial_desired_capacity -ge $asg_initial_max_size ]] then echo "$asg_group_name has a max-size of $asg_initial_max_size. In order to recycle instances max-size will be temporarily increased by 1 to max-size $asg_temporary_max_size." @@ -162,3 +167,5 @@ done return_as_initial_maxsize #return temporary desired-capacity to initial desired-capacity return_as_initial_desiredcapacity + +as-resume-processes $asg_group_name From a37bdd5c263ada293bc05934c5ee2553a0952a20 Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 18:23:47 -0700 Subject: [PATCH 2/8] fix auto scaling processes necessary for aws-ha-release --- aws-ha-release/aws-ha-release.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index d15398f..d8f9236 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -87,7 +87,7 @@ if [[ `echo -e "$asg_result" | grep -c "^AUTO-SCALING-GROUP"` < 1 ]] then echo "No Auto Scaling Group was found. Because no Auto Scaling Group has been found, $app_name does not know which Auto Scaling Group should have Instances terminated." 1>&2 ; exit 64 fi #confirms that certain Auto Scaling processes are not suspended. For certain processes, the "Suspending Processing" state prevents the termination of Auto Scaling Group instances and thus prevents aws-ha-release from running properly. -necessary_processes=(RemoveFromLoadBalancerLowPriority Terminate Launch ReplaceUnhealthy HealthCheck) +necessary_processes=(RemoveFromLoadBalancerLowPriority Terminate Launch HealthCheck AddToLoadBalancer) for process in "${necessary_processes[@]}" do if [[ `echo -e "$asg_result" | grep -c "SUSPENDED-PROCESS$delimiter$process"` > 0 ]] From 2b0693803d2191a4a65cff86cdb4a25fdb8a3457 Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 18:35:42 -0700 Subject: [PATCH 3/8] resume as processes if aws-ha-release fails --- aws-ha-release/aws-ha-release.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index d8f9236..7c2c353 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -138,6 +138,9 @@ do return_as_initial_maxsize #return temporary desired-capacity to initial desired-capacity return_as_initial_desiredcapacity + + as-resume-processes $asg_group_name + exit 79 fi inservice_instance_list=`elb-describe-instance-health $asg_elb --region $region --show-long | grep InService` From 5feaf83e57e174ff246c9c6c56e7ffb31460b36d Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 18:57:45 -0700 Subject: [PATCH 4/8] allow for multiple load balancers --- aws-ha-release/aws-ha-release.sh | 33 +++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index 7c2c353..78d5bd6 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -103,7 +103,11 @@ asg_initial_desired_capacity=`echo "$asg_result" | grep ^AUTO-SCALING-GROUP | cu asg_temporary_desired_capacity=$((asg_initial_desired_capacity+1)) #gets list of Auto Scaling Group Instances - these Instances will be terminated asg_instance_list=`echo "$asg_result" | grep ^INSTANCE | cut -d "$delimiter" -f 2` -asg_elb=`echo "$asg_result" | grep ^AUTO-SCALING-GROUP | cut -d "$delimiter" -f 6` + +#builds an array of load balancers +IFS=',' read -a asg_elbs <<< `echo "$asg_result" | grep ^AUTO-SCALING-GROUP | cut -d "$delimiter" -f 6` + + #if the max-size of the Auto Scaling Group is zero there is no reason to run if [[ $asg_initial_max_size -eq 0 ]] then echo "$asg_group_name has a max-size of 0. As the Auto Scaling Group \"$asg_group_name\" has no active Instances there is no reason to run." ; exit 79 @@ -129,8 +133,10 @@ as-update-auto-scaling-group $asg_group_name --region $region --desired-capacity #and begin recycling instances for instance_selected in $asg_instance_list do + all_instances_inservice=false + #the while loop below sleeps for the auto scaling group to have an InService capacity that is equal to the desired-capacity + 1 - while [[ $inservice_instance_count -lt $asg_temporary_desired_capacity ]] + while [[ !$all_instances_inservice ]] do if [[ $inservice_time_taken -gt $inservice_time_allowed ]] then echo "During the last $inservice_time_allowed seconds the InService capacity of the $asg_group_name Auto Scaling Group did not meet the Auto Scaling Group's desired capacity of $asg_temporary_desired_capacity." 1>&2 @@ -143,22 +149,35 @@ do exit 79 fi - inservice_instance_list=`elb-describe-instance-health $asg_elb --region $region --show-long | grep InService` - inservice_instance_count=`echo "$inservice_instance_list" | wc -l` + + for elb in "${asg_elbs[@]}" + do + inservice_instance_list=`elb-describe-instance-health $elb --region $region --show-long | grep InService` + inservice_instance_count=`echo "$inservice_instance_list" | wc -l` + + [[ $inservice_instance_count -lt $asg_temporary_desired_capacity ]] && all_instances_inservice=false || all_instances_inservice=true + done + #sleeps a particular amount of time sleep $inservice_polling_time + inservice_time_taken=$(($inservice_time_taken+$inservice_polling_time)) echo $inservice_instance_count "Instances are InService status. $asg_temporary_desired_capacity Instances are required to terminate the next instance. $inservice_time_taken seconds have elapsed while waiting for an Instance to reach InService status." - #if any status in $elbinstsancehealth != "InService" repeat + #if any status in $elbinstancehealth != "InService" repeat done + #if the echo "$asg_group_name has reached a desired-capacity of $asg_temporary_desired_capacity. $app_name can now remove an Instance from service." inservice_instance_count=0 inservice_time_taken=0 #remove instance from ELB - this ensures no traffic will be directed at an instance that will be terminated - echo "Instance $instance_selected will now be deregistered from ELB \"$asg_elb.\"" - elb-deregister-instances-from-lb $asg_elb --region $region --instances $instance_selected > /dev/null + echo "Instance $instance_selected will now be deregistered from ELBs \"${asg_elbs[@]}.\"" + for elb in "${asg_elbs[@]}" + do + elb-deregister-instances-from-lb $elb --region $region --instances $instance_selected > /dev/null + done + #sleep for "elb_timeout" seconds so that the instance can complete all processing before being terminated sleep $elb_timeout #terminates a pre-existing instance within the autoscaling group From 42145773696c7cd6053b6ab4bd88df0d1e1ddbbe Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 19:22:26 -0700 Subject: [PATCH 5/8] use an integer instead of a boolean --- aws-ha-release/aws-ha-release.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index 78d5bd6..7554de4 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -107,7 +107,6 @@ asg_instance_list=`echo "$asg_result" | grep ^INSTANCE | cut -d "$delimiter" -f #builds an array of load balancers IFS=',' read -a asg_elbs <<< `echo "$asg_result" | grep ^AUTO-SCALING-GROUP | cut -d "$delimiter" -f 6` - #if the max-size of the Auto Scaling Group is zero there is no reason to run if [[ $asg_initial_max_size -eq 0 ]] then echo "$asg_group_name has a max-size of 0. As the Auto Scaling Group \"$asg_group_name\" has no active Instances there is no reason to run." ; exit 79 @@ -133,10 +132,10 @@ as-update-auto-scaling-group $asg_group_name --region $region --desired-capacity #and begin recycling instances for instance_selected in $asg_instance_list do - all_instances_inservice=false + all_instances_inservice=0 #the while loop below sleeps for the auto scaling group to have an InService capacity that is equal to the desired-capacity + 1 - while [[ !$all_instances_inservice ]] + while [[ $all_instances_inservice -eq 0 ]] do if [[ $inservice_time_taken -gt $inservice_time_allowed ]] then echo "During the last $inservice_time_allowed seconds the InService capacity of the $asg_group_name Auto Scaling Group did not meet the Auto Scaling Group's desired capacity of $asg_temporary_desired_capacity." 1>&2 @@ -155,7 +154,7 @@ do inservice_instance_list=`elb-describe-instance-health $elb --region $region --show-long | grep InService` inservice_instance_count=`echo "$inservice_instance_list" | wc -l` - [[ $inservice_instance_count -lt $asg_temporary_desired_capacity ]] && all_instances_inservice=false || all_instances_inservice=true + [[ $inservice_instance_count -lt $asg_temporary_desired_capacity ]] && all_instances_inservice=0 || all_instances_inservice=1 done #sleeps a particular amount of time From 1a9c38f9752e7b19e66f506f2f94f09c3760a504 Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Tue, 16 Apr 2013 19:29:03 -0700 Subject: [PATCH 6/8] DRY code for returning settings to default --- aws-ha-release/aws-ha-release.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index 7554de4..eaad4cc 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -33,6 +33,16 @@ return_as_initial_desiredcapacity() as-update-auto-scaling-group $asg_group_name --region $region --desired-capacity=$asg_initial_desired_capacity } +return_to_defaults() +{ + #return max-size to initial size + return_as_initial_maxsize + #return temporary desired-capacity to initial desired-capacity + return_as_initial_desiredcapacity + + as-resume-processes $asg_group_name +} + #set application defaults app_name=`basename $0` elb_timeout=60 @@ -139,12 +149,8 @@ do do if [[ $inservice_time_taken -gt $inservice_time_allowed ]] then echo "During the last $inservice_time_allowed seconds the InService capacity of the $asg_group_name Auto Scaling Group did not meet the Auto Scaling Group's desired capacity of $asg_temporary_desired_capacity." 1>&2 - #return max-size to initial size - return_as_initial_maxsize - #return temporary desired-capacity to initial desired-capacity - return_as_initial_desiredcapacity - as-resume-processes $asg_group_name + return_to_defaults exit 79 fi @@ -184,9 +190,4 @@ do as-terminate-instance-in-auto-scaling-group --region $region --instance $instance_selected --no-decrement-desired-capacity --force > /dev/null done -#return max-size to initial size -return_as_initial_maxsize -#return temporary desired-capacity to initial desired-capacity -return_as_initial_desiredcapacity - -as-resume-processes $asg_group_name +return_to_defaults From b2ecd4f19f15dd81f1424ed3c122b8a06050dc12 Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Wed, 17 Apr 2013 13:27:36 -0700 Subject: [PATCH 7/8] don't return to defaults if aws-ha-release fails because we might kill healthy instances and leave unhealthy ones --- aws-ha-release/aws-ha-release.sh | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index eaad4cc..bafc02e 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -33,16 +33,6 @@ return_as_initial_desiredcapacity() as-update-auto-scaling-group $asg_group_name --region $region --desired-capacity=$asg_initial_desired_capacity } -return_to_defaults() -{ - #return max-size to initial size - return_as_initial_maxsize - #return temporary desired-capacity to initial desired-capacity - return_as_initial_desiredcapacity - - as-resume-processes $asg_group_name -} - #set application defaults app_name=`basename $0` elb_timeout=60 @@ -124,7 +114,8 @@ fi #echo a list of Instances that are slated for termination echo -e "The list of Instances in Auto Scaling Group $asg_group_name that will be terminated is below:\n$asg_instance_list" -as-suspend-processes $asg_group_name --processes ReplaceUnhealthy,AlarmNotification,ScheduledActions,AZRebalance +as_processes_to_suspend="ReplaceUnhealthy,AlarmNotification,ScheduledActions,AZRebalance" +as-suspend-processes $asg_group_name --processes $as_processes_to_suspend #if the desired-capacity of an Auto Scaling Group group is greater than or equal to the max-size of an Auto Scaling Group, the max-size must be increased by 1 to cycle instances while maintaining desired-capacity. This is particularly true of groups of 1 instance (where we'd be removing all instances if we cycled). if [[ $asg_initial_desired_capacity -ge $asg_initial_max_size ]] @@ -149,8 +140,14 @@ do do if [[ $inservice_time_taken -gt $inservice_time_allowed ]] then echo "During the last $inservice_time_allowed seconds the InService capacity of the $asg_group_name Auto Scaling Group did not meet the Auto Scaling Group's desired capacity of $asg_temporary_desired_capacity." 1>&2 + echo "Because we can't be sure that instances created by this script are healthy, settings that were changed are being left as is. Settings that were changed:" - return_to_defaults + if [[ $max_size_change -eq 1 ]] + then echo "max size was increased by $max_size_change" + fi + + echo "desired capacity was increased by 1" + echo "AutoScaling processes \"$as_processes_to_suspend\" were suspended." exit 79 fi @@ -190,4 +187,10 @@ do as-terminate-instance-in-auto-scaling-group --region $region --instance $instance_selected --no-decrement-desired-capacity --force > /dev/null done -return_to_defaults +#return max-size to initial size +return_as_initial_maxsize + +#return temporary desired-capacity to initial desired-capacity +return_as_initial_desiredcapacity + +as-resume-processes $asg_group_name From 45b2394a919c95b5003a16fbc430cf5ff0ea964e Mon Sep 17 00:00:00 2001 From: Anuj Biyani Date: Wed, 17 Apr 2013 14:19:53 -0700 Subject: [PATCH 8/8] fix bug where only last load balancer mattered --- aws-ha-release/aws-ha-release.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/aws-ha-release/aws-ha-release.sh b/aws-ha-release/aws-ha-release.sh index bafc02e..f5f5e6c 100755 --- a/aws-ha-release/aws-ha-release.sh +++ b/aws-ha-release/aws-ha-release.sh @@ -152,12 +152,16 @@ do exit 79 fi - for elb in "${asg_elbs[@]}" + for index in "${!asg_elbs[@]}" do - inservice_instance_list=`elb-describe-instance-health $elb --region $region --show-long | grep InService` + inservice_instance_list=`elb-describe-instance-health ${asg_elbs[$index]} --region $region --show-long | grep InService` inservice_instance_count=`echo "$inservice_instance_list" | wc -l` - [[ $inservice_instance_count -lt $asg_temporary_desired_capacity ]] && all_instances_inservice=0 || all_instances_inservice=1 + if [ $index -eq 0 ] + then [ $inservice_instance_count -eq $asg_temporary_desired_capacity ] && all_instances_inservice=1 || all_instances_inservice=0 + else + [[ ($all_instances_inservice -eq 1) && ($inservice_instance_count -eq $asg_temporary_desired_capacity) ]] && all_instances_inservice=1 || all_instances_inservice=0 + fi done #sleeps a particular amount of time