@@ -517,16 +517,31 @@ def state(self, name):
517517 return JobState .error
518518
def resolve_state(self, pod):
    """Translate a Kubernetes pod object into the matching JobState.

    :param pod: pod object as a dict, or None when there is no pod
    :returns: JobState.destroyed when ``pod`` is None; otherwise the
        JobState mapped from the pod phase, refined by the container
        readiness status while the phase is 'Running'. A phase that is
        not part of the mapping is reported as JobState.error.
    """
    if pod is None:
        return JobState.destroyed

    # See "Pod Phase" at http://kubernetes.io/v1.1/docs/user-guide/pod-states.html
    # 'Starting' and 'Terminating' are not pod phases; they are the extra
    # statuses reported by self._pod_readiness_status() for a running pod.
    states = {
        'Pending': JobState.initialized,
        'Starting': JobState.starting,
        'Running': JobState.up,
        'Terminating': JobState.terminating,
        'Succeeded': JobState.down,
        'Failed': JobState.crashed,
        'Unknown': JobState.error,
    }

    phase = pod['status']['phase']

    # being in a running state can mean a pod is starting, actually running or terminating
    if phase == 'Running':
        # is the readiness probe passing?
        container_status = self._pod_readiness_status(pod)
        if container_status in ('Starting', 'Terminating'):
            return states[container_status]
        elif container_status == 'Running' and self._pod_liveness_status(pod):
            # is the pod ready to serve requests?
            return states[container_status]
        # NOTE(review): every other outcome (readiness 'Unknown', or the
        # liveness check not passing) falls through and is still reported
        # through the 'Running' phase mapping below -- confirm this is intended.

    # An unrecognised phase is treated like 'Unknown' instead of raising KeyError
    return states.get(phase, JobState.error)
530545
531546 def _api (self , tmpl , * args ):
532547 """Return a fully-qualified Kubernetes API URL from a string template with args."""
@@ -685,9 +700,43 @@ def _get_schedule_status(self, namespace, name, current, desired, resource_versi
685700
686701 logger .debug ("{} out of {} pods in namespace {} are in state {}" .format (len (waiting_pods ), state_count , namespace , reason )) # noqa
687702
688- def _get_pod_ready_status (self , namespace , name , num ):
703+ # if it was a scale down operation, wait until terminating pods are done
704+ if reason == 'Killing' :
705+ self ._wait_until_pods_terminate (namespace , name , state_count )
706+
def _wait_until_pods_terminate(self, namespace, name, desired):
    """Wait until no matching pod in the namespace is terminating anymore.

    Polls the pods of ``namespace`` once per second, for at most 120
    iterations. A pod is counted as still terminating when its
    generateName equals ``name + '-'``, its phase is 'Running' and
    self._pod_readiness_status() reports 'Terminating'.

    :param namespace: namespace whose pods are polled
    :param name: name used to match pods through their generateName
        (presumably the replication controller name -- verify against caller)
    :param desired: number of pods expected to terminate; only used in
        the log messages
    """
    logger.debug("waiting for {} pods in {} namespace to be terminated (120s timeout)".format(desired, namespace))  # noqa
    prefix = name + '-'
    for waited in range(120):
        count = 0
        pods = self._get_pods(namespace).json()
        for pod in pods['items']:
            if (
                # generateName is optional on pod metadata, so do not
                # assume every pod in the namespace carries it
                pod['metadata'].get('generateName') == prefix and
                pod['status']['phase'] == 'Running' and
                # is the container reported as terminating?
                self._pod_readiness_status(pod) == 'Terminating'
            ):
                count += 1

        # stop when all pods are terminated as expected
        if count == 0:
            break

        if waited > 0 and (waited % 10) == 0:
            logger.debug("waited {}s and {} pods out of {} are fully terminated".format(waited, (desired - count), desired))  # noqa

        time.sleep(1)
    else:
        # the loop ran out without ever seeing zero terminating pods; do not
        # claim success in the log
        logger.debug("timed out after 120s with {} pods in namespace {} still terminating".format(count, namespace))  # noqa
        return

    logger.debug("{} pods in namespace {} are terminated".format(desired, namespace))  # noqa
732+
733+ def _get_pod_ready_status (self , namespace , name , desired ):
734+ # If desired is 0 then there is no ready state to check on
735+ if desired == 0 :
736+ return
737+
689738 # Ensure the minimum desired number of pods are available
690- logger .debug ("waiting for {} pods in {} namespace to be in services (120s timeout)" .format (num , namespace )) # noqa
739+ logger .debug ("waiting for {} pods in {} namespace to be in services (120s timeout)" .format (desired , namespace )) # noqa
691740 for waited in range (120 ):
692741 count = 0
693742 pods = self ._get_pods (namespace ).json ()
@@ -697,21 +746,21 @@ def _get_pod_ready_status(self, namespace, name, num):
697746 pod ['metadata' ]['generateName' ] == name + '-' and
698747 pod ['status' ]['phase' ] == 'Running' and
699748 # is the readiness probe passing?
700- self ._pod_readiness_status (pod , name ) and
749+ self ._pod_readiness_status (pod ) == 'Running' and
701750 # is the pod ready to serve requests?
702751 self ._pod_liveness_status (pod )
703752 ):
704753 count += 1
705754
706- if count == num :
755+ if count == desired :
707756 break
708757
709758 if waited > 0 and (waited % 10 ) == 0 :
710759 logger .debug ("waited {}s and {} pods are in service" .format (waited , count ))
711760
712761 time .sleep (1 )
713762
714- logger .debug ("{} out of {} pods in namespace {} are in service" .format (count , num , namespace )) # noqa
763+ logger .debug ("{} out of {} pods in namespace {} are in service" .format (count , desired , namespace )) # noqa
715764
716765 def _scale_rc (self , name , namespace , desired ):
717766 rc = self ._get_rc (name , namespace )
@@ -757,6 +806,7 @@ def _scale_rc(self, name, namespace, desired):
757806 js_template ['metadata' ]['resourceVersion' ]
758807 )
759808
809+ # Double check enough pods are in the required state to service the application
760810 self ._get_pod_ready_status (namespace , name , desired )
761811
762812 def _create_rc (self , name , image , command , ** kwargs ): # noqa
@@ -948,7 +998,6 @@ def _create_secret(self, namespace, name, data):
948998
949999 url = self ._api ("/namespaces/{}/secrets" , namespace )
9501000 response = self .session .post (url , json = template )
951- logger .critical (response )
9521001 if unhealthy (response .status_code ):
9531002 error (response , 'failed to create secret "{}" in Namespace "{}"' , name , namespace )
9541003
@@ -1063,14 +1112,22 @@ def _pod_log(self, name, namespace):
10631112
10641113 return resp .status_code , resp .text , resp .reason
10651114
def _pod_readiness_status(self, pod):
    """Report the readiness status of the application container of a pod.

    The application container is the one named '<app>-<type>', built from
    the pod's 'app' and 'type' labels.

    :param pod: pod object as a dict
    :returns: 'Running' when the container is ready, 'Starting' when it is
        not ready but its state is 'running', 'Terminating' when it is not
        ready and its state is 'terminated', and 'Unknown' in every other
        case (no matching container, any other container state, missing
        labels or missing container statuses)
    """
    labels = pod['metadata'].get('labels') or {}
    if 'app' not in labels or 'type' not in labels:
        # the container name cannot be derived without both labels
        return 'Unknown'

    name = '{}-{}'.format(labels['app'], labels['type'])

    # tolerate a status without containerStatuses instead of raising KeyError
    for container in pod['status'].get('containerStatuses') or []:
        # find the right container in case there are many on the pod
        if container['name'] != name:
            continue

        if container['ready']:
            return 'Running'

        if 'running' in container['state']:
            return 'Starting'
        elif 'terminated' in container['state']:
            return 'Terminating'

    # Seems like the most sensible default
    return 'Unknown'
10741131
10751132 def _pod_liveness_status (self , pod ):
10761133 """Check if the pods liveness probe status has passed all checks"""
0 commit comments