@@ -933,28 +933,29 @@ def _wait_until_pods_are_ready(self, namespace, containers, labels, desired, tim
933933 # Ensure the minimum desired number of pods are available
934934 waited = 0
935935 while waited < timeout :
936+ # figure out if there are any pending pod issues
937+ additional_timeout = self ._handle_pending_pods (namespace , labels )
938+ if additional_timeout :
939+ timeout += additional_timeout
940+ # add 10 minutes to timeout to allow a pull image operation to finish
941+ self .log (namespace , 'Kubernetes has been pulling the image for {}s' .format (seconds )) # noqa
942+ self .log (namespace , 'Increasing timeout by {}s to allow a pull image operation to finish for pods' .format (additional_timeout )) # noqa
943+
936944 count = 0 # ready pods
937945 pods = self .get_pods (namespace , labels = labels ).json ()
938946 for pod in pods ['items' ]:
939- # Get more information on why a pod is pending
940- if pod ['status' ]['phase' ] in ['Pending' , 'ContainerCreating' ]:
941- reason , message = self ._pod_pending_status (pod )
942- # If pulling an image is taking long then increase the timeout
943- timeout += self ._handle_pod_long_image_pulling (pod , reason )
944-
945- # handle errors and bubble up if need be
946- self ._handle_pod_errors (pod , reason , message )
947-
948947 # now that state is running time to see if probes are passing
949948 if self ._pod_ready (pod ):
950949 count += 1
950+ continue
951951
952952 # Find out if any pod goes beyond the Running (up) state
953953 # Allow that to happen to account for very fast `deis run` as
954954 # an example. Code using this function will account for it
955955 state = self .pod_state (pod )
956956 if isinstance (state , PodState ) and state > PodState .up :
957957 count += 1
958+ continue
958959
959960 if count == desired :
960961 break
@@ -1568,16 +1569,35 @@ def _handle_pod_long_image_pulling(self, reason, pod):
15681569
15691570 seconds = 60 # time threshold before padding timeout
15701571 if (start + timedelta (seconds = seconds )) < datetime .utcnow ():
1571- # add 10 minutes to timeout to allow a pull image operation to finish
1572- self .log (namespace , 'Kubernetes has been pulling the image for {} seconds' .format (seconds )) # noqa
1573- self .log (namespace , 'Increasing timeout by 10 minutes to allow a pull image operation to finish for pods' ) # noqa
1574-
15751572 # make it so function doesn't do processing again
15761573 setattr (self , '_handle_pod_long_image_pulling_applied' , True )
15771574 return 600
15781575
15791576 return 0
15801577
def _handle_pending_pods(self, namespace, labels):
    """
    Inspect pods that are still starting up and react to what is found.

    Every pod matching ``labels`` in ``namespace`` whose phase is
    'Pending' or 'ContainerCreating' is examined: its pending status is
    looked up, a timeout extension is requested when an image pull is
    taking long, and the pod is handed to the error handler.

    :param namespace: namespace the pods live in
    :param labels: label selector passed through to ``get_pods``
    :returns: int, total number of seconds callers should add to their
        own timeout (0 when nothing needs extending)
    :raises: whatever ``_handle_pod_errors`` raises for a failing pod
        (presumably a KubeException - confirm against that helper)
    """
    timeout = 0
    pods = self.get_pods(namespace, labels=labels).json()
    for pod in pods['items']:
        # only pods that are still in one of the starting phases are of interest
        if pod['status']['phase'] not in ['Pending', 'ContainerCreating']:
            continue

        # Get more information on why a pod is pending
        reason, message = self._pod_pending_status(pod)

        # If pulling an image is taking long then increase the timeout.
        # The helper is declared as (self, reason, pod); pass by keyword so
        # the two arguments can not end up swapped.
        timeout += self._handle_pod_long_image_pulling(reason=reason, pod=pod)

        # handle errors and bubble up if need be
        self._handle_pod_errors(pod, reason, message)

    return timeout
1600+
15811601 # NODES #
15821602
15831603 def get_nodes (self , ** kwargs ):
@@ -1720,6 +1740,78 @@ def create_deployment(self, namespace, name, image, entrypoint, command, **kwarg
17201740
17211741 return response
17221742
def deployment_in_progress(self, namespace, name, deploy_timeout, batches, replicas, tags):
    """
    Determine if a Deployment has a deploy in progress

    First is a very basic check to see if replicas are ready.

    If they are not ready then it is time to see if there are problems with any of the pods
    such as image pull issues or similar.

    And then if that is still all okay then it is time to see if the deploy has
    been in progress for longer than the allocated deploy time. Reason to do this
    check is if a client has had a dropped connection.

    :param namespace: namespace the Deployment lives in
    :param name: name of the Deployment
    :param deploy_timeout: base per-batch timeout in seconds, before probe adjustments
    :param batches: passed to ``_get_deploy_steps`` to work out the rollout steps
    :param replicas: desired replica count, used to work out the rollout batches
    :param tags: passed to ``_get_deploy_steps`` together with ``batches``

    Returns 2 booleans, first one is for if the Deployment is in progress or not, second
    one is for if a rollback action is advised while leaving the rollback up to the caller
    """
    self.log(namespace, 'Checking if Deployment {} is in progress'.format(name), level=logging.DEBUG)  # noqa
    try:
        ready, _ = self.are_deployment_replicas_ready(namespace, name)
        if ready:
            # nothing more to do - False since it is not in progress
            self.log(namespace, 'All replicas for Deployment {} are ready'.format(name), level=logging.DEBUG)  # noqa
            return False, False
    except KubeHTTPException as e:
        # Deployment doesn't exist
        if e.response.status_code == 404:
            self.log(namespace, 'Deployment {} does not exist yet'.format(name), level=logging.DEBUG)  # noqa
            return False, False
        # NOTE(review): any other HTTP error falls through to the checks
        # below instead of being re-raised - confirm this is intended

    # get deployment information
    deployment = self.get_deployment(namespace, name).json()
    # get pod template labels since they include the release version
    labels = deployment['spec']['template']['metadata']['labels']
    containers = deployment['spec']['template']['spec']['containers']

    # calculate base deploy timeout
    deploy_timeout = self._deploy_probe_timeout(deploy_timeout, namespace, labels, containers)

    # a rough calculation that figures out an overall timeout
    steps = self._get_deploy_steps(batches, tags)
    batches = self._get_deploy_batches(steps, replicas)
    timeout = len(batches) * deploy_timeout

    # is there a slow image pull or image issues
    try:
        timeout += self._handle_pending_pods(namespace, labels)
    except KubeException as e:
        self.log(namespace, 'Deployment {} had stalled due an error and will be rolled back. {}'.format(name, str(e)), level=logging.DEBUG)  # noqa
        return False, True

    # fetch the latest RS for Deployment and use the start time to compare to deploy timeout
    replicasets = self.get_replicasets(namespace, labels=labels).json()['items']
    if not replicasets:
        # No ReplicaSet has been created yet so there is no start time to
        # compare against; the replicas are not ready, so report the deploy
        # as still in progress rather than failing on an empty list
        return True, False

    # the labels should ensure that there is only 1 replicaset due to the
    # version label; if there are more, work with the newest one. max() on
    # the creation timestamp picks it directly - sorting newest-first and
    # then pop()-ing would hand back the oldest entry instead.
    replica = max(replicasets, key=lambda x: x['metadata']['creationTimestamp'])

    # advise a rollback if the deploy has been running for longer than the TTL
    start = datetime.strptime(
        replica['metadata']['creationTimestamp'],
        settings.DEIS_DATETIME_FORMAT
    )

    if (start + timedelta(seconds=timeout)) < datetime.utcnow():
        self.log(namespace, 'Deploy operation for Deployment {} in has expired. Rolling back to last good known release'.format(name), level=logging.DEBUG)  # noqa
        return False, True

    return True, False
1814+
17231815 def _wait_until_deployment_is_ready (self , namespace , name , ** kwargs ):
17241816 replicas = int (kwargs .get ('replicas' , 0 ))
17251817 # If desired is 0 then there is no ready state to check on
@@ -1743,7 +1835,7 @@ def _wait_until_deployment_is_ready(self, namespace, name, **kwargs):
17431835 self ._wait_until_pods_terminate (namespace , labels , current , replicas )
17441836 return
17451837
1746- # get health info from container
1838+ # calculate base deploy timeout
17471839 deploy_timeout = self ._deploy_probe_timeout (deploy_timeout , namespace , labels , containers )
17481840
17491841 # a rough calculation that figures out an overall timeout
@@ -1759,16 +1851,12 @@ def _wait_until_deployment_is_ready(self, namespace, name, **kwargs):
17591851 # check every 10 seconds for pod failures.
17601852 # Depend on Deployment checks for ready pods
17611853 if waited > 0 and (waited % 10 ) == 0 :
1762- pods = self .get_pods (namespace , labels = labels ).json ()
1763- for pod in pods ['items' ]:
1764- # Get more information on why a pod is pending
1765- if pod ['status' ]['phase' ] in ['Pending' , 'ContainerCreating' ]:
1766- reason , message = self ._pod_pending_status (pod )
1767- # If pulling an image is taking long then increase the timeout
1768- timeout += self ._handle_pod_long_image_pulling (pod , reason )
1769-
1770- # handle errors and bubble up if need be
1771- self ._handle_pod_errors (pod , reason , message )
1854+ additional_timeout = self ._handle_pending_pods (namespace , labels )
1855+ if additional_timeout :
1856+ timeout += additional_timeout
1857+ # add 10 minutes to timeout to allow a pull image operation to finish
1858+ self .log (namespace , 'Kubernetes has been pulling the image for {}s' .format (seconds )) # noqa
1859+ self .log (namespace , 'Increasing timeout by {}s to allow a pull image operation to finish for pods' .format (additional_timeout )) # noqa
17721860
17731861 self .log (namespace , "waited {}s and {} pods are in service" .format (waited , availablePods )) # noqa
17741862
0 commit comments