1- from datetime import datetime
1+ from datetime import datetime , timedelta
22import json
33import logging
44import os
@@ -671,18 +671,22 @@ def resolve_state(self, pod):
671671 'Unknown' : JobState .error .name ,
672672 }
673673
674+ # being in a Pending state can mean different things, introspecting app container first
675+ if pod ['status' ]['phase' ] == 'Pending' :
676+ pod_state = self ._pod_pending_status (pod )
674677 # being in a running state can mean a pod is starting, actually running or terminating
675- if pod ['status' ]['phase' ] == 'Running' :
678+ elif pod ['status' ]['phase' ] == 'Running' :
676679 # is the readiness probe passing?
677- container_status = self ._pod_readiness_status (pod )
678- if container_status in ['Starting' , 'Terminating' ]:
679- return states [container_status ]
680- elif container_status == 'Running' and self ._pod_liveness_status (pod ):
680+ pod_state = self ._pod_readiness_status (pod )
681+ if pod_state in ['Starting' , 'Terminating' ]:
682+ return states [pod_state ]
683+ elif pod_state == 'Running' and self ._pod_liveness_status (pod ):
681684 # is the pod ready to serve requests?
682- return states [container_status ]
685+ return states [pod_state ]
686+ else :
687+ # if no match was found for deis mapping then passthrough the real state
688+ pod_state = pod ['status' ]['phase' ]
683689
684- # if no match was found for deis mapping then passthrough the real state
685- pod_state = pod ['status' ]['phase' ]
686690 return states .get (pod_state , pod_state )
687691
688692 def _get_port (self , image ):
@@ -857,10 +861,40 @@ def _get_pod_ready_status(self, namespace, labels, desired):
857861
858862 # Ensure the minimum desired number of pods are available
859863 logger .debug ("waiting for {} pods in {} namespace to be in services (120s timeout)" .format (desired , namespace )) # noqa
860- for waited in range (120 ):
861- count = 0
864+ waited = 0
865+ timeout = 120 # 2 minutes
866+ timeout_padded = False # has timeout been increased or not
867+ while True :
868+ # timed out, time to bail
869+ if waited > timeout :
870+ logger .debug ('timed out waiting for pods to come up in namespace {}' .format (namespace )) # noqa
871+ break
872+
873+ count = 0 # ready pods
862874 pods = self ._get_pods (namespace , labels = labels ).json ()
863875 for pod in pods ['items' ]:
876+ # If pulling an image is taking long then increase the timout
877+ if (
878+ pod ['status' ]['phase' ] == 'Pending' and
879+ self ._pod_pending_status (pod ) == 'Pulling' and
880+ not timeout_padded
881+ ):
882+ # last event should be Pulling in this case
883+ event = self ._pod_events (pod ).pop ()
884+ # see if pull operation has been happening for over 1 minute
885+ start = datetime .strptime (
886+ event ['firstTimestamp' ],
887+ settings .DEIS_DATETIME_FORMAT
888+ )
889+
890+ seconds = 60
891+ if (start + timedelta (seconds = seconds )) < datetime .utcnow ():
892+ # add 10 minutes to timeout to allow a pull image operation to finish
893+ logger .debug ('Kubernetes has been pulling the image for {} seconds' .format (seconds )) # noqa
894+ logger .debug ('Increasing timeout by 10 minutes to allow a pull image operation to finish for pods in namespace {}' .format (namespace )) # noqa
895+ timeout += (60 * 10 )
896+ timeout_padded = True
897+
864898 # now that state is running time to see if probes are passing
865899 if self ._pod_ready (pod ):
866900 count += 1
@@ -871,6 +905,8 @@ def _get_pod_ready_status(self, namespace, labels, desired):
871905 if waited > 0 and (waited % 10 ) == 0 :
872906 logger .debug ("waited {}s and {} pods are in service" .format (waited , count ))
873907
908+ # increase wait time without dealing with jitters from above code
909+ waited += 1
874910 time .sleep (1 )
875911
876912 logger .debug ("{} out of {} pods in namespace {} are in service" .format (count , desired , namespace )) # noqa
@@ -1335,26 +1371,63 @@ def _pod_log(self, namespace, name):
13351371
13361372 return response
13371373
1374+ def _pod_pending_status (self , pod ):
1375+ """Introspect the pod containers when pod is in Pending state"""
1376+ if 'containerStatuses' not in pod ['status' ]:
1377+ return 'Pending'
1378+
1379+ name = '{}-{}' .format (pod ['metadata' ]['labels' ]['app' ], pod ['metadata' ]['labels' ]['type' ])
1380+ for container in pod ['status' ]['containerStatuses' ]:
1381+ # find the right container in case there are many on the pod
1382+ if container ['name' ] != name :
1383+ continue
1384+
1385+ if 'waiting' in container ['state' ]:
1386+ reason = container ['state' ]['waiting' ]['reason' ]
1387+ if reason == 'ContainerCreating' :
1388+ # get the last event
1389+ event = self ._pod_events (pod ).pop ()
1390+ return event ['reason' ]
1391+
1392+ # Return Pending if nothing else can be found
1393+ return 'Pending'
1394+
1395+ def _pod_events (self , pod ):
1396+ """Process events for a given Pod to find if Pulling is happening, among other events"""
1397+ # fetch all events for this pod
1398+ fields = {
1399+ 'involvedObject.name' : pod ['metadata' ]['name' ],
1400+ 'involvedObject.namespace' : pod ['metadata' ]['namespace' ],
1401+ 'involvedObject.uid' : pod ['metadata' ]['uid' ]
1402+ }
1403+ events = self ._get_namespace_events (pod ['metadata' ]['namespace' ], fields = fields ).json ()
1404+ # make sure that events are sorted
1405+ events ['items' ].sort (key = lambda x : x ['lastTimestamp' ])
1406+ return events ['items' ]
1407+
13381408 def _pod_readiness_status (self , pod ):
13391409 """Check if the pod container have passed the readiness probes"""
13401410 name = '{}-{}' .format (pod ['metadata' ]['labels' ]['app' ], pod ['metadata' ]['labels' ]['type' ])
13411411 for container in pod ['status' ]['containerStatuses' ]:
13421412 # find the right container in case there are many on the pod
1343- if container ['name' ] == name :
1344- if not container ['ready' ]:
1345- if 'running' in container ['state' ].keys ():
1346- return 'Starting'
1347- elif (
1348- 'terminated' in container ['state' ].keys () or
1349- 'deletionTimestamp' in pod ['metadata' ]
1350- ):
1351- return 'Terminating'
1352- else :
1353- # See if k8s is in Terminating state
1354- if 'deletionTimestamp' in pod ['metadata' ]:
1355- return 'Terminating'
1356-
1357- return 'Running'
1413+ if container ['name' ] != name :
1414+ continue
1415+
1416+ if not container ['ready' ]:
1417+ if 'running' in container ['state' ].keys ():
1418+ return 'Starting'
1419+
1420+ if (
1421+ 'terminated' in container ['state' ].keys () or
1422+ 'deletionTimestamp' in pod ['metadata' ]
1423+ ):
1424+ return 'Terminating'
1425+ else :
1426+ # See if k8s is in Terminating state
1427+ if 'deletionTimestamp' in pod ['metadata' ]:
1428+ return 'Terminating'
1429+
1430+ return 'Running'
13581431
13591432 # Seems like the most sensible default
13601433 return 'Unknown'
0 commit comments