Skip to content

Commit 0a541e6

Browse files
committed
Merge pull request #704 from helgi/more_state_info
feat(scheduler): add 10 mins to deploy timeout if image pull is slow
2 parents 01bb7e5 + 8ab1f78 commit 0a541e6

2 files changed

Lines changed: 108 additions & 27 deletions

File tree

rootfs/scheduler/__init__.py

Lines changed: 99 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime
1+
from datetime import datetime, timedelta
22
import json
33
import logging
44
import os
@@ -671,18 +671,22 @@ def resolve_state(self, pod):
671671
'Unknown': JobState.error.name,
672672
}
673673

674+
# being in a Pending state can mean different things, introspecting app container first
675+
if pod['status']['phase'] == 'Pending':
676+
pod_state = self._pod_pending_status(pod)
674677
# being in a running state can mean a pod is starting, actually running or terminating
675-
if pod['status']['phase'] == 'Running':
678+
elif pod['status']['phase'] == 'Running':
676679
# is the readiness probe passing?
677-
container_status = self._pod_readiness_status(pod)
678-
if container_status in ['Starting', 'Terminating']:
679-
return states[container_status]
680-
elif container_status == 'Running' and self._pod_liveness_status(pod):
680+
pod_state = self._pod_readiness_status(pod)
681+
if pod_state in ['Starting', 'Terminating']:
682+
return states[pod_state]
683+
elif pod_state == 'Running' and self._pod_liveness_status(pod):
681684
# is the pod ready to serve requests?
682-
return states[container_status]
685+
return states[pod_state]
686+
else:
687+
# if no match was found for deis mapping then passthrough the real state
688+
pod_state = pod['status']['phase']
683689

684-
# if no match was found for deis mapping then passthrough the real state
685-
pod_state = pod['status']['phase']
686690
return states.get(pod_state, pod_state)
687691

688692
def _get_port(self, image):
@@ -857,10 +861,40 @@ def _get_pod_ready_status(self, namespace, labels, desired):
857861

858862
# Ensure the minimum desired number of pods are available
859863
logger.debug("waiting for {} pods in {} namespace to be in services (120s timeout)".format(desired, namespace)) # noqa
860-
for waited in range(120):
861-
count = 0
864+
waited = 0
865+
timeout = 120 # 2 minutes
866+
timeout_padded = False # has timeout been increased or not
867+
while True:
868+
# timed out, time to bail
869+
if waited > timeout:
870+
logger.debug('timed out waiting for pods to come up in namespace {}'.format(namespace)) # noqa
871+
break
872+
873+
count = 0 # ready pods
862874
pods = self._get_pods(namespace, labels=labels).json()
863875
for pod in pods['items']:
876+
# If pulling an image is taking long then increase the timout
877+
if (
878+
pod['status']['phase'] == 'Pending' and
879+
self._pod_pending_status(pod) == 'Pulling' and
880+
not timeout_padded
881+
):
882+
# last event should be Pulling in this case
883+
event = self._pod_events(pod).pop()
884+
# see if pull operation has been happening for over 1 minute
885+
start = datetime.strptime(
886+
event['firstTimestamp'],
887+
settings.DEIS_DATETIME_FORMAT
888+
)
889+
890+
seconds = 60
891+
if (start + timedelta(seconds=seconds)) < datetime.utcnow():
892+
# add 10 minutes to timeout to allow a pull image operation to finish
893+
logger.debug('Kubernetes has been pulling the image for {} seconds'.format(seconds)) # noqa
894+
logger.debug('Increasing timeout by 10 minutes to allow a pull image operation to finish for pods in namespace {}'.format(namespace)) # noqa
895+
timeout += (60 * 10)
896+
timeout_padded = True
897+
864898
# now that state is running time to see if probes are passing
865899
if self._pod_ready(pod):
866900
count += 1
@@ -871,6 +905,8 @@ def _get_pod_ready_status(self, namespace, labels, desired):
871905
if waited > 0 and (waited % 10) == 0:
872906
logger.debug("waited {}s and {} pods are in service".format(waited, count))
873907

908+
# increase wait time without dealing with jitters from above code
909+
waited += 1
874910
time.sleep(1)
875911

876912
logger.debug("{} out of {} pods in namespace {} are in service".format(count, desired, namespace)) # noqa
@@ -1335,26 +1371,63 @@ def _pod_log(self, namespace, name):
13351371

13361372
return response
13371373

1374+
def _pod_pending_status(self, pod):
1375+
"""Introspect the pod containers when pod is in Pending state"""
1376+
if 'containerStatuses' not in pod['status']:
1377+
return 'Pending'
1378+
1379+
name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])
1380+
for container in pod['status']['containerStatuses']:
1381+
# find the right container in case there are many on the pod
1382+
if container['name'] != name:
1383+
continue
1384+
1385+
if 'waiting' in container['state']:
1386+
reason = container['state']['waiting']['reason']
1387+
if reason == 'ContainerCreating':
1388+
# get the last event
1389+
event = self._pod_events(pod).pop()
1390+
return event['reason']
1391+
1392+
# Return Pending if nothing else can be found
1393+
return 'Pending'
1394+
1395+
def _pod_events(self, pod):
1396+
"""Process events for a given Pod to find if Pulling is happening, among other events"""
1397+
# fetch all events for this pod
1398+
fields = {
1399+
'involvedObject.name': pod['metadata']['name'],
1400+
'involvedObject.namespace': pod['metadata']['namespace'],
1401+
'involvedObject.uid': pod['metadata']['uid']
1402+
}
1403+
events = self._get_namespace_events(pod['metadata']['namespace'], fields=fields).json()
1404+
# make sure that events are sorted
1405+
events['items'].sort(key=lambda x: x['lastTimestamp'])
1406+
return events['items']
1407+
13381408
def _pod_readiness_status(self, pod):
13391409
"""Check if the pod container have passed the readiness probes"""
13401410
name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])
13411411
for container in pod['status']['containerStatuses']:
13421412
# find the right container in case there are many on the pod
1343-
if container['name'] == name:
1344-
if not container['ready']:
1345-
if 'running' in container['state'].keys():
1346-
return 'Starting'
1347-
elif (
1348-
'terminated' in container['state'].keys() or
1349-
'deletionTimestamp' in pod['metadata']
1350-
):
1351-
return 'Terminating'
1352-
else:
1353-
# See if k8s is in Terminating state
1354-
if 'deletionTimestamp' in pod['metadata']:
1355-
return 'Terminating'
1356-
1357-
return 'Running'
1413+
if container['name'] != name:
1414+
continue
1415+
1416+
if not container['ready']:
1417+
if 'running' in container['state'].keys():
1418+
return 'Starting'
1419+
1420+
if (
1421+
'terminated' in container['state'].keys() or
1422+
'deletionTimestamp' in pod['metadata']
1423+
):
1424+
return 'Terminating'
1425+
else:
1426+
# See if k8s is in Terminating state
1427+
if 'deletionTimestamp' in pod['metadata']:
1428+
return 'Terminating'
1429+
1430+
return 'Running'
13581431

13591432
# Seems like the most sensible default
13601433
return 'Unknown'

rootfs/scheduler/mock.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def fetch_all(request, context):
250250

251251

252252
def prepare_query_filters(query):
253-
filters = {'labels': {}}
253+
filters = {'labels': {}, 'fields': {}}
254254
if query:
255255
queries = parse_qs(query)
256256
if 'labelSelector' in queries:
@@ -259,6 +259,12 @@ def prepare_query_filters(query):
259259
key, value = item.split('=')
260260
filters['labels'][key] = value
261261

262+
if 'fieldSelector' in queries:
263+
for items in queries['fieldSelector']:
264+
for item in items.split(','):
265+
key, value = item.split('=')
266+
filters['fields'][key] = value
267+
262268
return filters
263269

264270

@@ -344,6 +350,8 @@ def put(request, context):
344350

345351
if resource_type == 'replicationcontrollers':
346352
data['metadata']['resourceVersion'] += 1
353+
data['metadata']['generation'] += 1
354+
data['status']['observedGeneration'] += 1
347355
upsert_pods(data, url)
348356

349357
# Update the individual resource

0 commit comments

Comments
 (0)