Skip to content

Commit 8ab1f78

Browse files
committed
feat(scheduler): add 10 mins to deploy timeout if image pull is slow
After a minute of pulling the image (considered pretty high) then additionl 10 minutes are added on top. This will help when the cache is cold. Slugrunner and generally big images are being used (1GB or higher generally) Fixes #459
1 parent 01bb7e5 commit 8ab1f78

2 files changed

Lines changed: 108 additions & 27 deletions

File tree

rootfs/scheduler/__init__.py

Lines changed: 99 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime
1+
from datetime import datetime, timedelta
22
import json
33
import logging
44
import os
@@ -671,18 +671,22 @@ def resolve_state(self, pod):
671671
'Unknown': JobState.error.name,
672672
}
673673

674+
# being in a Pending state can mean different things, introspecting app container first
675+
if pod['status']['phase'] == 'Pending':
676+
pod_state = self._pod_pending_status(pod)
674677
# being in a running state can mean a pod is starting, actually running or terminating
675-
if pod['status']['phase'] == 'Running':
678+
elif pod['status']['phase'] == 'Running':
676679
# is the readiness probe passing?
677-
container_status = self._pod_readiness_status(pod)
678-
if container_status in ['Starting', 'Terminating']:
679-
return states[container_status]
680-
elif container_status == 'Running' and self._pod_liveness_status(pod):
680+
pod_state = self._pod_readiness_status(pod)
681+
if pod_state in ['Starting', 'Terminating']:
682+
return states[pod_state]
683+
elif pod_state == 'Running' and self._pod_liveness_status(pod):
681684
# is the pod ready to serve requests?
682-
return states[container_status]
685+
return states[pod_state]
686+
else:
687+
# if no match was found for deis mapping then passthrough the real state
688+
pod_state = pod['status']['phase']
683689

684-
# if no match was found for deis mapping then passthrough the real state
685-
pod_state = pod['status']['phase']
686690
return states.get(pod_state, pod_state)
687691

688692
def _get_port(self, image):
@@ -857,10 +861,40 @@ def _get_pod_ready_status(self, namespace, labels, desired):
857861

858862
# Ensure the minimum desired number of pods are available
859863
logger.debug("waiting for {} pods in {} namespace to be in services (120s timeout)".format(desired, namespace)) # noqa
860-
for waited in range(120):
861-
count = 0
864+
waited = 0
865+
timeout = 120 # 2 minutes
866+
timeout_padded = False # has timeout been increased or not
867+
while True:
868+
# timed out, time to bail
869+
if waited > timeout:
870+
logger.debug('timed out waiting for pods to come up in namespace {}'.format(namespace)) # noqa
871+
break
872+
873+
count = 0 # ready pods
862874
pods = self._get_pods(namespace, labels=labels).json()
863875
for pod in pods['items']:
876+
# If pulling an image is taking long then increase the timout
877+
if (
878+
pod['status']['phase'] == 'Pending' and
879+
self._pod_pending_status(pod) == 'Pulling' and
880+
not timeout_padded
881+
):
882+
# last event should be Pulling in this case
883+
event = self._pod_events(pod).pop()
884+
# see if pull operation has been happening for over 1 minute
885+
start = datetime.strptime(
886+
event['firstTimestamp'],
887+
settings.DEIS_DATETIME_FORMAT
888+
)
889+
890+
seconds = 60
891+
if (start + timedelta(seconds=seconds)) < datetime.utcnow():
892+
# add 10 minutes to timeout to allow a pull image operation to finish
893+
logger.debug('Kubernetes has been pulling the image for {} seconds'.format(seconds)) # noqa
894+
logger.debug('Increasing timeout by 10 minutes to allow a pull image operation to finish for pods in namespace {}'.format(namespace)) # noqa
895+
timeout += (60 * 10)
896+
timeout_padded = True
897+
864898
# now that state is running time to see if probes are passing
865899
if self._pod_ready(pod):
866900
count += 1
@@ -871,6 +905,8 @@ def _get_pod_ready_status(self, namespace, labels, desired):
871905
if waited > 0 and (waited % 10) == 0:
872906
logger.debug("waited {}s and {} pods are in service".format(waited, count))
873907

908+
# increase wait time without dealing with jitters from above code
909+
waited += 1
874910
time.sleep(1)
875911

876912
logger.debug("{} out of {} pods in namespace {} are in service".format(count, desired, namespace)) # noqa
@@ -1335,26 +1371,63 @@ def _pod_log(self, namespace, name):
13351371

13361372
return response
13371373

1374+
def _pod_pending_status(self, pod):
1375+
"""Introspect the pod containers when pod is in Pending state"""
1376+
if 'containerStatuses' not in pod['status']:
1377+
return 'Pending'
1378+
1379+
name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])
1380+
for container in pod['status']['containerStatuses']:
1381+
# find the right container in case there are many on the pod
1382+
if container['name'] != name:
1383+
continue
1384+
1385+
if 'waiting' in container['state']:
1386+
reason = container['state']['waiting']['reason']
1387+
if reason == 'ContainerCreating':
1388+
# get the last event
1389+
event = self._pod_events(pod).pop()
1390+
return event['reason']
1391+
1392+
# Return Pending if nothing else can be found
1393+
return 'Pending'
1394+
1395+
def _pod_events(self, pod):
1396+
"""Process events for a given Pod to find if Pulling is happening, among other events"""
1397+
# fetch all events for this pod
1398+
fields = {
1399+
'involvedObject.name': pod['metadata']['name'],
1400+
'involvedObject.namespace': pod['metadata']['namespace'],
1401+
'involvedObject.uid': pod['metadata']['uid']
1402+
}
1403+
events = self._get_namespace_events(pod['metadata']['namespace'], fields=fields).json()
1404+
# make sure that events are sorted
1405+
events['items'].sort(key=lambda x: x['lastTimestamp'])
1406+
return events['items']
1407+
13381408
def _pod_readiness_status(self, pod):
13391409
"""Check if the pod container have passed the readiness probes"""
13401410
name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])
13411411
for container in pod['status']['containerStatuses']:
13421412
# find the right container in case there are many on the pod
1343-
if container['name'] == name:
1344-
if not container['ready']:
1345-
if 'running' in container['state'].keys():
1346-
return 'Starting'
1347-
elif (
1348-
'terminated' in container['state'].keys() or
1349-
'deletionTimestamp' in pod['metadata']
1350-
):
1351-
return 'Terminating'
1352-
else:
1353-
# See if k8s is in Terminating state
1354-
if 'deletionTimestamp' in pod['metadata']:
1355-
return 'Terminating'
1356-
1357-
return 'Running'
1413+
if container['name'] != name:
1414+
continue
1415+
1416+
if not container['ready']:
1417+
if 'running' in container['state'].keys():
1418+
return 'Starting'
1419+
1420+
if (
1421+
'terminated' in container['state'].keys() or
1422+
'deletionTimestamp' in pod['metadata']
1423+
):
1424+
return 'Terminating'
1425+
else:
1426+
# See if k8s is in Terminating state
1427+
if 'deletionTimestamp' in pod['metadata']:
1428+
return 'Terminating'
1429+
1430+
return 'Running'
13581431

13591432
# Seems like the most sensible default
13601433
return 'Unknown'

rootfs/scheduler/mock.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def fetch_all(request, context):
250250

251251

252252
def prepare_query_filters(query):
253-
filters = {'labels': {}}
253+
filters = {'labels': {}, 'fields': {}}
254254
if query:
255255
queries = parse_qs(query)
256256
if 'labelSelector' in queries:
@@ -259,6 +259,12 @@ def prepare_query_filters(query):
259259
key, value = item.split('=')
260260
filters['labels'][key] = value
261261

262+
if 'fieldSelector' in queries:
263+
for items in queries['fieldSelector']:
264+
for item in items.split(','):
265+
key, value = item.split('=')
266+
filters['fields'][key] = value
267+
262268
return filters
263269

264270

@@ -344,6 +350,8 @@ def put(request, context):
344350

345351
if resource_type == 'replicationcontrollers':
346352
data['metadata']['resourceVersion'] += 1
353+
data['metadata']['generation'] += 1
354+
data['status']['observedGeneration'] += 1
347355
upsert_pods(data, url)
348356

349357
# Update the individual resource

0 commit comments

Comments
 (0)