Skip to content

Commit 6cfeba0

Browse files
authored
fix(scheduler): when user asks for limits beyond their allowance then error out faster (#975)
This detects errors from the pod's event stream, for example:

Events:
  FirstSeen  LastSeen  Count  From                  SubobjectPath  Type     Reason            Message
  ---------  --------  -----  ----                  -------------  ----     ------            -------
  4m         1m        2      {default-scheduler }                 Warning  FailedScheduling  pod (gaslit-joyrider-cmd-2657344266-tv41n) failed to fit in any node
fit failure on node (ip-172-20-0-109.us-west-2.compute.internal): Node didn't have enough resource: CPU, requested: 2000000, used: 520, capacity: 2000
1 parent 7f19c48 commit 6cfeba0

2 files changed

Lines changed: 32 additions & 18 deletions

File tree

rootfs/scheduler/__init__.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ def deploy(self, namespace, name, image, entrypoint, command, **kwargs): # noqa
127127

128128
raise KubeException(
129129
'There was a problem while deploying {} of {}-{}. '
130-
'Going back to the previous release'.format(version, namespace, app_type)
130+
'Going back to the previous release. '
131+
"Additional information:\n{}".format(version, namespace, app_type, str(e))
131132
) from e
132133

133134
# Make sure the application is routable and uses the correct port
@@ -942,7 +943,7 @@ def _wait_until_pods_are_ready(self, namespace, containers, labels, desired, tim
942943
timeout += self._handle_pod_long_image_pulling(pod, reason)
943944

944945
# handle errors and bubble up if need be
945-
self._handle_pod_image_errors(pod, reason, message)
946+
self._handle_pod_errors(pod, reason, message)
946947

947948
# now that state is running time to see if probes are passing
948949
if self._pod_ready(pod):
@@ -1498,39 +1499,49 @@ def pod_deleted(self, pod):
14981499

14991500
return False
15001501

1501-
def _handle_pod_image_errors(self, pod, reason, message):
1502+
def _handle_pod_errors(self, pod, reason, message):
15021503
"""
1503-
Handle potential pod image errors based on the Pending
1504+
Handle potential pod errors based on the Pending
15041505
reason passed into the function
1506+
1507+
Images, FailedScheduling and others are needed
15051508
"""
15061509
# image error reported on the container level
1507-
image_container_errors = [
1510+
container_errors = [
1511+
'Pending', # often an indication of deeper inspection is needed
15081512
'ErrImagePull',
15091513
'ImagePullBackOff',
15101514
'RegistryUnavailable',
15111515
'ErrImageInspect',
15121516
]
15131517
# Image event reason mapping
1514-
image_event_errors = {
1518+
event_errors = {
15151519
"Failed": "FailedToPullImage",
15161520
"InspectFailed": "FailedToInspectImage",
15171521
"ErrImageNeverPull": "ErrImageNeverPullPolicy",
15181522
# Not including this one for now as the message is not useful
15191523
# "BackOff": "BackOffPullImage",
1524+
# FailedScheduling relates limits
1525+
"FailedScheduling": "FailedScheduling",
15201526
}
1521-
if reason in image_container_errors:
1522-
# Nicer error than from the event
1523-
# Often this gets to ImageBullBackOff before we can introspect tho
1524-
if reason == 'ErrImagePull':
1525-
raise KubeException(message)
1526-
1527-
# collect all error messages relevant to images
1528-
messages = []
1527+
1528+
# Nicer error than from the event
1529+
# Often this gets to ImageBullBackOff before we can introspect tho
1530+
if reason == 'ErrImagePull':
1531+
raise KubeException(message)
1532+
1533+
# collect all error messages of worth
1534+
messages = []
1535+
if reason in container_errors:
15291536
for event in self._pod_events(pod):
1530-
if event['reason'] in image_event_errors.keys():
1531-
# remove new lines and any extra white space
1532-
message = ' '.join(event['message'].split())
1537+
if event['reason'] in event_errors.keys():
1538+
# only show a given error once
1539+
event_errors.pop(event['reason'])
1540+
# strip out whitespaces on either side
1541+
message = "\n".join([x.strip() for x in event['message'].split("\n")])
15331542
messages.append(message)
1543+
1544+
if messages:
15341545
raise KubeException("\n".join(messages))
15351546

15361547
def _handle_pod_long_image_pulling(self, reason, pod):
@@ -1757,7 +1768,7 @@ def _wait_until_deployment_is_ready(self, namespace, name, **kwargs):
17571768
timeout += self._handle_pod_long_image_pulling(pod, reason)
17581769

17591770
# handle errors and bubble up if need be
1760-
self._handle_pod_image_errors(pod, reason, message)
1771+
self._handle_pod_errors(pod, reason, message)
17611772

17621773
self.log(namespace, "waited {}s and {} pods are in service".format(waited, availablePods)) # noqa
17631774

rootfs/scheduler/mock.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import string
99
import time
1010
from urllib.parse import urlparse, parse_qs
11+
import uuid
1112
from zlib import adler32
1213

1314
from . import KubeHTTPClient, KubeHTTPException
@@ -253,6 +254,7 @@ def create_pods(url, labels, base, new_pods):
253254
# creation time
254255
timestamp = str(datetime.utcnow().strftime(settings.DEIS_DATETIME_FORMAT))
255256
data['metadata']['creationTimestamp'] = timestamp
257+
data['metadata']['uid'] = str(uuid.uuid4())
256258

257259
# generate the pod name and combine with RC name
258260
if 'generateName' in data['metadata']:
@@ -573,6 +575,7 @@ def post(request, context):
573575
timestamp = str(datetime.utcnow().strftime(settings.DEIS_DATETIME_FORMAT))
574576
data['metadata']['creationTimestamp'] = timestamp
575577
data['metadata']['resourceVersion'] = 1
578+
data['metadata']['uid'] = str(uuid.uuid4())
576579

577580
# don't bother adding it to those two resources since they live outside namespace
578581
if resource_type not in ['nodes', 'namespaces']:

0 commit comments

Comments
 (0)