Skip to content

Commit 9c2d584

Browse files
rvadim authored and Matthew Fisher committed
Handle cases with namespace quotas limits sets (#1182)
* feat(resourcequota): Handle cases where namespace quota limits are set. Handles 2 cases:
  * User did not set a quota for applications or set it incorrectly
  * User tries to scale the application when limits are already exceeded (overuse)
* Don't raise RuntimeError if there are no events in the namespace
* Add test for the _handle_not_ready_pods function
* Fix indents required by flake8
* Use ReplicaSet events to handle quota-exceeded cases
* Don't wait for pods to start if there are failed events in the ReplicaSet
1 parent 8c8b5ab commit 9c2d584

5 files changed

Lines changed: 111 additions & 4 deletions

File tree

rootfs/scheduler/mock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ def filter_data(filters, path):
551551
continue
552552

553553
# check if item has labels
554-
if 'labels' not in item['metadata']:
554+
if 'labels' not in item['metadata'] and item['kind'] != 'Event':
555555
continue
556556

557557
# Do extra filtering based on labelSelector

rootfs/scheduler/resources/deployment.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,12 @@ def in_progress(self, namespace, name, timeout, batches, replicas, tags):
237237
self.log(namespace, 'Deploy operation for Deployment {} in has expired. Rolling back to last good known release'.format(name), level='DEBUG') # noqa
238238
return False, True
239239

240+
try:
241+
self._check_for_failed_events(namespace, labels=labels)
242+
except KubeException as e:
243+
self.log(namespace, e)
244+
return False, True
245+
240246
return True, False
241247

242248
def are_replicas_ready(self, namespace, name):
@@ -326,6 +332,9 @@ def wait_until_ready(self, namespace, name, **kwargs):
326332
timeout = len(batches) * deploy_timeout
327333
self.log(namespace, 'This deployments overall timeout is {}s - batch timout is {}s and there are {} batches to deploy with a total of {} pods'.format(timeout, deploy_timeout, len(batches), replicas)) # noqa
328334

335+
# check for failed events(when quota exceeded for example)
336+
self._check_for_failed_events(namespace, labels=labels)
337+
329338
waited = 0
330339
while waited < timeout:
331340
ready, availablePods = self.are_replicas_ready(namespace, name)
@@ -352,6 +361,39 @@ def wait_until_ready(self, namespace, name, **kwargs):
352361
if not ready:
353362
self.pod._handle_not_ready_pods(namespace, labels)
354363

364+
def _check_for_failed_events(self, namespace, labels):
    """
    Request the ReplicaSet of a Deployment and search for failed events involving that RS

    Does nothing when no ReplicaSet matches the labels or when none of the
    events reported for the ReplicaSet has the FailedCreate reason.

    Raises: KubeException when the RS has events with FailedCreate reason
    """
    response = self.rs.get(namespace, labels=labels)
    replicasets = response.json().get('items') or []
    if not replicasets:
        # No ReplicaSet matched the labels, so there are no events
        # to inspect and nothing that can be reported as failed
        return

    # NOTE(review): the first item is assumed to be the ReplicaSet of the
    # current rollout - confirm the ordering returned by self.rs.get
    metadata = replicasets[0]['metadata']
    fields = {
        'involvedObject.kind': 'ReplicaSet',
        'involvedObject.name': metadata['name'],
        'involvedObject.namespace': namespace,
        'involvedObject.uid': metadata['uid'],
    }
    events = self.ns.events(namespace, fields=fields).json().get('items') or []
    if any(event.get('reason') == 'FailedCreate' for event in events):
        # every event of the RS is included in the message, not only the failed ones
        log = self._get_formatted_messages(events)
        self.log(namespace, log)
        raise KubeException(log)
385+
386+
@staticmethod
def _get_formatted_messages(events):
    """
    Format each event as a string and join all events into one string

    Every event contributes one line built from its message, lastTimestamp,
    reason and count keys. A key that is absent from an event is rendered
    as 'n/a' instead of raising KeyError, so an incomplete event can not
    hide the failure that is being reported. An empty list gives an empty
    string.
    """
    message_format = 'Message:{message}, lastTimestamp:{lastTimestamp}, reason: {reason}, count: {count}' # noqa
    keys = ('message', 'lastTimestamp', 'reason', 'count')
    return '\n'.join(
        message_format.format(**{key: event.get(key, 'n/a') for key in keys})
        for event in events
    )
396+
355397
def _get_deploy_steps(self, batches, tags):
356398
# if there is no batch information available default to available nodes for app
357399
if not batches:
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from scheduler.exceptions import KubeHTTPException
2+
from scheduler.resources import Resource
3+
from datetime import datetime
4+
import uuid
5+
6+
DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
7+
8+
9+
class Events(Resource):
    """
    Events resource.
    Warning! Used ONLY for testing purposes
    """
    short_name = 'ev'

    def create(self, namespace, name, message, **kwargs):
        """
        Create an Event object in the given namespace

        Optional keyword arguments: count (default 1), resourceVersion,
        type (default 'Normal'), reason, component and involvedObject.

        Returns the HTTP response of the create request.
        Raises: KubeHTTPException when the response status code is not 201
        """
        # local import so the module level imports stay untouched
        from datetime import timezone

        # DATETIME_FORMAT ends in a literal 'Z' (UTC), so the clock has to be
        # read in UTC; read it once so that all timestamps of the event agree
        timestamp = datetime.now(timezone.utc).strftime(DATETIME_FORMAT)

        url = self.api('/namespaces/{}/events'.format(namespace))
        data = {
            'kind': 'Event',
            'apiVersion': 'v1',
            'count': kwargs.get('count', 1),
            'metadata': {
                'creationTimestamp': timestamp,
                'namespace': namespace,
                'name': name,
                'resourceVersion': kwargs.get('resourceVersion', ''),
                'uid': str(uuid.uuid4()),
            },
            'message': message,
            'type': kwargs.get('type', 'Normal'),
            'firstTimestamp': timestamp,
            'lastTimestamp': timestamp,
            'reason': kwargs.get('reason', ''),
            'source': {
                'component': kwargs.get('component', ''),
            },
            'involvedObject': kwargs.get('involvedObject', {})
        }

        response = self.http_post(url, json=data)
        if response.status_code != 201:
            raise KubeHTTPException(response, 'create Event for namespace {}'.format(namespace)) # noqa

        return response

rootfs/scheduler/resources/pod.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -719,8 +719,6 @@ def _handle_not_ready_pods(self, namespace, labels):
719719
message = "\n".join([x.strip() for x in event['message'].split("\n")])
720720
raise KubeException(message)
721721

722-
return None
723-
724722
def deploy_probe_timeout(self, timeout, namespace, labels, containers):
725723
"""
726724
Added in additional timeouts based on readiness and liveness probe

rootfs/scheduler/tests/test_deployments.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
44
Run the tests with './manage.py test scheduler'
55
"""
6-
from scheduler import KubeHTTPException
6+
from scheduler import KubeHTTPException, KubeException
77
from scheduler.tests import TestCase
88
from scheduler.utils import generate_random_name
99

@@ -240,3 +240,26 @@ def test_get_deployment_replicaset(self):
240240
data['metadata']['labels'],
241241
data
242242
)
243+
244+
def test_check_for_failed_events(self):
    """
    A FailedCreate event involving the ReplicaSet of a Deployment must make
    _check_for_failed_events raise KubeException carrying the event message
    """
    deploy_name = self.create(self.namespace)
    deployment = self.scheduler.deployment.get(self.namespace, deploy_name).json()
    labels = deployment['metadata']['labels']
    rs = self.scheduler.rs.get(self.namespace, labels=labels).json()
    rs_metadata = rs['items'][0]['metadata']

    # Reference to the ReplicaSet as it is stored on the Event object itself;
    # the dotted 'involvedObject.*' form is only used when filtering events
    involved_object = {
        'kind': 'ReplicaSet',
        'name': rs_metadata['name'],
        'namespace': self.namespace,
        'uid': rs_metadata['uid'],
    }
    message = 'Quota exeeded'
    # Events.create reads the reference from the 'involvedObject' keyword
    self.scheduler.ev.create(self.namespace,
                             '{}'.format(generate_random_name()),
                             message,
                             type='Warning',
                             involvedObject=involved_object,
                             reason='FailedCreate')
    with self.assertRaisesRegex(KubeException,
                                'Message:{}.*'.format(message)):
        self.scheduler.deployment._check_for_failed_events(self.namespace,
                                                           labels=labels)

0 commit comments

Comments
 (0)