Skip to content

Commit 8686b8b

Browse files
committed
Merge pull request #242 from helgi/healthchecks_k8s
feat(healthcheck): use k8s liveness and readiness probes for app healthchecks
2 parents 15df775 + 15c7c3a commit 8686b8b

4 files changed

Lines changed: 106 additions & 232 deletions

File tree

docs/src/using-deis/configuring-an-application.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ appname to the old one:
7878
## Custom Health Checks
7979

8080
By default, Deis only checks that a container is running. You can add a healthcheck by configuring a
81-
URL, initial delay, and timeout value:
81+
URL, port, initial delay, and timeout value:
8282

8383
$ deis config:set HEALTHCHECK_URL=/200.html
8484
=== peachy-waxworks
@@ -92,12 +92,19 @@ URL, initial delay, and timeout value:
9292
HEALTHCHECK_TIMEOUT: 5
9393
HEALTHCHECK_INITIAL_DELAY: 5
9494
HEALTHCHECK_URL: /200.html
95+
$ deis config:set HEALTHCHECK_PORT=5000
96+
=== peachy-waxworks
97+
HEALTHCHECK_TIMEOUT: 5
98+
HEALTHCHECK_INITIAL_DELAY: 5
99+
HEALTHCHECK_URL: /200.html
100+
HEALTHCHECK_PORT: 5000
95101

96102
If a new release does not pass the healthcheck, the application will be rolled back to the previous
97103
release. Beyond that, if an application container responds to a heartbeat check with a different
98-
status than a 200 OK, the [router][] will mark that container as down and stop sending
104+
status than a 200-399, the [router][] will mark that container as down and stop sending
99105
requests to that container.
100106

107+
The health checks are provided by [kubernetes health checks][]. Currently only HTTP GET is supported.
101108

102109
## Track Changes
103110

@@ -135,3 +142,4 @@ Use `deis rollback` to revert to a previous release.
135142
[stores config in environment variables]: http://12factor.net/config
136143
[release]: ../reference-guide/terms.md#release
137144
[router]: ../understanding-deis/components.md#router
145+
[kubernetes health checks]: http://kubernetes.io/v1.1/docs/user-guide/pod-states.html#container-probes

rootfs/api/models.py

Lines changed: 38 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import importlib
1212
import logging
1313
import re
14-
import time
1514
import uuid
1615
import morph
1716
from threading import Thread
@@ -29,7 +28,7 @@
2928
import requests
3029
from rest_framework.authtoken.models import Token
3130

32-
from api import utils, exceptions
31+
from api import utils
3332
from registry import publish_release
3433
from utils import dict_diff, dict_merge, fingerprint
3534

@@ -322,13 +321,18 @@ def _scale_containers(self, scale_types, to_remove):
322321
for scale_type in scale_types:
323322
image = release.image
324323
version = "v{}".format(release.version)
325-
kwargs = {'memory': release.config.memory,
326-
'cpu': release.config.cpu,
327-
'tags': release.config.tags,
328-
'envs': release.config.values,
329-
'version': version,
330-
'aname': self.id,
331-
'num': scale_types[scale_type]}
324+
kwargs = {
325+
'memory': release.config.memory,
326+
'cpu': release.config.cpu,
327+
'tags': release.config.tags,
328+
'envs': release.config.values,
329+
'version': version,
330+
'aname': self.id,
331+
'num': scale_types[scale_type],
332+
'app_type': scale_type,
333+
'healthcheck': release.config.healthcheck()
334+
}
335+
332336
job_id = self._get_job_id(scale_type)
333337
command = self._get_command(scale_type)
334338
try:
@@ -343,6 +347,7 @@ def _scale_containers(self, scale_types, to_remove):
343347
# scheduler.scale creates the required service on apps:create
344348
if not Domain.objects.filter(owner=self.owner, app=self, domain=self).exists():
345349
Domain(owner=self.owner, app=self, domain=str(self)).save()
350+
346351
except Exception as e:
347352
err = '{} (scale): {}'.format(job_id, e)
348353
log_event(self, err, logging.ERROR)
@@ -367,59 +372,6 @@ def _start_containers(self, to_add):
367372
if set([c.state for c in to_add]) != set(['up']):
368373
err = 'warning, some containers failed to start'
369374
log_event(self, err, logging.WARNING)
370-
# if the user specified a health check, try checking to see if it's running
371-
try:
372-
config = self.config_set.latest()
373-
if 'HEALTHCHECK_URL' in config.values.keys():
374-
self._healthcheck(to_add, config.values)
375-
except Config.DoesNotExist:
376-
pass
377-
378-
def _healthcheck(self, containers, config):
379-
# if at first it fails, back off and try again at 10%, 50% and 100% of INITIAL_DELAY
380-
intervals = [1.0, 0.1, 0.5, 1.0]
381-
# HACK (bacongobbler): we need to wait until publisher has a chance to publish each
382-
# service to etcd, which can take up to 20 seconds.
383-
time.sleep(20)
384-
for i in xrange(len(intervals)):
385-
delay = int(config.get('HEALTHCHECK_INITIAL_DELAY', 0))
386-
try:
387-
# sleep until the initial timeout is over
388-
if delay > 0:
389-
time.sleep(delay * intervals[i])
390-
to_healthcheck = [c for c in containers if c.type in ['web', 'cmd']]
391-
self._do_healthcheck(to_healthcheck, config)
392-
break
393-
except exceptions.HealthcheckException as e:
394-
try:
395-
next_delay = delay * intervals[i+1]
396-
msg = "{}; trying again in {} seconds".format(e, next_delay)
397-
log_event(self, msg, logging.WARNING)
398-
except IndexError:
399-
log_event(self, e, logging.WARNING)
400-
else:
401-
self._destroy_containers(containers)
402-
msg = "aborting, app containers failed to respond to health check"
403-
log_event(self, msg, logging.ERROR)
404-
raise RuntimeError(msg)
405-
406-
def _do_healthcheck(self, containers, config):
407-
path = config.get('HEALTHCHECK_URL', '/')
408-
timeout = int(config.get('HEALTHCHECK_TIMEOUT', 1))
409-
if not _etcd_client:
410-
raise exceptions.HealthcheckException('no etcd client available')
411-
for container in containers:
412-
try:
413-
key = "/deis/services/{self}/{container.job_id}".format(**locals())
414-
url = "http://{}{}".format(_etcd_client.get(key).value, path)
415-
response = requests.get(url, timeout=timeout)
416-
if response.status_code != requests.codes.OK:
417-
raise exceptions.HealthcheckException(
418-
"app failed health check (got '{}', expected: '200')".format(
419-
response.status_code))
420-
except (requests.Timeout, requests.ConnectionError, KeyError) as e:
421-
raise exceptions.HealthcheckException(
422-
'failed to connect to container ({})'.format(e))
423375

424376
def _restart_containers(self, to_restart):
425377
"""Restarts containers via the scheduler"""
@@ -479,13 +431,18 @@ def _deploy_app(self, scale_types, release, existing):
479431
for scale_type in scale_types:
480432
image = release.image
481433
version = "v{}".format(release.version)
482-
kwargs = {'memory': release.config.memory,
483-
'cpu': release.config.cpu,
484-
'tags': release.config.tags,
485-
'envs': release.config.values,
486-
'aname': self.id,
487-
'num': 0,
488-
'version': version}
434+
kwargs = {
435+
'memory': release.config.memory,
436+
'cpu': release.config.cpu,
437+
'tags': release.config.tags,
438+
'envs': release.config.values,
439+
'aname': self.id,
440+
'num': 0,
441+
'version': version,
442+
'app_type': scale_type,
443+
'healthcheck': release.config.healthcheck()
444+
}
445+
489446
job_id = self._get_job_id(scale_type)
490447
command = self._get_command(scale_type)
491448
try:
@@ -793,6 +750,15 @@ class Meta:
793750
def __str__(self):
794751
return "{}-{}".format(self.app.id, str(self.uuid)[:7])
795752

753+
def healthcheck(self):
754+
# Update healthcheck - Scheduler determines the app type
755+
path = self.values.get('HEALTHCHECK_URL', '/')
756+
timeout = int(self.values.get('HEALTHCHECK_TIMEOUT', 1))
757+
delay = int(self.values.get('HEALTHCHECK_INITIAL_DELAY', 10))
758+
port = int(self.values.get('HEALTHCHECK_PORT', 8080))
759+
760+
return {'path': path, 'timeout': timeout, 'delay': delay, 'port': port}
761+
796762
def save(self, **kwargs):
797763
"""merge the old config with the new"""
798764
try:
@@ -804,16 +770,19 @@ def save(self, **kwargs):
804770
data = getattr(previous_config, attr).copy()
805771
except AttributeError:
806772
data = {}
773+
807774
try:
808775
new_data = getattr(self, attr).copy()
809776
except AttributeError:
810777
new_data = {}
778+
811779
data.update(new_data)
812780
# remove config keys if we provided a null value
813781
[data.pop(k) for k, v in new_data.viewitems() if v is None]
814782
setattr(self, attr, data)
815783
except Config.DoesNotExist:
816784
pass
785+
817786
return super(Config, self).save(**kwargs)
818787

819788

rootfs/api/tests/test_config.py

Lines changed: 0 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,13 @@
88
from __future__ import unicode_literals
99

1010
import json
11-
import logging
1211
import requests
1312

1413
from django.contrib.auth.models import User
1514
from django.test import TransactionTestCase
16-
import etcd
1715
import mock
1816
from rest_framework.authtoken.models import Token
1917

20-
import api.exceptions
2118
from api.models import App, Config
2219
from . import mock_status_ok
2320

@@ -33,19 +30,6 @@ def mock_request_connection_error(*args, **kwargs):
3330
raise requests.exceptions.ConnectionError("connection error")
3431

3532

36-
class MockEtcdClient:
37-
38-
def __init__(self, app):
39-
self.app = app
40-
41-
def get(self, key, *args, **kwargs):
42-
node = {
43-
'key': '/deis/services/{}/{}_v2.web.1'.format(self.app, self.app),
44-
'value': '127.0.0.1:1234'
45-
}
46-
return etcd.EtcdResult(None, node)
47-
48-
4933
@mock.patch('api.models.publish_release', lambda *args: None)
5034
class ConfigTest(TransactionTestCase):
5135

@@ -570,142 +554,3 @@ def test_unauthorized_user_cannot_modify_config(self):
570554
response = self.client.post(url, json.dumps(body), content_type='application/json',
571555
HTTP_AUTHORIZATION='token {}'.format(unauthorized_token))
572556
self.assertEqual(response.status_code, 403)
573-
574-
def _test_app_healthcheck(self):
575-
# post a new build, expecting it to pass as usual
576-
url = "/v2/apps/{self.app}/builds".format(**locals())
577-
body = {'image': 'autotest/example'}
578-
response = self.client.post(url, json.dumps(body), content_type='application/json',
579-
HTTP_AUTHORIZATION='token {}'.format(self.token))
580-
self.assertEqual(response.status_code, 201)
581-
# mock out the etcd client
582-
api.models._etcd_client = MockEtcdClient(self.app)
583-
# set an initial healthcheck url.
584-
url = "/v2/apps/{self.app}/config".format(**locals())
585-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
586-
return self.client.post(url, json.dumps(body), content_type='application/json',
587-
HTTP_AUTHORIZATION='token {}'.format(self.token))
588-
589-
@mock.patch('requests.get', mock_status_ok)
590-
@mock.patch('time.sleep', lambda func: func)
591-
def test_app_healthcheck_good(self):
592-
"""
593-
If a user deploys an app with a config value set for HEALTHCHECK_URL, the controller
594-
should check that the application responds with a 200 OK.
595-
"""
596-
response = self._test_app_healthcheck()
597-
self.assertEqual(response.status_code, 201)
598-
self.assertEqual(self.app.release_set.latest().version, 3)
599-
600-
@mock.patch('requests.get', mock_status_not_found)
601-
@mock.patch('api.models.get_etcd_client', lambda func: func)
602-
@mock.patch('time.sleep', lambda func: func)
603-
@mock.patch('api.models.logger')
604-
def test_app_healthcheck_bad(self, mock_logger):
605-
"""
606-
If a user deploys an app with a config value set for HEALTHCHECK_URL, the controller
607-
should check that the application responds with a 200 OK. If it's down, the app should be
608-
rolled back.
609-
"""
610-
response = self._test_app_healthcheck()
611-
self.assertEqual(response.status_code, 503)
612-
self.assertEqual(
613-
response.data,
614-
{'detail': 'aborting, app containers failed to respond to health check'})
615-
# check that only the build and initial release exist
616-
self.assertEqual(self.app.release_set.latest().version, 2)
617-
# assert that the reason why the containers failed was because
618-
# they failed the health check 4 times; we do this by looking
619-
# at logs-- there may be a better way
620-
exp_msg = "{}: app failed health check (got '404', expected: '200'); trying again in 0.0 \
621-
seconds".format(self.app.id)
622-
exp_log_call = mock.call(logging.WARNING, exp_msg)
623-
log_calls = mock_logger.log.mock_calls
624-
self.assertEqual(log_calls.count(exp_log_call), 3)
625-
exp_msg = "{}: app failed health check (got '404', expected: '200')".format(self.app.id)
626-
exp_log_call = mock.call(logging.WARNING, exp_msg)
627-
self.assertEqual(log_calls.count(exp_log_call), 1)
628-
629-
@mock.patch('requests.get', mock_status_not_found)
630-
@mock.patch('api.models.get_etcd_client', lambda func: func)
631-
@mock.patch('time.sleep')
632-
def test_app_backoff_interval(self, mock_time):
633-
"""
634-
Ensure that when a healthcheck fails, a backoff strategy is used before trying again.
635-
"""
636-
# post a new build, expecting it to pass as usual
637-
url = "/v2/apps/{self.app}/builds".format(**locals())
638-
body = {'image': 'autotest/example'}
639-
response = self.client.post(url, json.dumps(body), content_type='application/json',
640-
HTTP_AUTHORIZATION='token {}'.format(self.token))
641-
self.assertEqual(response.status_code, 201)
642-
# mock out the etcd client
643-
api.models._etcd_client = MockEtcdClient(self.app)
644-
# set an initial healthcheck url.
645-
url = "/v2/apps/{self.app}/config".format(**locals())
646-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
647-
return self.client.post(url, json.dumps(body), content_type='application/json',
648-
HTTP_AUTHORIZATION='token {}'.format(self.token))
649-
self.assertEqual(mock_time.call_count, 5)
650-
651-
@mock.patch('requests.get', mock_status_ok)
652-
@mock.patch('time.sleep')
653-
def test_app_healthcheck_initial_delay(self, mock_time):
654-
"""
655-
Ensure that when an initial delay is set, the request will sleep for x seconds, where
656-
x is the number of seconds in the initial timeout.
657-
"""
658-
# post a new build, expecting it to pass as usual
659-
url = "/v2/apps/{self.app}/builds".format(**locals())
660-
body = {'image': 'autotest/example'}
661-
response = self.client.post(url, json.dumps(body), content_type='application/json',
662-
HTTP_AUTHORIZATION='token {}'.format(self.token))
663-
self.assertEqual(response.status_code, 201)
664-
# mock out the etcd client
665-
api.models._etcd_client = MockEtcdClient(self.app)
666-
# set an initial healthcheck url.
667-
url = "/v2/apps/{self.app}/config".format(**locals())
668-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
669-
return self.client.post(url, json.dumps(body), content_type='application/json',
670-
HTTP_AUTHORIZATION='token {}'.format(self.token))
671-
# mock_time increments by one each time its called, so we should expect 2 calls to
672-
# mock_time; one for the call in the code, and one for this invocation.
673-
mock_time.assert_called_with(0)
674-
app = App.objects.all()[0]
675-
url = "/v2/apps/{app}/config".format(**locals())
676-
body = {'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': 10})}
677-
self.client.post(url, json.dumps(body), content_type='application/json',
678-
HTTP_AUTHORIZATION='token {}'.format(self.token))
679-
mock_time.assert_called_with(10)
680-
681-
@mock.patch('requests.get')
682-
@mock.patch('time.sleep', lambda func: func)
683-
def test_app_healthcheck_timeout(self, mock_request):
684-
"""
685-
Ensure when a timeout value is set, the controller respects that value
686-
when making a request.
687-
"""
688-
self._test_app_healthcheck()
689-
app = App.objects.all()[0]
690-
url = "/v2/apps/{app}/config".format(**locals())
691-
body = {'values': json.dumps({'HEALTHCHECK_TIMEOUT': 10})}
692-
self.client.post(url, json.dumps(body), content_type='application/json',
693-
HTTP_AUTHORIZATION='token {}'.format(self.token))
694-
mock_request.assert_called_with('http://127.0.0.1:1234/', timeout=10)
695-
696-
@mock.patch('requests.get', mock_request_connection_error)
697-
@mock.patch('time.sleep', lambda func: func)
698-
def test_app_healthcheck_connection_error(self):
699-
"""
700-
If a user deploys an app with a config value set for HEALTHCHECK_URL but the app
701-
returns a connection error, the controller should continue checking until either the app
702-
responds or the app fails to respond within the timeout.
703-
704-
NOTE (bacongobbler): the Docker userland proxy listens for connections and returns a
705-
ConnectionError, hence the unit test.
706-
"""
707-
response = self._test_app_healthcheck()
708-
self.assertEqual(response.status_code, 503)
709-
self.assertEqual(
710-
response.data,
711-
{'detail': 'aborting, app containers failed to respond to health check'})

0 commit comments

Comments
 (0)