Skip to content

Commit 15c7c3a

Browse files
committed
feat(healthcheck): use k8s liveness and readiness probes for app healthchecks
Use k8s liveness and readiness probes (HTTP GET only for now) to replace the defunct publisher healthcheck system. For now those two are conflated into one to support the current documented deis healthcheck workflow and an assumption of port 80 is in the code, as well as HTTP (no ssl). All healthcheck related tests were axed as they are no longer applicable and in fact not possible to test until there is a more robust k8s testing strategy for workflow. http://kubernetes.io/v1.1/docs/user-guide/pod-states.html#container-probes has more information on what probes can do and what's available Fixes #76
1 parent 15df775 commit 15c7c3a

4 files changed

Lines changed: 106 additions & 232 deletions

File tree

docs/src/using-deis/configuring-an-application.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ appname to the old one:
7878
## Custom Health Checks
7979

8080
By default, Deis only checks that a container is running. You can add a healthcheck by configuring a
81-
URL, initial delay, and timeout value:
81+
URL, port, initial delay, and timeout value:
8282

8383
$ deis config:set HEALTHCHECK_URL=/200.html
8484
=== peachy-waxworks
@@ -92,12 +92,19 @@ URL, initial delay, and timeout value:
9292
HEALTHCHECK_TIMEOUT: 5
9393
HEALTHCHECK_INITIAL_DELAY: 5
9494
HEALTHCHECK_URL: /200.html
95+
$ deis config:set HEALTHCHECK_PORT=5000
96+
=== peachy-waxworks
97+
HEALTHCHECK_TIMEOUT: 5
98+
HEALTHCHECK_INITIAL_DELAY: 5
99+
HEALTHCHECK_URL: /200.html
100+
HEALTHCHECK_PORT: 5000
95101

96102
If a new release does not pass the healthcheck, the application will be rolled back to the previous
97103
release. Beyond that, if an application container responds to a heartbeat check with a different
98-
status than a 200 OK, the [router][] will mark that container as down and stop sending
104+
status than a 200-399, the [router][] will mark that container as down and stop sending
99105
requests to that container.
100106

107+
The health checks are provided by [kubernetes health checks][]. Currently only HTTP GET is supported.
101108

102109
## Track Changes
103110

@@ -135,3 +142,4 @@ Use `deis rollback` to revert to a previous release.
135142
[stores config in environment variables]: http://12factor.net/config
136143
[release]: ../reference-guide/terms.md#release
137144
[router]: ../understanding-deis/components.md#router
145+
[kubernetes health checks]: http://kubernetes.io/v1.1/docs/user-guide/pod-states.html#container-probes

rootfs/api/models.py

Lines changed: 38 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import importlib
1212
import logging
1313
import re
14-
import time
1514
import uuid
1615
import morph
1716
from threading import Thread
@@ -29,7 +28,7 @@
2928
import requests
3029
from rest_framework.authtoken.models import Token
3130

32-
from api import utils, exceptions
31+
from api import utils
3332
from registry import publish_release
3433
from utils import dict_diff, dict_merge, fingerprint
3534

@@ -322,13 +321,18 @@ def _scale_containers(self, scale_types, to_remove):
322321
for scale_type in scale_types:
323322
image = release.image
324323
version = "v{}".format(release.version)
325-
kwargs = {'memory': release.config.memory,
326-
'cpu': release.config.cpu,
327-
'tags': release.config.tags,
328-
'envs': release.config.values,
329-
'version': version,
330-
'aname': self.id,
331-
'num': scale_types[scale_type]}
324+
kwargs = {
325+
'memory': release.config.memory,
326+
'cpu': release.config.cpu,
327+
'tags': release.config.tags,
328+
'envs': release.config.values,
329+
'version': version,
330+
'aname': self.id,
331+
'num': scale_types[scale_type],
332+
'app_type': scale_type,
333+
'healthcheck': release.config.healthcheck()
334+
}
335+
332336
job_id = self._get_job_id(scale_type)
333337
command = self._get_command(scale_type)
334338
try:
@@ -343,6 +347,7 @@ def _scale_containers(self, scale_types, to_remove):
343347
# scheduler.scale creates the required service on apps:create
344348
if not Domain.objects.filter(owner=self.owner, app=self, domain=self).exists():
345349
Domain(owner=self.owner, app=self, domain=str(self)).save()
350+
346351
except Exception as e:
347352
err = '{} (scale): {}'.format(job_id, e)
348353
log_event(self, err, logging.ERROR)
@@ -367,59 +372,6 @@ def _start_containers(self, to_add):
367372
if set([c.state for c in to_add]) != set(['up']):
368373
err = 'warning, some containers failed to start'
369374
log_event(self, err, logging.WARNING)
370-
# if the user specified a health check, try checking to see if it's running
371-
try:
372-
config = self.config_set.latest()
373-
if 'HEALTHCHECK_URL' in config.values.keys():
374-
self._healthcheck(to_add, config.values)
375-
except Config.DoesNotExist:
376-
pass
377-
378-
def _healthcheck(self, containers, config):
379-
# if at first it fails, back off and try again at 10%, 50% and 100% of INITIAL_DELAY
380-
intervals = [1.0, 0.1, 0.5, 1.0]
381-
# HACK (bacongobbler): we need to wait until publisher has a chance to publish each
382-
# service to etcd, which can take up to 20 seconds.
383-
time.sleep(20)
384-
for i in xrange(len(intervals)):
385-
delay = int(config.get('HEALTHCHECK_INITIAL_DELAY', 0))
386-
try:
387-
# sleep until the initial timeout is over
388-
if delay > 0:
389-
time.sleep(delay * intervals[i])
390-
to_healthcheck = [c for c in containers if c.type in ['web', 'cmd']]
391-
self._do_healthcheck(to_healthcheck, config)
392-
break
393-
except exceptions.HealthcheckException as e:
394-
try:
395-
next_delay = delay * intervals[i+1]
396-
msg = "{}; trying again in {} seconds".format(e, next_delay)
397-
log_event(self, msg, logging.WARNING)
398-
except IndexError:
399-
log_event(self, e, logging.WARNING)
400-
else:
401-
self._destroy_containers(containers)
402-
msg = "aborting, app containers failed to respond to health check"
403-
log_event(self, msg, logging.ERROR)
404-
raise RuntimeError(msg)
405-
406-
def _do_healthcheck(self, containers, config):
407-
path = config.get('HEALTHCHECK_URL', '/')
408-
timeout = int(config.get('HEALTHCHECK_TIMEOUT', 1))
409-
if not _etcd_client:
410-
raise exceptions.HealthcheckException('no etcd client available')
411-
for container in containers:
412-
try:
413-
key = "/deis/services/{self}/{container.job_id}".format(**locals())
414-
url = "http://{}{}".format(_etcd_client.get(key).value, path)
415-
response = requests.get(url, timeout=timeout)
416-
if response.status_code != requests.codes.OK:
417-
raise exceptions.HealthcheckException(
418-
"app failed health check (got '{}', expected: '200')".format(
419-
response.status_code))
420-
except (requests.Timeout, requests.ConnectionError, KeyError) as e:
421-
raise exceptions.HealthcheckException(
422-
'failed to connect to container ({})'.format(e))
423375

424376
def _restart_containers(self, to_restart):
425377
"""Restarts containers via the scheduler"""
@@ -479,13 +431,18 @@ def _deploy_app(self, scale_types, release, existing):
479431
for scale_type in scale_types:
480432
image = release.image
481433
version = "v{}".format(release.version)
482-
kwargs = {'memory': release.config.memory,
483-
'cpu': release.config.cpu,
484-
'tags': release.config.tags,
485-
'envs': release.config.values,
486-
'aname': self.id,
487-
'num': 0,
488-
'version': version}
434+
kwargs = {
435+
'memory': release.config.memory,
436+
'cpu': release.config.cpu,
437+
'tags': release.config.tags,
438+
'envs': release.config.values,
439+
'aname': self.id,
440+
'num': 0,
441+
'version': version,
442+
'app_type': scale_type,
443+
'healthcheck': release.config.healthcheck()
444+
}
445+
489446
job_id = self._get_job_id(scale_type)
490447
command = self._get_command(scale_type)
491448
try:
@@ -793,6 +750,15 @@ class Meta:
793750
def __str__(self):
794751
return "{}-{}".format(self.app.id, str(self.uuid)[:7])
795752

753+
def healthcheck(self):
754+
# Update healthcheck - Scheduler determines the app type
755+
path = self.values.get('HEALTHCHECK_URL', '/')
756+
timeout = int(self.values.get('HEALTHCHECK_TIMEOUT', 1))
757+
delay = int(self.values.get('HEALTHCHECK_INITIAL_DELAY', 10))
758+
port = int(self.values.get('HEALTHCHECK_PORT', 8080))
759+
760+
return {'path': path, 'timeout': timeout, 'delay': delay, 'port': port}
761+
796762
def save(self, **kwargs):
797763
"""merge the old config with the new"""
798764
try:
@@ -804,16 +770,19 @@ def save(self, **kwargs):
804770
data = getattr(previous_config, attr).copy()
805771
except AttributeError:
806772
data = {}
773+
807774
try:
808775
new_data = getattr(self, attr).copy()
809776
except AttributeError:
810777
new_data = {}
778+
811779
data.update(new_data)
812780
# remove config keys if we provided a null value
813781
[data.pop(k) for k, v in new_data.viewitems() if v is None]
814782
setattr(self, attr, data)
815783
except Config.DoesNotExist:
816784
pass
785+
817786
return super(Config, self).save(**kwargs)
818787

819788

rootfs/api/tests/test_config.py

Lines changed: 0 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,13 @@
88
from __future__ import unicode_literals
99

1010
import json
11-
import logging
1211
import requests
1312

1413
from django.contrib.auth.models import User
1514
from django.test import TransactionTestCase
16-
import etcd
1715
import mock
1816
from rest_framework.authtoken.models import Token
1917

20-
import api.exceptions
2118
from api.models import App, Config
2219
from . import mock_status_ok
2320

@@ -33,19 +30,6 @@ def mock_request_connection_error(*args, **kwargs):
3330
raise requests.exceptions.ConnectionError("connection error")
3431

3532

36-
class MockEtcdClient:
37-
38-
def __init__(self, app):
39-
self.app = app
40-
41-
def get(self, key, *args, **kwargs):
42-
node = {
43-
'key': '/deis/services/{}/{}_v2.web.1'.format(self.app, self.app),
44-
'value': '127.0.0.1:1234'
45-
}
46-
return etcd.EtcdResult(None, node)
47-
48-
4933
@mock.patch('api.models.publish_release', lambda *args: None)
5034
class ConfigTest(TransactionTestCase):
5135

@@ -570,142 +554,3 @@ def test_unauthorized_user_cannot_modify_config(self):
570554
response = self.client.post(url, json.dumps(body), content_type='application/json',
571555
HTTP_AUTHORIZATION='token {}'.format(unauthorized_token))
572556
self.assertEqual(response.status_code, 403)
573-
574-
def _test_app_healthcheck(self):
575-
# post a new build, expecting it to pass as usual
576-
url = "/v2/apps/{self.app}/builds".format(**locals())
577-
body = {'image': 'autotest/example'}
578-
response = self.client.post(url, json.dumps(body), content_type='application/json',
579-
HTTP_AUTHORIZATION='token {}'.format(self.token))
580-
self.assertEqual(response.status_code, 201)
581-
# mock out the etcd client
582-
api.models._etcd_client = MockEtcdClient(self.app)
583-
# set an initial healthcheck url.
584-
url = "/v2/apps/{self.app}/config".format(**locals())
585-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
586-
return self.client.post(url, json.dumps(body), content_type='application/json',
587-
HTTP_AUTHORIZATION='token {}'.format(self.token))
588-
589-
@mock.patch('requests.get', mock_status_ok)
590-
@mock.patch('time.sleep', lambda func: func)
591-
def test_app_healthcheck_good(self):
592-
"""
593-
If a user deploys an app with a config value set for HEALTHCHECK_URL, the controller
594-
should check that the application responds with a 200 OK.
595-
"""
596-
response = self._test_app_healthcheck()
597-
self.assertEqual(response.status_code, 201)
598-
self.assertEqual(self.app.release_set.latest().version, 3)
599-
600-
@mock.patch('requests.get', mock_status_not_found)
601-
@mock.patch('api.models.get_etcd_client', lambda func: func)
602-
@mock.patch('time.sleep', lambda func: func)
603-
@mock.patch('api.models.logger')
604-
def test_app_healthcheck_bad(self, mock_logger):
605-
"""
606-
If a user deploys an app with a config value set for HEALTHCHECK_URL, the controller
607-
should check that the application responds with a 200 OK. If it's down, the app should be
608-
rolled back.
609-
"""
610-
response = self._test_app_healthcheck()
611-
self.assertEqual(response.status_code, 503)
612-
self.assertEqual(
613-
response.data,
614-
{'detail': 'aborting, app containers failed to respond to health check'})
615-
# check that only the build and initial release exist
616-
self.assertEqual(self.app.release_set.latest().version, 2)
617-
# assert that the reason why the containers failed was because
618-
# they failed the health check 4 times; we do this by looking
619-
# at logs-- there may be a better way
620-
exp_msg = "{}: app failed health check (got '404', expected: '200'); trying again in 0.0 \
621-
seconds".format(self.app.id)
622-
exp_log_call = mock.call(logging.WARNING, exp_msg)
623-
log_calls = mock_logger.log.mock_calls
624-
self.assertEqual(log_calls.count(exp_log_call), 3)
625-
exp_msg = "{}: app failed health check (got '404', expected: '200')".format(self.app.id)
626-
exp_log_call = mock.call(logging.WARNING, exp_msg)
627-
self.assertEqual(log_calls.count(exp_log_call), 1)
628-
629-
@mock.patch('requests.get', mock_status_not_found)
630-
@mock.patch('api.models.get_etcd_client', lambda func: func)
631-
@mock.patch('time.sleep')
632-
def test_app_backoff_interval(self, mock_time):
633-
"""
634-
Ensure that when a healthcheck fails, a backoff strategy is used before trying again.
635-
"""
636-
# post a new build, expecting it to pass as usual
637-
url = "/v2/apps/{self.app}/builds".format(**locals())
638-
body = {'image': 'autotest/example'}
639-
response = self.client.post(url, json.dumps(body), content_type='application/json',
640-
HTTP_AUTHORIZATION='token {}'.format(self.token))
641-
self.assertEqual(response.status_code, 201)
642-
# mock out the etcd client
643-
api.models._etcd_client = MockEtcdClient(self.app)
644-
# set an initial healthcheck url.
645-
url = "/v2/apps/{self.app}/config".format(**locals())
646-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
647-
return self.client.post(url, json.dumps(body), content_type='application/json',
648-
HTTP_AUTHORIZATION='token {}'.format(self.token))
649-
self.assertEqual(mock_time.call_count, 5)
650-
651-
@mock.patch('requests.get', mock_status_ok)
652-
@mock.patch('time.sleep')
653-
def test_app_healthcheck_initial_delay(self, mock_time):
654-
"""
655-
Ensure that when an initial delay is set, the request will sleep for x seconds, where
656-
x is the number of seconds in the initial timeout.
657-
"""
658-
# post a new build, expecting it to pass as usual
659-
url = "/v2/apps/{self.app}/builds".format(**locals())
660-
body = {'image': 'autotest/example'}
661-
response = self.client.post(url, json.dumps(body), content_type='application/json',
662-
HTTP_AUTHORIZATION='token {}'.format(self.token))
663-
self.assertEqual(response.status_code, 201)
664-
# mock out the etcd client
665-
api.models._etcd_client = MockEtcdClient(self.app)
666-
# set an initial healthcheck url.
667-
url = "/v2/apps/{self.app}/config".format(**locals())
668-
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
669-
return self.client.post(url, json.dumps(body), content_type='application/json',
670-
HTTP_AUTHORIZATION='token {}'.format(self.token))
671-
# mock_time increments by one each time its called, so we should expect 2 calls to
672-
# mock_time; one for the call in the code, and one for this invocation.
673-
mock_time.assert_called_with(0)
674-
app = App.objects.all()[0]
675-
url = "/v2/apps/{app}/config".format(**locals())
676-
body = {'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': 10})}
677-
self.client.post(url, json.dumps(body), content_type='application/json',
678-
HTTP_AUTHORIZATION='token {}'.format(self.token))
679-
mock_time.assert_called_with(10)
680-
681-
@mock.patch('requests.get')
682-
@mock.patch('time.sleep', lambda func: func)
683-
def test_app_healthcheck_timeout(self, mock_request):
684-
"""
685-
Ensure when a timeout value is set, the controller respects that value
686-
when making a request.
687-
"""
688-
self._test_app_healthcheck()
689-
app = App.objects.all()[0]
690-
url = "/v2/apps/{app}/config".format(**locals())
691-
body = {'values': json.dumps({'HEALTHCHECK_TIMEOUT': 10})}
692-
self.client.post(url, json.dumps(body), content_type='application/json',
693-
HTTP_AUTHORIZATION='token {}'.format(self.token))
694-
mock_request.assert_called_with('http://127.0.0.1:1234/', timeout=10)
695-
696-
@mock.patch('requests.get', mock_request_connection_error)
697-
@mock.patch('time.sleep', lambda func: func)
698-
def test_app_healthcheck_connection_error(self):
699-
"""
700-
If a user deploys an app with a config value set for HEALTHCHECK_URL but the app
701-
returns a connection error, the controller should continue checking until either the app
702-
responds or the app fails to respond within the timeout.
703-
704-
NOTE (bacongobbler): the Docker userland proxy listens for connections and returns a
705-
ConnectionError, hence the unit test.
706-
"""
707-
response = self._test_app_healthcheck()
708-
self.assertEqual(response.status_code, 503)
709-
self.assertEqual(
710-
response.data,
711-
{'detail': 'aborting, app containers failed to respond to health check'})

0 commit comments

Comments
 (0)