Skip to content

Commit 67445dc

Browse files
author
Matthew
committed
feat(controller): add health check feature to app config
This feature extends the publisher and the controller to check if the user specified a health check for the application. If they did, publisher will only publish the app if the application passes the health check, and the controller will roll back if a deployment fails to pass this health check. By default, the expected status code is 200 unless the user specifies otherwise. This variable is non-configurable on the platform side as both the controller and the publisher have to rely on this value, and I could not find a happy medium for setting up a shared config variable.
1 parent e4b1805 commit 67445dc

8 files changed

Lines changed: 238 additions & 19 deletions

File tree

controller/api/models.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,45 @@ def _start_containers(self, to_add):
299299
if set([c.state for c in to_add]) != set(['up']):
300300
err = 'warning, some containers failed to start'
301301
log_event(self, err, logging.WARNING)
302+
# if the user specified a health check, try checking to see if it's running
303+
try:
304+
config = self.config_set.latest()
305+
if 'HEALTHCHECK_URL' in config.values.keys():
306+
# check the app to see if it's healthy
307+
try:
308+
self._do_healthcheck(config)
309+
except Exception as e:
310+
log_event(self, str(e), logging.ERROR)
311+
self._destroy_containers(to_add)
312+
raise
313+
except Config.DoesNotExist:
314+
pass
315+
316+
def _do_healthcheck(self, config):
317+
timeout = time.time() + 60 # 1 minute from now
318+
while True:
319+
if time.time() > timeout:
320+
raise RuntimeError(
321+
'app failed to respond to health check within 60 seconds of launch')
322+
try:
323+
response = requests.get(
324+
'http://{}{}'.format(
325+
self.url,
326+
config.values['HEALTHCHECK_URL']),
327+
timeout=5)
328+
expected_status_code = config.values['HEALTHCHECK_STATUS_CODE'] if \
329+
'HEALTHCHECK_STATUS_CODE' in config.values.keys() else \
330+
settings.DEIS_HEALTHCHECK_STATUS_CODE
331+
if str(response.status_code) != str(expected_status_code):
332+
err = "aborting, app failed health check (got '{}', expected: '{}')".format(
333+
response.status_code,
334+
expected_status_code)
335+
raise RuntimeError(err)
336+
break
337+
except requests.exceptions.ConnectionError:
338+
continue
339+
except requests.exceptions.Timeout:
340+
continue
302341

303342
def _restart_containers(self, to_restart):
304343
"""Restarts containers via the scheduler"""

controller/api/tests/test_config.py

Lines changed: 103 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,38 @@
1515
from django.test import TransactionTestCase
1616
from rest_framework.authtoken.models import Token
1717

18-
from api.models import Config
18+
from api.models import App, Config
1919

2020

21-
def mock_import_repository_task(*args, **kwargs):
21+
def mock_status_ok(*args, **kwargs):
2222
resp = requests.Response()
2323
resp.status_code = 200
2424
resp._content_consumed = True
2525
return resp
2626

2727

28+
def mock_status_not_found(*args, **kwargs):
29+
resp = requests.Response()
30+
resp.status_code = 404
31+
resp._content_consumed = True
32+
return resp
33+
34+
35+
def mock_request_timed_out(*args, **kwargs):
36+
raise requests.exceptions.Timeout()
37+
38+
39+
def mock_request_connection_error(*args, **kwargs):
40+
raise requests.exceptions.ConnectionError()
41+
42+
43+
def mock_time(*args, **kwargs):
44+
if not hasattr(mock_time, "counter"):
45+
mock_time.counter = 0 # it doesn't exist yet, so initialize it
46+
mock_time.counter += 1
47+
return mock_time.counter
48+
49+
2850
class ConfigTest(TransactionTestCase):
2951

3052
"""Tests setting and updating config values"""
@@ -35,7 +57,7 @@ def setUp(self):
3557
self.user = User.objects.get(username='autotest')
3658
self.token = Token.objects.get(user=self.user).key
3759

38-
@mock.patch('requests.post', mock_import_repository_task)
60+
@mock.patch('requests.post', mock_status_ok)
3961
def test_config(self):
4062
"""
4163
Test that config is auto-created for a new app and that
@@ -107,7 +129,7 @@ def test_config(self):
107129
self.assertEqual(response.status_code, 405)
108130
return config5
109131

110-
@mock.patch('requests.post', mock_import_repository_task)
132+
@mock.patch('requests.post', mock_status_ok)
111133
def test_response_data(self):
112134
"""Test that the serialized response contains only relevant data."""
113135
body = {'id': 'test'}
@@ -132,7 +154,7 @@ def test_response_data(self):
132154
}
133155
self.assertDictContainsSubset(expected, response.data)
134156

135-
@mock.patch('requests.post', mock_import_repository_task)
157+
@mock.patch('requests.post', mock_status_ok)
136158
def test_config_set_same_key(self):
137159
"""
138160
Test that config sets on the same key function properly
@@ -156,7 +178,7 @@ def test_config_set_same_key(self):
156178
self.assertIn('PORT', response.data['values'])
157179
self.assertEqual(response.data['values']['PORT'], '5001')
158180

159-
@mock.patch('requests.post', mock_import_repository_task)
181+
@mock.patch('requests.post', mock_status_ok)
160182
def test_config_set_unicode(self):
161183
"""
162184
Test that config sets with unicode values are accepted.
@@ -187,14 +209,14 @@ def test_config_set_unicode(self):
187209
self.assertIn('INTEGER', response.data['values'])
188210
self.assertEqual(response.data['values']['INTEGER'], 1)
189211

190-
@mock.patch('requests.post', mock_import_repository_task)
212+
@mock.patch('requests.post', mock_status_ok)
191213
def test_config_str(self):
192214
"""Test the text representation of a node."""
193215
config5 = self.test_config()
194216
config = Config.objects.get(uuid=config5['uuid'])
195217
self.assertEqual(str(config), "{}-{}".format(config5['app'], config5['uuid'][:7]))
196218

197-
@mock.patch('requests.post', mock_import_repository_task)
219+
@mock.patch('requests.post', mock_status_ok)
198220
def test_admin_can_create_config_on_other_apps(self):
199221
"""If a non-admin creates an app, an administrator should be able to set config
200222
values for that app.
@@ -214,7 +236,7 @@ def test_admin_can_create_config_on_other_apps(self):
214236
self.assertIn('PORT', response.data['values'])
215237
return response
216238

217-
@mock.patch('requests.post', mock_import_repository_task)
239+
@mock.patch('requests.post', mock_status_ok)
218240
def test_limit_memory(self):
219241
"""
220242
Test that limit is auto-created for a new app and that
@@ -302,7 +324,7 @@ def test_limit_memory(self):
302324
self.assertEqual(response.status_code, 405)
303325
return limit4
304326

305-
@mock.patch('requests.post', mock_import_repository_task)
327+
@mock.patch('requests.post', mock_status_ok)
306328
def test_limit_cpu(self):
307329
"""
308330
Test that CPU limits can be set
@@ -373,7 +395,7 @@ def test_limit_cpu(self):
373395
self.assertEqual(response.status_code, 405)
374396
return limit4
375397

376-
@mock.patch('requests.post', mock_import_repository_task)
398+
@mock.patch('requests.post', mock_status_ok)
377399
def test_tags(self):
378400
"""
379401
Test that tags can be set on an application
@@ -477,3 +499,73 @@ def test_unauthorized_user_cannot_modify_config(self):
477499
response = self.client.post(url, json.dumps(body), content_type='application/json',
478500
HTTP_AUTHORIZATION='token {}'.format(unauthorized_token))
479501
self.assertEqual(response.status_code, 403)
502+
503+
def _test_app_healthcheck(self):
504+
url = '/v1/apps'
505+
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
506+
self.assertEqual(response.status_code, 201)
507+
app_id = response.data['id']
508+
# post a new build, expecting it to pass as usual
509+
url = "/v1/apps/{app_id}/builds".format(**locals())
510+
body = {'image': 'autotest/example'}
511+
response = self.client.post(url, json.dumps(body), content_type='application/json',
512+
HTTP_AUTHORIZATION='token {}'.format(self.token))
513+
self.assertEqual(response.status_code, 201)
514+
# set an initial healthcheck url.
515+
url = "/v1/apps/{app_id}/config".format(**locals())
516+
body = {'values': json.dumps({'HEALTHCHECK_URL': '/'})}
517+
return self.client.post(url, json.dumps(body), content_type='application/json',
518+
HTTP_AUTHORIZATION='token {}'.format(self.token))
519+
520+
@mock.patch('requests.get', mock_status_not_found)
521+
def test_app_healthcheck_good(self):
522+
"""
523+
If a user deploys an app with a config value set for HEALTHCHECK_URL, the controller
524+
should check that the application is up. If it's down, the app should be rolled back.
525+
"""
526+
response = self._test_app_healthcheck()
527+
self.assertEqual(response.status_code, 503)
528+
self.assertEqual(
529+
response.data,
530+
{
531+
'detail':
532+
"aborting, app failed health check (got '404', expected: '200')"
533+
})
534+
# add in the expected app healthcheck status, which will result in a successful deployment
535+
app_id = App.objects.all()[0]
536+
url = "/v1/apps/{app_id}/config".format(**locals())
537+
body = {'values': json.dumps({'HEALTHCHECK_STATUS_CODE': '404'})}
538+
response = self.client.post(url, json.dumps(body), content_type='application/json',
539+
HTTP_AUTHORIZATION='token {}'.format(self.token))
540+
self.assertEqual(response.status_code, 201)
541+
542+
@mock.patch('requests.get', mock_request_timed_out)
543+
@mock.patch('time.time', mock_time)
544+
def test_app_healthcheck_timeout(self):
545+
"""
546+
If a user deploys an app with a config value set for HEALTHCHECK_URL but the app
547+
times out, the controller should continue checking until either the app
548+
responds or the app fails to respond within the timeout.
549+
"""
550+
response = self._test_app_healthcheck()
551+
self.assertEqual(response.status_code, 503)
552+
self.assertEqual(
553+
response.data,
554+
{'detail': 'app failed to respond to health check within 60 seconds of launch'})
555+
556+
@mock.patch('requests.get', mock_request_connection_error)
557+
@mock.patch('time.time', mock_time)
558+
def test_app_healthcheck_connection_error(self):
559+
"""
560+
If a user deploys an app with a config value set for HEALTHCHECK_URL but the app
561+
returns a connection error, the controller should continue checking until either the app
562+
responds or the app fails to respond within the timeout.
563+
564+
NOTE (bacongobbler): the Docker userland proxy listens for connections and returns a
565+
ConnectionError, hence the unit test.
566+
"""
567+
response = self._test_app_healthcheck()
568+
self.assertEqual(response.status_code, 503)
569+
self.assertEqual(
570+
response.data,
571+
{'detail': 'app failed to respond to health check within 60 seconds of launch'})

controller/deis/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@
290290
# names which apps cannot reserve for routing
291291
DEIS_RESERVED_NAMES = ['deis']
292292

293+
# the default expected status code for healthchecks
294+
DEIS_HEALTHCHECK_STATUS_CODE = '200'
295+
293296
# default scheduler settings
294297
SCHEDULER_MODULE = 'scheduler.mock'
295298
SCHEDULER_TARGET = '' # path to scheduler endpoint (e.g. /var/run/fleet.sock)

docs/managing_deis/backing_up_data.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ use in the ``export`` command should correspond to the IP of the host machine wh
212212
[a.save() for a in App.objects.all()]
213213
[d.save() for d in Domain.objects.all()]
214214
[c.save() for c in Certificate.objects.all()]
215+
[c.save() for c in Config.objects.all()]
215216
EOF
216217
$ exit
217218

docs/using_deis/config-application.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,27 @@ appname to the old one:
9191
an application, however you can work around this by pointing the @ record to the
9292
address of the load balancer (if any).
9393

94+
Custom Health Checks
95+
--------------------
96+
97+
By default, Deis checks if the container is running for health checks. You can change both
98+
the healthcheck URL and the expected status code by setting custom config:
99+
100+
.. code-block:: console
101+
102+
$ deis config:set HEALTHCHECK_URL=/404.html
103+
=== peachy-waxworks
104+
HEALTHCHECK_URL: /404.html
105+
$ deis config:set HEALTHCHECK_STATUS_CODE=404
106+
=== peachy-waxworks
107+
HEALTHCHECK_STATUS_CODE: 404
108+
HEALTHCHECK_URL: /404.html
109+
110+
If a new release does not pass the healthcheck, the application will be rolled back to the previous
111+
release. Beyond that, if an application container responds to a heartbeat check with a different
112+
status than is expected, the :ref:`router` will mark that container as down and stop sending
113+
requests to that container.
114+
94115
Track Changes
95116
-------------
96117
Each time a build or config change is made to your application, a new :ref:`release` is created.

publisher/server/publisher.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"fmt"
55
"log"
66
"net"
7+
"net/http"
78
"regexp"
89
"strconv"
910
"sync"
@@ -115,6 +116,19 @@ func (s *Server) publishContainer(container *docker.APIContainers, ttl time.Dura
115116
port := strconv.Itoa(int(p.PublicPort))
116117
hostAndPort := s.host + ":" + port
117118
if s.IsPublishableApp(containerName) && s.IsPortOpen(hostAndPort) {
119+
configKey := fmt.Sprintf("/deis/services/%s/config/", appName)
120+
// check if the user specified an upcheck URL
121+
healthcheckURL := s.getEtcd(configKey + "healthcheck_url")
122+
var healthcheckStatusCode int
123+
healthcheckStatusCode, err := strconv.Atoi(s.getEtcd(configKey + "healthcheck_status_code"))
124+
if err != nil {
125+
healthcheckStatusCode = 200
126+
}
127+
if healthcheckURL != "" {
128+
if !s.HealthCheckOK("http://"+hostAndPort+healthcheckURL, healthcheckStatusCode) {
129+
continue
130+
}
131+
}
118132
s.setEtcd(keyPath, hostAndPort, uint64(ttl.Seconds()))
119133
safeMap.Lock()
120134
safeMap.data[container.ID] = appPath
@@ -169,6 +183,19 @@ func (s *Server) IsPortOpen(hostAndPort string) bool {
169183
return portOpen
170184
}
171185

186+
func (s *Server) HealthCheckOK(url string, expectedStatusCode int) bool {
187+
resp, err := http.Get(url)
188+
if err != nil {
189+
log.Printf("healthcheck failed for %s (expected %d, got %v)\n", url, expectedStatusCode, err)
190+
return false
191+
}
192+
if resp.StatusCode == expectedStatusCode {
193+
return true
194+
}
195+
log.Printf("healthcheck failed for %s (expected %d, got %d)\n", url, expectedStatusCode, resp.StatusCode)
196+
return false
197+
}
198+
172199
// latestRunningVersion retrieves the highest version of the application published
173200
// to etcd. If no app has been published, returns 0.
174201
func latestRunningVersion(client *etcd.Client, appName string) int {
@@ -213,6 +240,21 @@ func max(n []int) int {
213240
return val
214241
}
215242

243+
// getEtcd retrieves the etcd key's value. Returns an empty string if the key was not found.
244+
func (s *Server) getEtcd(key string) string {
245+
if s.logLevel == "debug" {
246+
log.Println("get", key)
247+
}
248+
resp, err := s.EtcdClient.Get(key, false, false)
249+
if err != nil {
250+
return ""
251+
}
252+
if resp != nil && resp.Node != nil {
253+
return resp.Node.Value
254+
}
255+
return ""
256+
}
257+
216258
// setEtcd sets the corresponding etcd key with the value and ttl
217259
func (s *Server) setEtcd(key, value string, ttl uint64) {
218260
if _, err := s.EtcdClient.Set(key, value, ttl); err != nil {

publisher/server/publisher_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
package server
22

33
import (
4+
"fmt"
45
"net"
6+
"net/http"
7+
"net/http/httptest"
58
"testing"
69
)
710

@@ -45,3 +48,18 @@ func TestIsPortOpen(t *testing.T) {
4548
t.Errorf("Port should be closed")
4649
}
4750
}
51+
52+
func TestHealthCheckOK(t *testing.T) {
53+
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
54+
fmt.Fprintln(w, "Hello, client")
55+
}))
56+
defer ts.Close()
57+
58+
s := &Server{}
59+
if !s.HealthCheckOK(ts.URL, 200) {
60+
t.Errorf("Health check should be OK")
61+
}
62+
if s.HealthCheckOK("127.0.0.1:-1", 200) {
63+
t.Errorf("Health check should be NOT OK")
64+
}
65+
}

0 commit comments

Comments
 (0)