Skip to content

Commit a2d6010

Browse files
committed
fix(healthchecks): when migrating from old healthchecks also add in readiness probe
Also get the max() between liveness and readiness initial seconds delay to use in deploy timeout. It is smart enough to deal with if one or both are missing as well. Fixes #874
1 parent fbf05a1 commit a2d6010

4 files changed

Lines changed: 74 additions & 91 deletions

File tree

rootfs/api/models/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ def _migrate_legacy_healthcheck(self):
5656
}
5757
}
5858

59+
self.healthcheck['readinessProbe'] = {
60+
'initialDelaySeconds': delay,
61+
'timeoutSeconds': timeout,
62+
'periodSeconds': period_seconds,
63+
'successThreshold': success_threshold,
64+
'failureThreshold': failure_threshold,
65+
'httpGet': {
66+
'path': path,
67+
}
68+
}
69+
70+
# Unset all the old values
71+
self.values = {k: v for k, v in self.values.items() if not k.startswith('HEALTHCHECK_')}
72+
5973
def set_registry(self):
6074
# lower case all registry options for consistency
6175
self.registry = {key.lower(): value for key, value in self.registry.copy().items()}

rootfs/api/tests/deployments/test_config.py

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -773,39 +773,6 @@ def test_unauthorized_user_cannot_modify_config(self, mock_requests):
773773
response = self.client.post(url, body)
774774
self.assertEqual(response.status_code, 403)
775775

776-
def test_healthchecks(self, mock_requests):
777-
"""
778-
Test that healthchecks can be applied
779-
"""
780-
response = self.client.post('/v2/apps')
781-
self.assertEqual(response.status_code, 201, response.data)
782-
app_id = response.data['id']
783-
784-
# Set a healthcheck option before URL is around (URL is required for full setting)
785-
resp = self.client.post(
786-
'/v2/apps/{app_id}/config'.format(**locals()),
787-
{'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': '25'})}
788-
)
789-
self.assertEqual(resp.status_code, 201, response.data)
790-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', resp.data['values'])
791-
self.assertEqual(resp.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
792-
793-
# Set healthcheck URL to get defaults set
794-
resp = self.client.post(
795-
'/v2/apps/{app_id}/config'.format(**locals()),
796-
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
797-
)
798-
self.assertEqual(resp.status_code, 201, response.data)
799-
self.assertIn('HEALTHCHECK_URL', resp.data['values'])
800-
self.assertEqual(resp.data['values']['HEALTHCHECK_URL'], '/health')
801-
802-
# post a new build
803-
response = self.client.post(
804-
"/v2/apps/{app_id}/builds".format(**locals()),
805-
{'image': 'quay.io/autotest/example'}
806-
)
807-
self.assertEqual(response.status_code, 201, response.data)
808-
809776
def test_healthchecks_validations(self, mock_requests):
810777
"""
811778
Test that healthchecks validations work
@@ -929,8 +896,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
929896
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
930897
)
931898
self.assertEqual(response.status_code, 201, response.data)
932-
self.assertIn('HEALTHCHECK_URL', response.data['values'])
933-
self.assertEqual(response.data['values']['HEALTHCHECK_URL'], '/health')
899+
# this gets migrated to the new healtcheck format
900+
self.assertNotIn('HEALTHCHECK_URL', response.data['values'])
934901
# legacy defaults
935902
expected = {
936903
'livenessProbe': {
@@ -942,6 +909,16 @@ def test_config_healthchecks_legacy(self, mock_requests):
942909
'httpGet': {
943910
'path': '/health'
944911
}
912+
},
913+
'readinessProbe': {
914+
'initialDelaySeconds': 50,
915+
'timeoutSeconds': 50,
916+
'periodSeconds': 10,
917+
'successThreshold': 1,
918+
'failureThreshold': 3,
919+
'httpGet': {
920+
'path': '/health'
921+
}
945922
}
946923
}
947924
actual = app.config_set.latest().healthcheck
@@ -951,6 +928,7 @@ def test_config_healthchecks_legacy(self, mock_requests):
951928
'/v2/apps/{app.id}/config'.format(**locals()),
952929
{
953930
'values': json.dumps({
931+
'HEALTHCHECK_URL': '/health',
954932
'HEALTHCHECK_INITIAL_DELAY': '25',
955933
'HEALTHCHECK_TIMEOUT': '10',
956934
'HEALTHCHECK_PERIOD_SECONDS': '5',
@@ -959,8 +937,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
959937
}
960938
)
961939
self.assertEqual(response.status_code, 201, response.data)
962-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
963-
self.assertEqual(response.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
940+
# this gets migrated to the new healtcheck format
941+
self.assertNotIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
964942
expected['livenessProbe'] = {
965943
'initialDelaySeconds': 25,
966944
'timeoutSeconds': 10,
@@ -971,5 +949,6 @@ def test_config_healthchecks_legacy(self, mock_requests):
971949
'path': '/health'
972950
}
973951
}
952+
expected['readinessProbe'] = expected['livenessProbe']
974953
actual = app.config_set.latest().healthcheck
975954
self.assertEqual(expected, actual)

rootfs/api/tests/test_config.py

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -771,39 +771,6 @@ def test_unauthorized_user_cannot_modify_config(self, mock_requests):
771771
response = self.client.post(url, body)
772772
self.assertEqual(response.status_code, 403)
773773

774-
def test_healthchecks(self, mock_requests):
775-
"""
776-
Test that healthchecks can be applied
777-
"""
778-
response = self.client.post('/v2/apps')
779-
self.assertEqual(response.status_code, 201, response.data)
780-
app_id = response.data['id']
781-
782-
# Set a healthcheck option before URL is around (URL is required for full setting)
783-
resp = self.client.post(
784-
'/v2/apps/{app_id}/config'.format(**locals()),
785-
{'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': '25'})}
786-
)
787-
self.assertEqual(resp.status_code, 201, response.data)
788-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', resp.data['values'])
789-
self.assertEqual(resp.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
790-
791-
# Set healthcheck URL to get defaults set
792-
resp = self.client.post(
793-
'/v2/apps/{app_id}/config'.format(**locals()),
794-
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
795-
)
796-
self.assertEqual(resp.status_code, 201, response.data)
797-
self.assertIn('HEALTHCHECK_URL', resp.data['values'])
798-
self.assertEqual(resp.data['values']['HEALTHCHECK_URL'], '/health')
799-
800-
# post a new build
801-
response = self.client.post(
802-
"/v2/apps/{app_id}/builds".format(**locals()),
803-
{'image': 'quay.io/autotest/example'}
804-
)
805-
self.assertEqual(response.status_code, 201, response.data)
806-
807774
def test_healthchecks_validations(self, mock_requests):
808775
"""
809776
Test that healthchecks validations work
@@ -927,8 +894,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
927894
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
928895
)
929896
self.assertEqual(response.status_code, 201, response.data)
930-
self.assertIn('HEALTHCHECK_URL', response.data['values'])
931-
self.assertEqual(response.data['values']['HEALTHCHECK_URL'], '/health')
897+
# this gets migrated to the new healtcheck format
898+
self.assertNotIn('HEALTHCHECK_URL', response.data['values'])
932899
# legacy defaults
933900
expected = {
934901
'livenessProbe': {
@@ -940,6 +907,16 @@ def test_config_healthchecks_legacy(self, mock_requests):
940907
'httpGet': {
941908
'path': '/health'
942909
}
910+
},
911+
'readinessProbe': {
912+
'initialDelaySeconds': 50,
913+
'timeoutSeconds': 50,
914+
'periodSeconds': 10,
915+
'successThreshold': 1,
916+
'failureThreshold': 3,
917+
'httpGet': {
918+
'path': '/health'
919+
}
943920
}
944921
}
945922
actual = app.config_set.latest().healthcheck
@@ -949,6 +926,7 @@ def test_config_healthchecks_legacy(self, mock_requests):
949926
'/v2/apps/{app.id}/config'.format(**locals()),
950927
{
951928
'values': json.dumps({
929+
'HEALTHCHECK_URL': '/health',
952930
'HEALTHCHECK_INITIAL_DELAY': '25',
953931
'HEALTHCHECK_TIMEOUT': '10',
954932
'HEALTHCHECK_PERIOD_SECONDS': '5',
@@ -957,8 +935,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
957935
}
958936
)
959937
self.assertEqual(response.status_code, 201, response.data)
960-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
961-
self.assertEqual(response.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
938+
# this gets migrated to the new healtcheck format
939+
self.assertNotIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
962940
expected['livenessProbe'] = {
963941
'initialDelaySeconds': 25,
964942
'timeoutSeconds': 10,
@@ -969,5 +947,6 @@ def test_config_healthchecks_legacy(self, mock_requests):
969947
'path': '/health'
970948
}
971949
}
950+
expected['readinessProbe'] = expected['livenessProbe']
972951
actual = app.config_set.latest().healthcheck
973952
self.assertEqual(expected, actual)

rootfs/scheduler/__init__.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -875,26 +875,45 @@ def _wait_until_pods_terminate(self, namespace, labels, current, desired):
875875

876876
self.log(namespace, "{} pods are terminated".format(delta))
877877

878-
def _wait_until_pods_are_ready(self, namespace, containers, labels, desired): # noqa
879-
# If desired is 0 then there is no ready state to check on
880-
if desired == 0:
881-
return
878+
def _deploy_probe_timeout(self, timeout, namespace, labels, containers):
879+
"""
880+
Added in additional timeouts based on readiness and liveness probe
882881
883-
timeout = self.deploy_timeout
882+
Uses the max of the two instead of combining them as the checks are stacked.
883+
"""
884884

885885
container_name = '{}-{}'.format(labels['app'], labels['type'])
886886
container = self._find_container(container_name, containers)
887887

888888
# get health info from container
889+
added_timeout = []
889890
if 'readinessProbe' in container:
890891
# If there is initial delay on the readiness check then timeout needs to be higher
891892
# this is to account for kubernetes having readiness check report as failure until
892893
# the initial delay period is up
893-
delay = int(container['readinessProbe'].get('initialDelaySeconds', 50))
894-
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the readiness probe".format(delay, timeout)) # noqa
894+
added_timeout.append(int(container['readinessProbe'].get('initialDelaySeconds', 50)))
895+
896+
if 'livenessProbe' in container:
897+
# If there is initial delay on the readiness check then timeout needs to be higher
898+
# this is to account for kubernetes having liveness check report as failure until
899+
# the initial delay period is up
900+
added_timeout.append(int(container['livenessProbe'].get('initialDelaySeconds', 50)))
901+
902+
if added_timeout:
903+
delay = max(added_timeout)
904+
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the liveness / readiness probe".format(delay, timeout)) # noqa
895905
timeout += delay
896906

897-
self.log(namespace, "waiting for {} pods to be in services ({}s timeout)".format(desired, timeout)) # noqa
907+
return timeout
908+
909+
def _wait_until_pods_are_ready(self, namespace, containers, labels, desired): # noqa
910+
# If desired is 0 then there is no ready state to check on
911+
if desired == 0:
912+
return
913+
914+
timeout = self.deploy_timeout
915+
timeout = self._deploy_probe_timeout(timeout, namespace, labels, containers)
916+
self.log(namespace, "waiting for {} pods in {} namespace to be in services ({}s timeout)".format(desired, namespace, timeout)) # noqa
898917

899918
# Ensure the minimum desired number of pods are available
900919
waited = 0
@@ -1655,15 +1674,7 @@ def _wait_until_deployment_is_ready(self, namespace, name, **kwargs):
16551674
return
16561675

16571676
# get health info from container
1658-
container_name = '{}-{}'.format(labels['app'], labels['type'])
1659-
container = self._find_container(container_name, containers)
1660-
if 'readinessProbe' in container:
1661-
# If there is initial delay on the readiness check then timeout needs to be higher
1662-
# this is to account for kubernetes having readiness check report as failure until
1663-
# the initial delay period is up
1664-
delay = int(container['readinessProbe'].get('initialDelaySeconds', 50))
1665-
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the readiness probe".format(delay, deploy_timeout)) # noqa
1666-
deploy_timeout += delay
1677+
deploy_timeout = self._deploy_probe_timeout(deploy_timeout, namespace, labels, containers)
16671678

16681679
# a rough calculation that figures out an overall timeout
16691680
timeout = len(batches) * deploy_timeout

0 commit comments

Comments
 (0)