Skip to content

Commit 26a75f2

Browse files
authored
Merge pull request #884 from helgi/fix_probes
fix(healthchecks): when migrating from old healthchecks also add in readiness probe
2 parents fbf05a1 + a2d6010 commit 26a75f2

4 files changed

Lines changed: 74 additions & 91 deletions

File tree

rootfs/api/models/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ def _migrate_legacy_healthcheck(self):
5656
}
5757
}
5858

59+
self.healthcheck['readinessProbe'] = {
60+
'initialDelaySeconds': delay,
61+
'timeoutSeconds': timeout,
62+
'periodSeconds': period_seconds,
63+
'successThreshold': success_threshold,
64+
'failureThreshold': failure_threshold,
65+
'httpGet': {
66+
'path': path,
67+
}
68+
}
69+
70+
# Unset all the old values
71+
self.values = {k: v for k, v in self.values.items() if not k.startswith('HEALTHCHECK_')}
72+
5973
def set_registry(self):
6074
# lower case all registry options for consistency
6175
self.registry = {key.lower(): value for key, value in self.registry.copy().items()}

rootfs/api/tests/deployments/test_config.py

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -773,39 +773,6 @@ def test_unauthorized_user_cannot_modify_config(self, mock_requests):
773773
response = self.client.post(url, body)
774774
self.assertEqual(response.status_code, 403)
775775

776-
def test_healthchecks(self, mock_requests):
777-
"""
778-
Test that healthchecks can be applied
779-
"""
780-
response = self.client.post('/v2/apps')
781-
self.assertEqual(response.status_code, 201, response.data)
782-
app_id = response.data['id']
783-
784-
# Set a healthcheck option before URL is around (URL is required for full setting)
785-
resp = self.client.post(
786-
'/v2/apps/{app_id}/config'.format(**locals()),
787-
{'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': '25'})}
788-
)
789-
self.assertEqual(resp.status_code, 201, response.data)
790-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', resp.data['values'])
791-
self.assertEqual(resp.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
792-
793-
# Set healthcheck URL to get defaults set
794-
resp = self.client.post(
795-
'/v2/apps/{app_id}/config'.format(**locals()),
796-
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
797-
)
798-
self.assertEqual(resp.status_code, 201, response.data)
799-
self.assertIn('HEALTHCHECK_URL', resp.data['values'])
800-
self.assertEqual(resp.data['values']['HEALTHCHECK_URL'], '/health')
801-
802-
# post a new build
803-
response = self.client.post(
804-
"/v2/apps/{app_id}/builds".format(**locals()),
805-
{'image': 'quay.io/autotest/example'}
806-
)
807-
self.assertEqual(response.status_code, 201, response.data)
808-
809776
def test_healthchecks_validations(self, mock_requests):
810777
"""
811778
Test that healthchecks validations work
@@ -929,8 +896,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
929896
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
930897
)
931898
self.assertEqual(response.status_code, 201, response.data)
932-
self.assertIn('HEALTHCHECK_URL', response.data['values'])
933-
self.assertEqual(response.data['values']['HEALTHCHECK_URL'], '/health')
899+
# this gets migrated to the new healtcheck format
900+
self.assertNotIn('HEALTHCHECK_URL', response.data['values'])
934901
# legacy defaults
935902
expected = {
936903
'livenessProbe': {
@@ -942,6 +909,16 @@ def test_config_healthchecks_legacy(self, mock_requests):
942909
'httpGet': {
943910
'path': '/health'
944911
}
912+
},
913+
'readinessProbe': {
914+
'initialDelaySeconds': 50,
915+
'timeoutSeconds': 50,
916+
'periodSeconds': 10,
917+
'successThreshold': 1,
918+
'failureThreshold': 3,
919+
'httpGet': {
920+
'path': '/health'
921+
}
945922
}
946923
}
947924
actual = app.config_set.latest().healthcheck
@@ -951,6 +928,7 @@ def test_config_healthchecks_legacy(self, mock_requests):
951928
'/v2/apps/{app.id}/config'.format(**locals()),
952929
{
953930
'values': json.dumps({
931+
'HEALTHCHECK_URL': '/health',
954932
'HEALTHCHECK_INITIAL_DELAY': '25',
955933
'HEALTHCHECK_TIMEOUT': '10',
956934
'HEALTHCHECK_PERIOD_SECONDS': '5',
@@ -959,8 +937,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
959937
}
960938
)
961939
self.assertEqual(response.status_code, 201, response.data)
962-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
963-
self.assertEqual(response.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
940+
# this gets migrated to the new healtcheck format
941+
self.assertNotIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
964942
expected['livenessProbe'] = {
965943
'initialDelaySeconds': 25,
966944
'timeoutSeconds': 10,
@@ -971,5 +949,6 @@ def test_config_healthchecks_legacy(self, mock_requests):
971949
'path': '/health'
972950
}
973951
}
952+
expected['readinessProbe'] = expected['livenessProbe']
974953
actual = app.config_set.latest().healthcheck
975954
self.assertEqual(expected, actual)

rootfs/api/tests/test_config.py

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -771,39 +771,6 @@ def test_unauthorized_user_cannot_modify_config(self, mock_requests):
771771
response = self.client.post(url, body)
772772
self.assertEqual(response.status_code, 403)
773773

774-
def test_healthchecks(self, mock_requests):
775-
"""
776-
Test that healthchecks can be applied
777-
"""
778-
response = self.client.post('/v2/apps')
779-
self.assertEqual(response.status_code, 201, response.data)
780-
app_id = response.data['id']
781-
782-
# Set a healthcheck option before URL is around (URL is required for full setting)
783-
resp = self.client.post(
784-
'/v2/apps/{app_id}/config'.format(**locals()),
785-
{'values': json.dumps({'HEALTHCHECK_INITIAL_DELAY': '25'})}
786-
)
787-
self.assertEqual(resp.status_code, 201, response.data)
788-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', resp.data['values'])
789-
self.assertEqual(resp.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
790-
791-
# Set healthcheck URL to get defaults set
792-
resp = self.client.post(
793-
'/v2/apps/{app_id}/config'.format(**locals()),
794-
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
795-
)
796-
self.assertEqual(resp.status_code, 201, response.data)
797-
self.assertIn('HEALTHCHECK_URL', resp.data['values'])
798-
self.assertEqual(resp.data['values']['HEALTHCHECK_URL'], '/health')
799-
800-
# post a new build
801-
response = self.client.post(
802-
"/v2/apps/{app_id}/builds".format(**locals()),
803-
{'image': 'quay.io/autotest/example'}
804-
)
805-
self.assertEqual(response.status_code, 201, response.data)
806-
807774
def test_healthchecks_validations(self, mock_requests):
808775
"""
809776
Test that healthchecks validations work
@@ -927,8 +894,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
927894
{'values': json.dumps({'HEALTHCHECK_URL': '/health'})}
928895
)
929896
self.assertEqual(response.status_code, 201, response.data)
930-
self.assertIn('HEALTHCHECK_URL', response.data['values'])
931-
self.assertEqual(response.data['values']['HEALTHCHECK_URL'], '/health')
897+
# this gets migrated to the new healtcheck format
898+
self.assertNotIn('HEALTHCHECK_URL', response.data['values'])
932899
# legacy defaults
933900
expected = {
934901
'livenessProbe': {
@@ -940,6 +907,16 @@ def test_config_healthchecks_legacy(self, mock_requests):
940907
'httpGet': {
941908
'path': '/health'
942909
}
910+
},
911+
'readinessProbe': {
912+
'initialDelaySeconds': 50,
913+
'timeoutSeconds': 50,
914+
'periodSeconds': 10,
915+
'successThreshold': 1,
916+
'failureThreshold': 3,
917+
'httpGet': {
918+
'path': '/health'
919+
}
943920
}
944921
}
945922
actual = app.config_set.latest().healthcheck
@@ -949,6 +926,7 @@ def test_config_healthchecks_legacy(self, mock_requests):
949926
'/v2/apps/{app.id}/config'.format(**locals()),
950927
{
951928
'values': json.dumps({
929+
'HEALTHCHECK_URL': '/health',
952930
'HEALTHCHECK_INITIAL_DELAY': '25',
953931
'HEALTHCHECK_TIMEOUT': '10',
954932
'HEALTHCHECK_PERIOD_SECONDS': '5',
@@ -957,8 +935,8 @@ def test_config_healthchecks_legacy(self, mock_requests):
957935
}
958936
)
959937
self.assertEqual(response.status_code, 201, response.data)
960-
self.assertIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
961-
self.assertEqual(response.data['values']['HEALTHCHECK_INITIAL_DELAY'], '25')
938+
# this gets migrated to the new healtcheck format
939+
self.assertNotIn('HEALTHCHECK_INITIAL_DELAY', response.data['values'])
962940
expected['livenessProbe'] = {
963941
'initialDelaySeconds': 25,
964942
'timeoutSeconds': 10,
@@ -969,5 +947,6 @@ def test_config_healthchecks_legacy(self, mock_requests):
969947
'path': '/health'
970948
}
971949
}
950+
expected['readinessProbe'] = expected['livenessProbe']
972951
actual = app.config_set.latest().healthcheck
973952
self.assertEqual(expected, actual)

rootfs/scheduler/__init__.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -875,26 +875,45 @@ def _wait_until_pods_terminate(self, namespace, labels, current, desired):
875875

876876
self.log(namespace, "{} pods are terminated".format(delta))
877877

878-
def _wait_until_pods_are_ready(self, namespace, containers, labels, desired): # noqa
879-
# If desired is 0 then there is no ready state to check on
880-
if desired == 0:
881-
return
878+
def _deploy_probe_timeout(self, timeout, namespace, labels, containers):
879+
"""
880+
Added in additional timeouts based on readiness and liveness probe
882881
883-
timeout = self.deploy_timeout
882+
Uses the max of the two instead of combining them as the checks are stacked.
883+
"""
884884

885885
container_name = '{}-{}'.format(labels['app'], labels['type'])
886886
container = self._find_container(container_name, containers)
887887

888888
# get health info from container
889+
added_timeout = []
889890
if 'readinessProbe' in container:
890891
# If there is initial delay on the readiness check then timeout needs to be higher
891892
# this is to account for kubernetes having readiness check report as failure until
892893
# the initial delay period is up
893-
delay = int(container['readinessProbe'].get('initialDelaySeconds', 50))
894-
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the readiness probe".format(delay, timeout)) # noqa
894+
added_timeout.append(int(container['readinessProbe'].get('initialDelaySeconds', 50)))
895+
896+
if 'livenessProbe' in container:
897+
# If there is initial delay on the readiness check then timeout needs to be higher
898+
# this is to account for kubernetes having liveness check report as failure until
899+
# the initial delay period is up
900+
added_timeout.append(int(container['livenessProbe'].get('initialDelaySeconds', 50)))
901+
902+
if added_timeout:
903+
delay = max(added_timeout)
904+
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the liveness / readiness probe".format(delay, timeout)) # noqa
895905
timeout += delay
896906

897-
self.log(namespace, "waiting for {} pods to be in services ({}s timeout)".format(desired, timeout)) # noqa
907+
return timeout
908+
909+
def _wait_until_pods_are_ready(self, namespace, containers, labels, desired): # noqa
910+
# If desired is 0 then there is no ready state to check on
911+
if desired == 0:
912+
return
913+
914+
timeout = self.deploy_timeout
915+
timeout = self._deploy_probe_timeout(timeout, namespace, labels, containers)
916+
self.log(namespace, "waiting for {} pods in {} namespace to be in services ({}s timeout)".format(desired, namespace, timeout)) # noqa
898917

899918
# Ensure the minimum desired number of pods are available
900919
waited = 0
@@ -1655,15 +1674,7 @@ def _wait_until_deployment_is_ready(self, namespace, name, **kwargs):
16551674
return
16561675

16571676
# get health info from container
1658-
container_name = '{}-{}'.format(labels['app'], labels['type'])
1659-
container = self._find_container(container_name, containers)
1660-
if 'readinessProbe' in container:
1661-
# If there is initial delay on the readiness check then timeout needs to be higher
1662-
# this is to account for kubernetes having readiness check report as failure until
1663-
# the initial delay period is up
1664-
delay = int(container['readinessProbe'].get('initialDelaySeconds', 50))
1665-
self.log(namespace, "adding {}s on to the original {}s timeout to account for the initial delay specified in the readiness probe".format(delay, deploy_timeout)) # noqa
1666-
deploy_timeout += delay
1677+
deploy_timeout = self._deploy_probe_timeout(deploy_timeout, namespace, labels, containers)
16671678

16681679
# a rough calculation that figures out an overall timeout
16691680
timeout = len(batches) * deploy_timeout

0 commit comments

Comments
 (0)