Skip to content

Commit f15b194

Browse files
committed
Merge pull request #675 from helgi/fix_checks
fix(scheduler): check if the pods are ready in scale up operations before comparing to desired state
2 parents: 73c3304 + bb84471 · commit f15b194

1 file changed

Lines changed: 33 additions & 23 deletions

File tree

rootfs/scheduler/__init__.py

Lines changed: 33 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -855,13 +855,7 @@ def _get_pod_ready_status(self, namespace, labels, desired):
855855
pods = self._get_pods(namespace, labels=labels).json()
856856
for pod in pods['items']:
857857
# now that state is running time to see if probes are passing
858-
if (
859-
pod['status']['phase'] == 'Running' and
860-
# is the readiness probe passing?
861-
self._pod_readiness_status(pod) == 'Running' and
862-
# is the pod ready to serve requests?
863-
self._pod_liveness_status(pod)
864-
):
858+
if self._pod_ready(pod):
865859
count += 1
866860

867861
if count == desired:
@@ -883,32 +877,38 @@ def _scale_rc(self, namespace, name, desired):
883877
'type': rc['spec']['selector']['type'],
884878
'version': rc['spec']['selector']['version']
885879
}
886-
current = len(self._get_pods(namespace, labels=labels).json()['items'])
880+
881+
# Are there any pods running (and verified as ready) available?
882+
pods = self._get_pods(namespace, labels=labels).json()['items']
883+
current = 0
884+
for pod in pods:
885+
if self._pod_ready(pod):
886+
current += 1
887887

888888
if desired == current:
889889
logger.debug("Not scaling RC {} in Namespace {} to {} replicas. Already at desired replicas".format(name, namespace, desired)) # noqa
890890
return
891+
elif desired != rc['spec']['replicas']: # RC needs new replica count
892+
# Set the new desired replica count
893+
rc['spec']['replicas'] = desired
891894

892-
# Set the new desired replica count
893-
rc['spec']['replicas'] = desired
894-
895-
logger.debug("scaling RC {} in Namespace {} from {} to {} replicas".format(name, namespace, current, desired)) # noqa
895+
logger.debug("scaling RC {} in Namespace {} from {} to {} replicas".format(name, namespace, current, desired)) # noqa
896896

897-
self._update_rc(namespace, name, rc)
897+
self._update_rc(namespace, name, rc)
898898

899-
resource_ver = rc['metadata']['resourceVersion']
900-
logger.debug("waiting for RC {} to get a newer resource version than {} (30s timeout)".format(name, resource_ver)) # noqa
901-
for waited in range(30):
902-
js_template = self._get_rc(namespace, name).json()
903-
if js_template["metadata"]["resourceVersion"] != resource_ver:
904-
break
899+
resource_ver = rc['metadata']['resourceVersion']
900+
logger.debug("waiting for RC {} to get a newer resource version than {} (30s timeout)".format(name, resource_ver)) # noqa
901+
for waited in range(30):
902+
js_template = self._get_rc(namespace, name).json()
903+
if js_template["metadata"]["resourceVersion"] != resource_ver:
904+
break
905905

906-
if waited > 0 and (waited % 10) == 0:
907-
logger.debug("waited {}s so far for a new resource version".format(waited))
906+
if waited > 0 and (waited % 10) == 0:
907+
logger.debug("waited {}s so far for a new resource version".format(waited))
908908

909-
time.sleep(1)
909+
time.sleep(1)
910910

911-
logger.debug("RC {} has a new resource version {}".format(name, js_template["metadata"]["resourceVersion"])) # noqa
911+
logger.debug("RC {} has a new resource version {}".format(name, js_template["metadata"]["resourceVersion"])) # noqa
912912

913913
# Double check enough pods are in the required state to service the application
914914
self._get_pod_ready_status(namespace, labels, desired)
@@ -1371,6 +1371,16 @@ def _pod_liveness_status(self, pod):
13711371

13721372
return True
13731373

1374+
def _pod_ready(self, pod):
1375+
"""Combines various checks to see if the pod is considered up or not by checking probes"""
1376+
return (
1377+
pod['status']['phase'] == 'Running' and
1378+
# is the readiness probe passing?
1379+
self._pod_readiness_status(pod) == 'Running' and
1380+
# is the pod ready to serve requests?
1381+
self._pod_liveness_status(pod)
1382+
)
1383+
13741384
# NODES #
13751385

13761386
def _get_nodes(self, **kwargs):

0 commit comments

Comments
 (0)