Skip to content

Commit a378b09

Browse files
committed
Merge pull request #432 from helgi/states
fix(scale): add more state information to make scale smarter
2 parents b8d36bc + 3e8451d commit a378b09

2 files changed

Lines changed: 81 additions & 22 deletions

File tree

rootfs/scheduler/__init__.py

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -517,16 +517,31 @@ def state(self, name):
517517
return JobState.error
518518

519519
def resolve_state(self, pod):
    """Translate a Kubernetes pod document into a JobState member.

    A missing pod (``None``) is reported as ``JobState.destroyed``. Any
    other pod is looked up by ``pod['status']['phase']``; a phase that is
    not in the table raises ``KeyError``.

    A pod in the ``Running`` phase is inspected further through
    ``self._pod_readiness_status`` and ``self._pod_liveness_status`` so
    that starting and terminating pods are told apart from serving ones.
    """
    # no pod object at all means there is nothing left to inspect
    if pod is None:
        return JobState.destroyed

    # See "Pod Phase" at http://kubernetes.io/v1.1/docs/user-guide/pod-states.html
    # 'Starting' and 'Terminating' are the extra values produced by
    # _pod_readiness_status, not phases read from the pod document here.
    phase_to_state = {
        'Pending': JobState.initialized,
        'Starting': JobState.starting,
        'Running': JobState.up,
        'Terminating': JobState.terminating,
        'Succeeded': JobState.down,
        'Failed': JobState.crashed,
        'Unknown': JobState.error,
    }

    phase = pod['status']['phase']
    if phase != 'Running':
        return phase_to_state[phase]

    # being in a running state can mean a pod is starting, actually running or terminating
    readiness = self._pod_readiness_status(pod)
    if readiness in ('Starting', 'Terminating'):
        return phase_to_state[readiness]

    # is the pod ready to serve requests?
    if readiness == 'Running' and self._pod_liveness_status(pod):
        return phase_to_state[readiness]

    # NOTE(review): a failing liveness check or an 'Unknown' readiness
    # result lands here and still resolves to JobState.up through the
    # phase lookup - confirm that this is intended.
    return phase_to_state[phase]
530545

531546
def _api(self, tmpl, *args):
532547
"""Return a fully-qualified Kubernetes API URL from a string template with args."""
@@ -685,9 +700,43 @@ def _get_schedule_status(self, namespace, name, current, desired, resource_versi
685700

686701
logger.debug("{} out of {} pods in namespace {} are in state {}".format(len(waiting_pods), state_count, namespace, reason)) # noqa
687702

688-
def _get_pod_ready_status(self, namespace, name, num):
703+
# if it was a scale down operation, wait until terminating pods are done
704+
if reason == 'Killing':
705+
self._wait_until_pods_terminate(namespace, name, state_count)
706+
707+
def _wait_until_pods_terminate(self, namespace, name, desired):
    """Wait until no pod belonging to ``name`` is still terminating.

    The pod list of ``namespace`` is fetched through ``self._get_pods``
    once per second, for at most 120 attempts. A pod counts as still
    terminating when its ``generateName`` equals ``name + '-'``, its phase
    is ``Running`` and ``self._pod_readiness_status`` reports
    ``'Terminating'``. The wait ends as soon as that count reaches zero.

    ``desired`` is only used in the log messages. Nothing is returned and
    a timeout is logged rather than raised.
    """
    logger.debug("waiting for {} pods in {} namespace to be terminated (120s timeout)".format(desired, namespace))  # noqa
    prefix = name + '-'
    for waited in range(120):
        count = 0
        pods = self._get_pods(namespace).json()
        for pod in pods['items']:
            # generateName is optional pod metadata, so pods that lack it
            # are skipped instead of aborting the wait with a KeyError
            if pod['metadata'].get('generateName') != prefix:
                continue

            # a pod still in the Running phase whose container reports as
            # terminated has not finished shutting down yet
            if (
                pod['status']['phase'] == 'Running' and
                self._pod_readiness_status(pod) == 'Terminating'
            ):
                count += 1

        # stop when all pods are terminated as expected
        if count == 0:
            break

        if waited > 0 and (waited % 10) == 0:
            logger.debug("waited {}s and {} pods out of {} are fully terminated".format(waited, (desired - count), desired))  # noqa

        time.sleep(1)
    else:
        # the loop ran out of attempts without seeing zero terminating pods,
        # so do not claim success in the log
        logger.debug("timed out after 120s with {} pods in namespace {} still terminating".format(count, namespace))  # noqa
        return

    logger.debug("{} pods in namespace {} are terminated".format(desired, namespace))  # noqa
732+
733+
def _get_pod_ready_status(self, namespace, name, desired):
734+
# If desired is 0 then there is no ready state to check on
735+
if desired == 0:
736+
return
737+
689738
# Ensure the minimum desired number of pods are available
690-
logger.debug("waiting for {} pods in {} namespace to be in services (120s timeout)".format(num, namespace)) # noqa
739+
logger.debug("waiting for {} pods in {} namespace to be in services (120s timeout)".format(desired, namespace)) # noqa
691740
for waited in range(120):
692741
count = 0
693742
pods = self._get_pods(namespace).json()
@@ -697,21 +746,21 @@ def _get_pod_ready_status(self, namespace, name, num):
697746
pod['metadata']['generateName'] == name+'-' and
698747
pod['status']['phase'] == 'Running' and
699748
# is the readiness probe passing?
700-
self._pod_readiness_status(pod, name) and
749+
self._pod_readiness_status(pod) == 'Running' and
701750
# is the pod ready to serve requests?
702751
self._pod_liveness_status(pod)
703752
):
704753
count += 1
705754

706-
if count == num:
755+
if count == desired:
707756
break
708757

709758
if waited > 0 and (waited % 10) == 0:
710759
logger.debug("waited {}s and {} pods are in service".format(waited, count))
711760

712761
time.sleep(1)
713762

714-
logger.debug("{} out of {} pods in namespace {} are in service".format(count, num, namespace)) # noqa
763+
logger.debug("{} out of {} pods in namespace {} are in service".format(count, desired, namespace)) # noqa
715764

716765
def _scale_rc(self, name, namespace, desired):
717766
rc = self._get_rc(name, namespace)
@@ -757,6 +806,7 @@ def _scale_rc(self, name, namespace, desired):
757806
js_template['metadata']['resourceVersion']
758807
)
759808

809+
# Double check enough pods are in the required state to service the application
760810
self._get_pod_ready_status(namespace, name, desired)
761811

762812
def _create_rc(self, name, image, command, **kwargs): # noqa
@@ -948,7 +998,6 @@ def _create_secret(self, namespace, name, data):
948998

949999
url = self._api("/namespaces/{}/secrets", namespace)
9501000
response = self.session.post(url, json=template)
951-
logger.critical(response)
9521001
if unhealthy(response.status_code):
9531002
error(response, 'failed to create secret "{}" in Namespace "{}"', name, namespace)
9541003

@@ -1063,14 +1112,22 @@ def _pod_log(self, name, namespace):
10631112

10641113
return resp.status_code, resp.text, resp.reason
10651114

1066-
def _pod_readiness_status(self, pod, name):
1115+
def _pod_readiness_status(self, pod):
10671116
"""Check if the pod container have passed the readiness probes"""
1117+
name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])
10681118
for container in pod['status']['containerStatuses']:
10691119
# find the right container in case there are many on the pod
1070-
if container['name'] == name and not container['ready']:
1071-
return False
1072-
1073-
return True
1120+
if container['name'] == name:
1121+
if not container['ready']:
1122+
if 'running' in container['state']:
1123+
return 'Starting'
1124+
elif 'terminated' in container['state']:
1125+
return 'Terminating'
1126+
else:
1127+
return 'Running'
1128+
1129+
# Seems like the most sensible default
1130+
return 'Unknown'
10741131

10751132
def _pod_liveness_status(self, pod):
10761133
"""Check if the pods liveness probe status has passed all checks"""

rootfs/scheduler/states.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ def __init__(self, prev, next, msg):
1313
@enum.unique
class JobState(enum.Enum):
    """Enumeration of the states a job can be in.

    The members are numbered by hand. ``enum.unique`` makes a duplicated
    number fail at import time; without it the second member would silently
    become an alias of the first whenever the list is renumbered.
    """
    initialized = 1
    created = 2
    starting = 3
    up = 4
    terminating = 5
    down = 6
    destroyed = 7
    crashed = 8
    error = 9

0 commit comments

Comments
 (0)