Skip to content

Commit 689df78

Browse files
authored
feat(scheduler): add the ability to set KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS per application (#1026)
Close #807
1 parent ffa9040 commit 689df78

5 files changed

Lines changed: 40 additions & 8 deletions

File tree

rootfs/api/models/app.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,9 @@ def _scale_pods(self, scale_types):
405405
# see if the app config has deploy timeout preference, otherwise use global
406406
deploy_timeout = release.config.values.get('DEIS_DEPLOY_TIMEOUT', settings.DEIS_DEPLOY_TIMEOUT) # noqa
407407

408+
# get application level pod termination grace period
409+
pod_termination_grace_period_seconds = release.config.values.get('KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS', settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS) # noqa
410+
408411
tasks = []
409412
for scale_type, replicas in scale_types.items():
410413
# only web / cmd are routable
@@ -433,6 +436,7 @@ def _scale_pods(self, scale_types):
433436
'routable': routable,
434437
'deploy_batches': batches,
435438
'deploy_timeout': deploy_timeout,
439+
'pod_termination_grace_period_seconds': pod_termination_grace_period_seconds,
436440
}
437441

438442
# gather all proc types to be deployed
@@ -486,6 +490,9 @@ def deploy(self, release, force_deploy=False):
486490

487491
deployment_history = release.config.values.get('KUBERNETES_DEPLOYMENTS_REVISION_HISTORY_LIMIT', settings.KUBERNETES_DEPLOYMENTS_REVISION_HISTORY_LIMIT) # noqa
488492

493+
# get application level pod termination grace period
494+
pod_termination_grace_period_seconds = release.config.values.get('KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS', settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS) # noqa
495+
489496
# deploy application to k8s. Also handles initial scaling
490497
deploys = {}
491498
image = release.image
@@ -512,7 +519,6 @@ def deploy(self, release, force_deploy=False):
512519
'tags': tags,
513520
'envs': envs,
514521
'registry': release.config.registry,
515-
# only used if there is no previous RC
516522
'replicas': replicas,
517523
'version': version,
518524
'app_type': scale_type,
@@ -522,7 +528,8 @@ def deploy(self, release, force_deploy=False):
522528
'deploy_batches': batches,
523529
'deploy_timeout': deploy_timeout,
524530
'deployment_history_limit': deployment_history,
525-
'release_summary': release.summary
531+
'release_summary': release.summary,
532+
'pod_termination_grace_period_seconds': pod_termination_grace_period_seconds,
526533
}
527534

528535
# Sort deploys so routable comes first
@@ -729,6 +736,9 @@ def pod_name(size=5, chars=string.ascii_lowercase + string.digits):
729736
# see if the app config has deploy timeout preference, otherwise use global
730737
deploy_timeout = release.config.values.get('DEIS_DEPLOY_TIMEOUT', settings.DEIS_DEPLOY_TIMEOUT) # noqa
731738

739+
# get application level pod termination grace period
740+
pod_termination_grace_period_seconds = release.config.values.get('KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS', settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS) # noqa
741+
732742
name = self._get_job_id(scale_type) + '-' + pod_name()
733743
self.log("{} on {} runs '{}'".format(user.username, name, command))
734744

@@ -743,7 +753,8 @@ def pod_name(size=5, chars=string.ascii_lowercase + string.digits):
743753
'registry': release.config.registry,
744754
'version': version,
745755
'build_type': release.build.type,
746-
'deploy_timeout': deploy_timeout
756+
'deploy_timeout': deploy_timeout,
757+
'pod_termination_grace_period_seconds': pod_termination_grace_period_seconds,
747758
}
748759

749760
try:

rootfs/scheduler/resources/pod.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ def manifest(self, namespace, name, image, **kwargs):
110110
# apply tags as needed to restrict pod to particular node(s)
111111
spec['nodeSelector'] = kwargs.get('tags', {})
112112

113-
# How long until a pod is forcefully terminated
114-
spec['terminationGracePeriodSeconds'] = settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS # noqa
113+
# How long until a pod is forcefully terminated. 30 is kubernetes default
114+
spec['terminationGracePeriodSeconds'] = kwargs.get('pod_termination_grace_period_seconds', 30) # noqa
115115

116116
# set the image pull policy that is associated with the application container
117117
kwargs['image_pull_policy'] = settings.DOCKER_BUILDER_IMAGE_PULL_POLICY
@@ -384,14 +384,20 @@ def _set_image_secret(self, data, namespace, **kwargs):
384384
data['imagePullSecrets'] = [{'name': secret_name}]
385385

386386
def delete(self, namespace, name):
387+
# get timeout info from pod
388+
pod = self.pod.get(namespace, name).json()
389+
# 30 seconds is the kubernetes default
390+
timeout = pod['spec'].get('terminationGracePeriodSeconds', 30)
391+
392+
# delete pod
387393
url = self.api("/namespaces/{}/pods/{}", namespace, name)
388394
resp = self.session.delete(url)
389395
if self.unhealthy(resp.status_code):
390396
raise KubeHTTPException(resp, 'delete Pod "{}" in Namespace "{}"', name, namespace)
391397

392398
# Verify the pod has been deleted
393399
# Only wait as long as the grace period is - k8s will eventually GC
394-
for _ in range(settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS):
400+
for _ in range(timeout):
395401
try:
396402
pod = self.pod.get(namespace, name).json()
397403
# hide pod if it is past the graceful termination period
@@ -630,7 +636,15 @@ def wait_until_terminated(self, namespace, labels, current, desired):
630636
# https://github.com/kubernetes/kubernetes/blob/release-1.2/docs/devel/api-conventions.md#metadata
631637
# http://kubernetes.io/docs/user-guide/pods/#termination-of-pods
632638

633-
timeout = settings.KUBERNETES_POD_TERMINATION_GRACE_PERIOD_SECONDS
639+
# fetch timeout from the first pod
640+
pods = self.get(namespace, labels=labels).json()
641+
if not pods['items']:
642+
return
643+
644+
spec = pods['items'][0]['spec']
645+
# default to 30 since that's kubernetes default
646+
timeout = spec.get('terminationGracePeriodSeconds', 30)
647+
634648
delta = current - desired
635649
self.log(namespace, "waiting for {} pods to be terminated ({}s timeout)".format(delta, timeout)) # noqa
636650
for waited in range(timeout):

rootfs/scheduler/tests/test_deployments.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def create(self, namespace=None, name=generate_random_name(), **kwargs):
2121
'app_type': kwargs.get('app_type', 'web'),
2222
'version': kwargs.get('version', 'v99'),
2323
'replicas': kwargs.get('replicas', 4),
24+
'pod_termination_grace_period_seconds': 2,
2425
}
2526

2627
deployment = self.scheduler.deployment.create(namespace, name, 'quay.io/fake/image',
@@ -38,6 +39,7 @@ def update(self, namespace=None, name=generate_random_name(), **kwargs):
3839
'app_type': kwargs.get('app_type', 'web'),
3940
'version': kwargs.get('version', 'v99'),
4041
'replicas': kwargs.get('replicas', 4),
42+
'pod_termination_grace_period_seconds': 2,
4143
}
4244

4345
deployment = self.scheduler.deployment.update(namespace, name, 'quay.io/fake/image',
@@ -56,6 +58,7 @@ def scale(self, namespace=None, name=generate_random_name(), **kwargs):
5658
'app_type': kwargs.get('app_type', 'web'),
5759
'version': kwargs.get('version', 'v99'),
5860
'replicas': kwargs.get('replicas', 4),
61+
'pod_termination_grace_period_seconds': 2,
5962
}
6063

6164
self.scheduler.scale(namespace, name, 'quay.io/fake/image', 'sh', 'start', **kwargs)

rootfs/scheduler/tests/test_horizontalpodautoscaler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def create(self, namespace=None, name=generate_random_name(), **kwargs):
2323
'app_type': kwargs.get('app_type', 'web'),
2424
'version': kwargs.get('version', 'v99'),
2525
'replicas': kwargs.get('replicas', 1),
26+
'pod_termination_grace_period_seconds': 2,
2627
}
2728

2829
# create a Deployment to test HPA with
@@ -68,6 +69,7 @@ def update_deployment(self, namespace=None, name=generate_random_name(), **kwarg
6869
'app_type': kwargs.get('app_type', 'web'),
6970
'version': kwargs.get('version', 'v99'),
7071
'replicas': kwargs.get('replicas', 4),
72+
'pod_termination_grace_period_seconds': 2,
7173
}
7274

7375
deployment = self.scheduler.deployment.update(namespace, name, 'quay.io/fake/image',

rootfs/scheduler/tests/test_replicationcontrollers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def create(self, namespace=None, name=generate_random_name(), **kwargs):
2121
'app_type': kwargs.get('app_type', 'web'),
2222
'version': kwargs.get('version', 'v99'),
2323
'replicas': kwargs.get('replicas', 4),
24+
'pod_termination_grace_period_seconds': 2,
2425
}
2526

2627
rc = self.scheduler.rc.create(namespace, name, 'quay.io/fake/image',
@@ -39,7 +40,8 @@ def scale_rc(self, namespace=None, name=generate_random_name(), **kwargs):
3940
'app_type': kwargs.get('app_type', 'web'),
4041
'version': kwargs.get('version', 'v99'),
4142
'replicas': kwargs.get('replicas', 4),
42-
'deploy_timeout': 120
43+
'deploy_timeout': 120,
44+
'pod_termination_grace_period_seconds': 2,
4345
}
4446

4547
self.scheduler.scale_rc(namespace, name, 'quay.io/fake/image', 'sh', 'start', **kwargs)

0 commit comments

Comments
 (0)