Skip to content

Commit 491a066

Browse files
committed
feat(scheduler): clean up stray pods when a release is deleted
Scaling an RC to 0 and deleting it does not always clean up all of its active pods. This can happen if there is an image pull problem.
1 parent d27cbff commit 491a066

2 files changed

Lines changed: 52 additions & 9 deletions

File tree

rootfs/api/models/release.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,11 @@ def cleanup_old(self):
183183
)
184184

185185
# Cleanup controllers
186+
labels = {
187+
'heritage': 'deis'
188+
}
186189
controller_removal = []
187-
controllers = self._scheduler._get_rcs(self.app.id).json()
190+
controllers = self._scheduler._get_rcs(self.app.id, labels=labels).json()
188191
for controller in controllers['items']:
189192
current_version = controller['metadata']['labels']['version']
190193
# skip the latest release
@@ -208,6 +211,7 @@ def cleanup_old(self):
208211
# find stray env secrets to remove that may have been missed
209212
log_event(self.app, 'Cleaning up orphaned environment var secrets', level=logging.DEBUG)
210213
labels = {
214+
'heritage': 'deis',
211215
'app': self.app.id,
212216
'type': 'env'
213217
}
@@ -220,6 +224,22 @@ def cleanup_old(self):
220224

221225
self._scheduler._delete_secret(self.app.id, secret['metadata']['name'])
222226

227+
# Remove stray pods
228+
labels = {
229+
'heritage': 'deis'
230+
}
231+
pods = self._scheduler._get_pods(self.app.id, labels=labels).json()
232+
for pod in pods['items']:
233+
if self._scheduler._pod_deleted(pod):
234+
continue
235+
236+
current_version = pod['metadata']['labels']['version']
237+
# skip the latest release
238+
if current_version == latest_version:
239+
continue
240+
241+
self._scheduler._delete_pod(self.app.id, pod['metadata']['name'])
242+
223243
def _delete_release_in_scheduler(self, namespace, version):
224244
"""
225245
Deletes a specific release in k8s
@@ -228,13 +248,22 @@ def _delete_release_in_scheduler(self, namespace, version):
228248
secret that contains the env var
229249
"""
230250
labels = {
251+
'heritage': 'deis',
231252
'app': namespace,
232253
'version': 'v{}'.format(version)
233254
}
234-
controllers = self._scheduler._get_rcs(namespace, labels=labels)
235-
for controller in controllers.json()['items']:
255+
controllers = self._scheduler._get_rcs(namespace, labels=labels).json()
256+
for controller in controllers['items']:
236257
self._scheduler._scale_rc(namespace, controller['metadata']['name'], 0)
237258
self._scheduler._delete_rc(namespace, controller['metadata']['name'])
259+
# Remove stray pods
260+
labels = controller['metadata']['labels']
261+
pods = self._scheduler._get_pods(namespace, labels=labels).json()
262+
for pod in pods['items']:
263+
if self._scheduler._pod_deleted(pod):
264+
continue
265+
266+
self._scheduler._delete_pod(namespace, pod['metadata']['name'])
238267

239268
# remove secret that contains env vars for the release
240269
try:

rootfs/scheduler/__init__.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -419,25 +419,39 @@ def deploy(self, namespace, name, image, command, **kwargs): # noqa
419419
)
420420

421421
# Remove new release of the RC
422-
self._scale_rc(namespace, new_rc["metadata"]["name"], 0)
423-
self._delete_rc(namespace, new_rc["metadata"]["name"])
422+
self._cleanup_release(namespace, new_rc)
424423

425-
# Bring back old release if available of the RC
424+
# If there was a previous release then bring that back
426425
if old_rc:
427426
self._scale_rc(namespace, old_rc["metadata"]["name"], desired)
428427

429-
raise KubeException('{} (scheduler::deploy): {}'.format(name, e))
428+
raise KubeException(str(e))
430429

431430
# New release is live and kicking. Clean up old release
432431
if old_rc:
433-
self._scale_rc(namespace, old_rc["metadata"]["name"], 0)
434-
self._delete_rc(namespace, old_rc["metadata"]["name"])
432+
self._cleanup_release(namespace, old_rc)
435433

436434
# Make sure the application is routable and uses the correct port
437435
# Done after the fact to let initial deploy settle before routing
438436
# traffic to the application
439437
self._update_application_service(namespace, name, app_type, port, routable)
440438

439+
def _cleanup_release(self, namespace, controller):
    """
    Tear down the resources belonging to one application deployment.

    The replication controller described by ``controller`` is scaled to
    zero and deleted. Afterwards every pod in ``namespace`` that matches the
    controller's labels is deleted individually, except pods for which
    ``_pod_deleted`` returns a truthy value.

    :param namespace: namespace the controller and its pods are looked up in
    :param controller: RC object; ``metadata.name`` and ``metadata.labels`` are read
    """
    metadata = controller['metadata']
    rc_name = metadata['name']

    # Let the RC wind its pods down first, then remove the RC itself
    self._scale_rc(namespace, rc_name, 0)
    self._delete_rc(namespace, rc_name)

    # The scale down can occasionally miss pods, so sweep up anything
    # that still carries the controller's labels
    leftovers = self._get_pods(namespace, labels=metadata['labels']).json()
    for stray in leftovers['items']:
        if not self._pod_deleted(stray):
            self._delete_pod(namespace, stray['metadata']['name'])
454+
441455
def _update_application_service(self, namespace, name, app_type, port, routable=False):
442456
"""Update application service with all the various required information"""
443457
try:

0 commit comments

Comments
 (0)