Skip to content

Commit 0ac6780

Browse files
committed
Merge pull request #3127 from mboersma/fix-worker-timeouts
fix(controller): ensure cleanup of "deis run" fleet units
2 parents dc5a3a5 + caa3eac commit 0ac6780

1 file changed

Lines changed: 84 additions & 81 deletions

File tree

controller/scheduler/fleet.py

Lines changed: 84 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -221,92 +221,95 @@ def run(self, name, image, entrypoint, command): # noqa
221221
time.sleep(1)
222222
else:
223223
raise RuntimeError('container did not report state')
224-
machineID = state.get('machineID')
225-
226-
# find the machine
227-
machines = self._get_machines()
228-
if not machines:
229-
raise RuntimeError('no available hosts to run command')
230-
231-
# find the machine's primaryIP
232-
primaryIP = None
233-
for m in machines.get('machines', []):
234-
if m['id'] == machineID:
235-
primaryIP = m['primaryIP']
236-
if not primaryIP:
237-
raise RuntimeError('could not find host')
238-
239-
# prepare ssh key
240-
file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
241-
pkey = paramiko.RSAKey(file_obj=file_obj)
242-
243-
# grab output via docker logs over SSH
244-
ssh = paramiko.SSHClient()
245-
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
246-
ssh.connect(primaryIP, username="core", pkey=pkey)
247-
# share a transport
248-
tran = ssh.get_transport()
249-
250-
def _do_ssh(cmd):
251-
chan = tran.open_session()
252-
# get a pty so stdout/stderr look right
253-
chan.get_pty()
254-
out = chan.makefile()
255-
chan.exec_command(cmd)
256-
rc, output = chan.recv_exit_status(), out.read()
257-
return rc, output
258-
259-
# wait for container to launch
260-
# we loop indefinitely here, as we have no idea how long the docker pull will take
261-
while True:
262-
rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
263-
if rc == 0:
264-
break
265-
time.sleep(1)
266-
else:
267-
raise RuntimeError('failed to create container')
268224

269-
# wait for container to start
270-
for _ in range(2):
271-
_rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
225+
try:
226+
machineID = state.get('machineID')
227+
228+
# find the machine
229+
machines = self._get_machines()
230+
if not machines:
231+
raise RuntimeError('no available hosts to run command')
232+
233+
# find the machine's primaryIP
234+
primaryIP = None
235+
for m in machines.get('machines', []):
236+
if m['id'] == machineID:
237+
primaryIP = m['primaryIP']
238+
if not primaryIP:
239+
raise RuntimeError('could not find host')
240+
241+
# prepare ssh key
242+
file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
243+
pkey = paramiko.RSAKey(file_obj=file_obj)
244+
245+
# grab output via docker logs over SSH
246+
ssh = paramiko.SSHClient()
247+
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
248+
ssh.connect(primaryIP, username="core", pkey=pkey)
249+
# share a transport
250+
tran = ssh.get_transport()
251+
252+
def _do_ssh(cmd):
253+
chan = tran.open_session()
254+
# get a pty so stdout/stderr look right
255+
chan.get_pty()
256+
out = chan.makefile()
257+
chan.exec_command(cmd)
258+
rc, output = chan.recv_exit_status(), out.read()
259+
return rc, output
260+
261+
# wait for container to launch
262+
# we loop indefinitely here, as we have no idea how long the docker pull will take
263+
while True:
264+
rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
265+
if rc == 0:
266+
break
267+
time.sleep(1)
268+
else:
269+
raise RuntimeError('failed to create container')
270+
271+
# wait for container to start
272+
for _ in range(2):
273+
_rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
274+
if _rc != 0:
275+
raise RuntimeError('failed to inspect container')
276+
_container = json.loads(_output)
277+
started_at = _container[0]["State"]["StartedAt"]
278+
if not started_at.startswith('0001'):
279+
break
280+
time.sleep(1)
281+
else:
282+
raise RuntimeError('container failed to start')
283+
284+
# wait for container to complete
285+
for _ in range(1200):
286+
_rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
287+
if _rc != 0:
288+
raise RuntimeError('failed to inspect container')
289+
_container = json.loads(_output)
290+
finished_at = _container[0]["State"]["FinishedAt"]
291+
if not finished_at.startswith('0001'):
292+
break
293+
time.sleep(1)
294+
else:
295+
raise RuntimeError('container timed out')
296+
297+
# gather container output
298+
_rc, output = _do_ssh('docker logs {name}'.format(**locals()))
272299
if _rc != 0:
273-
raise RuntimeError('failed to inspect container')
274-
_container = json.loads(_output)
275-
started_at = _container[0]["State"]["StartedAt"]
276-
if not started_at.startswith('0001'):
277-
break
278-
time.sleep(1)
279-
else:
280-
raise RuntimeError('container failed to start')
300+
raise RuntimeError('could not attach to container')
281301

282-
# wait for container to complete
283-
for _ in range(1200):
302+
# determine container exit code
284303
_rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
285304
if _rc != 0:
286-
raise RuntimeError('failed to inspect container')
287-
_container = json.loads(_output)
288-
finished_at = _container[0]["State"]["FinishedAt"]
289-
if not finished_at.startswith('0001'):
290-
break
291-
time.sleep(1)
292-
else:
293-
raise RuntimeError('container timed out')
294-
295-
# gather container output
296-
_rc, output = _do_ssh('docker logs {name}'.format(**locals()))
297-
if _rc != 0:
298-
raise RuntimeError('could not attach to container')
299-
300-
# determine container exit code
301-
_rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
302-
if _rc != 0:
303-
raise RuntimeError('could not determine exit code')
304-
container = json.loads(_output)
305-
rc = container[0]["State"]["ExitCode"]
306-
307-
# cleanup
308-
self._destroy_container(name)
309-
self._wait_for_destroy(name)
305+
raise RuntimeError('could not determine exit code')
306+
container = json.loads(_output)
307+
rc = container[0]["State"]["ExitCode"]
308+
309+
finally:
310+
# cleanup
311+
self._destroy_container(name)
312+
self._wait_for_destroy(name)
310313

311314
# return rc and output
312315
return rc, output

0 commit comments

Comments
 (0)