Skip to content

Commit 3c462b7

Browse files
author
Matthew Fisher
committed
fix(controller): fixup fleet reporting failed state
When fleet loads a job, it sometimes automatically starts and then stops the container. In our case the unit is then reported as 'failed', even though the container is perfectly fine.
1 parent 9ee7525 commit 3c462b7

1 file changed

Lines changed: 30 additions & 13 deletions

File tree

controller/scheduler/fleet.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,30 @@ def __init__(self, target, auth, options, pkey):
4444

4545
# connection helpers
4646

47+
def _request_unit(self, method, name, body=None):
48+
headers = {'Content-Type': 'application/json'}
49+
self.conn.request(method, '/v1-alpha/units/{name}.service'.format(**locals()),
50+
headers=headers, body=json.dumps(body))
51+
return self.conn.getresponse()
52+
53+
def _get_unit(self, name):
54+
for attempt in range(RETRIES):
55+
try:
56+
resp = self._request_unit('GET', name)
57+
data = resp.read()
58+
if not 200 <= resp.status <= 299:
59+
errmsg = "Failed to retrieve unit: {} {} - {}".format(
60+
resp.status, resp.reason, data)
61+
raise RuntimeError(errmsg)
62+
return data
63+
except:
64+
if attempt >= (RETRIES - 1):
65+
raise
66+
4767
def _put_unit(self, name, body):
4868
for attempt in range(RETRIES):
4969
try:
50-
headers = {'Content-Type': 'application/json'}
51-
self.conn.request('PUT', '/v1-alpha/units/{name}.service'.format(**locals()),
52-
headers=headers, body=json.dumps(body))
53-
resp = self.conn.getresponse()
70+
resp = self._request_unit('PUT', name, body)
5471
data = resp.read()
5572
if not 200 <= resp.status <= 299:
5673
errmsg = "Failed to create unit: {} {} - {}".format(
@@ -171,17 +188,8 @@ def _wait_for_container_state(self, name):
171188
raise RuntimeError('container timeout while retrieving state')
172189

173190
def _wait_for_container_running(self, name):
174-
failures = 0
175191
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
176192
for _ in range(1200):
177-
# FIXME: fleet unit state reports failed when containers are fine
178-
state = self._wait_for_container_state(name)
179-
if state.get('systemdSubState') == 'failed':
180-
failures += 1
181-
if failures == 10:
182-
raise RuntimeError('container failed to start')
183-
time.sleep(1)
184-
continue
185193
if self.state(name) == JobState.up:
186194
return
187195
time.sleep(1)
@@ -329,8 +337,17 @@ def state(self, name):
329337
"deactivating": "down",
330338
}
331339
try:
340+
# NOTE (bacongobbler): this call to ._get_unit() also acts as a pre-emptive check to
341+
# determine if the job no longer exists (will raise a RuntimeError on 404)
342+
unit = self._get_unit(name)
332343
state = self._wait_for_container_state(name)
333344
activeState = state['systemdActiveState']
345+
# FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
346+
# stop the container, which in our case will return as 'failed', even though
347+
# the container is perfectly fine.
348+
if activeState == 'failed':
349+
if json.loads(unit)['currentState'] == 'loaded':
350+
return JobState.created
334351
return getattr(JobState, systemdActiveStateMap[activeState])
335352
except KeyError:
336353
# failed retrieving a proper response from the fleet API

0 commit comments

Comments
 (0)