Skip to content

Commit 3eb39c0

Browse files
author
Gabriel Monroy
committed
fix(controller): work around fleet state reporting bug
where units report as "failed" before they go "active"
1 parent 2c1ae82 commit 3eb39c0

1 file changed

Lines changed: 5 additions & 1 deletion

File tree

controller/scheduler/coreos.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,7 @@ def start(self, name):
148148
self._wait_for_container(name)
149149

150150
def _wait_for_container(self, name):
151+
failures = 0
151152
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
152153
for _ in range(1200):
153154
states = self._get_state(name)
@@ -157,7 +158,10 @@ def _wait_for_container(self, name):
157158
if subState == 'running' or subState == 'exited':
158159
break
159160
elif subState == 'failed':
160-
raise RuntimeError('container failed to start')
161+
# FIXME: fleet unit state reports failed when containers are fine
162+
failures += 1
163+
if failures == 10:
164+
raise RuntimeError('container failed to start')
161165
time.sleep(1)
162166
else:
163167
raise RuntimeError('container timeout on start')

0 commit comments

Comments
 (0)