Skip to content

Commit f3f8131

Browse files
author
Gabriel Monroy
committed
Merge pull request #2072 from gabrtv/retry
Retry Fleet container scheduling operations
2 parents 2714073 + f81c766 commit f3f8131

1 file changed

Lines changed: 22 additions & 5 deletions

File tree

controller/scheduler/coreos.py

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
MATCH = re.compile(
1313
'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
14+
RETRIES = 3
1415

1516

1617
class UHTTPConnection(httplib.HTTPConnection):
@@ -133,14 +134,21 @@ def _create_container(self, name, image, command, unit, **kwargs):
133134
tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
134135
unit.append({"section": "X-Fleet", "name": "MachineMetadata",
135136
"value": tagset})
136-
# post unit to fleet
137-
self._put_unit(name, {"desiredState": "launched", "options": unit})
137+
# post unit to fleet and retry
138+
for attempt in range(RETRIES):
139+
try:
140+
self._put_unit(name, {"desiredState": "launched", "options": unit})
141+
break
142+
except:
143+
if attempt == (RETRIES - 1): # account for 0 indexing
144+
raise
138145

139146
def start(self, name):
140147
"""Start a container"""
141148
self._wait_for_container(name)
142149

143150
def _wait_for_container(self, name):
151+
failures = 0
144152
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
145153
for _ in range(1200):
146154
states = self._get_state(name)
@@ -150,10 +158,13 @@ def _wait_for_container(self, name):
150158
if subState == 'running' or subState == 'exited':
151159
break
152160
elif subState == 'failed':
153-
raise RuntimeError('container failed to start')
161+
# FIXME: fleet unit state reports failed when containers are fine
162+
failures += 1
163+
if failures == 10:
164+
raise RuntimeError('container failed to start')
154165
time.sleep(1)
155166
else:
156-
raise RuntimeError('container failed to start')
167+
raise RuntimeError('container timeout on start')
157168

158169
def _wait_for_destroy(self, name):
159170
for _ in range(30):
@@ -178,7 +189,13 @@ def destroy(self, name):
178189
self._wait_for_destroy(name)
179190

180191
def _destroy_container(self, name):
181-
return self._delete_unit(name)
192+
for attempt in range(RETRIES):
193+
try:
194+
self._delete_unit(name)
195+
break
196+
except:
197+
if attempt == (RETRIES - 1): # account for 0 indexing
198+
raise
182199

183200
def run(self, name, image, entrypoint, command): # noqa
184201
"""Run a one-off command"""

0 commit comments

Comments
 (0)