1111
# Compiled pattern exposing the named groups ``app``, ``version``, ``c_type``
# and ``c_num``. Written as a raw string so that ``\.`` reaches the regex
# engine as-is instead of being an invalid string escape; the resulting
# pattern text is unchanged.
MATCH = re.compile(
    r'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')

# Maximum number of attempts made by the retry loops that iterate over
# ``range(RETRIES)`` before the last error is re-raised.
RETRIES = 3
1415
1516
1617class UHTTPConnection (httplib .HTTPConnection ):
@@ -133,14 +134,21 @@ def _create_container(self, name, image, command, unit, **kwargs):
133134 tagset = ' ' .join (['"{}={}"' .format (k , v ) for k , v in tags .items ()])
134135 unit .append ({"section" : "X-Fleet" , "name" : "MachineMetadata" ,
135136 "value" : tagset })
136- # post unit to fleet
137- self ._put_unit (name , {"desiredState" : "launched" , "options" : unit })
137+ # post unit to fleet and retry
138+ for attempt in range (RETRIES ):
139+ try :
140+ self ._put_unit (name , {"desiredState" : "launched" , "options" : unit })
141+ break
142+ except :
143+ if attempt == (RETRIES - 1 ): # account for 0 indexing
144+ raise
138145
def start(self, name):
    """Start a container.

    Delegates entirely to ``self._wait_for_container(name)``: the call
    blocks until that helper returns and lets any exception it raises
    propagate to the caller. Nothing is returned.
    """
    # No start request is issued here; this method only waits on the helper.
    self._wait_for_container(name)
142149
143150 def _wait_for_container (self , name ):
151+ failures = 0
144152 # we bump to 20 minutes here to match the timeout on the router and in the app unit files
145153 for _ in range (1200 ):
146154 states = self ._get_state (name )
@@ -150,10 +158,13 @@ def _wait_for_container(self, name):
150158 if subState == 'running' or subState == 'exited' :
151159 break
152160 elif subState == 'failed' :
153- raise RuntimeError ('container failed to start' )
161+ # FIXME: fleet unit state reports failed when containers are fine
162+ failures += 1
163+ if failures == 10 :
164+ raise RuntimeError ('container failed to start' )
154165 time .sleep (1 )
155166 else :
156- raise RuntimeError ('container failed to start' )
167+ raise RuntimeError ('container timeout on start' )
157168
158169 def _wait_for_destroy (self , name ):
159170 for _ in range (30 ):
@@ -178,7 +189,13 @@ def destroy(self, name):
178189 self ._wait_for_destroy (name )
179190
def _destroy_container(self, name):
    """Delete the unit ``name``, retrying when the delete call fails.

    ``self._delete_unit(name)`` is attempted up to ``RETRIES`` times. The
    result of the first successful attempt is returned. If every attempt
    raises, the exception from the final attempt propagates to the caller.
    """
    for attempt in range(RETRIES):
        try:
            # Hand the result back to the caller, as the pre-retry
            # implementation did.
            return self._delete_unit(name)
        except Exception:
            # Only ordinary errors are retried: a bare ``except`` would also
            # trap KeyboardInterrupt/SystemExit and retry instead of stopping.
            if attempt == RETRIES - 1:  # range() is 0-indexed: last attempt
                raise
182199
183200 def run (self , name , image , entrypoint , command ): # noqa
184201 """Run a one-off command"""
0 commit comments