@@ -44,13 +44,30 @@ def __init__(self, target, auth, options, pkey):
4444
4545 # connection helpers
4646
47+ def _request_unit (self , method , name , body = None ):
48+ headers = {'Content-Type' : 'application/json' }
49+ self .conn .request (method , '/v1-alpha/units/{name}.service' .format (** locals ()),
50+ headers = headers , body = json .dumps (body ))
51+ return self .conn .getresponse ()
52+
def _get_unit(self, name):
    """Retrieve the unit called ``name``, retrying on failure.

    Up to ``RETRIES`` attempts are made through ``self._request_unit``.

    :param name: unit name, without the ``.service`` suffix.
    :returns: the raw response body as read from the response object.
    :raises RuntimeError: if the final attempt returns a non-2xx status.
    :raises Exception: any other error raised by the final attempt is
        re-raised unchanged.
    """
    for attempt in range(RETRIES):
        try:
            resp = self._request_unit('GET', name)
            data = resp.read()
            if not 200 <= resp.status <= 299:
                errmsg = "Failed to retrieve unit: {} {} - {}".format(
                    resp.status, resp.reason, data)
                raise RuntimeError(errmsg)
            return data
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit propagate immediately instead
            # of being swallowed and retried.
            # Only the last attempt is allowed to surface its error.
            if attempt >= (RETRIES - 1):
                raise
66+
4767 def _put_unit (self , name , body ):
4868 for attempt in range (RETRIES ):
4969 try :
50- headers = {'Content-Type' : 'application/json' }
51- self .conn .request ('PUT' , '/v1-alpha/units/{name}.service' .format (** locals ()),
52- headers = headers , body = json .dumps (body ))
53- resp = self .conn .getresponse ()
70+ resp = self ._request_unit ('PUT' , name , body )
5471 data = resp .read ()
5572 if not 200 <= resp .status <= 299 :
5673 errmsg = "Failed to create unit: {} {} - {}" .format (
@@ -171,17 +188,8 @@ def _wait_for_container_state(self, name):
171188 raise RuntimeError ('container timeout while retrieving state' )
172189
173190 def _wait_for_container_running (self , name ):
174- failures = 0
175191 # we bump to 20 minutes here to match the timeout on the router and in the app unit files
176192 for _ in range (1200 ):
177- # FIXME: fleet unit state reports failed when containers are fine
178- state = self ._wait_for_container_state (name )
179- if state .get ('systemdSubState' ) == 'failed' :
180- failures += 1
181- if failures == 10 :
182- raise RuntimeError ('container failed to start' )
183- time .sleep (1 )
184- continue
185193 if self .state (name ) == JobState .up :
186194 return
187195 time .sleep (1 )
@@ -329,8 +337,17 @@ def state(self, name):
329337 "deactivating" : "down" ,
330338 }
331339 try :
340+ # NOTE (bacongobbler): this call to ._get_unit() also acts as a pre-emptive check to
341+ # determine if the job no longer exists (will raise a RuntimeError on 404)
342+ unit = self ._get_unit (name )
332343 state = self ._wait_for_container_state (name )
333344 activeState = state ['systemdActiveState' ]
345+ # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
346+ # stop the container, which in our case will return as 'failed', even though
347+ # the container is perfectly fine.
348+ if activeState == 'failed' :
349+ if json .loads (unit )['currentState' ] == 'loaded' :
350+ return JobState .created
334351 return getattr (JobState , systemdActiveStateMap [activeState ])
335352 except KeyError :
336353 # failed retrieving a proper response from the fleet API
0 commit comments