Skip to content

Commit 2e76cf1

Browse files
author
Matthew Fisher
committed
feat(controller): add ps:restart command
1 parent 4b6cc67 commit 2e76cf1

10 files changed

Lines changed: 184 additions & 12 deletions

File tree

client/deis.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,6 +1700,7 @@ def ps(self, args):
17001700
Valid commands for processes:
17011701
17021702
ps:list list application processes
1703+
ps:restart restart process types (e.g. web, worker)
17031704
ps:scale scale processes (e.g. web=4 worker=2)
17041705
17051706
Use `deis help [command]` to learn more.
@@ -1737,6 +1738,45 @@ def ps_list(self, args, app=None):
17371738
self._logger.info("{type}.{num} {state} ({release})".format(**c))
17381739
self._logger.info('')
17391740

1741+
def ps_restart(self, args):
1742+
"""
1743+
Restarts an application's processes by type.
1744+
1745+
Usage: deis ps:restart [<type>] [options]
1746+
1747+
Arguments:
1748+
<type>
1749+
the process name as defined in your Procfile, such as 'web' or 'worker'.
1750+
1751+
Options:
1752+
-a --app=<app>
1753+
the uniquely identifiable name for the application.
1754+
"""
1755+
app = args.get('--app')
1756+
if not app:
1757+
app = self._session.app
1758+
restarting_cmd = 'Restarting processes... but first, {}!\n'.format(
1759+
os.environ.get('DEIS_DRINK_OF_CHOICE', 'coffee'))
1760+
sys.stdout.write(restarting_cmd)
1761+
sys.stdout.flush()
1762+
try:
1763+
progress = TextProgress()
1764+
progress.start()
1765+
before = time.time()
1766+
url = '/v1/apps/{}/containers/restart'.format(app)
1767+
if args.get('<type>'):
1768+
url = '/v1/apps/{}/containers/{}/restart'.format(app,
1769+
args.get('<type>'))
1770+
response = self._dispatch('post', url)
1771+
finally:
1772+
progress.cancel()
1773+
progress.join()
1774+
if response.status_code == requests.codes.ok:
1775+
self._logger.info('done in {}s'.format(int(time.time() - before)))
1776+
self.ps_list({}, app)
1777+
else:
1778+
raise ResponseError(response)
1779+
17401780
def ps_scale(self, args):
17411781
"""
17421782
Scales an application's processes by type.

controller/api/models.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,13 @@ def delete(self, *args, **kwargs):
198198
self._clean_app_logs()
199199
return super(App, self).delete(*args, **kwargs)
200200

201+
def restart(self, **kwargs):
202+
to_restart = self.container_set.all()
203+
if kwargs.get('type'):
204+
to_restart = self.container_set.filter(type=kwargs.get('type'))
205+
self._restart_containers(to_restart)
206+
return to_restart
207+
201208
def _clean_app_logs(self):
202209
"""Delete application logs stored by the logger component"""
203210
path = os.path.join(settings.DEIS_LOG_DIR, self.id + '.log')
@@ -282,6 +289,27 @@ def _start_containers(self, to_add):
282289
err = 'warning, some containers failed to start'
283290
log_event(self, err, logging.WARNING)
284291

292+
def _restart_containers(self, to_restart):
293+
"""Restarts containers via the scheduler"""
294+
stop_threads = []
295+
start_threads = []
296+
if not to_restart:
297+
# do nothing if we didn't request any containers
298+
return
299+
for c in to_restart:
300+
stop_threads.append(threading.Thread(target=c.stop))
301+
start_threads.append(threading.Thread(target=c.start))
302+
[t.start() for t in stop_threads]
303+
[t.join() for t in stop_threads]
304+
if set([c.state for c in to_restart]) != set(['created']):
305+
err = 'warning, some containers failed to stop'
306+
log_event(self, err, logging.WARNING)
307+
[t.start() for t in start_threads]
308+
[t.join() for t in start_threads]
309+
if set([c.state for c in to_restart]) != set(['up']):
310+
err = 'warning, some containers failed to start'
311+
log_event(self, err, logging.WARNING)
312+
285313
def _destroy_containers(self, to_destroy):
286314
"""Destroys containers via the scheduler"""
287315
destroy_threads = []

controller/api/tests/test_container.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from django.contrib.auth.models import User
1414
from django.test import TransactionTestCase
15-
from scheduler.states import TransitionNotAllowed
15+
from scheduler.states import TransitionError
1616
from rest_framework.authtoken.models import Token
1717

1818
from api.models import App, Build, Container, Release
@@ -57,7 +57,7 @@ def test_container_state_good(self):
5757
num=1)
5858
self.assertEqual(c.state, 'initialized')
5959
# test an illegal transition
60-
self.assertRaises(TransitionNotAllowed, lambda: c.start())
60+
self.assertRaises(TransitionError, lambda: c.start())
6161
c.create()
6262
self.assertEqual(c.state, 'created')
6363
c.start()

controller/api/tests/test_scheduler.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,75 @@ def test_start_chaos(self):
119119
states = set([c['state'] for c in response.data['results']])
120120
self.assertEqual(states, set(['crashed', 'up']))
121121

122+
def test_restart_chaos(self):
123+
url = '/v1/apps'
124+
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
125+
self.assertEqual(response.status_code, 201)
126+
app_id = response.data['id']
127+
# post a new build
128+
url = "/v1/apps/{app_id}/builds".format(**locals())
129+
body = {'image': 'autotest/example', 'sha': 'a'*40,
130+
'procfile': json.dumps({'web': 'node server.js', 'worker': 'node worker.js'})}
131+
response = self.client.post(url, json.dumps(body), content_type='application/json',
132+
HTTP_AUTHORIZATION='token {}'.format(self.token))
133+
self.assertEqual(response.status_code, 201)
134+
url = "/v1/apps/{app_id}/containers".format(**locals())
135+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
136+
self.assertEqual(response.status_code, 200)
137+
self.assertEqual(len(response.data['results']), 1)
138+
# scale up, which will allow some crashed containers
139+
url = "/v1/apps/{app_id}/scale".format(**locals())
140+
body = {'web': 20, 'worker': 20}
141+
response = self.client.post(url, json.dumps(body), content_type='application/json',
142+
HTTP_AUTHORIZATION='token {}'.format(self.token))
143+
self.assertEqual(response.status_code, 204)
144+
# let's get chaotic
145+
chaos.STOP_ERROR_RATE = 0.5
146+
chaos.START_ERROR_RATE = 0.5
147+
# reboot the web processes
148+
url = "/v1/apps/{app_id}/containers/web/restart".format(**locals())
149+
response = self.client.post(url,
150+
content_type='application/json',
151+
HTTP_AUTHORIZATION='token {}'.format(self.token))
152+
self.assertEqual(response.status_code, 200, response.data)
153+
# inspect broken containers
154+
url = "/v1/apps/{app_id}/containers".format(**locals())
155+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
156+
self.assertEqual(response.status_code, 200)
157+
self.assertEqual(response.data['count'], 40)
158+
# make sure some failed
159+
states = set([c['state'] for c in response.data['results']])
160+
self.assertEqual(states, set(['crashed', 'up']))
161+
# make sure that we only rebooted the web processes
162+
types = set([c['type'] for c in response.data['results'] if c['state'] == 'crashed'])
163+
self.assertEqual(types, set(['web']))
164+
# start fresh
165+
chaos.STOP_ERROR_RATE = 0.0
166+
chaos.START_ERROR_RATE = 0.0
167+
url = "/v1/apps/{app_id}/containers/web/restart".format(**locals())
168+
response = self.client.post(url,
169+
content_type='application/json',
170+
HTTP_AUTHORIZATION='token {}'.format(self.token))
171+
# let the carnage continue
172+
chaos.STOP_ERROR_RATE = 0.5
173+
chaos.START_ERROR_RATE = 0.5
174+
# reboot ALL the containers!
175+
url = "/v1/apps/{app_id}/containers/restart".format(**locals())
176+
response = self.client.post(url,
177+
content_type='application/json',
178+
HTTP_AUTHORIZATION='token {}'.format(self.token))
179+
self.assertEqual(response.status_code, 200)
180+
# inspect broken containers
181+
url = "/v1/apps/{app_id}/containers".format(**locals())
182+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
183+
self.assertEqual(response.status_code, 200)
184+
self.assertEqual(len(response.data['results']), 40)
185+
# make sure some failed
186+
states = set([c['state'] for c in response.data['results']])
187+
self.assertEqual(states, set(['crashed', 'up']))
188+
types = set([c['type'] for c in response.data['results']])
189+
self.assertEqual(types, set(['web', 'worker']))
190+
122191
def test_destroy_chaos(self):
123192
url = '/v1/apps'
124193
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))

controller/api/urls.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
url(r'^apps/(?P<id>{})/releases/?'.format(settings.APP_URL_REGEX),
2727
views.ReleaseViewSet.as_view({'get': 'list'})),
2828
# application infrastructure
29+
url(r'^apps/(?P<id>{})/containers/restart/?'.format(settings.APP_URL_REGEX),
30+
views.ContainerViewSet.as_view({'post': 'restart'})),
31+
url(r'^apps/(?P<id>{})/containers/(?P<type>[-_\w.]+)/restart/?'.format(settings.APP_URL_REGEX),
32+
views.ContainerViewSet.as_view({'post': 'restart'})),
2933
url(r'^apps/(?P<id>{})/containers/(?P<type>[-_\w]+)/(?P<num>[-_\w]+)/?'.format(
3034
settings.APP_URL_REGEX),
3135
views.ContainerViewSet.as_view({'get': 'retrieve'})),

controller/api/views.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,14 @@ def get_object(self, **kwargs):
213213
qs = self.get_queryset(**kwargs)
214214
return qs.get(num=self.kwargs['num'])
215215

216+
def restart(self, *args, **kwargs):
217+
try:
218+
containers = self.get_app().restart(**kwargs)
219+
serializer = self.get_serializer(containers, many=True)
220+
return Response(serializer.data, status=status.HTTP_200_OK)
221+
except Exception as e:
222+
return Response({'detail': str(e)}, status=status.HTTP_503_SERVICE_UNAVAILABLE)
223+
216224

217225
class DomainViewSet(AppResourceViewSet):
218226
"""A viewset for interacting with Domain objects."""

controller/scheduler/fleet.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,20 @@ def _wait_for_container_state(self, name):
188188
raise RuntimeError('container timeout while retrieving state')
189189

190190
def _wait_for_container_running(self, name):
191+
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
192+
try:
193+
self._wait_for_job_state(name, JobState.up)
194+
except RuntimeError:
195+
raise RuntimeError('container failed to start')
196+
197+
def _wait_for_job_state(self, name, state):
191198
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
192199
for _ in range(1200):
193-
if self.state(name) == JobState.up:
200+
if self.state(name) == state:
194201
return
195202
time.sleep(1)
196203
else:
197-
raise RuntimeError('container failed to start')
204+
raise RuntimeError('timeout waiting for job state: {}'.format(state))
198205

199206
def _wait_for_destroy(self, name):
200207
for _ in range(30):
@@ -206,7 +213,8 @@ def _wait_for_destroy(self, name):
206213

207214
def stop(self, name):
208215
"""Stop a container"""
209-
raise NotImplementedError
216+
self._put_unit(name, {"desiredState": "loaded"})
217+
self._wait_for_job_state(name, JobState.created)
210218

211219
def destroy(self, name):
212220
"""Destroy a container"""

controller/scheduler/mock.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import json
22
from cStringIO import StringIO
3-
from .states import JobState, TransitionNotAllowed
3+
from .states import JobState, TransitionError
44

55

66
# HACK: MockSchedulerClient is not persistent across requests
@@ -55,8 +55,14 @@ def start(self, name):
5555
"""
5656
Start a container
5757
"""
58-
if self.state(name) not in [JobState.created, JobState.up, JobState.down]:
59-
raise TransitionNotAllowed
58+
if self.state(name) not in [JobState.created,
59+
JobState.up,
60+
JobState.down,
61+
JobState.crashed,
62+
JobState.error]:
63+
raise TransitionError(self.state(name),
64+
JobState.up,
65+
'the container must be stopped or up to start')
6066
job = jobs.get(name, {})
6167
job.update({'state': JobState.up})
6268
jobs[name] = job
@@ -77,9 +83,11 @@ def stop(self, name):
7783
Stop a container
7884
"""
7985
job = jobs.get(name, {})
80-
if job.get('state') != JobState.up:
81-
raise TransitionNotAllowed
82-
job.update({'state': JobState.stopped})
86+
if job.get('state') not in [JobState.up, JobState.crashed, JobState.error]:
87+
raise TransitionError(job.get('state'),
88+
JobState.up,
89+
'the container must be up to stop')
90+
job.update({'state': JobState.down})
8391
jobs[name] = job
8492
return
8593

controller/scheduler/states.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
import enum
22

33

4-
class TransitionNotAllowed(Exception):
4+
class TransitionError(Exception):
55
"""Raised when a transition from one state to another is illegal"""
66

7+
def __init__(self, prev, next, msg):
8+
self.prev = prev
9+
self.next = next
10+
self.msg = msg
11+
712

813
class JobState(enum.Enum):
914
initialized = 1

tests/ps_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@ var (
1616
psListCmd = "ps:list --app={{.AppName}}"
1717
psScaleCmd = "ps:scale web={{.ProcessNum}} --app={{.AppName}}"
1818
psDownScaleCmd = "ps:scale web=0 --app={{.AppName}}"
19+
psRestartCmd = "ps:restart web --app={{.AppName}}"
1920
)
2021

2122
func TestPs(t *testing.T) {
2223
params := psSetup(t)
2324
psScaleTest(t, params, psScaleCmd)
2425
appsOpenTest(t, params)
2526
psListTest(t, params, false)
27+
psScaleTest(t, params, psRestartCmd)
2628
psScaleTest(t, params, psDownScaleCmd)
2729

2830
// FIXME if we don't wait here, some of the routers may give us a 502 before

0 commit comments

Comments
 (0)