Skip to content

Commit d2122e2

Browse files
author
Matthew Fisher
committed
Merge pull request #3586 from bacongobbler/467-ps-restart
feat(controller): add ps:restart command
2 parents 958ad28 + 9ce4886 commit d2122e2

11 files changed

Lines changed: 341 additions & 12 deletions

File tree

client/deis.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,6 +1700,7 @@ def ps(self, args):
17001700
Valid commands for processes:
17011701
17021702
ps:list list application processes
1703+
ps:restart restart process types (e.g. web, worker)
17031704
ps:scale scale processes (e.g. web=4 worker=2)
17041705
17051706
Use `deis help [command]` to learn more.
@@ -1737,6 +1738,53 @@ def ps_list(self, args, app=None):
17371738
self._logger.info("{type}.{num} {state} ({release})".format(**c))
17381739
self._logger.info('')
17391740

1741+
def ps_restart(self, args):
1742+
"""
1743+
Restarts an application's processes by type.
1744+
1745+
Usage: deis ps:restart [<type>] [options]
1746+
1747+
Arguments:
1748+
<type>
1749+
the process name as defined in your Procfile, such as 'web' or 'worker'.
1750+
To restart a particular process, use 'web.1'.
1751+
1752+
Options:
1753+
-a --app=<app>
1754+
the uniquely identifiable name for the application.
1755+
"""
1756+
app = args.get('--app')
1757+
procname = args.get('<type>')
1758+
if not app:
1759+
app = self._session.app
1760+
restarting_cmd = 'Restarting processes... but first, {}!\n'.format(
1761+
os.environ.get('DEIS_DRINK_OF_CHOICE', 'coffee'))
1762+
sys.stdout.write(restarting_cmd)
1763+
sys.stdout.flush()
1764+
try:
1765+
progress = TextProgress()
1766+
progress.start()
1767+
before = time.time()
1768+
url = '/v1/apps/{}/containers/restart'.format(app)
1769+
if procname:
1770+
if '.' in procname:
1771+
# format is web.2
1772+
proctype, procnum = procname.split('.')
1773+
url = '/v1/apps/{}/containers/{}/{}/restart'.format(app,
1774+
proctype,
1775+
procnum)
1776+
else:
1777+
url = '/v1/apps/{}/containers/{}/restart'.format(app, procname)
1778+
response = self._dispatch('post', url)
1779+
finally:
1780+
progress.cancel()
1781+
progress.join()
1782+
if response.status_code == requests.codes.ok:
1783+
self._logger.info('done in {}s'.format(int(time.time() - before)))
1784+
self.ps_list({}, app)
1785+
else:
1786+
raise ResponseError(response)
1787+
17401788
def ps_scale(self, args):
17411789
"""
17421790
Scales an application's processes by type.

controller/api/models.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,15 @@ def delete(self, *args, **kwargs):
198198
self._clean_app_logs()
199199
return super(App, self).delete(*args, **kwargs)
200200

201+
def restart(self, **kwargs):
202+
to_restart = self.container_set.all()
203+
if kwargs.get('type'):
204+
to_restart = to_restart.filter(type=kwargs.get('type'))
205+
if kwargs.get('num'):
206+
to_restart = to_restart.filter(num=kwargs.get('num'))
207+
self._restart_containers(to_restart)
208+
return to_restart
209+
201210
def _clean_app_logs(self):
202211
"""Delete application logs stored by the logger component"""
203212
path = os.path.join(settings.DEIS_LOG_DIR, self.id + '.log')
@@ -282,6 +291,27 @@ def _start_containers(self, to_add):
282291
err = 'warning, some containers failed to start'
283292
log_event(self, err, logging.WARNING)
284293

294+
def _restart_containers(self, to_restart):
295+
"""Restarts containers via the scheduler"""
296+
stop_threads = []
297+
start_threads = []
298+
if not to_restart:
299+
# do nothing if we didn't request any containers
300+
return
301+
for c in to_restart:
302+
stop_threads.append(threading.Thread(target=c.stop))
303+
start_threads.append(threading.Thread(target=c.start))
304+
[t.start() for t in stop_threads]
305+
[t.join() for t in stop_threads]
306+
if set([c.state for c in to_restart]) != set(['created']):
307+
err = 'warning, some containers failed to stop'
308+
log_event(self, err, logging.WARNING)
309+
[t.start() for t in start_threads]
310+
[t.join() for t in start_threads]
311+
if set([c.state for c in to_restart]) != set(['up']):
312+
err = 'warning, some containers failed to start'
313+
log_event(self, err, logging.WARNING)
314+
285315
def _destroy_containers(self, to_destroy):
286316
"""Destroys containers via the scheduler"""
287317
destroy_threads = []

controller/api/tests/test_container.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from django.contrib.auth.models import User
1414
from django.test import TransactionTestCase
15-
from scheduler.states import TransitionNotAllowed
15+
from scheduler.states import TransitionError
1616
from rest_framework.authtoken.models import Token
1717

1818
from api.models import App, Build, Container, Release
@@ -57,11 +57,13 @@ def test_container_state_good(self):
5757
num=1)
5858
self.assertEqual(c.state, 'initialized')
5959
# test an illegal transition
60-
self.assertRaises(TransitionNotAllowed, lambda: c.start())
60+
self.assertRaises(TransitionError, lambda: c.start())
6161
c.create()
6262
self.assertEqual(c.state, 'created')
6363
c.start()
6464
self.assertEqual(c.state, 'up')
65+
c.stop()
66+
self.assertEqual(c.state, 'down')
6567
c.destroy()
6668
self.assertEqual(c.state, 'destroyed')
6769

@@ -634,3 +636,39 @@ def test_modified_procfile_from_build_removes_containers(self):
634636
HTTP_AUTHORIZATION='token {}'.format(self.token))
635637
self.assertEqual(response.status_code, 201)
636638
self.assertEqual(Container.objects.filter(type='web').count(), 0)
639+
640+
def test_restart_containers(self):
641+
url = '/v1/apps'
642+
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
643+
self.assertEqual(response.status_code, 201)
644+
app_id = response.data['id']
645+
# post a new build
646+
build_url = "/v1/apps/{app_id}/builds".format(**locals())
647+
body = {'image': 'autotest/example', 'sha': 'a'*40,
648+
'procfile': json.dumps({'web': 'node server.js', 'worker': 'node worker.js'})}
649+
response = self.client.post(build_url, json.dumps(body), content_type='application/json',
650+
HTTP_AUTHORIZATION='token {}'.format(self.token))
651+
url = "/v1/apps/{app_id}/scale".format(**locals())
652+
body = {'web': 4, 'worker': 8}
653+
response = self.client.post(url, json.dumps(body), content_type='application/json',
654+
HTTP_AUTHORIZATION='token {}'.format(self.token))
655+
self.assertEqual(response.status_code, 204)
656+
container_set = App.objects.get(id=app_id).container_set.all()
657+
# restart all containers
658+
response = self.client.post('/v1/apps/{}/containers/restart'.format(app_id),
659+
content_type='application/json',
660+
HTTP_AUTHORIZATION='token {}'.format(self.token))
661+
self.assertEqual(response.status_code, 200)
662+
self.assertEqual(len(response.data), container_set.count())
663+
# restart only the workers
664+
response = self.client.post('/v1/apps/{}/containers/worker/restart'.format(app_id),
665+
content_type='application/json',
666+
HTTP_AUTHORIZATION='token {}'.format(self.token))
667+
self.assertEqual(response.status_code, 200)
668+
self.assertEqual(len(response.data), container_set.filter(type='worker').count())
669+
# restart only web.2
670+
response = self.client.post('/v1/apps/{}/containers/web/1/restart'.format(app_id),
671+
content_type='application/json',
672+
HTTP_AUTHORIZATION='token {}'.format(self.token))
673+
self.assertEqual(response.status_code, 200)
674+
self.assertEqual(len(response.data), container_set.filter(type='web', num=1).count())

controller/api/tests/test_scheduler.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,75 @@ def test_start_chaos(self):
119119
states = set([c['state'] for c in response.data['results']])
120120
self.assertEqual(states, set(['crashed', 'up']))
121121

122+
def test_restart_chaos(self):
123+
url = '/v1/apps'
124+
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
125+
self.assertEqual(response.status_code, 201)
126+
app_id = response.data['id']
127+
# post a new build
128+
url = "/v1/apps/{app_id}/builds".format(**locals())
129+
body = {'image': 'autotest/example', 'sha': 'a'*40,
130+
'procfile': json.dumps({'web': 'node server.js', 'worker': 'node worker.js'})}
131+
response = self.client.post(url, json.dumps(body), content_type='application/json',
132+
HTTP_AUTHORIZATION='token {}'.format(self.token))
133+
self.assertEqual(response.status_code, 201)
134+
url = "/v1/apps/{app_id}/containers".format(**locals())
135+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
136+
self.assertEqual(response.status_code, 200)
137+
self.assertEqual(len(response.data['results']), 1)
138+
# scale up, which will allow some crashed containers
139+
url = "/v1/apps/{app_id}/scale".format(**locals())
140+
body = {'web': 20, 'worker': 20}
141+
response = self.client.post(url, json.dumps(body), content_type='application/json',
142+
HTTP_AUTHORIZATION='token {}'.format(self.token))
143+
self.assertEqual(response.status_code, 204)
144+
# let's get chaotic
145+
chaos.STOP_ERROR_RATE = 0.5
146+
chaos.START_ERROR_RATE = 0.5
147+
# reboot the web processes
148+
url = "/v1/apps/{app_id}/containers/web/restart".format(**locals())
149+
response = self.client.post(url,
150+
content_type='application/json',
151+
HTTP_AUTHORIZATION='token {}'.format(self.token))
152+
self.assertEqual(response.status_code, 200, response.data)
153+
# inspect broken containers
154+
url = "/v1/apps/{app_id}/containers".format(**locals())
155+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
156+
self.assertEqual(response.status_code, 200)
157+
self.assertEqual(response.data['count'], 40)
158+
# make sure some failed
159+
states = set([c['state'] for c in response.data['results']])
160+
self.assertEqual(states, set(['crashed', 'up']))
161+
# make sure that we only rebooted the web processes
162+
types = set([c['type'] for c in response.data['results'] if c['state'] == 'crashed'])
163+
self.assertEqual(types, set(['web']))
164+
# start fresh
165+
chaos.STOP_ERROR_RATE = 0.0
166+
chaos.START_ERROR_RATE = 0.0
167+
url = "/v1/apps/{app_id}/containers/web/restart".format(**locals())
168+
response = self.client.post(url,
169+
content_type='application/json',
170+
HTTP_AUTHORIZATION='token {}'.format(self.token))
171+
# let the carnage continue
172+
chaos.STOP_ERROR_RATE = 0.5
173+
chaos.START_ERROR_RATE = 0.5
174+
# reboot ALL the containers!
175+
url = "/v1/apps/{app_id}/containers/restart".format(**locals())
176+
response = self.client.post(url,
177+
content_type='application/json',
178+
HTTP_AUTHORIZATION='token {}'.format(self.token))
179+
self.assertEqual(response.status_code, 200)
180+
# inspect broken containers
181+
url = "/v1/apps/{app_id}/containers".format(**locals())
182+
response = self.client.get(url, HTTP_AUTHORIZATION='token {}'.format(self.token))
183+
self.assertEqual(response.status_code, 200)
184+
self.assertEqual(len(response.data['results']), 40)
185+
# make sure some failed
186+
states = set([c['state'] for c in response.data['results']])
187+
self.assertEqual(states, set(['crashed', 'up']))
188+
types = set([c['type'] for c in response.data['results']])
189+
self.assertEqual(types, set(['web', 'worker']))
190+
122191
def test_destroy_chaos(self):
123192
url = '/v1/apps'
124193
response = self.client.post(url, HTTP_AUTHORIZATION='token {}'.format(self.token))

controller/api/urls.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@
2626
url(r'^apps/(?P<id>{})/releases/?'.format(settings.APP_URL_REGEX),
2727
views.ReleaseViewSet.as_view({'get': 'list'})),
2828
# application infrastructure
29+
url(r'^apps/(?P<id>{})/containers/restart/?'.format(settings.APP_URL_REGEX),
30+
views.ContainerViewSet.as_view({'post': 'restart'})),
31+
url(r'^apps/(?P<id>{})/containers/(?P<type>[-_\w.]+)/restart/?'.format(settings.APP_URL_REGEX),
32+
views.ContainerViewSet.as_view({'post': 'restart'})),
33+
url(r'^apps/(?P<id>{})/containers/(?P<type>[-_\w]+)/(?P<num>[-_\w]+)/restart/?'.format(
34+
settings.APP_URL_REGEX),
35+
views.ContainerViewSet.as_view({'post': 'restart'})),
2936
url(r'^apps/(?P<id>{})/containers/(?P<type>[-_\w]+)/(?P<num>[-_\w]+)/?'.format(
3037
settings.APP_URL_REGEX),
3138
views.ContainerViewSet.as_view({'get': 'retrieve'})),

controller/api/views.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,14 @@ def get_object(self, **kwargs):
213213
qs = self.get_queryset(**kwargs)
214214
return qs.get(num=self.kwargs['num'])
215215

216+
def restart(self, *args, **kwargs):
217+
try:
218+
containers = self.get_app().restart(**kwargs)
219+
serializer = self.get_serializer(containers, many=True)
220+
return Response(serializer.data, status=status.HTTP_200_OK)
221+
except Exception as e:
222+
return Response({'detail': str(e)}, status=status.HTTP_503_SERVICE_UNAVAILABLE)
223+
216224

217225
class DomainViewSet(AppResourceViewSet):
218226
"""A viewset for interacting with Domain objects."""

controller/scheduler/fleet.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,20 @@ def _wait_for_container_state(self, name):
188188
raise RuntimeError('container timeout while retrieving state')
189189

190190
def _wait_for_container_running(self, name):
191+
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
192+
try:
193+
self._wait_for_job_state(name, JobState.up)
194+
except RuntimeError:
195+
raise RuntimeError('container failed to start')
196+
197+
def _wait_for_job_state(self, name, state):
191198
# we bump to 20 minutes here to match the timeout on the router and in the app unit files
192199
for _ in range(1200):
193-
if self.state(name) == JobState.up:
200+
if self.state(name) == state:
194201
return
195202
time.sleep(1)
196203
else:
197-
raise RuntimeError('container failed to start')
204+
raise RuntimeError('timeout waiting for job state: {}'.format(state))
198205

199206
def _wait_for_destroy(self, name):
200207
for _ in range(30):
@@ -206,7 +213,8 @@ def _wait_for_destroy(self, name):
206213

207214
def stop(self, name):
208215
"""Stop a container"""
209-
raise NotImplementedError
216+
self._put_unit(name, {"desiredState": "loaded"})
217+
self._wait_for_job_state(name, JobState.created)
210218

211219
def destroy(self, name):
212220
"""Destroy a container"""

controller/scheduler/mock.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import json
22
from cStringIO import StringIO
3-
from .states import JobState, TransitionNotAllowed
3+
from .states import JobState, TransitionError
44

55

66
# HACK: MockSchedulerClient is not persistent across requests
@@ -55,8 +55,14 @@ def start(self, name):
5555
"""
5656
Start a container
5757
"""
58-
if self.state(name) not in [JobState.created, JobState.up, JobState.down]:
59-
raise TransitionNotAllowed
58+
if self.state(name) not in [JobState.created,
59+
JobState.up,
60+
JobState.down,
61+
JobState.crashed,
62+
JobState.error]:
63+
raise TransitionError(self.state(name),
64+
JobState.up,
65+
'the container must be stopped or up to start')
6066
job = jobs.get(name, {})
6167
job.update({'state': JobState.up})
6268
jobs[name] = job
@@ -77,9 +83,11 @@ def stop(self, name):
7783
Stop a container
7884
"""
7985
job = jobs.get(name, {})
80-
if job.get('state') != JobState.up:
81-
raise TransitionNotAllowed
82-
job.update({'state': JobState.stopped})
86+
if job.get('state') not in [JobState.up, JobState.crashed, JobState.error]:
87+
raise TransitionError(job.get('state'),
88+
JobState.up,
89+
'the container must be up to stop')
90+
job.update({'state': JobState.down})
8391
jobs[name] = job
8492
return
8593

controller/scheduler/states.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
import enum
22

33

4-
class TransitionNotAllowed(Exception):
4+
class TransitionError(Exception):
55
"""Raised when a transition from one state to another is illegal"""
66

7+
def __init__(self, prev, next, msg):
8+
self.prev = prev
9+
self.next = next
10+
self.msg = msg
11+
712

813
class JobState(enum.Enum):
914
initialized = 1

0 commit comments

Comments
 (0)