Skip to content

Commit 4451bf5

Browse files
committed
fix(controller): prevent DeployLock deadlocks and remove redundant admission locks
1 parent 5ca0046 commit 4451bf5

25 files changed

Lines changed: 520 additions & 223 deletions

rootfs/api/admissions.py

Lines changed: 35 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -27,30 +27,28 @@ def detect(self, request: Request) -> bool:
2727

2828
def handle(self, request: Request) -> bool:
2929
app_id = request["object"]["metadata"]["namespace"]
30-
app = models.app.App.objects.filter(id=app_id).first()
3130
ptype = request["object"]["metadata"].get("labels", {}).get("type", "")
32-
if app and ptype:
33-
lock = app.lock()
34-
try:
35-
lock.acquire()
36-
status = request["object"]["status"]
37-
replicas = request["object"]["spec"].get("replicas", 0)
38-
if "active" in status:
39-
replicas += 1
40-
elif "succeeded" in status or "failed" in status:
41-
replicas -= 1
42-
replicas = 0 if replicas < 0 else replicas
43-
if app.structure.get(ptype, 0) != replicas:
44-
models.app.App.objects.filter(id=app.id).update(
45-
structure=Func(
46-
F("structure"),
47-
Value([ptype]),
48-
Value(replicas, JSONField()),
49-
function="jsonb_set",
50-
)
51-
)
52-
finally:
53-
lock.release()
31+
if not ptype:
32+
return True
33+
status = request["object"]["status"]
34+
replicas = request["object"]["spec"].get("replicas", 0)
35+
if "active" in status:
36+
replicas += 1
37+
elif "succeeded" in status or "failed" in status:
38+
replicas -= 1
39+
replicas = 0 if replicas < 0 else replicas
40+
# jsonb_set on a single row is atomic at the PostgreSQL level; no lock
41+
# is needed because `replicas` is computed from the request payload
42+
# alone (not read-modify-write of app.structure). Filter touches 0 rows
43+
# if the app no longer exists, which is safe.
44+
models.app.App.objects.filter(id=app_id).update(
45+
structure=Func(
46+
F("structure"),
47+
Value([ptype]),
48+
Value(replicas, JSONField()),
49+
function="jsonb_set",
50+
)
51+
)
5452
return True
5553

5654

@@ -68,28 +66,25 @@ def detect(self, request: Request) -> bool:
6866

6967
def handle(self, request: Request) -> bool:
7068
app_id = request["object"]["metadata"]["namespace"]
71-
app = models.app.App.objects.filter(id=app_id).first()
7269
ptype = None
7370
for item in request["object"]["status"]["selector"].split(","):
7471
key, value = item.split("=")
7572
if key == "type":
7673
ptype = value
77-
if app and ptype:
78-
lock = app.lock()
79-
try:
80-
lock.acquire()
81-
replicas = request["object"]["spec"].get("replicas", 0)
82-
if app.structure.get(ptype, 0) != replicas:
83-
models.app.App.objects.filter(id=app.id).update(
84-
structure=Func(
85-
F("structure"),
86-
Value([ptype]),
87-
Value(replicas, JSONField()),
88-
function="jsonb_set",
89-
)
90-
)
91-
finally:
92-
lock.release()
74+
if not ptype:
75+
return True
76+
replicas = request["object"]["spec"].get("replicas", 0)
77+
# jsonb_set is an atomic single-row UPDATE in PostgreSQL; no lock is
78+
# needed because `replicas` comes straight from the request payload.
79+
# Filter touches 0 rows if the app no longer exists, which is safe.
80+
models.app.App.objects.filter(id=app_id).update(
81+
structure=Func(
82+
F("structure"),
83+
Value([ptype]),
84+
Value(replicas, JSONField()),
85+
function="jsonb_set",
86+
)
87+
)
9388
return True
9489

9590

rootfs/api/exceptions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ def custom_exception_handler(exc, context):
4242
# No response means DRF couldn't handle it
4343
# Output a generic 500 in a JSON format
4444
if response is None:
45+
import traceback
46+
traceback.print_exc()
4547
logging.exception('Uncaught Exception', exc_info=exc)
4648
set_rollback()
4749
return Response({'detail': 'Server Error'}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)

rootfs/api/migrations/0028_workspace_remove_app_owner_remove_appsettings_owner_and_more.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import api.models.workspace
44
import django.db.models.deletion
5+
import uuid
56
from django.conf import settings
67
from django.db import migrations, models
78

@@ -16,12 +17,16 @@ class Migration(migrations.Migration):
1617
migrations.CreateModel(
1718
name='Workspace',
1819
fields=[
19-
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
20-
('name', models.SlugField(max_length=150, unique=True, validators=[api.models.workspace.validate_workspace_name], verbose_name='workspace name')),
21-
('email', models.EmailField(max_length=254, verbose_name='email address')),
2220
('created', models.DateTimeField(auto_now_add=True)),
2321
('updated', models.DateTimeField(auto_now=True)),
22+
('uuid', models.UUIDField(auto_created=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='UUID')),
23+
('id', models.SlugField(max_length=150, unique=True, validators=[api.models.workspace.validate_workspace_id], verbose_name='workspace name')),
24+
('email', models.EmailField(max_length=254, verbose_name='email address')),
25+
('uid', models.PositiveIntegerField(unique=True)),
2426
],
27+
options={
28+
'abstract': False,
29+
},
2530
),
2631
migrations.RemoveField(
2732
model_name='app',
@@ -116,4 +121,4 @@ class Migration(migrations.Migration):
116121
'unique_together': {('user', 'workspace')},
117122
},
118123
),
119-
]
124+
]
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Generated by Django 5.2.14 on 2026-05-26 01:01
2+
3+
from django.db import migrations, models
4+
5+
6+
def set_app_uids(apps, schema_editor):
7+
App = apps.get_model('api', 'App')
8+
9+
# 模拟 get_next_uid 的类似逻辑,或者是直接调用,但需要防范现有库里全是 null 导致的 TypeError
10+
# 使用 id 排序来保证赋值的确定性
11+
uid = App.objects.aggregate(models.Max('uid'))['uid__max'] or 0
12+
for app in App.objects.filter(uid__isnull=True).order_by('created'):
13+
uid += 1
14+
app.uid = uid
15+
app.save(update_fields=['uid'])
16+
17+
18+
class Migration(migrations.Migration):
19+
20+
dependencies = [
21+
('api', '0030_delete_blocklist_remove_app_suspended_state'),
22+
]
23+
24+
operations = [
25+
migrations.AddField(
26+
model_name='app',
27+
name='uid',
28+
field=models.PositiveIntegerField(blank=True, null=True),
29+
),
30+
migrations.RunPython(set_app_uids, migrations.RunPython.noop),
31+
migrations.AlterField(
32+
model_name='app',
33+
name='uid',
34+
field=models.PositiveIntegerField(unique=True),
35+
),
36+
]

rootfs/api/models/app.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from rest_framework.exceptions import ValidationError
2323

2424
from api.tasks import send_app_log
25-
from api.utils import get_httpclient, dict_diff
25+
from api.utils import get_httpclient, dict_diff, get_next_uid
2626
from api.exceptions import AlreadyExists, DryccException, ServiceUnavailable
2727
from api.utils import (
2828
CacheLock, DeployLock, generate_app_name, apply_tasks, validate_reserved_names)
@@ -71,6 +71,7 @@ class App(UuidAuditedModel):
7171

7272
id = models.SlugField(max_length=63, unique=True, null=True,
7373
validators=[validate_app_id])
74+
uid = models.PositiveIntegerField(unique=True, editable=False)
7475
workspace = models.ForeignKey('Workspace', on_delete=models.CASCADE)
7576
structure = models.JSONField(
7677
default=dict, blank=True, validators=[validate_app_structure])
@@ -99,6 +100,8 @@ def save(self, *args, **kwargs):
99100
except KubeHTTPException:
100101
pass
101102

103+
if not self.uid:
104+
self.uid = get_next_uid(App)
102105
application = super(App, self).save(**kwargs)
103106

104107
# create all the required resources
@@ -1062,8 +1065,11 @@ def _check_deployment_in_progress(self, deploys, force_deploy=False):
10621065
def _merge_structure(self, release, prev_release):
10631066
"""Scale to default structure based on release type"""
10641067
lock = self.lock()
1068+
if not lock.acquire():
1069+
raise ServiceUnavailable(
1070+
f'could not acquire app lock for {self.id}'
1071+
)
10651072
try:
1066-
lock.acquire()
10671073
self.refresh_from_db()
10681074
default_structure = {}
10691075
for ptype in release.ptypes:

rootfs/api/models/appsettings.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def _update_field(self, field, previous_settings):
7777
setattr(self, field, True)
7878
elif old != new:
7979
self.summary += [
80-
"{} changed {} from {} to {}".format(self.app.workspace.name, field, old, new)
80+
"{} changed {} from {} to {}".format(self.app.workspace.id, field, old, new)
8181
]
8282

8383
def _update_autoscale(self, previous_settings):
@@ -118,7 +118,7 @@ def _update_autoscale(self, previous_settings):
118118
deleted = 'deleted autoscale for process type ' + deleted if deleted else ''
119119
changes = ', '.join(i for i in (added, changed, deleted) if i)
120120
if changes:
121-
self.summary += ["{} {}".format(self.app.workspace.name, changes)]
121+
self.summary += ["{} {}".format(self.app.workspace.id, changes)]
122122

123123
def _update_label(self, previous_settings):
124124
data = getattr(previous_settings, 'label', {}).copy()
@@ -150,7 +150,7 @@ def _update_label(self, previous_settings):
150150
if changes:
151151
if self.summary:
152152
self.summary += ' and '
153-
self.summary += ["{} {}".format(self.app.workspace.name, changes)]
153+
self.summary += ["{} {}".format(self.app.workspace.id, changes)]
154154

155155
@transaction.atomic
156156
def save(self, ignore_update_fields=None, *args, **kwargs):
@@ -176,7 +176,7 @@ def save(self, ignore_update_fields=None, *args, **kwargs):
176176

177177
if not self.summary and previous_settings:
178178
self.delete()
179-
raise AlreadyExists("{} changed nothing".format(self.app.workspace.name))
179+
raise AlreadyExists("{} changed nothing".format(self.app.workspace.id))
180180
super(AppSettings, self).save(**kwargs)
181181
summary = ' '.join(self.summary)
182182
self.log('summary of app setting changes: {}'.format(summary), logging.DEBUG)

rootfs/api/models/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def scheduler(self):
6363
labels = annotations = {}
6464
if hasattr(self, 'app'):
6565
labels["drycc.cc/app_id"] = str(self.app.id)
66-
labels["drycc.cc/workspace_id"] = str(self.app.workspace_id)
66+
labels["drycc.cc/workspace_id"] = str(self.app.workspace.id)
6767
return get_scheduler(metadata={"labels": labels, "annotations": annotations})
6868

6969

rootfs/api/models/release.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -425,28 +425,28 @@ def save(self, *args, **kwargs): # noqa
425425
old_config = prev_release.config if prev_release else None
426426
# if the build changed, log it and who pushed it
427427
if self.version == 1:
428-
self.summary += "{} created initial release".format(self.app.workspace.name)
428+
self.summary += "{} created initial release".format(self.app.workspace.id)
429429
elif self.build != old_build:
430430
if self.build.sha:
431431
self.summary += "{} deployed {}".format(
432-
self.app.workspace.name, self.build.sha[:7])
432+
self.app.workspace.id, self.build.sha[:7])
433433
else:
434434
self.summary += "{} deployed {}".format(
435-
self.app.workspace.name, self.build.image)
435+
self.app.workspace.id, self.build.image)
436436
elif self.config != old_config:
437437
for field, diff in self.config.diff(old_config).items():
438438
diff_list = []
439439
for diff_type, values in diff.items():
440440
diff_list.append(f'{diff_type} {field} {", ".join(values)}')
441441
if diff_list:
442442
changes = ', '.join(diff_list)
443-
self.summary += "{} {}".format(self.app.workspace.name, changes)
443+
self.summary += "{} {}".format(self.app.workspace.id, changes)
444444
if not self.summary:
445445
if self.version == 1:
446-
self.summary = "{} created the initial release".format(self.app.workspace.name)
446+
self.summary = "{} created the initial release".format(self.app.workspace.id)
447447
else:
448448
# There were no changes to this release
449449
raise AlreadyExists(
450-
"{} changed nothing - release stopped".format(self.app.workspace.name)
450+
"{} changed nothing - release stopped".format(self.app.workspace.id)
451451
)
452452
super(Release, self).save(*args, **kwargs)

rootfs/api/models/tls.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,12 @@ def _check_previous_tls_settings(self):
133133
if self.https_enforced is not None:
134134
if previous_tls_settings.https_enforced == self.https_enforced:
135135
raise AlreadyExists(
136-
"{} changed nothing".format(self.app.workspace.name))
136+
"{} changed nothing".format(self.app.workspace.id))
137137
self.certs_auto_enabled = previous_tls_settings.certs_auto_enabled
138138
elif self.certs_auto_enabled is not None:
139139
if previous_tls_settings.certs_auto_enabled == self.certs_auto_enabled:
140140
raise AlreadyExists(
141-
"{} changed nothing".format(self.app.workspace.name))
141+
"{} changed nothing".format(self.app.workspace.id))
142142
self.https_enforced = previous_tls_settings.https_enforced
143143
previous_tls_settings.delete()
144144
except TLS.DoesNotExist:

rootfs/api/models/workspace.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@
1010
from django.template.loader import render_to_string
1111

1212
from rest_framework.exceptions import ValidationError
13-
from api.utils import validate_reserved_names, get_local_host
13+
from api.utils import validate_reserved_names, get_local_host, get_next_uid
14+
from api.models.base import UuidAuditedModel
1415

1516
User = get_user_model()
1617
logger = logging.getLogger(__name__)
1718

1819

19-
def validate_workspace_name(value):
20+
def validate_workspace_id(value):
2021
"""
2122
Check that the value follows the kubernetes name constraints
2223
"""
@@ -27,25 +28,29 @@ def validate_workspace_name(value):
2728
validate_reserved_names(value)
2829

2930

30-
class Workspace(models.Model):
31-
name = models.SlugField(
31+
class Workspace(UuidAuditedModel):
32+
id = models.SlugField(
3233
_("workspace name"),
3334
max_length=150,
3435
unique=True,
35-
validators=[validate_workspace_name],
36+
validators=[validate_workspace_id],
3637
)
38+
uid = models.PositiveIntegerField(unique=True, editable=False)
3739
email = models.EmailField(_("email address"))
38-
created = models.DateTimeField(auto_now_add=True)
39-
updated = models.DateTimeField(auto_now=True)
4040

4141
def has_member(self, user, role=None):
4242
kwargs = {'user': user, 'workspace': self}
4343
if role:
4444
kwargs['role'] = role
4545
return WorkspaceMember.objects.filter(**kwargs).exists()
4646

47+
def save(self, *args, **kwargs):
48+
if not self.uid:
49+
self.uid = get_next_uid(Workspace)
50+
super().save(*args, **kwargs)
51+
4752
def __str__(self):
48-
return self.name
53+
return self.id
4954

5055

5156
class WorkspaceMember(models.Model):
@@ -63,7 +68,7 @@ class WorkspaceMember(models.Model):
6368
workspace = models.ForeignKey(Workspace, on_delete=models.CASCADE)
6469

6570
def __str__(self):
66-
return f"{self.user.username} - {self.workspace.name} ({self.role})"
71+
return f"{self.user.username} - {self.workspace.id} ({self.role})"
6772

6873
class Meta:
6974
unique_together = ('user', 'workspace')
@@ -97,12 +102,12 @@ def send_email(self, request):
97102
if count > settings.DRYCC_INVITATION_EMAIL_LIMIT:
98103
raise ValidationError("Too many invitation emails, please try again later")
99104
domain = get_local_host(request)
100-
mail_subject = f'We Invite You to Join the {self.workspace.name} Workspace.'
105+
mail_subject = f'We Invite You to Join the {self.workspace.id} Workspace.'
101106
message = render_to_string(
102107
'workspace/workspace_invitation.html',
103108
{'domain': domain, 'invitation': self}
104109
)
105110
send_mail(mail_subject, message, None, [self.email], fail_silently=True)
106111

107112
def __str__(self):
108-
return f"Invitation for {self.email} to join {self.workspace.name}"
113+
return f"Invitation for {self.email} to join {self.workspace.id}"

0 commit comments

Comments
 (0)