diff --git a/Dockerfile b/Dockerfile index a244ad8..7425306 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,19 @@ -FROM registry.drycc.cc/drycc/base:bookworm +ARG CODENAME +FROM registry.drycc.cc/drycc/base:${CODENAME} -ENV DRYCC_UID=1001 \ +ARG DRYCC_UID=1001 \ DRYCC_GID=1001 \ DRYCC_HOME_DIR=/data \ - MC_VERSION="2025.04.03.17.07.56" \ - MINIO_VERSION="2025.04.03.14.56.28" + RUSTFS_VERSION="1.0.0-alpha.73" \ + OPENTELEMETRY_COLLECTOR_VERSION=0.141.0 +COPY rootfs/etc/otelcol /etc/otelcol RUN groupadd drycc --gid ${DRYCC_GID} \ && useradd drycc -u ${DRYCC_UID} -g ${DRYCC_GID} -s /bin/bash -m -d ${DRYCC_HOME_DIR} \ - && install-stack mc $MC_VERSION \ - && install-stack minio $MINIO_VERSION \ + && install-packages dnsutils \ + && install-stack rustfs $RUSTFS_VERSION \ + && install-stack opentelemetry-collector $OPENTELEMETRY_COLLECTOR_VERSION \ && rm -rf \ /usr/share/doc \ /usr/share/man \ diff --git a/Makefile b/Makefile index 0b7a16f..ce10cc9 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,9 @@ podman-build: podman build --build-arg CODENAME=${CODENAME} -t ${IMAGE} . podman tag ${IMAGE} ${MUTABLE_IMAGE} +podman-buildx: + podman buildx build --build-arg CODENAME=${CODENAME} --platform ${PLATFORM} -t ${IMAGE} . --push + deploy: build podman-build podman-push .PHONY: all bootstrap build test podman-build deploy diff --git a/README.md b/README.md index 75e31ea..9b5e528 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ We welcome your input! If you have feedback, please submit an [issue][issues]. I # About -The Drycc storage component provides an [S3 API][s3-api] compatible object storage server, based on [MINIO](https://github.com/minio/minio), that can be run on Kubernetes. It's intended for use within the [Drycc v2 platform][drycc-docs] as an object storage server, but it's flexible enough to be run as a standalone pod on any Kubernetes cluster. 
+The Drycc storage component provides an [S3 API][s3-api] compatible object storage server, based on [RUSTFS](https://github.com/rustfs/rustfs), that can be run on Kubernetes. It's intended for use within the [Drycc v2 platform][drycc-docs] as an object storage server, but it's flexible enough to be run as a standalone pod on any Kubernetes cluster. Note that in the default [Helm chart for the Drycc platform](https://github.com/drycc/charts/tree/main/drycc-dev), this component is used as a storage location for the following components: diff --git a/_tests/test.sh b/_tests/test.sh index e7ee580..5fd09b8 100755 --- a/_tests/test.sh +++ b/_tests/test.sh @@ -1,17 +1,16 @@ #!/usr/bin/env bash BASE_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") -MINIO_ROOT_USER=f4c4281665bc11ee8e0400163e04a9cd -MINIO_ROOT_PASSWORD=f4c4281665bc11ee8e0400163e04a9cd +RUSTFS_ACCESS_KEY=f4c4281665bc11ee8e0400163e04a9cd +RUSTFS_SECRET_KEY=f4c4281665bc11ee8e0400163e04a9cd function start-storage { mkdir -p "${BASE_DIR}/data" -podman run --rm -d --name test-storage \ - -e MINIO_PROMETHEUS_AUTH_TYPE=public \ - -e MINIO_ROOT_USER=${MINIO_ROOT_USER} \ - -e MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD} \ + podman run --rm -d --name test-storage \ + -e RUSTFS_ACCESS_KEY=${RUSTFS_ACCESS_KEY} \ + -e RUSTFS_SECRET_KEY=${RUSTFS_SECRET_KEY} \ registry.drycc.cc/drycc/storage:canary \ - minio server /tmp --address :9000 --console-address :9001 + rustfs /tmp --address :9000 --console-address :9001 } # shellcheck disable=SC2317 @@ -30,12 +29,20 @@ function main { echo -e "\\033[32m---> Waitting for ${S3_IP}:9000\\033[0m" wait-for-port --host="${S3_IP}" 9000 echo -e "\\033[32m---> S3 service ${S3_IP}:9000 ready...\\033[0m" - # test by minio client - mc --config-dir /tmp/.mc config host add storage "${S3_ENDPOINT}" ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} --lookup path --api s3v4 - mc --config-dir /tmp/.mc mb storage/test - mc --config-dir /tmp/.mc cp "${BASE_DIR}"/test.sh storage/test + # test by rclone 
client + mkdir -p /tmp/.config/rclone + cat > /tmp/.config/rclone/rclone.conf << EOF +[storage] +type = s3 +provider = Other +endpoint = ${S3_ENDPOINT} +access_key_id = ${RUSTFS_ACCESS_KEY} +secret_access_key = ${RUSTFS_SECRET_KEY} +EOF + rclone --config /tmp/.config/rclone/rclone.conf mkdir storage:test + rclone --config /tmp/.config/rclone/rclone.conf copyto "${BASE_DIR}"/test.sh storage:test/test.sh exit_code=$? - rm -rf /tmp/.mc + rm -rf /tmp/.config/rclone exit $exit_code } diff --git a/charts/storage/Chart.yaml b/charts/storage/Chart.yaml index 7ebd898..185f568 100644 --- a/charts/storage/Chart.yaml +++ b/charts/storage/Chart.yaml @@ -6,7 +6,7 @@ dependencies: - name: common repository: oci://registry.drycc.cc/charts version: ~1.1.2 -description: Minio Object Storage in Kubernetes, used by Drycc Workflow. +description: Object Storage in Kubernetes, used by Drycc Workflow. maintainers: - name: Drycc Team email: engineering@drycc.com diff --git a/charts/storage/templates/_helper.tpl b/charts/storage/templates/_helper.tpl deleted file mode 100644 index c35cdc5..0000000 --- a/charts/storage/templates/_helper.tpl +++ /dev/null @@ -1,29 +0,0 @@ -{{- /* keep randAlphaNum values consistent */ -}} -{{- define "storage.accesskey" -}} - {{- if not (index .Release "secrets") -}} - {{- $_ := set .Release "secrets" dict -}} - {{- end -}} - {{- if not (index .Release.secrets "accesskey") -}} - {{- if .Values.accesskey | default "" | ne "" -}} - {{- $_ := set .Release.secrets "accesskey" .Values.accesskey -}} - {{- else -}} - {{- $_ := set .Release.secrets "accesskey" (randAlphaNum 32) -}} - {{- end -}} - {{- end -}} - {{- index .Release.secrets "accesskey" -}} -{{- end -}} - -{{- /* keep randAlphaNum values consistent */ -}} -{{- define "storage.secretkey" -}} - {{- if not (index .Release "secrets") -}} - {{- $_ := set .Release "secrets" dict -}} - {{- end -}} - {{- if not (index .Release.secrets "secretkey") -}} - {{- if .Values.secretkey | default "" | ne "" -}} - {{- $_ 
:= set .Release.secrets "secretkey" .Values.secretkey -}} - {{- else -}} - {{- $_ := set .Release.secrets "secretkey" (randAlphaNum 32) -}} - {{- end -}} - {{- end -}} - {{- index .Release.secrets "secretkey" -}} -{{- end -}} diff --git a/charts/storage/templates/storage-secret-creds.yaml b/charts/storage/templates/storage-secret-creds.yaml index 3cdff7e..fcfb29f 100644 --- a/charts/storage/templates/storage-secret-creds.yaml +++ b/charts/storage/templates/storage-secret-creds.yaml @@ -6,17 +6,5 @@ metadata: heritage: drycc type: Opaque data: - {{- if eq .Values.global.storageLocation "on-cluster"}} - lookup: {{ "path" | b64enc }} - {{- $endpoint := (printf "http://drycc-storage.%s.svc.%s:9000" .Release.Namespace .Values.global.clusterDomain) }} - endpoint: {{ $endpoint | b64enc }} - builder-bucket: {{ "builder" | b64enc }} - registry-bucket: {{ "registry" | b64enc }} - {{- else }} - lookup: {{ .Values.lookup | b64enc }} - endpoint: {{ .Values.endpoint | b64enc }} - builder-bucket: {{ .Values.builderBucket | b64enc }} - registry-bucket: {{ .Values.registryBucket | b64enc }} - {{- end }} - accesskey: {{ include "common.secrets.lookup" (dict "secret" "storage-creds" "key" "accesskey" "defaultValue" (include "storage.accesskey" .) "context" $) }} - secretkey: {{ include "common.secrets.lookup" (dict "secret" "storage-creds" "key" "secretkey" "defaultValue" (include "storage.secretkey" .) 
"context" $) }} \ No newline at end of file + accesskey: {{ include "common.secrets.lookup" (dict "secret" "storage-creds" "key" "accesskey" "defaultValue" (default (randAlphaNum 32) .Values.accesskey) "context" $) }} + secretkey: {{ include "common.secrets.lookup" (dict "secret" "storage-creds" "key" "secretkey" "defaultValue" (default (randAlphaNum 32) .Values.secretkey) "context" $) }} diff --git a/charts/storage/templates/storage-service-account.yaml b/charts/storage/templates/storage-service-account.yaml index e09c31b..9b17ef9 100644 --- a/charts/storage/templates/storage-service-account.yaml +++ b/charts/storage/templates/storage-service-account.yaml @@ -1,8 +1,6 @@ -{{- if eq .Values.global.storageLocation "on-cluster" }} apiVersion: v1 kind: ServiceAccount metadata: name: drycc-storage labels: heritage: drycc -{{- end }} diff --git a/charts/storage/templates/storage-service.yaml b/charts/storage/templates/storage-service.yaml index b8e651b..959e1bf 100644 --- a/charts/storage/templates/storage-service.yaml +++ b/charts/storage/templates/storage-service.yaml @@ -1,11 +1,10 @@ -{{- if eq .Values.global.storageLocation "on-cluster" }} apiVersion: v1 kind: Service metadata: name: drycc-storage annotations: - prometheus.io/path: /minio/v2/metrics/resource - prometheus.io/port: "9000" + prometheus.io/path: /metrics + prometheus.io/port: "9200" prometheus.io/scrape: "true" {{- with .Values.service.annotations }} {{- toYaml . 
| nindent 4 }} @@ -14,6 +13,8 @@ metadata: heritage: drycc spec: type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true selector: app: drycc-storage ports: @@ -21,4 +22,5 @@ spec: port: 9000 - name: console port: 9001 -{{- end }} + - name: metrics + port: 9200 diff --git a/charts/storage/templates/storage-statefulset.yaml b/charts/storage/templates/storage-statefulset.yaml index f4ccbaf..3346f81 100644 --- a/charts/storage/templates/storage-statefulset.yaml +++ b/charts/storage/templates/storage-statefulset.yaml @@ -1,7 +1,19 @@ -{{- if eq .Values.global.storageLocation "on-cluster" }} {{- $zoneCount := int .Values.zones }} -{{- $driveCount := int .Values.drives }} +{{- $driveCount := int .Values.drivesPerNode }} {{- $replicaCount := int .Values.replicas }} +{{- $volumesList := list }} +{{- $hostsList := list }} +{{- range $i := until $zoneCount }} +{{- $factor := mul $i $replicaCount }} +{{- $endIndex := sub (add $factor $replicaCount) 1 }} +{{- $beginIndex := mul $i $replicaCount }} +{{- $volumes := (printf "http://drycc-storage-{%d...%d}.drycc-storage:9000/data/{0...%d}" $beginIndex $endIndex (sub $driveCount 1) ) }} +{{- $volumesList = append $volumesList $volumes }} +{{- range $j := until $replicaCount }} +{{- $nodeIndex := add $factor $j }} +{{- $hostsList = append $hostsList (printf "drycc-storage-%d.drycc-storage" $nodeIndex) }} +{{- end }} +{{- end }} apiVersion: apps/v1 kind: StatefulSet metadata: @@ -21,6 +33,10 @@ spec: app: drycc-storage template: metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: "8888" + prometheus.io/scrape: "true" labels: app: drycc-storage spec: @@ -29,19 +45,42 @@ spec: podAntiAffinity: {{- include "common.affinities.pods" (dict "type" .Values.podAntiAffinityPreset.type "component" "" "extraMatchLabels" .Values.podAntiAffinityPreset.extraMatchLabels "topologyKey" "" "context" $) | nindent 10 }} nodeAffinity: {{- include "common.affinities.nodes" (dict "type" .Values.nodeAffinityPreset.type 
"key" .Values.nodeAffinityPreset.key "values" .Values.nodeAffinityPreset.values ) | nindent 10 }} serviceAccount: drycc-storage + initContainers: + - name: dns-check + image: {{.Values.imageRegistry}}/{{.Values.imageOrg}}/storage:{{.Values.imageTag}} + imagePullPolicy: {{.Values.imagePullPolicy}} + command: + - /usr/bin/env + - bash + - -ec + - | + echo "Checking DNS resolution for storage hosts..." + # Check all hosts in hostsList + hosts=({{- range $host := $hostsList }} "{{$host}}"{{- end }}) + for host in "${hosts[@]}"; do + echo "Checking DNS resolution for $host..." + while true; do + if nslookup "$host" > /dev/null 2>&1; then + echo "DNS resolution for $host: OK" + break + else + echo "DNS resolution for $host failed, retrying in 5 seconds..." + sleep 5 + fi + done + done + echo "All DNS checks passed, ready to start storage containers." containers: - name: drycc-storage image: {{.Values.imageRegistry}}/{{.Values.imageOrg}}/storage:{{.Values.imageTag}} imagePullPolicy: {{.Values.imagePullPolicy}} env: - - name: MINIO_PROMETHEUS_AUTH_TYPE - value: "public" - - name: "MINIO_ROOT_USER" + - name: "RUSTFS_ACCESS_KEY" valueFrom: secretKeyRef: name: storage-creds key: accesskey - - name: "MINIO_ROOT_PASSWORD" + - name: "RUSTFS_SECRET_KEY" valueFrom: secretKeyRef: name: storage-creds @@ -53,13 +92,17 @@ spec: - name: console containerPort: 9001 protocol: TCP + {{- with index .Values "resources" }} + resources: + {{- toYaml . 
| nindent 10 }} + {{- end }} livenessProbe: httpGet: - path: /minio/health/live + path: /health port: 9000 - initialDelaySeconds: 90 + initialDelaySeconds: 5 periodSeconds: 5 - timeoutSeconds: 1 + timeoutSeconds: 5 successThreshold: 1 failureThreshold: 5 readinessProbe: @@ -70,25 +113,70 @@ spec: timeoutSeconds: 1 successThreshold: 1 failureThreshold: 5 + startupProbe: + tcpSocket: + port: 9000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 60 args: - - minio - - server - {{- $clusters := list }} - {{- range $i := until $zoneCount }} - {{- $factor := mul $i $replicaCount }} - {{- $endIndex := sub (add $factor $replicaCount) 1 }} - {{- $beginIndex := mul $i $replicaCount }} - - {{(printf "http://drycc-storage-{%d...%d}.drycc-storage.%s.svc.%s:9000/data/{0...%d}" $beginIndex $endIndex $.Release.Namespace $.Values.global.clusterDomain (sub $driveCount 1) ) }} - {{- end }} - - --address - - :9000 - - --console-address - - :9001 + - rustfs + - {{ join " " $volumesList }} + - --address=:9000 + - --console-address=:9001 + - --obs-endpoint=http://localhost:4317 volumeMounts: {{- range $diskId := until $driveCount }} - name: storage-data-{{$diskId}} mountPath: /data/{{$diskId}} {{- end }} + - name: drycc-storage-otelcol + image: {{.Values.imageRegistry}}/{{.Values.imageOrg}}/storage:{{.Values.imageTag}} + imagePullPolicy: {{.Values.imagePullPolicy}} + ports: + - name: receiver + containerPort: 4317 + protocol: TCP + - name: otelcol + containerPort: 9200 + protocol: TCP + - name: metrics + containerPort: 8888 + protocol: TCP + {{- with index .Values "resources" }} + resources: + {{- toYaml . 
| nindent 10 }} + {{- end }} + livenessProbe: + httpGet: + path: /metrics + port: 9200 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 5 + readinessProbe: + tcpSocket: + port: 9200 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 5 + startupProbe: + tcpSocket: + port: 9200 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 60 + args: + - otelcol + - --config=/etc/otelcol/otelcol.yaml securityContext: fsGroup: 1001 runAsGroup: 1001 @@ -118,4 +206,3 @@ spec: emptyDir: {} {{- end }} {{- end }} -{{- end }} diff --git a/charts/storage/values.yaml b/charts/storage/values.yaml index db160b0..1a9e68f 100644 --- a/charts/storage/values.yaml +++ b/charts/storage/values.yaml @@ -3,7 +3,15 @@ imagePullPolicy: "Always" imageTag: "canary" imageRegistry: "registry.drycc.cc" -# The following parameters are configured only when using an on-cluster Minio instance +resources: {} + # limits: + # cpu: 200m + # memory: 50Mi + # requests: + # cpu: 100m + # memory: 30Mi + +# The following parameters are configured only when using an on-cluster object storage instance nodeAffinityPreset: key: "drycc.cc/node" type: "soft" @@ -18,12 +26,12 @@ podAntiAffinityPreset: extraMatchLabels: app: "drycc-storage" -# Minio zone count -zones: 1 -# Device count of per replica -drives: 1 # Number of replicas per zone replicas: 4 +# Object storage zone count +zones: 1 +# Drive count per replica +drivesPerNode: 1 # Access key and secret key for storage accesskey: "" @@ -38,16 +46,3 @@ persistence: enabled: false size: 5Gi storageClass: "" - -global: - # A domain name consists of one or more parts. - # Periods (.) are used to separate these parts. - # Each part must be 1 to 63 characters in length and can contain lowercase letters, digits, and hyphens (-). - # It must start and end with a lowercase letter or digit.
- clusterDomain: "cluster.local" - # Set the location of Workflow's Object Storage - # - # Valid values are: - # - on-cluster: Run drycc storage within the Kubernetes cluster - # - off-cluster: Run Storage outside the Kubernetes cluster (Compatible with s3 API) - storageLocation: on-cluster \ No newline at end of file diff --git a/rootfs/etc/otelcol/otelcol.yaml b/rootfs/etc/otelcol/otelcol.yaml new file mode 100644 index 0000000..bdc0575 --- /dev/null +++ b/rootfs/etc/otelcol/otelcol.yaml @@ -0,0 +1,35 @@ +# Simplified version: Focus on OTLP metrics collection and Prometheus export +receivers: + otlp: # OTLP protocol receiver + protocols: + grpc: # OTLP gRPC receiver for rustfs metrics data + endpoint: 0.0.0.0:4317 # gRPC endpoint for receiving metrics +processors: + batch: # Batch processor to improve throughput + timeout: 5s # Maximum time to wait before sending a batch + send_batch_size: 1000 # Item count that triggers sending a batch + memory_limiter: # Memory limiter to prevent OOM + check_interval: 1s # Interval to check memory usage + limit_mib: 256 # Memory limit of 256 MiB +exporters: + prometheus: # Prometheus exporter scraped by Prometheus + endpoint: "0.0.0.0:9200" # Endpoint Prometheus scrapes for metrics + namespace: "rustfs" # Metrics prefix (will prefix all metrics with "rustfs_") + send_timestamps: true # Include timestamps with metrics +service: + pipelines: + metrics: # Keep only metrics pipeline + receivers: [otlp] # Receive OTLP metrics from rustfs + processors: [memory_limiter, batch] # Apply memory limiting and batching + exporters: [prometheus] # Export to Prometheus for Grafana + telemetry: + logs: + level: "warn" # Reduce log level to minimize output + metrics: + readers: + - pull: + exporter: + prometheus: + host: 0.0.0.0 # Host for internal metrics endpoint + port: 8888 # Port for internal metrics endpoint + without_units: true # Export metrics without units for compatibility