feat(monitor): use prometheus as grafana datasource

jianxiaoguo · jianxiaoguo · commit 03d664b2eb14 · 2023-03-14T17:01:03.000+08:00
diff --git a/README.md b/README.md
@@ -19,7 +19,8 @@ Please see below for links and descriptions of each component:
 - [registry](https://github.com/drycc/registry) - The Docker registry
 - [logger](https://github.com/drycc/logger) - The (in-memory) log buffer for `drycc logs`
 - [monitor](https://github.com/drycc/monitor) - The platform monitoring components
-- [influxdb](https://github.com/drycc/influxdb) - The monitor database
+- [influxdb](https://github.com/drycc/influxdb) - The controller app metrics database
+- [prometheus](https://github.com/drycc/prometheus) - The monitor database
 - [rabbitmq](https://github.com/drycc/rabbitmq) - RabbitMQ is a message broker used with controller celery
 - [storage](https://github.com/drycc/storage) - The in-cluster, kubernetes storage, s3 api compatible, hybrid storage system.
 - [workflow-cli](https://github.com/drycc/workflow-cli) - Workflow CLI `drycc`
diff --git a/_scripts/install.sh b/_scripts/install.sh
@@ -478,6 +478,14 @@ monitor:
   telegraf:
     imageRegistry: ${DRYCC_REGISTRY}
 
+prometheus:
+  prometheus-server:
+    retention: ${PROMETHEUS_SERVER_RETENTION:-"15d"}
+    persistence:
+      enabled: true
+      accessMode: ReadWriteOnce
+      size: ${PROMETHEUS_SERVER_PERSISTENCE_SIZE:-10Gi}
+      storageClass: ${PROMETHEUS_SERVER_PERSISTENCE_STORAGE_CLASS:-""}
 
 passport:
   replicas: ${PASSPORT_REPLICAS}
diff --git a/charts/workflow/Chart.yaml b/charts/workflow/Chart.yaml
@@ -42,6 +42,9 @@ dependencies:
 - name: passport
   repository: oci://registry.drycc.cc/charts-testing
   version: x.x.x
+- name: prometheus
+  repository: oci://registry.drycc.cc/charts-testing
+  version: x.x.x
 description: Drycc Workflow
 home: https://github.com/drycc/workflow
 maintainers:
diff --git a/charts/workflow/values.yaml b/charts/workflow/values.yaml
@@ -42,6 +42,12 @@ global:
   # - on-cluster: Run Influxdb within the Kubernetes cluster
   # - off-cluster: Influxdb is running outside of the cluster and credentials and connection information will be provided.
   influxdbLocation: "on-cluster"
+  # Set the location of Workflow's prometheus instance
+  #
+  # Valid values are:
+  # - on-cluster: Run prometheus within the Kubernetes cluster
+  # - off-cluster: prometheus is running outside of the cluster and credentials and connection information will be provided.
+  prometheusLocation: "on-cluster"
   # Set the location of Workflow's grafana instance
   #
   # Valid values are:
@@ -286,6 +292,29 @@ passport:
   databaseUrl: ""
   databaseReplicaUrl: ""
 
+prometheus:
+  ## prometheus-server configuration##
+  prometheus-server:
+    replicas: 1
+    retention: 15d
+    # persistence config
+    persistence:
+      enabled: true
+      accessMode: ReadWriteOnce
+      size: 10Gi
+      storageClass: ""
+  ## node-exporter configuration##
+  node-exporter:
+    enabled: true
+  ## kube-state-metrics configuration
+  ##
+  kube-state-metrics:
+    enabled: true
+  # Configure the following ONLY if using an off-cluster prometheus database
+  # URL configuration is only available in off-cluster prometheus database
+  url: "http://my.prometheus.url:9090"
+
+
 # acme configuration takes effect if and only if certManagerEnabled is true
 acme:
   server: https://acme-v02.api.letsencrypt.org/directory
diff --git a/src/managing-workflow/platform-logging.md b/src/managing-workflow/platform-logging.md
@@ -39,29 +39,13 @@ Error: There are currently no log messages. Please check the following things:
                         │ Router │                  ┌────────┐     ┌─────┐
                         └────────┘                  │ Logger │◀───▶│Redis│
                             │                       └────────┘     └─────┘
-                         Log file                       ▲                
+                        Log file                        ▲                
                             │                           │                
                             ▼                           │                
-┌────────┐             ┌─────────┐    logs/metrics   ┌──────────────┐             
-│App Logs│──Log File──▶│ fluentd │───────topics─────▶│ Redis Stream │             
-└────────┘             └─────────┘                   └──────────────┘             
-                                                        │                
-                                                        │                
-┌─────────────┐                                         │                
-│ HOST        │                                         ▼                
-│  Telegraf   │───┐                                 ┌────────┐            
-└─────────────┘   │                                 │Telegraf│            
-                  │                                 └────────┘            
-┌─────────────┐   │                                      │                
-│ HOST        │   │    ┌───────────┐                     │                
-│  Telegraf   │───┼───▶│ InfluxDB  │◀────Wire ───────────┘                
-└─────────────┘   │    └───────────┘   Protocol                   
-                  │          ▲                                    
-┌─────────────┐   │          │                                    
-│ HOST        │   │          ▼                                    
-│  Telegraf   │───┘    ┌──────────┐                               
-└─────────────┘        │ Grafana  │                               
-                       └──────────┘                               
+┌────────┐             ┌─────────┐    logs/metrics   ┌──────────────┐     
+│App Logs│──Log File──▶│ fluentd │───────topics─────▶│ Redis Stream │     
+└────────┘             └─────────┘                   └──────────────┘     
+                                                                          
 ```
 
 ## Default Configuration
diff --git a/src/managing-workflow/platform-monitoring.md b/src/managing-workflow/platform-monitoring.md
@@ -2,42 +2,31 @@
 
 ## Description
 
-We now include a monitoring stack for introspection on a running Kubernetes cluster. The stack includes 3 components:
+We now include a monitoring stack for introspection on a running Kubernetes cluster. The stack includes 4 components:
 
-* [Telegraf](https://docs.influxdata.com/telegraf) - Metrics collection daemon written by team behind InfluxDB.
-* [InfluxDB](https://docs.influxdata.com/influxdb) - Time series database
-* [Grafana](http://grafana.org/) - Graphing tool for time series data
+* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics), kube-state-metrics (KSM) is a simple service that listens to the Kubernetes API server and generates metrics about the state of the objects.
+* [Node Exporter](http://github.com/prometheus/node_exporter), Prometheus exporter for hardware and OS metrics exposed by *NIX kernels.
+* [Prometheus](https://prometheus.io/), a [Cloud Native Computing Foundation](https://cncf.io/) project, is a systems and service monitoring system.
+* [Grafana](http://grafana.org/), Graphing tool for time series data
 
 ## Architecture Diagram
 
 ```
-                        ┌────────┐                                        
-                        │ Router │                  ┌────────┐     ┌─────┐
-                        └────────┘                  │ Logger │◀───▶│Redis│
-                            │                       └────────┘     └─────┘
-                        Log file                        ▲                
-                            │                           │                
-                            ▼                           │                
-┌────────┐             ┌─────────┐    logs/metrics   ┌──────────────┐             
-│App Logs│──Log File──▶│ fluentd │───────topics─────▶│ Redis Stream │             
-└────────┘             └─────────┘                   └──────────────┘             
-                                                        │                
-                                                        │                
-┌─────────────┐                                         │                
-│ HOST        │                                         ▼                
-│  Telegraf   │───┐                                 ┌────────┐            
-└─────────────┘   │                                 │Telegraf│            
-                  │                                 └────────┘            
-┌─────────────┐   │                                      │                
-│ HOST        │   │    ┌───────────┐                     │                
-│  Telegraf   │───┼───▶│ InfluxDB  │◀────Wire ───────────┘                
-└─────────────┘   │    └───────────┘   Protocol                   
-                  │          ▲                                    
-┌─────────────┐   │          │                                    
-│ HOST        │   │          ▼                                    
-│  Telegraf   │───┘    ┌──────────┐                               
-└─────────────┘        │ Grafana  │                               
-                       └──────────┘                               
+┌────────────────┐                                                        
+│ HOST           │                                                        
+│  node-exporter │◀──┐                       ┌──────────────────┐         
+└────────────────┘   │                       │kube-state-metrics│         
+                     │                       └──────────────────┘         
+┌────────────────┐   │                               ▲                    
+│ HOST           │   │    ┌────────────┐             │                    
+│  node-exporter │◀──┼────│ Prometheus │─────────────┘                    
+└────────────────┘   │    └────────────┘                                  
+                     │          ▲                                         
+┌───────────────┐    │          │                                         
+│ HOST          │    │          ▼                                         
+│  node-exporter│◀───┘    ┌──────────┐                                    
+└───────────────┘         │ Grafana  │                                    
+                          └──────────┘                                    
 ```
 
 ## [Grafana](https://grafana.com/)
@@ -75,44 +64,28 @@ If you wish to have persistence for Grafana you can set `enabled` to `true` in t
 
 If you wish to provide your own Grafana instance you can set `grafanaLocation` in the `values.yaml` file before running `helm install`.
 
-## [InfluxDB](https://docs.influxdata.com/influxdb)
-InfluxDB writes data to the host disk; however, if the InfluxDB pod dies and comes back on another host, the data will not be recovered. The InfluxDB Admin UI is also exposed through the router allowing users to access the query engine by going to `influx.mydomain.com`. You will need to configure where to find the `influx-api` endpoint by clicking the "gear" icon at the top right and changing the host to `influx-api.mydomain.com` and port to `80`.
+## [Prometheus](https://prometheus.io/)
+Prometheus writes data to the host disk; however, if the Prometheus pod dies and comes back on another host, the data will not be recovered. The Prometheus graph UI is also exposed through the router, allowing users to access the query engine by going to `prometheus.mydomain.com`.
 
 ### On Cluster Persistence
-If you wish to have persistence for InfluxDB you can set `enabled` to `true` in the `values.yaml` file before running `helm install`.
+You can enable or disable `node-exporter` and `kube-state-metrics` by setting their `enabled` value to `true` or `false` in the `values.yaml` file.
+If you wish to have persistence for Prometheus you can set `enabled` to `true` in the `values.yaml` file before running `helm install`.
 
 ```
- influxdb:
-   # Configure the following ONLY if you want persistence for on-cluster grafana
-   # GCP PDs and EBS volumes are supported only
-   persistence:
-     enabled: true # Set to true to enable persistence
-     size: 5Gi # PVC size
+prometheus:
+  prometheus-server:
+    persistence:
+      enabled: true # Set to true to enable persistence
+      size: 10Gi # PVC size
+  node-exporter:
+    enabled: true
+  kube-state-metrics:
+    enabled: true
 ```
 
-### Off Cluster Influxdb
-
-To use off-cluster Influx v2, please provide the following values in the `values.yaml` file before running `helm install`.
-
-* `influxdbLocation=off-cluster`
-* `url = "http://my-influxhost.com:8086"`
-* `bucket = "metrics"`
-* `org = "drycc"`
-* `token = "MysuperSecurePassword"`
-
-
-## [Telegraf](https://docs.influxdata.com/telegraf)
-
-Telegraf is the metrics collection daemon used within the monitoring stack. It will collect and send the following metrics to InfluxDB:
-
-* System level metrics such as CPU, Load Average, Memory, Disk, and Network stats
-* Container level metrics such as CPU and Memory
-* Kubernetes metrics such as API request latency, Pod Startup Latency, and number of running pods
-
-It is possible to send these metrics to other endpoints besides InfluxDB. For more information please consult the following [file](https://github.com/drycc/monitor/blob/main/telegraf/rootfs/config.toml.tpl)
-
-### Customizing the Monitoring Stack
+### Off Cluster Prometheus
 
-To learn more about customizing each of the above components please visit the [Tuning Component Settings][] section.
+To use off-cluster Prometheus, please provide the following values in the `values.yaml` file before running `helm install`.
 
-[Tuning Component Settings]: tuning-component-settings.md#customizing-the-monitor
+* `global.prometheusLocation=off-cluster`
+* `prometheus.url = "http://my.prometheus.url:9090"`
diff --git a/src/quickstart/install-workflow.md b/src/quickstart/install-workflow.md
@@ -234,6 +234,9 @@ HELMBROKER_REPLICAS                        | Number of helmbroker api replicas t
 HELMBROKER_CELERY_REPLICAS                 | Number of helmbroker celery replicas to deploy
 HELMBROKER_PERSISTENCE_SIZE                | The size of the persistence space allocated to `helmbroker`, which is `5Gi` by default
 HELMBROKER_PERSISTENCE_STORAGE_CLASS       | StorangeClass of `helmbroker`; default storangeclass is used by default
+PROMETHEUS_SERVER_RETENTION                | Prometheus data retention period (default if not specified is 15 days)
+PROMETHEUS_SERVER_PERSISTENCE_SIZE         | The size of the persistence space allocated to `prometheus-server`, which is `10Gi` by default
+PROMETHEUS_SERVER_PERSISTENCE_STORAGE_CLASS| StorageClass of `prometheus-server`; the default StorageClass is used if not set
 K3S_DATA_DIR                               | The config of k3s data dir; If not set, the default path is used
 ACME_SERVER                                | ACME Server url, default use letsencrypt
 ACME_EAB_KEY_ID                            | The key ID of which your external account binding is indexed by the external account
diff --git a/src/understanding-workflow/components.md b/src/understanding-workflow/components.md
@@ -146,6 +146,12 @@ Helm Broker is a Service Broker that exposes Helm charts as Service Classes in S
 To do so, Helm Broker uses the concept of addons. An addon is an abstraction layer over a Helm chart
 which provides all information required to convert the chart into a Service Class.
 
+## Prometheus
+
+**Project Location:** [drycc/prometheus](https://github.com/drycc/prometheus)
+
+Prometheus is an open-source systems monitoring and alerting toolkit originally built at SoundCloud.
+
 ## See Also
 
 * [Workflow Concepts][concepts]