Skip to content

Production Deployment: Monitoring & Observability

Part of: Production Deployment Guide


6.1 Prometheus Setup

Prometheus Configuration (prometheus.yml):

# Server-wide defaults for scraping and rule evaluation
global:
  evaluation_interval: 15s
  scrape_interval: 15s
  # Labels identifying this Prometheus server's cluster and environment
  external_labels:
    environment: production
    cluster: heliosdb-prod

# Alertmanager instance that receives fired alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Rule files to load (glob)
rule_files:
  - 'alerts/*.yml'

# Scrape configurations
scrape_configs:
  # Metadata tier: pods in the heliosdb namespace labelled component=metadata
  - job_name: heliosdb-metadata
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names: [heliosdb]
    relabel_configs:
      # Drop every discovered pod whose component label is not "metadata"
      - action: keep
        source_labels:
          - __meta_kubernetes_pod_label_component
        regex: metadata
      # Use the pod name as the instance label
      - action: replace
        source_labels:
          - __meta_kubernetes_pod_name
        target_label: instance
      # Copy the pod's namespace into a namespace label
      - action: replace
        source_labels:
          - __meta_kubernetes_namespace
        target_label: namespace

  # HeliosDB storage nodes: pods in the heliosdb namespace with component=storage
  - job_name: 'heliosdb-storage'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - heliosdb
    relabel_configs:
      # Keep only pods whose component label is exactly "storage"
      - source_labels: [__meta_kubernetes_pod_label_component]
        action: keep
        regex: storage
      # Identify each target by pod name
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: instance
      # Record the namespace, matching the heliosdb-metadata job's label set
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace

  # HeliosDB compute nodes: pods in the heliosdb namespace with component=compute
  - job_name: 'heliosdb-compute'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - heliosdb
    relabel_configs:
      # Keep only pods whose component label is exactly "compute"
      - source_labels: [__meta_kubernetes_pod_label_component]
        action: keep
        regex: compute
      # Identify each target by pod name
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: instance
      # Record the namespace, matching the heliosdb-metadata job's label set
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace

  # Host-level metrics: one target per Kubernetes node
  - job_name: node-exporter
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # Rewrite discovered addresses ending in :10250 to port 9100
      - action: replace
        source_labels:
          - __address__
        regex: '(.*):10250'
        target_label: __address__
        replacement: '${1}:9100'

  # Kubernetes API server metrics, scraped over HTTPS with the
  # in-pod service-account CA certificate and token
  - job_name: kubernetes-apiservers
    scheme: https
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      # Source label values are joined with ';' before matching, so this keeps
      # only the "https" port of the "kubernetes" service in "default"
      - action: keep
        source_labels:
          - __meta_kubernetes_namespace
          - __meta_kubernetes_service_name
          - __meta_kubernetes_endpoint_port_name
        regex: default;kubernetes;https

Alert Rules (alerts/heliosdb.yml):

# Alerting rules for HeliosDB services and the hosts they run on
groups:
  - name: heliosdb_alerts
    # Rules in this group are evaluated every 30 seconds
    interval: 30s
    rules:
      # p95 query duration over the last 5 minutes exceeds 1 second
      - alert: HighQueryLatency
        expr: >-
          histogram_quantile(0.95, rate(heliosdb_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        annotations:
          summary: 'High query latency on {{ $labels.instance }}'
          description: '95th percentile query latency is {{ $value }}s'
        labels:
          severity: warning

      # Any target of a heliosdb-* scrape job reporting up == 0 for a minute
      - alert: NodeDown
        expr: 'up{job=~"heliosdb-.*"} == 0'
        for: 1m
        annotations:
          summary: 'HeliosDB node {{ $labels.instance }} is down'
          description: 'Node has been down for more than 1 minute'
        labels:
          severity: critical

      # Used memory (total minus available) above 90% of total
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | humanizePercentage }}'
        labels:
          severity: warning

      # High CPU usage: non-idle CPU time, averaged across cores per instance,
      # above 80%.
      # Uses rate() rather than irate(): irate() only looks at the last two
      # samples in the window, so a single brief dip would reset the `for`
      # timer and the alert could fail to fire under sustained load.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Replication lag above 10 seconds, sustained for 2 minutes
      - alert: HighReplicationLag
        expr: 'heliosdb_replication_lag_seconds > 10'
        for: 2m
        annotations:
          summary: 'High replication lag on {{ $labels.instance }}'
          description: 'Replication lag is {{ $value }}s'
        labels:
          severity: warning

      # Less than 10% of the /data filesystem is still available
      - alert: LowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/data"} / node_filesystem_size_bytes{mountpoint="/data"}) < 0.1
        for: 5m
        annotations:
          summary: 'Low disk space on {{ $labels.instance }}'
          description: 'Only {{ $value | humanizePercentage }} disk space remaining'
        labels:
          severity: warning

      # More than 10 transaction failures per second over a 5-minute window
      - alert: HighTransactionFailureRate
        expr: 'rate(heliosdb_transaction_failures_total[5m]) > 10'
        for: 5m
        annotations:
          summary: 'High transaction failure rate on {{ $labels.instance }}'
          description: '{{ $value }} transactions/sec failing'
        labels:
          severity: warning

      # More than 90% of the connection pool is in active use
      - alert: ConnectionPoolExhausted
        expr: 'heliosdb_connection_pool_active / heliosdb_connection_pool_size > 0.9'
        for: 5m
        annotations:
          summary: 'Connection pool nearly exhausted on {{ $labels.instance }}'
          description: '{{ $value | humanizePercentage }} of connections in use'
        labels:
          severity: warning

6.2 Grafana Dashboards

Dashboard Provisioning (grafana/dashboards/heliosdb-overview.json):

{
  "dashboard": {
    "title": "HeliosDB Overview",
    "tags": ["heliosdb", "database"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Query Throughput",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(heliosdb_queries_total[5m])) by (instance)",
            "legendFormat": "{{instance}}"
          }
        ]
      },
      {
        "id": 2,
        "title": "Query Latency (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(heliosdb_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p95 {{instance}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Active Connections",
        "type": "graph",
        "targets": [
          {
            "expr": "heliosdb_connection_pool_active",
            "legendFormat": "{{instance}}"
          }
        ]
      },
      {
        "id": 4,
        "title": "Transaction Success Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(heliosdb_transaction_commits_total[5m]) / (rate(heliosdb_transaction_commits_total[5m]) + rate(heliosdb_transaction_rollbacks_total[5m]))",
            "legendFormat": "Success Rate {{instance}}"
          }
        ]
      }
    ]
  }
}

6.3 Log Aggregation

Fluentd Configuration (fluentd.conf):

# Tail every HeliosDB log file and parse each line as a JSON record.
<source>
  @type tail
  tag heliosdb.*
  path /var/log/heliosdb/*.log
  pos_file /var/log/fluentd/heliosdb.pos
  <parse>
    @type json
    # Event time comes from the record's "timestamp" field.
    # NOTE(review): the trailing "Z" is matched literally rather than parsed
    # as a zone offset; confirm the hosts run in UTC.
    time_format %Y-%m-%dT%H:%M:%S.%NZ
    time_key timestamp
  </parse>
</source>

# Add the cluster, environment, and hostname to every HeliosDB record.
<filter heliosdb.**>
  @type record_transformer
  <record>
    cluster "heliosdb-prod"
    environment "production"
    # Embedded Ruby: the hostname of the machine running Fluentd
    hostname "#{Socket.gethostname}"
  </record>
</filter>

# Ship HeliosDB records to Elasticsearch through a file-backed buffer.
<match heliosdb.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  include_tag_key true
  logstash_format true
  logstash_prefix heliosdb
  <buffer>
    @type file
    path /var/log/fluentd/buffer/heliosdb
    flush_interval 5s
    # NOTE(review): no <secondary> output is configured, so chunks that still
    # fail after 3 retries are presumably discarded; confirm this is
    # acceptable for production logs.
    retry_max_times 3
  </buffer>
</match>