Skip to content

Production Deployment: Kubernetes Deployment

Part of: Production Deployment Guide


3.1 Cluster Creation

We covered cluster creation in Section 2.1. Now let's configure the cluster for HeliosDB.

Install Required Components:

# EBS CSI driver, applied from the upstream kustomize overlay (release-1.25 ref)
kubectl apply --kustomize "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=release-1.25"

# Metrics server, from the manifest attached to the latest upstream release
kubectl apply --filename https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml

# Prometheus Operator (kube-prometheus-stack chart): register the chart repo first
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Install the chart as release "prometheus" into the "monitoring" namespace,
# creating that namespace if it does not exist yet.
helm install prometheus prometheus-community/kube-prometheus-stack \
  --create-namespace \
  --namespace monitoring \
  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false

3.2 Namespace Setup

Create Namespaces:

# namespaces.yaml
# Declares the three namespaces used by this guide: heliosdb, heliosdb-system
# and monitoring. The two HeliosDB namespaces are labelled as production.

# Namespace that holds the HeliosDB workloads defined later in this guide.
kind: Namespace
apiVersion: v1
metadata:
  name: heliosdb
  labels:
    environment: production
    name: heliosdb
---
# Companion namespace; no resources in this section are placed in it.
kind: Namespace
apiVersion: v1
metadata:
  name: heliosdb-system
  labels:
    environment: production
    name: heliosdb-system
---
# Monitoring namespace (carries only the name label).
kind: Namespace
apiVersion: v1
metadata:
  name: monitoring
  labels:
    name: monitoring

Apply namespaces:

# Create or update the namespaces declared in namespaces.yaml
kubectl apply --filename namespaces.yaml

Resource Quotas:

# resource-quotas.yaml
# Namespace-wide ceiling on compute and storage for everything in "heliosdb".
#
# CPU sizing (from the workloads in this guide):
#   compute  : 20 pods (HPA maxReplicas) x 4 CPU request / 8 CPU limit = 80 / 160
#   storage  :  5 pods                   x 4 CPU request / 8 CPU limit = 20 /  40
#   metadata :  3 pods                   x 2 CPU request / 4 CPU limit =  6 /  12
#   total                                                              = 106 / 212
# A quota of 100 / 200 CPU would reject new compute pods before the HPA could
# reach its maxReplicas, so the CPU quota is set above that total. The extra
# headroom also leaves room for rolling-update surge pods and for the storage
# VPA raising CPU requests (NOTE(review): re-check against the VPA maxAllowed
# values if those are changed).
apiVersion: v1
kind: ResourceQuota
metadata:
  name: heliosdb-quota
  namespace: heliosdb
spec:
  hard:
    # Sum of CPU requests / limits across all pods in the namespace.
    requests.cpu: "200"
    limits.cpu: "400"
    # Sum of memory requests / limits across all pods in the namespace.
    requests.memory: 500Gi
    limits.memory: 1000Gi
    # Maximum number of PVCs and total requested PVC capacity.
    persistentvolumeclaims: "50"
    requests.storage: 10Ti

Apply quotas:

# Create or update the ResourceQuota declared in resource-quotas.yaml
kubectl apply --filename resource-quotas.yaml

3.3 StatefulSet Deployment

3.3.1 Metadata Nodes (Raft Consensus)

metadata-statefulset.yaml:

# Headless Service for the metadata nodes. "clusterIP: None" gives every
# StatefulSet pod a stable DNS name (heliosdb-metadata-N.heliosdb-metadata),
# which is what the --cluster-peers argument below relies on.
apiVersion: v1
kind: Service
metadata:
  name: heliosdb-metadata
  namespace: heliosdb
  labels:
    app: heliosdb
    component: metadata
spec:
  type: ClusterIP
  clusterIP: None
  # Publish the per-pod DNS records even before a pod passes its readiness
  # probe. Without this, peers cannot resolve each other during bootstrap, so
  # the Raft group may be unable to form a quorum on first start.
  # Trade-off: clients resolving "heliosdb-metadata" can also receive pods that
  # are not ready yet and must tolerate/retry failed connections.
  publishNotReadyAddresses: true
  ports:
    - port: 7001
      name: metadata
    - port: 8300
      name: raft
    - port: 9090
      name: metrics
  selector:
    app: heliosdb
    component: metadata
---
# Three-replica StatefulSet running the metadata (Raft consensus) nodes.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: heliosdb-metadata
  namespace: heliosdb
spec:
  serviceName: heliosdb-metadata
  replicas: 3
  # Start all replicas at once instead of waiting for each ordinal to become
  # Ready first (the default, OrderedReady). With ordered startup, pod 0 would
  # run alone until Ready, before any peer exists.
  # NOTE(review): assumes /ready may depend on Raft quorum - confirm. This
  # field cannot be changed on an existing StatefulSet.
  podManagementPolicy: Parallel
  selector:
    matchLabels:
      app: heliosdb
      component: metadata
  template:
    metadata:
      labels:
        app: heliosdb
        component: metadata
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
    spec:
      affinity:
        # Hard rule: never place two metadata pods on the same node.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: component
                    operator: In
                    values:
                      - metadata
              topologyKey: kubernetes.io/hostname
      # Schedule onto nodes labelled role=metadata and tolerate their
      # dedicated=metadata:NoSchedule taint.
      tolerations:
        - key: dedicated
          operator: Equal
          value: metadata
          effect: NoSchedule
      nodeSelector:
        role: metadata
      containers:
        - name: metadata
          image: heliosdb/heliosdb:6.0.0
          imagePullPolicy: IfNotPresent
          command:
            - /usr/local/bin/heliosdb-metadata
          args:
            # $(POD_NAME) is expanded by Kubernetes from the env entry below.
            - --node-id=$(POD_NAME)
            - --listen-addr=0.0.0.0:7001
            - --raft-addr=0.0.0.0:8300
            - --data-dir=/data/metadata
            # One entry per replica; keep in sync with "replicas" above.
            - --cluster-peers=heliosdb-metadata-0.heliosdb-metadata:8300,heliosdb-metadata-1.heliosdb-metadata:8300,heliosdb-metadata-2.heliosdb-metadata:8300
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: RUST_LOG
              value: "info,heliosdb=debug"
            - name: RUST_BACKTRACE
              value: "1"
          ports:
            - containerPort: 7001
              name: metadata
            - containerPort: 8300
              name: raft
            - containerPort: 9090
              name: metrics
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
            limits:
              cpu: "4"
              memory: 8Gi
          volumeMounts:
            # "data" comes from volumeClaimTemplates; "config" from the ConfigMap.
            - name: data
              mountPath: /data
            - name: config
              mountPath: /etc/heliosdb
          # Both probes use HTTP endpoints served on the metrics port (9090).
          livenessProbe:
            httpGet:
              path: /health
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /ready
              port: 9090
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
      volumes:
        - name: config
          configMap:
            name: heliosdb-config
  # One 100Gi PersistentVolumeClaim per replica, mounted at /data.
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: heliosdb-gp3
        resources:
          requests:
            storage: 100Gi

3.3.2 Storage Nodes

storage-statefulset.yaml:

# Headless Service (clusterIP: None) selecting the storage pods; it is the
# governing Service named by the storage StatefulSet's serviceName.
kind: Service
apiVersion: v1
metadata:
  namespace: heliosdb
  name: heliosdb-storage
  labels:
    component: storage
    app: heliosdb
spec:
  clusterIP: None
  type: ClusterIP
  selector:
    component: storage
    app: heliosdb
  ports:
    # Storage protocol port
    - name: storage
      port: 7002
    # Metrics / health endpoint port
    - name: metrics
      port: 9090
---
# Five-replica StatefulSet running the storage nodes. Each replica gets two
# persistent volumes: "data" (500Gi) and "wal" (100Gi).
kind: StatefulSet
apiVersion: apps/v1
metadata:
  namespace: heliosdb
  name: heliosdb-storage
spec:
  replicas: 5
  serviceName: heliosdb-storage
  selector:
    matchLabels:
      component: storage
      app: heliosdb
  template:
    metadata:
      annotations:
        prometheus.io/port: '9090'
        prometheus.io/scrape: 'true'
      labels:
        component: storage
        app: heliosdb
    spec:
      # Pin to nodes labelled role=storage and tolerate their
      # dedicated=storage:NoSchedule taint.
      nodeSelector:
        role: storage
      tolerations:
        - effect: NoSchedule
          key: dedicated
          operator: Equal
          value: storage
      affinity:
        # Soft rule: prefer spreading storage pods across nodes, but still
        # allow co-location when no other node fits.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                labelSelector:
                  matchExpressions:
                    - values:
                        - storage
                      operator: In
                      key: component
              weight: 100
      volumes:
        - configMap:
            name: heliosdb-config
          name: config
      containers:
        - name: storage
          imagePullPolicy: IfNotPresent
          image: heliosdb/heliosdb:6.0.0
          command:
            - /usr/local/bin/heliosdb-storage
          args:
            # $(POD_NAME) is substituted from the env entry of the same name.
            - --node-id=$(POD_NAME)
            - --listen-addr=0.0.0.0:7002
            - --data-dir=/data/storage
            - --wal-dir=/wal
            - --metadata-endpoints=heliosdb-metadata:7001
            - --replication-factor=3
            - --enable-compression=true
            - --compression-algorithm=zstd
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: RUST_LOG
              value: 'info,heliosdb=debug'
          ports:
            - name: storage
              containerPort: 7002
            - name: metrics
              containerPort: 9090
          resources:
            limits:
              memory: 32Gi
              cpu: '8'
            requests:
              memory: 16Gi
              cpu: '4'
          volumeMounts:
            # "data" and "wal" are backed by the volumeClaimTemplates below.
            - mountPath: /data
              name: data
            - mountPath: /wal
              name: wal
            - mountPath: /etc/heliosdb
              name: config
          # Readiness is probed from 30s after start; liveness from 60s and
          # only fails the container after 5 consecutive misses.
          readinessProbe:
            failureThreshold: 3
            timeoutSeconds: 3
            periodSeconds: 5
            initialDelaySeconds: 30
            httpGet:
              port: 9090
              path: /ready
          livenessProbe:
            failureThreshold: 5
            timeoutSeconds: 5
            periodSeconds: 10
            initialDelaySeconds: 60
            httpGet:
              port: 9090
              path: /health
  volumeClaimTemplates:
    # Main data volume, mounted at /data
    - metadata:
        name: data
      spec:
        storageClassName: heliosdb-io2
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 500Gi
    # Write-ahead-log volume, mounted at /wal
    - metadata:
        name: wal
      spec:
        storageClassName: heliosdb-io2
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi

3.3.3 Compute Nodes

compute-deployment.yaml:

# LoadBalancer Service in front of the compute pods. Every port listed here
# is published on the load balancer.
# NOTE(review): that includes the metrics port (9090) - confirm it is meant
# to be reachable from outside the cluster.
kind: Service
apiVersion: v1
metadata:
  namespace: heliosdb
  name: heliosdb-compute
  labels:
    component: compute
    app: heliosdb
spec:
  type: LoadBalancer
  selector:
    component: compute
    app: heliosdb
  ports:
    - name: postgres
      port: 5432
      targetPort: 5432
    - name: graphql
      port: 10000
      targetPort: 10000
    - name: metrics
      port: 9090
      targetPort: 9090
---
# Stateless compute tier: serves the PostgreSQL-protocol listener (5432) and
# the GraphQL listener (10000), and exposes metrics on 9090.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: heliosdb-compute
  namespace: heliosdb
spec:
  # spec.replicas is intentionally not set: the heliosdb-compute-hpa
  # HorizontalPodAutoscaler owns the replica count (min 3, max 20). A fixed
  # value here would reset the autoscaler's decision on every "kubectl apply".
  # Until the HPA is applied, the Deployment runs with the default of 1 replica.
  selector:
    matchLabels:
      app: heliosdb
      component: compute
  template:
    metadata:
      labels:
        app: heliosdb
        component: compute
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
    spec:
      affinity:
        # Soft rule: prefer one compute pod per node.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: component
                      operator: In
                      values:
                        - compute
                topologyKey: kubernetes.io/hostname
      containers:
        - name: compute
          image: heliosdb/heliosdb:6.0.0
          imagePullPolicy: IfNotPresent
          command:
            - /usr/local/bin/heliosdb-compute
          args:
            - --listen-addr=0.0.0.0:5432
            - --graphql-addr=0.0.0.0:10000
            # Endpoints are the headless Services of the other two tiers.
            - --metadata-endpoints=heliosdb-metadata:7001
            - --storage-endpoints=heliosdb-storage:7002
            - --max-connections=1000
            - --enable-query-cache=true
            - --enable-ai-optimization=true
          env:
            - name: RUST_LOG
              value: "info,heliosdb=debug"
            # Read from the "database-url" key of the heliosdb-secrets Secret,
            # which must exist in this namespace before the pods can start.
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: heliosdb-secrets
                  key: database-url
          ports:
            - containerPort: 5432
              name: postgres
            - containerPort: 10000
              name: graphql
            - containerPort: 9090
              name: metrics
          resources:
            requests:
              cpu: "4"
              memory: 8Gi
            limits:
              cpu: "8"
              memory: 16Gi
          volumeMounts:
            - name: config
              mountPath: /etc/heliosdb
            - name: cache
              mountPath: /cache
          # Both probes only check that the 5432 listener accepts TCP connections.
          livenessProbe:
            tcpSocket:
              port: 5432
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            tcpSocket:
              port: 5432
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
      volumes:
        - name: config
          configMap:
            name: heliosdb-config
        # Node-local scratch space capped at 10Gi; contents are lost when the
        # pod is removed from its node.
        - name: cache
          emptyDir:
            sizeLimit: 10Gi

3.4 Service Configuration

Service Types:

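  1. ClusterIP (default): Reachable only from inside the cluster
  2. NodePort: External access through a static port on every node (default range 30000-32767)
  3. LoadBalancer: Provisions a cloud provider load balancer in front of the Service
  4. ExternalName: Maps the Service to an external DNS name via a CNAME record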

3.5 Ingress Setup

ingress.yaml:

# HTTPS entry point for the GraphQL API, served by the NGINX ingress
# controller with a certificate issued through the letsencrypt-prod issuer.
#
# Only HTTP traffic is routed here. The PostgreSQL wire protocol on port 5432
# is not HTTP and cannot be proxied by an Ingress rule; it is already exposed
# by the heliosdb-compute LoadBalancer Service (or can be published through
# the ingress controller's TCP services ConfigMap).
# NOTE(review): the install steps shown in this guide do not include an
# ingress controller or cert-manager - confirm both are installed.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: heliosdb-ingress
  namespace: heliosdb
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # No backend-protocol annotation: the backend speaks plain HTTP, which is
    # the controller default ("TCP" is not an accepted value).
    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
    # Timeouts are in seconds.
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
spec:
  # Replaces the deprecated kubernetes.io/ingress.class annotation.
  ingressClassName: nginx
  tls:
    - hosts:
        - api.heliosdb.example.com
      secretName: heliosdb-tls
  rules:
    - host: api.heliosdb.example.com
      http:
        paths:
          - path: /graphql
            pathType: Prefix
            backend:
              service:
                name: heliosdb-compute
                port:
                  number: 10000

3.6 Auto-scaling Configuration

3.6.1 Horizontal Pod Autoscaler (HPA)

compute-hpa.yaml:

# HorizontalPodAutoscaler for the heliosdb-compute Deployment: keeps between
# 3 and 20 replicas based on CPU, memory and a per-pod query latency metric.
kind: HorizontalPodAutoscaler
apiVersion: autoscaling/v2
metadata:
  namespace: heliosdb
  name: heliosdb-compute-hpa
spec:
  minReplicas: 3
  maxReplicas: 20
  scaleTargetRef:
    kind: Deployment
    name: heliosdb-compute
    apiVersion: apps/v1
  metrics:
    # Average CPU utilization across pods, as a percentage of requests.
    - type: Resource
      resource:
        name: cpu
        target:
          averageUtilization: 70
          type: Utilization
    # Average memory utilization across pods, as a percentage of requests.
    - type: Resource
      resource:
        name: memory
        target:
          averageUtilization: 80
          type: Utilization
    # Per-pod custom metric.
    # NOTE(review): Pods-type metrics are served by a custom metrics API
    # adapter, which the install steps in this guide do not show - confirm.
    - type: Pods
      pods:
        metric:
          name: heliosdb_query_latency_seconds
        target:
          averageValue: '500m'  # 0.5 in metric units, i.e. 500ms
          type: AverageValue
  behavior:
    # Scale down cautiously: 5 minute stabilization window, and of the two
    # policies the one permitting the smaller change applies.
    scaleDown:
      selectPolicy: Min
      stabilizationWindowSeconds: 300
      policies:
        - periodSeconds: 60
          type: Percent
          value: 10
        - periodSeconds: 60
          type: Pods
          value: 1
    # Scale up quickly: 1 minute stabilization window, and of the two
    # policies the one permitting the larger change applies.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 60
      policies:
        - periodSeconds: 60
          type: Percent
          value: 50
        - periodSeconds: 60
          type: Pods
          value: 2

3.6.2 Vertical Pod Autoscaler (VPA)

storage-vpa.yaml:

# VerticalPodAutoscaler for the heliosdb-storage StatefulSet. It adjusts the
# CPU and memory of the "storage" container within the bounds given below.
# NOTE(review): the install steps shown in this guide do not include the VPA
# components - confirm they are installed before applying this manifest.
kind: VerticalPodAutoscaler
apiVersion: autoscaling.k8s.io/v1
metadata:
  namespace: heliosdb
  name: heliosdb-storage-vpa
spec:
  # "Auto" lets the autoscaler apply its recommendations to running pods.
  updatePolicy:
    updateMode: 'Auto'
  targetRef:
    kind: StatefulSet
    name: heliosdb-storage
    apiVersion: apps/v1
  resourcePolicy:
    containerPolicies:
      - containerName: storage
        controlledResources:
          - cpu
          - memory
        # Lower and upper bounds for the recommendations.
        minAllowed:
          memory: 8Gi
          cpu: '2'
        maxAllowed:
          memory: 64Gi
          cpu: '16'