Skip to content

Production Deployment: Backup & Disaster Recovery

Part of: Production Deployment Guide


8.1 Backup Strategies

Full Backup:

# Create full backup
heliosdb-cli backup create \
  --type full \
  --destination s3://heliosdb-backups-prod/full/$(date +%Y%m%d) \
  --compression zstd \
  --encryption aes-256-gcm \
  --parallelism 8

# Schedule with cron (weekly: Sundays at 02:00)
0 2 * * 0 /usr/local/bin/heliosdb-backup-full.sh

Incremental Backup:

# Create incremental backup
heliosdb-cli backup create \
  --type incremental \
  --base-backup s3://heliosdb-backups-prod/full/20251101 \
  --destination s3://heliosdb-backups-prod/incremental/$(date +%Y%m%d) \
  --compression zstd \
  --parallelism 4

# Schedule with cron (Monday-Saturday at 02:00; the weekly full backup runs on Sunday)
0 2 * * 1-6 /usr/local/bin/heliosdb-backup-incremental.sh

Continuous WAL Archiving:

[backup.wal]
enabled = true
destination = "s3://heliosdb-backups-prod/wal"
archive_interval_sec = 60
max_archive_size_mb = 100
compression_enabled = true

8.2 Point-in-Time Recovery

Restore from Backup:

# List available backups
heliosdb-cli backup list --destination s3://heliosdb-backups-prod

# Restore to a specific point in time
heliosdb-cli restore \
  --backup s3://heliosdb-backups-prod/full/20251101 \
  --target-time "2025-11-02 14:30:00 UTC" \
  --data-dir /data/restore \
  --parallelism 8

# Verify restored data
heliosdb-cli restore verify --data-dir /data/restore

8.3 Multi-Region Failover

Cross-Region Replication:

[replication.cross_region]
enabled = true
regions = ["us-east-1", "us-west-2", "eu-west-1"]
replication_mode = "async"
max_replication_lag_sec = 30
automatic_failover = true
failover_timeout_sec = 60

[replication.cross_region.us_west_2]
priority = 1  # Primary failover target
endpoint = "heliosdb-us-west-2.example.com:5432"

[replication.cross_region.eu_west_1]
priority = 2  # Secondary failover target
endpoint = "heliosdb-eu-west-1.example.com:5432"

Failover Procedure:

# Manual failover
heliosdb-cli failover \
  --from-region us-east-1 \
  --to-region us-west-2 \
  --verify-replication

# Automatic failover (configured in heliosdb.toml via automatic_failover = true)
# Triggers when the primary region has been unavailable for longer than
# failover_timeout_sec (60 seconds in the configuration shown above)