Skip to content

Cloud Catalogs Quick Start Guide

Quick reference for using the Azure Data Lake Gen2, Google Cloud Storage (GCS), and Databricks Unity Catalog integrations in HeliosDB.

Table of Contents

Azure Data Lake Gen2

Basic Setup

use heliosdb_catalog_unified::{
    azure::{AzureCatalog, AzureCatalogConfig},
    catalog::CatalogBackend,
};
use std::collections::HashMap;

let config = AzureCatalogConfig {
    account_name: "mystorageaccount".to_string(),
    container_name: "data".to_string(),
    tenant_id: Some(std::env::var("AZURE_TENANT_ID")?),
    client_id: Some(std::env::var("AZURE_CLIENT_ID")?),
    client_secret: Some(std::env::var("AZURE_CLIENT_SECRET")?),
    use_managed_identity: false,
    purview_account: None,
    enable_governance: false,
    region: "eastus".to_string(),
    properties: HashMap::new(),
};

let catalog = AzureCatalog::new("azure_prod".to_string(), config).await?;

With Managed Identity

let config = AzureCatalogConfig {
    account_name: "mystorageaccount".to_string(),
    container_name: "data".to_string(),
    tenant_id: None,
    client_id: None,
    client_secret: None,
    use_managed_identity: true,  // Use MSI
    purview_account: None,
    enable_governance: false,
    region: "eastus".to_string(),
    properties: HashMap::new(),
};

With Azure Purview (Data Governance)

let config = AzureCatalogConfig {
    account_name: "mystorageaccount".to_string(),
    container_name: "data".to_string(),
    tenant_id: Some(std::env::var("AZURE_TENANT_ID")?),
    client_id: Some(std::env::var("AZURE_CLIENT_ID")?),
    client_secret: Some(std::env::var("AZURE_CLIENT_SECRET")?),
    use_managed_identity: false,
    purview_account: Some("mypurview".to_string()),
    enable_governance: true,  // Enable Purview tracking
    region: "eastus".to_string(),
    properties: HashMap::new(),
};

Environment Variables

export AZURE_TENANT_ID="your-tenant-id"
export AZURE_CLIENT_ID="your-client-id"
export AZURE_CLIENT_SECRET="your-client-secret"

Google Cloud Storage

Basic Setup

use heliosdb_catalog_unified::{
    gcs::{GcsCatalog, GcsCatalogConfig},
    catalog::CatalogBackend,
};
use std::collections::HashMap;

let config = GcsCatalogConfig {
    project_id: "my-gcp-project".to_string(),
    bucket_name: "my-data-bucket".to_string(),
    service_account_key: Some("/path/to/service-account-key.json".to_string()),
    use_workload_identity: false,
    region: "us-central1".to_string(),
    metadata_prefix: "metadata".to_string(),
    enable_versioning: false,
    properties: HashMap::new(),
};

let catalog = GcsCatalog::new("gcs_prod".to_string(), config).await?;

With Workload Identity

let config = GcsCatalogConfig {
    project_id: "my-gcp-project".to_string(),
    bucket_name: "my-data-bucket".to_string(),
    service_account_key: None,
    use_workload_identity: true,  // Use GKE workload identity
    region: "us-central1".to_string(),
    metadata_prefix: "metadata".to_string(),
    enable_versioning: false,
    properties: HashMap::new(),
};

With Object Versioning

let config = GcsCatalogConfig {
    project_id: "my-gcp-project".to_string(),
    bucket_name: "my-data-bucket".to_string(),
    service_account_key: Some("/path/to/key.json".to_string()),
    use_workload_identity: false,
    region: "us-central1".to_string(),
    metadata_prefix: "metadata".to_string(),
    enable_versioning: true,  // Enable versioning
    properties: HashMap::new(),
};

Environment Variables

export GCP_PROJECT_ID="my-gcp-project"
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json"

Unity Catalog (Databricks)

Basic Setup with Personal Access Token

use heliosdb_catalog_unified::{
    unity::{UnityCatalog, UnityCatalogConfig},
    catalog::CatalogBackend,
};
use std::collections::HashMap;

let config = UnityCatalogConfig {
    workspace_url: "https://my-workspace.cloud.databricks.com".to_string(),
    catalog_name: "main".to_string(),
    token: Some(std::env::var("DATABRICKS_TOKEN")?),
    client_id: None,
    client_secret: None,
    use_service_principal: false,
    cloud_provider: "aws".to_string(),
    enable_delta_sharing: false,
    properties: HashMap::new(),
};

let catalog = UnityCatalog::new("unity_prod".to_string(), config).await?;

With Service Principal

let config = UnityCatalogConfig {
    workspace_url: "https://my-workspace.cloud.databricks.com".to_string(),
    catalog_name: "main".to_string(),
    token: None,
    client_id: Some(std::env::var("DATABRICKS_CLIENT_ID")?),
    client_secret: Some(std::env::var("DATABRICKS_CLIENT_SECRET")?),
    use_service_principal: true,  // Use service principal
    cloud_provider: "aws".to_string(),
    enable_delta_sharing: false,
    properties: HashMap::new(),
};

Multi-Cloud Configurations

AWS

let config = UnityCatalogConfig {
    workspace_url: "https://my-workspace.cloud.databricks.com".to_string(),
    catalog_name: "main".to_string(),
    token: Some(std::env::var("DATABRICKS_TOKEN")?),
    // ... other fields ...
    cloud_provider: "aws".to_string(),
    // ... other fields ...
};

Azure

let config = UnityCatalogConfig {
    workspace_url: "https://my-workspace.azuredatabricks.net".to_string(),
    catalog_name: "main".to_string(),
    token: Some(std::env::var("DATABRICKS_TOKEN")?),
    // ... other fields ...
    cloud_provider: "azure".to_string(),
    // ... other fields ...
};

GCP

let config = UnityCatalogConfig {
    workspace_url: "https://my-workspace.gcp.databricks.com".to_string(),
    catalog_name: "main".to_string(),
    token: Some(std::env::var("DATABRICKS_TOKEN")?),
    // ... other fields ...
    cloud_provider: "gcp".to_string(),
    // ... other fields ...
};

Environment Variables

export DATABRICKS_WORKSPACE_URL="https://my-workspace.cloud.databricks.com"
export DATABRICKS_TOKEN="dapi1234567890abcdef"

# OR for service principal
export DATABRICKS_CLIENT_ID="your-client-id"
export DATABRICKS_CLIENT_SECRET="your-client-secret"

Common Operations

List Tables

// Works for all catalog types
let tables = catalog.list_tables().await?;
for table in tables {
    println!("Table: {}", table.fully_qualified_name());
}

Get Table Metadata

use heliosdb_catalog_unified::catalog::UnifiedTableIdentifier;

let table_id = UnifiedTableIdentifier::parse("catalog.schema.table")?;
let metadata = catalog.get_table(&table_id).await?;

println!("Format: {:?}", metadata.format);
println!("Location: {}", metadata.location);
println!("Last Modified: {}", metadata.last_modified);

Check Table Existence

let table_id = UnifiedTableIdentifier::new(
    Some("catalog".to_string()),
    vec!["schema".to_string()],
    "table",
);

let exists = catalog.table_exists(&table_id).await?;
println!("Table exists: {}", exists);

Create Namespace

use std::collections::HashMap;

let namespace = vec!["analytics".to_string()];
let mut properties = HashMap::new();
properties.insert("description".to_string(), "Analytics namespace".to_string());

catalog.create_namespace(namespace, properties).await?;

List Namespaces

let namespaces = catalog.list_namespaces().await?;
for namespace in namespaces {
    println!("Namespace: {}", namespace.join("."));
}

Drop Namespace

let namespace = vec!["analytics".to_string()];
let cascade = true;  // Delete all contents

catalog.drop_namespace(namespace, cascade).await?;

Using with Unified Catalog

Configuration

use heliosdb_catalog_unified::{
    UnifiedCatalog, UnifiedCatalogConfig,
    config::{BackendConfig, BackendType},
};
use std::collections::HashMap;

// Configure Azure backend
let mut azure_props = HashMap::new();
azure_props.insert("account_name".to_string(), "mystorageaccount".to_string());
azure_props.insert("container_name".to_string(), "data".to_string());
azure_props.insert("tenant_id".to_string(), std::env::var("AZURE_TENANT_ID")?);

let azure_backend = BackendConfig {
    name: "azure_prod".to_string(),
    backend_type: BackendType::Azure,
    uri: None,
    warehouse: None,
    properties: azure_props,
    enabled: true,
};

// Configure GCS backend
let mut gcs_props = HashMap::new();
gcs_props.insert("project_id".to_string(), "my-project".to_string());
gcs_props.insert("bucket_name".to_string(), "my-bucket".to_string());
gcs_props.insert("service_account_key".to_string(), "/path/to/key.json".to_string());

let gcs_backend = BackendConfig {
    name: "gcs_prod".to_string(),
    backend_type: BackendType::Gcs,
    uri: None,
    warehouse: None,
    properties: gcs_props,
    enabled: true,
};

// Configure Unity backend
let mut unity_props = HashMap::new();
unity_props.insert("catalog_name".to_string(), "main".to_string());
unity_props.insert("token".to_string(), std::env::var("DATABRICKS_TOKEN")?);

let unity_backend = BackendConfig {
    name: "unity_prod".to_string(),
    backend_type: BackendType::Unity,
    uri: Some("https://my-workspace.cloud.databricks.com".to_string()),
    warehouse: None,
    properties: unity_props,
    enabled: true,
};

// Create unified catalog
let config = UnifiedCatalogConfig {
    name: "multi_cloud".to_string(),
    backends: vec![azure_backend, gcs_backend, unity_backend],
    cache_config: Default::default(),
    federation_config: Default::default(),
    scaling_config: Default::default(),
    query_timeout: std::time::Duration::from_secs(30),
};

let unified_catalog = UnifiedCatalog::new(config).await?;

Query Across Catalogs

// Get table from Azure
let azure_table = unified_catalog.get_table("azure_prod.sales.orders").await?;

// Get table from GCS
let gcs_table = unified_catalog.get_table("gcs_prod.warehouse.customers").await?;

// Get table from Unity
let unity_table = unified_catalog.get_table("unity_prod.default.products").await?;

// List tables from all catalogs
let all_tables = unified_catalog.list_tables().await?;

Troubleshooting

Azure Issues

Authentication Errors

# Verify credentials
az login
az account show

# Test service principal
az login --service-principal \
  --username $AZURE_CLIENT_ID \
  --password $AZURE_CLIENT_SECRET \
  --tenant $AZURE_TENANT_ID

Storage Access Issues

# Check storage account access
az storage account show --name mystorageaccount

# Check container permissions
az storage container show \
  --account-name mystorageaccount \
  --name data

GCS Issues

Authentication Errors

# Verify service account
gcloud auth activate-service-account \
  --key-file=/path/to/key.json

# Test access
gcloud projects list

Bucket Access Issues

# Check bucket exists
gsutil ls gs://my-bucket

# Check permissions
gsutil iam get gs://my-bucket

Unity Catalog Issues

Token Errors

# Verify token
curl -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  https://my-workspace.cloud.databricks.com/api/2.0/clusters/list

Catalog Access Issues

# List catalogs
curl -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  https://my-workspace.cloud.databricks.com/api/2.1/unity-catalog/catalogs

Common Error Messages

| Error | Likely Cause | Solution |
|-------|--------------|----------|
| AuthenticationError: Token acquisition failed | Invalid credentials | Verify tenant ID, client ID, and secret |
| ConnectionError: Network timeout | Network issues or wrong URL | Check connectivity and endpoint URLs |
| NotFound: Table not found | Table doesn't exist or wrong path | Verify table path and catalog name |
| PermissionDenied | Insufficient permissions | Check IAM roles and permissions |
| ConfigurationError: Missing project_id | Incomplete configuration | Provide all required configuration fields |

Best Practices

Security

  • Use managed identities/workload identities in production
  • Store credentials in secure vaults (Azure Key Vault, GCP Secret Manager)
  • Rotate credentials regularly
  • Use least-privilege IAM policies

Performance

  • Enable metadata caching for frequently accessed tables
  • Use batch operations when available
  • Configure appropriate timeouts
  • Monitor API rate limits

Reliability

  • Implement retry logic with exponential backoff
  • Handle token expiration gracefully
  • Log errors with context
  • Use health checks to verify connectivity

Additional Resources

Support

For issues or questions:

1. Check the troubleshooting section
2. Review error logs with RUST_LOG=debug
3. Consult cloud provider documentation
4. Open an issue in the HeliosDB repository


Last Updated: 2025-11-25
Version: 6.0.0