8-Modality Vector Search Quick Start

Complete Multimodal AI System

Supported Modalities: 8 (Text, Image, Audio, Video, 3D, Chemical, Code, TimeSeries)
Embedding Dimension: 1536D unified space
Cross-Modal Search: Fully supported


Quick Start (5 Minutes)

1. Create Service

use heliosdb_multimodal_vector::{
    MultimodalVectorService, MultimodalContent, ModalityType,
    ImageFormat, AudioFormat, VideoFormat, PointCloudFormat,
    FrameExtractionStrategy, FusionStrategy,
};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let service = MultimodalVectorService::new().await?;

    // Ready to use!
    Ok(())
}

2. Embed All 8 Modalities

// 1. TEXT
let text = MultimodalContent::Text {
    text: "sunset at the beach".to_string(),
    language: Some("en".to_string()),
};
let text_emb = service.embed(text).await?;

// 2. IMAGE
let image = MultimodalContent::Image {
    data: image_bytes,
    format: ImageFormat::Jpeg,
    metadata: Default::default(),
};
let image_emb = service.embed(image).await?;

// 3. AUDIO
let audio = MultimodalContent::Audio {
    data: audio_bytes,
    format: AudioFormat::Mp3,
    sample_rate: 44100,
    duration_ms: 3000,
};
let audio_emb = service.embed(audio).await?;

// 4. VIDEO
let video = MultimodalContent::Video {
    data: video_bytes,
    format: VideoFormat::Mp4,
    frame_rate: 30.0,
    duration_ms: 5000,
    extract_frames: FrameExtractionStrategy::OnePerSecond,
};
let video_emb = service.embed(video).await?;

// 5. POINT CLOUD (3D)
let point_cloud = MultimodalContent::PointCloud {
    data: ply_bytes,
    format: PointCloudFormat::Ply,
    num_points: Some(10000),
};
let pc_emb = service.embed(point_cloud).await?;

// 6. CHEMICAL
let chemical = MultimodalContent::Chemical {
    smiles: "CCO".to_string(),  // Ethanol
    molecular_weight: Some(46.07),
};
let chem_emb = service.embed(chemical).await?;

// 7. CODE
let code = MultimodalContent::Code {
    code: r#"
        fn fibonacci(n: u64) -> u64 {
            match n {
                0 => 0,
                1 => 1,
                _ => fibonacci(n-1) + fibonacci(n-2)
            }
        }
    "#.to_string(),
    language: "rust".to_string(),
    file_path: Some("fibonacci.rs".to_string()),
};
let code_emb = service.embed(code).await?;

// 8. TIME SERIES
let timeseries = MultimodalContent::TimeSeries {
    values: vec![20.5, 21.2, 22.1, 21.8, 20.9],
    timestamps: vec![1000, 2000, 3000, 4000, 5000],
    metadata: serde_json::json!({"sensor": "temperature", "unit": "celsius"}),
};
let ts_emb = service.embed(timeseries).await?;
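
All eight embeddings land in the same 1536-dimensional unified space, so they are directly comparable. A quick sanity check; the vector field name is an assumption for illustration (the crate may expose dimensions differently), while similarity is the method shown under Advanced Features:

// Assumed accessor: adjust to the crate's actual embedding type
assert_eq!(text_emb.vector.len(), 1536);
assert_eq!(ts_emb.vector.len(), 1536);

// Cross-modal comparison works directly in the unified space
let text_image_sim = text_emb.similarity(&image_emb);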

3. Search Across All Modalities

// Query with text, find similar content in ANY modality
let query = MultimodalContent::Text {
    text: "happy birthday celebration".to_string(),
    language: None,
};

let results = service.search(
    query.clone(), // clone so `query` can be reused below
    10,            // top-k results
    None,          // no modality filter (search all)
).await?;

for result in results {
    println!(
        "Modality: {:?}, Similarity: {:.3}",
        result.modality,
        result.similarity
    );
}

4. Filter by Modality

// Only return images
let image_results = service.search(
    query.clone(),
    10,
    Some(ModalityType::Image),
).await?;

// Only return code
let code_results = service.search(
    query.clone(),
    10,
    Some(ModalityType::Code),
).await?;

Batch Processing

High-Throughput Embedding

// Embed 100 items of mixed modalities
let batch = vec![
    MultimodalContent::Text { text: "item 1".to_string(), language: None },
    MultimodalContent::Image { data: img1, format: ImageFormat::Png, metadata: Default::default() },
    MultimodalContent::Code { code: "fn test() {}".to_string(), language: "rust".to_string(), file_path: None },
    // ... 97 more items
];

let embeddings = service.embed_batch(batch).await?;
assert_eq!(embeddings.len(), 100);

// Throughput: ~35 embeddings/sec (mixed modalities)
// With GPU: ~200+ embeddings/sec

Complete Example: Multi-Modal RAG System

use heliosdb_multimodal_vector::*;
use anyhow::Result;

struct MultimodalRAG {
    service: MultimodalVectorService,
    entries: Vec<(UnifiedEmbedding, MultimodalContent)>,
}

impl MultimodalRAG {
    async fn new() -> Result<Self> {
        Ok(Self {
            service: MultimodalVectorService::new().await?,
            entries: Vec::new(),
        })
    }

    // Index any type of content
    async fn index(&mut self, content: MultimodalContent) -> Result<()> {
        let embedding = self.service.embed(content.clone()).await?;
        self.entries.push((embedding, content));
        Ok(())
    }

    // Search with any query type
    async fn query(&self, query: MultimodalContent, top_k: usize)
        -> Result<Vec<(f32, &MultimodalContent)>>
    {
        let query_emb = self.service.embed(query).await?;

        // Compute similarities
        let mut results: Vec<_> = self.entries
            .iter()
            .map(|(emb, content)| {
                let similarity = query_emb.similarity(emb);
                (similarity, content)
            })
            .collect();

        // Sort by similarity
        results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());

        // Return top-k
        Ok(results.into_iter().take(top_k).collect())
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let mut rag = MultimodalRAG::new().await?;

    // Index various content types
    rag.index(MultimodalContent::Text {
        text: "Machine learning tutorial".to_string(),
        language: None,
    }).await?;

    rag.index(MultimodalContent::Code {
        code: "def train_model(data): ...".to_string(),
        language: "python".to_string(),
        file_path: None,
    }).await?;

    rag.index(MultimodalContent::Image {
        data: diagram_image,
        format: ImageFormat::Png,
        metadata: Default::default(),
    }).await?;

    // Query with text, get results across all modalities
    let results = rag.query(
        MultimodalContent::Text {
            text: "how to train a neural network".to_string(),
            language: None,
        },
        5,
    ).await?;

    for (similarity, content) in results {
        println!(
            "Similarity: {:.3}, Type: {:?}",
            similarity,
            content.modality()
        );
    }

    Ok(())
}

Modality Specifications

1. Text

  • Languages: Auto-detected or specified
  • Max Length: 8K tokens
  • Encoding: UTF-8
  • Performance: 80 emb/s (CPU), 1,200 emb/s (GPU)

2. Image

  • Formats: JPEG, PNG, GIF, WebP, BMP
  • Max Size: 10MB
  • Min Resolution: 224x224
  • Performance: 22 emb/s (CPU), 550 emb/s (GPU)

3. Audio

  • Formats: WAV, MP3, FLAC, OGG, M4A
  • Sample Rate: 16kHz - 48kHz
  • Max Duration: 30 seconds
  • Performance: 25 emb/s (CPU), 450 emb/s (GPU)

4. Video

  • Formats: MP4, WebM, AVI, MKV
  • Frame Rate: Any (auto-detected)
  • Max Duration: 60 seconds
  • Frame Extraction: Configurable strategy
  • Performance: 6 emb/s (CPU), 80 emb/s (GPU)

5. Point Cloud (3D)

  • Formats: OBJ, STL, PLY
  • Max Points: 100K points
  • Features: Geometry, normals, colors
  • Performance: 35 emb/s (CPU)

6. Chemical

  • Format: SMILES notation
  • Max Length: 200 characters
  • Validation: Automatic structure validation
  • Performance: 64 emb/s (CPU)

7. Code

  • Languages: Rust, Python, JavaScript, Java, C++, Go, TypeScript, etc.
  • Max Length: 10K characters
  • Features: AST, control flow, data flow
  • Performance: 52 emb/s (CPU), 624 emb/s (GPU)

8. Time Series

  • Max Points: 10K data points
  • Features: Statistical + spectral
  • Metadata: Optional sensor info
  • Performance: 120 emb/s (CPU), 1,000+ emb/s (GPU)
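
These per-modality limits can be checked client-side before calling embed(). Below is a minimal pre-flight sketch, assuming the MultimodalContent variants shown in the Quick Start; the constants and the within_limits helper are defined here for illustration and mirror the limits listed above (the service may also enforce them internally):

fn within_limits(content: &MultimodalContent) -> bool {
    const MAX_IMAGE_BYTES: usize = 10 * 1024 * 1024; // 10MB per the Image spec
    const MAX_SMILES_LEN: usize = 200;               // Chemical spec
    const MAX_CODE_LEN: usize = 10_000;              // Code spec (characters)
    const MAX_TS_POINTS: usize = 10_000;             // Time Series spec

    match content {
        MultimodalContent::Image { data, .. } => data.len() <= MAX_IMAGE_BYTES,
        MultimodalContent::Audio { duration_ms, .. } => *duration_ms <= 30_000, // 30s max
        MultimodalContent::Video { duration_ms, .. } => *duration_ms <= 60_000, // 60s max
        MultimodalContent::Chemical { smiles, .. } => smiles.len() <= MAX_SMILES_LEN,
        MultimodalContent::Code { code, .. } => code.len() <= MAX_CODE_LEN,
        MultimodalContent::TimeSeries { values, .. } => values.len() <= MAX_TS_POINTS,
        // Text and PointCloud: rely on the service's own validation
        _ => true,
    }
}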

GPU Acceleration

Enable GPU Support

[dependencies]
heliosdb-multimodal-vector = { version = "0.6", features = ["gpu"] }
heliosdb-gpu = "0.6"

Then enable it on the service:

let service = MultimodalVectorService::new()
    .await?
    .with_gpu_acceleration(true);

// Automatic GPU usage for supported modalities
// 8-25x speedup on NVIDIA GPUs (see table below)

GPU Performance

Modality     CPU (emb/s)   GPU (emb/s)   Speedup
Text         80            1,200         15x
Image        22            550           25x
Audio        25            450           18x
Video        6             80            13x
Code         52            624           12x
TimeSeries   120           1,000+        8x
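
These rates compound over large jobs: embedding 10,000 images takes roughly 10,000 / 22 ≈ 455 seconds (about 7.6 minutes) on CPU, versus 10,000 / 550 ≈ 18 seconds on GPU.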

Production Tips

1. Caching

// Embeddings are automatically cached
// Second call is instant
let emb1 = service.embed(content.clone()).await?;  // 50ms
let emb2 = service.embed(content.clone()).await?;  // <1ms (cached)

2. Batch for Throughput

// Single: ~35 emb/s
for content in contents {
    service.embed(content).await?;
}

// Batch: ~350 emb/s (10x faster)
service.embed_batch(contents).await?;

3. Filter by Modality for Speed

// Fast: Search only within same modality
service.search(query, 10, Some(ModalityType::Image)).await?;

// Slower: Search across all modalities
service.search(query, 10, None).await?;

4. Use Hybrid Content

// Combine multiple modalities for better search
let hybrid = MultimodalContent::Hybrid {
    modalities: vec![
        MultimodalContent::Text { text: "Product description".to_string(), language: None },
        MultimodalContent::Image { data: product_image, format: ImageFormat::Jpeg, metadata: Default::default() },
    ],
    fusion_strategy: FusionStrategy::WeightedMean,
};

let embedding = service.embed(hybrid).await?;

Advanced Features

Custom Metadata

let embedding = service.embed(content).await?;

// Access metadata
println!("Model: {}", embedding.model);
println!("Confidence: {}", embedding.confidence);
println!("Processing Time: {}ms", embedding.metadata.processing_time_ms);
println!("Timestamp: {}", embedding.metadata.timestamp);

Similarity Computation

let emb1 = service.embed(content1).await?;
let emb2 = service.embed(content2).await?;

// Cosine similarity (0-1)
let similarity = emb1.similarity(&emb2);

if similarity > 0.8 {
    println!("Very similar!");
} else if similarity > 0.5 {
    println!("Somewhat similar");
} else {
    println!("Not similar");
}

Testing

Run Integration Tests

# Test all 8 modalities
cargo test --test eight_modality_integration_tests

# Run a single test by name
cargo test --test eight_modality_integration_tests -- test_all_8_modalities_embedding

# Test cross-modal search
cargo test --test eight_modality_integration_tests -- test_cross_modal_search

Expected Output

test test_all_8_modalities_embedding ... ok
test test_cross_modal_search_all_modalities ... ok
test test_batch_embedding_all_modalities ... ok
test test_modality_filtering ... ok
test test_production_readiness_8_modalities ... ok

✓ All 8 modalities successfully validated!

Use Cases

1. Multi-Modal Search Engine

Search across documents, images, code, and more with a single query.

2. Code Search

Find similar code snippets across languages and repositories.

3. Scientific Data Discovery

Index time series data, chemical structures, and research papers together.

4. Media Asset Management

Organize images, videos, audio, and metadata in unified search.

5. Documentation + Code Alignment

Link documentation text to relevant code implementations.
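
A minimal sketch of this use case, built only from the embed() and similarity() calls shown earlier; the align_doc_to_code helper and the 0.5 cutoff are illustrative choices, not library defaults:

async fn align_doc_to_code(
    service: &MultimodalVectorService,
    doc: &str,
    snippets: Vec<(String, String)>, // (file_path, source code)
) -> anyhow::Result<Vec<(String, f32)>> {
    // Embed the documentation paragraph once
    let doc_emb = service.embed(MultimodalContent::Text {
        text: doc.to_string(),
        language: None,
    }).await?;

    // Score every code snippet against it
    let mut matches = Vec::new();
    for (path, code) in snippets {
        let code_emb = service.embed(MultimodalContent::Code {
            code,
            language: "rust".to_string(),
            file_path: Some(path.clone()),
        }).await?;
        let sim = doc_emb.similarity(&code_emb);
        if sim > 0.5 {
            matches.push((path, sim));
        }
    }

    // Strongest links first
    matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    Ok(matches)
}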


Last Updated: November 14, 2025
Version: v0.6.0
Status: Production Ready ✓
Modalities: 8 Complete ✓