summaryrefslogtreecommitdiff
path: root/candle-transformers/src/models/moondream.rs
diff options
context:
space:
mode:
Diffstat (limited to 'candle-transformers/src/models/moondream.rs')
-rw-r--r--candle-transformers/src/models/moondream.rs30
1 files changed, 28 insertions, 2 deletions
diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs
index d351d7c0..a9dc9b7d 100644
--- a/candle-transformers/src/models/moondream.rs
+++ b/candle-transformers/src/models/moondream.rs
@@ -1,13 +1,39 @@
//! MoonDream Model vision-to-text
//!
+//!
+//! Moondream is a computer-vision model that can answer real-world questions about images.
+//! It's lightweight with only 1.6B parameters, enabling it to run on mobile phones and edge devices.
+//! [MoonDream Original Implementation](https://github.com/vikhyat/moondream)
+//!
//! The model consists of:
//! - Vision encoder using a ViT-style architecture
//! - Text decoder based on Microsoft's Phi model
//! - Vision projection module to align vision and text embeddings
//!
-//! References:
-//! - [MoonDream Original Implementation](https://github.com/vikhyat/moondream)
+//! # Examples
+//!
+//! <img src="https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg" width="200">
+//!
+//! ```bash
+//! # download an example image
+//! wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg
+//!
+//! # Now you can run Moondream from the `candle-examples` crate:
+//! cargo run --example moondream \
+//! --release -- \
+//! --prompt "What is the girl eating?" \
+//! --image "./demo-1.jpg"
//!
+//! > avx: false, neon: true, simd128: false, f16c: false
+//! > temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64
+//! > retrieved the files in 3.395583ms
+//! > Running on CPU, to run on GPU(metal), build this example with `--features metal`
+//! > loaded the model in 5.485493792s
+//! > loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s
+//! > starting the inference loop
+//! > The girl is eating a hamburger.<
+//! > 9 tokens generated (0.68 token/s)
+//! ```
use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel};
use crate::models::with_tracing::{layer_norm, linear_b, LayerNorm, Linear};