Diffstat (limited to 'candle-transformers/src/models/llava/mod.rs')
 candle-transformers/src/models/llava/mod.rs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/candle-transformers/src/models/llava/mod.rs b/candle-transformers/src/models/llava/mod.rs
index 44a00bf9..c252dbed 100644
--- a/candle-transformers/src/models/llava/mod.rs
+++ b/candle-transformers/src/models/llava/mod.rs
@@ -1,13 +1,12 @@
//! The LLaVA (Large Language and Vision Assistant) model.
//!
//! This provides the main model implementation combining a vision tower (CLIP) with
-//! language model (Llama) for multimodal capabilities.
+//! a language model (Llama) for multimodal capabilities. The architecture implements the training-free projection technique.
//!
-//! The architecture implements the training-free projection technique from the paper:
-//! [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485).
-//!
-//! - [GH Link](https://github.com/haotian-liu/LLaVA/tree/main)
+//! - 💻 [GH Link](https://github.com/haotian-liu/LLaVA/tree/main)
+//! - 📝 [Paper](https://arxiv.org/abs/2304.08485): Visual Instruction Tuning
//!
+
pub mod config;
pub mod utils;
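
For context on the projection technique the doc comment refers to: LLaVA bridges the CLIP vision tower and the Llama language model by projecting image patch features into the LLM's embedding space. The sketch below is illustrative, not this module's API: the `MultiModalProjector` name, the two-layer MLP shape (as in LLaVA-1.5; the original paper uses a single linear layer), and the dimensions (1024 for a CLIP ViT-L/14 tower, 4096 for a Llama-7B hidden size) are all assumptions, built only from `candle_core`/`candle_nn` primitives.

```rust
// Minimal sketch of a vision-to-language projector, assuming a two-layer
// MLP with GELU (the LLaVA-1.5 variant). Names and dimensions are
// hypothetical, not taken from candle-transformers' llava module.
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::{linear, Linear, Module, VarBuilder, VarMap};

/// Hypothetical projector mapping vision features to the LLM hidden size.
struct MultiModalProjector {
    fc1: Linear,
    fc2: Linear,
}

impl MultiModalProjector {
    fn new(vision_dim: usize, text_dim: usize, vb: VarBuilder) -> Result<Self> {
        Ok(Self {
            fc1: linear(vision_dim, text_dim, vb.pp("fc1"))?,
            fc2: linear(text_dim, text_dim, vb.pp("fc2"))?,
        })
    }
}

impl Module for MultiModalProjector {
    fn forward(&self, image_features: &Tensor) -> Result<Tensor> {
        // GELU between the two linear layers.
        self.fc2.forward(&self.fc1.forward(image_features)?.gelu()?)
    }
}

fn main() -> Result<()> {
    let device = Device::Cpu;
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, DType::F32, &device);
    // Assumed dims: 1024 (CLIP ViT-L/14) -> 4096 (Llama-7B hidden size).
    let projector = MultiModalProjector::new(1024, 4096, vb)?;
    // One image's worth of patch features: (batch, num_patches, vision_dim).
    let feats = Tensor::zeros((1, 576, 1024), DType::F32, &device)?;
    let projected = projector.forward(&feats)?;
    println!("{:?}", projected.dims()); // [1, 576, 4096]
    Ok(())
}
```

Once projected, these image tokens can be spliced into the text embedding sequence and fed to the language model unchanged, which is what makes the projector the only vision-language-specific component.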