summaryrefslogtreecommitdiff
path: root/candle-transformers/src
diff options
context:
space:
mode:
Diffstat (limited to 'candle-transformers/src')
-rw-r--r--candle-transformers/src/models/mobilenetv4.rs800
-rw-r--r--candle-transformers/src/models/mod.rs1
2 files changed, 801 insertions, 0 deletions
diff --git a/candle-transformers/src/models/mobilenetv4.rs b/candle-transformers/src/models/mobilenetv4.rs
new file mode 100644
index 00000000..7cbae7c3
--- /dev/null
+++ b/candle-transformers/src/models/mobilenetv4.rs
@@ -0,0 +1,800 @@
+//! MobileNet-v4 inference implementation based on timm.
+//!
+//! See "MobileNetV4 - Universal Models for the Mobile Ecosystem"
+//! https://arxiv.org/abs/2404.10518
+//!
+//! https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/mobilenetv3.py
+
+use candle::{Result, Tensor, D};
+use candle_nn::{
+ batch_norm, conv2d_no_bias, linear, ops::softmax, Activation, Conv2dConfig, Func, VarBuilder,
+};
+
+#[derive(Clone, Debug)]
+enum BlockType {
+ Convolutional {
+ out_channels: usize,
+ kernel: usize,
+ stride: usize,
+ },
+ UniversalBottleneck {
+ out_channels: usize,
+ start_kernel: usize,
+ mid_kernel: usize,
+ stride: usize,
+ expand: usize,
+ },
+ EdgeResidual {
+ out_channels: usize,
+ kernel: usize,
+ stride: usize,
+ expand: usize,
+ },
+ Attention {
+ out_channels: usize,
+ heads: usize,
+ kernel: usize,
+ stride: usize,
+ kv_dim: usize,
+ kv_stride: usize,
+ },
+}
+
+#[derive(Clone, Debug)]
+pub struct Config {
+ stem_dim: usize,
+ activation: Activation,
+ stages: [Vec<BlockType>; 5],
+}
+
+#[rustfmt::skip]
+impl Config {
+ pub fn small() -> Self {
+ Self {
+ stem_dim: 32,
+ activation: Activation::Relu,
+ stages: [
+ vec![
+ BlockType::Convolutional { out_channels: 32, kernel: 3, stride: 2},
+ BlockType::Convolutional { out_channels: 32, kernel: 1, stride: 1},
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 96, kernel: 3, stride: 2},
+ BlockType::Convolutional { out_channels: 64, kernel: 1, stride: 1},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 5, mid_kernel: 5, stride: 2, expand: 3},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 3, mid_kernel: 3, stride: 2, expand: 6},
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 0, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 0, mid_kernel: 5, stride: 1, expand: 3},
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 128, start_kernel: 0, mid_kernel: 3, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 960, kernel: 1, stride: 1},
+ ],
+ ],
+ }
+ }
+
+ pub fn medium() -> Self {
+ Self {
+ stem_dim: 32,
+ activation: Activation::Relu,
+ stages: [
+ vec![
+ BlockType::EdgeResidual { out_channels: 48, kernel: 3, stride: 2, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 80, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 80, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 2},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 6},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 2, expand: 6},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 2},
+
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 960, kernel: 1, stride: 1},
+ ],
+ ],
+ }
+ }
+
+ pub fn hybrid_medium() -> Self {
+ Self {
+ stem_dim: 32,
+ activation: Activation::Relu,
+ stages: [
+ vec![
+ BlockType::EdgeResidual { out_channels: 48, kernel: 3, stride: 2, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 80, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 80, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 2},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 6},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 160, heads: 4, kernel: 3, stride: 1, kv_stride:2, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 160, heads: 4, kernel: 3, stride: 1, kv_stride:2, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 160, heads: 4, kernel: 3, stride: 1, kv_stride:2, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 160, heads: 4, kernel: 3, stride: 1, kv_stride:2, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 160, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 2, expand: 6},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 2},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 0, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 256, heads: 4, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 256, heads: 4, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 256, heads: 4, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 256, heads: 4, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 256, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 960, kernel: 1, stride: 1},
+ ],
+ ],
+ }
+ }
+
+ pub fn large() -> Self {
+ Self {
+ stem_dim: 24,
+ activation: Activation::Relu,
+ stages: [
+ vec![
+ BlockType::EdgeResidual { out_channels: 48, kernel: 3, stride: 2, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 960, kernel: 1, stride: 1},
+ ],
+ ],
+ }
+ }
+
+ pub fn hybrid_large() -> Self {
+ Self {
+ stem_dim: 24,
+ activation: Activation::Gelu,
+ stages: [
+ vec![
+ BlockType::EdgeResidual { out_channels: 48, kernel: 3, stride: 2, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 96, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 192, heads: 8, kernel: 3, stride: 1, kv_stride:2, kv_dim: 48},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 192, heads: 8, kernel: 3, stride: 1, kv_stride:2, kv_dim: 48},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 192, heads: 8, kernel: 3, stride: 1, kv_stride:2, kv_dim: 48},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 192, heads: 8, kernel: 3, stride: 1, kv_stride:2, kv_dim: 48},
+ BlockType::UniversalBottleneck { out_channels: 192, start_kernel: 3, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+
+ vec![
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 2, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 3, stride: 1, expand: 4},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 5, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 512, heads: 8, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 512, heads: 8, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 512, heads: 8, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ BlockType::Attention { out_channels: 512, heads: 8, kernel: 3, stride: 1, kv_stride:1, kv_dim: 64},
+ BlockType::UniversalBottleneck { out_channels: 512, start_kernel: 5, mid_kernel: 0, stride: 1, expand: 4},
+ ],
+ vec![
+ BlockType::Convolutional { out_channels: 960, kernel: 1, stride: 1},
+ ],
+ ],
+ }
+ }
+}
+
+fn depthwise_conv(
+ channels: usize,
+ kernel: usize,
+ stride: usize,
+ padding: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let conv2d_cfg = Conv2dConfig {
+ stride,
+ padding,
+ groups: channels,
+ ..Default::default()
+ };
+
+ let bn = batch_norm(channels, 1e-5, vb.pp("bn"))?;
+ let conv = conv2d_no_bias(channels, channels, kernel, conv2d_cfg, vb.pp("conv"))?;
+
+ Ok(Func::new(move |xs| xs.apply(&conv)?.apply_t(&bn, false)))
+}
+
+fn pointwise_conv(
+ in_channels: usize,
+ out_channels: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let conv2d_cfg = Conv2dConfig {
+ ..Default::default()
+ };
+
+ let bn = batch_norm(out_channels, 1e-5, vb.pp("bn"))?;
+ let conv = conv2d_no_bias(in_channels, out_channels, 1, conv2d_cfg, vb.pp("conv"))?;
+
+ Ok(Func::new(move |xs| xs.apply(&conv)?.apply_t(&bn, false)))
+}
+
+//Universal block that uses two pointwise convolutions and all combinations of two depthwise convolutions.
+#[allow(clippy::too_many_arguments)]
+fn universal_inverted_bottleneck_block(
+ cfg: &Config,
+ in_channels: usize,
+ out_channels: usize,
+ expand: usize,
+ start_kernel: usize,
+ mid_kernel: usize,
+ stride: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let act = cfg.activation;
+ let skip_connection = (in_channels == out_channels) && (stride == 1);
+
+ let dw_start_stride = if mid_kernel > 0 { 1 } else { stride };
+ let dw_start = depthwise_conv(
+ in_channels,
+ start_kernel,
+ dw_start_stride,
+ start_kernel / 2,
+ vb.pp("dw_start"),
+ );
+ let pw_exp = pointwise_conv(in_channels, in_channels * expand, vb.pp("pw_exp"))?;
+ let dw_mid = depthwise_conv(
+ in_channels * expand,
+ mid_kernel,
+ stride,
+ mid_kernel / 2,
+ vb.pp("dw_mid"),
+ );
+ let pw_proj = pointwise_conv(in_channels * expand, out_channels, vb.pp("pw_proj"))?;
+
+ let gamma = vb.get(out_channels, "layer_scale.gamma");
+
+ Ok(Func::new(move |xs| {
+ let residual = xs.clone();
+
+ let mut xs = xs.clone();
+
+ if let Ok(f) = &dw_start {
+ xs = xs.apply(f)?;
+ }
+
+ xs = xs.apply(&pw_exp)?.apply(&act)?;
+
+ if let Ok(f) = &dw_mid {
+ xs = xs.apply(f)?.apply(&act)?;
+ }
+
+ xs = xs.apply(&pw_proj)?;
+
+ if let Ok(g) = &gamma {
+ xs = xs.broadcast_mul(&g.reshape((1, (), 1, 1))?)?;
+ };
+
+ if skip_connection {
+ xs = (xs + residual)?;
+ }
+
+ Ok(xs)
+ }))
+}
+
+// Convolutional block including norm and activation.
+fn conv_block(
+ cfg: &Config,
+ in_channels: usize,
+ out_channels: usize,
+ kernel: usize,
+ stride: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let conv2d_cfg = Conv2dConfig {
+ stride,
+ padding: kernel / 2,
+ ..Default::default()
+ };
+
+ let act = cfg.activation;
+ let bn = batch_norm(out_channels, 1e-5, vb.pp("bn1"))?;
+ let conv = conv2d_no_bias(in_channels, out_channels, kernel, conv2d_cfg, vb.pp("conv"))?;
+
+ Ok(Func::new(move |xs| {
+ xs.apply(&conv)?.apply_t(&bn, false)?.apply(&act)
+ }))
+}
+
+fn edge_residual_block(
+ cfg: &Config,
+ in_channels: usize,
+ out_channels: usize,
+ kernel: usize,
+ stride: usize,
+ expand: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let conv_exp_cfg = Conv2dConfig {
+ stride,
+ padding: kernel / 2,
+ ..Default::default()
+ };
+
+ let conv_pwl_cfg = Conv2dConfig {
+ ..Default::default()
+ };
+
+ let act = cfg.activation;
+ let mid_channels = in_channels * expand;
+ let conv_exp = conv2d_no_bias(
+ in_channels,
+ mid_channels,
+ kernel,
+ conv_exp_cfg,
+ vb.pp("conv_exp"),
+ )?;
+ let bn1 = batch_norm(mid_channels, 1e-5, vb.pp("bn1"))?;
+
+ let conv_pwl = conv2d_no_bias(
+ mid_channels,
+ out_channels,
+ 1,
+ conv_pwl_cfg,
+ vb.pp("conv_pwl"),
+ )?;
+ let bn2 = batch_norm(out_channels, 1e-5, vb.pp("bn2"))?;
+
+ Ok(Func::new(move |xs| {
+ let xs = xs
+ .apply(&conv_exp)?
+ .apply_t(&bn1, false)?
+ .apply(&act)?
+ .apply(&conv_pwl)?
+ .apply_t(&bn2, false)?;
+
+ Ok(xs)
+ }))
+}
+
+fn reshape_kv(t: &Tensor) -> Result<Tensor> {
+ let d = t.dims4()?;
+ let t = t
+ .reshape((d.0, d.1, ()))?
+ .transpose(1, 2)?
+ .unsqueeze(1)?
+ .contiguous()?;
+ Ok(t)
+}
+
+fn reshape_query(t: &Tensor, heads: usize, kv_dim: usize) -> Result<Tensor> {
+ let d = t.dims4()?;
+
+ let t = t
+ .reshape((d.0, heads, kv_dim, ()))?
+ .transpose(D::Minus1, D::Minus2)?
+ .contiguous()?;
+ Ok(t)
+}
+
+fn reshape_output(t: &Tensor, heads: usize, h: usize, w: usize) -> Result<Tensor> {
+ let d = t.dims4()?;
+ let t = t.transpose(1, 2)?;
+ let t = t
+ .reshape((d.0, h, w, d.3 * heads))?
+ .permute((0, 3, 1, 2))?
+ .contiguous()?;
+ Ok(t)
+}
+
+// Mobile multi-query attention
+#[allow(clippy::too_many_arguments)]
+fn mqa_block(
+ in_channels: usize,
+ out_channels: usize,
+ heads: usize,
+ kernel: usize,
+ stride: usize,
+ kv_dim: usize,
+ kv_stride: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let down_conv2d_cfg = Conv2dConfig {
+ stride: kv_stride,
+ padding: kernel / 2,
+ groups: in_channels,
+ ..Default::default()
+ };
+
+ let proj_conv2d_cfg = Conv2dConfig {
+ stride,
+ ..Default::default()
+ };
+
+ let skip_connection = (in_channels == out_channels) && (stride == 1);
+ let gamma = vb.get(out_channels, "layer_scale.gamma");
+ let norm = batch_norm(out_channels, 1e-5, vb.pp("norm"))?;
+ let scale = (kv_dim as f64).powf(-0.5);
+
+ let vb = vb.pp("attn");
+
+ let query_proj = conv2d_no_bias(
+ out_channels,
+ kv_dim * heads,
+ 1,
+ proj_conv2d_cfg,
+ vb.pp("query.proj"),
+ )?;
+
+ let key_down_conv = conv2d_no_bias(
+ in_channels,
+ out_channels,
+ kernel,
+ down_conv2d_cfg,
+ vb.pp("key.down_conv"),
+ );
+ let key_norm = batch_norm(out_channels, 1e-5, vb.pp("key.norm"));
+
+ let key_proj = conv2d_no_bias(out_channels, kv_dim, 1, proj_conv2d_cfg, vb.pp("key.proj"))?;
+
+ let value_down_conv = conv2d_no_bias(
+ in_channels,
+ out_channels,
+ kernel,
+ down_conv2d_cfg,
+ vb.pp("value.down_conv"),
+ );
+
+ let value_norm = batch_norm(out_channels, 1e-5, vb.pp("value.norm"));
+ let value_proj = conv2d_no_bias(
+ out_channels,
+ kv_dim,
+ 1,
+ proj_conv2d_cfg,
+ vb.pp("value.proj"),
+ )?;
+
+ let output_proj = conv2d_no_bias(
+ kv_dim * heads,
+ out_channels,
+ 1,
+ proj_conv2d_cfg,
+ vb.pp("output.proj"),
+ )?;
+
+ Ok(Func::new(move |xs| {
+ let (_, _, h, w) = xs.dims4()?;
+
+ let residual = xs.clone();
+
+ let xs = xs.apply_t(&norm, false)?;
+
+ // Query
+ let q = xs.apply(&query_proj)?;
+
+ let q = reshape_query(&q, heads, kv_dim)?;
+ let q = (q * scale)?;
+
+ // Keys
+ let mut k = xs.clone();
+
+ if let (Ok(kd), Ok(n)) = (&key_down_conv, &key_norm) {
+ k = k.apply(kd)?.apply_t(n, false)?;
+ }
+
+ let k = k.apply(&key_proj)?;
+
+ let k = reshape_kv(&k)?;
+
+ // Value
+ let mut v = xs.clone();
+
+ if let (Ok(vd), Ok(n)) = (&value_down_conv, &value_norm) {
+ v = v.apply(vd)?;
+ v = v.apply_t(n, false)?;
+ }
+
+ let v = v.apply(&value_proj)?;
+ let v = reshape_kv(&v)?;
+
+ let attn = q.broadcast_matmul(&(k.transpose(D::Minus2, D::Minus1)?))?;
+ let attn = softmax(&attn, D::Minus1)?;
+ let o = attn.broadcast_matmul(&v)?;
+
+ let o = reshape_output(&o, heads, h, w)?;
+
+ let mut xs = o.apply(&output_proj)?;
+
+ // Layer scale
+
+ if let Ok(g) = &gamma {
+ xs = xs.broadcast_mul(&g.reshape((1, (), 1, 1))?)?;
+ };
+
+ if skip_connection {
+ xs = (xs + residual)?;
+ }
+ Ok(xs)
+ }))
+}
+
+// Stem.
+fn mobilenetv4_stem(cfg: &Config, vb: VarBuilder) -> Result<Func<'static>> {
+ let conv2d_cfg = Conv2dConfig {
+ stride: 2,
+ padding: 1,
+ ..Default::default()
+ };
+
+ let act = cfg.activation;
+ let out_channels = cfg.stem_dim;
+ let bn = batch_norm(out_channels, 1e-5, vb.pp("bn1"))?;
+ let conv = conv2d_no_bias(3, out_channels, 3, conv2d_cfg, vb.pp("conv_stem"))?;
+
+ Ok(Func::new(move |xs| {
+ let xs = xs.apply(&conv)?.apply_t(&bn, false)?.apply(&act)?;
+ Ok(xs)
+ }))
+}
+
+// The blocks in all the 5 stages of the model.
+fn mobilenetv4_blocks(cfg: &Config, vb: VarBuilder) -> Result<Func<'static>> {
+ let mut in_channels = cfg.stem_dim;
+ let mut blocks = Vec::new();
+
+ for stage in 0..5 {
+ let nblocks = cfg.stages[stage].len();
+
+ for block in 0..nblocks {
+ match cfg.stages[stage][block] {
+ BlockType::Convolutional {
+ out_channels,
+ kernel,
+ stride,
+ } => {
+ blocks.push(conv_block(
+ cfg,
+ in_channels,
+ out_channels,
+ kernel,
+ stride,
+ vb.pp(format!("{stage}.{block}")),
+ )?);
+ in_channels = out_channels;
+ }
+
+ BlockType::EdgeResidual {
+ out_channels,
+ kernel,
+ stride,
+ expand,
+ } => {
+ blocks.push(edge_residual_block(
+ cfg,
+ in_channels,
+ out_channels,
+ kernel,
+ stride,
+ expand,
+ vb.pp(format!("{stage}.{block}")),
+ )?);
+ in_channels = out_channels;
+ }
+
+ BlockType::UniversalBottleneck {
+ out_channels,
+ start_kernel,
+ mid_kernel,
+ stride,
+ expand,
+ } => {
+ blocks.push(universal_inverted_bottleneck_block(
+ cfg,
+ in_channels,
+ out_channels,
+ expand,
+ start_kernel,
+ mid_kernel,
+ stride,
+ vb.pp(format!("{stage}.{block}")),
+ )?);
+ in_channels = out_channels;
+ }
+
+ BlockType::Attention {
+ out_channels,
+ heads,
+ kernel,
+ stride,
+ kv_dim,
+ kv_stride,
+ } => {
+ blocks.push(mqa_block(
+ in_channels,
+ out_channels,
+ heads,
+ kernel,
+ stride,
+ kv_dim,
+ kv_stride,
+ vb.pp(format!("{stage}.{block}")),
+ )?);
+ in_channels = out_channels;
+ }
+ }
+ }
+ }
+
+ Ok(Func::new(move |xs| {
+ let mut xs = xs.clone();
+ for block in blocks.iter() {
+ xs = xs.apply(block)?
+ }
+ Ok(xs)
+ }))
+}
+
+// Classification head.
+fn mobilenetv4_head(
+ cfg: &Config,
+ outputs: usize,
+ nclasses: usize,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let conv2d_cfg = Conv2dConfig {
+ ..Default::default()
+ };
+
+ let act = cfg.activation;
+ let conv = conv2d_no_bias(960, outputs, 1, conv2d_cfg, vb.pp("conv_head"))?;
+ let norm = batch_norm(outputs, 1e-5, vb.pp("norm_head"))?;
+ let cls = linear(outputs, nclasses, vb.pp("classifier"))?;
+
+ Ok(Func::new(move |xs| {
+ let mut xs = xs.clone();
+ xs = xs.apply(&conv)?;
+ xs = xs.apply_t(&norm, false)?.apply(&act)?;
+ xs = xs.flatten_from(1)?;
+ xs = xs.apply(&cls)?;
+ Ok(xs)
+ }))
+}
+
+// Build a mobilenetv4 model for a given configuration.
+fn mobilenetv4_model(
+ cfg: &Config,
+ nclasses: Option<usize>,
+ vb: VarBuilder,
+) -> Result<Func<'static>> {
+ let cls = match nclasses {
+ None => None,
+ Some(nclasses) => {
+ let outputs = 1280;
+ let head = mobilenetv4_head(cfg, outputs, nclasses, vb.clone())?;
+ Some(head)
+ }
+ };
+
+ let stem = mobilenetv4_stem(cfg, vb.clone())?;
+
+ let blocks = mobilenetv4_blocks(cfg, vb.pp("blocks"))?;
+
+ Ok(Func::new(move |xs| {
+ let xs = xs.apply(&stem)?.apply(&blocks)?;
+ let xs = xs.mean_keepdim(D::Minus1)?.mean_keepdim(D::Minus2)?;
+ match &cls {
+ None => Ok(xs),
+ Some(cls) => xs.apply(cls),
+ }
+ }))
+}
+
+pub fn mobilenetv4(cfg: &Config, nclasses: usize, vb: VarBuilder) -> Result<Func<'static>> {
+ mobilenetv4_model(cfg, Some(nclasses), vb)
+}
+
+pub fn mobilenetv4_no_final_layer(cfg: &Config, vb: VarBuilder) -> Result<Func<'static>> {
+ mobilenetv4_model(cfg, None, vb)
+}
diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs
index f5859a99..86a0ec08 100644
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@@ -28,6 +28,7 @@ pub mod metavoice;
pub mod mistral;
pub mod mixformer;
pub mod mixtral;
+pub mod mobilenetv4;
pub mod mobileone;
pub mod moondream;
pub mod mpt;