summaryrefslogtreecommitdiff
path: root/candle-datasets/src/vision/mnist.rs
diff options
context:
space:
mode:
authorNicolas Patry <patry.nicolas@protonmail.com>2023-08-14 17:23:08 +0200
committerNicolas Patry <patry.nicolas@protonmail.com>2023-08-28 15:15:01 +0200
commitd7a273be516e7795b6213e9f076676f76d0fac11 (patch)
tree86eadd1f3b8dbd0f36356c32e6bb85f0783c32ce /candle-datasets/src/vision/mnist.rs
parentdd02f589c095790f980cc8fb84f411d67b7e3c21 (diff)
downloadcandle-d7a273be516e7795b6213e9f076676f76d0fac11.tar.gz
candle-d7a273be516e7795b6213e9f076676f76d0fac11.tar.bz2
candle-d7a273be516e7795b6213e9f076676f76d0fac11.zip
Training:
- Removed a lot of surface (SerializedFileReader ownership is really painful). - Moved example + vision to hf.co version. - Removed feature gate.
Diffstat (limited to 'candle-datasets/src/vision/mnist.rs')
-rw-r--r--candle-datasets/src/vision/mnist.rs53
1 files changed, 53 insertions, 0 deletions
diff --git a/candle-datasets/src/vision/mnist.rs b/candle-datasets/src/vision/mnist.rs
index 2267f9a0..c908412c 100644
--- a/candle-datasets/src/vision/mnist.rs
+++ b/candle-datasets/src/vision/mnist.rs
@@ -3,6 +3,8 @@
//! The files can be obtained from the following link:
//! <http://yann.lecun.com/exdb/mnist/>
use candle::{DType, Device, Result, Tensor};
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use parquet::file::reader::{FileReader, SerializedFileReader};
use std::fs::File;
use std::io::{self, BufReader, Read};
@@ -63,3 +65,54 @@ pub fn load_dir<T: AsRef<std::path::Path>>(dir: T) -> Result<crate::vision::Data
labels: 10,
})
}
+
+/// Decodes an MNIST parquet file into an `(images, labels)` pair of tensors.
+///
+/// Every row is scanned for a group column holding encoded image bytes and for a
+/// 64-bit integer column holding the label. Images are decoded, converted to 8-bit
+/// grayscale and returned as an `F32` tensor of shape `(samples, 784)` with values
+/// divided by 255; labels are returned as a `u8` tensor of shape `(samples,)`.
+/// `samples` is the row count recorded in the parquet file metadata.
+///
+/// # Errors
+///
+/// Returns an error (instead of panicking or silently skipping data) when the row
+/// count is negative, a row cannot be read, an image cannot be decoded, or a label
+/// does not fit in a `u8`. Any error reported while building the tensors is
+/// propagated as well.
+fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor, Tensor)> {
+    let num_rows = parquet.metadata().file_metadata().num_rows();
+    let samples = usize::try_from(num_rows)
+        .map_err(|_| candle::Error::Msg(format!("invalid parquet row count: {num_rows}")))?;
+    let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 784);
+    let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
+    for row in parquet {
+        // Surface unreadable rows directly rather than dropping them and failing later
+        // with an unrelated shape mismatch.
+        let row = row.map_err(|e| candle::Error::Msg(format!("Parquet error: {e}")))?;
+        for (_name, field) in row.get_column_iter() {
+            match field {
+                // The image is stored as a nested group whose bytes field is the encoded file.
+                parquet::record::Field::Group(subrow) => {
+                    for (_name, field) in subrow.get_column_iter() {
+                        if let parquet::record::Field::Bytes(value) = field {
+                            let image = image::load_from_memory(value.data())
+                                .map_err(|e| candle::Error::Msg(format!("Image error: {e}")))?;
+                            buffer_images.extend_from_slice(image.to_luma8().as_raw());
+                        }
+                    }
+                }
+                parquet::record::Field::Long(label) => {
+                    // A plain `as u8` cast would silently wrap out-of-range labels.
+                    let label = u8::try_from(*label).map_err(|_| {
+                        candle::Error::Msg(format!("MNIST label {label} does not fit in a u8"))
+                    })?;
+                    buffer_labels.push(label);
+                }
+                // Any other column type is not part of the dataset and is ignored.
+                _ => {}
+            }
+        }
+    }
+    let images = (Tensor::from_vec(buffer_images, (samples, 784), &Device::Cpu)?
+        .to_dtype(DType::F32)?
+        / 255.)?;
+    let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
+    Ok((images, labels))
+}
+
+/// Downloads the MNIST dataset from the Hugging Face hub and loads it into tensors.
+///
+/// The `mnist/mnist-test.parquet` and `mnist/mnist-train.parquet` files are fetched
+/// from the `refs/convert/parquet` revision of the `mnist` dataset repository and
+/// decoded with `load_parquet`. The returned dataset reports 10 label classes.
+///
+/// # Errors
+///
+/// Returns an error (instead of panicking) when the hub API cannot be initialised,
+/// a file cannot be fetched or opened, a file is not a readable parquet file, or
+/// its contents cannot be decoded.
+pub fn load() -> Result<crate::vision::Dataset> {
+    let api = Api::new().map_err(|e| candle::Error::Msg(format!("Api error: {e}")))?;
+    let dataset_id = "mnist".to_string();
+    let repo = Repo::with_revision(
+        dataset_id,
+        RepoType::Dataset,
+        "refs/convert/parquet".to_string(),
+    );
+    let repo = api.repo(repo);
+    // Fetches one file from the repository and opens it as a parquet reader, turning
+    // hub and parquet failures into regular errors.
+    let open_parquet = |filename: &str| -> Result<SerializedFileReader<std::fs::File>> {
+        let path = repo
+            .get(filename)
+            .map_err(|e| candle::Error::Msg(format!("Api error: {e}")))?;
+        SerializedFileReader::new(std::fs::File::open(path)?)
+            .map_err(|e| candle::Error::Msg(format!("Parquet error: {e}")))
+    };
+    let test_parquet = open_parquet("mnist/mnist-test.parquet")?;
+    let train_parquet = open_parquet("mnist/mnist-train.parquet")?;
+    let (test_images, test_labels) = load_parquet(test_parquet)?;
+    let (train_images, train_labels) = load_parquet(train_parquet)?;
+    Ok(crate::vision::Dataset {
+        train_images,
+        train_labels,
+        test_images,
+        test_labels,
+        labels: 10,
+    })
+}