From dd02f589c095790f980cc8fb84f411d67b7e3c21 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 14 Aug 2023 11:09:48 +0200
Subject: Better training+hub

---
 candle-book/src/training/README.md | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'candle-book')

diff --git a/candle-book/src/training/README.md b/candle-book/src/training/README.md
index f4f9eb85..ddbbc7af 100644
--- a/candle-book/src/training/README.md
+++ b/candle-book/src/training/README.md
@@ -6,12 +6,36 @@ start with the Hello world dataset of machine learning, MNIST.
 
 Let's start with downloading `MNIST` from [huggingface](https://huggingface.co/datasets/mnist).
 
-
-```rust
-use candle_datasets::from_hub;
+This requires `candle-datasets` with the `hub` feature.
+```bash
+cargo add candle-datasets --features hub
+cargo add hf-hub
+```
 
-let dataset = from_hub("mnist")?;
+```rust,ignore
+{{#include ../../../candle-examples/src/lib.rs:book_training_1}}
 ```
 
 This uses the standardized `parquet` files from the `refs/convert/parquet` branch on every dataset.
+`files` is now a `Vec` of [`parquet::file::serialized_reader::SerializedFileReader`].
+
+We can inspect the content of the files with:
+
+```rust,ignore
+{{#include ../../../candle-examples/src/lib.rs:book_training_2}}
+```
+
+You should see something like:
+
+```bash
+Column id 1, name label, value 6
+Column id 0, name image, value {bytes: [137, ....]
+Column id 1, name label, value 8
+Column id 0, name image, value {bytes: [137, ....]
+```
+
+So each row contains 2 columns (image, label) with image being saved as bytes.
+Let's put them into a useful struct.
+
+
 
-- 
cgit v1.2.3
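
The two `{{#include}}` directives pull the actual snippets from `candle-examples/src/lib.rs` (anchors `book_training_1` and `book_training_2`), so the code itself is not visible in this diff. As a rough sketch of what the first step, downloading the parquet files, could look like, assuming the `hf-hub` and `parquet` crates as direct dependencies; the dataset id and the `refs/convert/parquet` revision come from the patch, while the function name and the parquet file paths inside the dataset repo are illustrative guesses:

```rust
// Sketch only: fetch the MNIST parquet files from the `refs/convert/parquet`
// branch of the dataset repo and open them with the parquet crate.
use hf_hub::{api::sync::Api, Repo, RepoType};
use parquet::file::serialized_reader::SerializedFileReader;

fn mnist_parquet_files() -> Result<Vec<SerializedFileReader<std::fs::File>>, Box<dyn std::error::Error>> {
    let api = Api::new()?;
    let repo = Repo::with_revision(
        "mnist".to_string(),
        RepoType::Dataset,
        "refs/convert/parquet".to_string(),
    );
    let repo = api.repo(repo);
    // The file names inside the dataset repository are assumptions.
    let mut files = Vec::new();
    for filename in ["mnist/test/0000.parquet", "mnist/train/0000.parquet"] {
        // Downloads the file (or reuses the local cache) and returns its path.
        let local_path = repo.get(filename)?;
        files.push(SerializedFileReader::new(std::fs::File::open(local_path)?)?);
    }
    Ok(files)
}
```

A `Vec` return like this would line up with the patch's remark that `files` is a `Vec` of `SerializedFileReader`s.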
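
Likewise, here is a minimal sketch of the kind of row dump that would produce the `Column id .., name .., value ..` lines quoted in the patch, assuming a `parquet` crate version whose row iterator yields `Result<Row>` (older releases yield `Row` directly, in which case the `?` goes away):

```rust
use parquet::file::serialized_reader::SerializedFileReader;

// Sketch only: print every column of every row in one parquet file.
fn dump_rows(reader: SerializedFileReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
    for row in reader {
        // Each row exposes its columns as (name, field) pairs.
        for (idx, (name, field)) in row?.get_column_iter().enumerate() {
            println!("Column id {idx}, name {name}, value {field}");
        }
    }
    Ok(())
}
```

The "useful struct" the patch stops at would then presumably just collect the decoded `image` bytes and `label` values from those fields.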