summaryrefslogtreecommitdiff
path: root/candle-examples/examples/parler-tts
diff options
context:
space:
mode:
author: Laurent Mazare <laurent.mazare@gmail.com> 2024-08-19 08:30:12 +0100
committer: GitHub <noreply@github.com> 2024-08-19 09:30:12 +0200
commit: 14fd2d97e00f4d91ec2a420b03155137c162131a (patch)
tree: d6a55422b89e4897e5a2fc72f92000b531691abd /candle-examples/examples/parler-tts
parent: 31a1075f4b4799a4922fa0f617ee982baa5baa81 (diff)
download: candle-14fd2d97e00f4d91ec2a420b03155137c162131a.tar.gz
          candle-14fd2d97e00f4d91ec2a420b03155137c162131a.tar.bz2
          candle-14fd2d97e00f4d91ec2a420b03155137c162131a.zip
Add a readme for the parler-tts example. (#2434)
* Add a readme for the parler-tts example. * Remove the python decode script. * mp4 tweaks. * Another readme tweak.
Diffstat (limited to 'candle-examples/examples/parler-tts')
-rw-r--r--  candle-examples/examples/parler-tts/README.md   | 21
-rw-r--r--  candle-examples/examples/parler-tts/decode.py   | 30
-rw-r--r--  candle-examples/examples/parler-tts/hello.mp4   | bin 0 -> 24361 bytes
3 files changed, 21 insertions, 30 deletions
diff --git a/candle-examples/examples/parler-tts/README.md b/candle-examples/examples/parler-tts/README.md
new file mode 100644
index 00000000..83739944
--- /dev/null
+++ b/candle-examples/examples/parler-tts/README.md
@@ -0,0 +1,21 @@
+# candle-parler-tts
+
+[Parler-TTS](https://huggingface.co/parler-tts/parler-tts-large-v1) is a large
+text-to-speech model with 2.2B parameters trained on ~45K hours of audio data.
+The voice can be controlled by a text prompt.
+
+## Run an example
+
+```bash
+cargo run --example parler-tts -r -- \
+ --prompt "Hey, how are you doing today?"
+```
+
+In order to specify some prompt for the voice, use the `--description` argument.
+```bash
+cargo run --example parler-tts -r -- \
+ --prompt "Hey, how are you doing today?" \
+ --description "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+```
+
+https://github.com/huggingface/candle/raw/main/candle-examples/examples/parler-tts/hello.mp4
diff --git a/candle-examples/examples/parler-tts/decode.py b/candle-examples/examples/parler-tts/decode.py
deleted file mode 100644
index 8942d32e..00000000
--- a/candle-examples/examples/parler-tts/decode.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import torch
-import torchaudio
-from safetensors.torch import load_file
-from parler_tts import DACModel
-
-tensors = load_file("out.safetensors")
-dac_model = DACModel.from_pretrained("parler-tts/dac_44khZ_8kbps")
-print(dac_model.model)
-output_ids = tensors["codes"][None, None]
-print(output_ids, "\n", output_ids.shape)
-batch_size = 1
-with torch.no_grad():
- output_values = []
- for sample_id in range(batch_size):
- sample = output_ids[:, sample_id]
- sample_mask = (sample >= dac_model.config.codebook_size).sum(dim=(0, 1)) == 0
- if sample_mask.sum() > 0:
- sample = sample[:, :, sample_mask]
- sample = dac_model.decode(sample[None, ...], [None]).audio_values
- output_values.append(sample.transpose(0, 2))
- else:
- output_values.append(torch.zeros((1, 1, 1)).to(dac_model.device))
- output_lengths = [audio.shape[0] for audio in output_values]
- pcm = (
- torch.nn.utils.rnn.pad_sequence(output_values, batch_first=True, padding_value=0)
- .squeeze(-1)
- .squeeze(-1)
- )
-print(pcm.shape, pcm.dtype)
-torchaudio.save("out.wav", pcm.cpu(), sample_rate=44100)
diff --git a/candle-examples/examples/parler-tts/hello.mp4 b/candle-examples/examples/parler-tts/hello.mp4
new file mode 100644
index 00000000..994316db
--- /dev/null
+++ b/candle-examples/examples/parler-tts/hello.mp4
Binary files differ