author    | Laurent Mazare <laurent.mazare@gmail.com> | 2024-08-19 08:30:12 +0100
committer | GitHub <noreply@github.com>               | 2024-08-19 09:30:12 +0200
commit    | 14fd2d97e00f4d91ec2a420b03155137c162131a (patch)
tree      | d6a55422b89e4897e5a2fc72f92000b531691abd /candle-examples/examples/parler-tts/decode.py
parent    | 31a1075f4b4799a4922fa0f617ee982baa5baa81 (diff)
Add a readme for the parler-tts example. (#2434)
* Add a readme for the parler-tts example.
* Remove the python decode script.
* mp4 tweaks.
* Another readme tweak.
Diffstat (limited to 'candle-examples/examples/parler-tts/decode.py')
-rw-r--r-- | candle-examples/examples/parler-tts/decode.py | 30
1 file changed, 0 insertions, 30 deletions
diff --git a/candle-examples/examples/parler-tts/decode.py b/candle-examples/examples/parler-tts/decode.py
deleted file mode 100644
index 8942d32e..00000000
--- a/candle-examples/examples/parler-tts/decode.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import torch
-import torchaudio
-from safetensors.torch import load_file
-from parler_tts import DACModel
-
-tensors = load_file("out.safetensors")
-dac_model = DACModel.from_pretrained("parler-tts/dac_44khZ_8kbps")
-print(dac_model.model)
-output_ids = tensors["codes"][None, None]
-print(output_ids, "\n", output_ids.shape)
-batch_size = 1
-with torch.no_grad():
-    output_values = []
-    for sample_id in range(batch_size):
-        sample = output_ids[:, sample_id]
-        sample_mask = (sample >= dac_model.config.codebook_size).sum(dim=(0, 1)) == 0
-        if sample_mask.sum() > 0:
-            sample = sample[:, :, sample_mask]
-            sample = dac_model.decode(sample[None, ...], [None]).audio_values
-            output_values.append(sample.transpose(0, 2))
-        else:
-            output_values.append(torch.zeros((1, 1, 1)).to(dac_model.device))
-    output_lengths = [audio.shape[0] for audio in output_values]
-    pcm = (
-        torch.nn.utils.rnn.pad_sequence(output_values, batch_first=True, padding_value=0)
-        .squeeze(-1)
-        .squeeze(-1)
-    )
-print(pcm.shape, pcm.dtype)
-torchaudio.save("out.wav", pcm.cpu(), sample_rate=44100)