// Provenance: projects/text_to_speech/worker.js as introduced by commit
// 90715e702bbebcf2c3cfd39628c931bbadda28b0 ("Add project files").

import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';
import { encodeWAV } from './utils';

// Disable local model checks
env.allowLocalModels = false;

/**
 * Lazily constructed holder for the tokenizer, text-to-speech model and vocoder.
 *
 * Each `*_instance` field stores the promise returned by the corresponding
 * `from_pretrained` call, so loading starts at most once and every caller
 * awaits the same promises.
 */
class MyTextToSpeechPipeline {

    static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

    static model_id = 'Xenova/speecht5_tts';
    static vocoder_id = 'Xenova/speecht5_hifigan';

    static tokenizer_instance = null;
    static model_instance = null;
    static vocoder_instance = null;

    /**
     * Start loading (if not already started) and wait for all three components.
     * Posts `{ status: 'ready' }` to the main thread each time loading has completed.
     *
     * @param {Function|null} [progress_callback=null] Forwarded to every `from_pretrained` call.
     * @returns {Promise<Array>} Resolves to `[tokenizer, model, vocoder]`.
     * @throws Re-throws whatever a loader rejected with; the cached promises are
     *   cleared first so that a later call can retry.
     */
    static async getInstance(progress_callback = null) {
        if (this.tokenizer_instance === null) {
            // Store under the same name the guard checks, otherwise the tokenizer
            // would be requested again on every call.
            this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
        }

        if (this.model_instance === null) {
            this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
                quantized: false,
                progress_callback,
            });
        }

        if (this.vocoder_instance === null) {
            this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
                quantized: false,
                progress_callback,
            });
        }

        let result;
        try {
            // Await directly instead of wrapping in `new Promise(async ...)`, so a
            // loader failure rejects this call rather than leaving it pending forever.
            result = await Promise.all([
                this.tokenizer_instance,
                this.model_instance,
                this.vocoder_instance,
            ]);
        } catch (e) {
            // Do not keep rejected promises cached; allow the next call to retry.
            this.tokenizer_instance = null;
            this.model_instance = null;
            this.vocoder_instance = null;
            throw e;
        }

        self.postMessage({
            status: 'ready',
        });
        return result;
    }

    /**
     * Download the embedding file for one speaker and wrap it in a tensor.
     *
     * @param {string} speaker_id File stem below `BASE_URL`,
     *   e.g. `cmu_us_awb_arctic-wav-arctic_a0001`.
     * @returns {Promise<Tensor>} A `float32` tensor with dims `[1, 512]`.
     * @throws {Error} If the HTTP request does not succeed or the payload does not
     *   contain exactly 512 float32 values.
     */
    static async getSpeakerEmbeddings(speaker_id) {
        const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;

        const response = await fetch(speaker_embeddings_url);
        if (!response.ok) {
            // Without this check an error page would be reinterpreted as float data.
            throw new Error(`Failed to fetch speaker embeddings from ${speaker_embeddings_url} (HTTP ${response.status})`);
        }

        const data = new Float32Array(await response.arrayBuffer());
        if (data.length !== 512) {
            throw new Error(`Unexpected speaker embeddings size: expected 512 values, got ${data.length}`);
        }

        return new Tensor('float32', data, [1, 512]);
    }
}

// Mapping of cached speaker embeddings
const speaker_embeddings_cache = new Map();

// Listen for messages from the main thread.
// Incoming `event.data` is read for `text` and `speaker_id`.
// Outgoing messages: progress objects forwarded from the loaders,
// `{ status: 'ready' }`, `{ status: 'complete', output: Blob }` and
// `{ status: 'error', exception }`.
self.addEventListener('message', async (event) => {
    try {
        const { text, speaker_id } = event.data;

        // Load the pipeline
        const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance((x) => {
            // We also add a progress callback so that we can track model loading.
            self.postMessage(x);
        });

        // Tokenize the input
        const { input_ids } = tokenizer(text);

        // Load the speaker embeddings
        let speaker_embeddings = speaker_embeddings_cache.get(speaker_id);
        if (speaker_embeddings === undefined) {
            speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(speaker_id);
            speaker_embeddings_cache.set(speaker_id, speaker_embeddings);
        }

        // Generate the waveform
        const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });

        // Encode the waveform as a WAV file
        const wav = encodeWAV(waveform.data);

        // Send the output back to the main thread
        self.postMessage({
            status: 'complete',
            output: new Blob([wav], { type: 'audio/wav' }),
        });
    } catch (e) {
        // Report every failure (loading, tokenizing, fetching, generating), not
        // only generation errors, so the main thread is never left waiting.
        self.postMessage({
            status: 'error',
            exception: e,
        });
        throw e;
    }
});