// projects/text_to_speech/worker.js

import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';
import { encodeWAV } from './utils';

// Disable local model checks: turn off the library's `allowLocalModels` setting before
// any `from_pretrained` call below runs.
// NOTE(review): presumably this makes the library skip looking for model files served
// from this app's own origin and go straight to the remote hub — confirm against the
// `@xenova/transformers` `env` documentation.
env.allowLocalModels = false;

/**
 * Singleton-style holder that lazily constructs the text-to-speech pipeline parts
 * (tokenizer, model and vocoder) the first time they are requested and reuses them
 * on every later request.
 */
class MyTextToSpeechPipeline {

    // Prefix of the URL from which per-speaker embedding files (`<speaker_id>.bin`) are fetched.
    static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

    static model_id = 'Xenova/speecht5_tts';
    static vocoder_id = 'Xenova/speecht5_hifigan';

    // Each slot is `null` until first use; afterwards it holds whatever the matching
    // `from_pretrained` call returned (awaited in `getInstance`), so loading is only
    // started once.
    static tokenizer_instance = null;
    static model_instance = null;
    static vocoder_instance = null;

    /**
     * Start loading any pipeline part that has not been requested yet, wait for all
     * three, post `{ status: 'ready' }` to the main thread and return them.
     *
     * @param {Function|null} [progress_callback=null] Forwarded to each `from_pretrained`
     *   call that is started by this invocation.
     * @returns {Promise<Array>} Resolves to `[tokenizer, model, vocoder]`. Rejects if any
     *   of the three loads fails; in that case no 'ready' message is posted.
     */
    static async getInstance(progress_callback = null) {
        if (this.tokenizer_instance === null) {
            // Store into the same slot that is checked above so the tokenizer is
            // requested only once instead of on every call.
            this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
        }

        if (this.model_instance === null) {
            this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
                quantized: false,
                progress_callback,
            });
        }

        if (this.vocoder_instance === null) {
            this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
                quantized: false,
                progress_callback,
            });
        }

        // Await directly (no `new Promise(async ...)` wrapper) so that a failed load
        // rejects the returned promise instead of leaving the caller waiting forever.
        const result = await Promise.all([
            this.tokenizer_instance,
            this.model_instance,
            this.vocoder_instance,
        ]);
        self.postMessage({
            status: 'ready',
        });
        return result;
    }

    /**
     * Download the embedding file for one speaker and wrap it in a `[1, 512]` float32 Tensor.
     *
     * @param {string} speaker_id File name (without `.bin`) under `BASE_URL`,
     *   e.g., `cmu_us_awb_arctic-wav-arctic_a0001`.
     * @returns {Promise<Tensor>} The speaker embeddings tensor.
     * @throws {Error} If the HTTP response status is not OK.
     */
    static async getSpeakerEmbeddings(speaker_id) {
        const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;
        const response = await fetch(speaker_embeddings_url);
        if (!response.ok) {
            // Without this check an error page body would be reinterpreted as float data.
            throw new Error(
                `Failed to fetch speaker embeddings from "${speaker_embeddings_url}": ${response.status} ${response.statusText}`
            );
        }
        const buffer = await response.arrayBuffer();
        return new Tensor(
            'float32',
            new Float32Array(buffer),
            [1, 512]
        );
    }
}

// Speaker embeddings that have already been downloaded, keyed by speaker id, so that
// repeated requests for the same speaker do not trigger another download.
const speaker_embeddings_cache = new Map();

// Listen for messages from the main thread.
// Each message is read as `{ text, speaker_id }`. The handler replies with
// `{ status: 'complete', output: Blob }` on success or `{ status: 'error', exception }`
// on failure; model-loading progress events are forwarded as they arrive.
self.addEventListener('message', async (event) => {
    // The whole request is guarded (not only speech generation) so that a failure while
    // loading the pipeline, tokenizing, downloading embeddings or encoding the audio is
    // also reported to the main thread.
    try {
        // Load the pipeline
        const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance(x => {
            // We also add a progress callback so that we can track model loading.
            self.postMessage(x);
        });

        const { text, speaker_id } = event.data;

        // Tokenize the input
        const { input_ids } = tokenizer(text);

        // Load the speaker embeddings, reusing a previously downloaded copy when available.
        let speaker_embeddings = speaker_embeddings_cache.get(speaker_id);
        if (speaker_embeddings === undefined) {
            speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(speaker_id);
            // Only successful downloads reach this line, so failures are never cached.
            speaker_embeddings_cache.set(speaker_id, speaker_embeddings);
        }

        // Generate the waveform
        const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });

        // Encode the waveform as a WAV file
        const wav = encodeWAV(waveform.data);

        // Send the output back to the main thread
        self.postMessage({
            status: 'complete',
            output: new Blob([wav], { type: 'audio/wav' }),
        });
    } catch (e) {
        self.postMessage({
            status: 'error',
            exception: e,
        });
        // Rethrow so the failure is still visible as an error in the worker itself.
        throw e;
    }
});