<!DOCTYPE html>
<!-- candle/candle-wasm-examples/whisper/lib-example.html -->
<!--
  Single, well-formed document start. The doctype must come before any other
  content (a comment is allowed) so the page renders in standards mode; the
  title lives in the same head as the styles and scripts.
-->
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Candle Whisper Rust/WASM</title>
    <style>
      /* Pull Source Code Pro and Source Sans 3 from Google Fonts and use
         Source Sans 3 as the page-wide font (falling back to sans-serif). */
      @import url("https://fonts.googleapis.com/css2?family=Source+Code+Pro:wght@200;300;400&family=Source+Sans+3:wght@100;200;300;400;500;600;700;800;900&display=swap");
      html,
      body {
        font-family: "Source Sans 3", sans-serif;
      }
    </style>
    <!-- Tailwind loaded from its CDN; the utility classes used in the body
         markup presumably rely on it. -->
    <script src="https://cdn.tailwindcss.com"></script>
    <script type="module">
      // Worker (ES module) that receives the transcription jobs posted by
      // classifyAudio and reports progress/results back via messages.
      const whisperWorker = new Worker("./whisperWorker.js", { type: "module" });

      // Example clips are resolved relative to this URL.
      const AUDIO_BASE_URL = "https://huggingface.co/datasets/Narsil/candle-examples/resolve/main/";

      // Per-model download locations, keyed by the <select id="model"> value.
      const MODELS = {
        tiny_en: {
          base_url: "https://huggingface.co/openai/whisper-tiny.en/resolve/refs%2Fpr%2F17/",
        },
      };

      /**
       * Posts one transcription job to the shared whisper worker and waits
       * for it to finish.
       *
       * Every worker message carrying a `status` field is forwarded to
       * `updateStatus`. The returned promise resolves with the message whose
       * status is "complete" and rejects when a message carries an `error`
       * field or when the worker itself raises an `error` event.
       */
      async function classifyAudio(
        weightsURL, // URL to the weights file
        modelID, // model ID
        tokenizerURL, // URL to the tokenizer file
        mel_filtersURL, // URL to the mel filters file
        audioURL, // URL to the audio file
        updateStatus // function to update the status
      ) {
        return new Promise((resolve, reject) => {
          // Detach both listeners once the job settles so handlers from
          // finished jobs do not accumulate on the shared worker.
          function cleanup() {
            whisperWorker.removeEventListener("message", messageHandler);
            whisperWorker.removeEventListener("error", errorHandler);
          }
          function messageHandler(event) {
            console.log(event.data);
            if ("status" in event.data) {
              updateStatus(event.data);
            }
            if ("error" in event.data) {
              cleanup();
              reject(new Error(event.data.error));
              return;
            }
            if (event.data.status === "complete") {
              cleanup();
              resolve(event.data);
            }
          }
          // Uncaught exceptions inside the worker (including a failure to
          // load the worker script) surface as an `error` event rather than
          // a message; without this the promise would never settle.
          function errorHandler(event) {
            cleanup();
            reject(new Error(event.message || "Whisper worker failed"));
          }
          // Listen before posting so no reply can be missed.
          whisperWorker.addEventListener("message", messageHandler);
          whisperWorker.addEventListener("error", errorHandler);
          whisperWorker.postMessage({
            weightsURL,
            modelID,
            tokenizerURL,
            mel_filtersURL,
            audioURL,
          });
        });
      }

      // URL of the currently selected clip; stays null until one is chosen.
      let audioURL = null;

      /**
       * Loads `src` into the audio player, reveals the player, enables the
       * transcribe button and remembers `src` as the current clip.
       */
      function setAudio(src) {
        const player = document.querySelector("#audio");
        Object.assign(player, { src, controls: true, hidden: false });
        const transcribeButton = document.querySelector("#detect");
        transcribeButton.disabled = false;
        audioURL = src;
      }
      // Each example button selects the clip named in its data-value,
      // resolved against AUDIO_BASE_URL.
      for (const button of document.querySelectorAll("#audios-select > button")) {
        button.addEventListener("click", () => {
          setAudio(AUDIO_BASE_URL + button.dataset.value);
        });
      }
      // When a file is picked through the file input, expose it through an
      // object URL and select it as the current clip.
      document.querySelector("#file-upload").addEventListener("change", (event) => {
        const picked = event.target.files;
        if (picked.length === 0) {
          return;
        }
        setAudio(URL.createObjectURL(picked[0]));
      });
      // Drag-and-drop support for the drop area: highlight the border while
      // something is dragged over it and select the dropped file or link.
      const dropArea = document.querySelector("#drop-area");
      dropArea.addEventListener("dragenter", (e) => {
        e.preventDefault();
        dropArea.classList.add("border-blue-700");
      });
      dropArea.addEventListener("dragleave", (e) => {
        e.preventDefault();
        dropArea.classList.remove("border-blue-700");
      });
      dropArea.addEventListener("dragover", (e) => {
        // preventDefault on dragover is what marks the element as a valid
        // drop target.
        e.preventDefault();
        dropArea.classList.add("border-blue-700");
      });
      dropArea.addEventListener("drop", (e) => {
        e.preventDefault();
        dropArea.classList.remove("border-blue-700");
        const files = e.dataTransfer.files;
        if (files.length > 0) {
          // Dropped files win over any accompanying link data.
          setAudio(URL.createObjectURL(files[0]));
          return;
        }
        // text/uri-list may contain several newline-separated entries and
        // "#" comment lines; use the first real URL instead of the raw data.
        const url = e.dataTransfer
          .getData("text/uri-list")
          .split(/\r?\n/)
          .map((line) => line.trim())
          .find((line) => line !== "" && !line.startsWith("#"));
        if (url) {
          setAudio(url);
        }
      });

      // Transcribe button: run the selected model on the current clip and
      // show either the joined transcription text or the failure reason.
      document.querySelector("#detect").addEventListener("click", async () => {
        if (audioURL === null) {
          return;
        }
        const button = document.querySelector("#detect");
        const statusEl = document.querySelector("#output-status");
        const outputEl = document.querySelector("#output-generation");

        // Put the UI back into a usable state and tell the user what failed.
        const showFailure = (reason) => {
          button.disabled = false;
          button.textContent = "Transcribe Audio";
          outputEl.hidden = true;
          statusEl.hidden = false;
          statusEl.textContent = `Transcription failed: ${reason}`;
        };

        const modelID = document.querySelector("#model").value;
        const model = MODELS[modelID];
        if (!model) {
          showFailure(`unknown model "${modelID}"`);
          return;
        }
        const modelURL = model.base_url + "model.safetensors";
        const tokenizerURL = model.base_url + "tokenizer.json";

        // Disable right away so a second click cannot queue another job
        // before the worker's first status message arrives.
        button.disabled = true;
        try {
          const result = await classifyAudio(
            modelURL,
            modelID,
            tokenizerURL,
            "mel_filters.safetensors",
            audioURL,
            updateStatus
          );
          console.log("RESULT", result);
          const { output } = result;
          const text = output.map((segment) => segment.dr.text).join(" ");
          console.log(text);
          statusEl.hidden = true;
          outputEl.hidden = false;
          outputEl.textContent = text;
        } catch (error) {
          console.error(error);
          showFailure(error.message);
        }
      });

      /**
       * Mirrors a worker status message on the transcribe button: while the
       * status is "loading" or "decoding" the button is disabled and shows
       * the message text; on "complete" it is re-enabled with its default
       * label. Any other status leaves the button untouched.
       */
      function updateStatus(data) {
        const phase = data.status;
        const progressText = data.message;
        const button = document.querySelector("#detect");
        switch (phase) {
          case "decoding":
          case "loading":
            button.disabled = true;
            button.textContent = progressText;
            break;
          case "complete":
            button.disabled = false;
            button.textContent = "Transcribe Audio";
            break;
          default:
            break;
        }
      }
    </script>
  </head>
  <body class="container max-w-4xl mx-auto p-4">
    <main class="grid grid-cols-1 gap-8 relative">
      <!-- Decorative candle emoji; hidden from assistive technology -->
      <span class="absolute text-5xl -ml-[1em]" aria-hidden="true"> 🕯️ </span>
      <div>
        <h1 class="text-5xl font-bold">Candle Whisper</h1>
        <h2 class="text-2xl font-bold">Rust/WASM Demo</h2>
        <p class="max-w-lg">
          Transcribe audio in the browser using rust/wasm with an audio file.
          This demo uses the
          <!-- rel="noopener noreferrer": the new tab gets no handle on this page -->
          <a
            href="https://huggingface.co/openai/"
            target="_blank"
            rel="noopener noreferrer"
            class="underline hover:text-blue-500 hover:no-underline"
          >
            OpenAI Whisper models
          </a>
          and WASM runtime built with
          <a
            href="https://github.com/huggingface/candle/"
            target="_blank"
            rel="noopener noreferrer"
            class="underline hover:text-blue-500 hover:no-underline"
            >Candle
          </a>
        </p>
      </div>

      <!-- Model picker: the script reads the selected option's value and uses
           it as a key into its MODELS table, so every value needs an entry
           there. -->
      <div>
        <label for="model" class="font-medium">Models Options: </label>
        <select
          id="model"
          class="border-2 border-gray-500 rounded-md font-light"
        >
          <option value="tiny_en" selected>tiny.en (151 MB)</option>
        </select>
      </div>
      <!-- drag and drop area -->
      <div class="relative">
        <!-- Drop target; the script wires drag events to #drop-area and a
             change handler to #file-upload -->
        <div
          id="drop-area"
          class="flex flex-col items-center justify-center border-2 border-gray-300 border-dashed rounded-xl relative h-48 w-full overflow-hidden"
        >
          <div
            class="flex flex-col items-center justify-center space-y-1 text-center"
          >
            <!-- Decorative icon; the label text below carries the meaning -->
            <svg
              width="25"
              height="25"
              viewBox="0 0 25 25"
              fill="none"
              xmlns="http://www.w3.org/2000/svg"
              aria-hidden="true"
            >
              <path
                d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
                fill="#000"
              />
            </svg>
            <div class="flex text-sm text-gray-600">
              <label
                for="file-upload"
                class="relative cursor-pointer bg-white rounded-md font-medium text-blue-950 hover:text-blue-700"
              >
                <span>Drag and drop your audio here</span>
                <span class="block text-xs">or</span>
                <span class="block text-xs">Click to upload</span>
              </label>
            </div>
            <input
              id="file-upload"
              name="file-upload"
              type="file"
              accept="audio/*"
              class="sr-only"
            />
          </div>
          <!-- Player for the selected clip; hidden until the script sets a source -->
          <audio
            id="audio"
            hidden
            controls
            class="w-full p-2 select-none"
          ></audio>
        </div>
      </div>
      <div>
        <!-- Example clips: the script attaches a click handler to every
             button that is a direct child of #audios-select and appends its
             data-value to the audio base URL -->
        <div class="flex flex-wrap gap-3 items-center" id="audios-select">
          <h3 class="font-medium">Examples:</h3>
          <button
            type="button"
            data-value="samples_jfk.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>jfk.wav</span>
            <span class="text-xs block"> (352 kB)</span>
          </button>
          <button
            type="button"
            data-value="samples_a13.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>a13.wav</span>
            <span class="text-xs block"> (960 kB)</span>
          </button>
          <button
            type="button"
            data-value="samples_mm0.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>mm0.wav</span>
            <span class="text-xs block"> (957 kB)</span>
          </button>
          <button
            type="button"
            data-value="samples_gb0.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>gb0.wav </span>
            <span class="text-xs block">(4.08 MB)</span>
          </button>
          <button
            type="button"
            data-value="samples_gb1.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>gb1.wav </span>
            <span class="text-xs block">(6.36 MB)</span>
          </button>
          <button
            type="button"
            data-value="samples_hp0.wav"
            class="text-gray-500 border border-gray-500 rounded-md p-2 underline hover:no-underline"
          >
            <span>hp0.wav </span>
            <span class="text-xs block">(8.75 MB)</span>
          </button>
        </div>
      </div>

      <div>
        <!-- Starts transcription; disabled until a clip has been selected.
             The script also rewrites its label with progress messages. -->
        <button
          type="button"
          id="detect"
          disabled
          class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded disabled:bg-gray-300 disabled:cursor-not-allowed"
        >
          Transcribe Audio
        </button>
      </div>
      <div>
        <h3 class="font-medium">Transcription:</h3>
        <!-- Live region: the script swaps the placeholder for the result
             asynchronously, so announce the change to screen readers -->
        <div
          class="min-h-[250px] bg-slate-100 text-gray-500 p-4 rounded-md flex flex-col gap-2"
          aria-live="polite"
        >
          <p hidden id="output-generation" class="grid-rows-2"></p>
          <span id="output-status" class="m-auto font-light"
            >No transcription results yet</span
          >
        </div>
      </div>
    </main>
  </body>
</html>