Spaces:

jbilcke-hf
/

observer

Paused

File size: 9,542 Bytes

"use client"

import { useCallback, useEffect, useRef, useState, useTransition } from "react"
import { useInterval } from "usehooks-ts"

// TODO: try this? https://www.npmjs.com/package/react-audio-voice-recorder
import { useRecorder } from "react-microphone-recorder"
import { getWaveBlob } from "webm-to-wav-converter"
import {
  AvailableModels,
  InferenceSession,
  SessionManager,
} from "whisper-turbo"

import { useToast } from "@/components/ui/use-toast"
import { useStore } from "./useStore"

/**
 * One piece of transcribed text, as delivered to a transcription callback.
 *
 * NOTE(review): the units of `start` / `stop` are not established in this file
 * (presumably offsets into the audio sample) — confirm against the transcriber.
 */
export interface TSSegment {
  /** Transcribed text for this segment. */
  text: string
  /** Position at which the segment begins. */
  start: number
  /** Position at which the segment ends. */
  stop: number
  /** Whether this is the final segment of the transcription. */
  last: boolean
}

/** A complete transcript: the list of segments that make it up. */
export interface TSTranscript {
  /** The transcript's segments. */
  segments: TSSegment[]
}

/**
 * Listens to the microphone, samples short bursts of sound and transcribes
 * them in the browser with a whisper-turbo model.
 *
 * How it works:
 * - a "background" recorder runs continuously and is only used for its audio level;
 * - when that level crosses a threshold, a "foreground" recorder captures a 2 second sample;
 * - if enough renders saw a loud level during the sample ("hits"), the sample is converted
 *   to WAV and handed to the whisper session;
 * - every non-empty transcribed segment replaces the transcript buffer, which is
 *   forwarded to `onListen`.
 *
 * @param onListen - called with the transcript buffer each time it changes. It is also
 *   called once on mount with the initial empty string.
 * @returns a small fixed panel showing the model loading progress, then the latest transcript.
 */
export function Listen({
  onListen,
}: {
  onListen: (recording: string) => void
}) {
  const { toast } = useToast()

  // NOTE(review): `speechSynthesis` and `isHearing` are not read below. The hook calls
  // are kept on purpose: the hit counter near the end of this component counts renders,
  // so removing a store subscription could change how many hits a sample collects.
  const speechSynthesis = useStore(state => state.speechSynthesis)
  const isSpeaking = useStore(state => state.isSpeaking)
  // mirror of `isSpeaking`, readable from effects that do not depend on it
  const isSpeakingRef = useRef(isSpeaking)
  useEffect(() => {isSpeakingRef.current = isSpeaking }, [isSpeaking])

  const setHearing = useStore(state => state.setHearing)
  const isHearing = useStore(state => state.isHearing)

  const [transcribing, setTranscribing] = useState(false)
  const transcribingRef = useRef(transcribing)
  useEffect(() => { transcribingRef.current = transcribing }, [transcribing])

  // used to detect changes, signal when we can analyze the audio
  const [audioDataFrame, setAudioDataFrame] = useState(0)
  const audioDataFrameRef = useRef(audioDataFrame)
  useEffect(() => { audioDataFrameRef.current = audioDataFrame }, [audioDataFrame])

  // latest transcribed text; every change is forwarded to the parent
  const [transcriptBuffer, setTranscriptBuffer] = useState("")
  useEffect(() => {
    onListen(transcriptBuffer)
  }, [transcriptBuffer])
  /*
  Available models: {
    WHISPER_TINY: 'whisper-tiny',
    WHISPER_BASE: 'whisper-base',
    WHISPER_SMALL: 'whisper-small',
    WHISPER_MEDIUM: 'whisper-medium',
    WHISPER_LARGE: 'whisper-large'
  }
  */

  // unfortunately, we cannot really use models larger than TINY because they are
  // too slow to process requests
  const whisperModel: AvailableModels = AvailableModels.WHISPER_TINY

  // mutable bookkeeping for the current foreground sample (kept in a ref so that
  // updating it never triggers a render)
  const listenerRef = useRef({
    isListening: false,
    startedListeningAt: 0,
    stoppedListeningAt: 0,
    durationInMs: 0,
    hits: 0,
    debugCanContinue: true, // used for debugging
  })

  // handle of the pending "end of sample" timer, so it can be cancelled on unmount
  const sampleTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)

  // the background listener is not a CIA spy device, but a detector of changes in the
  // background noise volume level. The goal is to detect whenever an interesting event is happening
  const backgroundListener = useRecorder()

  // the foreground listener is the actual sound sampler
  // with our system, it will always lag a bit behind the background listener
  // however there might be a fix (which I haven't tried yet):
  // to take the last second of the background listener sample,
  // and glue it to the beginning of the foreground listener sample
  //
  // or, alternatively, we could just try to use a shorter time window for the background listener,
  // to make it more reactive
  const foregroundListener = useRecorder()

  // to detect voice, we currently only compare the background audio level to a threshold
  const heardSomething = backgroundListener.audioLevel > 12 // 18

  const session = useRef<InferenceSession | null>(null)
  const [audioData, setAudioData] = useState<Uint8Array | null>(null)
  const [audioMetadata, setAudioMetadata] = useState<File | null>(null)
  const [loaded, setLoaded] = useState<boolean>(false)
  const [progress, setProgress] = useState<number>(0)

  const isLoadingModel = progress > 0
  const hasLoadedModel = progress === 100

  /**
   * Loads the whisper model and stores the resulting session in `session`.
   * Any previously loaded session is destroyed first. Failures are logged and
   * reported through a toast; they are not re-thrown.
   */
  const loadModel = async () => {
    console.log("loadModel")
    if (session.current) {
      session.current.destroy()
      // drop the reference, so a failed reload does not leave a destroyed session behind
      session.current = null
    }
    if (!whisperModel) {
      console.error("No whisper model loaded")
      return
    }

    try {
      const manager = new SessionManager()
      const loadResult = await manager.loadModel(
        whisperModel,
        () => {
          setLoaded(true)
        },
        (p: number) => {
          console.log("progress:", p)
          setProgress(p)
        }
      )
      if (loadResult.isErr) {
        throw new Error(loadResult.error.message)
      } else {
        session.current = loadResult.value
      }
    } catch (err) {
      const error = `failed to load the model: ${err}`
      console.error(error)
      toast({
        title: "Error",
        description: error,
        variant: "destructive"
      })
    }
  }

  /**
   * Transcribes the current `audioData` with the loaded session.
   * Aborts (with a log, and a toast where relevant) when the model or the
   * audio is not available yet.
   */
  const runSession = async () => {
    if (!loaded) {
      console.log("runSession: aborting (model not loaded yet)")
      return
    }
    if (!session.current) {
      console.log("runSession: aborting (no model loaded)")
      toast({
        title: "Error",
        description: "No model loaded",
        variant: "destructive"
      })
      return
    }
    // console.log("debug:", { audioData, audioDataFrame })
    if (!audioData) {
      console.log("runSession: aborting (no audio file loaded)")
      toast({
        title: "Error",
        description: "No audio file loaded",
        variant: "destructive"
      })
      return
    }

    // the ref is updated right away, the state on the next render
    setTranscribing(transcribingRef.current = true)

    try {
      await session.current.transcribe(audioData, (s: any) => {
        const segment = s as { text: string, start: number, stop: number, last: boolean }
        const text = segment.text.trim()
        console.log("text:", text)
        if (text) {
          setTranscriptBuffer(text)
        }

        if (segment.last) {
          console.log("IS LAST")
          setTranscribing(transcribingRef.current = false)
          return
        }
      })
    } catch (err) {
      const error = `transcription crashed: ${err}`
      console.error(error)
      // the last segment will never arrive, so clear the flag here
      setTranscribing(transcribingRef.current = false)
      toast({
        title: "Error",
        description: error,
        variant: "destructive"
      })
    }
  }

  // restart the background recorder every 3 seconds
  // NOTE(review): presumably to keep the background sample short — confirm
  useInterval(() => {
    // console.log("let's stop, and start again")
    backgroundListener.stopRecording()
    backgroundListener.startRecording()
  }, 3000)

  // on mount: load the model, then start monitoring the background noise
  useEffect(() => {
    const fn = async () => {
      console.log("load model..")
      await loadModel()

      console.log("starting to listen to background noise to detect volume peaks..")
      backgroundListener.startRecording()
    }

    fn()

    // on unmount: cancel a pending end-of-sample timer and release the model
    return () => {
      if (sampleTimerRef.current !== null) {
        clearTimeout(sampleTimerRef.current)
        sampleTimerRef.current = null
        listenerRef.current.isListening = false
      }
      if (session.current) {
        session.current.destroy()
        session.current = null
      }
    }
  }, [])


  // a new audio frame means a new sample is ready to be transcribed
  useEffect(() => {
    if (!audioData) {
      console.log("no audio")
    }
    // console.log("audioDataFrame changed, need to process audioData!")
    runSession()
  }, [audioDataFrame])

  // note: this effect only reacts to "heard something" changes
  // and not to changes to isListening or isSpeaking
  useEffect(() => {
    if (!heardSomething) { return }

    if (listenerRef.current.isListening) {
      // console.log("we are already listening, so skipping..")
      return
    }
    if (isSpeakingRef.current) {
      console.log("we are already busy speaking, so ignoring..")
      return
    }
    setHearing(true)
    // console.log("recording..")
    foregroundListener.startRecording()
    listenerRef.current.hits = 0
    listenerRef.current.isListening = true
    // remember when the sample started, so durationInMs below is a real duration
    listenerRef.current.startedListeningAt = Date.now()

    // the sample lasts 2 seconds; the timer is intentionally NOT cancelled when
    // heardSomething flips back, only on unmount
    sampleTimerRef.current = setTimeout(async () => {
      sampleTimerRef.current = null
      foregroundListener.stopRecording()
      setHearing(false)
      listenerRef.current.isListening = false
      listenerRef.current.stoppedListeningAt = Date.now()
      listenerRef.current.durationInMs =
        listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt

      const hits = listenerRef.current.hits

      // NOTE(review): `foregroundListener` is the value captured when this effect ran,
      // so `audioBlob` here may be the blob of a previous sample — verify against
      // the recorder's API.
      if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
        return
      }

      // not enough loud renders during the sample: most likely not speech
      if (hits <= 11) {
        return
      }

      console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)

      // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
      // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily

      // console.log("got an interesting sample, sending for review")

      // temporary, to prevent infinite loop
      if (listenerRef.current.debugCanContinue) {
        // to prevent the infinite loop, set this value to false
        // listenerRef.current.debugCanContinue = false

        try {
          const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
          const arrayBuffer = await blob.arrayBuffer()
          const uint8Array = new Uint8Array(arrayBuffer)

          setAudioData(uint8Array)
          // bumping the frame counter is what triggers the transcription effect
          setAudioDataFrame(audioDataFrameRef.current + 1)
        } catch (err) {
          const error = `failed to convert the audio sample: ${err}`
          console.error(error)
          toast({
            title: "Error",
            description: error,
            variant: "destructive"
          })
        }
      } else {
        console.log("Julian: infinite loop temporary disabled!")
      }
    }, 2000)
  }, [heardSomething])

  // count one "hit" for every render that happens while a sample is being recorded
  // and the background level is above the threshold
  // NOTE(review): this mutates a ref during render, so the count depends on how often
  // React renders the component; the threshold above relies on that cadence.
  if (heardSomething && listenerRef.current.isListening) {
    listenerRef.current.hits = listenerRef.current.hits + 1
  }

  return (
    <div className="fixed top-80 left-16 z-10 bg-gray-100 p-4">
      {isLoadingModel && !hasLoadedModel 
        ? <p>Loading whisper-turbo: {progress}% done</p>
        : <p>{
          transcriptBuffer
          || ""
          }</p>
      }
    </div>
  )
}