// Source path: vad_go/nx_vad/dsp/streaming_vad/streaming_vad.go
// Repository page header: HoneyTian, commit d9494cc ("update").

package streaming_vad
import (
"fmt"
"math"
)
// Per-frame labels held in FrameDecisionType.lastFrameFlag / thisFrameFlag.
// iota numbers them 0 through 4 in declaration order.
const (
// No speech detected; the label FrameDecisionType.Init and ProcessStart reset to.
FrameFlagSpeechPre = iota
// Enough active time was seen in the speech-start window; awaiting confirmation.
FrameFlagSpeechStart
// Speech start was confirmed; speech is considered ongoing.
FrameFlagSpeechPresent
// Active time in the speech-present window fell below the maintain requirement.
FrameFlagSpeechEnd
// Not referenced by any code visible in this file.
FrameFlagSpeechPost
)
// Utterance-level states / event names, stored in StreamingVadType.thisDetectedState
// and in VadEventMarkerType.VadFlag. Each constant's value is its own name as a string.
const (
VadFlagPrepare = "VadFlagPrepare" // prepare: lead-in position placed shortly before detected speech
VadFlagSpeaking = "VadFlagSpeaking" // speaking
VadFlagPause = "VadFlagPause" // comma-like pause (short pause inside an utterance)
VadFlagNoSpeech = "VadFlagNoSpeech" // period-like pause (end of utterance / no speech)
VadFlagUnknown = "VadFlagUnknown" // unknown state
)
// ParametersForFdType holds the tunable settings of the frame-level speech
// detector (FrameDecisionType). Durations are expressed in seconds.
type ParametersForFdType struct {
	SampleRate          uint32
	Threshold           float32
	MinThreshold        float32
	FrameLengthInSecond float32

	// Start-up rejection periods.
	StartRejectUpdateNoiseLevelTimeInSecond float32
	StartRejectSpeechTimeInSecond           float32

	// Look-back window lengths.
	SpeechStartWindowLengthInSecond      float32 // window used to detect the start of speech
	SpeechPresentWindowLengthInSecond    float32 // window used to detect ongoing speech activity
	SpeechEndConfirmWindowLengthInSecond float32 // window used to confirm the end of speech

	// Active time required inside the corresponding window.
	SpeechStartRequiredLengthInSecond           float32
	SpeechStartConfirmRequiredLengthInSecond    float32
	SpeechPresentMaintainRequiredLengthInSecond float32
	SpeechEndConfirmRequiredLengthInSecond      float32
}

// Init overwrites every field of pd with the built-in default settings
// (8 kHz sample rate, 10 ms frames).
func (pd *ParametersForFdType) Init() {
	*pd = ParametersForFdType{
		SampleRate:          8000,
		Threshold:           150.0,
		MinThreshold:        50.0,
		FrameLengthInSecond: 0.01,

		// Start reject
		StartRejectUpdateNoiseLevelTimeInSecond: 0.2,
		StartRejectSpeechTimeInSecond:           0.25,

		// Window length
		SpeechStartWindowLengthInSecond:      0.15,
		SpeechPresentWindowLengthInSecond:    0.4,
		SpeechEndConfirmWindowLengthInSecond: 0.15,

		// Required length
		SpeechStartRequiredLengthInSecond:           0.09,
		SpeechStartConfirmRequiredLengthInSecond:    0.075,
		SpeechPresentMaintainRequiredLengthInSecond: 0.1,
		SpeechEndConfirmRequiredLengthInSecond:      0.12,
	}
}
// DecisionStateType is one entry of FrameDecisionType.decisionStateDeque:
// the per-frame activity decision together with that frame's start time.
type DecisionStateType struct {
// decisionFlag is the value passed to UpdateDecisionState (true = frame judged active).
decisionFlag bool
// timeInMilliSecond is the frame start time passed to UpdateDecisionState.
timeInMilliSecond uint32
}
// FrameDecisionType is the frame-level detector: it keeps an adaptive energy
// threshold, a ring of recent per-frame decisions, and the label
// (FrameFlag*) of the previous and current frame.
type FrameDecisionType struct {
// params is the configuration copy kept so ProcessStart can restore the initial threshold.
params ParametersForFdType
sampleRate uint32
// threshold is the current adaptive level a frame's rms must exceed; never below minThreshold.
threshold float32
minThreshold float32
adaptFactor float32
// Frame timing and start-up rejection settings.
frameLengthInSecond float32
// noiseLevelValue is the running noise-level estimate updated on every frame.
noiseLevelValue float32
startRejectUpdateNoiseLevelTimeInSecond float32
startRejectUpdateNoiseLevelFrameNumber uint32
startRejectSpeechTimeInSecond float32
startRejectSpeechTimeInMilliSecond uint32
speechStartWindowLengthInSecond float32 // window length needed to detect the start of speech.
speechPresentWindowLengthInSecond float32 // window length needed to detect speech activity.
speechEndConfirmWindowLengthInSecond float32 // window length needed to confirm the end of speech.
speechStartRequiredLengthInSecond float32
speechStartConfirmRequiredLengthInSecond float32
speechPresentMaintainRequiredLengthInSecond float32
speechEndConfirmRequiredLengthInSecond float32
// decisionStateDeque is used as a ring buffer; decisionStateDequeIndex is the next slot to write.
decisionStateDeque []DecisionStateType
decisionStateDequeSize uint32
decisionStateDequeIndex uint32
// processedFramesNumber counts frames handled by SpeechFrameProcess since Init / a threshold reset.
processedFramesNumber uint32
// lastFrameFlag / thisFrameFlag hold FrameFlag* values for the previous and current frame.
lastFrameFlag int
thisFrameFlag int
}
// Init configures the frame-level detector from params and resets its running
// state: the adaptive threshold starts at params.Threshold, the noise level at
// half of it, the decision ring is re-allocated to cover the longest of the
// three look-back windows, and both frame labels become FrameFlagSpeechPre.
func (fd *FrameDecisionType) Init(params ParametersForFdType) {
	fd.params = params

	// Signal format and energy thresholds.
	fd.sampleRate = params.SampleRate
	fd.frameLengthInSecond = params.FrameLengthInSecond
	fd.threshold = params.Threshold
	fd.minThreshold = params.MinThreshold
	fd.adaptFactor = fd.threshold
	fd.noiseLevelValue = fd.threshold / 2.0

	// Start-up rejection: a frame count for the noise-level warm-up and a
	// rounded millisecond deadline before which no frame is marked active.
	fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond
	fd.startRejectUpdateNoiseLevelFrameNumber = uint32(fd.startRejectUpdateNoiseLevelTimeInSecond / fd.frameLengthInSecond)
	fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond
	fd.startRejectSpeechTimeInMilliSecond = uint32(fd.startRejectSpeechTimeInSecond*1e3 + 0.5)

	// Look-back windows and the active time required inside each of them.
	fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond
	fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond
	fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond
	fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond
	fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond
	fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond
	fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond

	// The decision ring must hold as many frames as the longest window spans
	// (rounded to the nearest frame).
	longest := fd.speechStartWindowLengthInSecond
	for _, window := range [...]float32{
		fd.speechPresentWindowLengthInSecond,
		fd.speechEndConfirmWindowLengthInSecond,
	} {
		if window > longest {
			longest = window
		}
	}
	fd.RefreshDecisionStateDeque(uint32(longest/fd.frameLengthInSecond + 0.5))

	fd.processedFramesNumber = 0
	fd.lastFrameFlag, fd.thisFrameFlag = FrameFlagSpeechPre, FrameFlagSpeechPre
}
// ProcessStart resets the detector state; it exists for the case where a long
// stretch of continuous speech has been forcibly cut and detection must start
// over.
//
// The decision ring is always cleared (same size) and both frame labels return
// to FrameFlagSpeechPre. When resetThreshold is true the adaptive threshold,
// adapt factor, noise level and processed-frame counter are additionally
// restored to their initial values.
func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) {
	fd.lastFrameFlag, fd.thisFrameFlag = FrameFlagSpeechPre, FrameFlagSpeechPre
	fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize)

	if !resetThreshold {
		return
	}

	fd.processedFramesNumber = 0
	fd.threshold = fd.params.Threshold
	fd.adaptFactor = fd.threshold
	fd.noiseLevelValue = fd.threshold / 2.0
}
// RefreshDecisionStateDeque replaces the decision ring with a freshly
// allocated, zero-valued one of decisionStateDequeSize entries and rewinds the
// write position to the first slot.
func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) {
	fd.decisionStateDequeIndex = 0
	fd.decisionStateDequeSize = decisionStateDequeSize
	fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize)
}
// UpdateDecisionState records one frame's decision in the ring buffer,
// overwriting the oldest entry, and advances the write position.
//
// frameStartTimeInMilliSecond is the frame's start time and decisionFlag is
// whether the frame was judged active.
//
// When the ring is empty (detector not initialised, or configured with a
// zero-length window) the call is a no-op instead of panicking with an index
// out of range; SumDecisionTrue treats an empty ring the same way.
func (fd *FrameDecisionType) UpdateDecisionState(frameStartTimeInMilliSecond uint32, decisionFlag bool) {
	size := uint32(len(fd.decisionStateDeque))
	if size == 0 {
		return
	}

	// The modulo is a no-op for any position produced by this method; it only
	// protects against a stale index.
	slot := fd.decisionStateDequeIndex % size
	fd.decisionStateDeque[slot] = DecisionStateType{
		decisionFlag:      decisionFlag,
		timeInMilliSecond: frameStartTimeInMilliSecond,
	}
	fd.decisionStateDequeIndex = (slot + 1) % size
}
// SumDecisionTrue returns, in seconds, how much of the most recent
// durationInSecond was judged active.
//
// It walks the decision ring backwards from the latest entry. For every pair
// of neighbouring entries whose newer member is flagged active, the gap
// between their timestamps is added. The walk stops when the entry under the
// cursor is older than the window start (latest timestamp minus the duration,
// floored at 0), or after size-1 steps. An empty ring yields 0.
//
// NOTE(review): slots never written since the last RefreshDecisionStateDeque
// hold timestamp 0, so a gap measured against such a slot equals the newer
// entry's absolute timestamp — confirm this is intended after ProcessStart.
func (fd *FrameDecisionType) SumDecisionTrue(durationInSecond float32) (activeDurationInSecond float32) {
	if len(fd.decisionStateDeque) == 0 {
		return 0.0
	}

	lastSlot := int64(fd.decisionStateDequeSize) - 1
	// stepBack returns the slot written just before pos, wrapping around the ring.
	stepBack := func(pos int64) int64 {
		if pos == 0 {
			return lastSlot
		}
		return pos - 1
	}

	cursor := stepBack(int64(fd.decisionStateDequeIndex))
	newer := fd.decisionStateDeque[cursor]

	windowStart := int64(newer.timeInMilliSecond) - int64(durationInSecond*1e3)
	if windowStart < 0 {
		windowStart = 0
	}

	var activeMilliSecond uint32
	for remaining := fd.decisionStateDequeSize; remaining > 1; remaining-- {
		if newer.timeInMilliSecond < uint32(windowStart) {
			break
		}
		cursor = stepBack(cursor)
		older := fd.decisionStateDeque[cursor]
		if newer.decisionFlag {
			activeMilliSecond += newer.timeInMilliSecond - older.timeInMilliSecond
		}
		newer = older
	}

	activeDurationInSecond = float32(activeMilliSecond) * 1e-3
	return activeDurationInSecond
}
// SpeechFrameProcess processes one audio frame and assigns it a label
// (fd.thisFrameFlag, one of the FrameFlag* values).
//
// frameStartTimeInMilliSecond is the frame's start time; buffer holds the
// frame's samples. The steps are:
//
//  1. Measure the frame energy as the RMS of the mean-removed samples
//     (i.e. their standard deviation).
//  2. Mark the frame active when the energy exceeds both the adaptive
//     threshold and the fixed floor of 400; frames that start before the
//     start-reject time are never active.
//  3. Record the decision and advance the Pre → Start → Present → End → Pre
//     label state machine from the active time seen in the look-back windows.
//  4. Adapt the threshold and the noise-level estimate.
//
// An empty buffer is ignored: it would otherwise produce 0/0 = NaN energy,
// and that NaN would be folded into the threshold and the noise level for
// good, after which no frame could ever be marked active again.
func (fd *FrameDecisionType) SpeechFrameProcess(frameStartTimeInMilliSecond uint32, buffer []int16) {
	if len(buffer) == 0 {
		return
	}

	/**************************Calculate the RMS***************************/
	var sampleSum, squareSum int64
	for _, sample := range buffer {
		value := int64(sample)
		sampleSum += value
		squareSum += value * value
	}
	sampleCount := float64(len(buffer))
	mean := float64(sampleSum) / sampleCount
	variance := float64(squareSum)/sampleCount - mean*mean
	if variance < 0 {
		// Rounding can push a (near-)constant frame marginally below zero;
		// Sqrt of a negative value would be NaN.
		variance = 0
	}
	rms := float32(math.Sqrt(variance))
	/**********************************************************************/

	// No frame counts as active during the start-reject period.
	decisionFlag := frameStartTimeInMilliSecond >= fd.startRejectSpeechTimeInMilliSecond &&
		rms > fd.threshold && rms > 400

	fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag)

	switch fd.thisFrameFlag {
	case FrameFlagSpeechPre:
		if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond {
			fd.thisFrameFlag = FrameFlagSpeechStart
		}
	case FrameFlagSpeechStart:
		if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond {
			fd.thisFrameFlag = FrameFlagSpeechPresent
		} else if fd.speechStartConfirmRequiredLengthInSecond != 0 {
			// TODO (original author): this branch probably never triggers.
			fd.thisFrameFlag = FrameFlagSpeechPre
		}
	case FrameFlagSpeechPresent:
		if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond {
			fd.thisFrameFlag = FrameFlagSpeechEnd
		}
	case FrameFlagSpeechEnd:
		if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond {
			fd.thisFrameFlag = FrameFlagSpeechPre
		} else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond {
			// Original author's note: this condition arguably should be "<"
			// rather than ">="; the intent may have been a short pause here
			// for inserting a comma. Both branches currently return to Pre.
			fd.thisFrameFlag = FrameFlagSpeechPre
		}
	}

	// Threshold adaptation: follow the background while idle and quiet, follow
	// the speech level while speech is present and the frame is active.
	if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag {
		fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold)
		fd.adaptFactor = fd.threshold
	} else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent {
		if rms < fd.adaptFactor {
			fd.adaptFactor = 0.01*rms + 0.99*fd.adaptFactor
		} else {
			fd.adaptFactor = 0.05*rms + 0.95*fd.adaptFactor
		}
		thresholdTemp := fd.noiseLevelValue + 0.3*fd.adaptFactor
		fd.threshold = (0.1 * thresholdTemp) + 0.9*fd.threshold
	}
	if fd.threshold < fd.minThreshold {
		fd.threshold = fd.minThreshold
	}

	// Noise-level update.
	if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber {
		// Warm-up: blend from the measured energy towards the running
		// estimate, then derive the threshold directly from the noise level.
		alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber)
		fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms)
		if fd.noiseLevelValue > 400 {
			fd.noiseLevelValue = fd.noiseLevelValue * 0.1
		}
		fd.threshold = fd.noiseLevelValue * 2
		if fd.threshold < fd.minThreshold {
			fd.threshold = fd.minThreshold
		}
	} else if rms > fd.noiseLevelValue {
		// Rise slowly on louder frames ...
		fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue)
	} else {
		// ... and fall faster on quieter ones.
		fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue)
	}

	fd.processedFramesNumber++
}
// VadEventMarkerType is one event appended to StreamingVadType.VadEventMarkerDeque.
type VadEventMarkerType struct {
// VadFlag is one of the VadFlag* string constants.
VadFlag string
// Time is the event position in milliseconds on the stream's running clock.
Time uint32
}
// StreamingVadType is the streaming voice-activity detector. It splits incoming
// audio into fixed-size frames, labels each frame with a FrameDecisionType, and
// appends utterance-level events to VadEventMarkerDeque.
type StreamingVadType struct{
sampleRate uint32 // sample rate
silenceTime float32 // silence duration required to decide that speech has ended
timeout float32 // maximum length of one utterance; when speech activity lasts longer it is forcibly judged as ended
timeoutInMilliSecond uint32
// VAD runs frame by frame. Each time audio is received, the part left over after
// splitting into frames of frameLength is saved, and on the next call it is
// prepended to the start of the new signal.
frameLength uint32 // length of each frame, in samples
unfinishedFrame []int16 // leftover partial frame
unfinishedFrameSize uint32 // length of the leftover partial frame
frameDecision FrameDecisionType
// Timing rules, all in milliseconds.
startRejectSpeechTimeInMilliSecond uint32
allowedSilenceTimeInSpeechInMilliSecond uint32
allowedLongestSpeechDurationInMilliSecond uint32
minDurationOfLongSpeechInMilliSecond uint32
endOfLongSpeechRequiredSilenceTimeInMilliSecond uint32
endOfNormalSpeechRequiredSilenceTimeInMilliSecond uint32
minDurationOfSpeechToAddCommaInMilliSecond uint32 //where to add comma if speech
// Speech is only known to have started once the VadFlagSpeaking label is detected.
// At that point the position prepareDurationInMilliSecond earlier is taken as the VadFlagPrepare position.
prepareDurationInMilliSecond uint32
// When the end of speech is detected it is not reported immediately;
// a period of silence (SpeechEndConfirmWindowLengthInSecond) is needed to confirm it first.
// The speech-end event is placed nonSpeechPadInInMilliSecond after the detected `end of speech`.
nonSpeechPadInInMilliSecond uint32
// speechFrameGlobalTimeInMilliSecond is the running clock, advanced once per processed frame.
speechFrameGlobalTimeInMilliSecond uint32
speechDetectedStartTimeInMilliSecond uint32
speechDetectedStartTimeIsValid bool
speechDetectedEndTimeInMilliSecond uint32
speechDetectedEndTimeIsValid bool
speechDetectedEndTimeIsValidPossible bool
speechDetectedStartAndEnd bool
// Utterance-level state and emitted events.
lastVadEndTimeInMilliSecond uint32
thisDetectedState string //VadFlag
VadEventMarkerDeque []VadEventMarkerType
}
// Init prepares the streaming detector for a new stream.
//
// sampleRate is the sample rate in Hz; frames are 20 ms long
// (frameLength = 0.02 * sampleRate samples). silenceTime is the silence, in
// seconds, required after speech before the utterance is reported as ended
// (typical value 0.4). timeout is the maximum utterance length in seconds
// before it is forcibly cut (typical value 3.0).
//
// The following conditions should hold:
//
//	minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond
//	endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond
//
// Init leaves the long-speech settings at 0, which disables the long-speech
// rule in ProcessSpeech. It resets the running clock, the detection state and
// any buffered partial frame; it does not clear VadEventMarkerDeque.
//
// NOTE(review): the frame detector keeps the default
// ParametersForFdType.FrameLengthInSecond of 0.01 while the frames fed to it
// are 0.02 s long — confirm this mismatch is intentional.
func (sv *StreamingVadType) Init(sampleRate uint32, silenceTime float32, timeout float32) {
	sv.sampleRate = sampleRate
	// Keep the configured silence time on the struct; the field exists but was
	// previously never assigned.
	sv.silenceTime = silenceTime
	sv.timeout = timeout
	sv.timeoutInMilliSecond = uint32(timeout * 1e3)

	sv.frameLength = uint32(0.02 * float32(sampleRate))
	// Drop the leftover samples together with their count, so a re-initialised
	// detector does not prepend audio from a previous stream.
	sv.unfinishedFrame = nil
	sv.unfinishedFrameSize = 0

	var params ParametersForFdType
	params.Init()
	params.SampleRate = sampleRate
	sv.frameDecision.Init(params)

	// Timing rules.
	sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3)
	sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3)
	sv.minDurationOfLongSpeechInMilliSecond = 0
	sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0
	sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3)
	sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3)
	sv.allowedLongestSpeechDurationInMilliSecond = 0

	// The prepare marker is placed two speech-start windows before the
	// speaking marker; the end marker half the silence time after the
	// detected end.
	sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond*1e3) * 2
	sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5)

	// Running clock and detection state.
	sv.speechFrameGlobalTimeInMilliSecond = 0
	sv.speechDetectedStartTimeInMilliSecond = 0
	sv.speechDetectedStartTimeIsValid = false
	sv.speechDetectedEndTimeInMilliSecond = 0
	sv.speechDetectedEndTimeIsValid = false
	sv.speechDetectedEndTimeIsValidPossible = false
	sv.speechDetectedStartAndEnd = false

	sv.lastVadEndTimeInMilliSecond = 0
	sv.thisDetectedState = VadFlagNoSpeech

	fmt.Println("do StreamingVad Init...")
}
// ProcessSpeechByChunk feeds a chunk of samples of any length to the detector.
//
// Samples left over from the previous call are prepended to buffer. Every
// complete frame (sv.frameLength samples) is handed to ProcessSpeech, and the
// remaining tail is kept for the next call.
//
// It returns an error when the frame length is 0 (detector not initialised, or
// initialised with a sample rate too low to yield a frame); previously this
// case panicked with an integer division by zero.
func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) {
	if sv.frameLength == 0 {
		return fmt.Errorf("streaming vad: frame length is 0, call Init with a valid sample rate first")
	}

	pending := make([]int16, 0, len(sv.unfinishedFrame)+len(buffer))
	pending = append(pending, sv.unfinishedFrame...)
	pending = append(pending, buffer...)

	total := uint32(len(pending))
	boundary := total - total%sv.frameLength

	// Copy the tail so the small leftover does not keep the whole chunk alive.
	sv.unfinishedFrame = append([]int16(nil), pending[boundary:]...)
	sv.unfinishedFrameSize = total - boundary

	// Process whenever at least one complete frame is available. The former
	// "> frameLength" test skipped a chunk holding exactly one frame after the
	// leftover had already been trimmed, silently dropping that frame.
	if boundary > 0 {
		sv.ProcessSpeech(pending[:boundary])
	}
	return nil
}
// DeprecatedProcessSpeechByChunk trims buffer (plus the samples left over from
// the previous call) to a multiple of frameLength, passes the complete frames
// to ProcessSpeech, and saves the excess for the next call.
//
// It returns an error when the frame length is 0 instead of panicking with an
// integer division by zero.
//
// Deprecated: use ProcessSpeechByChunk instead.
func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) {
	if sv.frameLength == 0 {
		return fmt.Errorf("streaming vad: frame length is 0, call Init with a valid sample rate first")
	}

	// Work on leftover+buffer as one sequence. The earlier index arithmetic
	// read the new leftover from one sample too early, discarded the old
	// leftover when no full frame was available, and could underflow.
	pending := make([]int16, 0, len(sv.unfinishedFrame)+len(buffer))
	pending = append(pending, sv.unfinishedFrame...)
	pending = append(pending, buffer...)

	total := uint32(len(pending))
	validSize := total / sv.frameLength * sv.frameLength
	if validSize >= sv.frameLength {
		sv.ProcessSpeech(pending[:validSize])
	}

	// Everything past the last complete frame waits for the next call.
	sv.unfinishedFrame = append([]int16(nil), pending[validSize:]...)
	sv.unfinishedFrameSize = total - validSize

	fmt.Println("do StreamingVad ProcessSpeechByChunk...")
	return nil
}
// ProcessSpeech determines the VAD state from the previous frame's label and
// the current frame's label.
//
// buffer must hold a whole number of frames (a multiple of sv.frameLength
// samples); otherwise the function panics. For each frame it:
//   - labels the frame via frameDecision.SpeechFrameProcess at the current running time;
//   - on a SpeechStart -> SpeechPresent transition, appends Prepare and Speaking
//     markers (when idle), or Pause and Speaking markers (when resuming after a
//     gap inside the comma range), or simply cancels a pending end (short gap);
//   - on a SpeechEnd -> SpeechPre transition, remembers the time as a pending end;
//   - once the pending end has been followed by enough silence, appends a
//     NoSpeech marker at end + nonSpeechPadInInMilliSecond;
//   - when the utterance exceeds timeoutInMilliSecond, appends a NoSpeech marker
//     and immediately opens a new utterance (Prepare + Speaking markers);
//   - advances lastFrameFlag and the running time by one frame.
// Markers are appended to sv.VadEventMarkerDeque.
func (sv *StreamingVadType) ProcessSpeech(buffer []int16) {
bufferLength := uint32(len(buffer))
if bufferLength % sv.frameLength != 0 {
panic(fmt.Sprintf("bufferLength (%d) should be a multiple of B frameLength (%d)", bufferLength, sv.frameLength))
}
var frameBuffer []int16
for begin := uint32(0); begin + sv.frameLength <= bufferLength; {
frameBuffer = buffer[begin: begin + sv.frameLength]
sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer)
begin += sv.frameLength
// Speech start was just confirmed by the frame detector.
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent {
if sv.thisDetectedState == VadFlagNoSpeech {
//start
// Prepare position: prepareDurationInMilliSecond before now, floored at 0
// and never earlier than the end of the previous utterance.
var prepareTime uint32 = 0
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
}
if prepareTime < sv.lastVadEndTimeInMilliSecond {
prepareTime = sv.lastVadEndTimeInMilliSecond
}
vadEventMarker := VadEventMarkerType{
VadFlag: VadFlagPrepare,
Time: prepareTime,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
sv.thisDetectedState = VadFlagSpeaking
vadEventMarker = VadEventMarkerType{
VadFlag: VadFlagSpeaking,
Time: sv.speechFrameGlobalTimeInMilliSecond,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
// Clear any pending end and record where this utterance started.
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
//sv.speechDetectedEndTimeIsValidPossible = false
// The start is dated one speech-start window before now (unsigned subtraction).
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
sv.speechDetectedStartTimeIsValid = true
} else if sv.thisDetectedState == VadFlagSpeaking && sv.speechDetectedEndTimeIsValid &&
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond &&
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond {
//pause
// Speech resumed after a gap longer than the comma minimum but shorter than
// the end-of-speech silence: mark a pause at the pending end, then speaking again.
vadEventMarker := VadEventMarkerType{
VadFlag: VadFlagPause,
Time: sv.speechDetectedEndTimeInMilliSecond,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
sv.thisDetectedState = VadFlagSpeaking
vadEventMarker = VadEventMarkerType{
VadFlag: VadFlagSpeaking,
Time: sv.speechFrameGlobalTimeInMilliSecond,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
// The pending end has been consumed by the pause marker.
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
//sv.speechDetectedEndTimeIsValidPossible = false
} else if sv.thisDetectedState == VadFlagSpeaking &&
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond {
// Gap too short for a comma: just cancel the pending end, no marker.
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
//sv.speechDetectedEndTimeIsValidPossible = false
} else {}
}
//end
// The frame detector left the SpeechEnd label: remember now as a pending end.
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre {
sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
sv.speechDetectedEndTimeIsValid = true
//sv.speechDetectedEndTimeIsValidPossible = true
}
// A VAD end can only be detected once a certain time has passed since the start of the stream.
if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond {
if sv.speechDetectedEndTimeIsValid {
// Choose the required silence: the long-speech value applies only when both
// long-speech settings are non-zero and the utterance is long enough.
var endOfSpeechRequiredSilenceTime uint32
if sv.minDurationOfLongSpeechInMilliSecond > 0 &&
sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 &&
(sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond {
endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond
} else {
endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond
}
// Enough silence since the pending end: close the utterance.
if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime {
endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
sv.speechDetectedStartTimeInMilliSecond = uint32(0)
sv.speechDetectedStartTimeIsValid = false
sv.thisDetectedState = VadFlagNoSpeech
sv.lastVadEndTimeInMilliSecond = endTime
vadEventMarker := VadEventMarkerType{
VadFlag: VadFlagNoSpeech,
Time: endTime,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
}
}
}
// When the speech duration exceeds the timeout, cut it forcibly.
if sv.speechDetectedStartTimeIsValid && sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond {
//end
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
sv.speechDetectedStartTimeInMilliSecond = uint32(0)
sv.speechDetectedStartTimeIsValid = false
sv.thisDetectedState = VadFlagNoSpeech
sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond
vadEventMarker := VadEventMarkerType{
VadFlag: VadFlagNoSpeech,
Time: sv.speechFrameGlobalTimeInMilliSecond,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
//start
// Immediately open a new utterance. lastVadEndTimeInMilliSecond was just set
// to the current time, so the floor below puts the prepare marker at now.
var prepareTime uint32 = 0
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond {
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond
}
if prepareTime < sv.lastVadEndTimeInMilliSecond {
prepareTime = sv.lastVadEndTimeInMilliSecond
}
vadEventMarker = VadEventMarkerType{
VadFlag: VadFlagPrepare,
Time: prepareTime,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
sv.thisDetectedState = VadFlagSpeaking
vadEventMarker = VadEventMarkerType{
VadFlag: VadFlagSpeaking,
Time: sv.speechFrameGlobalTimeInMilliSecond,
}
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker)
// Restart the utterance timer, dated one speech-start window before now.
sv.speechDetectedEndTimeInMilliSecond = uint32(0)
sv.speechDetectedEndTimeIsValid = false
//sv.speechDetectedEndTimeIsValidPossible = false
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3)
sv.speechDetectedStartTimeIsValid = true
}
//loop
// Advance to the next frame: remember this frame's label and move the running
// time forward by one frame duration (truncated to whole milliseconds).
sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag
sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3)
}
}