Spaces:
Sleeping
Sleeping
package streaming_vad | |
import ( | |
"fmt" | |
"math" | |
) | |
// Per-frame labels held in FrameDecisionType.thisFrameFlag and
// FrameDecisionType.lastFrameFlag. The values are consecutive untyped
// integer constants starting at 0.
const (
	// FrameFlagSpeechPre is the "no speech" label; the detector starts in
	// this state and returns to it once an end of speech is confirmed.
	FrameFlagSpeechPre = iota
	// FrameFlagSpeechStart is a tentative speech onset awaiting confirmation.
	FrameFlagSpeechStart
	// FrameFlagSpeechPresent is confirmed, ongoing speech.
	FrameFlagSpeechPresent
	// FrameFlagSpeechEnd is a tentative end of speech awaiting confirmation.
	FrameFlagSpeechEnd
	// FrameFlagSpeechPost is declared for completeness; the visible
	// detection logic never assigns it.
	FrameFlagSpeechPost
)
// VAD states carried in VadEventMarkerType.VadFlag.
const (
	// VadFlagPrepare marks the lead-in position just before detected speech.
	VadFlagPrepare = "VadFlagPrepare"
	// VadFlagSpeaking marks ongoing speech.
	VadFlagSpeaking = "VadFlagSpeaking"
	// VadFlagPause marks a short, comma-like pause inside an utterance.
	VadFlagPause = "VadFlagPause"
	// VadFlagNoSpeech marks a long, full-stop-like pause, i.e. the end of an utterance.
	VadFlagNoSpeech = "VadFlagNoSpeech"
	// VadFlagUnknown marks an unknown state.
	VadFlagUnknown = "VadFlagUnknown"
)
// ParametersForFdType holds the tunable settings that FrameDecisionType.Init
// copies into the frame classifier. Every *InSecond field is a duration in
// seconds.
type ParametersForFdType struct {
	// SampleRate is the audio sample rate in samples per second.
	SampleRate uint32
	// Threshold is the initial RMS level a frame must exceed to count as active.
	Threshold float32
	// MinThreshold is the floor the adaptive threshold is clamped to.
	MinThreshold float32
	// FrameLengthInSecond is the nominal duration of one analysis frame; it
	// is used to convert durations into frame counts.
	FrameLengthInSecond float32

	// StartRejectUpdateNoiseLevelTimeInSecond is the length of the initial
	// period during which the noise level is seeded from the incoming frames.
	StartRejectUpdateNoiseLevelTimeInSecond float32
	// StartRejectSpeechTimeInSecond is the length of the initial period
	// during which every frame is classified as inactive.
	StartRejectSpeechTimeInSecond float32

	// SpeechStartWindowLengthInSecond is the window inspected to detect a speech start.
	SpeechStartWindowLengthInSecond float32
	// SpeechPresentWindowLengthInSecond is the window inspected to detect ongoing speech activity.
	SpeechPresentWindowLengthInSecond float32
	// SpeechEndConfirmWindowLengthInSecond is the window inspected to confirm a speech end.
	SpeechEndConfirmWindowLengthInSecond float32

	// SpeechStartRequiredLengthInSecond is the active time the start window
	// must exceed to enter the tentative-start state.
	SpeechStartRequiredLengthInSecond float32
	// SpeechStartConfirmRequiredLengthInSecond is the active time the start
	// window must exceed to confirm the start.
	SpeechStartConfirmRequiredLengthInSecond float32
	// SpeechPresentMaintainRequiredLengthInSecond is the active time the
	// present window must reach for speech to be considered ongoing.
	SpeechPresentMaintainRequiredLengthInSecond float32
	// SpeechEndConfirmRequiredLengthInSecond is the active-time ceiling in
	// the end-confirm window at or below which the end is confirmed.
	SpeechEndConfirmRequiredLengthInSecond float32
}

// Init overwrites every field of pd with the built-in defaults
// (8 kHz audio, 10 ms frames).
func (pd *ParametersForFdType) Init() {
	*pd = ParametersForFdType{
		SampleRate:          8000,
		Threshold:           150.0,
		MinThreshold:        50.0,
		FrameLengthInSecond: 0.01,

		// Start reject.
		StartRejectUpdateNoiseLevelTimeInSecond: 0.2,
		StartRejectSpeechTimeInSecond:           0.25,

		// Window lengths.
		SpeechStartWindowLengthInSecond:      0.15,
		SpeechPresentWindowLengthInSecond:    0.4,
		SpeechEndConfirmWindowLengthInSecond: 0.15,

		// Required lengths.
		SpeechStartRequiredLengthInSecond:           0.09,
		SpeechStartConfirmRequiredLengthInSecond:    0.075,
		SpeechPresentMaintainRequiredLengthInSecond: 0.1,
		SpeechEndConfirmRequiredLengthInSecond:      0.12,
	}
}
// DecisionStateType is one entry of the frame-decision history: whether a
// frame was classified as active, and the frame's start time.
type DecisionStateType struct {
	// decisionFlag is true when the frame was classified as active.
	decisionFlag bool
	// timeInMilliSecond is the frame start time in milliseconds.
	timeInMilliSecond uint32
}
type FrameDecisionType struct { | |
params ParametersForFdType | |
sampleRate uint32 | |
threshold float32 | |
minThreshold float32 | |
adaptFactor float32 | |
// | |
frameLengthInSecond float32 | |
noiseLevelValue float32 | |
startRejectUpdateNoiseLevelTimeInSecond float32 | |
startRejectUpdateNoiseLevelFrameNumber uint32 | |
startRejectSpeechTimeInSecond float32 | |
startRejectSpeechTimeInMilliSecond uint32 | |
speechStartWindowLengthInSecond float32 //检测语音开始,所需的窗口长度. | |
speechPresentWindowLengthInSecond float32 //检测语音活动,所需的窗口长度. | |
speechEndConfirmWindowLengthInSecond float32 //确认语音结束,所需的窗口长度. | |
speechStartRequiredLengthInSecond float32 | |
speechStartConfirmRequiredLengthInSecond float32 | |
speechPresentMaintainRequiredLengthInSecond float32 | |
speechEndConfirmRequiredLengthInSecond float32 | |
decisionStateDeque []DecisionStateType | |
decisionStateDequeSize uint32 | |
decisionStateDequeIndex uint32 | |
processedFramesNumber uint32 | |
lastFrameFlag int | |
thisFrameFlag int | |
} | |
func (fd *FrameDecisionType) Init (params ParametersForFdType) { | |
fd.params = params | |
fd.sampleRate = params.SampleRate | |
fd.threshold = params.Threshold | |
fd.minThreshold = params.MinThreshold | |
fd.adaptFactor = fd.threshold | |
fd.frameLengthInSecond = params.FrameLengthInSecond | |
fd.noiseLevelValue = fd.threshold / 2.0 | |
fd.startRejectUpdateNoiseLevelTimeInSecond = params.StartRejectUpdateNoiseLevelTimeInSecond | |
fd.startRejectUpdateNoiseLevelFrameNumber = uint32(fd.startRejectUpdateNoiseLevelTimeInSecond / fd.frameLengthInSecond) | |
fd.startRejectSpeechTimeInSecond = params.StartRejectSpeechTimeInSecond | |
fd.startRejectSpeechTimeInMilliSecond = uint32(fd.startRejectSpeechTimeInSecond * 1e3 + 0.5) | |
fd.speechStartWindowLengthInSecond = params.SpeechStartWindowLengthInSecond | |
fd.speechPresentWindowLengthInSecond = params.SpeechPresentWindowLengthInSecond | |
fd.speechEndConfirmWindowLengthInSecond = params.SpeechEndConfirmWindowLengthInSecond | |
fd.speechStartRequiredLengthInSecond = params.SpeechStartRequiredLengthInSecond | |
fd.speechStartConfirmRequiredLengthInSecond = params.SpeechStartConfirmRequiredLengthInSecond | |
fd.speechPresentMaintainRequiredLengthInSecond = params.SpeechPresentMaintainRequiredLengthInSecond | |
fd.speechEndConfirmRequiredLengthInSecond = params.SpeechEndConfirmRequiredLengthInSecond | |
//initialize: decisionStateDeque, decisionStateDequeSize, decisionStateDequeIndex | |
largestWindowLengthInSecond := fd.speechStartWindowLengthInSecond | |
if largestWindowLengthInSecond < fd.speechPresentWindowLengthInSecond { | |
largestWindowLengthInSecond = fd.speechPresentWindowLengthInSecond | |
} | |
if largestWindowLengthInSecond < fd.speechEndConfirmWindowLengthInSecond { | |
largestWindowLengthInSecond = fd.speechEndConfirmWindowLengthInSecond | |
} | |
decisionStateDequeSize := uint32(largestWindowLengthInSecond / fd.frameLengthInSecond + 0.5) | |
fd.RefreshDecisionStateDeque(decisionStateDequeSize) | |
fd.processedFramesNumber = 0 | |
fd.lastFrameFlag = FrameFlagSpeechPre | |
fd.thisFrameFlag = FrameFlagSpeechPre | |
} | |
/* | |
ProcessStart 当连续语音太长被强制截断时, 就需要有一个方法来重置状态 | |
*/ | |
func (fd *FrameDecisionType) ProcessStart(resetThreshold bool) { | |
fd.RefreshDecisionStateDeque(fd.decisionStateDequeSize) | |
if resetThreshold { | |
fd.threshold = fd.params.Threshold | |
fd.adaptFactor = fd.threshold | |
fd.noiseLevelValue = fd.threshold / 2.0 | |
fd.processedFramesNumber = 0 | |
} | |
fd.lastFrameFlag = FrameFlagSpeechPre | |
fd.thisFrameFlag = FrameFlagSpeechPre | |
} | |
func (fd *FrameDecisionType) RefreshDecisionStateDeque(decisionStateDequeSize uint32) { | |
fd.decisionStateDeque = make([]DecisionStateType, decisionStateDequeSize) | |
fd.decisionStateDequeSize = decisionStateDequeSize | |
fd.decisionStateDequeIndex = 0 | |
} | |
func (fd *FrameDecisionType) UpdateDecisionState (frameStartTimeInMilliSecond uint32, decisionFlag bool) { | |
fd.decisionStateDeque[fd.decisionStateDequeIndex].decisionFlag = decisionFlag | |
fd.decisionStateDeque[fd.decisionStateDequeIndex].timeInMilliSecond = frameStartTimeInMilliSecond | |
fd.decisionStateDequeIndex = (fd.decisionStateDequeIndex + 1) % fd.decisionStateDequeSize | |
} | |
func (fd *FrameDecisionType) SumDecisionTrue (durationInSecond float32) (activeDurationInSecond float32) { | |
if len(fd.decisionStateDeque) == 0 { | |
return 0.0 | |
} | |
indexTemp := int64(fd.decisionStateDequeIndex) - 1 | |
if indexTemp < 0 { | |
indexTemp = int64(fd.decisionStateDequeSize) - 1 | |
} | |
decisionFlag := fd.decisionStateDeque[indexTemp].decisionFlag | |
endInMilliSecond := int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond) | |
beginInMilliSecond := endInMilliSecond - int64(durationInSecond * 1e3) | |
if beginInMilliSecond < 0 { | |
beginInMilliSecond = 0 | |
} | |
var timeSum uint32 = 0 | |
for i := uint32(1); i < fd.decisionStateDequeSize; i++ { | |
if fd.decisionStateDeque[indexTemp].timeInMilliSecond < uint32(beginInMilliSecond) { | |
break | |
} | |
indexTemp-- | |
if indexTemp < 0 { | |
indexTemp = int64(fd.decisionStateDequeSize) - 1 | |
} | |
if decisionFlag { | |
timeSum += uint32(endInMilliSecond) - fd.decisionStateDeque[indexTemp].timeInMilliSecond | |
} | |
decisionFlag = fd.decisionStateDeque[indexTemp].decisionFlag | |
endInMilliSecond = int64(fd.decisionStateDeque[indexTemp].timeInMilliSecond) | |
} | |
activeDurationInSecond = float32(timeSum) * 1e-3 | |
return activeDurationInSecond | |
} | |
/* | |
SpeechFrameProcess 处理一帧音频, 给该帧音频配一个标签. | |
*/ | |
func (fd *FrameDecisionType) SpeechFrameProcess (frameStartTimeInMilliSecond uint32, buffer []int16) { | |
bufferSize := uint32(len(buffer)) | |
/**************************Calculate the RMS***************************/ | |
sumTemp := int64(0) | |
ssqTemp := int64(0) | |
for i := uint32(0); i < bufferSize; i++ { | |
sumTemp = sumTemp + int64(buffer[i]) | |
ssqTemp = ssqTemp + int64(buffer[i]) * int64(buffer[i]) | |
} | |
sum := float64(sumTemp) | |
sum /= float64(bufferSize) | |
ssq := float64(ssqTemp) | |
rms := float32(math.Sqrt((ssq / float64(bufferSize)) - (sum * sum))) | |
//fmt.Printf("rms %f\n", rms) | |
/**********************************************************************/ | |
var decisionFlag bool | |
if frameStartTimeInMilliSecond < fd.startRejectSpeechTimeInMilliSecond { | |
decisionFlag = false | |
} else { | |
decisionFlag = rms > fd.threshold && rms > 400 | |
} | |
//fmt.Printf("decisionFlag %t\n", decisionFlag) | |
fd.UpdateDecisionState(frameStartTimeInMilliSecond, decisionFlag) | |
if fd.thisFrameFlag == FrameFlagSpeechPre { | |
if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartRequiredLengthInSecond { | |
fd.thisFrameFlag = FrameFlagSpeechStart | |
} | |
} else if fd.thisFrameFlag == FrameFlagSpeechStart { | |
if fd.SumDecisionTrue(fd.speechStartWindowLengthInSecond) > fd.speechStartConfirmRequiredLengthInSecond { | |
fd.thisFrameFlag = FrameFlagSpeechPresent | |
} else { | |
//TODO: 感觉这一部分是不会触发的吧. | |
if fd.speechStartConfirmRequiredLengthInSecond != 0 { | |
fd.thisFrameFlag = FrameFlagSpeechPre | |
} | |
} | |
} else if fd.thisFrameFlag == FrameFlagSpeechPresent { | |
if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) < fd.speechPresentMaintainRequiredLengthInSecond { | |
fd.thisFrameFlag = FrameFlagSpeechEnd | |
} | |
} else if fd.thisFrameFlag == FrameFlagSpeechEnd { | |
if fd.SumDecisionTrue(fd.speechEndConfirmWindowLengthInSecond) <= fd.speechEndConfirmRequiredLengthInSecond { | |
fd.thisFrameFlag = FrameFlagSpeechPre | |
} else if fd.SumDecisionTrue(fd.speechPresentWindowLengthInSecond) >= fd.speechPresentMaintainRequiredLengthInSecond { | |
//fd.thisFrameFlag = FrameFlagSpeechPre | |
//我感觉这里的条件判断应该是 < 而不是 >=. | |
//有可能他是想在这里添加一个短暂的停顿,用于添加逗号. | |
fd.thisFrameFlag = FrameFlagSpeechPre | |
} | |
} | |
// | |
if fd.thisFrameFlag == FrameFlagSpeechPre && !decisionFlag { | |
fd.threshold = (0.02 * rms * 2) + (0.98 * fd.threshold) | |
fd.adaptFactor = fd.threshold | |
} else if decisionFlag && fd.thisFrameFlag == FrameFlagSpeechPresent { | |
if rms < fd.adaptFactor { | |
fd.adaptFactor = 0.01 * rms + 0.99 * fd.adaptFactor | |
} else { | |
fd.adaptFactor = 0.05 * rms + 0.95 * fd.adaptFactor | |
} | |
thresholdTemp := fd.noiseLevelValue + 0.3 * fd.adaptFactor | |
fd.threshold = (0.1 * thresholdTemp) + 0.9 * fd.threshold | |
} | |
// | |
if fd.threshold < fd.minThreshold { | |
fd.threshold = fd.minThreshold | |
} | |
// Update the Threshold | |
if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber { | |
alphaAdapt := float32(fd.processedFramesNumber) / float32(fd.startRejectUpdateNoiseLevelFrameNumber) | |
fd.noiseLevelValue = (alphaAdapt * fd.noiseLevelValue) + ((1 - alphaAdapt) * rms) | |
} else { | |
if rms > fd.noiseLevelValue { | |
fd.noiseLevelValue = (0.001 * rms) + (0.999 * fd.noiseLevelValue) | |
} else { | |
fd.noiseLevelValue = (0.05 * rms) + (0.95 * fd.noiseLevelValue) | |
} | |
} | |
if fd.processedFramesNumber < fd.startRejectUpdateNoiseLevelFrameNumber { | |
if fd.noiseLevelValue > 400 { | |
fd.noiseLevelValue = fd.noiseLevelValue * 0.1 | |
} | |
fd.threshold = fd.noiseLevelValue * 2 | |
if fd.threshold < fd.minThreshold { | |
fd.threshold = fd.minThreshold | |
} | |
} | |
fd.processedFramesNumber++ | |
} | |
// VadEventMarkerType is one VAD event: a state change and the stream time it
// applies to.
type VadEventMarkerType struct {
	// VadFlag is one of the VadFlag* state names.
	VadFlag string
	// Time is the event position in milliseconds of processed audio.
	Time uint32
}
type StreamingVadType struct{ | |
sampleRate uint32 //采样率 | |
silenceTime float32 //判断语音结束时需要的静音时长 | |
timeout float32 //单个语音的最大长度. 语音活动时长超过时, 将被强制判断定为结束 | |
timeoutInMilliSecond uint32 | |
//VAD检测按帧指定, 每一次接收到音频信号时, 需要将 frameLength 的余数部分保存起来, 下一次则将要将余下的部分拼接到信号的开始位置. | |
frameLength uint32 //每一帖的长度 | |
unfinishedFrame []int16 //剩余帧 | |
unfinishedFrameSize uint32 //剩余帧长度 | |
frameDecision FrameDecisionType | |
// | |
startRejectSpeechTimeInMilliSecond uint32 | |
allowedSilenceTimeInSpeechInMilliSecond uint32 | |
allowedLongestSpeechDurationInMilliSecond uint32 | |
minDurationOfLongSpeechInMilliSecond uint32 | |
endOfLongSpeechRequiredSilenceTimeInMilliSecond uint32 | |
endOfNormalSpeechRequiredSilenceTimeInMilliSecond uint32 | |
minDurationOfSpeechToAddCommaInMilliSecond uint32 //where to add comma if speech | |
//只有检测到 VadFlagSpeaking 标签时, 才知道语音已经开始了, | |
//此时向前推 prepareDurationInMilliSecond 的时间, 将其定义为 VadFlagPrepare 的位置. | |
prepareDurationInMilliSecond uint32 | |
//检测到语音结束时, 并不会马上判断语音结束, | |
//而是需要一定时间 SpeechEndConfirmWindowLengthInSecond 的静音后再确认它, | |
//语音结束的事件节点是`语音结束`后 nonSpeechPadInInMilliSecond 的时间位置. | |
nonSpeechPadInInMilliSecond uint32 | |
speechFrameGlobalTimeInMilliSecond uint32 | |
speechDetectedStartTimeInMilliSecond uint32 | |
speechDetectedStartTimeIsValid bool | |
speechDetectedEndTimeInMilliSecond uint32 | |
speechDetectedEndTimeIsValid bool | |
speechDetectedEndTimeIsValidPossible bool | |
speechDetectedStartAndEnd bool | |
// | |
lastVadEndTimeInMilliSecond uint32 | |
thisDetectedState string //VadFlag | |
VadEventMarkerDeque []VadEventMarkerType | |
} | |
/* | |
silenceTime: 0.4 | |
timeout: 3.0 | |
以下条件应满足: | |
minDurationOfSpeechToAddCommaInMilliSecond < endOfNormalSpeechRequiredSilenceTimeInMilliSecond | |
endOfNormalSpeechRequiredSilenceTimeInMilliSecond < endOfLongSpeechRequiredSilenceTimeInMilliSecond | |
*/ | |
func (sv *StreamingVadType) Init (sampleRate uint32, silenceTime float32, timeout float32) { | |
sv.sampleRate = sampleRate | |
sv.timeout = timeout | |
sv.timeoutInMilliSecond = uint32(timeout * 1e3) | |
sv.frameLength = uint32(0.02 * float32(sampleRate)) | |
sv.unfinishedFrameSize = 0 | |
var params ParametersForFdType | |
params.Init() | |
params.SampleRate = sampleRate | |
sv.frameDecision.Init(params) | |
// | |
sv.startRejectSpeechTimeInMilliSecond = uint32(0.7 * 1e3) | |
sv.allowedSilenceTimeInSpeechInMilliSecond = uint32(0.2 * 1e3) | |
sv.minDurationOfLongSpeechInMilliSecond = 0 | |
sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond = 0 | |
sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond = uint32(silenceTime * 1e3) | |
sv.minDurationOfSpeechToAddCommaInMilliSecond = uint32(0.3 * 1e3) | |
sv.allowedLongestSpeechDurationInMilliSecond = 0 | |
sv.prepareDurationInMilliSecond = uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) * 2 | |
sv.nonSpeechPadInInMilliSecond = uint32(silenceTime * 1e3 * 0.5) | |
sv.speechFrameGlobalTimeInMilliSecond = 0 | |
// | |
sv.speechDetectedStartTimeInMilliSecond = 0 | |
sv.speechDetectedStartTimeIsValid = false | |
sv.speechDetectedEndTimeInMilliSecond = 0 | |
sv.speechDetectedEndTimeIsValid = false | |
sv.speechDetectedEndTimeIsValidPossible = false | |
sv.speechDetectedStartAndEnd = false | |
// | |
sv.lastVadEndTimeInMilliSecond = 0 | |
sv.thisDetectedState = VadFlagNoSpeech | |
fmt.Println("do StreamingVad Init...") | |
} | |
func (sv *StreamingVadType) ProcessSpeechByChunk(buffer []int16) (err error) { | |
var validBuffer []int16 | |
//unfinished frame | |
unfinishedFrameSize := uint32(len(sv.unfinishedFrame)) | |
for i := uint32(0); i < unfinishedFrameSize; i++ { | |
validBuffer = append(validBuffer, sv.unfinishedFrame[i]) | |
} | |
//buffer | |
bufferSize := uint32(len(buffer)) | |
for i := uint32(0); i < bufferSize; i++ { | |
validBuffer = append(validBuffer, buffer[i]) | |
} | |
//remainder | |
remainderSize := uint32(len(validBuffer)) % sv.frameLength | |
boundary := uint32(len(validBuffer)) - remainderSize | |
sv.unfinishedFrame = validBuffer[boundary:] | |
validBuffer = validBuffer[:boundary] | |
if uint32(len(validBuffer)) > sv.frameLength { | |
sv.ProcessSpeech(validBuffer) | |
} | |
return nil | |
} | |
//ProcessSpeechByChunk 需要将 buffer 更新成指定 frameLength 的倍数, 多余的部分保存起来以供下次使用. | |
func (sv *StreamingVadType) DeprecatedProcessSpeechByChunk(buffer []int16) (err error) { | |
bufferSize := uint32(len(buffer)) | |
var validBuffer []int16 | |
var unfinishedFrame []int16 | |
var point int16 | |
validSize := (uint32(len(buffer)) + sv.unfinishedFrameSize) / sv.frameLength * sv.frameLength | |
if validSize >= sv.frameLength { | |
if sv.unfinishedFrameSize != 0 { | |
for i := uint32(0); i < sv.unfinishedFrameSize; i++ { | |
point = sv.unfinishedFrame[i] | |
validBuffer = append(validBuffer, point) | |
} | |
for i := uint32(0); i < validSize - sv.unfinishedFrameSize; i++ { | |
point = buffer[i] | |
validBuffer = append(validBuffer, point) | |
} | |
} else { | |
for i := uint32(0); i < validSize; i++ { | |
point = buffer[i] | |
validBuffer = append(validBuffer, point) | |
} | |
} | |
sv.ProcessSpeech(validBuffer) | |
} | |
//fmt.Printf("validBuffer size: %d\n", len(validBuffer)) | |
//fmt.Printf("validSize: %d\n", validSize) | |
//fmt.Printf("last unfinishedFrameSize: %d\n", sv.unfinishedFrameSize) | |
sv.unfinishedFrameSize = (bufferSize + sv.unfinishedFrameSize) - validSize | |
begin := bufferSize - sv.unfinishedFrameSize - 1 | |
for i := uint32(0); i < sv.unfinishedFrameSize; i++ { | |
point = buffer[begin + i] | |
unfinishedFrame = append(unfinishedFrame, point) | |
} | |
sv.unfinishedFrame = unfinishedFrame | |
fmt.Println("do StreamingVad ProcessSpeechByChunk...") | |
return nil | |
} | |
//ProcessSpeech 根据上一帧的语音标签和当前帧的语音标签来判断VAD状态. | |
func (sv *StreamingVadType) ProcessSpeech(buffer []int16) { | |
bufferLength := uint32(len(buffer)) | |
if bufferLength % sv.frameLength != 0 { | |
panic(fmt.Sprintf("bufferLength (%d) should be a multiple of B frameLength (%d)", bufferLength, sv.frameLength)) | |
} | |
var frameBuffer []int16 | |
for begin := uint32(0); begin + sv.frameLength <= bufferLength; { | |
frameBuffer = buffer[begin: begin + sv.frameLength] | |
sv.frameDecision.SpeechFrameProcess(sv.speechFrameGlobalTimeInMilliSecond, frameBuffer) | |
begin += sv.frameLength | |
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechStart && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPresent { | |
if sv.thisDetectedState == VadFlagNoSpeech { | |
//start | |
var prepareTime uint32 = 0 | |
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond { | |
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond | |
} | |
if prepareTime < sv.lastVadEndTimeInMilliSecond { | |
prepareTime = sv.lastVadEndTimeInMilliSecond | |
} | |
vadEventMarker := VadEventMarkerType{ | |
VadFlag: VadFlagPrepare, | |
Time: prepareTime, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
sv.thisDetectedState = VadFlagSpeaking | |
vadEventMarker = VadEventMarkerType{ | |
VadFlag: VadFlagSpeaking, | |
Time: sv.speechFrameGlobalTimeInMilliSecond, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
// | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
//sv.speechDetectedEndTimeIsValidPossible = false | |
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) | |
sv.speechDetectedStartTimeIsValid = true | |
} else if sv.thisDetectedState == VadFlagSpeaking && sv.speechDetectedEndTimeIsValid && | |
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond > sv.minDurationOfSpeechToAddCommaInMilliSecond && | |
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond < sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond { | |
//pause | |
vadEventMarker := VadEventMarkerType{ | |
VadFlag: VadFlagPause, | |
Time: sv.speechDetectedEndTimeInMilliSecond, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
sv.thisDetectedState = VadFlagSpeaking | |
vadEventMarker = VadEventMarkerType{ | |
VadFlag: VadFlagSpeaking, | |
Time: sv.speechFrameGlobalTimeInMilliSecond, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
// | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
//sv.speechDetectedEndTimeIsValidPossible = false | |
} else if sv.thisDetectedState == VadFlagSpeaking && | |
sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond <= sv.minDurationOfSpeechToAddCommaInMilliSecond { | |
// | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
//sv.speechDetectedEndTimeIsValidPossible = false | |
} else {} | |
} | |
//end | |
if sv.frameDecision.lastFrameFlag == FrameFlagSpeechEnd && sv.frameDecision.thisFrameFlag == FrameFlagSpeechPre { | |
sv.speechDetectedEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond | |
sv.speechDetectedEndTimeIsValid = true | |
//sv.speechDetectedEndTimeIsValidPossible = true | |
} | |
//只在开始一定时间后, 才能检测到 Vad 结束. | |
if sv.speechFrameGlobalTimeInMilliSecond > sv.startRejectSpeechTimeInMilliSecond { | |
if sv.speechDetectedEndTimeIsValid { | |
var endOfSpeechRequiredSilenceTime uint32 | |
if sv.minDurationOfLongSpeechInMilliSecond > 0 && | |
sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond > 0 && | |
(sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond) > sv.minDurationOfLongSpeechInMilliSecond { | |
endOfSpeechRequiredSilenceTime = sv.endOfLongSpeechRequiredSilenceTimeInMilliSecond | |
} else { | |
endOfSpeechRequiredSilenceTime = sv.endOfNormalSpeechRequiredSilenceTimeInMilliSecond | |
} | |
if (sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedEndTimeInMilliSecond) >= endOfSpeechRequiredSilenceTime { | |
endTime := sv.speechDetectedEndTimeInMilliSecond + sv.nonSpeechPadInInMilliSecond | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
sv.speechDetectedStartTimeInMilliSecond = uint32(0) | |
sv.speechDetectedStartTimeIsValid = false | |
sv.thisDetectedState = VadFlagNoSpeech | |
sv.lastVadEndTimeInMilliSecond = endTime | |
vadEventMarker := VadEventMarkerType{ | |
VadFlag: VadFlagNoSpeech, | |
Time: endTime, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
} | |
} | |
} | |
//当语音时长超过时, 强制切断 | |
if sv.speechDetectedStartTimeIsValid && sv.speechFrameGlobalTimeInMilliSecond - sv.speechDetectedStartTimeInMilliSecond > sv.timeoutInMilliSecond { | |
//end | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
sv.speechDetectedStartTimeInMilliSecond = uint32(0) | |
sv.speechDetectedStartTimeIsValid = false | |
sv.thisDetectedState = VadFlagNoSpeech | |
sv.lastVadEndTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond | |
vadEventMarker := VadEventMarkerType{ | |
VadFlag: VadFlagNoSpeech, | |
Time: sv.speechFrameGlobalTimeInMilliSecond, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
//start | |
var prepareTime uint32 = 0 | |
if sv.speechFrameGlobalTimeInMilliSecond > sv.prepareDurationInMilliSecond { | |
prepareTime = sv.speechFrameGlobalTimeInMilliSecond - sv.prepareDurationInMilliSecond | |
} | |
if prepareTime < sv.lastVadEndTimeInMilliSecond { | |
prepareTime = sv.lastVadEndTimeInMilliSecond | |
} | |
vadEventMarker = VadEventMarkerType{ | |
VadFlag: VadFlagPrepare, | |
Time: prepareTime, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
sv.thisDetectedState = VadFlagSpeaking | |
vadEventMarker = VadEventMarkerType{ | |
VadFlag: VadFlagSpeaking, | |
Time: sv.speechFrameGlobalTimeInMilliSecond, | |
} | |
sv.VadEventMarkerDeque = append(sv.VadEventMarkerDeque, vadEventMarker) | |
// | |
sv.speechDetectedEndTimeInMilliSecond = uint32(0) | |
sv.speechDetectedEndTimeIsValid = false | |
//sv.speechDetectedEndTimeIsValidPossible = false | |
sv.speechDetectedStartTimeInMilliSecond = sv.speechFrameGlobalTimeInMilliSecond - uint32(sv.frameDecision.speechStartWindowLengthInSecond * 1e3) | |
sv.speechDetectedStartTimeIsValid = true | |
} | |
//loop | |
sv.frameDecision.lastFrameFlag = sv.frameDecision.thisFrameFlag | |
sv.speechFrameGlobalTimeInMilliSecond += uint32(float32(sv.frameLength) / float32(sv.sampleRate) * 1e3) | |
} | |
} | |