File size: 2,400 Bytes
5e1b738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package main

// This is a wrapper to satisfy the gRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
	"errors"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
	"github.com/go-audio/wav"
	"github.com/mudler/LocalAI/pkg/grpc/base"
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/mudler/LocalAI/pkg/utils"
)

// Whisper is the backend type in this file that loads a whisper model (Load)
// and uses it to transcribe audio (AudioTranscription).
type Whisper struct {
	// SingleThread is embedded from the LocalAI grpc/base package.
	// NOTE(review): presumably this marks the backend as handling one request
	// at a time — confirm against the base package.
	base.SingleThread
	// whisper is the model assigned by Load; AudioTranscription creates its
	// processing contexts from it.
	whisper whisper.Model
}

// Load initialises the backend with the whisper model referenced by
// opts.ModelFile, which is handed to whisper.New unchanged.
//
// The model is stored on the receiver only when loading succeeds, so a failed
// call leaves any previously loaded model in place instead of replacing it
// with an unusable value. The error from whisper.New is returned as-is.
func (sd *Whisper) Load(opts *pb.ModelOptions) error {
	w, err := whisper.New(opts.ModelFile)
	if err != nil {
		return err
	}
	sd.whisper = w
	return nil
}

// AudioTranscription transcribes the audio file referenced by opts.Dst and
// returns the recognised segments together with their concatenated text.
//
// The input is first converted with utils.AudioToWav into a WAV file inside a
// temporary directory, which is removed again when the method returns. The
// decoded samples are then processed by a new context obtained from the
// loaded model and configured from opts (Threads, Language, Translate).
//
// An error is returned when any preparation step, the processing itself, or
// the retrieval of a segment fails. Reaching the end of the segment list
// (io.EOF) is not an error.
func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
	dir, err := os.MkdirTemp("", "whisper")
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	// Deferred calls run last-in-first-out, so the file handle opened below is
	// closed before the directory is removed.
	defer os.RemoveAll(dir)

	convertedPath := filepath.Join(dir, "converted.wav")
	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
		return pb.TranscriptResult{}, err
	}

	// Open samples
	fh, err := os.Open(convertedPath)
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	defer fh.Close()

	// Read samples
	buf, err := wav.NewDecoder(fh).FullPCMBuffer()
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	data := buf.AsFloat32Buffer().Data

	// Process samples. The variable is not called "context" to avoid shadowing
	// the name of the standard library package.
	wctx, err := sd.whisper.NewContext()
	if err != nil {
		return pb.TranscriptResult{}, err
	}

	wctx.SetThreads(uint(opts.Threads))

	// Fall back to automatic language detection when the caller did not
	// specify a language.
	lang := opts.Language
	if lang == "" {
		lang = "auto"
	}
	wctx.SetLanguage(lang)

	if opts.Translate {
		wctx.SetTranslate(true)
	}

	if err := wctx.Process(data, nil, nil); err != nil {
		return pb.TranscriptResult{}, err
	}

	segments := []*pb.TranscriptSegment{}
	var text strings.Builder
	for {
		s, err := wctx.NextSegment()
		// Per the whisper.cpp Go bindings, io.EOF marks the end of the segment
		// list; any other error is a real failure and must not be mistaken
		// for a complete transcript.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return pb.TranscriptResult{}, err
		}

		tokens := make([]int32, 0, len(s.Tokens))
		for _, t := range s.Tokens {
			tokens = append(tokens, int32(t.Id))
		}

		segments = append(segments, &pb.TranscriptSegment{
			Id:     int32(s.Num),
			Text:   s.Text,
			Start:  int64(s.Start),
			End:    int64(s.End),
			Tokens: tokens,
		})

		text.WriteString(s.Text)
	}

	return pb.TranscriptResult{
		Segments: segments,
		Text:     text.String(),
	}, nil
}