nsarrazin HF Staff antoniora committed on
Commit
6e18e46
·
unverified ·
1 Parent(s): 5ce3b1d

Add metrics for models, tools, websearch (#1186)

Browse files

* Add custom metrics for messages and conversations

* lint

* Add metrics for
- model health
- tools
- websearch

* Add time window & age buckets to summaries

* Increase max age for tool use duration

---------

Co-authored-by: antoniora <[email protected]>

src/lib/server/metrics.ts CHANGED
@@ -1,10 +1,39 @@
1
- import { collectDefaultMetrics, Registry } from "prom-client";
2
  import express from "express";
3
  import { logger } from "$lib/server/logger";
4
  import { env } from "$env/dynamic/private";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  export class MetricsServer {
7
  private static instance: MetricsServer;
 
8
 
9
  private constructor() {
10
  const app = express();
@@ -17,6 +46,114 @@ export class MetricsServer {
17
  const register = new Registry();
18
  collectDefaultMetrics({ register });
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  app.get("/metrics", (req, res) => {
21
  register.metrics().then((metrics) => {
22
  res.set("Content-Type", "text/plain");
@@ -40,4 +177,8 @@ export class MetricsServer {
40
 
41
  return MetricsServer.instance;
42
  }
 
 
 
 
43
  }
 
1
+ import { collectDefaultMetrics, Registry, Counter, Summary } from "prom-client";
2
  import express from "express";
3
  import { logger } from "$lib/server/logger";
4
  import { env } from "$env/dynamic/private";
5
+ import type { Model } from "$lib/types/Model";
6
+ import type { Tool } from "$lib/types/Tool";
7
+
8
+ interface Metrics {
9
+ model: {
10
+ conversationsTotal: Counter<Model["id"]>;
11
+ messagesTotal: Counter<Model["id"]>;
12
+ tokenCountTotal: Counter<Model["id"]>;
13
+ timePerOutputToken: Summary<Model["id"]>;
14
+ timeToFirstToken: Summary<Model["id"]>;
15
+ latency: Summary<Model["id"]>;
16
+ };
17
+
18
+ webSearch: {
19
+ requestCount: Counter;
20
+ pageFetchCount: Counter;
21
+ pageFetchCountError: Counter;
22
+ pageFetchDuration: Summary;
23
+ embeddingDuration: Summary;
24
+ };
25
+
26
+ tool: {
27
+ toolUseCount: Counter<Tool["name"]>;
28
+ toolUseCountError: Counter<Tool["name"]>;
29
+ toolUseDuration: Summary<Tool["name"]>;
30
+ timeToChooseTools: Summary;
31
+ };
32
+ }
33
 
34
  export class MetricsServer {
35
  private static instance: MetricsServer;
36
+ private metrics: Metrics;
37
 
38
  private constructor() {
39
  const app = express();
 
46
  const register = new Registry();
47
  collectDefaultMetrics({ register });
48
 
49
+ this.metrics = {
50
+ model: {
51
+ conversationsTotal: new Counter({
52
+ name: "model_conversations_total",
53
+ help: "Total number of conversations",
54
+ labelNames: ["model"],
55
+ registers: [register],
56
+ }),
57
+ messagesTotal: new Counter({
58
+ name: "model_messages_total",
59
+ help: "Total number of messages",
60
+ labelNames: ["model"],
61
+ registers: [register],
62
+ }),
63
+ tokenCountTotal: new Counter({
64
+ name: "model_token_count_total",
65
+ help: "Total number of tokens",
66
+ labelNames: ["model"],
67
+ registers: [register],
68
+ }),
69
+ timePerOutputToken: new Summary({
70
+ name: "model_time_per_output_token_ms",
71
+ help: "Time per output token in ms",
72
+ labelNames: ["model"],
73
+ registers: [register],
74
+ maxAgeSeconds: 5 * 60,
75
+ ageBuckets: 5,
76
+ }),
77
+ timeToFirstToken: new Summary({
78
+ name: "model_time_to_first_token_ms",
79
+ help: "Time to first token",
80
+ labelNames: ["model"],
81
+ registers: [register],
82
+ maxAgeSeconds: 5 * 60,
83
+ ageBuckets: 5,
84
+ }),
85
+ latency: new Summary({
86
+ name: "model_latency_ms",
87
+ help: "Total latency until end of answer",
88
+ labelNames: ["model"],
89
+ registers: [register],
90
+ maxAgeSeconds: 5 * 60,
91
+ ageBuckets: 5,
92
+ }),
93
+ },
94
+ webSearch: {
95
+ requestCount: new Counter({
96
+ name: "web_search_request_count",
97
+ help: "Total number of web search requests",
98
+ registers: [register],
99
+ }),
100
+ pageFetchCount: new Counter({
101
+ name: "web_search_page_fetch_count",
102
+ help: "Total number of web search page fetches",
103
+ registers: [register],
104
+ }),
105
+ pageFetchCountError: new Counter({
106
+ name: "web_search_page_fetch_count_error",
107
+ help: "Total number of web search page fetch errors",
108
+ registers: [register],
109
+ }),
110
+ pageFetchDuration: new Summary({
111
+ name: "web_search_page_fetch_duration_ms",
112
+ help: "Web search page fetch duration",
113
+ registers: [register],
114
+ maxAgeSeconds: 5 * 60,
115
+ ageBuckets: 5,
116
+ }),
117
+ embeddingDuration: new Summary({
118
+ name: "web_search_embedding_duration_ms",
119
+ help: "Web search embedding duration",
120
+ registers: [register],
121
+ maxAgeSeconds: 5 * 60,
122
+ ageBuckets: 5,
123
+ }),
124
+ },
125
+ tool: {
126
+ toolUseCount: new Counter({
127
+ name: "tool_use_count",
128
+ help: "Total number of tool uses",
129
+ labelNames: ["tool"],
130
+ registers: [register],
131
+ }),
132
+ toolUseCountError: new Counter({
133
+ name: "tool_use_count_error",
134
+ help: "Total number of tool use errors",
135
+ labelNames: ["tool"],
136
+ registers: [register],
137
+ }),
138
+ toolUseDuration: new Summary({
139
+ name: "tool_use_duration_ms",
140
+ help: "Tool use duration",
141
+ labelNames: ["tool"],
142
+ registers: [register],
143
+ maxAgeSeconds: 30 * 60, // longer duration since we use this to give feedback to the user
144
+ ageBuckets: 5,
145
+ }),
146
+ timeToChooseTools: new Summary({
147
+ name: "time_to_choose_tools_ms",
148
+ help: "Time to choose tools",
149
+ labelNames: ["model"],
150
+ registers: [register],
151
+ maxAgeSeconds: 5 * 60,
152
+ ageBuckets: 5,
153
+ }),
154
+ },
155
+ };
156
+
157
  app.get("/metrics", (req, res) => {
158
  register.metrics().then((metrics) => {
159
  res.set("Content-Type", "text/plain");
 
177
 
178
  return MetricsServer.instance;
179
  }
180
+
181
+ public static getMetrics(): Metrics {
182
+ return MetricsServer.getInstance().metrics;
183
+ }
184
  }
src/lib/server/textGeneration/tools.ts CHANGED
@@ -18,6 +18,7 @@ import { logger } from "../logger";
18
  import { toolHasName } from "../tools/utils";
19
  import type { MessageFile } from "$lib/types/Message";
20
  import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
 
21
 
22
  function makeFilesPrompt(files: MessageFile[], fileMessageIndex: number): string {
23
  if (files.length === 0) {
@@ -62,6 +63,9 @@ async function* runTool(
62
  // Special case for directly_answer tool where we ignore
63
  if (toolHasName(directlyAnswer.name, tool)) return;
64
 
 
 
 
65
  yield {
66
  type: MessageUpdateType.Tool,
67
  subtype: MessageToolUpdateType.Call,
@@ -92,8 +96,14 @@ async function* runTool(
92
  };
93
  }
94
 
 
 
 
 
 
95
  return { ...toolResult, call } as ToolResult;
96
  } catch (e) {
 
97
  yield {
98
  type: MessageUpdateType.Tool,
99
  subtype: MessageToolUpdateType.Error,
@@ -102,6 +112,7 @@ async function* runTool(
102
  };
103
  }
104
  } catch (cause) {
 
105
  console.error(Error(`Failed while running tool ${call.name}`), { cause });
106
  return {
107
  call,
@@ -126,6 +137,8 @@ export async function* runTools(
126
  };
127
  });
128
 
 
 
129
  // do the function calling bits here
130
  for await (const output of await endpoint({
131
  messages: messagesWithFilesPrompt,
@@ -163,6 +176,11 @@ export async function* runTools(
163
  }
164
  }
165
 
 
 
 
 
 
166
  const toolContext: BackendToolContext = { conv, messages, preprompt, assistant };
167
  const toolResults: (ToolResult | undefined)[] = yield* mergeAsyncGenerators(
168
  calls.map((call) => runTool(toolContext, tools, call))
 
18
  import { toolHasName } from "../tools/utils";
19
  import type { MessageFile } from "$lib/types/Message";
20
  import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
21
+ import { MetricsServer } from "../metrics";
22
 
23
  function makeFilesPrompt(files: MessageFile[], fileMessageIndex: number): string {
24
  if (files.length === 0) {
 
63
  // Special case for directly_answer tool where we ignore
64
  if (toolHasName(directlyAnswer.name, tool)) return;
65
 
66
+ const startTime = Date.now();
67
+ MetricsServer.getMetrics().tool.toolUseCount.inc({ tool: call.name });
68
+
69
  yield {
70
  type: MessageUpdateType.Tool,
71
  subtype: MessageToolUpdateType.Call,
 
96
  };
97
  }
98
 
99
+ MetricsServer.getMetrics().tool.toolUseDuration.observe(
100
+ { tool: call.name },
101
+ Date.now() - startTime
102
+ );
103
+
104
  return { ...toolResult, call } as ToolResult;
105
  } catch (e) {
106
+ MetricsServer.getMetrics().tool.toolUseCountError.inc({ tool: call.name });
107
  yield {
108
  type: MessageUpdateType.Tool,
109
  subtype: MessageToolUpdateType.Error,
 
112
  };
113
  }
114
  } catch (cause) {
115
+ MetricsServer.getMetrics().tool.toolUseCountError.inc({ tool: call.name });
116
  console.error(Error(`Failed while running tool ${call.name}`), { cause });
117
  return {
118
  call,
 
137
  };
138
  });
139
 
140
+ const pickToolStartTime = Date.now();
141
+
142
  // do the function calling bits here
143
  for await (const output of await endpoint({
144
  messages: messagesWithFilesPrompt,
 
176
  }
177
  }
178
 
179
+ MetricsServer.getMetrics().tool.timeToChooseTools.observe(
180
+ { model: conv.model },
181
+ Date.now() - pickToolStartTime
182
+ );
183
+
184
  const toolContext: BackendToolContext = { conv, messages, preprompt, assistant };
185
  const toolResults: (ToolResult | undefined)[] = yield* mergeAsyncGenerators(
186
  calls.map((call) => runTool(toolContext, tools, call))
src/lib/server/websearch/embed/embed.ts CHANGED
@@ -1,3 +1,4 @@
 
1
  import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
2
  import type { EmbeddingBackendModel } from "../../embeddingModels";
3
  import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
@@ -14,6 +15,8 @@ export async function findContextSources(
14
  prompt: string,
15
  embeddingModel: EmbeddingBackendModel
16
  ) {
 
 
17
  const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
18
  const markdownElems = sourcesMarkdownElems.flat();
19
 
@@ -76,5 +79,7 @@ export async function findContextSources(
76
  })
77
  .filter((contextSource) => contextSource.context.length > 0);
78
 
 
 
79
  return contextSources;
80
  }
 
1
+ import { MetricsServer } from "$lib/server/metrics";
2
  import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
3
  import type { EmbeddingBackendModel } from "../../embeddingModels";
4
  import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
 
15
  prompt: string,
16
  embeddingModel: EmbeddingBackendModel
17
  ) {
18
+ const startTime = Date.now();
19
+
20
  const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
21
  const markdownElems = sourcesMarkdownElems.flat();
22
 
 
79
  })
80
  .filter((contextSource) => contextSource.context.length > 0);
81
 
82
+ MetricsServer.getMetrics().webSearch.embeddingDuration.observe(Date.now() - startTime);
83
+
84
  return contextSources;
85
  }
src/lib/server/websearch/runWebSearch.ts CHANGED
@@ -17,6 +17,7 @@ import {
17
  makeSourcesUpdate,
18
  } from "./update";
19
  import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
 
20
 
21
  const MAX_N_PAGES_TO_SCRAPE = 8 as const;
22
  const MAX_N_PAGES_TO_EMBED = 5 as const;
@@ -31,6 +32,8 @@ export async function* runWebSearch(
31
  const createdAt = new Date();
32
  const updatedAt = new Date();
33
 
 
 
34
  try {
35
  const embeddingModel =
36
  embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
 
17
  makeSourcesUpdate,
18
  } from "./update";
19
  import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
20
+ import { MetricsServer } from "../metrics";
21
 
22
  const MAX_N_PAGES_TO_SCRAPE = 8 as const;
23
  const MAX_N_PAGES_TO_EMBED = 5 as const;
 
32
  const createdAt = new Date();
33
  const updatedAt = new Date();
34
 
35
+ MetricsServer.getMetrics().webSearch.requestCount.inc();
36
+
37
  try {
38
  const embeddingModel =
39
  embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
src/lib/server/websearch/scrape/scrape.ts CHANGED
@@ -6,16 +6,24 @@ import { spatialParser } from "./parser";
6
  import { htmlToMarkdownTree } from "../markdown/tree";
7
  import { timeout } from "$lib/utils/timeout";
8
  import { makeErrorUpdate, makeGeneralUpdate } from "../update";
 
9
 
10
  export const scrape = (maxCharsPerElem: number) =>
11
  async function* (
12
  source: WebSearchSource
13
  ): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
14
  try {
 
 
 
15
  const page = await scrapeUrl(source.link, maxCharsPerElem);
 
 
 
16
  yield makeGeneralUpdate({ message: "Browsing webpage", args: [source.link] });
17
  return { ...source, page };
18
  } catch (e) {
 
19
  const message = e instanceof Error ? e.message : String(e);
20
  yield makeErrorUpdate({ message: "Failed to parse webpage", args: [message, source.link] });
21
  }
 
6
  import { htmlToMarkdownTree } from "../markdown/tree";
7
  import { timeout } from "$lib/utils/timeout";
8
  import { makeErrorUpdate, makeGeneralUpdate } from "../update";
9
+ import { MetricsServer } from "$lib/server/metrics";
10
 
11
  export const scrape = (maxCharsPerElem: number) =>
12
  async function* (
13
  source: WebSearchSource
14
  ): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
15
  try {
16
+ const startTime = Date.now();
17
+ MetricsServer.getMetrics().webSearch.pageFetchCount.inc();
18
+
19
  const page = await scrapeUrl(source.link, maxCharsPerElem);
20
+
21
+ MetricsServer.getMetrics().webSearch.pageFetchDuration.observe(Date.now() - startTime);
22
+
23
  yield makeGeneralUpdate({ message: "Browsing webpage", args: [source.link] });
24
  return { ...source, page };
25
  } catch (e) {
26
+ MetricsServer.getMetrics().webSearch.pageFetchCountError.inc();
27
  const message = e instanceof Error ? e.message : String(e);
28
  yield makeErrorUpdate({ message: "Failed to parse webpage", args: [message, source.link] });
29
  }
src/routes/conversation/+server.ts CHANGED
@@ -10,6 +10,7 @@ import { defaultEmbeddingModel } from "$lib/server/embeddingModels";
10
  import { v4 } from "uuid";
11
  import { authCondition } from "$lib/server/auth";
12
  import { usageLimits } from "$lib/server/usageLimits";
 
13
 
14
  export const POST: RequestHandler = async ({ locals, request }) => {
15
  const body = await request.text();
@@ -115,6 +116,8 @@ export const POST: RequestHandler = async ({ locals, request }) => {
115
  ...(values.fromShare ? { meta: { fromShareId: values.fromShare } } : {}),
116
  });
117
 
 
 
118
  return new Response(
119
  JSON.stringify({
120
  conversationId: res.insertedId.toString(),
 
10
  import { v4 } from "uuid";
11
  import { authCondition } from "$lib/server/auth";
12
  import { usageLimits } from "$lib/server/usageLimits";
13
+ import { MetricsServer } from "$lib/server/metrics";
14
 
15
  export const POST: RequestHandler = async ({ locals, request }) => {
16
  const body = await request.text();
 
116
  ...(values.fromShare ? { meta: { fromShareId: values.fromShare } } : {}),
117
  });
118
 
119
+ MetricsServer.getMetrics().model.conversationsTotal.inc({ model: values.model });
120
+
121
  return new Response(
122
  JSON.stringify({
123
  conversationId: res.insertedId.toString(),
src/routes/conversation/[id]/+server.ts CHANGED
@@ -21,6 +21,7 @@ import { buildSubtree } from "$lib/utils/tree/buildSubtree.js";
21
  import { addChildren } from "$lib/utils/tree/addChildren.js";
22
  import { addSibling } from "$lib/utils/tree/addSibling.js";
23
  import { usageLimits } from "$lib/server/usageLimits";
 
24
  import { textGeneration } from "$lib/server/textGeneration";
25
  import type { TextGenerationContext } from "$lib/server/textGeneration/types";
26
 
@@ -293,6 +294,8 @@ export async function POST({ request, locals, params, getClientAddress }) {
293
 
294
  let doneStreaming = false;
295
 
 
 
296
  // we now build the stream
297
  const stream = new ReadableStream({
298
  async start(controller) {
@@ -306,6 +309,25 @@ export async function POST({ request, locals, params, getClientAddress }) {
306
  if (event.type === MessageUpdateType.Stream) {
307
  if (event.token === "") return;
308
  messageToWriteTo.content += event.token;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  }
310
 
311
  // Set the title
@@ -321,6 +343,12 @@ export async function POST({ request, locals, params, getClientAddress }) {
321
  else if (event.type === MessageUpdateType.FinalAnswer) {
322
  messageToWriteTo.interrupted = event.interrupted;
323
  messageToWriteTo.content = initialMessageContent + event.text;
 
 
 
 
 
 
324
  }
325
 
326
  // Add file
@@ -428,6 +456,8 @@ export async function POST({ request, locals, params, getClientAddress }) {
428
  );
429
  }
430
 
 
 
431
  // Todo: maybe we should wait for the message to be saved before ending the response - in case of errors
432
  return new Response(stream, {
433
  headers: {
 
21
  import { addChildren } from "$lib/utils/tree/addChildren.js";
22
  import { addSibling } from "$lib/utils/tree/addSibling.js";
23
  import { usageLimits } from "$lib/server/usageLimits";
24
+ import { MetricsServer } from "$lib/server/metrics";
25
  import { textGeneration } from "$lib/server/textGeneration";
26
  import type { TextGenerationContext } from "$lib/server/textGeneration/types";
27
 
 
294
 
295
  let doneStreaming = false;
296
 
297
+ let lastTokenTimestamp: undefined | Date = undefined;
298
+
299
  // we now build the stream
300
  const stream = new ReadableStream({
301
  async start(controller) {
 
309
  if (event.type === MessageUpdateType.Stream) {
310
  if (event.token === "") return;
311
  messageToWriteTo.content += event.token;
312
+
313
+ // add to token total
314
+ MetricsServer.getMetrics().model.tokenCountTotal.inc({ model: model?.id });
315
+
316
+ // if this is the first token, add to time to first token
317
+ if (!lastTokenTimestamp) {
318
+ MetricsServer.getMetrics().model.timeToFirstToken.observe(
319
+ { model: model?.id },
320
+ Date.now() - promptedAt.getTime()
321
+ );
322
+ lastTokenTimestamp = new Date();
323
+ }
324
+
325
+ // add to time per token
326
+ MetricsServer.getMetrics().model.timePerOutputToken.observe(
327
+ { model: model?.id },
328
+ Date.now() - (lastTokenTimestamp ?? promptedAt).getTime()
329
+ );
330
+ lastTokenTimestamp = new Date();
331
  }
332
 
333
  // Set the title
 
343
  else if (event.type === MessageUpdateType.FinalAnswer) {
344
  messageToWriteTo.interrupted = event.interrupted;
345
  messageToWriteTo.content = initialMessageContent + event.text;
346
+
347
+ // add to latency
348
+ MetricsServer.getMetrics().model.latency.observe(
349
+ { model: model?.id },
350
+ Date.now() - promptedAt.getTime()
351
+ );
352
  }
353
 
354
  // Add file
 
456
  );
457
  }
458
 
459
+ const metrics = MetricsServer.getMetrics();
460
+ metrics.model.messagesTotal.inc({ model: model?.id });
461
  // Todo: maybe we should wait for the message to be saved before ending the response - in case of errors
462
  return new Response(stream, {
463
  headers: {