nsarrazin HF Staff committed on
Commit
7d6fc19
·
unverified ·
1 Parent(s): b3ea80f

feat(front): use webworker for markdown parsing (#1733)

Browse files

* feat(front): move markdown parsing to web worker

* feat: use webworker for markdown parsing

* feat(markdown-worker): implement message buffering and processing

* fix(markdown): import KaTeX CSS locally instead of via CDN

* fix(markdown): make sure messages are serializable

* feat(markdown): make sure links have target blank

* refactor(markdown): improve HTML escaping function formatting

src/lib/components/chat/MarkdownRenderer.svelte CHANGED
@@ -1,199 +1,71 @@
1
  <script lang="ts">
2
  import type { WebSearchSource } from "$lib/types/WebSearch";
3
- import katex from "katex";
4
- import "katex/dist/contrib/mhchem.mjs";
5
- import DOMPurify from "isomorphic-dompurify";
6
- import { Marked } from "marked";
7
- import type { Tokens, TokenizerExtension, RendererExtension } from "marked";
8
  import CodeBlock from "../CodeBlock.svelte";
 
 
 
 
9
 
10
  interface Props {
11
  content: string;
12
  sources?: WebSearchSource[];
13
  }
14
 
15
- let { content, sources = [] }: Props = $props();
16
-
17
- interface katexBlockToken extends Tokens.Generic {
18
- type: "katexBlock";
19
- raw: string;
20
- text: string;
21
- displayMode: true;
22
- }
23
-
24
- interface katexInlineToken extends Tokens.Generic {
25
- type: "katexInline";
26
- raw: string;
27
- text: string;
28
- displayMode: false;
29
- }
30
-
31
- export const katexBlockExtension: TokenizerExtension & RendererExtension = {
32
- name: "katexBlock",
33
- level: "block",
34
-
35
- start(src: string): number | undefined {
36
- const match = src.match(/(\${2}|\\\[)/);
37
- return match ? match.index : -1;
38
- },
39
-
40
- tokenizer(src: string): katexBlockToken | undefined {
41
- // 1) $$ ... $$
42
- const rule1 = /^\${2}([\s\S]+?)\${2}/;
43
- const match1 = rule1.exec(src);
44
- if (match1) {
45
- const token: katexBlockToken = {
46
- type: "katexBlock",
47
- raw: match1[0],
48
- text: match1[1].trim(),
49
- displayMode: true,
50
- };
51
- return token;
52
- }
53
-
54
- // 2) \[ ... \]
55
- const rule2 = /^\\\[([\s\S]+?)\\\]/;
56
- const match2 = rule2.exec(src);
57
- if (match2) {
58
- const token: katexBlockToken = {
59
- type: "katexBlock",
60
- raw: match2[0],
61
- text: match2[1].trim(),
62
- displayMode: true,
63
- };
64
- return token;
65
- }
66
-
67
- return undefined;
68
- },
69
-
70
- renderer(token) {
71
- if (token.type === "katexBlock") {
72
- return katex.renderToString(token.text, {
73
- throwOnError: false,
74
- displayMode: token.displayMode,
75
- });
76
- }
77
-
78
- return undefined;
79
- },
80
- };
81
 
82
- const katexInlineExtension: TokenizerExtension & RendererExtension = {
83
- name: "katexInline",
84
- level: "inline",
85
-
86
- start(src: string): number | undefined {
87
- const match = src.match(/(\$|\\\()/);
88
- return match ? match.index : -1;
89
- },
90
-
91
- tokenizer(src: string): katexInlineToken | undefined {
92
- // 1) $...$
93
- const rule1 = /^\$([^$]+?)\$/;
94
- const match1 = rule1.exec(src);
95
- if (match1) {
96
- const token: katexInlineToken = {
97
- type: "katexInline",
98
- raw: match1[0],
99
- text: match1[1].trim(),
100
- displayMode: false,
101
- };
102
- return token;
103
- }
104
-
105
- // 2) \(...\)
106
- const rule2 = /^\\\(([\s\S]+?)\\\)/;
107
- const match2 = rule2.exec(src);
108
- if (match2) {
109
- const token: katexInlineToken = {
110
- type: "katexInline",
111
- raw: match2[0],
112
- text: match2[1].trim(),
113
- displayMode: false,
114
- };
115
- return token;
116
- }
117
-
118
- return undefined;
119
- },
120
-
121
- renderer(token) {
122
- if (token.type === "katexInline") {
123
- return katex.renderToString(token.text, {
124
- throwOnError: false,
125
- displayMode: token.displayMode,
126
- });
127
- }
128
- return undefined;
129
- },
130
- };
131
-
132
- function escapeHTML(content: string) {
133
- return content.replace(
134
- /[<>&"']/g,
135
- (x) =>
136
- ({
137
- "<": "&lt;",
138
- ">": "&gt;",
139
- "&": "&amp;",
140
- "'": "&#39;",
141
- '"': "&quot;",
142
- })[x] || x
143
- );
144
- }
145
 
146
- function addInlineCitations(md: string, webSearchSources: WebSearchSource[] = []): string {
147
- const linkStyle =
148
- "color: rgb(59, 130, 246); text-decoration: none; hover:text-decoration: underline;";
149
 
150
- return md.replace(/\[(\d+)\]/g, (match: string) => {
151
- const indices: number[] = (match.match(/\d+/g) || []).map(Number);
152
- const links: string = indices
153
- .map((index: number) => {
154
- if (index === 0) return false;
155
- const source = webSearchSources[index - 1];
156
- if (source) {
157
- return `<a href="${source.link}" target="_blank" rel="noreferrer" style="${linkStyle}">${index}</a>`;
158
  }
159
- return "";
160
- })
161
- .filter(Boolean)
162
- .join(", ");
163
-
164
- return links ? ` <sup>${links}</sup>` : match;
165
- });
 
 
166
  }
167
 
168
- const marked = new Marked({
169
- hooks: {
170
- postprocess: (html) => DOMPurify.sanitize(addInlineCitations(html, sources)),
171
- },
172
- extensions: [katexBlockExtension, katexInlineExtension],
173
- renderer: {
174
- link: (href, title, text) =>
175
- `<a href="${href?.replace(/>$/, "")}" target="_blank" rel="noreferrer">${text}</a>`,
176
- html: (html) => escapeHTML(html),
177
- },
178
- gfm: true,
179
- breaks: true,
180
  });
181
 
182
  DOMPurify.addHook("afterSanitizeAttributes", (node) => {
183
  if (node.tagName === "A") {
184
- node.setAttribute("rel", "noreferrer");
185
  node.setAttribute("target", "_blank");
 
186
  }
187
  });
188
  </script>
189
 
190
- {#each marked.lexer(content) as token}
191
- {#if token.type === "code"}
192
- <CodeBlock lang={token.lang} code={token.text} />
193
- {:else}
194
- {#await marked.parse(token.raw) then parsed}
195
  <!-- eslint-disable-next-line svelte/no-at-html-tags -->
196
- {@html parsed}
197
  {/await}
 
 
198
  {/if}
199
  {/each}
 
1
  <script lang="ts">
2
  import type { WebSearchSource } from "$lib/types/WebSearch";
3
+ import { processTokens, processTokensSync, type Token } from "$lib/utils/marked";
4
+ import MarkdownWorker from "$lib/workers/markdownWorker?worker";
 
 
 
5
  import CodeBlock from "../CodeBlock.svelte";
6
+ import type { IncomingMessage, OutgoingMessage } from "$lib/workers/markdownWorker";
7
+ import { browser } from "$app/environment";
8
+
9
+ import DOMPurify from "isomorphic-dompurify";
10
 
11
  interface Props {
12
  content: string;
13
  sources?: WebSearchSource[];
14
  }
15
 
16
+ const worker = browser && window.Worker ? new MarkdownWorker() : null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ let { content, sources = [] }: Props = $props();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ let tokens: Token[] = $state(processTokensSync(content, sources));
 
 
21
 
// Every reply from the worker replaces the rendered tokens. The handler is installed once
// instead of once per request: the worker answers requests in order, so the last reply
// received always belongs to the most recent request it processed. (Re-installing a
// promise-resolving handler per request let the reply to an older request settle the
// newest promise, after which the real latest reply was silently dropped.)
if (worker) {
  worker.onmessage = (event: MessageEvent<OutgoingMessage>) => {
    if (event.data.type !== "processed") {
      console.error("MarkdownRenderer: unexpected worker message", event.data);
      return;
    }
    tokens = event.data.tokens;
  };
}

// Re-process the markdown whenever `content` or `sources` change.
// `$effect` callbacks only run in the browser, so no server-side branch is needed here;
// the initial (and server-rendered) tokens come from `processTokensSync` above.
$effect(() => {
  if (worker) {
    // The JSON round-trip produces a plain copy of the payload that is safe to post.
    worker.postMessage(
      JSON.parse(JSON.stringify({ content, sources, type: "process" })) as IncomingMessage
    );
    return;
  }

  // No Worker support: process on the main thread and ignore results that arrive
  // after a newer run of this effect has started.
  let stale = false;
  processTokens(content, sources)
    .then((processed) => {
      if (!stale) {
        tokens = processed;
      }
    })
    .catch((error: unknown) => {
      console.error("MarkdownRenderer: failed to process markdown", error);
    });

  return () => {
    stale = true;
  };
});

// Reads no reactive state, so it runs once on mount; its teardown stops the worker when
// the component is destroyed instead of leaking one worker per rendered message.
$effect(() => {
  return () => worker?.terminate();
});
53
 
54
  DOMPurify.addHook("afterSanitizeAttributes", (node) => {
55
  if (node.tagName === "A") {
 
56
  node.setAttribute("target", "_blank");
57
+ node.setAttribute("rel", "noreferrer");
58
  }
59
  });
60
  </script>
61
 
62
+ {#each tokens as token}
63
+ {#if token.type === "text"}
64
+ {#await token.html then html}
 
 
65
  <!-- eslint-disable-next-line svelte/no-at-html-tags -->
66
+ {@html DOMPurify.sanitize(html)}
67
  {/await}
68
+ {:else if token.type === "code"}
69
+ <CodeBlock lang={token.lang} code={token.code} />
70
  {/if}
71
  {/each}
src/lib/utils/marked.ts ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import katex from "katex";
2
+ import "katex/dist/contrib/mhchem.mjs";
3
+ import { Marked } from "marked";
4
+ import type { Tokens, TokenizerExtension, RendererExtension } from "marked";
5
+ import type { WebSearchSource } from "$lib/types/WebSearch";
6
+
/** Token emitted for display-mode math: `$$ … $$` or `\[ … \]`. */
interface katexBlockToken extends Tokens.Generic {
  type: "katexBlock";
  raw: string;
  text: string;
  displayMode: true;
}

/** Token emitted for inline math: `$ … $` or `\( … \)`. */
interface katexInlineToken extends Tokens.Generic {
  type: "katexInline";
  raw: string;
  text: string;
  displayMode: false;
}

// Delimiter patterns are hoisted so they are compiled once rather than on every lexer call.
// None of them uses the `g` flag, so `exec` keeps no state between calls.
const KATEX_BLOCK_START = /\${2}|\\\[/;
const KATEX_BLOCK_RULES: readonly RegExp[] = [/^\${2}([\s\S]+?)\${2}/, /^\\\[([\s\S]+?)\\\]/];
const KATEX_INLINE_START = /\$|\\\(/;
const KATEX_INLINE_RULES: readonly RegExp[] = [/^\$([^$]+?)\$/, /^\\\(([\s\S]+?)\\\)/];

/**
 * Tries each anchored delimiter rule in order against the start of `src`.
 *
 * @param src - remaining source text handed over by the lexer
 * @param rules - anchored patterns whose first capture group is the TeX body
 * @returns the consumed source (`raw`) and the trimmed TeX body (`text`) of the first
 *          rule that matches, or `undefined` when none does
 */
function matchMath(
  src: string,
  rules: readonly RegExp[]
): { raw: string; text: string } | undefined {
  for (const rule of rules) {
    const match = rule.exec(src);
    if (match) {
      return { raw: match[0], text: match[1].trim() };
    }
  }
  return undefined;
}

/** Block-level marked extension that renders `$$ … $$` and `\[ … \]` with KaTeX in display mode. */
export const katexBlockExtension: TokenizerExtension & RendererExtension = {
  name: "katexBlock",
  level: "block",

  /** Index of the next possible block-math opener, or `undefined` when there is none. */
  start(src: string): number | undefined {
    return KATEX_BLOCK_START.exec(src)?.index;
  },

  /** Produces a `katexBlock` token when `src` begins with a complete display-math span. */
  tokenizer(src: string): katexBlockToken | undefined {
    const found = matchMath(src, KATEX_BLOCK_RULES);
    return found ? { type: "katexBlock", ...found, displayMode: true } : undefined;
  },

  /** Renders the token to HTML; invalid TeX is rendered by KaTeX instead of throwing. */
  renderer(token) {
    if (token.type === "katexBlock") {
      return katex.renderToString(token.text, {
        throwOnError: false,
        displayMode: token.displayMode,
      });
    }
    return undefined;
  },
};

/** Inline marked extension that renders `$ … $` and `\( … \)` with KaTeX in inline mode. */
const katexInlineExtension: TokenizerExtension & RendererExtension = {
  name: "katexInline",
  level: "inline",

  /** Index of the next possible inline-math opener, or `undefined` when there is none. */
  start(src: string): number | undefined {
    return KATEX_INLINE_START.exec(src)?.index;
  },

  /** Produces a `katexInline` token when `src` begins with a complete inline-math span. */
  tokenizer(src: string): katexInlineToken | undefined {
    const found = matchMath(src, KATEX_INLINE_RULES);
    return found ? { type: "katexInline", ...found, displayMode: false } : undefined;
  },

  /** Renders the token to HTML; invalid TeX is rendered by KaTeX instead of throwing. */
  renderer(token) {
    if (token.type === "katexInline") {
      return katex.renderToString(token.text, {
        throwOnError: false,
        displayMode: token.displayMode,
      });
    }
    return undefined;
  },
};
120
+
/** Entities for the five characters that are significant in HTML text and attribute values. */
const HTML_ENTITIES: Readonly<Record<string, string>> = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "'": "&#39;",
  '"': "&quot;",
};

/**
 * Escapes `<`, `>`, `&`, `"` and `'` so that `content` is displayed as literal text
 * instead of being interpreted as markup.
 *
 * @param content - raw text (used by the marked `html` renderer for inline/raw HTML)
 * @returns `content` with each special character replaced by its entity
 */
function escapeHTML(content: string): string {
  // The lookup table lives at module scope so it is not rebuilt for every matched character.
  return content.replace(/[<>&"']/g, (char) => HTML_ENTITIES[char] ?? char);
}
134
+
/**
 * Replaces numeric citation markers such as `[2]` with a superscript link to the matching
 * web-search source.
 *
 * Markers are 1-based: `[1]` refers to `webSearchSources[0]`. `[0]` and markers that have
 * no corresponding source are left exactly as written.
 *
 * @param md - text (already rendered HTML when used as the marked `postprocess` hook)
 * @param webSearchSources - sources in citation order; defaults to none
 * @returns `md` with every resolvable marker replaced by ` <sup><a …>n</a></sup>`
 */
function addInlineCitations(md: string, webSearchSources: WebSearchSource[] = []): string {
  // NOTE(review): `hover:` is not valid inside an inline `style` attribute, so that last
  // declaration appears to be inert; it is kept so the emitted markup is unchanged.
  const linkStyle =
    "color: rgb(59, 130, 246); text-decoration: none; hover:text-decoration: underline;";

  // The link comes from search results, so it must not be able to close the attribute
  // or open a tag. Numeric entities are decoded back to the same URL by the HTML parser.
  const escapeAttribute = (value: string): string =>
    value.replace(/[&<>"]/g, (char) => `&#${char.charCodeAt(0)};`);

  return md.replace(/\[(\d+)\]/g, (match: string, digits: string) => {
    const index = Number(digits);
    const source = index > 0 ? webSearchSources[index - 1] : undefined;
    if (!source) {
      return match;
    }
    return ` <sup><a href="${escapeAttribute(source.link)}" target="_blank" rel="noreferrer" style="${linkStyle}">${index}</a></sup>`;
  });
}
154
+
/**
 * Builds a `Marked` parser configured for chat messages.
 *
 * The instance uses GitHub-flavoured markdown with line breaks, the two KaTeX extensions,
 * links that open in a new tab, raw HTML escaped to text, and a post-processing step that
 * turns `[n]` markers into links to `sources`.
 *
 * @param sources - web-search sources that citation markers are resolved against
 * @returns a new, independent `Marked` instance
 */
function createMarkedInstance(sources: WebSearchSource[]): Marked {
  const instance = new Marked({
    gfm: true,
    breaks: true,
    extensions: [katexBlockExtension, katexInlineExtension],
    hooks: {
      postprocess(html) {
        return addInlineCitations(html, sources);
      },
    },
    renderer: {
      link(href, _title, text) {
        // Drop a trailing ">" from the URL before emitting the anchor.
        const target = href?.replace(/>$/, "");
        return `<a href="${target}" target="_blank" rel="noreferrer">${text}</a>`;
      },
      html(raw) {
        return escapeHTML(raw);
      },
    },
  });

  return instance;
}
/** A markdown code block, kept as raw source so it can be rendered separately from the HTML. */
type CodeToken = {
  type: "code";
  lang: string;
  code: string;
};

/** Any other top-level markdown token, converted to HTML. */
type TextToken = {
  type: "text";
  html: string | Promise<string>;
};

/**
 * Splits `content` into top-level markdown tokens and renders each one.
 *
 * Code blocks are returned as `{ type: "code", lang, code }`; every other token is parsed
 * to HTML and returned as `{ type: "text", html }`.
 *
 * @param content - markdown source
 * @param sources - web-search sources used to resolve `[n]` citation markers
 * @returns the processed tokens, in document order, with `html` already resolved to a string
 */
export async function processTokens(content: string, sources: WebSearchSource[]): Promise<Token[]> {
  const marked = createMarkedInstance(sources);
  const tokens = marked.lexer(content);

  return Promise.all(
    tokens.map(async (token): Promise<Token> => {
      if (token.type === "code") {
        return { type: "code", lang: token.lang, code: token.text };
      }
      // `parse` is typed as possibly returning a Promise. Awaiting it here guarantees the
      // result holds a settled string: `Promise.all` does not unwrap a promise nested in
      // an object property, and a nested promise would not survive JSON serialization.
      return { type: "text", html: await marked.parse(token.raw) };
    })
  );
}
204
+
/**
 * Synchronous counterpart of `processTokens`: lexes `content` and converts each top-level
 * token without awaiting anything.
 *
 * @param content - markdown source
 * @param sources - web-search sources used to resolve `[n]` citation markers
 * @returns the processed tokens in document order; `html` holds whatever `marked.parse` returned
 */
export function processTokensSync(content: string, sources: WebSearchSource[]): Token[] {
  const marked = createMarkedInstance(sources);
  const processed: Token[] = [];

  for (const token of marked.lexer(content)) {
    if (token.type === "code") {
      processed.push({ type: "code" as const, lang: token.lang, code: token.text });
    } else {
      processed.push({ type: "text" as const, html: marked.parse(token.raw) });
    }
  }

  return processed;
}

/** A processed markdown token: either a code block or rendered HTML. */
export type Token = CodeToken | TextToken;
src/lib/workers/markdownWorker.ts ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { WebSearchSource } from "$lib/types/WebSearch";
2
+ import { processTokens, type Token } from "$lib/utils/marked";
3
+
/** Request sent to the worker: markdown `content` plus the `sources` passed to `processTokens`. */
export type IncomingMessage = {
  type: "process";
  content: string;
  sources: WebSearchSource[];
};

/** Reply posted by the worker once a request has been processed into tokens. */
export type OutgoingMessage = {
  type: "processed";
  tokens: Token[];
};

// True while a request is being processed, so `onmessage` does not start a second run.
let isProcessing = false;

// Most recent request that has not been processed yet. A newer request overwrites an
// older unprocessed one, so only the latest content is processed.
let latestMessage: IncomingMessage | null = null;

/**
 * Processes the buffered request, then keeps going for as long as a newer request was
 * buffered in the meantime.
 *
 * A failure in one request is logged and does not stop later requests from being handled;
 * no reply is posted for the failed request.
 */
async function processMessage(): Promise<void> {
  while (latestMessage) {
    const { content, sources } = latestMessage;

    latestMessage = null;
    isProcessing = true;

    try {
      const processedTokens = await processTokens(content, sources);
      // The JSON round-trip guarantees that only plain, serializable data is posted.
      postMessage(JSON.parse(JSON.stringify({ type: "processed", tokens: processedTokens })));
    } catch (error) {
      // Previously this escaped as an unhandled promise rejection.
      console.error("markdownWorker: failed to process message", error);
    } finally {
      isProcessing = false;
    }
  }
}
41
+
// Entry point of the worker: buffer the newest "process" request and start processing
// unless a run is already in progress (that run picks the buffered request up itself).
onmessage = (event) => {
  // `?.` tolerates null or primitive payloads instead of throwing on property access.
  if (event.data?.type !== "process") {
    return;
  }

  latestMessage = event.data as IncomingMessage;

  if (!isProcessing) {
    processMessage().catch((error: unknown) => {
      // Keep a failed run from surfacing as an unhandled rejection.
      console.error("markdownWorker: failed to process message", error);
    });
  }
};
src/routes/conversation/[id]/+page.svelte CHANGED
@@ -27,6 +27,8 @@
27
  import { useSettingsStore } from "$lib/stores/settings.js";
28
  import { browser } from "$app/environment";
29
 
 
 
30
  let { data = $bindable() } = $props();
31
 
32
  let loading = $state(false);
@@ -472,12 +474,6 @@
472
 
473
  <svelte:head>
474
  <title>{title}</title>
475
- <link
476
- rel="stylesheet"
477
- href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css"
478
- integrity="sha384-GvrOXuhMATgEsSwCs4smul74iXGOixntILdUW9XmUC6+HX0sLNAK3q71HotJqlAn"
479
- crossorigin="anonymous"
480
- />
481
  </svelte:head>
482
 
483
  <ChatWindow
 
27
  import { useSettingsStore } from "$lib/stores/settings.js";
28
  import { browser } from "$app/environment";
29
 
30
+ import "katex/dist/katex.min.css";
31
+
32
  let { data = $bindable() } = $props();
33
 
34
  let loading = $state(false);
 
474
 
475
  <svelte:head>
476
  <title>{title}</title>
 
 
 
 
 
 
477
  </svelte:head>
478
 
479
  <ChatWindow