Liam Dyer, Aaditya Sahay committed
Commit 2c00ea8 · unverified · 1 parent: d3e833a

Web Search: Playwright, spatial parsing, markdown (#1094)


* feat: playwright, spatial parsing, markdown for web search

Co-authored-by: Aaditya Sahay <[email protected]>

* feat: choose multiple clusters if necessary (#2)

* chore: resolve linting failures

* feat: improve parsing performance and error messages

* feat: combine embeddable chunks together on cpu

* feat: reduce parsed pages from 10 to 8

* feat: disable javascript in playwright by default

* feat: embedding and parsing error messages

* feat: move isURL, fix type errors, misc

* feat: misc cleanup

* feat: change serializedHtmlElement to interface

* fix: isUrl filename

* fix: add playwright dependencies to docker

* feat: add playwright browsers to docker image

* feat: enable javascript by default

* feat: remove error message from console on failed page

---------

Co-authored-by: Aaditya Sahay <[email protected]>

Files changed (38)
  1. .env +2 -1
  2. Dockerfile +6 -0
  3. README.md +2 -0
  4. package-lock.json +289 -27
  5. package.json +5 -0
  6. src/lib/components/chat/ChatMessage.svelte +3 -3
  7. src/lib/server/embeddingEndpoints/hfApi/embeddingHfApi.ts +6 -1
  8. src/lib/server/isURLLocal.ts +31 -19
  9. src/lib/server/preprocessMessages.ts +4 -6
  10. src/lib/server/sentenceSimilarity.ts +12 -21
  11. src/lib/server/websearch/embed/combine.ts +37 -0
  12. src/lib/server/websearch/embed/embed.ts +80 -0
  13. src/lib/server/websearch/embed/tree.ts +6 -0
  14. src/lib/server/websearch/markdown/fromHtml.ts +98 -0
  15. src/lib/server/websearch/markdown/tree.ts +63 -0
  16. src/lib/server/websearch/markdown/types.ts +55 -0
  17. src/lib/server/websearch/markdown/utils/chunk.ts +60 -0
  18. src/lib/server/websearch/markdown/utils/nlp.ts +11 -0
  19. src/lib/server/websearch/markdown/utils/stringify.ts +75 -0
  20. src/lib/server/websearch/parseWeb.ts +0 -41
  21. src/lib/server/websearch/runWebSearch.ts +69 -145
  22. src/lib/server/websearch/scrape/parser.ts +552 -0
  23. src/lib/server/websearch/scrape/playwright.ts +59 -0
  24. src/lib/server/websearch/scrape/scrape.ts +34 -0
  25. src/lib/server/websearch/scrape/types.ts +5 -0
  26. src/lib/server/websearch/search/endpoints.ts +27 -0
  27. src/lib/server/websearch/{searchSearxng.ts → search/endpoints/searxng.ts} +5 -3
  28. src/lib/server/websearch/search/endpoints/serpApi.ts +25 -0
  29. src/lib/server/websearch/search/endpoints/serpStack.ts +35 -0
  30. src/lib/server/websearch/search/endpoints/serper.ts +31 -0
  31. src/lib/server/websearch/{searchWebLocal.ts → search/endpoints/webLocal.ts} +16 -26
  32. src/lib/server/websearch/search/endpoints/youApi.ts +41 -0
  33. src/lib/server/websearch/{generateQuery.ts → search/generateQuery.ts} +1 -1
  34. src/lib/server/websearch/search/search.ts +77 -0
  35. src/lib/server/websearch/searchWeb.ts +0 -148
  36. src/lib/types/WebSearch.ts +16 -17
  37. src/lib/utils/isUrl.ts +8 -0
  38. src/lib/utils/timeout.ts +6 -3
.env CHANGED
@@ -27,6 +27,7 @@ SEARXNG_QUERY_URL=# where '<query>' will be replaced with query keywords see htt
 
 WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
 WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
+WEBSEARCH_JAVASCRIPT=true # CPU usage reduces by 60% on average by disabling javascript. Enable to improve website compatibility
 
 # Parameters to enable open id login
 OPENID_CONFIG=`{
@@ -155,4 +156,4 @@ ALLOWED_USER_EMAILS=`[]` # if it's defined, only these emails will be allowed to
 USAGE_LIMITS=`{}`
 ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
 METRICS_PORT=
-LOG_LEVEL=info
+LOG_LEVEL=info
Dockerfile CHANGED
@@ -83,6 +83,12 @@ COPY --chown=1000 gcp-*.json /app/
 COPY --from=builder --chown=1000 /app/build /app/build
 COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
 
+RUN npx playwright install
+
+USER root
+RUN npx playwright install-deps
+USER user
+
 RUN chmod +x /app/entrypoint.sh
 
 CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
README.md CHANGED
@@ -170,6 +170,8 @@ You can enable the web search through an API by adding `YDC_API_KEY` ([docs.you.
 
 You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
 
+You can enable Javascript when parsing webpages to improve compatibility with `WEBSEARCH_JAVASCRIPT=true` at the cost of increased CPU usage. You'll want at least 4 cores when enabling.
+
 ### Custom models
 
 You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
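For reference, a minimal `.env.local` sketch combining the web-search settings this PR touches (values are illustrative, not defaults):

```env
USE_LOCAL_WEBSEARCH=true   # scrape results from a local google search
WEBSEARCH_JAVASCRIPT=true  # let Playwright execute JS; disabling it cuts CPU use ~60% on average
WEBSEARCH_ALLOWLIST=`[]`   # optionally restrict scraping to these sites
WEBSEARCH_BLOCKLIST=`[]`
```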
package-lock.json CHANGED
@@ -8,9 +8,11 @@
       "name": "chat-ui",
       "version": "0.8.4",
       "dependencies": {
+        "@cliqz/adblocker-playwright": "^1.27.2",
         "@huggingface/hub": "^0.5.1",
         "@huggingface/inference": "^2.6.3",
         "@iconify-json/bi": "^1.1.21",
+        "@playwright/browser-chromium": "^1.43.1",
         "@resvg/resvg-js": "^2.6.0",
         "@xenova/transformers": "^2.16.1",
         "autoprefixer": "^10.4.14",
@@ -32,10 +34,12 @@
         "parquetjs": "^0.11.2",
         "pino": "^9.0.0",
         "pino-pretty": "^11.0.0",
+        "playwright": "^1.40.0",
         "postcss": "^8.4.31",
         "saslprep": "^1.0.3",
         "satori": "^0.10.11",
         "satori-html": "^0.3.2",
+        "sbd": "^1.0.19",
         "serpapi": "^1.1.1",
         "sharp": "^0.33.2",
         "tailwind-scrollbar": "^3.0.0",
@@ -55,6 +59,7 @@
         "@types/jsdom": "^21.1.1",
         "@types/minimist": "^1.2.5",
         "@types/parquetjs": "^0.10.3",
+        "@types/sbd": "^1.0.5",
         "@types/uuid": "^9.0.8",
         "@typescript-eslint/eslint-plugin": "^6.x",
         "@typescript-eslint/parser": "^6.x",
@@ -159,39 +164,54 @@
       }
     },
     "node_modules/@anthropic-ai/vertex-sdk": {
-      "version": "0.3.0",
-      "resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.0.tgz",
-      "integrity": "sha512-RquU3sXAuGdxWnbx5luHovFnQVso7LuAtSmpLkZMOT6x5csldAJdp4TIgMX6/55pAefNVPDTtEYChwK5wpxRww==",
+      "version": "0.3.6",
+      "resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.6.tgz",
+      "integrity": "sha512-4pNVobcCsPCWLSaFJkT/XxwX5rmot+q2PE2LF5vfuRNFTWFjeTrsPgTB48D0Sce/c/2p4fddrFKGN6fdnn8zRg==",
       "optional": true,
       "dependencies": {
-        "@anthropic-ai/sdk": "^0.14",
+        "@anthropic-ai/sdk": ">=0.14 <1",
         "google-auth-library": "^9.4.2"
       }
     },
-    "node_modules/@anthropic-ai/vertex-sdk/node_modules/@anthropic-ai/sdk": {
-      "version": "0.14.1",
-      "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.14.1.tgz",
-      "integrity": "sha512-/o0+6ijSF0WSxnzQ0GUZPKaxOE0y1dqAn9gM9KPU7hc/tqiI4lzCYqe/EFSEw8pFONgYi1IjcvevYjgOOc2vpg==",
-      "optional": true,
+    "node_modules/@cliqz/adblocker": {
+      "version": "1.27.2",
+      "resolved": "https://registry.npmjs.org/@cliqz/adblocker/-/adblocker-1.27.2.tgz",
+      "integrity": "sha512-sFjbx9xBGWaOsvVFVHVUNOrzCafGtjYDAp95KTeoJcNZbPs4D2RsabYZEeg4JkwPkfhcFseJqfnsMyJ4XsqVfQ==",
       "dependencies": {
-        "@types/node": "^18.11.18",
-        "@types/node-fetch": "^2.6.4",
-        "abort-controller": "^3.0.0",
-        "agentkeepalive": "^4.2.1",
-        "digest-fetch": "^1.3.0",
-        "form-data-encoder": "1.7.2",
-        "formdata-node": "^4.3.2",
-        "node-fetch": "^2.6.7",
-        "web-streams-polyfill": "^3.2.1"
+        "@cliqz/adblocker-content": "^1.27.2",
+        "@cliqz/adblocker-extended-selectors": "^1.27.2",
+        "@remusao/guess-url-type": "^1.2.1",
+        "@remusao/small": "^1.2.1",
+        "@remusao/smaz": "^1.9.1",
+        "@types/chrome": "^0.0.266",
+        "@types/firefox-webext-browser": "^120.0.0",
+        "tldts-experimental": "^6.0.14"
       }
     },
-    "node_modules/@anthropic-ai/vertex-sdk/node_modules/web-streams-polyfill": {
-      "version": "3.3.3",
-      "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
-      "integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==",
-      "optional": true,
-      "engines": {
-        "node": ">= 8"
+    "node_modules/@cliqz/adblocker-content": {
+      "version": "1.27.2",
+      "resolved": "https://registry.npmjs.org/@cliqz/adblocker-content/-/adblocker-content-1.27.2.tgz",
+      "integrity": "sha512-fzxsOt7r3YUgxoyW9GPCOShKOLNbB4n3gWtyMBFQ+lwHsQKfLehxN4Zxjg4Ad6AXJNW4gfIBq69ghnj2jHfviw==",
+      "dependencies": {
+        "@cliqz/adblocker-extended-selectors": "^1.27.2"
+      }
+    },
+    "node_modules/@cliqz/adblocker-extended-selectors": {
+      "version": "1.27.2",
+      "resolved": "https://registry.npmjs.org/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.27.2.tgz",
+      "integrity": "sha512-HZ03U8pAOuEwTo1vZ9tv49kIC4riWqYvr5p3illZshxo+eCUi8CPbgYSyYCtgd1JpO1wNnCwEX95/twXfT8cnA=="
+    },
+    "node_modules/@cliqz/adblocker-playwright": {
+      "version": "1.27.2",
+      "resolved": "https://registry.npmjs.org/@cliqz/adblocker-playwright/-/adblocker-playwright-1.27.2.tgz",
+      "integrity": "sha512-b+OoWKz/h787YItfCwjnhZ8l6/bv/DPTzaq1pyyY6Ovpdd+dGvVW1fehw+87FC6j/WQbTeuOdpLiwp8ouvrftg==",
+      "dependencies": {
+        "@cliqz/adblocker": "^1.27.2",
+        "@cliqz/adblocker-content": "^1.27.2",
+        "tldts-experimental": "^6.0.14"
+      },
+      "peerDependencies": {
+        "playwright": "^1.x"
       }
     },
     "node_modules/@cspotcode/source-map-support": {
@@ -1314,6 +1334,18 @@
         "node": ">=8.0.0"
       }
     },
+    "node_modules/@playwright/browser-chromium": {
+      "version": "1.43.1",
+      "resolved": "https://registry.npmjs.org/@playwright/browser-chromium/-/browser-chromium-1.43.1.tgz",
+      "integrity": "sha512-CBuHhRIF/VGyUnPvK7/4IUbm0AAOZZI5huHlr+qNr5cFQpQ6TXBqOwSMef/xUz9HcjxWOxDPION7br1kOlyV/A==",
+      "hasInstallScript": true,
+      "dependencies": {
+        "playwright-core": "1.43.1"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    },
     "node_modules/@polka/url": {
       "version": "1.0.0-next.21",
       "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
@@ -1374,6 +1406,43 @@
       "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
       "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
     },
+    "node_modules/@remusao/guess-url-type": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.2.1.tgz",
+      "integrity": "sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA=="
+    },
+    "node_modules/@remusao/small": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.2.1.tgz",
+      "integrity": "sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw=="
+    },
+    "node_modules/@remusao/smaz": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.9.1.tgz",
+      "integrity": "sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==",
+      "dependencies": {
+        "@remusao/smaz-compress": "^1.9.1",
+        "@remusao/smaz-decompress": "^1.9.1"
+      }
+    },
+    "node_modules/@remusao/smaz-compress": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.9.1.tgz",
+      "integrity": "sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==",
+      "dependencies": {
+        "@remusao/trie": "^1.4.1"
+      }
+    },
+    "node_modules/@remusao/smaz-decompress": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.9.1.tgz",
+      "integrity": "sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A=="
+    },
+    "node_modules/@remusao/trie": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.4.1.tgz",
+      "integrity": "sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q=="
+    },
     "node_modules/@resvg/resvg-js": {
       "version": "2.6.0",
       "resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
@@ -2063,6 +2132,15 @@
         "@types/chai": "*"
       }
     },
+    "node_modules/@types/chrome": {
+      "version": "0.0.266",
+      "resolved": "https://registry.npmjs.org/@types/chrome/-/chrome-0.0.266.tgz",
+      "integrity": "sha512-QSQWJTL7NjZElvq/6/E5C1+pHgEP8UAJzwoz7M4vSJ7AECt6NNehJ+tU6snnvuTqZOBjFCivvitYo5+8tNPmhg==",
+      "dependencies": {
+        "@types/filesystem": "*",
+        "@types/har-format": "*"
+      }
+    },
     "node_modules/@types/connect": {
       "version": "3.4.38",
       "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
@@ -2108,6 +2186,29 @@
         "@types/send": "*"
       }
     },
+    "node_modules/@types/filesystem": {
+      "version": "0.0.36",
+      "resolved": "https://registry.npmjs.org/@types/filesystem/-/filesystem-0.0.36.tgz",
+      "integrity": "sha512-vPDXOZuannb9FZdxgHnqSwAG/jvdGM8Wq+6N4D/d80z+D4HWH+bItqsZaVRQykAn6WEVeEkLm2oQigyHtgb0RA==",
+      "dependencies": {
+        "@types/filewriter": "*"
+      }
+    },
+    "node_modules/@types/filewriter": {
+      "version": "0.0.33",
+      "resolved": "https://registry.npmjs.org/@types/filewriter/-/filewriter-0.0.33.tgz",
+      "integrity": "sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g=="
+    },
+    "node_modules/@types/firefox-webext-browser": {
+      "version": "120.0.3",
+      "resolved": "https://registry.npmjs.org/@types/firefox-webext-browser/-/firefox-webext-browser-120.0.3.tgz",
+      "integrity": "sha512-APbBSxOvFMbKwXy/4YrEVa5Di6N0C9yl4w0WA0xzdkOrChAfPQ/KlcC8QLyhemHCHpF1CB/zHy52+oUQurViOg=="
+    },
+    "node_modules/@types/har-format": {
+      "version": "1.2.15",
+      "resolved": "https://registry.npmjs.org/@types/har-format/-/har-format-1.2.15.tgz",
+      "integrity": "sha512-RpQH4rXLuvTXKR0zqHq3go0RVXYv/YVqv4TnPH95VbwUxZdQlK1EtcMvQvMpDngHbt13Csh9Z4qT9AbkiQH5BA=="
+    },
     "node_modules/@types/http-errors": {
       "version": "2.0.4",
       "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@@ -2216,6 +2317,12 @@
       "integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
       "dev": true
     },
+    "node_modules/@types/sbd": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/@types/sbd/-/sbd-1.0.5.tgz",
+      "integrity": "sha512-60PxBBWhg0C3yb5bTP+wwWYGTKMcuB0S6mTEa1sedMC79tYY0Ei7YjU4qsWzGn++lWscLQde16SnElJrf5/aTw==",
+      "dev": true
+    },
     "node_modules/@types/semver": {
       "version": "7.5.3",
       "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
@@ -3660,7 +3767,6 @@
       "version": "4.3.1",
       "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
       "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
-      "dev": true,
       "engines": {
        "node": ">=0.10.0"
       }
@@ -3791,6 +3897,30 @@
         "node": ">=6.0.0"
       }
     },
+    "node_modules/dom-serializer": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
+      "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
+      "dependencies": {
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.2",
+        "entities": "^4.2.0"
+      },
+      "funding": {
+        "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
+      }
+    },
+    "node_modules/domelementtype": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
+      "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/fb55"
+        }
+      ]
+    },
     "node_modules/domexception": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
@@ -3802,6 +3932,33 @@
         "node": ">=12"
       }
     },
+    "node_modules/domhandler": {
+      "version": "5.0.3",
+      "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
+      "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
+      "dependencies": {
+        "domelementtype": "^2.3.0"
+      },
+      "engines": {
+        "node": ">= 4"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/domhandler?sponsor=1"
+      }
+    },
+    "node_modules/domutils": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
+      "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
+      "dependencies": {
+        "dom-serializer": "^2.0.0",
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.3"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/domutils?sponsor=1"
+      }
+    },
     "node_modules/dotenv": {
       "version": "16.0.3",
       "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
@@ -3940,7 +4097,6 @@
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
       "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
-      "dev": true,
       "engines": {
        "node": ">=10"
       },
@@ -4924,6 +5080,24 @@
         "node": ">=12"
       }
     },
+    "node_modules/htmlparser2": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
+      "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
+      "funding": [
+        "https://github.com/fb55/htmlparser2?sponsor=1",
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/fb55"
+        }
+      ],
+      "dependencies": {
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.3",
+        "domutils": "^3.0.1",
+        "entities": "^4.4.0"
+      }
+    },
     "node_modules/http-errors": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@@ -5194,6 +5368,14 @@
         "node": ">=8"
       }
     },
+    "node_modules/is-plain-object": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz",
+      "integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
     "node_modules/is-potential-custom-element-name": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
@@ -6354,6 +6536,11 @@
         "hex-rgb": "^4.1.0"
       }
     },
+    "node_modules/parse-srcset": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz",
+      "integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
+    },
     "node_modules/parse5": {
       "version": "7.1.2",
       "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
@@ -6645,6 +6832,47 @@
       "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
       "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
     },
+    "node_modules/playwright": {
+      "version": "1.43.1",
+      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.43.1.tgz",
+      "integrity": "sha512-V7SoH0ai2kNt1Md9E3Gwas5B9m8KR2GVvwZnAI6Pg0m3sh7UvgiYhRrhsziCmqMJNouPckiOhk8T+9bSAK0VIA==",
+      "dependencies": {
+        "playwright-core": "1.43.1"
+      },
+      "bin": {
+        "playwright": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "optionalDependencies": {
+        "fsevents": "2.3.2"
+      }
+    },
+    "node_modules/playwright-core": {
+      "version": "1.43.1",
+      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.43.1.tgz",
+      "integrity": "sha512-EI36Mto2Vrx6VF7rm708qSnesVQKbxEWvPrfA1IPY6HgczBplDx7ENtx+K2n4kJ41sLLkuGfmb0ZLSSXlDhqPg==",
+      "bin": {
+        "playwright-core": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    },
+    "node_modules/playwright/node_modules/fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "hasInstallScript": true,
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
     "node_modules/postcss": {
       "version": "8.4.35",
       "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
@@ -7431,6 +7659,19 @@
         "rimraf": "bin.js"
       }
     },
+    "node_modules/sanitize-html": {
+      "version": "2.13.0",
+      "resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.0.tgz",
+      "integrity": "sha512-Xff91Z+4Mz5QiNSLdLWwjgBDm5b1RU6xBT0+12rapjiaR7SwfRdjw8f+6Rir2MXKLrDicRFHdb51hGOAxmsUIA==",
+      "dependencies": {
+        "deepmerge": "^4.2.2",
+        "escape-string-regexp": "^4.0.0",
+        "htmlparser2": "^8.0.0",
+        "is-plain-object": "^5.0.0",
+        "parse-srcset": "^1.0.2",
+        "postcss": "^8.3.11"
+      }
+    },
     "node_modules/saslprep": {
       "version": "1.0.3",
       "resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
@@ -7481,6 +7722,14 @@
         "node": ">=v12.22.7"
       }
     },
+    "node_modules/sbd": {
+      "version": "1.0.19",
+      "resolved": "https://registry.npmjs.org/sbd/-/sbd-1.0.19.tgz",
+      "integrity": "sha512-b5RyZMGSrFuIB4AHdbv12uYHS8YGEJ36gtuvG3RflbJGY+T0dXmAL0E4vZjQqT2RsX0v+ZwVqhV2zsGr5aFK9w==",
+      "dependencies": {
+        "sanitize-html": "^2.3.2"
+      }
+    },
     "node_modules/secure-json-parse": {
       "version": "2.7.0",
       "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
@@ -8428,6 +8677,19 @@
         "node": ">=14.0.0"
       }
     },
+    "node_modules/tldts-core": {
+      "version": "6.1.18",
+      "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.18.tgz",
+      "integrity": "sha512-e4wx32F/7dMBSZyKAx825Yte3U0PQtZZ0bkWxYQiwLteRVnQ5zM40fEbi0IyNtwQssgJAk3GCr7Q+w39hX0VKA=="
+    },
+    "node_modules/tldts-experimental": {
+      "version": "6.1.18",
+      "resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-6.1.18.tgz",
+      "integrity": "sha512-E9/pAIybo7/MPdsQSKcCDElgObk78Be1gFqO645LbfhL5HG597sOeRQ55EuvIHlTo1Ypyyl+F/V+p0CnrTu3uQ==",
+      "dependencies": {
+        "tldts-core": "^6.1.18"
+      }
+    },
     "node_modules/to-regex-range": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
package.json CHANGED
@@ -28,6 +28,7 @@
 		"@types/jsdom": "^21.1.1",
 		"@types/minimist": "^1.2.5",
 		"@types/parquetjs": "^0.10.3",
+		"@types/sbd": "^1.0.5",
 		"@types/uuid": "^9.0.8",
 		"@typescript-eslint/eslint-plugin": "^6.x",
 		"@typescript-eslint/parser": "^6.x",
@@ -52,9 +53,11 @@
 	},
 	"type": "module",
 	"dependencies": {
+		"@cliqz/adblocker-playwright": "^1.27.2",
 		"@huggingface/hub": "^0.5.1",
 		"@huggingface/inference": "^2.6.3",
 		"@iconify-json/bi": "^1.1.21",
+		"@playwright/browser-chromium": "^1.43.1",
 		"@resvg/resvg-js": "^2.6.0",
 		"@xenova/transformers": "^2.16.1",
 		"autoprefixer": "^10.4.14",
@@ -76,10 +79,12 @@
 		"parquetjs": "^0.11.2",
 		"pino": "^9.0.0",
 		"pino-pretty": "^11.0.0",
+		"playwright": "^1.40.0",
 		"postcss": "^8.4.31",
 		"saslprep": "^1.0.3",
 		"satori": "^0.10.11",
 		"satori-html": "^0.3.2",
+		"sbd": "^1.0.19",
 		"serpapi": "^1.1.1",
 		"sharp": "^0.33.2",
 		"tailwind-scrollbar": "^3.0.0",
src/lib/components/chat/ChatMessage.svelte CHANGED
@@ -227,7 +227,7 @@
 	{#if webSearchSources?.length}
 		<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
 			<div class="text-gray-400">Sources:</div>
-			{#each webSearchSources as { link, title, hostname }}
+			{#each webSearchSources as { link, title }}
 				<a
 					class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
 					href={link}
@@ -235,10 +235,10 @@
 				>
 					<img
 						class="h-3.5 w-3.5 rounded"
-						src="https://www.google.com/s2/favicons?sz=64&domain_url={hostname}"
+						src="https://www.google.com/s2/favicons?sz=64&domain_url={new URL(link).hostname}"
 						alt="{title} favicon"
 					/>
-					<div>{hostname.replace(/^www\./, "")}</div>
+					<div>{new URL(link).hostname.replace(/^www\./, "")}</div>
 				</a>
 			{/each}
 		</div>
src/lib/server/embeddingEndpoints/hfApi/embeddingHfApi.ts CHANGED
@@ -32,7 +32,12 @@ export async function embeddingEndpointHfApi(
 			"Content-Type": "application/json",
 			...(authorization ? { Authorization: authorization } : {}),
 		},
-		body: JSON.stringify({ inputs: batchInputs }),
+		body: JSON.stringify({
+			inputs: {
+				source_sentence: batchInputs[0],
+				sentences: batchInputs.slice(1),
+			},
+		}),
 	});
 
 	if (!response.ok) {
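For context, the old body sent a bare array of strings (the feature-extraction payload), while the new body matches the Hugging Face Inference API's sentence-similarity task: one source sentence scored against a list of candidates. A standalone sketch of that request shape, with a placeholder model and token (not taken from this PR):

```ts
// Sentence-similarity payload sketch; the model name and token are illustrative.
const res = await fetch(
	"https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2",
	{
		method: "POST",
		headers: {
			"Content-Type": "application/json",
			Authorization: `Bearer ${process.env.HF_TOKEN}`,
		},
		body: JSON.stringify({
			inputs: {
				source_sentence: "How do I scrape a web page?",
				sentences: ["Playwright drives a real browser.", "Bananas are yellow."],
			},
		}),
	}
);
const scores: number[] = await res.json(); // one similarity score per candidate sentence
```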
src/lib/server/isURLLocal.ts CHANGED
@@ -1,26 +1,38 @@
 import { Address6, Address4 } from "ip-address";
-
 import dns from "node:dns";
 
-export async function isURLLocal(URL: URL): Promise<boolean> {
-	const isLocal = new Promise<boolean>((resolve, reject) => {
-		dns.lookup(URL.hostname, (err, address, family) => {
-			if (err) {
-				reject(err);
-			}
-			if (family === 4) {
-				const addr = new Address4(address);
-				resolve(addr.isInSubnet(new Address4("127.0.0.0/8")));
-			} else if (family === 6) {
-				const addr = new Address6(address);
-				resolve(
-					addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal()
-				);
-			} else {
-				reject(new Error("Unknown IP family"));
-			}
+const dnsLookup = (hostname: string): Promise<{ address: string; family: number }> => {
+	return new Promise((resolve, reject) => {
+		dns.lookup(hostname, (err, address, family) => {
+			if (err) return reject(err);
+			resolve({ address, family });
 		});
 	});
+};
+
+export async function isURLLocal(URL: URL): Promise<boolean> {
+	const { address, family } = await dnsLookup(URL.hostname);
+
+	if (family === 4) {
+		const addr = new Address4(address);
+		const localSubnet = new Address4("127.0.0.0/8");
+		return addr.isInSubnet(localSubnet);
+	}
+
+	if (family === 6) {
+		const addr = new Address6(address);
+		return addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal();
+	}
+
+	throw Error("Unknown IP family");
+}
 
-	return isLocal;
+export function isURLStringLocal(url: string) {
+	try {
+		const urlObj = new URL(url);
+		return isURLLocal(urlObj);
+	} catch (e) {
+		// assume local if URL parsing fails
+		return true;
+	}
 }
src/lib/server/preprocessMessages.ts CHANGED
@@ -13,11 +13,9 @@ export async function preprocessMessages(
 	return await Promise.all(
 		structuredClone(messages).map(async (message, idx) => {
 			const webSearchContext = webSearch?.contextSources
-				.map(({ context }) => context)
-				.flat()
-				.sort((a, b) => a.idx - b.idx)
-				.map(({ text }) => text)
-				.join(" ");
+				.map(({ context }) => context.trim())
+				.join("\n\n----------\n\n");
+
 			// start by adding websearch to the last message
 			if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
 				const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
@@ -27,7 +25,7 @@
 				.map((el) => el.content);
 			const currentDate = format(new Date(), "MMMM d, yyyy");
 
-			message.content = `I searched the web using the query: ${webSearch.searchQuery}.
+			message.content = `I searched the web using the query: ${webSearch.searchQuery}.
 Today is ${currentDate} and here are the results:
 =====================
 ${webSearchContext}
src/lib/server/sentenceSimilarity.ts CHANGED
@@ -3,40 +3,31 @@ import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
 import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
 
 // see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
-function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
+export function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
 	return 1.0 - dot(embeddingA, embeddingB);
 }
 
-export async function findSimilarSentences(
+export async function getSentenceSimilarity(
 	embeddingModel: EmbeddingBackendModel,
 	query: string,
-	sentences: string[],
-	{ topK = 5 }: { topK: number }
-): Promise<Embedding> {
+	sentences: string[]
+): Promise<{ distance: number; embedding: Embedding; idx: number }[]> {
 	const inputs = [
 		`${embeddingModel.preQuery}${query}`,
 		...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
 	];
 
 	const embeddingEndpoint = await embeddingModel.getEndpoint();
-	const output = await embeddingEndpoint({ inputs });
+	const output = await embeddingEndpoint({ inputs }).catch((err) => {
+		throw Error("Failed to generate embeddings for sentence similarity", { cause: err });
+	});
 
 	const queryEmbedding: Embedding = output[0];
 	const sentencesEmbeddings: Embedding[] = output.slice(1);
 
-	const distancesFromQuery: { distance: number; index: number }[] = [...sentencesEmbeddings].map(
-		(sentenceEmbedding: Embedding, index: number) => {
-			return {
-				distance: innerProduct(queryEmbedding, sentenceEmbedding),
-				index,
-			};
-		}
-	);
-
-	distancesFromQuery.sort((a, b) => {
-		return a.distance - b.distance;
-	});
-
-	// Return the indexes of the closest topK sentences
-	return distancesFromQuery.slice(0, topK).map((item) => item.index);
+	return sentencesEmbeddings.map((sentenceEmbedding, idx) => ({
+		distance: innerProduct(queryEmbedding, sentenceEmbedding),
+		embedding: sentenceEmbedding,
+		idx,
+	}));
 }
src/lib/server/websearch/embed/combine.ts ADDED
@@ -0,0 +1,37 @@
+import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
+import { getSentenceSimilarity } from "$lib/server/sentenceSimilarity";
+
+/**
+ * Combines sentences together to reach the maximum character limit of the embedding model
+ * Improves performance considerably when using CPU embedding
+ */
+export async function getCombinedSentenceSimilarity(
+	embeddingModel: EmbeddingBackendModel,
+	query: string,
+	sentences: string[]
+): ReturnType<typeof getSentenceSimilarity> {
+	const combinedSentences = sentences.reduce<{ text: string; indices: number[] }[]>(
+		(acc, sentence, idx) => {
+			const lastSentence = acc[acc.length - 1];
+			if (!lastSentence) return [{ text: sentence, indices: [idx] }];
+			if (lastSentence.text.length + sentence.length < embeddingModel.chunkCharLength) {
+				lastSentence.text += ` ${sentence}`;
+				lastSentence.indices.push(idx);
+				return acc;
+			}
+			return [...acc, { text: sentence, indices: [idx] }];
+		},
+		[]
+	);
+
+	const embeddings = await getSentenceSimilarity(
+		embeddingModel,
+		query,
+		combinedSentences.map(({ text }) => text)
+	);
+
+	return embeddings.flatMap((embedding, idx) => {
+		const { indices } = combinedSentences[idx];
+		return indices.map((i) => ({ ...embedding, idx: i }));
+	});
+}
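To make the fan-out concrete, a toy walkthrough of the reduce above (hypothetical values, assuming `chunkCharLength = 20`):

```ts
// sentences = ["one two", "three four", "a much longer sentence"]
// The reduce packs greedily while the combined length stays under chunkCharLength:
//   { text: "one two three four", indices: [0, 1] }   // 7 + 10 < 20, joined
//   { text: "a much longer sentence", indices: [2] }  // would exceed 20, new group
// Only 2 texts are embedded instead of 3; the flatMap then fans each group's
// distance/embedding back out so indices 0 and 1 each get their own entry.
```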
src/lib/server/websearch/embed/embed.ts ADDED
@@ -0,0 +1,80 @@
+import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
+import type { EmbeddingBackendModel } from "../../embeddingModels";
+import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
+import { MarkdownElementType, type MarkdownElement } from "../markdown/types";
+import { stringifyMarkdownElement } from "../markdown/utils/stringify";
+import { getCombinedSentenceSimilarity } from "./combine";
+import { flattenTree } from "./tree";
+
+const MIN_CHARS = 3_000;
+const SOFT_MAX_CHARS = 8_000;
+
+export async function findContextSources(
+	sources: WebSearchScrapedSource[],
+	prompt: string,
+	embeddingModel: EmbeddingBackendModel
+) {
+	const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
+	const markdownElems = sourcesMarkdownElems.flat();
+
+	// When using CPU embedding (transformersjs), join sentences together to the max character limit
+	// to reduce inference time
+	const embeddingFunc =
+		embeddingModel.endpoints[0].type === "transformersjs"
+			? getCombinedSentenceSimilarity
+			: getSentenceSimilarity;
+
+	const embeddings = await embeddingFunc(
+		embeddingModel,
+		prompt,
+		markdownElems
+			.map(stringifyMarkdownElement)
+			// Safety in case the stringified markdown elements are too long
+			// but chunking should have happened earlier
+			.map((elem) => elem.slice(0, embeddingModel.chunkCharLength))
+	);
+
+	const topEmbeddings = embeddings
+		.sort((a, b) => a.distance - b.distance)
+		.filter((embedding) => markdownElems[embedding.idx].type !== MarkdownElementType.Header);
+
+	let totalChars = 0;
+	const selectedMarkdownElems = new Set<MarkdownElement>();
+	const selectedEmbeddings: number[][] = [];
+	for (const embedding of topEmbeddings) {
+		const elem = markdownElems[embedding.idx];
+
+		// Ignore elements that are too similar to already selected elements
+		const tooSimilar = selectedEmbeddings.some(
+			(selectedEmbedding) => innerProduct(selectedEmbedding, embedding.embedding) < 0.01
+		);
+		if (tooSimilar) continue;
+
+		// Add element
+		if (!selectedMarkdownElems.has(elem)) {
+			selectedMarkdownElems.add(elem);
+			selectedEmbeddings.push(embedding.embedding);
+			totalChars += elem.content.length;
+		}
+
+		// Add element's parent (header)
+		if (elem.parent && !selectedMarkdownElems.has(elem.parent)) {
+			selectedMarkdownElems.add(elem.parent);
+			totalChars += elem.parent.content.length;
+		}
+
+		if (totalChars > SOFT_MAX_CHARS) break;
+		if (totalChars > MIN_CHARS && embedding.distance > 0.25) break;
+	}
+
+	const contextSources = sourcesMarkdownElems
+		.map<WebSearchUsedSource>((elems, idx) => {
+			const sourceSelectedElems = elems.filter((elem) => selectedMarkdownElems.has(elem));
+			const context = sourceSelectedElems.map(stringifyMarkdownElement).join("\n");
+			const source = sources[idx];
+			return { ...source, context };
+		})
+		.filter((contextSource) => contextSource.context.length > 0);
+
+	return contextSources;
+}
src/lib/server/websearch/embed/tree.ts ADDED
@@ -0,0 +1,6 @@
+import type { MarkdownElement } from "../markdown/types";
+
+export function flattenTree(elem: MarkdownElement): MarkdownElement[] {
+	if ("children" in elem) return [elem, ...elem.children.flatMap(flattenTree)];
+	return [elem];
+}
src/lib/server/websearch/markdown/fromHtml.ts ADDED
@@ -0,0 +1,98 @@
+import { collapseString, sanitizeString } from "./utils/nlp";
+import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify";
+import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types";
+import type { SerializedHTMLElement } from "../scrape/types";
+
+interface ConversionState {
+	defaultType:
+		| MarkdownElementType.Paragraph
+		| MarkdownElementType.BlockQuote
+		| MarkdownElementType.UnorderedListItem
+		| MarkdownElementType.OrderedListItem;
+	listDepth: number;
+	blockQuoteDepth: number;
+}
+export function htmlElementToMarkdownElements(
+	parent: HeaderElement,
+	elem: SerializedHTMLElement | string,
+	prevState: ConversionState = {
+		defaultType: MarkdownElementType.Paragraph,
+		listDepth: 0,
+		blockQuoteDepth: 0,
+	}
+): MarkdownElement | MarkdownElement[] {
+	// Found text so create an element based on the previous state
+	if (typeof elem === "string") {
+		if (elem.trim().length === 0) return [];
+		if (
+			prevState.defaultType === MarkdownElementType.UnorderedListItem ||
+			prevState.defaultType === MarkdownElementType.OrderedListItem
+		) {
+			return {
+				parent,
+				type: prevState.defaultType,
+				content: elem,
+				depth: prevState.listDepth,
+			};
+		}
+		if (prevState.defaultType === MarkdownElementType.BlockQuote) {
+			return {
+				parent,
+				type: prevState.defaultType,
+				content: elem,
+				depth: prevState.blockQuoteDepth,
+			};
+		}
+		return { parent, type: prevState.defaultType, content: elem };
+	}
+
+	const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;
+
+	// Update the state based on the current element
+	const state: ConversionState = { ...prevState };
+	if (type === MarkdownElementType.UnorderedList || type === MarkdownElementType.OrderedList) {
+		state.listDepth += 1;
+		state.defaultType =
+			type === MarkdownElementType.UnorderedList
+				? MarkdownElementType.UnorderedListItem
+				: MarkdownElementType.OrderedListItem;
+	}
+	if (type === MarkdownElementType.BlockQuote) {
+		state.defaultType = MarkdownElementType.BlockQuote;
+		state.blockQuoteDepth += 1;
+	}
+
+	// Headers
+	if (type === MarkdownElementType.Header) {
+		return {
+			parent,
+			type,
+			level: Number(elem.tagName[1]),
+			content: collapseString(stringifyHTMLElements(elem.content)),
+			children: [],
+		};
+	}
+
+	// Code blocks
+	if (type === MarkdownElementType.CodeBlock) {
+		return {
+			parent,
+			type,
+			content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
+		};
+	}
+
+	// Typical case, we want to flatten the DOM and only create elements when we see text
+	return elem.content.flatMap((el) => htmlElementToMarkdownElements(parent, el, state));
+}
+
+export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
+	return elements.reduce<MarkdownElement[]>((acc, elem) => {
+		const last = acc[acc.length - 1];
+		if (last && last.type === MarkdownElementType.Paragraph && last.type === elem.type) {
+			last.content += elem.content;
+			return acc;
+		}
+		return [...acc, elem];
+	}, []);
+}
src/lib/server/websearch/markdown/tree.ts ADDED
@@ -0,0 +1,63 @@
+import type { SerializedHTMLElement } from "../scrape/types";
+import { htmlElementToMarkdownElements, mergeAdjacentElements } from "./fromHtml";
+import type { HeaderElement, MarkdownElement } from "./types";
+import { MarkdownElementType } from "./types";
+import { chunkElements } from "./utils/chunk";
+
+/**
+ * Converts HTML elements to Markdown elements and creates a tree based on header tags
+ * For example: h1 [h2 [p p blockquote] h2 [h3 [...] ] ]
+ **/
+export function htmlToMarkdownTree(
+	title: string,
+	htmlElements: SerializedHTMLElement[],
+	maxCharsPerElem: number
+): HeaderElement {
+	let parent: HeaderElement = {
+		type: MarkdownElementType.Header,
+		level: 1,
+		parent: null,
+		content: title,
+		children: [],
+	};
+
+	const markdownElements = chunkElements(
+		mergeAdjacentElements(
+			htmlElements.flatMap((elem) => htmlElementToMarkdownElements(parent, elem))
+		),
+		maxCharsPerElem
+	);
+
+	for (const elem of markdownElements) {
+		if (elem.type !== MarkdownElementType.Header) {
+			elem.parent = parent;
+			parent.children.push(elem);
+			continue;
+		}
+
+		// add 1 to current level to offset for the title being level 1
+		elem.level += 1;
+
+		// Pop up header levels until reaching the same level as the current header
+		// or until we reach the root
+		inner: while (parent !== null && parent.parent !== null) {
+			if (parent.level < elem.level) break inner;
+			parent = parent.parent;
+		}
+		parent.children.push(elem);
+		parent = elem;
+	}
+
+	// Pop up to the root
+	while (parent.parent !== null) {
+		parent = parent.parent;
+	}
+	return parent;
+}
+
+export function removeParents<T extends MarkdownElement>(elem: T): T {
+	if ("children" in elem) {
+		return { ...elem, parent: null, children: elem.children.map((child) => removeParents(child)) };
+	}
+	return { ...elem, parent: null };
+}
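A hedged usage sketch showing how the tree builder composes with `flattenTree` from `../embed/tree` (the literal input below is hand-written; the `attributes` field is an assumption about the `SerializedHTMLElement` shape, since `scrape/types.ts` is not shown on this page, and real objects come from the Playwright scraper):

```ts
import { htmlToMarkdownTree } from "$lib/server/websearch/markdown/tree";
import { flattenTree } from "$lib/server/websearch/embed/tree";
import type { SerializedHTMLElement } from "$lib/server/websearch/scrape/types";

// Hand-written stand-in for a scraped page: <h2>Setup</h2><p>Install it.</p>
const htmlElements: SerializedHTMLElement[] = [
	{ tagName: "h2", attributes: {}, content: ["Setup"] },
	{ tagName: "p", attributes: {}, content: ["Install it."] },
];

const tree = htmlToMarkdownTree("Example Docs", htmlElements, 1000);
// The page title becomes the level-1 root; "Setup" lands under it as a level-3
// header (h2 plus the +1 title offset), with the paragraph as its child.
console.log(flattenTree(tree).map((elem) => elem.content));
// -> ["Example Docs", "Setup", "Install it."]
```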
src/lib/server/websearch/markdown/types.ts ADDED
@@ -0,0 +1,55 @@
+/* eslint-disable-next-line no-shadow */
+export enum MarkdownElementType {
+	Header = "HEADER",
+	Paragraph = "PARAGRAPH",
+	BlockQuote = "BLOCKQUOTE",
+	CodeBlock = "CODE_BLOCK",
+
+	UnorderedList = "UNORDERED_LIST",
+	OrderedList = "ORDERED_LIST",
+	UnorderedListItem = "UNORDERED_LIST_ITEM",
+	OrderedListItem = "ORDERED_LIST_ITEM",
+}
+
+interface BaseMarkdownElement<T = MarkdownElementType> {
+	type: T;
+	content: string;
+	parent: HeaderElement | null;
+}
+
+export interface HeaderElement extends BaseMarkdownElement<MarkdownElementType.Header> {
+	level: number;
+	children: MarkdownElement[];
+}
+type ListItem = MarkdownElementType.UnorderedListItem | MarkdownElementType.OrderedListItem;
+interface ListItemElement extends BaseMarkdownElement<ListItem> {
+	depth: number;
+}
+interface BlockQuoteElement extends BaseMarkdownElement<MarkdownElementType.BlockQuote> {
+	depth: number;
+}
+interface ParagraphElement extends BaseMarkdownElement<MarkdownElementType.Paragraph> {}
+interface CodeBlockElement extends BaseMarkdownElement<MarkdownElementType.CodeBlock> {}
+
+export type MarkdownElement =
+	| HeaderElement
+	| ParagraphElement
+	| BlockQuoteElement
+	| CodeBlockElement
+	| ListItemElement;
+
+export const tagNameMap: Record<string, MarkdownElementType> = {
+	h1: MarkdownElementType.Header,
+	h2: MarkdownElementType.Header,
+	h3: MarkdownElementType.Header,
+	h4: MarkdownElementType.Header,
+	h5: MarkdownElementType.Header,
+	h6: MarkdownElementType.Header,
+	div: MarkdownElementType.Paragraph,
+	p: MarkdownElementType.Paragraph,
+	blockquote: MarkdownElementType.BlockQuote,
+	pre: MarkdownElementType.CodeBlock,
+	ul: MarkdownElementType.UnorderedList,
+	ol: MarkdownElementType.OrderedList,
+	li: MarkdownElementType.UnorderedListItem,
+};
src/lib/server/websearch/markdown/utils/chunk.ts ADDED
@@ -0,0 +1,60 @@
+import { sentences as splitBySentences } from "sbd";
+import { MarkdownElementType, type MarkdownElement } from "../types";
+
+export function chunkElements(elements: MarkdownElement[], maxLength: number): MarkdownElement[] {
+	return elements.flatMap((elem) => {
+		// Can't split headers because it would break the tree, and this situation should be rare
+		// so we just cut off the end
+		if (elem.type === MarkdownElementType.Header) {
+			return { ...elem, content: elem.content.slice(0, maxLength) };
+		}
+		const contentChunks = enforceMaxLength(elem.content, maxLength);
+		return contentChunks.map<MarkdownElement>((content) => ({ ...elem, content }));
+	});
+}
+
+const delimitersByPriority = ["?", "!", ".", ";", ":", ",", "|", " - ", " ", "-"];
+function enforceMaxLength(text: string, maxLength: number): string[] {
+	if (text.length <= maxLength) return [text].filter(Boolean);
+	return splitBySentences(text)
+		.flatMap((sentence) => {
+			if (sentence.length <= maxLength) return sentence;
+
+			// Discover all necessary split points to fit the sentence within the max length
+			const indices: [number, number][] = [];
+			while ((indices.at(-1)?.[1] ?? 0) < sentence.length) {
+				const prevIndex = indices.at(-1)?.[1] ?? 0;
+
+				// Remaining text fits within maxLength
+				if (prevIndex + maxLength >= sentence.length) {
+					indices.push([prevIndex, sentence.length]);
+					continue;
+				}
+
+				const bestDelimiter = delimitersByPriority.find(
+					(delimiter) => sentence.lastIndexOf(delimiter, prevIndex + maxLength) !== -1
+				);
+				// Fallback in the unusual case that no delimiter is found
+				if (!bestDelimiter) {
+					indices.push([prevIndex, prevIndex + maxLength]);
+					continue;
+				}
+
+				const closestDelimiter = sentence.lastIndexOf(bestDelimiter, prevIndex + maxLength);
+				indices.push([prevIndex, Math.max(prevIndex + 1, closestDelimiter)]);
+			}
+
+			return indices.map((sliceIndices) => sentence.slice(...sliceIndices));
+		})
+		.reduce<string[]>(
+			(chunks, sentence) => {
+				const lastChunk = chunks[chunks.length - 1];
+				if (lastChunk.length + sentence.length <= maxLength) {
+					return [...chunks.slice(0, -1), lastChunk + sentence];
+				}
+				return [...chunks, sentence];
+			},
+			[""]
+		)
+		.filter(Boolean);
+}
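For orientation, a minimal sketch of how chunkElements behaves, assuming BaseMarkdownElement carries type, content, and parent; the maxLength of 40 and the import paths are illustrative, not values from this PR:

import { MarkdownElementType, type MarkdownElement } from "../types";
import { chunkElements } from "./chunk";

// Hypothetical paragraph element; `parent: undefined` marks it as a root child.
const paragraph = {
	type: MarkdownElementType.Paragraph,
	parent: undefined,
	content: "Short sentence. A much longer sentence that will not fit in one chunk of forty chars.",
} as MarkdownElement;

// Oversized paragraphs are split at sentence boundaries first, then at the
// highest-priority delimiter ("?", "!", ".", ...) that still fits.
const chunks = chunkElements([paragraph], 40);
console.log(chunks.every((c) => c.content.length <= 40)); // true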
src/lib/server/websearch/markdown/utils/nlp.ts ADDED
@@ -0,0 +1,11 @@
+/** Remove excess whitespace and newlines */
+export const sanitizeString = (str: string) =>
+	str
+		.split("\n")
+		.map((s) => s.trim())
+		.filter(Boolean)
+		.join("\n")
+		.replaceAll(/ +/g, " ");
+
+/** Collapses a string into a single line */
+export const collapseString = (str: string) => sanitizeString(str.replaceAll(/\n/g, " "));
src/lib/server/websearch/markdown/utils/stringify.ts ADDED
@@ -0,0 +1,75 @@
+import type { SerializedHTMLElement } from "../../scrape/types";
+import { MarkdownElementType, type MarkdownElement } from "../types";
+
+// --- Markdown Elements ---
+
+/** Converts markdown element to a string with formatting */
+export function stringifyMarkdownElement(elem: MarkdownElement): string {
+	const content = elem.content.trim();
+	if (elem.type === MarkdownElementType.Header) return `${"#".repeat(elem.level)} ${content}\n\n`;
+	if (elem.type === MarkdownElementType.BlockQuote) {
+		return `${"> ".repeat(elem.depth)}${content}\n\n`;
+	}
+	if (elem.type === MarkdownElementType.CodeBlock) return `\`\`\`\n${content}\n\`\`\`\n\n`;
+
+	if (elem.type === MarkdownElementType.UnorderedListItem) return `- ${content}\n`;
+	if (elem.type === MarkdownElementType.OrderedListItem) {
+		const siblings = elem.parent?.children ?? [elem];
+		const currentIndex = siblings.indexOf(elem);
+		const lastAdjacentIndex = siblings
+			.slice(currentIndex + 1)
+			.findLastIndex((child) => child.type === MarkdownElementType.OrderedListItem);
+		const order = currentIndex - lastAdjacentIndex + 1;
+		return `${order}. ${content}\n`;
+	}
+
+	return `${content}\n\n`;
+}
+
+// ----- HTML Elements -----
+
+/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
+export function stringifyHTMLElements(elems: (SerializedHTMLElement | string)[]): string {
+	return elems.map(stringifyHTMLElement).join("").trim();
+}
+
+/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
+export function stringifyHTMLElement(elem: SerializedHTMLElement | string): string {
+	if (typeof elem === "string") return elem;
+	if (elem.tagName === "br") return "\n";
+
+	const content = elem.content.map(stringifyHTMLElement).join("");
+	if (content.length === 0) return content;
+
+	if (elem.tagName === "strong" || elem.tagName === "b") return `**${content}**`;
+	if (elem.tagName === "em" || elem.tagName === "i") return `*${content}*`;
+	if (elem.tagName === "s" || elem.tagName === "strike") return `~~${content}~~`;
+
+	if (elem.tagName === "code" || elem.tagName === "var" || elem.tagName === "tt") {
+		return `\`${content}\``;
+	}
+
+	if (elem.tagName === "sup") return `<sup>${content}</sup>`;
+	if (elem.tagName === "sub") return `<sub>${content}</sub>`;
+
+	if (elem.tagName === "a" && content.trim().length > 0) {
+		const href = elem.attributes.href;
+		if (!href) return elem.content.map(stringifyHTMLElement).join("");
+		return `[${elem.content.map(stringifyHTMLElement).join("")}](${href})`;
+	}
+
+	return elem.content.map(stringifyHTMLElement).join("");
+}
+
+/** Grabs all text content directly, ignoring HTML tags */
+export function stringifyHTMLElementsUnformatted(
+	elems: (SerializedHTMLElement | string)[]
+): string {
+	return elems.map(stringifyHTMLElementUnformatted).join("");
+}
+
+/** Grabs all text content directly, ignoring HTML tags */
+function stringifyHTMLElementUnformatted(elem: SerializedHTMLElement | string): string {
+	if (typeof elem === "string") return elem;
+	return elem.content.map(stringifyHTMLElementUnformatted).join("");
+}
src/lib/server/websearch/parseWeb.ts DELETED
@@ -1,41 +0,0 @@
-import { JSDOM, VirtualConsole } from "jsdom";
-
-export async function parseWeb(url: string) {
-	const abortController = new AbortController();
-	setTimeout(() => abortController.abort(), 10000);
-	const r = await fetch(url, { signal: abortController.signal, credentials: "omit" }).catch();
-
-	if (r.headers.get("content-type")?.includes("text/html")) {
-		const virtualConsole = new VirtualConsole();
-		virtualConsole.on("error", () => {
-			// No-op to skip console errors.
-		});
-
-		// put the html string into a DOM
-		const dom = new JSDOM((await r.text()) ?? "", {
-			virtualConsole,
-		});
-
-		const { document } = dom.window;
-		const paragraphs = document.querySelectorAll("p, table, pre, ul, ol");
-
-		if (!paragraphs.length) {
-			throw new Error(`webpage doesn't have any parseable element`);
-		}
-		const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
-
-		// combine text contents from paragraphs and then remove newlines and multiple spaces
-		const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
-
-		return text;
-	} else if (
-		r.headers.get("content-type")?.includes("text/plain") ||
-		r.headers.get("content-type")?.includes("text/markdown")
-	) {
-		const text = await r.text();
-		// JSON.stringify is needed to turn string concatenation into a single string (ex: "Hello, " + "world!" -> "Hello, world!")
-		return JSON.stringify(text);
-	} else {
-		throw new Error("Unsupported content type");
-	}
-}
src/lib/server/websearch/runWebSearch.ts CHANGED
@@ -1,179 +1,103 @@
-import { searchWeb } from "$lib/server/websearch/searchWeb";
-import { generateQuery } from "$lib/server/websearch/generateQuery";
-import { parseWeb } from "$lib/server/websearch/parseWeb";
-import { chunk } from "$lib/utils/chunk";
-import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
-import { getWebSearchProvider } from "./searchWeb";
 import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
-import { env } from "$env/dynamic/private";
 
 import type { Conversation } from "$lib/types/Conversation";
 import type { MessageUpdate } from "$lib/types/MessageUpdate";
 import type { Message } from "$lib/types/Message";
-import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
+import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
 import type { Assistant } from "$lib/types/Assistant";
 
-import { z } from "zod";
-import JSON5 from "json5";
-import { isURLLocal } from "../isURLLocal";
+import { search } from "./search/search";
+import { scrape } from "./scrape/scrape";
+import { findContextSources } from "./embed/embed";
+import { removeParents } from "./markdown/tree";
 
-const MAX_N_PAGES_SCRAPE = 10 as const;
-const MAX_N_PAGES_EMBED = 5 as const;
+const MAX_N_PAGES_TO_SCRAPE = 8 as const;
+const MAX_N_PAGES_TO_EMBED = 5 as const;
 
-const listSchema = z.array(z.string()).default([]);
-
-const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST));
-const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST));
+export type AppendUpdate = (message: string, args?: string[], type?: "error" | "update") => void;
+const makeAppendUpdate =
+	(updatePad: (upd: MessageUpdate) => void): AppendUpdate =>
+	(message, args, type) =>
+		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
 
 export async function runWebSearch(
 	conv: Conversation,
 	messages: Message[],
 	updatePad: (upd: MessageUpdate) => void,
 	ragSettings?: Assistant["rag"]
-) {
+): Promise<WebSearch> {
 	const prompt = messages[messages.length - 1].content;
-	const webSearch: WebSearch = {
-		prompt,
-		searchQuery: "",
-		results: [],
-		contextSources: [],
-		createdAt: new Date(),
-		updatedAt: new Date(),
-	};
-
-	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
-		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
-	}
+	const createdAt = new Date();
+	const updatedAt = new Date();
+	const appendUpdate = makeAppendUpdate(updatePad);
 
 	try {
-		// if the assistant specified direct links, skip the websearch
-		if (ragSettings && ragSettings?.allowedLinks.length > 0) {
-			appendUpdate("Using links specified in Assistant");
-
-			let linksToUse = [...ragSettings.allowedLinks];
-
-			if (env.ENABLE_LOCAL_FETCH !== "true") {
-				const localLinks = await Promise.all(
-					linksToUse.map(async (link) => {
-						try {
-							const url = new URL(link);
-							return await isURLLocal(url);
-						} catch (e) {
-							return true;
-						}
-					})
-				);
-
-				linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
-			}
-
-			webSearch.results = linksToUse.map((link) => {
-				return { link, hostname: new URL(link).hostname, title: "", text: "" };
-			});
-		} else {
-			webSearch.searchQuery = await generateQuery(messages);
-			const searchProvider = getWebSearchProvider();
-			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
-
-			let filters = "";
-			if (ragSettings && ragSettings?.allowedDomains.length > 0) {
-				appendUpdate("Filtering on specified domains");
-				filters += ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR ");
-			}
-
-			// handle the global lists
-			filters +=
-				allowList.map((item) => "site:" + item).join(" OR ") +
-				" " +
-				blockList.map((item) => "-site:" + item).join(" ");
-
-			webSearch.searchQuery = filters + " " + webSearch.searchQuery;
-
-			const results = await searchWeb(webSearch.searchQuery);
-			webSearch.results =
-				(results.organic_results &&
-					results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
-						try {
-							const { title, link, text } = el;
-							const { hostname } = new URL(link);
-							return { title, link, hostname, text };
-						} catch (e) {
-							// Ignore Errors
-							return null;
-						}
-					})) ??
-				[];
-		}
-
-		webSearch.results = webSearch.results.filter((value) => value !== null);
-		webSearch.results = webSearch.results
-			.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
-			.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
-
-		// fetch the model
 		const embeddingModel =
 			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
-
 		if (!embeddingModel) {
-			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
+			throw Error(`Embedding model ${conv.embeddingModel} not available anymore`);
 		}
 
-		let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
-		if (webSearch.results.length > 0) {
-			appendUpdate("Browsing results");
-			const promises = webSearch.results.map(async (result) => {
-				const { link } = result;
-				let text = result.text ?? "";
-				if (!text) {
-					try {
-						text = await parseWeb(link);
-						appendUpdate("Browsing webpage", [link]);
-					} catch (e) {
-						appendUpdate("Failed to parse webpage", [(e as Error).message, link], "error");
-						// ignore errors
-					}
-				}
-				const MAX_N_CHUNKS = 100;
-				const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
-				return texts.map((t) => ({ source: result, text: t }));
-			});
-			const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
-			paragraphChunks = nestedParagraphChunks.flat();
-			if (!paragraphChunks.length) {
-				throw new Error("No text found on the first 5 results");
-			}
-		} else {
-			throw new Error("No results found for this search query");
+		// Search the web
+		const { searchQuery, pages } = await search(messages, ragSettings, appendUpdate);
+		if (pages.length === 0) throw Error("No results found for this search query");
+
+		// Scrape pages
+		appendUpdate("Browsing search results");
+
+		const scrapedPages = await Promise.all(
+			pages
+				.slice(0, MAX_N_PAGES_TO_SCRAPE)
+				.map(scrape(appendUpdate, embeddingModel.chunkCharLength))
+		).then((allScrapedPages) =>
+			allScrapedPages
+				.filter((p): p is WebSearchScrapedSource => Boolean(p))
+				.filter((p) => p.page.markdownTree.children.length > 0)
+				.slice(0, MAX_N_PAGES_TO_EMBED)
+		);
+
+		if (!scrapedPages.length) {
+			throw Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
 		}
 
+		// Chunk the text of each of the elements and find the most similar chunks to the prompt
 		appendUpdate("Extracting relevant information");
-		const topKClosestParagraphs = 8;
-		const texts = paragraphChunks.map(({ text }) => text);
-		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
-			topK: topKClosestParagraphs,
-		});
-
-		for (const idx of indices) {
-			const { source } = paragraphChunks[idx];
-			const contextWithId = { idx, text: texts[idx] };
-			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
-			if (usedSource) {
-				usedSource.context.push(contextWithId);
-			} else {
-				webSearch.contextSources.push({ ...source, context: [contextWithId] });
-			}
-		}
+		const contextSources = await findContextSources(scrapedPages, prompt, embeddingModel).then(
+			(ctxSources) =>
+				ctxSources.map((source) => ({
+					...source,
+					page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
+				}))
+		);
 		updatePad({
 			type: "webSearch",
 			messageType: "sources",
 			message: "sources",
-			sources: webSearch.contextSources,
+			sources: contextSources,
 		});
+
+		return {
+			prompt,
+			searchQuery,
+			results: scrapedPages.map(({ page, ...source }) => ({
+				...source,
+				page: { ...page, markdownTree: removeParents(page.markdownTree) },
+			})),
+			contextSources,
+			createdAt,
+			updatedAt,
+		};
 	} catch (searchError) {
-		if (searchError instanceof Error) {
-			appendUpdate("An error occurred", [JSON.stringify(searchError.message)], "error");
-		}
+		const message = searchError instanceof Error ? searchError.message : String(searchError);
+		console.error(message);
+		appendUpdate("An error occurred", [JSON.stringify(message)], "error");
+		return {
+			prompt,
+			searchQuery: "",
+			results: [],
+			contextSources: [],
+			createdAt,
+			updatedAt,
+		};
 	}
-
-	return webSearch;
 }
src/lib/server/websearch/scrape/parser.ts ADDED
@@ -0,0 +1,552 @@
+import type { SerializedHTMLElement } from "./types";
+
+interface DBSCANOptions<T> {
+	dataset: T[];
+	epsilon?: number;
+	epsilonCompare?: (distance: number, epsilon: number) => boolean;
+	minimumPoints?: number;
+	distanceFunction: (a: T, b: T) => number;
+}
+
+export function spatialParser() {
+	/**
+	 * Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
+	 */
+	const DBSCAN = <T>({
+		dataset,
+		epsilon = 1,
+		epsilonCompare = (dist, e) => dist < e,
+		minimumPoints = 2,
+		distanceFunction,
+	}: DBSCANOptions<T>) => {
+		const visitedIndices: Record<number, boolean> = {};
+		const isVisited = (i: number) => visitedIndices[i];
+		const markVisited = (i: number) => {
+			visitedIndices[i] = true;
+		};
+
+		const clusteredIndices: Record<number, boolean> = {};
+		const isClustered = (i: number) => clusteredIndices[i];
+		const markClustered = (i: number) => {
+			clusteredIndices[i] = true;
+		};
+
+		const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
+			for (let i = 0; i < sourceArray.length; i += 1) {
+				const item = sourceArray[i];
+				if (targetArray.indexOf(item) < 0) {
+					targetArray.push(item);
+				}
+			}
+		};
+
+		const findNeighbors = (index: number) => {
+			const neighbors = [];
+			for (let other = 0; other < dataset.length; other += 1) {
+				const distance = distanceFunction(dataset[index], dataset[other]);
+				if (epsilonCompare(distance, epsilon)) {
+					neighbors.push(other);
+				}
+			}
+			return neighbors;
+		};
+
+		const noise: number[] = [];
+		const addNoise = (i: number) => noise.push(i);
+
+		const clusters: number[][] = [];
+		const createCluster = () => clusters.push([]) - 1;
+		const addIndexToCluster = (c: number, i: number) => {
+			clusters[c].push(i);
+			markClustered(i);
+		};
+
+		const expandCluster = (c: number, neighbors: number[]) => {
+			for (let i = 0; i < neighbors.length; i += 1) {
+				const neighborIndex = neighbors[i];
+				if (!isVisited(neighborIndex)) {
+					markVisited(neighborIndex);
+
+					const secondaryNeighbors = findNeighbors(neighborIndex);
+					if (secondaryNeighbors.length >= minimumPoints) {
+						uniqueMerge(neighbors, secondaryNeighbors);
+					}
+				}
+
+				if (!isClustered(neighborIndex)) {
+					addIndexToCluster(c, neighborIndex);
+				}
+			}
+		};
+
+		dataset.forEach((_, index) => {
+			if (!isVisited(index)) {
+				markVisited(index);
+
+				const neighbors = findNeighbors(index);
+				if (neighbors.length < minimumPoints) {
+					addNoise(index);
+				} else {
+					const clusterIndex = createCluster();
+					addIndexToCluster(clusterIndex, index);
+					expandCluster(clusterIndex, neighbors);
+				}
+			}
+		});
+
+		return { clusters, noise };
+	};
+
+	// -----------
+	// Scraping implementation
+
+	const IgnoredTagsList = [
+		"footer",
+		"nav",
+		"aside",
+		"script",
+		"style",
+		"noscript",
+		"form",
+		"button",
+	];
+	const InlineTags = [
+		"a",
+		"abbrv",
+		"span",
+		"address",
+		"time",
+		"acronym",
+		"strong",
+		"b",
+		"br",
+		"sub",
+		"sup",
+		"tt",
+		"var",
+		"em",
+		"i",
+	];
+
+	type ReadableNode = HTMLElement;
+	type NodeWithRect = {
+		node: ReadableNode;
+		rect: DOMRect;
+	};
+
+	const isOnlyChild = (node: Node) => {
+		if (!node.parentElement) return true;
+		if (node.parentElement.nodeName === "body") return false;
+		if (node.parentElement.childNodes.length === 1) return true;
+		return false;
+	};
+
+	const hasValidInlineParent = (node: Node) => {
+		return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
+	};
+
+	const hasValidParent = (node: Node) => {
+		return node.parentElement && !node.parentElement.isSameNode(document.body);
+	};
+
+	const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
+	const possibleTableParents = Array.from(document.querySelectorAll("table"));
+	const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
+	/**
+	 * We want to find the highest parent of text node in the cluster.
+	 * For example in this case: <p><span>Text here</span></p>
+	 * the P tag is highest parent.
+	 */
+	const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
+		// go up the tree until the parent is no longer an only child
+		let parent = node.parentElement;
+		// if the parent is an inline tag, then go up one more level
+		while (
+			parent &&
+			hasValidInlineParent(parent) &&
+			InlineTags.includes(parent?.tagName.toLowerCase())
+		) {
+			parent = parent.parentElement;
+		}
+
+		while (parent && isOnlyChild(parent)) {
+			if (!hasValidParent(parent)) break;
+			parent = parent.parentElement;
+		}
+
+		if (!parent) {
+			throw new Error(
+				"disconnected node found, this should not really be possible when traversing through the dom"
+			);
+		}
+
+		// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
+		if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
+			const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
+			if (hasParent) {
+				parent = hasParent;
+			}
+		}
+
+		// if the parent is a li tag check if there is a ul or ol tag above it
+		if (parent.nodeName.toLowerCase() === "li") {
+			const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
+			if (hasParent) {
+				parent = hasParent;
+			}
+		}
+
+		// if the parent is a td, th, tr tag check if there is a table tag above it
+		if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
+			const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
+			if (hasParent) {
+				parent = hasParent;
+			}
+		}
+
+		return parent;
+	};
+	const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));
+
+	const doesNodePassHeuristics = (node: Node) => {
+		if ((node.textContent ?? "").trim().length < 10) {
+			return false;
+		}
+
+		const parentNode = findHighestDirectParentOfReadableNode(node);
+
+		if (parentNode && parentNode instanceof Element) {
+			if (
+				!parentNode.checkVisibility({
+					checkOpacity: true,
+					checkVisibilityCSS: true,
+				})
+			)
+				return false;
+
+			const rect = parentNode.getBoundingClientRect();
+			// elements that are readable usually don't have really small height or width
+			if (rect.width < 4 || rect.height < 4) {
+				return false;
+			}
+		}
+
+		if (parentNode && parentNode instanceof Element) {
+			if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
+				return false;
+			}
+		}
+
+		return true;
+	};
+
+	const getAllReadableNodes = (): NodeWithRect[] => {
+		if (!document.body) throw new Error("Page failed to load");
+		const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
+			acceptNode(node) {
+				if (doesNodePassHeuristics(node)) {
+					return NodeFilter.FILTER_ACCEPT;
+				} else {
+					return NodeFilter.FILTER_SKIP;
+				}
+			},
+		});
+
+		const readableNodes = [];
+
+		while (treeWalker.nextNode()) {
+			readableNodes.push(treeWalker.currentNode as ReadableNode);
+		}
+
+		/*
+		 * <table><p>hello</p><p>world</p></table>
+		 * table is already included in the parent of the first p tag
+		 */
+
+		const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
+		const listWithOnlyParents: HTMLElement[] = [];
+		// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
+		for (let i = 0; i < parentsForReadableNodes.length; i++) {
+			const node = parentsForReadableNodes[i];
+			const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
+				if (i === idx) return false;
+				return otherNode.contains(node);
+			});
+			listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
+		}
+
+		const uniqueParents = Array.from(new Set(listWithOnlyParents));
+
+		return uniqueParents.map((node) => {
+			return {
+				node,
+				rect: node.getBoundingClientRect(),
+			};
+		});
+	};
+
+	const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
+		// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
+		// 1. rects are upright and not rotated
+		// 2. If two rects intersect, we assume distance to be 0
+		let dx = 0;
+		let dy = 0;
+		const rect1 = a.rect;
+		const rect2 = b.rect;
+		// Calculate the horizontal distance
+		if (rect1.x + rect1.width < rect2.x) {
+			dx = rect2.x - (rect1.x + rect1.width);
+		} else if (rect2.x + rect2.width < rect1.x) {
+			dx = rect1.x - (rect2.x + rect2.width);
+		}
+
+		// Calculate the vertical distance
+		if (rect1.y + rect1.height < rect2.y) {
+			dy = rect2.y - (rect1.y + rect1.height);
+		} else if (rect2.y + rect2.height < rect1.y) {
+			dy = rect1.y - (rect2.y + rect2.height);
+		}
+
+		const distance = Math.sqrt(dx * dx + dy * dy);
+		// Return the Euclidean distance
+		return distance;
+	};
+	/**
+	 * Clusters nodes using dbscan
+	 */
+	const clusterReadableNodes = (nodes: NodeWithRect[]) => {
+		const { clusters } = DBSCAN({
+			dataset: nodes,
+			epsilon: 28,
+			minimumPoints: 1,
+			distanceFunction,
+		});
+
+		return clusters;
+	};
+
+	const totalTextLength = (cluster: number[]) => {
+		return cluster
+			.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}|\r\n|\n|\r/gm, ""))
+			.join("").length;
+	};
+
+	const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
+		return Math.abs(a - b) < epsilon;
+	};
+
+	const getClusterBounds = (cluster: number[]) => {
+		const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
+		const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
+		const rightMostPoint = Math.max(
+			...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
+		);
+		const bottomMostPoint = Math.max(
+			...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
+		);
+		return {
+			// left most element
+			x: leftMostPoint,
+			y: topMostPoint,
+			width: rightMostPoint - leftMostPoint,
+			height: bottomMostPoint - topMostPoint,
+		};
+	};
+
+	const round = (num: number, decimalPlaces = 2) => {
+		const factor = Math.pow(10, decimalPlaces);
+		return Math.round(num * factor) / factor;
+	};
+
+	/** minimum distance to center of the screen */
+	const clusterCentrality = (cluster: number[]) => {
+		const bounds = getClusterBounds(cluster);
+		const centerOfScreen = window.innerWidth / 2;
+		// the cluster contains the center of the screen
+		if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
+			return 0;
+		}
+
+		// the cluster is to the left of the screen
+		if (bounds.x + bounds.width < centerOfScreen) {
+			return centerOfScreen - (bounds.x + bounds.width);
+		}
+
+		// the cluster is to the right of the screen
+		return bounds.x - centerOfScreen;
+	};
+	/** measure of text share that belong to the cluster */
+	const percentageTextShare = (cluster: number[], totalLength: number) => {
+		// apply an exponentially increasing penalty for centrality per 100 pixels distance from center
+
+		return round((totalTextLength(cluster) / totalLength) * 100);
+	};
+
+	const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
+		const clusterABounds = getClusterBounds(clusterA);
+		const clusterBBounds = getClusterBounds(clusterB);
+
+		// A cluster is horizontally aligned if the x and width are roughly equal
+		const isHorizontallyAligned =
+			approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
+			approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);
+
+		if (!isHorizontallyAligned) return false;
+
+		// check the y gap between the clusters
+		const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
+		const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
+		const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);
+
+		if (approximatelyEqual(yGap, 0, 100)) return true;
+	};
+
+	const findCriticalClusters = (clusters: number[][]) => {
+		// merge the clusters that have similar widths and x position
+
+		let i = 0;
+		while (i < clusters.length) {
+			const cluster = clusters[i];
+			for (let j = i + 1; j < clusters.length; j++) {
+				const otherCluster = clusters[j];
+				if (shouldMergeClusters(cluster, otherCluster)) {
+					cluster.push(...otherCluster);
+					clusters.splice(j, 1);
+					j -= 1;
+				}
+			}
+
+			i++;
+		}
+
+		const totalText = totalTextLength(clusters.flat());
+
+		// sort in descending order of text share
+		const clusterWithMetrics = clusters.map((cluster) => {
+			const centrality = clusterCentrality(cluster);
+			return {
+				cluster,
+				centrality,
+				percentageTextShare: percentageTextShare(cluster, totalText),
+			};
+		});
+
+		// if there is a dominant cluster with more than 60% text share, return that
+		const dominantCluster = clusterWithMetrics[0].percentageTextShare > 60;
+		if (dominantCluster) return [clusterWithMetrics[0].cluster];
+
+		// clusters are sorted by text share after applying a penalty for centrality
+		const sortedClusters = clusterWithMetrics.sort((a, b) => {
+			const penaltyForA = Math.pow(0.9, a.centrality / 100);
+			const penaltyForB = Math.pow(0.9, b.centrality / 100);
+			const adjustedTextShareA = a.percentageTextShare * penaltyForA;
+			const adjustedTextShareB = b.percentageTextShare * penaltyForB;
+
+			return adjustedTextShareB - adjustedTextShareA;
+		});
+
+		// find all clusters that are similar to the largest cluster in terms of text share
+		// and see if they are enough to cover at least 60% of the text share
+		const largeTextShareClusters = sortedClusters.filter((c) =>
+			approximatelyEqual(c.percentageTextShare, sortedClusters[0].percentageTextShare, 10)
+		);
+
+		const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
+			(acc, cluster) => acc + cluster.percentageTextShare,
+			0
+		);
+
+		if (totalTextShareOfLargeClusters > 60) {
+			return largeTextShareClusters.map((c) => c.cluster);
+		}
+
+		// choose clusters till the text share is greater than 60%
+		let totalTextShare = 0;
+		const criticalClusters = [];
+		for (const cluster of sortedClusters) {
+			/** Ignore clusters with less than 2% */
+			if (cluster.percentageTextShare < 2) continue;
+			if (totalTextShare > 60) break;
+			criticalClusters.push(cluster.cluster);
+			totalTextShare += cluster.percentageTextShare;
+		}
+
+		// if the total text share is less than 60% then return an empty array
+		// as this website should not be particularly useful for the web search anyways
+		// this should almost never happen on structured website with a lot of text
+		if (totalTextShare < 60) {
+			return [];
+		}
+
+		return criticalClusters;
+	};
+
+	const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
+	function serializeHTMLElement(node: Element): SerializedHTMLElement {
+		return {
+			tagName: node.tagName.toLowerCase(),
+			attributes: allowListedAttributes.reduce((acc, attr) => {
+				const value = node.getAttribute(attr);
+				if (value) {
+					acc[attr] = value;
+				}
+				return acc;
+			}, {} as Record<string, string>),
+			content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
+		};
+	}
+
+	function serializeNode(node: Node): SerializedHTMLElement | string {
+		if (node.nodeType === 1) return serializeHTMLElement(node as Element);
+		else if (node.nodeType === 3) return node.textContent ?? "";
+		else return "";
+	}
+
+	function getPageMetadata(): {
+		title: string;
+		siteName?: string;
+		author?: string;
+		description?: string;
+		createdAt?: string;
+		updatedAt?: string;
+	} {
+		const title = document.title ?? "";
+		const siteName =
+			document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
+		const author =
+			document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
+		const description =
+			document.querySelector("meta[name='description']")?.getAttribute("content") ??
+			document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
+			undefined;
+		const createdAt =
+			document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
+			document.querySelector("meta[name='date']")?.getAttribute("content") ??
+			undefined;
+		const updatedAt =
+			document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
+			undefined;
+
+		return { title, siteName, author, description, createdAt, updatedAt };
+	}
+
+	const readableNodes = getAllReadableNodes();
+	const clusters = clusterReadableNodes(readableNodes);
+
+	const criticalClusters = findCriticalClusters(clusters);
+
+	// filter readable nodes using the above information as well as heuristics
+	const filteredNodes = readableNodes.filter((_, idx) => {
+		return criticalClusters.some((cluster) => {
+			return cluster.includes(idx);
+		});
+	});
+
+	const elements = filteredNodes
+		.filter(
+			(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
+		)
+		.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
+	const metadata = getPageMetadata();
+	return { ...metadata, elements };
+}
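The distanceFunction above is the standard axis-aligned gap between two rectangles; here is a self-contained sketch of the same computation, useful for eyeballing the 28px DBSCAN epsilon (the rect values are made up):

type Rect = { x: number; y: number; width: number; height: number };

// 0 when the rects overlap on both axes, otherwise the Euclidean length of the gap.
function rectDistance(a: Rect, b: Rect): number {
	const dx = Math.max(0, b.x - (a.x + a.width), a.x - (b.x + b.width));
	const dy = Math.max(0, b.y - (a.y + a.height), a.y - (b.y + b.height));
	return Math.sqrt(dx * dx + dy * dy);
}

// Two paragraphs 20px apart vertically cluster together (20 < 28);
// a sidebar 300px to the right does not.
rectDistance({ x: 0, y: 0, width: 600, height: 100 }, { x: 0, y: 120, width: 600, height: 50 }); // 20
rectDistance({ x: 0, y: 0, width: 600, height: 100 }, { x: 900, y: 0, width: 200, height: 400 }); // 300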
src/lib/server/websearch/scrape/playwright.ts ADDED
@@ -0,0 +1,59 @@
+import {
+	type BrowserContext,
+	chromium,
+	devices,
+	type Page,
+	type BrowserContextOptions,
+} from "playwright";
+import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
+import { env } from "$env/dynamic/private";
+
+// Singleton initialized by initPlaywrightService
+let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }>;
+
+async function initPlaywrightService() {
+	if (playwrightService) return playwrightService;
+
+	const browser = await chromium.launch({ headless: true });
+
+	process.on("SIGINT", () => browser.close());
+
+	const device = devices["Desktop Chrome"];
+	const options: BrowserContextOptions = {
+		...device,
+		// Increasing width improves spatial clustering accuracy
+		screen: {
+			width: 3840,
+			height: 1080,
+		},
+		viewport: {
+			width: 3840,
+			height: 1080,
+		},
+		reducedMotion: "reduce",
+		acceptDownloads: false,
+		timezoneId: "America/New_York",
+		locale: "en-US",
+	};
+	const ctx = await browser.newContext(options);
+	const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
+		const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
+		if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
+		return mostBlocked;
+	});
+	return Object.freeze({ ctx, blocker });
+}
+
+export async function loadPage(url: string): Promise<Page> {
+	if (!playwrightService) playwrightService = initPlaywrightService();
+	const { ctx, blocker } = await playwrightService;
+
+	const page = await ctx.newPage();
+	await blocker.enableBlockingInPage(page);
+
+	await page.goto(url, { waitUntil: "load", timeout: 2000 }).catch(() => {
+		console.warn(`Failed to load page within 2s: ${url}`);
+	});
+
+	return page;
+}
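A minimal usage sketch (inside an async context): note that loadPage resolves even when navigation times out, so the caller owns closing the page, as scrape.ts below does via .finally():

import { loadPage } from "./playwright";

const page = await loadPage("https://example.com");
try {
	console.log(await page.title());
} finally {
	await page.close();
}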
src/lib/server/websearch/scrape/scrape.ts ADDED
@@ -0,0 +1,34 @@
+import type { AppendUpdate } from "../runWebSearch";
+import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
+import { loadPage } from "./playwright";
+
+import { spatialParser } from "./parser";
+import { htmlToMarkdownTree } from "../markdown/tree";
+import { timeout } from "$lib/utils/timeout";
+
+export const scrape =
+	(appendUpdate: AppendUpdate, maxCharsPerElem: number) =>
+	async (source: WebSearchSource): Promise<WebSearchScrapedSource | undefined> => {
+		try {
+			const page = await scrapeUrl(source.link, maxCharsPerElem);
+			appendUpdate("Browsing webpage", [source.link]);
+			return { ...source, page };
+		} catch (e) {
+			const message = e instanceof Error ? e.message : String(e);
+			appendUpdate("Failed to parse webpage", [message, source.link], "error");
+		}
+	};
+
+export async function scrapeUrl(url: string, maxCharsPerElem: number) {
+	const page = await loadPage(url);
+
+	return timeout(page.evaluate(spatialParser), 2000)
+		.then(({ elements, ...parsed }) => ({
+			...parsed,
+			markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
+		}))
+		.catch((cause) => {
+			throw Error("Parsing failed", { cause });
+		})
+		.finally(() => page.close());
+}
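Because scrape is curried, a per-request scraper can be built once and mapped over sources; roughly how runWebSearch uses it (appendUpdate and sources are assumed in scope, and the 512 chunk length is illustrative):

const scrapeSource = scrape(appendUpdate, 512);
const maybeScraped = await Promise.all(sources.map(scrapeSource));
// Failed pages resolve to undefined rather than rejecting the whole batch.
const scraped = maybeScraped.filter((s): s is WebSearchScrapedSource => Boolean(s));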
src/lib/server/websearch/scrape/types.ts ADDED
@@ -0,0 +1,5 @@
+export interface SerializedHTMLElement {
+	tagName: string;
+	attributes: Record<string, string>;
+	content: (SerializedHTMLElement | string)[];
+}
src/lib/server/websearch/search/endpoints.ts ADDED
@@ -0,0 +1,27 @@
+import { WebSearchProvider, type WebSearchSource } from "$lib/types/WebSearch";
+import { env } from "$env/dynamic/private";
+import searchSerper from "./endpoints/serper";
+import searchSerpApi from "./endpoints/serpApi";
+import searchSerpStack from "./endpoints/serpStack";
+import searchYouApi from "./endpoints/youApi";
+import searchWebLocal from "./endpoints/webLocal";
+import searchSearxng from "./endpoints/searxng";
+
+export function getWebSearchProvider() {
+	if (env.YDC_API_KEY) return WebSearchProvider.YOU;
+	if (env.SEARXNG_QUERY_URL) return WebSearchProvider.SEARXNG;
+	return WebSearchProvider.GOOGLE;
+}
+
+/** Searches the web using the first available provider, based on the env */
+export async function searchWeb(query: string): Promise<WebSearchSource[]> {
+	if (env.USE_LOCAL_WEBSEARCH) return searchWebLocal(query);
+	if (env.SEARXNG_QUERY_URL) return searchSearxng(query);
+	if (env.SERPER_API_KEY) return searchSerper(query);
+	if (env.YDC_API_KEY) return searchYouApi(query);
+	if (env.SERPAPI_KEY) return searchSerpApi(query);
+	if (env.SERPSTACK_API_KEY) return searchSerpStack(query);
+	throw new Error(
+		"No configuration found for web search. Please set USE_LOCAL_WEBSEARCH, SEARXNG_QUERY_URL, SERPER_API_KEY, YDC_API_KEY, SERPAPI_KEY, or SERPSTACK_API_KEY in your environment variables."
+	);
+}
src/lib/server/websearch/{searchSearxng.ts → search/endpoints/searxng.ts} RENAMED
@@ -1,7 +1,9 @@
 import { env } from "$env/dynamic/private";
 import { logger } from "$lib/server/logger";
+import type { WebSearchSource } from "$lib/types/WebSearch";
+import { isURL } from "$lib/utils/isUrl";
 
-export async function searchSearxng(query: string) {
+export default async function searchSearxng(query: string): Promise<WebSearchSource[]> {
 	const abortController = new AbortController();
 	setTimeout(() => abortController.abort(), 10000);
 
@@ -20,7 +22,7 @@ export async function searchSearxng(query: string) {
 		.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
 		.catch((error) => {
 			logger.error("Failed to fetch or parse JSON", error);
-			throw new Error("Failed to fetch or parse JSON");
+			throw new Error("Failed to fetch or parse JSON", { cause: error });
 		});
 
 	// Extract 'url' elements from the JSON response and trim to the top 5 URLs
@@ -31,5 +33,5 @@
 	}
 
 	// Map URLs to the correct object shape
-	return { organic_results: urls.map((link) => ({ link })) };
+	return urls.filter(isURL).map((link) => ({ link }));
 }
src/lib/server/websearch/search/endpoints/serpApi.ts ADDED
@@ -0,0 +1,25 @@
+import { env } from "$env/dynamic/private";
+import { getJson, type GoogleParameters } from "serpapi";
+import type { WebSearchSource } from "$lib/types/WebSearch";
+import { isURL } from "$lib/utils/isUrl";
+
+type SerpApiResponse = {
+	organic_results: {
+		link: string;
+	}[];
+};
+
+export default async function searchWebSerpApi(query: string): Promise<WebSearchSource[]> {
+	const params = {
+		q: query,
+		hl: "en",
+		gl: "us",
+		google_domain: "google.com",
+		api_key: env.SERPAPI_KEY,
+	} satisfies GoogleParameters;
+
+	// Show result as JSON
+	const response = (await getJson("google", params)) as unknown as SerpApiResponse;
+
+	return response.organic_results.filter(({ link }) => isURL(link));
+}
src/lib/server/websearch/search/endpoints/serpStack.ts ADDED
@@ -0,0 +1,35 @@
+import { env } from "$env/dynamic/private";
+import { isURL } from "$lib/utils/isUrl";
+import type { WebSearchSource } from "$lib/types/WebSearch";
+
+type SerpStackResponse = {
+	organic_results: {
+		title: string;
+		url: string;
+		snippet?: string;
+	}[];
+	error?: string;
+};
+
+export default async function searchSerpStack(query: string): Promise<WebSearchSource[]> {
+	const response = await fetch(
+		`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
+		{ headers: { "Content-type": "application/json; charset=UTF-8" } }
+	);
+
+	const data = (await response.json()) as SerpStackResponse;
+
+	if (!response.ok) {
+		throw new Error(
+			data.error ?? `SerpStack API returned error code ${response.status} - ${response.statusText}`
+		);
+	}
+
+	return data.organic_results
+		.filter(({ url }) => isURL(url))
+		.map(({ title, url, snippet }) => ({
+			title,
+			link: url,
+			text: snippet ?? "",
+		}));
+}
src/lib/server/websearch/search/endpoints/serper.ts ADDED
@@ -0,0 +1,31 @@
+import { env } from "$env/dynamic/private";
+import type { WebSearchSource } from "$lib/types/WebSearch";
+
+export default async function search(query: string): Promise<WebSearchSource[]> {
+	const params = {
+		q: query,
+		hl: "en",
+		gl: "us",
+	};
+
+	const response = await fetch("https://google.serper.dev/search", {
+		method: "POST",
+		body: JSON.stringify(params),
+		headers: {
+			"x-api-key": env.SERPER_API_KEY,
+			"Content-type": "application/json",
+		},
+	});
+
+	/* eslint-disable @typescript-eslint/no-explicit-any */
+	const data = (await response.json()) as Record<string, any>;
+
+	if (!response.ok) {
+		throw new Error(
+			data["message"] ??
+				`Serper API returned error code ${response.status} - ${response.statusText}`
+		);
+	}
+
+	return data["organic"] ?? [];
+}
src/lib/server/websearch/{searchWebLocal.ts → search/endpoints/webLocal.ts} RENAMED
@@ -1,45 +1,35 @@
 import { JSDOM, VirtualConsole } from "jsdom";
+import { isURL } from "$lib/utils/isUrl";
+import type { WebSearchSource } from "$lib/types/WebSearch";
 
-export async function searchWebLocal(query: string) {
+export default async function searchWebLocal(query: string): Promise<WebSearchSource[]> {
 	const abortController = new AbortController();
 	setTimeout(() => abortController.abort(), 10000);
 
-	const htmlString = await fetch("https://www.google.com/search?hl=en&q=" + query, {
-		signal: abortController.signal,
-	})
+	const htmlString = await fetch(
+		"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
+		{ signal: abortController.signal }
+	)
 		.then((response) => response.text())
 		.catch();
 
 	const virtualConsole = new VirtualConsole();
+	virtualConsole.on("error", () => {}); // No-op to skip console errors.
+	const document = new JSDOM(htmlString ?? "", { virtualConsole }).window.document;
 
-	virtualConsole.on("error", () => {
-		// No-op to skip console errors.
-	});
-
-	// put the html string into a DOM
-	const dom = new JSDOM(htmlString ?? "", {
-		virtualConsole,
-	});
-
-	const { document } = dom.window;
-	// get all a documents with href tag
-
+	// get all links
 	const links = document.querySelectorAll("a");
-
-	if (!links.length) {
-		throw new Error(`webpage doesn't have any "a" element`);
-	}
+	if (!links.length) throw new Error(`webpage doesn't have any "a" element`);
 
 	// take urls that start with /url?q=
 	// and do not contain google.com links
 	// and strip them up to '&sa='
 	const linksHref = Array.from(links)
-		.filter((el) => el.href?.startsWith("/url?q=") && !el.href.includes("google.com/"))
-		.map((el) => {
-			const link = el.href;
-			return link.slice("/url?q=".length, link.indexOf("&sa="));
-		});
+		.map((el) => el.href)
+		.filter((link) => link.startsWith("/url?q=") && !link.includes("google.com/"))
+		.map((link) => link.slice("/url?q=".length, link.indexOf("&sa=")))
+		.filter(isURL);
 
 	// remove duplicate links and map links to the correct object shape
-	return { organic_results: [...new Set(linksHref)].map((link) => ({ link })) };
+	return [...new Set(linksHref)].map((link) => ({ link }));
 }
src/lib/server/websearch/search/endpoints/youApi.ts ADDED
@@ -0,0 +1,41 @@
+import { env } from "$env/dynamic/private";
+import { isURL } from "$lib/utils/isUrl";
+import type { WebSearchSource } from "$lib/types/WebSearch";
+
+interface YouWebSearch {
+	hits: YouSearchHit[];
+	latency: number;
+}
+
+interface YouSearchHit {
+	url: string;
+	title: string;
+	description: string;
+	snippets: string[];
+}
+
+export default async function searchWebYouApi(query: string): Promise<WebSearchSource[]> {
+	const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
+		method: "GET",
+		headers: {
+			"X-API-Key": env.YDC_API_KEY,
+			"Content-type": "application/json; charset=UTF-8",
+		},
+	});
+
+	if (!response.ok) {
+		throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
+	}
+
+	const data = (await response.json()) as YouWebSearch;
+	const formattedResultsWithSnippets = data.hits
+		.filter(({ url }) => isURL(url))
+		.map(({ title, url, snippets }) => ({
+			title,
+			link: url,
+			text: snippets?.join("\n") || "",
+		}))
+		.sort((a, b) => b.text.length - a.text.length); // desc order by text length
+
+	return formattedResultsWithSnippets;
+}
src/lib/server/websearch/{generateQuery.ts → search/generateQuery.ts} RENAMED
@@ -1,6 +1,6 @@
 import type { Message } from "$lib/types/Message";
 import { format } from "date-fns";
-import { generateFromDefaultEndpoint } from "../generateFromDefaultEndpoint";
+import { generateFromDefaultEndpoint } from "../../generateFromDefaultEndpoint";
 
 export async function generateQuery(messages: Message[]) {
 	const currentDate = format(new Date(), "MMMM d, yyyy");
src/lib/server/websearch/search/search.ts ADDED
@@ -0,0 +1,77 @@
+import type { WebSearchSource } from "$lib/types/WebSearch";
+import type { Message } from "$lib/types/Message";
+import type { Assistant } from "$lib/types/Assistant";
+import type { AppendUpdate } from "../runWebSearch";
+import { getWebSearchProvider, searchWeb } from "./endpoints";
+import { generateQuery } from "./generateQuery";
+import { isURLStringLocal } from "$lib/server/isURLLocal";
+import { isURL } from "$lib/utils/isUrl";
+
+import z from "zod";
+import JSON5 from "json5";
+import { env } from "$env/dynamic/private";
+
+const listSchema = z.array(z.string()).default([]);
+const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST));
+const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST));
+
+export async function search(
+	messages: Message[],
+	ragSettings: Assistant["rag"] | undefined,
+	appendUpdate: AppendUpdate
+): Promise<{ searchQuery: string; pages: WebSearchSource[] }> {
+	if (ragSettings && ragSettings?.allowedLinks.length > 0) {
+		appendUpdate("Using links specified in Assistant");
+		return {
+			searchQuery: "",
+			pages: await directLinksToSource(ragSettings.allowedLinks).then(filterByBlockList),
+		};
+	}
+
+	const searchQuery = await generateQuery(messages);
+	appendUpdate(`Searching ${getWebSearchProvider()}`, [searchQuery]);
+
+	// handle the global and (optional) rag lists
+	if (ragSettings && ragSettings?.allowedDomains.length > 0) {
+		appendUpdate("Filtering on specified domains");
+	}
+	const filters = buildQueryFromSiteFilters(
+		[...(ragSettings?.allowedDomains ?? []), ...allowList],
+		blockList
+	);
+
+	const searchQueryWithFilters = `${filters} ${searchQuery}`;
+	const searchResults = await searchWeb(searchQueryWithFilters).then(filterByBlockList);
+
+	return {
+		searchQuery: searchQueryWithFilters,
+		pages: searchResults,
+	};
+}
+
+// ----------
+// Utils
+function filterByBlockList(results: WebSearchSource[]): WebSearchSource[] {
+	return results.filter((result) => !blockList.some((blocked) => result.link.includes(blocked)));
+}
+
+function buildQueryFromSiteFilters(allow: string[], block: string[]) {
+	return (
+		allow.map((item) => "site:" + item).join(" OR ") +
+		" " +
+		block.map((item) => "-site:" + item).join(" ")
+	);
+}
+
+async function directLinksToSource(links: string[]): Promise<WebSearchSource[]> {
+	if (env.ENABLE_LOCAL_FETCH !== "true") {
+		const localLinks = await Promise.all(links.map(isURLStringLocal));
+		links = links.filter((_, index) => !localLinks[index]);
+	}
+
+	return links.filter(isURL).map((link) => ({
+		link,
+		title: "",
+		text: [""],
+	}));
+}
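For example, the module-internal buildQueryFromSiteFilters helper with two allowed domains and one blocked one yields the prefix that gets prepended to the generated query (domains here are illustrative):

buildQueryFromSiteFilters(["example.com", "wikipedia.org"], ["pinterest.com"]);
// => "site:example.com OR site:wikipedia.org -site:pinterest.com"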
src/lib/server/websearch/searchWeb.ts DELETED
@@ -1,148 +0,0 @@
-import type { YouWebSearch } from "../../types/WebSearch";
-import { WebSearchProvider } from "../../types/WebSearch";
-import { env } from "$env/dynamic/private";
-import { getJson } from "serpapi";
-import type { GoogleParameters } from "serpapi";
-import { searchWebLocal } from "./searchWebLocal";
-import { searchSearxng } from "./searchSearxng";
-
-// get which SERP api is providing web results
-export function getWebSearchProvider() {
-	if (env.YDC_API_KEY) {
-		return WebSearchProvider.YOU;
-	} else if (env.SEARXNG_QUERY_URL) {
-		return WebSearchProvider.SEARXNG;
-	} else {
-		return WebSearchProvider.GOOGLE;
-	}
-}
-
-// Show result as JSON
-export async function searchWeb(query: string) {
-	if (env.USE_LOCAL_WEBSEARCH) {
-		return await searchWebLocal(query);
-	}
-	if (env.SEARXNG_QUERY_URL) {
-		return await searchSearxng(query);
-	}
-	if (env.SERPER_API_KEY) {
-		return await searchWebSerper(query);
-	}
-	if (env.YDC_API_KEY) {
-		return await searchWebYouApi(query);
-	}
-	if (env.SERPAPI_KEY) {
-		return await searchWebSerpApi(query);
-	}
-	if (env.SERPSTACK_API_KEY) {
-		return await searchSerpStack(query);
-	}
-	throw new Error("No You.com or Serper.dev or SerpAPI key found");
-}
-
-export async function searchWebSerper(query: string) {
-	const params = {
-		q: query,
-		hl: "en",
-		gl: "us",
-	};
-
-	const response = await fetch("https://google.serper.dev/search", {
-		method: "POST",
-		body: JSON.stringify(params),
-		headers: {
-			"x-api-key": env.SERPER_API_KEY,
-			"Content-type": "application/json; charset=UTF-8",
-		},
-	});
-
-	/* eslint-disable @typescript-eslint/no-explicit-any */
-	const data = (await response.json()) as Record<string, any>;
-
-	if (!response.ok) {
-		throw new Error(
-			data["message"] ??
-				`Serper API returned error code ${response.status} - ${response.statusText}`
-		);
-	}
-
-	return {
-		organic_results: data["organic"] ?? [],
-	};
-}
-
-export async function searchWebSerpApi(query: string) {
-	const params = {
-		q: query,
-		hl: "en",
-		gl: "us",
-		google_domain: "google.com",
-		api_key: env.SERPAPI_KEY,
-	} satisfies GoogleParameters;
-
-	// Show result as JSON
-	const response = await getJson("google", params);
-
-	return response;
-}
-
-export async function searchWebYouApi(query: string) {
-	const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
-		method: "GET",
-		headers: {
-			"X-API-Key": env.YDC_API_KEY,
-			"Content-type": "application/json; charset=UTF-8",
-		},
-	});
-
-	if (!response.ok) {
-		throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
-	}
-
-	const data = (await response.json()) as YouWebSearch;
-	const formattedResultsWithSnippets = data.hits
-		.map(({ title, url, snippets }) => ({
-			title,
-			link: url,
-			text: snippets?.join("\n") || "",
-			hostname: new URL(url).hostname,
-		}))
-		.sort((a, b) => b.text.length - a.text.length); // desc order by text length
-
-	return {
-		organic_results: formattedResultsWithSnippets,
-	};
-}
-
-export async function searchSerpStack(query: string) {
-	const response = await fetch(
-		`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
-		{
-			method: "GET",
-			headers: {
-				"Content-type": "application/json; charset=UTF-8",
-			},
-		}
-	);
-
-	const data = (await response.json()) as Record<string, any>;
-
-	if (!response.ok) {
-		throw new Error(
-			data["error"] ??
-				`SerpStack API returned error code ${response.status} - ${response.statusText}`
-		);
-	}
-
-	const resultsWithSnippets = data["organic_results"].map(
-		({ title, url, snippet }: { title: string; url: string; snippet: string | undefined }) => ({
-			title,
-			link: url,
-			text: snippet || "",
-		})
-	);
-
-	return {
-		organic_results: resultsWithSnippets ?? [],
-	};
-}
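
The provider dispatch deleted above moves into per-provider endpoint modules elsewhere in this PR. As a rough sketch of the new shape — the function name and exact signature below are assumptions for illustration; only the request and response fields mirror the deleted Serper code — each endpoint now normalizes its provider's response into `WebSearchSource[]` rather than returning a loose `organic_results` blob:

```ts
import { env } from "$env/dynamic/private";
import type { WebSearchSource } from "$lib/types/WebSearch";

// Hypothetical post-refactor endpoint: normalize Serper's response into
// WebSearchSource[] so every provider returns the same shape.
export async function searchSerper(query: string): Promise<WebSearchSource[]> {
	const response = await fetch("https://google.serper.dev/search", {
		method: "POST",
		headers: {
			"x-api-key": env.SERPER_API_KEY,
			"Content-type": "application/json; charset=UTF-8",
		},
		body: JSON.stringify({ q: query, hl: "en", gl: "us" }),
	});

	/* eslint-disable @typescript-eslint/no-explicit-any */
	const data = (await response.json()) as Record<string, any>;
	if (!response.ok) {
		throw new Error(
			data["message"] ?? `Serper API returned error code ${response.status} - ${response.statusText}`
		);
	}

	// Serper returns organic results as { title, link, ... } objects.
	return (data["organic"] ?? []).map(({ title, link }: { title: string; link: string }) => ({
		title,
		link,
	}));
}
```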
src/lib/types/WebSearch.ts CHANGED
@@ -1,6 +1,7 @@
 import type { ObjectId } from "mongodb";
 import type { Conversation } from "./Conversation";
 import type { Timestamps } from "./Timestamps";
+import type { HeaderElement } from "$lib/server/websearch/markdown/types";

 export interface WebSearch extends Timestamps {
 	_id?: ObjectId;
@@ -14,14 +15,24 @@ export interface WebSearch extends Timestamps {
 }

 export interface WebSearchSource {
-	title: string;
+	title?: string;
 	link: string;
-	hostname: string;
-	text?: string; // You.com provides text of webpage right away
+}
+export interface WebSearchScrapedSource extends WebSearchSource {
+	page: WebSearchPage;
+}
+export interface WebSearchPage {
+	title: string;
+	siteName?: string;
+	author?: string;
+	description?: string;
+	createdAt?: string;
+	modifiedAt?: string;
+	markdownTree: HeaderElement;
 }

-export interface WebSearchUsedSource extends WebSearchSource {
-	context: { idx: number; text: string }[];
+export interface WebSearchUsedSource extends WebSearchScrapedSource {
+	context: string;
 }

 export type WebSearchMessageSources = {
@@ -29,18 +40,6 @@ export type WebSearchMessageSources = {
 	sources: WebSearchSource[];
 };

-export interface YouWebSearch {
-	hits: YouSearchHit[];
-	latency: number;
-}
-
-interface YouSearchHit {
-	url: string;
-	title: string;
-	description: string;
-	snippets: string[];
-}
-
 // eslint-disable-next-line no-shadow
 export enum WebSearchProvider {
 	GOOGLE = "Google",
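
To make the reshaped types concrete, a small illustrative example (the empty-object cast stands in for a real `HeaderElement`, whose definition lives in `markdown/types.ts` and is not part of this excerpt):

```ts
import type {
	WebSearchSource,
	WebSearchScrapedSource,
	WebSearchUsedSource,
} from "$lib/types/WebSearch";
import type { HeaderElement } from "$lib/server/websearch/markdown/types";

// A raw search result now needs only a link; title became optional.
const source: WebSearchSource = { link: "https://huggingface.co/blog" };

// After scraping, the source gains page metadata plus the parsed markdown tree.
const scraped: WebSearchScrapedSource = {
	...source,
	page: {
		title: "Hugging Face Blog",
		siteName: "Hugging Face",
		markdownTree: {} as HeaderElement, // placeholder; the real tree comes from the parser
	},
};

// A source actually cited by the model carries a single flattened context string
// (previously an array of { idx, text } chunks).
const used: WebSearchUsedSource = { ...scraped, context: "…extracted text…" };
```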
src/lib/utils/isUrl.ts ADDED
@@ -0,0 +1,8 @@
+export function isURL(url: string) {
+	try {
+		new URL(url);
+		return true;
+	} catch (e) {
+		return false;
+	}
+}
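
Worth noting: `isURL` only checks that the string parses as a WHATWG URL, not that it is reachable or uses http(s); callers that care about scheme must check separately. A quick usage sketch:

```ts
import { isURL } from "$lib/utils/isUrl";

isURL("https://huggingface.co"); // true
isURL("not a url"); // false
isURL("ftp://example.com"); // true — any parseable scheme is accepted
```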
src/lib/utils/timeout.ts CHANGED
@@ -1,6 +1,9 @@
 export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
 	let timer: NodeJS.Timeout;
-	return Promise.race([prom, new Promise<T>((_r, rej) => (timer = setTimeout(rej, time)))]).finally(
-		() => clearTimeout(timer)
-	);
+	return Promise.race([
+		prom,
+		new Promise<T>((_, reject) => {
+			timer = setTimeout(() => reject(new Error(`Timeout after ${time / 1000} seconds`)), time);
+		}),
+	]).finally(() => clearTimeout(timer));
 };
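
This rewrite changes the failure mode: the old race rejected with `undefined`, while the new one rejects with a descriptive `Error`, which surfaces far more useful messages when a scrape stalls. A usage sketch:

```ts
import { timeout } from "$lib/utils/timeout";

try {
	const res = await timeout(fetch("https://example.com"), 5000);
	console.log(res.status);
} catch (err) {
	// e.g. "Error: Timeout after 5 seconds" if the fetch did not settle in time
	console.error(err);
}
```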