Spaces:
Running
on
CPU Upgrade
Web Search: Playwright, spatial parsing, markdown (#1094)
Browse files* feat: playwright, spatial parsing, markdown for web search
Co-authored-by: Aaditya Sahay <[email protected]>
* feat: choose multiple clusters if necessary (#2)
* chore: resolve linting failures
* feat: improve paring performance and error messages
* feat: combine embeddable chunks together on cpu
* feat: reduce parsed pages from 10 to 8
* feat: disable javascript in playwright by default
* feat: embedding and parsing error messages
* feat: move isURL, fix type errors, misc
* feat: misc cleanup
* feat: change serializedHtmlElement to interface
* fix: isUrl filename
* fix: add playwright dependencies to docker
* feat: add playwright browsers to docker image
* feat: enable javascript by default
* feat: remove error message from console on failed page
---------
Co-authored-by: Aaditya Sahay <[email protected]>
Co-authored-by: Aaditya Sahay <[email protected]>
- .env +2 -1
- Dockerfile +6 -0
- README.md +2 -0
- package-lock.json +289 -27
- package.json +5 -0
- src/lib/components/chat/ChatMessage.svelte +3 -3
- src/lib/server/embeddingEndpoints/hfApi/embeddingHfApi.ts +6 -1
- src/lib/server/isURLLocal.ts +31 -19
- src/lib/server/preprocessMessages.ts +4 -6
- src/lib/server/sentenceSimilarity.ts +12 -21
- src/lib/server/websearch/embed/combine.ts +37 -0
- src/lib/server/websearch/embed/embed.ts +80 -0
- src/lib/server/websearch/embed/tree.ts +6 -0
- src/lib/server/websearch/markdown/fromHtml.ts +98 -0
- src/lib/server/websearch/markdown/tree.ts +63 -0
- src/lib/server/websearch/markdown/types.ts +55 -0
- src/lib/server/websearch/markdown/utils/chunk.ts +60 -0
- src/lib/server/websearch/markdown/utils/nlp.ts +11 -0
- src/lib/server/websearch/markdown/utils/stringify.ts +75 -0
- src/lib/server/websearch/parseWeb.ts +0 -41
- src/lib/server/websearch/runWebSearch.ts +69 -145
- src/lib/server/websearch/scrape/parser.ts +552 -0
- src/lib/server/websearch/scrape/playwright.ts +59 -0
- src/lib/server/websearch/scrape/scrape.ts +34 -0
- src/lib/server/websearch/scrape/types.ts +5 -0
- src/lib/server/websearch/search/endpoints.ts +27 -0
- src/lib/server/websearch/{searchSearxng.ts → search/endpoints/searxng.ts} +5 -3
- src/lib/server/websearch/search/endpoints/serpApi.ts +25 -0
- src/lib/server/websearch/search/endpoints/serpStack.ts +35 -0
- src/lib/server/websearch/search/endpoints/serper.ts +31 -0
- src/lib/server/websearch/{searchWebLocal.ts → search/endpoints/webLocal.ts} +16 -26
- src/lib/server/websearch/search/endpoints/youApi.ts +41 -0
- src/lib/server/websearch/{generateQuery.ts → search/generateQuery.ts} +1 -1
- src/lib/server/websearch/search/search.ts +77 -0
- src/lib/server/websearch/searchWeb.ts +0 -148
- src/lib/types/WebSearch.ts +16 -17
- src/lib/utils/isUrl.ts +8 -0
- src/lib/utils/timeout.ts +6 -3
@@ -27,6 +27,7 @@ SEARXNG_QUERY_URL=# where '<query>' will be replaced with query keywords see htt
|
|
27 |
|
28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
|
|
30 |
|
31 |
# Parameters to enable open id login
|
32 |
OPENID_CONFIG=`{
|
@@ -155,4 +156,4 @@ ALLOWED_USER_EMAILS=`[]` # if it's defined, only these emails will be allowed to
|
|
155 |
USAGE_LIMITS=`{}`
|
156 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
157 |
METRICS_PORT=
|
158 |
-
LOG_LEVEL=info
|
|
|
27 |
|
28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
30 |
+
WEBSEARCH_JAVASCRIPT=true # CPU usage reduces by 60% on average by disabling javascript. Enable to improve website compatibility
|
31 |
|
32 |
# Parameters to enable open id login
|
33 |
OPENID_CONFIG=`{
|
|
|
156 |
USAGE_LIMITS=`{}`
|
157 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
158 |
METRICS_PORT=
|
159 |
+
LOG_LEVEL=info
|
@@ -83,6 +83,12 @@ COPY --chown=1000 gcp-*.json /app/
|
|
83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
RUN chmod +x /app/entrypoint.sh
|
87 |
|
88 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
|
|
83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
85 |
|
86 |
+
RUN npx playwright install
|
87 |
+
|
88 |
+
USER root
|
89 |
+
RUN npx playwright install-deps
|
90 |
+
USER user
|
91 |
+
|
92 |
RUN chmod +x /app/entrypoint.sh
|
93 |
|
94 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
@@ -170,6 +170,8 @@ You can enable the web search through an API by adding `YDC_API_KEY` ([docs.you.
|
|
170 |
|
171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
172 |
|
|
|
|
|
173 |
### Custom models
|
174 |
|
175 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
|
|
170 |
|
171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
172 |
|
173 |
+
You can enable Javascript when parsing webpages to improve compatibility with `WEBSEARCH_JAVASCRIPT=true` at the cost of increased CPU usage. You'll want at least 4 cores when enabling.
|
174 |
+
|
175 |
### Custom models
|
176 |
|
177 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
@@ -8,9 +8,11 @@
|
|
8 |
"name": "chat-ui",
|
9 |
"version": "0.8.4",
|
10 |
"dependencies": {
|
|
|
11 |
"@huggingface/hub": "^0.5.1",
|
12 |
"@huggingface/inference": "^2.6.3",
|
13 |
"@iconify-json/bi": "^1.1.21",
|
|
|
14 |
"@resvg/resvg-js": "^2.6.0",
|
15 |
"@xenova/transformers": "^2.16.1",
|
16 |
"autoprefixer": "^10.4.14",
|
@@ -32,10 +34,12 @@
|
|
32 |
"parquetjs": "^0.11.2",
|
33 |
"pino": "^9.0.0",
|
34 |
"pino-pretty": "^11.0.0",
|
|
|
35 |
"postcss": "^8.4.31",
|
36 |
"saslprep": "^1.0.3",
|
37 |
"satori": "^0.10.11",
|
38 |
"satori-html": "^0.3.2",
|
|
|
39 |
"serpapi": "^1.1.1",
|
40 |
"sharp": "^0.33.2",
|
41 |
"tailwind-scrollbar": "^3.0.0",
|
@@ -55,6 +59,7 @@
|
|
55 |
"@types/jsdom": "^21.1.1",
|
56 |
"@types/minimist": "^1.2.5",
|
57 |
"@types/parquetjs": "^0.10.3",
|
|
|
58 |
"@types/uuid": "^9.0.8",
|
59 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
60 |
"@typescript-eslint/parser": "^6.x",
|
@@ -159,39 +164,54 @@
|
|
159 |
}
|
160 |
},
|
161 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
162 |
-
"version": "0.3.
|
163 |
-
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.
|
164 |
-
"integrity": "sha512-
|
165 |
"optional": true,
|
166 |
"dependencies": {
|
167 |
-
"@anthropic-ai/sdk": "
|
168 |
"google-auth-library": "^9.4.2"
|
169 |
}
|
170 |
},
|
171 |
-
"node_modules/@
|
172 |
-
"version": "
|
173 |
-
"resolved": "https://registry.npmjs.org/@
|
174 |
-
"integrity": "sha512
|
175 |
-
"optional": true,
|
176 |
"dependencies": {
|
177 |
-
"@
|
178 |
-
"@
|
179 |
-
"
|
180 |
-
"
|
181 |
-
"
|
182 |
-
"
|
183 |
-
"
|
184 |
-
"
|
185 |
-
"web-streams-polyfill": "^3.2.1"
|
186 |
}
|
187 |
},
|
188 |
-
"node_modules/@
|
189 |
-
"version": "
|
190 |
-
"resolved": "https://registry.npmjs.org/
|
191 |
-
"integrity": "sha512-
|
192 |
-
"
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
}
|
196 |
},
|
197 |
"node_modules/@cspotcode/source-map-support": {
|
@@ -1314,6 +1334,18 @@
|
|
1314 |
"node": ">=8.0.0"
|
1315 |
}
|
1316 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1317 |
"node_modules/@polka/url": {
|
1318 |
"version": "1.0.0-next.21",
|
1319 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
@@ -1374,6 +1406,43 @@
|
|
1374 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
1375 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
1376 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1377 |
"node_modules/@resvg/resvg-js": {
|
1378 |
"version": "2.6.0",
|
1379 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
@@ -2063,6 +2132,15 @@
|
|
2063 |
"@types/chai": "*"
|
2064 |
}
|
2065 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2066 |
"node_modules/@types/connect": {
|
2067 |
"version": "3.4.38",
|
2068 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
@@ -2108,6 +2186,29 @@
|
|
2108 |
"@types/send": "*"
|
2109 |
}
|
2110 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2111 |
"node_modules/@types/http-errors": {
|
2112 |
"version": "2.0.4",
|
2113 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
@@ -2216,6 +2317,12 @@
|
|
2216 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
2217 |
"dev": true
|
2218 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
2219 |
"node_modules/@types/semver": {
|
2220 |
"version": "7.5.3",
|
2221 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
@@ -3660,7 +3767,6 @@
|
|
3660 |
"version": "4.3.1",
|
3661 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
3662 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
3663 |
-
"dev": true,
|
3664 |
"engines": {
|
3665 |
"node": ">=0.10.0"
|
3666 |
}
|
@@ -3791,6 +3897,30 @@
|
|
3791 |
"node": ">=6.0.0"
|
3792 |
}
|
3793 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3794 |
"node_modules/domexception": {
|
3795 |
"version": "4.0.0",
|
3796 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
@@ -3802,6 +3932,33 @@
|
|
3802 |
"node": ">=12"
|
3803 |
}
|
3804 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3805 |
"node_modules/dotenv": {
|
3806 |
"version": "16.0.3",
|
3807 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
@@ -3940,7 +4097,6 @@
|
|
3940 |
"version": "4.0.0",
|
3941 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
3942 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
3943 |
-
"dev": true,
|
3944 |
"engines": {
|
3945 |
"node": ">=10"
|
3946 |
},
|
@@ -4924,6 +5080,24 @@
|
|
4924 |
"node": ">=12"
|
4925 |
}
|
4926 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4927 |
"node_modules/http-errors": {
|
4928 |
"version": "2.0.0",
|
4929 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
@@ -5194,6 +5368,14 @@
|
|
5194 |
"node": ">=8"
|
5195 |
}
|
5196 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5197 |
"node_modules/is-potential-custom-element-name": {
|
5198 |
"version": "1.0.1",
|
5199 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
@@ -6354,6 +6536,11 @@
|
|
6354 |
"hex-rgb": "^4.1.0"
|
6355 |
}
|
6356 |
},
|
|
|
|
|
|
|
|
|
|
|
6357 |
"node_modules/parse5": {
|
6358 |
"version": "7.1.2",
|
6359 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
@@ -6645,6 +6832,47 @@
|
|
6645 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
6646 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
6647 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6648 |
"node_modules/postcss": {
|
6649 |
"version": "8.4.35",
|
6650 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
@@ -7431,6 +7659,19 @@
|
|
7431 |
"rimraf": "bin.js"
|
7432 |
}
|
7433 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7434 |
"node_modules/saslprep": {
|
7435 |
"version": "1.0.3",
|
7436 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
@@ -7481,6 +7722,14 @@
|
|
7481 |
"node": ">=v12.22.7"
|
7482 |
}
|
7483 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7484 |
"node_modules/secure-json-parse": {
|
7485 |
"version": "2.7.0",
|
7486 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
@@ -8428,6 +8677,19 @@
|
|
8428 |
"node": ">=14.0.0"
|
8429 |
}
|
8430 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8431 |
"node_modules/to-regex-range": {
|
8432 |
"version": "5.0.1",
|
8433 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
|
|
8 |
"name": "chat-ui",
|
9 |
"version": "0.8.4",
|
10 |
"dependencies": {
|
11 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
12 |
"@huggingface/hub": "^0.5.1",
|
13 |
"@huggingface/inference": "^2.6.3",
|
14 |
"@iconify-json/bi": "^1.1.21",
|
15 |
+
"@playwright/browser-chromium": "^1.43.1",
|
16 |
"@resvg/resvg-js": "^2.6.0",
|
17 |
"@xenova/transformers": "^2.16.1",
|
18 |
"autoprefixer": "^10.4.14",
|
|
|
34 |
"parquetjs": "^0.11.2",
|
35 |
"pino": "^9.0.0",
|
36 |
"pino-pretty": "^11.0.0",
|
37 |
+
"playwright": "^1.40.0",
|
38 |
"postcss": "^8.4.31",
|
39 |
"saslprep": "^1.0.3",
|
40 |
"satori": "^0.10.11",
|
41 |
"satori-html": "^0.3.2",
|
42 |
+
"sbd": "^1.0.19",
|
43 |
"serpapi": "^1.1.1",
|
44 |
"sharp": "^0.33.2",
|
45 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
59 |
"@types/jsdom": "^21.1.1",
|
60 |
"@types/minimist": "^1.2.5",
|
61 |
"@types/parquetjs": "^0.10.3",
|
62 |
+
"@types/sbd": "^1.0.5",
|
63 |
"@types/uuid": "^9.0.8",
|
64 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
65 |
"@typescript-eslint/parser": "^6.x",
|
|
|
164 |
}
|
165 |
},
|
166 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
167 |
+
"version": "0.3.6",
|
168 |
+
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.6.tgz",
|
169 |
+
"integrity": "sha512-4pNVobcCsPCWLSaFJkT/XxwX5rmot+q2PE2LF5vfuRNFTWFjeTrsPgTB48D0Sce/c/2p4fddrFKGN6fdnn8zRg==",
|
170 |
"optional": true,
|
171 |
"dependencies": {
|
172 |
+
"@anthropic-ai/sdk": ">=0.14 <1",
|
173 |
"google-auth-library": "^9.4.2"
|
174 |
}
|
175 |
},
|
176 |
+
"node_modules/@cliqz/adblocker": {
|
177 |
+
"version": "1.27.2",
|
178 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker/-/adblocker-1.27.2.tgz",
|
179 |
+
"integrity": "sha512-sFjbx9xBGWaOsvVFVHVUNOrzCafGtjYDAp95KTeoJcNZbPs4D2RsabYZEeg4JkwPkfhcFseJqfnsMyJ4XsqVfQ==",
|
|
|
180 |
"dependencies": {
|
181 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
182 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2",
|
183 |
+
"@remusao/guess-url-type": "^1.2.1",
|
184 |
+
"@remusao/small": "^1.2.1",
|
185 |
+
"@remusao/smaz": "^1.9.1",
|
186 |
+
"@types/chrome": "^0.0.266",
|
187 |
+
"@types/firefox-webext-browser": "^120.0.0",
|
188 |
+
"tldts-experimental": "^6.0.14"
|
|
|
189 |
}
|
190 |
},
|
191 |
+
"node_modules/@cliqz/adblocker-content": {
|
192 |
+
"version": "1.27.2",
|
193 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-content/-/adblocker-content-1.27.2.tgz",
|
194 |
+
"integrity": "sha512-fzxsOt7r3YUgxoyW9GPCOShKOLNbB4n3gWtyMBFQ+lwHsQKfLehxN4Zxjg4Ad6AXJNW4gfIBq69ghnj2jHfviw==",
|
195 |
+
"dependencies": {
|
196 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2"
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"node_modules/@cliqz/adblocker-extended-selectors": {
|
200 |
+
"version": "1.27.2",
|
201 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.27.2.tgz",
|
202 |
+
"integrity": "sha512-HZ03U8pAOuEwTo1vZ9tv49kIC4riWqYvr5p3illZshxo+eCUi8CPbgYSyYCtgd1JpO1wNnCwEX95/twXfT8cnA=="
|
203 |
+
},
|
204 |
+
"node_modules/@cliqz/adblocker-playwright": {
|
205 |
+
"version": "1.27.2",
|
206 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-playwright/-/adblocker-playwright-1.27.2.tgz",
|
207 |
+
"integrity": "sha512-b+OoWKz/h787YItfCwjnhZ8l6/bv/DPTzaq1pyyY6Ovpdd+dGvVW1fehw+87FC6j/WQbTeuOdpLiwp8ouvrftg==",
|
208 |
+
"dependencies": {
|
209 |
+
"@cliqz/adblocker": "^1.27.2",
|
210 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
211 |
+
"tldts-experimental": "^6.0.14"
|
212 |
+
},
|
213 |
+
"peerDependencies": {
|
214 |
+
"playwright": "^1.x"
|
215 |
}
|
216 |
},
|
217 |
"node_modules/@cspotcode/source-map-support": {
|
|
|
1334 |
"node": ">=8.0.0"
|
1335 |
}
|
1336 |
},
|
1337 |
+
"node_modules/@playwright/browser-chromium": {
|
1338 |
+
"version": "1.43.1",
|
1339 |
+
"resolved": "https://registry.npmjs.org/@playwright/browser-chromium/-/browser-chromium-1.43.1.tgz",
|
1340 |
+
"integrity": "sha512-CBuHhRIF/VGyUnPvK7/4IUbm0AAOZZI5huHlr+qNr5cFQpQ6TXBqOwSMef/xUz9HcjxWOxDPION7br1kOlyV/A==",
|
1341 |
+
"hasInstallScript": true,
|
1342 |
+
"dependencies": {
|
1343 |
+
"playwright-core": "1.43.1"
|
1344 |
+
},
|
1345 |
+
"engines": {
|
1346 |
+
"node": ">=16"
|
1347 |
+
}
|
1348 |
+
},
|
1349 |
"node_modules/@polka/url": {
|
1350 |
"version": "1.0.0-next.21",
|
1351 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
|
|
1406 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
1407 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
1408 |
},
|
1409 |
+
"node_modules/@remusao/guess-url-type": {
|
1410 |
+
"version": "1.2.1",
|
1411 |
+
"resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.2.1.tgz",
|
1412 |
+
"integrity": "sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA=="
|
1413 |
+
},
|
1414 |
+
"node_modules/@remusao/small": {
|
1415 |
+
"version": "1.2.1",
|
1416 |
+
"resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.2.1.tgz",
|
1417 |
+
"integrity": "sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw=="
|
1418 |
+
},
|
1419 |
+
"node_modules/@remusao/smaz": {
|
1420 |
+
"version": "1.9.1",
|
1421 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.9.1.tgz",
|
1422 |
+
"integrity": "sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==",
|
1423 |
+
"dependencies": {
|
1424 |
+
"@remusao/smaz-compress": "^1.9.1",
|
1425 |
+
"@remusao/smaz-decompress": "^1.9.1"
|
1426 |
+
}
|
1427 |
+
},
|
1428 |
+
"node_modules/@remusao/smaz-compress": {
|
1429 |
+
"version": "1.9.1",
|
1430 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.9.1.tgz",
|
1431 |
+
"integrity": "sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==",
|
1432 |
+
"dependencies": {
|
1433 |
+
"@remusao/trie": "^1.4.1"
|
1434 |
+
}
|
1435 |
+
},
|
1436 |
+
"node_modules/@remusao/smaz-decompress": {
|
1437 |
+
"version": "1.9.1",
|
1438 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.9.1.tgz",
|
1439 |
+
"integrity": "sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A=="
|
1440 |
+
},
|
1441 |
+
"node_modules/@remusao/trie": {
|
1442 |
+
"version": "1.4.1",
|
1443 |
+
"resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.4.1.tgz",
|
1444 |
+
"integrity": "sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q=="
|
1445 |
+
},
|
1446 |
"node_modules/@resvg/resvg-js": {
|
1447 |
"version": "2.6.0",
|
1448 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
|
|
2132 |
"@types/chai": "*"
|
2133 |
}
|
2134 |
},
|
2135 |
+
"node_modules/@types/chrome": {
|
2136 |
+
"version": "0.0.266",
|
2137 |
+
"resolved": "https://registry.npmjs.org/@types/chrome/-/chrome-0.0.266.tgz",
|
2138 |
+
"integrity": "sha512-QSQWJTL7NjZElvq/6/E5C1+pHgEP8UAJzwoz7M4vSJ7AECt6NNehJ+tU6snnvuTqZOBjFCivvitYo5+8tNPmhg==",
|
2139 |
+
"dependencies": {
|
2140 |
+
"@types/filesystem": "*",
|
2141 |
+
"@types/har-format": "*"
|
2142 |
+
}
|
2143 |
+
},
|
2144 |
"node_modules/@types/connect": {
|
2145 |
"version": "3.4.38",
|
2146 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
|
|
2186 |
"@types/send": "*"
|
2187 |
}
|
2188 |
},
|
2189 |
+
"node_modules/@types/filesystem": {
|
2190 |
+
"version": "0.0.36",
|
2191 |
+
"resolved": "https://registry.npmjs.org/@types/filesystem/-/filesystem-0.0.36.tgz",
|
2192 |
+
"integrity": "sha512-vPDXOZuannb9FZdxgHnqSwAG/jvdGM8Wq+6N4D/d80z+D4HWH+bItqsZaVRQykAn6WEVeEkLm2oQigyHtgb0RA==",
|
2193 |
+
"dependencies": {
|
2194 |
+
"@types/filewriter": "*"
|
2195 |
+
}
|
2196 |
+
},
|
2197 |
+
"node_modules/@types/filewriter": {
|
2198 |
+
"version": "0.0.33",
|
2199 |
+
"resolved": "https://registry.npmjs.org/@types/filewriter/-/filewriter-0.0.33.tgz",
|
2200 |
+
"integrity": "sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g=="
|
2201 |
+
},
|
2202 |
+
"node_modules/@types/firefox-webext-browser": {
|
2203 |
+
"version": "120.0.3",
|
2204 |
+
"resolved": "https://registry.npmjs.org/@types/firefox-webext-browser/-/firefox-webext-browser-120.0.3.tgz",
|
2205 |
+
"integrity": "sha512-APbBSxOvFMbKwXy/4YrEVa5Di6N0C9yl4w0WA0xzdkOrChAfPQ/KlcC8QLyhemHCHpF1CB/zHy52+oUQurViOg=="
|
2206 |
+
},
|
2207 |
+
"node_modules/@types/har-format": {
|
2208 |
+
"version": "1.2.15",
|
2209 |
+
"resolved": "https://registry.npmjs.org/@types/har-format/-/har-format-1.2.15.tgz",
|
2210 |
+
"integrity": "sha512-RpQH4rXLuvTXKR0zqHq3go0RVXYv/YVqv4TnPH95VbwUxZdQlK1EtcMvQvMpDngHbt13Csh9Z4qT9AbkiQH5BA=="
|
2211 |
+
},
|
2212 |
"node_modules/@types/http-errors": {
|
2213 |
"version": "2.0.4",
|
2214 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
|
|
2317 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
2318 |
"dev": true
|
2319 |
},
|
2320 |
+
"node_modules/@types/sbd": {
|
2321 |
+
"version": "1.0.5",
|
2322 |
+
"resolved": "https://registry.npmjs.org/@types/sbd/-/sbd-1.0.5.tgz",
|
2323 |
+
"integrity": "sha512-60PxBBWhg0C3yb5bTP+wwWYGTKMcuB0S6mTEa1sedMC79tYY0Ei7YjU4qsWzGn++lWscLQde16SnElJrf5/aTw==",
|
2324 |
+
"dev": true
|
2325 |
+
},
|
2326 |
"node_modules/@types/semver": {
|
2327 |
"version": "7.5.3",
|
2328 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
|
|
3767 |
"version": "4.3.1",
|
3768 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
3769 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
|
|
3770 |
"engines": {
|
3771 |
"node": ">=0.10.0"
|
3772 |
}
|
|
|
3897 |
"node": ">=6.0.0"
|
3898 |
}
|
3899 |
},
|
3900 |
+
"node_modules/dom-serializer": {
|
3901 |
+
"version": "2.0.0",
|
3902 |
+
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
3903 |
+
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
3904 |
+
"dependencies": {
|
3905 |
+
"domelementtype": "^2.3.0",
|
3906 |
+
"domhandler": "^5.0.2",
|
3907 |
+
"entities": "^4.2.0"
|
3908 |
+
},
|
3909 |
+
"funding": {
|
3910 |
+
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
3911 |
+
}
|
3912 |
+
},
|
3913 |
+
"node_modules/domelementtype": {
|
3914 |
+
"version": "2.3.0",
|
3915 |
+
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
3916 |
+
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
3917 |
+
"funding": [
|
3918 |
+
{
|
3919 |
+
"type": "github",
|
3920 |
+
"url": "https://github.com/sponsors/fb55"
|
3921 |
+
}
|
3922 |
+
]
|
3923 |
+
},
|
3924 |
"node_modules/domexception": {
|
3925 |
"version": "4.0.0",
|
3926 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
|
|
3932 |
"node": ">=12"
|
3933 |
}
|
3934 |
},
|
3935 |
+
"node_modules/domhandler": {
|
3936 |
+
"version": "5.0.3",
|
3937 |
+
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
3938 |
+
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
3939 |
+
"dependencies": {
|
3940 |
+
"domelementtype": "^2.3.0"
|
3941 |
+
},
|
3942 |
+
"engines": {
|
3943 |
+
"node": ">= 4"
|
3944 |
+
},
|
3945 |
+
"funding": {
|
3946 |
+
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
3947 |
+
}
|
3948 |
+
},
|
3949 |
+
"node_modules/domutils": {
|
3950 |
+
"version": "3.1.0",
|
3951 |
+
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
|
3952 |
+
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
|
3953 |
+
"dependencies": {
|
3954 |
+
"dom-serializer": "^2.0.0",
|
3955 |
+
"domelementtype": "^2.3.0",
|
3956 |
+
"domhandler": "^5.0.3"
|
3957 |
+
},
|
3958 |
+
"funding": {
|
3959 |
+
"url": "https://github.com/fb55/domutils?sponsor=1"
|
3960 |
+
}
|
3961 |
+
},
|
3962 |
"node_modules/dotenv": {
|
3963 |
"version": "16.0.3",
|
3964 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
|
|
4097 |
"version": "4.0.0",
|
4098 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
4099 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
|
|
4100 |
"engines": {
|
4101 |
"node": ">=10"
|
4102 |
},
|
|
|
5080 |
"node": ">=12"
|
5081 |
}
|
5082 |
},
|
5083 |
+
"node_modules/htmlparser2": {
|
5084 |
+
"version": "8.0.2",
|
5085 |
+
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
5086 |
+
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
|
5087 |
+
"funding": [
|
5088 |
+
"https://github.com/fb55/htmlparser2?sponsor=1",
|
5089 |
+
{
|
5090 |
+
"type": "github",
|
5091 |
+
"url": "https://github.com/sponsors/fb55"
|
5092 |
+
}
|
5093 |
+
],
|
5094 |
+
"dependencies": {
|
5095 |
+
"domelementtype": "^2.3.0",
|
5096 |
+
"domhandler": "^5.0.3",
|
5097 |
+
"domutils": "^3.0.1",
|
5098 |
+
"entities": "^4.4.0"
|
5099 |
+
}
|
5100 |
+
},
|
5101 |
"node_modules/http-errors": {
|
5102 |
"version": "2.0.0",
|
5103 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
|
|
5368 |
"node": ">=8"
|
5369 |
}
|
5370 |
},
|
5371 |
+
"node_modules/is-plain-object": {
|
5372 |
+
"version": "5.0.0",
|
5373 |
+
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz",
|
5374 |
+
"integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
|
5375 |
+
"engines": {
|
5376 |
+
"node": ">=0.10.0"
|
5377 |
+
}
|
5378 |
+
},
|
5379 |
"node_modules/is-potential-custom-element-name": {
|
5380 |
"version": "1.0.1",
|
5381 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
|
|
6536 |
"hex-rgb": "^4.1.0"
|
6537 |
}
|
6538 |
},
|
6539 |
+
"node_modules/parse-srcset": {
|
6540 |
+
"version": "1.0.2",
|
6541 |
+
"resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz",
|
6542 |
+
"integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
|
6543 |
+
},
|
6544 |
"node_modules/parse5": {
|
6545 |
"version": "7.1.2",
|
6546 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
|
|
6832 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
6833 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
6834 |
},
|
6835 |
+
"node_modules/playwright": {
|
6836 |
+
"version": "1.43.1",
|
6837 |
+
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.43.1.tgz",
|
6838 |
+
"integrity": "sha512-V7SoH0ai2kNt1Md9E3Gwas5B9m8KR2GVvwZnAI6Pg0m3sh7UvgiYhRrhsziCmqMJNouPckiOhk8T+9bSAK0VIA==",
|
6839 |
+
"dependencies": {
|
6840 |
+
"playwright-core": "1.43.1"
|
6841 |
+
},
|
6842 |
+
"bin": {
|
6843 |
+
"playwright": "cli.js"
|
6844 |
+
},
|
6845 |
+
"engines": {
|
6846 |
+
"node": ">=16"
|
6847 |
+
},
|
6848 |
+
"optionalDependencies": {
|
6849 |
+
"fsevents": "2.3.2"
|
6850 |
+
}
|
6851 |
+
},
|
6852 |
+
"node_modules/playwright-core": {
|
6853 |
+
"version": "1.43.1",
|
6854 |
+
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.43.1.tgz",
|
6855 |
+
"integrity": "sha512-EI36Mto2Vrx6VF7rm708qSnesVQKbxEWvPrfA1IPY6HgczBplDx7ENtx+K2n4kJ41sLLkuGfmb0ZLSSXlDhqPg==",
|
6856 |
+
"bin": {
|
6857 |
+
"playwright-core": "cli.js"
|
6858 |
+
},
|
6859 |
+
"engines": {
|
6860 |
+
"node": ">=16"
|
6861 |
+
}
|
6862 |
+
},
|
6863 |
+
"node_modules/playwright/node_modules/fsevents": {
|
6864 |
+
"version": "2.3.2",
|
6865 |
+
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
6866 |
+
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
6867 |
+
"hasInstallScript": true,
|
6868 |
+
"optional": true,
|
6869 |
+
"os": [
|
6870 |
+
"darwin"
|
6871 |
+
],
|
6872 |
+
"engines": {
|
6873 |
+
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
6874 |
+
}
|
6875 |
+
},
|
6876 |
"node_modules/postcss": {
|
6877 |
"version": "8.4.35",
|
6878 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
|
|
7659 |
"rimraf": "bin.js"
|
7660 |
}
|
7661 |
},
|
7662 |
+
"node_modules/sanitize-html": {
|
7663 |
+
"version": "2.13.0",
|
7664 |
+
"resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.0.tgz",
|
7665 |
+
"integrity": "sha512-Xff91Z+4Mz5QiNSLdLWwjgBDm5b1RU6xBT0+12rapjiaR7SwfRdjw8f+6Rir2MXKLrDicRFHdb51hGOAxmsUIA==",
|
7666 |
+
"dependencies": {
|
7667 |
+
"deepmerge": "^4.2.2",
|
7668 |
+
"escape-string-regexp": "^4.0.0",
|
7669 |
+
"htmlparser2": "^8.0.0",
|
7670 |
+
"is-plain-object": "^5.0.0",
|
7671 |
+
"parse-srcset": "^1.0.2",
|
7672 |
+
"postcss": "^8.3.11"
|
7673 |
+
}
|
7674 |
+
},
|
7675 |
"node_modules/saslprep": {
|
7676 |
"version": "1.0.3",
|
7677 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
|
|
7722 |
"node": ">=v12.22.7"
|
7723 |
}
|
7724 |
},
|
7725 |
+
"node_modules/sbd": {
|
7726 |
+
"version": "1.0.19",
|
7727 |
+
"resolved": "https://registry.npmjs.org/sbd/-/sbd-1.0.19.tgz",
|
7728 |
+
"integrity": "sha512-b5RyZMGSrFuIB4AHdbv12uYHS8YGEJ36gtuvG3RflbJGY+T0dXmAL0E4vZjQqT2RsX0v+ZwVqhV2zsGr5aFK9w==",
|
7729 |
+
"dependencies": {
|
7730 |
+
"sanitize-html": "^2.3.2"
|
7731 |
+
}
|
7732 |
+
},
|
7733 |
"node_modules/secure-json-parse": {
|
7734 |
"version": "2.7.0",
|
7735 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
|
|
8677 |
"node": ">=14.0.0"
|
8678 |
}
|
8679 |
},
|
8680 |
+
"node_modules/tldts-core": {
|
8681 |
+
"version": "6.1.18",
|
8682 |
+
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.18.tgz",
|
8683 |
+
"integrity": "sha512-e4wx32F/7dMBSZyKAx825Yte3U0PQtZZ0bkWxYQiwLteRVnQ5zM40fEbi0IyNtwQssgJAk3GCr7Q+w39hX0VKA=="
|
8684 |
+
},
|
8685 |
+
"node_modules/tldts-experimental": {
|
8686 |
+
"version": "6.1.18",
|
8687 |
+
"resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-6.1.18.tgz",
|
8688 |
+
"integrity": "sha512-E9/pAIybo7/MPdsQSKcCDElgObk78Be1gFqO645LbfhL5HG597sOeRQ55EuvIHlTo1Ypyyl+F/V+p0CnrTu3uQ==",
|
8689 |
+
"dependencies": {
|
8690 |
+
"tldts-core": "^6.1.18"
|
8691 |
+
}
|
8692 |
+
},
|
8693 |
"node_modules/to-regex-range": {
|
8694 |
"version": "5.0.1",
|
8695 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
@@ -28,6 +28,7 @@
|
|
28 |
"@types/jsdom": "^21.1.1",
|
29 |
"@types/minimist": "^1.2.5",
|
30 |
"@types/parquetjs": "^0.10.3",
|
|
|
31 |
"@types/uuid": "^9.0.8",
|
32 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
33 |
"@typescript-eslint/parser": "^6.x",
|
@@ -52,9 +53,11 @@
|
|
52 |
},
|
53 |
"type": "module",
|
54 |
"dependencies": {
|
|
|
55 |
"@huggingface/hub": "^0.5.1",
|
56 |
"@huggingface/inference": "^2.6.3",
|
57 |
"@iconify-json/bi": "^1.1.21",
|
|
|
58 |
"@resvg/resvg-js": "^2.6.0",
|
59 |
"@xenova/transformers": "^2.16.1",
|
60 |
"autoprefixer": "^10.4.14",
|
@@ -76,10 +79,12 @@
|
|
76 |
"parquetjs": "^0.11.2",
|
77 |
"pino": "^9.0.0",
|
78 |
"pino-pretty": "^11.0.0",
|
|
|
79 |
"postcss": "^8.4.31",
|
80 |
"saslprep": "^1.0.3",
|
81 |
"satori": "^0.10.11",
|
82 |
"satori-html": "^0.3.2",
|
|
|
83 |
"serpapi": "^1.1.1",
|
84 |
"sharp": "^0.33.2",
|
85 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
28 |
"@types/jsdom": "^21.1.1",
|
29 |
"@types/minimist": "^1.2.5",
|
30 |
"@types/parquetjs": "^0.10.3",
|
31 |
+
"@types/sbd": "^1.0.5",
|
32 |
"@types/uuid": "^9.0.8",
|
33 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
34 |
"@typescript-eslint/parser": "^6.x",
|
|
|
53 |
},
|
54 |
"type": "module",
|
55 |
"dependencies": {
|
56 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
57 |
"@huggingface/hub": "^0.5.1",
|
58 |
"@huggingface/inference": "^2.6.3",
|
59 |
"@iconify-json/bi": "^1.1.21",
|
60 |
+
"@playwright/browser-chromium": "^1.43.1",
|
61 |
"@resvg/resvg-js": "^2.6.0",
|
62 |
"@xenova/transformers": "^2.16.1",
|
63 |
"autoprefixer": "^10.4.14",
|
|
|
79 |
"parquetjs": "^0.11.2",
|
80 |
"pino": "^9.0.0",
|
81 |
"pino-pretty": "^11.0.0",
|
82 |
+
"playwright": "^1.40.0",
|
83 |
"postcss": "^8.4.31",
|
84 |
"saslprep": "^1.0.3",
|
85 |
"satori": "^0.10.11",
|
86 |
"satori-html": "^0.3.2",
|
87 |
+
"sbd": "^1.0.19",
|
88 |
"serpapi": "^1.1.1",
|
89 |
"sharp": "^0.33.2",
|
90 |
"tailwind-scrollbar": "^3.0.0",
|
@@ -227,7 +227,7 @@
|
|
227 |
{#if webSearchSources?.length}
|
228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
229 |
<div class="text-gray-400">Sources:</div>
|
230 |
-
{#each webSearchSources as { link, title
|
231 |
<a
|
232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
233 |
href={link}
|
@@ -235,10 +235,10 @@
|
|
235 |
>
|
236 |
<img
|
237 |
class="h-3.5 w-3.5 rounded"
|
238 |
-
src="https://www.google.com/s2/favicons?sz=64&domain_url={hostname}"
|
239 |
alt="{title} favicon"
|
240 |
/>
|
241 |
-
<div>{hostname.replace(/^www\./, "")}</div>
|
242 |
</a>
|
243 |
{/each}
|
244 |
</div>
|
|
|
227 |
{#if webSearchSources?.length}
|
228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
229 |
<div class="text-gray-400">Sources:</div>
|
230 |
+
{#each webSearchSources as { link, title }}
|
231 |
<a
|
232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
233 |
href={link}
|
|
|
235 |
>
|
236 |
<img
|
237 |
class="h-3.5 w-3.5 rounded"
|
238 |
+
src="https://www.google.com/s2/favicons?sz=64&domain_url={new URL(link).hostname}"
|
239 |
alt="{title} favicon"
|
240 |
/>
|
241 |
+
<div>{new URL(link).hostname.replace(/^www\./, "")}</div>
|
242 |
</a>
|
243 |
{/each}
|
244 |
</div>
|
@@ -32,7 +32,12 @@ export async function embeddingEndpointHfApi(
|
|
32 |
"Content-Type": "application/json",
|
33 |
...(authorization ? { Authorization: authorization } : {}),
|
34 |
},
|
35 |
-
body: JSON.stringify({
|
|
|
|
|
|
|
|
|
|
|
36 |
});
|
37 |
|
38 |
if (!response.ok) {
|
|
|
32 |
"Content-Type": "application/json",
|
33 |
...(authorization ? { Authorization: authorization } : {}),
|
34 |
},
|
35 |
+
body: JSON.stringify({
|
36 |
+
inputs: {
|
37 |
+
source_sentence: batchInputs[0],
|
38 |
+
sentences: batchInputs.slice(1),
|
39 |
+
},
|
40 |
+
}),
|
41 |
});
|
42 |
|
43 |
if (!response.ok) {
|
@@ -1,26 +1,38 @@
|
|
1 |
import { Address6, Address4 } from "ip-address";
|
2 |
-
|
3 |
import dns from "node:dns";
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
dns.lookup(
|
8 |
-
if (err)
|
9 |
-
|
10 |
-
}
|
11 |
-
if (family === 4) {
|
12 |
-
const addr = new Address4(address);
|
13 |
-
resolve(addr.isInSubnet(new Address4("127.0.0.0/8")));
|
14 |
-
} else if (family === 6) {
|
15 |
-
const addr = new Address6(address);
|
16 |
-
resolve(
|
17 |
-
addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal()
|
18 |
-
);
|
19 |
-
} else {
|
20 |
-
reject(new Error("Unknown IP family"));
|
21 |
-
}
|
22 |
});
|
23 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
}
|
|
|
1 |
import { Address6, Address4 } from "ip-address";
|
|
|
2 |
import dns from "node:dns";
|
3 |
|
4 |
+
const dnsLookup = (hostname: string): Promise<{ address: string; family: number }> => {
|
5 |
+
return new Promise((resolve, reject) => {
|
6 |
+
dns.lookup(hostname, (err, address, family) => {
|
7 |
+
if (err) return reject(err);
|
8 |
+
resolve({ address, family });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
});
|
10 |
});
|
11 |
+
};
|
12 |
+
|
13 |
+
export async function isURLLocal(URL: URL): Promise<boolean> {
|
14 |
+
const { address, family } = await dnsLookup(URL.hostname);
|
15 |
+
|
16 |
+
if (family === 4) {
|
17 |
+
const addr = new Address4(address);
|
18 |
+
const localSubnet = new Address4("127.0.0.0/8");
|
19 |
+
return addr.isInSubnet(localSubnet);
|
20 |
+
}
|
21 |
+
|
22 |
+
if (family === 6) {
|
23 |
+
const addr = new Address6(address);
|
24 |
+
return addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal();
|
25 |
+
}
|
26 |
+
|
27 |
+
throw Error("Unknown IP family");
|
28 |
+
}
|
29 |
|
30 |
+
export function isURLStringLocal(url: string) {
|
31 |
+
try {
|
32 |
+
const urlObj = new URL(url);
|
33 |
+
return isURLLocal(urlObj);
|
34 |
+
} catch (e) {
|
35 |
+
// assume local if URL parsing fails
|
36 |
+
return true;
|
37 |
+
}
|
38 |
}
|
@@ -13,11 +13,9 @@ export async function preprocessMessages(
|
|
13 |
return await Promise.all(
|
14 |
structuredClone(messages).map(async (message, idx) => {
|
15 |
const webSearchContext = webSearch?.contextSources
|
16 |
-
.map(({ context }) => context)
|
17 |
-
.
|
18 |
-
|
19 |
-
.map(({ text }) => text)
|
20 |
-
.join(" ");
|
21 |
// start by adding websearch to the last message
|
22 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
23 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
@@ -27,7 +25,7 @@ export async function preprocessMessages(
|
|
27 |
.map((el) => el.content);
|
28 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
29 |
|
30 |
-
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
31 |
Today is ${currentDate} and here are the results:
|
32 |
=====================
|
33 |
${webSearchContext}
|
|
|
13 |
return await Promise.all(
|
14 |
structuredClone(messages).map(async (message, idx) => {
|
15 |
const webSearchContext = webSearch?.contextSources
|
16 |
+
.map(({ context }) => context.trim())
|
17 |
+
.join("\n\n----------\n\n");
|
18 |
+
|
|
|
|
|
19 |
// start by adding websearch to the last message
|
20 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
21 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
|
|
25 |
.map((el) => el.content);
|
26 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
27 |
|
28 |
+
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
29 |
Today is ${currentDate} and here are the results:
|
30 |
=====================
|
31 |
${webSearchContext}
|
@@ -3,40 +3,31 @@ import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
|
3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
4 |
|
5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
6 |
-
function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
7 |
return 1.0 - dot(embeddingA, embeddingB);
|
8 |
}
|
9 |
|
10 |
-
export async function
|
11 |
embeddingModel: EmbeddingBackendModel,
|
12 |
query: string,
|
13 |
-
sentences: string[]
|
14 |
-
|
15 |
-
): Promise<Embedding> {
|
16 |
const inputs = [
|
17 |
`${embeddingModel.preQuery}${query}`,
|
18 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
19 |
];
|
20 |
|
21 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
22 |
-
const output = await embeddingEndpoint({ inputs })
|
|
|
|
|
23 |
|
24 |
const queryEmbedding: Embedding = output[0];
|
25 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
};
|
33 |
-
}
|
34 |
-
);
|
35 |
-
|
36 |
-
distancesFromQuery.sort((a, b) => {
|
37 |
-
return a.distance - b.distance;
|
38 |
-
});
|
39 |
-
|
40 |
-
// Return the indexes of the closest topK sentences
|
41 |
-
return distancesFromQuery.slice(0, topK).map((item) => item.index);
|
42 |
}
|
|
|
3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
4 |
|
5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
6 |
+
export function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
7 |
return 1.0 - dot(embeddingA, embeddingB);
|
8 |
}
|
9 |
|
10 |
+
export async function getSentenceSimilarity(
|
11 |
embeddingModel: EmbeddingBackendModel,
|
12 |
query: string,
|
13 |
+
sentences: string[]
|
14 |
+
): Promise<{ distance: number; embedding: Embedding; idx: number }[]> {
|
|
|
15 |
const inputs = [
|
16 |
`${embeddingModel.preQuery}${query}`,
|
17 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
18 |
];
|
19 |
|
20 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
21 |
+
const output = await embeddingEndpoint({ inputs }).catch((err) => {
|
22 |
+
throw Error("Failed to generate embeddings for sentence similarity", { cause: err });
|
23 |
+
});
|
24 |
|
25 |
const queryEmbedding: Embedding = output[0];
|
26 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
27 |
|
28 |
+
return sentencesEmbeddings.map((sentenceEmbedding, idx) => ({
|
29 |
+
distance: innerProduct(queryEmbedding, sentenceEmbedding),
|
30 |
+
embedding: sentenceEmbedding,
|
31 |
+
idx,
|
32 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
}
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
2 |
+
import { getSentenceSimilarity } from "$lib/server/sentenceSimilarity";
|
3 |
+
|
4 |
+
/**
|
5 |
+
* Combines sentences together to reach the maximum character limit of the embedding model
|
6 |
+
* Improves performance considerably when using CPU embedding
|
7 |
+
*/
|
8 |
+
export async function getCombinedSentenceSimilarity(
|
9 |
+
embeddingModel: EmbeddingBackendModel,
|
10 |
+
query: string,
|
11 |
+
sentences: string[]
|
12 |
+
): ReturnType<typeof getSentenceSimilarity> {
|
13 |
+
const combinedSentences = sentences.reduce<{ text: string; indices: number[] }[]>(
|
14 |
+
(acc, sentence, idx) => {
|
15 |
+
const lastSentence = acc[acc.length - 1];
|
16 |
+
if (!lastSentence) return [{ text: sentence, indices: [idx] }];
|
17 |
+
if (lastSentence.text.length + sentence.length < embeddingModel.chunkCharLength) {
|
18 |
+
lastSentence.text += ` ${sentence}`;
|
19 |
+
lastSentence.indices.push(idx);
|
20 |
+
return acc;
|
21 |
+
}
|
22 |
+
return [...acc, { text: sentence, indices: [idx] }];
|
23 |
+
},
|
24 |
+
[]
|
25 |
+
);
|
26 |
+
|
27 |
+
const embeddings = await getSentenceSimilarity(
|
28 |
+
embeddingModel,
|
29 |
+
query,
|
30 |
+
combinedSentences.map(({ text }) => text)
|
31 |
+
);
|
32 |
+
|
33 |
+
return embeddings.flatMap((embedding, idx) => {
|
34 |
+
const { indices } = combinedSentences[idx];
|
35 |
+
return indices.map((i) => ({ ...embedding, idx: i }));
|
36 |
+
});
|
37 |
+
}
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
|
2 |
+
import type { EmbeddingBackendModel } from "../../embeddingModels";
|
3 |
+
import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
|
4 |
+
import { MarkdownElementType, type MarkdownElement } from "../markdown/types";
|
5 |
+
import { stringifyMarkdownElement } from "../markdown/utils/stringify";
|
6 |
+
import { getCombinedSentenceSimilarity } from "./combine";
|
7 |
+
import { flattenTree } from "./tree";
|
8 |
+
|
9 |
+
const MIN_CHARS = 3_000;
|
10 |
+
const SOFT_MAX_CHARS = 8_000;
|
11 |
+
|
12 |
+
export async function findContextSources(
|
13 |
+
sources: WebSearchScrapedSource[],
|
14 |
+
prompt: string,
|
15 |
+
embeddingModel: EmbeddingBackendModel
|
16 |
+
) {
|
17 |
+
const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
|
18 |
+
const markdownElems = sourcesMarkdownElems.flat();
|
19 |
+
|
20 |
+
// When using CPU embedding (transformersjs), join sentences together to the max character limit
|
21 |
+
// to reduce inference time
|
22 |
+
const embeddingFunc =
|
23 |
+
embeddingModel.endpoints[0].type === "transformersjs"
|
24 |
+
? getCombinedSentenceSimilarity
|
25 |
+
: getSentenceSimilarity;
|
26 |
+
|
27 |
+
const embeddings = await embeddingFunc(
|
28 |
+
embeddingModel,
|
29 |
+
prompt,
|
30 |
+
markdownElems
|
31 |
+
.map(stringifyMarkdownElement)
|
32 |
+
// Safety in case the stringified markdown elements are too long
|
33 |
+
// but chunking should have happened earlier
|
34 |
+
.map((elem) => elem.slice(0, embeddingModel.chunkCharLength))
|
35 |
+
);
|
36 |
+
|
37 |
+
const topEmbeddings = embeddings
|
38 |
+
.sort((a, b) => a.distance - b.distance)
|
39 |
+
.filter((embedding) => markdownElems[embedding.idx].type !== MarkdownElementType.Header);
|
40 |
+
|
41 |
+
let totalChars = 0;
|
42 |
+
const selectedMarkdownElems = new Set<MarkdownElement>();
|
43 |
+
const selectedEmbeddings: number[][] = [];
|
44 |
+
for (const embedding of topEmbeddings) {
|
45 |
+
const elem = markdownElems[embedding.idx];
|
46 |
+
|
47 |
+
// Ignore elements that are too similar to already selected elements
|
48 |
+
const tooSimilar = selectedEmbeddings.some(
|
49 |
+
(selectedEmbedding) => innerProduct(selectedEmbedding, embedding.embedding) < 0.01
|
50 |
+
);
|
51 |
+
if (tooSimilar) continue;
|
52 |
+
|
53 |
+
// Add element
|
54 |
+
if (!selectedMarkdownElems.has(elem)) {
|
55 |
+
selectedMarkdownElems.add(elem);
|
56 |
+
selectedEmbeddings.push(embedding.embedding);
|
57 |
+
totalChars += elem.content.length;
|
58 |
+
}
|
59 |
+
|
60 |
+
// Add element's parent (header)
|
61 |
+
if (elem.parent && !selectedMarkdownElems.has(elem.parent)) {
|
62 |
+
selectedMarkdownElems.add(elem.parent);
|
63 |
+
totalChars += elem.parent.content.length;
|
64 |
+
}
|
65 |
+
|
66 |
+
if (totalChars > SOFT_MAX_CHARS) break;
|
67 |
+
if (totalChars > MIN_CHARS && embedding.distance > 0.25) break;
|
68 |
+
}
|
69 |
+
|
70 |
+
const contextSources = sourcesMarkdownElems
|
71 |
+
.map<WebSearchUsedSource>((elems, idx) => {
|
72 |
+
const sourceSelectedElems = elems.filter((elem) => selectedMarkdownElems.has(elem));
|
73 |
+
const context = sourceSelectedElems.map(stringifyMarkdownElement).join("\n");
|
74 |
+
const source = sources[idx];
|
75 |
+
return { ...source, context };
|
76 |
+
})
|
77 |
+
.filter((contextSource) => contextSource.context.length > 0);
|
78 |
+
|
79 |
+
return contextSources;
|
80 |
+
}
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { MarkdownElement } from "../markdown/types";
|
2 |
+
|
3 |
+
export function flattenTree(elem: MarkdownElement): MarkdownElement[] {
|
4 |
+
if ("children" in elem) return [elem, ...elem.children.flatMap(flattenTree)];
|
5 |
+
return [elem];
|
6 |
+
}
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { collapseString, sanitizeString } from "./utils/nlp";
|
2 |
+
import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify";
|
3 |
+
import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types";
|
4 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
5 |
+
|
6 |
+
interface ConversionState {
|
7 |
+
defaultType:
|
8 |
+
| MarkdownElementType.Paragraph
|
9 |
+
| MarkdownElementType.BlockQuote
|
10 |
+
| MarkdownElementType.UnorderedListItem
|
11 |
+
| MarkdownElementType.OrderedListItem;
|
12 |
+
listDepth: number;
|
13 |
+
blockQuoteDepth: number;
|
14 |
+
}
|
15 |
+
export function htmlElementToMarkdownElements(
|
16 |
+
parent: HeaderElement,
|
17 |
+
elem: SerializedHTMLElement | string,
|
18 |
+
prevState: ConversionState = {
|
19 |
+
defaultType: MarkdownElementType.Paragraph,
|
20 |
+
listDepth: 0,
|
21 |
+
blockQuoteDepth: 0,
|
22 |
+
}
|
23 |
+
): MarkdownElement | MarkdownElement[] {
|
24 |
+
// Found text so create an element based on the previous state
|
25 |
+
if (typeof elem === "string") {
|
26 |
+
if (elem.trim().length === 0) return [];
|
27 |
+
if (
|
28 |
+
prevState.defaultType === MarkdownElementType.UnorderedListItem ||
|
29 |
+
prevState.defaultType === MarkdownElementType.OrderedListItem
|
30 |
+
) {
|
31 |
+
return {
|
32 |
+
parent,
|
33 |
+
type: prevState.defaultType,
|
34 |
+
content: elem,
|
35 |
+
depth: prevState.listDepth,
|
36 |
+
};
|
37 |
+
}
|
38 |
+
if (prevState.defaultType === MarkdownElementType.BlockQuote) {
|
39 |
+
return {
|
40 |
+
parent,
|
41 |
+
type: prevState.defaultType,
|
42 |
+
content: elem,
|
43 |
+
depth: prevState.blockQuoteDepth,
|
44 |
+
};
|
45 |
+
}
|
46 |
+
return { parent, type: prevState.defaultType, content: elem };
|
47 |
+
}
|
48 |
+
|
49 |
+
const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;
|
50 |
+
|
51 |
+
// Update the state based on the current element
|
52 |
+
const state: ConversionState = { ...prevState };
|
53 |
+
if (type === MarkdownElementType.UnorderedList || type === MarkdownElementType.OrderedList) {
|
54 |
+
state.listDepth += 1;
|
55 |
+
state.defaultType =
|
56 |
+
type === MarkdownElementType.UnorderedList
|
57 |
+
? MarkdownElementType.UnorderedListItem
|
58 |
+
: MarkdownElementType.OrderedListItem;
|
59 |
+
}
|
60 |
+
if (type === MarkdownElementType.BlockQuote) {
|
61 |
+
state.defaultType = MarkdownElementType.BlockQuote;
|
62 |
+
state.blockQuoteDepth += 1;
|
63 |
+
}
|
64 |
+
|
65 |
+
// Headers
|
66 |
+
if (type === MarkdownElementType.Header) {
|
67 |
+
return {
|
68 |
+
parent,
|
69 |
+
type,
|
70 |
+
level: Number(elem.tagName[1]),
|
71 |
+
content: collapseString(stringifyHTMLElements(elem.content)),
|
72 |
+
children: [],
|
73 |
+
};
|
74 |
+
}
|
75 |
+
|
76 |
+
// Code blocks
|
77 |
+
if (type === MarkdownElementType.CodeBlock) {
|
78 |
+
return {
|
79 |
+
parent,
|
80 |
+
type,
|
81 |
+
content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
|
82 |
+
};
|
83 |
+
}
|
84 |
+
|
85 |
+
// Typical case, we want to flatten the DOM and only create elements when we see text
|
86 |
+
return elem.content.flatMap((el) => htmlElementToMarkdownElements(parent, el, state));
|
87 |
+
}
|
88 |
+
|
89 |
+
export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
|
90 |
+
return elements.reduce<MarkdownElement[]>((acc, elem) => {
|
91 |
+
const last = acc[acc.length - 1];
|
92 |
+
if (last && last.type === MarkdownElementType.Paragraph && last.type === elem.type) {
|
93 |
+
last.content += elem.content;
|
94 |
+
return acc;
|
95 |
+
}
|
96 |
+
return [...acc, elem];
|
97 |
+
}, []);
|
98 |
+
}
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
2 |
+
import { htmlElementToMarkdownElements, mergeAdjacentElements } from "./fromHtml";
|
3 |
+
import type { HeaderElement, MarkdownElement } from "./types";
|
4 |
+
import { MarkdownElementType } from "./types";
|
5 |
+
import { chunkElements } from "./utils/chunk";
|
6 |
+
|
7 |
+
/**
|
8 |
+
* Converts HTML elements to Markdown elements and creates a tree based on header tags
|
9 |
+
* For example: h1 [h2 [p p blockquote] h2 [h3 [...] ] ]
|
10 |
+
**/
|
11 |
+
export function htmlToMarkdownTree(
|
12 |
+
title: string,
|
13 |
+
htmlElements: SerializedHTMLElement[],
|
14 |
+
maxCharsPerElem: number
|
15 |
+
): HeaderElement {
|
16 |
+
let parent: HeaderElement = {
|
17 |
+
type: MarkdownElementType.Header,
|
18 |
+
level: 1,
|
19 |
+
parent: null,
|
20 |
+
content: title,
|
21 |
+
children: [],
|
22 |
+
};
|
23 |
+
|
24 |
+
const markdownElements = chunkElements(
|
25 |
+
mergeAdjacentElements(
|
26 |
+
htmlElements.flatMap((elem) => htmlElementToMarkdownElements(parent, elem))
|
27 |
+
),
|
28 |
+
maxCharsPerElem
|
29 |
+
);
|
30 |
+
|
31 |
+
for (const elem of markdownElements) {
|
32 |
+
if (elem.type !== MarkdownElementType.Header) {
|
33 |
+
elem.parent = parent;
|
34 |
+
parent.children.push(elem);
|
35 |
+
continue;
|
36 |
+
}
|
37 |
+
|
38 |
+
// add 1 to current level to offset for the title being level 1
|
39 |
+
elem.level += 1;
|
40 |
+
|
41 |
+
// Pop up header levels until reaching the same level as the current header
|
42 |
+
// or until we reach the root
|
43 |
+
inner: while (parent !== null && parent.parent !== null) {
|
44 |
+
if (parent.level < elem.level) break inner;
|
45 |
+
parent = parent.parent;
|
46 |
+
}
|
47 |
+
parent.children.push(elem);
|
48 |
+
parent = elem;
|
49 |
+
}
|
50 |
+
|
51 |
+
// Pop up to the root
|
52 |
+
while (parent.parent !== null) {
|
53 |
+
parent = parent.parent;
|
54 |
+
}
|
55 |
+
return parent;
|
56 |
+
}
|
57 |
+
|
58 |
+
export function removeParents<T extends MarkdownElement>(elem: T): T {
|
59 |
+
if ("children" in elem) {
|
60 |
+
return { ...elem, parent: null, children: elem.children.map((child) => removeParents(child)) };
|
61 |
+
}
|
62 |
+
return { ...elem, parent: null };
|
63 |
+
}
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* eslint-disable-next-line no-shadow */
|
2 |
+
export enum MarkdownElementType {
|
3 |
+
Header = "HEADER",
|
4 |
+
Paragraph = "PARAGRAPH",
|
5 |
+
BlockQuote = "BLOCKQUOTE",
|
6 |
+
CodeBlock = "CODE_BLOCK",
|
7 |
+
|
8 |
+
UnorderedList = "UNORDERED_LIST",
|
9 |
+
OrderedList = "ORDERED_LIST",
|
10 |
+
UnorderedListItem = "UNORDERED_LIST_ITEM",
|
11 |
+
OrderedListItem = "ORDERED_LIST_ITEM",
|
12 |
+
}
|
13 |
+
|
14 |
+
interface BaseMarkdownElement<T = MarkdownElementType> {
|
15 |
+
type: T;
|
16 |
+
content: string;
|
17 |
+
parent: HeaderElement | null;
|
18 |
+
}
|
19 |
+
|
20 |
+
export interface HeaderElement extends BaseMarkdownElement<MarkdownElementType.Header> {
|
21 |
+
level: number;
|
22 |
+
children: MarkdownElement[];
|
23 |
+
}
|
24 |
+
type ListItem = MarkdownElementType.UnorderedListItem | MarkdownElementType.OrderedListItem;
|
25 |
+
interface ListItemElement extends BaseMarkdownElement<ListItem> {
|
26 |
+
depth: number;
|
27 |
+
}
|
28 |
+
interface BlockQuoteElement extends BaseMarkdownElement<MarkdownElementType.BlockQuote> {
|
29 |
+
depth: number;
|
30 |
+
}
|
31 |
+
interface ParagraphElement extends BaseMarkdownElement<MarkdownElementType.Paragraph> {}
|
32 |
+
interface CodeBlockElement extends BaseMarkdownElement<MarkdownElementType.CodeBlock> {}
|
33 |
+
|
34 |
+
export type MarkdownElement =
|
35 |
+
| HeaderElement
|
36 |
+
| ParagraphElement
|
37 |
+
| BlockQuoteElement
|
38 |
+
| CodeBlockElement
|
39 |
+
| ListItemElement;
|
40 |
+
|
41 |
+
export const tagNameMap: Record<string, MarkdownElementType> = {
|
42 |
+
h1: MarkdownElementType.Header,
|
43 |
+
h2: MarkdownElementType.Header,
|
44 |
+
h3: MarkdownElementType.Header,
|
45 |
+
h4: MarkdownElementType.Header,
|
46 |
+
h5: MarkdownElementType.Header,
|
47 |
+
h6: MarkdownElementType.Header,
|
48 |
+
div: MarkdownElementType.Paragraph,
|
49 |
+
p: MarkdownElementType.Paragraph,
|
50 |
+
blockquote: MarkdownElementType.BlockQuote,
|
51 |
+
pre: MarkdownElementType.CodeBlock,
|
52 |
+
ul: MarkdownElementType.UnorderedList,
|
53 |
+
ol: MarkdownElementType.OrderedList,
|
54 |
+
li: MarkdownElementType.UnorderedListItem,
|
55 |
+
};
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { sentences as splitBySentences } from "sbd";
|
2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
3 |
+
|
4 |
+
export function chunkElements(elements: MarkdownElement[], maxLength: number): MarkdownElement[] {
|
5 |
+
return elements.flatMap((elem) => {
|
6 |
+
// Can't split headers because it would break the tree, and this situation should be rare
|
7 |
+
// so we just cut off the end
|
8 |
+
if (elem.type === MarkdownElementType.Header) {
|
9 |
+
return { ...elem, content: elem.content.slice(0, maxLength) };
|
10 |
+
}
|
11 |
+
const contentChunks = enforceMaxLength(elem.content, maxLength);
|
12 |
+
return contentChunks.map<MarkdownElement>((content) => ({ ...elem, content }));
|
13 |
+
});
|
14 |
+
}
|
15 |
+
|
16 |
+
const delimitersByPriority = ["?", "!", ".", ";", ":", ",", "|", " - ", " ", "-"];
|
17 |
+
function enforceMaxLength(text: string, maxLength: number): string[] {
|
18 |
+
if (text.length <= maxLength) return [text].filter(Boolean);
|
19 |
+
return splitBySentences(text)
|
20 |
+
.flatMap((sentence) => {
|
21 |
+
if (sentence.length <= maxLength) return sentence;
|
22 |
+
|
23 |
+
// Discover all necessary split points to fit the sentence within the max length
|
24 |
+
const indices: [number, number][] = [];
|
25 |
+
while ((indices.at(-1)?.[1] ?? 0) < sentence.length) {
|
26 |
+
const prevIndex = indices.at(-1)?.[1] ?? 0;
|
27 |
+
|
28 |
+
// Remaining text fits within maxLength
|
29 |
+
if (prevIndex + maxLength >= sentence.length) {
|
30 |
+
indices.push([prevIndex, sentence.length]);
|
31 |
+
continue;
|
32 |
+
}
|
33 |
+
|
34 |
+
const bestDelimiter = delimitersByPriority.find(
|
35 |
+
(delimiter) => sentence.lastIndexOf(delimiter, prevIndex + maxLength) !== -1
|
36 |
+
);
|
37 |
+
// Fallback in the unusual case that no delimiter is found
|
38 |
+
if (!bestDelimiter) {
|
39 |
+
indices.push([prevIndex, prevIndex + maxLength]);
|
40 |
+
continue;
|
41 |
+
}
|
42 |
+
|
43 |
+
const closestDelimiter = sentence.lastIndexOf(bestDelimiter, prevIndex + maxLength);
|
44 |
+
indices.push([prevIndex, Math.max(prevIndex + 1, closestDelimiter)]);
|
45 |
+
}
|
46 |
+
|
47 |
+
return indices.map((sliceIndices) => sentence.slice(...sliceIndices));
|
48 |
+
})
|
49 |
+
.reduce<string[]>(
|
50 |
+
(chunks, sentence) => {
|
51 |
+
const lastChunk = chunks[chunks.length - 1];
|
52 |
+
if (lastChunk.length + sentence.length <= maxLength) {
|
53 |
+
return [...chunks.slice(0, -1), lastChunk + sentence];
|
54 |
+
}
|
55 |
+
return [...chunks, sentence];
|
56 |
+
},
|
57 |
+
[""]
|
58 |
+
)
|
59 |
+
.filter(Boolean);
|
60 |
+
}
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/** Remove excess whitespace and newlines */
|
2 |
+
export const sanitizeString = (str: string) =>
|
3 |
+
str
|
4 |
+
.split("\n")
|
5 |
+
.map((s) => s.trim())
|
6 |
+
.filter(Boolean)
|
7 |
+
.join("\n")
|
8 |
+
.replaceAll(/ +/g, " ");
|
9 |
+
|
10 |
+
/** Collapses a string into a single line */
|
11 |
+
export const collapseString = (str: string) => sanitizeString(str.replaceAll(/\n/g, " "));
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "../../scrape/types";
|
2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
3 |
+
|
4 |
+
// --- Markdown Elements ---
|
5 |
+
|
6 |
+
/** Converts markdown element to a string with formatting */
|
7 |
+
export function stringifyMarkdownElement(elem: MarkdownElement): string {
|
8 |
+
const content = elem.content.trim();
|
9 |
+
if (elem.type === MarkdownElementType.Header) return `${"#".repeat(elem.level)} ${content}\n\n`;
|
10 |
+
if (elem.type === MarkdownElementType.BlockQuote) {
|
11 |
+
return `${"> ".repeat(elem.depth)}${content}\n\n`;
|
12 |
+
}
|
13 |
+
if (elem.type === MarkdownElementType.CodeBlock) return `\`\`\`\n${content}\n\`\`\`\n\n`;
|
14 |
+
|
15 |
+
if (elem.type === MarkdownElementType.UnorderedListItem) return `- ${content}\n`;
|
16 |
+
if (elem.type === MarkdownElementType.OrderedListItem) {
|
17 |
+
const siblings = elem.parent?.children ?? [elem];
|
18 |
+
const currentIndex = siblings.indexOf(elem);
|
19 |
+
const lastAdjacentIndex = siblings
|
20 |
+
.slice(currentIndex + 1)
|
21 |
+
.findLastIndex((child) => child.type === MarkdownElementType.OrderedListItem);
|
22 |
+
const order = currentIndex - lastAdjacentIndex + 1;
|
23 |
+
return `${order}. ${content}\n`;
|
24 |
+
}
|
25 |
+
|
26 |
+
return `${content}\n\n`;
|
27 |
+
}
|
28 |
+
|
29 |
+
// ----- HTML Elements -----
|
30 |
+
|
31 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
32 |
+
export function stringifyHTMLElements(elems: (SerializedHTMLElement | string)[]): string {
|
33 |
+
return elems.map(stringifyHTMLElement).join("").trim();
|
34 |
+
}
|
35 |
+
|
36 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
37 |
+
export function stringifyHTMLElement(elem: SerializedHTMLElement | string): string {
|
38 |
+
if (typeof elem === "string") return elem;
|
39 |
+
if (elem.tagName === "br") return "\n";
|
40 |
+
|
41 |
+
const content = elem.content.map(stringifyHTMLElement).join("");
|
42 |
+
if (content.length === 0) return content;
|
43 |
+
|
44 |
+
if (elem.tagName === "strong" || elem.tagName === "b") return `**${content}**`;
|
45 |
+
if (elem.tagName === "em" || elem.tagName === "i") return `*${content}*`;
|
46 |
+
if (elem.tagName === "s" || elem.tagName === "strike") return `~~${content}~~`;
|
47 |
+
|
48 |
+
if (elem.tagName === "code" || elem.tagName === "var" || elem.tagName === "tt") {
|
49 |
+
return `\`${content}\``;
|
50 |
+
}
|
51 |
+
|
52 |
+
if (elem.tagName === "sup") return `<sup>${content}</sup>`;
|
53 |
+
if (elem.tagName === "sub") return `<sub>${content}</sub>`;
|
54 |
+
|
55 |
+
if (elem.tagName === "a" && content.trim().length > 0) {
|
56 |
+
const href = elem.attributes.href;
|
57 |
+
if (!href) return elem.content.map(stringifyHTMLElement).join("");
|
58 |
+
return `[${elem.content.map(stringifyHTMLElement).join("")}](${href})`;
|
59 |
+
}
|
60 |
+
|
61 |
+
return elem.content.map(stringifyHTMLElement).join("");
|
62 |
+
}
|
63 |
+
|
64 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
65 |
+
export function stringifyHTMLElementsUnformatted(
|
66 |
+
elems: (SerializedHTMLElement | string)[]
|
67 |
+
): string {
|
68 |
+
return elems.map(stringifyHTMLElementUnformatted).join("");
|
69 |
+
}
|
70 |
+
|
71 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
72 |
+
function stringifyHTMLElementUnformatted(elem: SerializedHTMLElement | string): string {
|
73 |
+
if (typeof elem === "string") return elem;
|
74 |
+
return elem.content.map(stringifyHTMLElementUnformatted).join("");
|
75 |
+
}
|
@@ -1,41 +0,0 @@
|
|
1 |
-
import { JSDOM, VirtualConsole } from "jsdom";
|
2 |
-
|
3 |
-
export async function parseWeb(url: string) {
|
4 |
-
const abortController = new AbortController();
|
5 |
-
setTimeout(() => abortController.abort(), 10000);
|
6 |
-
const r = await fetch(url, { signal: abortController.signal, credentials: "omit" }).catch();
|
7 |
-
|
8 |
-
if (r.headers.get("content-type")?.includes("text/html")) {
|
9 |
-
const virtualConsole = new VirtualConsole();
|
10 |
-
virtualConsole.on("error", () => {
|
11 |
-
// No-op to skip console errors.
|
12 |
-
});
|
13 |
-
|
14 |
-
// put the html string into a DOM
|
15 |
-
const dom = new JSDOM((await r.text()) ?? "", {
|
16 |
-
virtualConsole,
|
17 |
-
});
|
18 |
-
|
19 |
-
const { document } = dom.window;
|
20 |
-
const paragraphs = document.querySelectorAll("p, table, pre, ul, ol");
|
21 |
-
|
22 |
-
if (!paragraphs.length) {
|
23 |
-
throw new Error(`webpage doesn't have any parseable element`);
|
24 |
-
}
|
25 |
-
const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
|
26 |
-
|
27 |
-
// combine text contents from paragraphs and then remove newlines and multiple spaces
|
28 |
-
const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
|
29 |
-
|
30 |
-
return text;
|
31 |
-
} else if (
|
32 |
-
r.headers.get("content-type")?.includes("text/plain") ||
|
33 |
-
r.headers.get("content-type")?.includes("text/markdown")
|
34 |
-
) {
|
35 |
-
const text = await r.text();
|
36 |
-
// JSON.stringify is needed to turn string concatenation into a single string (ex: "Hello, " + "world!" -> "Hello, world!")
|
37 |
-
return JSON.stringify(text);
|
38 |
-
} else {
|
39 |
-
throw new Error("Unsupported content type");
|
40 |
-
}
|
41 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,179 +1,103 @@
|
|
1 |
-
import { searchWeb } from "$lib/server/websearch/searchWeb";
|
2 |
-
import { generateQuery } from "$lib/server/websearch/generateQuery";
|
3 |
-
import { parseWeb } from "$lib/server/websearch/parseWeb";
|
4 |
-
import { chunk } from "$lib/utils/chunk";
|
5 |
-
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
|
6 |
-
import { getWebSearchProvider } from "./searchWeb";
|
7 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
8 |
-
import { env } from "$env/dynamic/private";
|
9 |
|
10 |
import type { Conversation } from "$lib/types/Conversation";
|
11 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
12 |
import type { Message } from "$lib/types/Message";
|
13 |
-
import type { WebSearch,
|
14 |
import type { Assistant } from "$lib/types/Assistant";
|
15 |
|
16 |
-
import {
|
17 |
-
import
|
18 |
-
import {
|
|
|
19 |
|
20 |
-
const
|
21 |
-
const
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
27 |
|
28 |
export async function runWebSearch(
|
29 |
conv: Conversation,
|
30 |
messages: Message[],
|
31 |
updatePad: (upd: MessageUpdate) => void,
|
32 |
ragSettings?: Assistant["rag"]
|
33 |
-
) {
|
34 |
const prompt = messages[messages.length - 1].content;
|
35 |
-
const
|
36 |
-
|
37 |
-
|
38 |
-
results: [],
|
39 |
-
contextSources: [],
|
40 |
-
createdAt: new Date(),
|
41 |
-
updatedAt: new Date(),
|
42 |
-
};
|
43 |
-
|
44 |
-
function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
|
45 |
-
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
46 |
-
}
|
47 |
|
48 |
try {
|
49 |
-
// if the assistant specified direct links, skip the websearch
|
50 |
-
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
51 |
-
appendUpdate("Using links specified in Assistant");
|
52 |
-
|
53 |
-
let linksToUse = [...ragSettings.allowedLinks];
|
54 |
-
|
55 |
-
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
56 |
-
const localLinks = await Promise.all(
|
57 |
-
linksToUse.map(async (link) => {
|
58 |
-
try {
|
59 |
-
const url = new URL(link);
|
60 |
-
return await isURLLocal(url);
|
61 |
-
} catch (e) {
|
62 |
-
return true;
|
63 |
-
}
|
64 |
-
})
|
65 |
-
);
|
66 |
-
|
67 |
-
linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
|
68 |
-
}
|
69 |
-
|
70 |
-
webSearch.results = linksToUse.map((link) => {
|
71 |
-
return { link, hostname: new URL(link).hostname, title: "", text: "" };
|
72 |
-
});
|
73 |
-
} else {
|
74 |
-
webSearch.searchQuery = await generateQuery(messages);
|
75 |
-
const searchProvider = getWebSearchProvider();
|
76 |
-
appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
|
77 |
-
|
78 |
-
let filters = "";
|
79 |
-
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
80 |
-
appendUpdate("Filtering on specified domains");
|
81 |
-
filters += ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR ");
|
82 |
-
}
|
83 |
-
|
84 |
-
// handle the global lists
|
85 |
-
filters +=
|
86 |
-
allowList.map((item) => "site:" + item).join(" OR ") +
|
87 |
-
" " +
|
88 |
-
blockList.map((item) => "-site:" + item).join(" ");
|
89 |
-
|
90 |
-
webSearch.searchQuery = filters + " " + webSearch.searchQuery;
|
91 |
-
|
92 |
-
const results = await searchWeb(webSearch.searchQuery);
|
93 |
-
webSearch.results =
|
94 |
-
(results.organic_results &&
|
95 |
-
results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
|
96 |
-
try {
|
97 |
-
const { title, link, text } = el;
|
98 |
-
const { hostname } = new URL(link);
|
99 |
-
return { title, link, hostname, text };
|
100 |
-
} catch (e) {
|
101 |
-
// Ignore Errors
|
102 |
-
return null;
|
103 |
-
}
|
104 |
-
})) ??
|
105 |
-
[];
|
106 |
-
}
|
107 |
-
|
108 |
-
webSearch.results = webSearch.results.filter((value) => value !== null);
|
109 |
-
webSearch.results = webSearch.results
|
110 |
-
.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
|
111 |
-
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
|
112 |
-
|
113 |
-
// fetch the model
|
114 |
const embeddingModel =
|
115 |
embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
|
116 |
-
|
117 |
if (!embeddingModel) {
|
118 |
-
throw
|
119 |
}
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
paragraphChunks = nestedParagraphChunks.flat();
|
142 |
-
if (!paragraphChunks.length) {
|
143 |
-
throw new Error("No text found on the first 5 results");
|
144 |
-
}
|
145 |
-
} else {
|
146 |
-
throw new Error("No results found for this search query");
|
147 |
}
|
148 |
|
|
|
149 |
appendUpdate("Extracting relevant information");
|
150 |
-
const
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
const { source } = paragraphChunks[idx];
|
158 |
-
const contextWithId = { idx, text: texts[idx] };
|
159 |
-
const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
|
160 |
-
if (usedSource) {
|
161 |
-
usedSource.context.push(contextWithId);
|
162 |
-
} else {
|
163 |
-
webSearch.contextSources.push({ ...source, context: [contextWithId] });
|
164 |
-
}
|
165 |
-
}
|
166 |
updatePad({
|
167 |
type: "webSearch",
|
168 |
messageType: "sources",
|
169 |
message: "sources",
|
170 |
-
sources:
|
171 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
} catch (searchError) {
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
-
|
178 |
-
return webSearch;
|
179 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
|
|
2 |
|
3 |
import type { Conversation } from "$lib/types/Conversation";
|
4 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
5 |
import type { Message } from "$lib/types/Message";
|
6 |
+
import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
|
7 |
import type { Assistant } from "$lib/types/Assistant";
|
8 |
|
9 |
+
import { search } from "./search/search";
|
10 |
+
import { scrape } from "./scrape/scrape";
|
11 |
+
import { findContextSources } from "./embed/embed";
|
12 |
+
import { removeParents } from "./markdown/tree";
|
13 |
|
14 |
+
const MAX_N_PAGES_TO_SCRAPE = 8 as const;
|
15 |
+
const MAX_N_PAGES_TO_EMBED = 5 as const;
|
16 |
|
17 |
+
export type AppendUpdate = (message: string, args?: string[], type?: "error" | "update") => void;
|
18 |
+
const makeAppendUpdate =
|
19 |
+
(updatePad: (upd: MessageUpdate) => void): AppendUpdate =>
|
20 |
+
(message, args, type) =>
|
21 |
+
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
22 |
|
23 |
/**
 * Runs the full web-search pipeline for the latest user message:
 * search -> scrape -> embed/rank, streaming progress through `updatePad`.
 *
 * Never rejects: on any failure it emits an "error" update and resolves to an
 * empty WebSearch object so the chat flow can continue without results.
 *
 * @param conv        conversation; `conv.embeddingModel` selects the embedder
 * @param messages    chat history; the last entry is used as the search prompt
 * @param updatePad   sink for incremental MessageUpdate progress events
 * @param ragSettings optional assistant RAG config forwarded to `search`
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
): Promise<WebSearch> {
	// The most recent message is the prompt the search is grounded on
	const prompt = messages[messages.length - 1].content;
	const createdAt = new Date();
	const updatedAt = new Date();
	const appendUpdate = makeAppendUpdate(updatePad);

	try {
		// Fall back to the default embedding model if the conversation's is gone
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		// Search the web
		const { searchQuery, pages } = await search(messages, ragSettings, appendUpdate);
		if (pages.length === 0) throw Error("No results found for this search query");

		// Scrape pages
		appendUpdate("Browsing search results");

		// Scrape up to MAX_N_PAGES_TO_SCRAPE pages in parallel, drop pages that
		// failed or produced an empty markdown tree, then keep at most
		// MAX_N_PAGES_TO_EMBED of them for the embedding step.
		const scrapedPages = await Promise.all(
			pages
				.slice(0, MAX_N_PAGES_TO_SCRAPE)
				.map(scrape(appendUpdate, embeddingModel.chunkCharLength))
		).then((allScrapedPages) =>
			allScrapedPages
				.filter((p): p is WebSearchScrapedSource => Boolean(p))
				.filter((p) => p.page.markdownTree.children.length > 0)
				.slice(0, MAX_N_PAGES_TO_EMBED)
		);

		if (!scrapedPages.length) {
			throw Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
		}

		// Chunk the text of each of the elements and find the most similar chunks to the prompt
		appendUpdate("Extracting relevant information");
		// removeParents drops the upward links from each markdown node —
		// presumably so the tree is acyclic/serializable; confirm in markdown/tree.ts
		const contextSources = await findContextSources(scrapedPages, prompt, embeddingModel).then(
			(ctxSources) =>
				ctxSources.map((source) => ({
					...source,
					page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
				}))
		);
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: contextSources,
		});

		return {
			prompt,
			searchQuery,
			results: scrapedPages.map(({ page, ...source }) => ({
				...source,
				page: { ...page, markdownTree: removeParents(page.markdownTree) },
			})),
			contextSources,
			createdAt,
			updatedAt,
		};
	} catch (searchError) {
		const message = searchError instanceof Error ? searchError.message : String(searchError);
		console.error(message);
		appendUpdate("An error occurred", [JSON.stringify(message)], "error");
		// Swallow the error and return an empty result so the caller keeps going
		return {
			prompt,
			searchQuery: "",
			results: [],
			contextSources: [],
			createdAt,
			updatedAt,
		};
	}
}
|
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "./types";
|
2 |
+
|
3 |
+
/** Configuration for the inlined DBSCAN clustering implementation below. */
interface DBSCANOptions<T> {
	// Points to cluster
	dataset: T[];
	// Neighborhood radius (the implementation defaults this to 1)
	epsilon?: number;
	// Predicate deciding whether a distance counts as "within epsilon"
	epsilonCompare?: (distance: number, epsilon: number) => boolean;
	// Minimum neighbor count for a point to seed/extend a cluster (defaults to 2)
	minimumPoints?: number;
	// Pairwise distance metric between two points
	distanceFunction: (a: T, b: T) => number;
}
|
10 |
+
|
11 |
+
export function spatialParser() {
|
12 |
+
/**
|
13 |
+
* Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
|
14 |
+
*/
|
15 |
+
const DBSCAN = <T>({
|
16 |
+
dataset,
|
17 |
+
epsilon = 1,
|
18 |
+
epsilonCompare = (dist, e) => dist < e,
|
19 |
+
minimumPoints = 2,
|
20 |
+
distanceFunction,
|
21 |
+
}: DBSCANOptions<T>) => {
|
22 |
+
const visitedIndices: Record<number, boolean> = {};
|
23 |
+
const isVisited = (i: number) => visitedIndices[i];
|
24 |
+
const markVisited = (i: number) => {
|
25 |
+
visitedIndices[i] = true;
|
26 |
+
};
|
27 |
+
|
28 |
+
const clusteredIndices: Record<number, boolean> = {};
|
29 |
+
const isClustered = (i: number) => clusteredIndices[i];
|
30 |
+
const markClustered = (i: number) => {
|
31 |
+
clusteredIndices[i] = true;
|
32 |
+
};
|
33 |
+
|
34 |
+
const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
|
35 |
+
for (let i = 0; i < sourceArray.length; i += 1) {
|
36 |
+
const item = sourceArray[i];
|
37 |
+
if (targetArray.indexOf(item) < 0) {
|
38 |
+
targetArray.push(item);
|
39 |
+
}
|
40 |
+
}
|
41 |
+
};
|
42 |
+
|
43 |
+
const findNeighbors = (index: number) => {
|
44 |
+
const neighbors = [];
|
45 |
+
for (let other = 0; other < dataset.length; other += 1) {
|
46 |
+
const distance = distanceFunction(dataset[index], dataset[other]);
|
47 |
+
if (epsilonCompare(distance, epsilon)) {
|
48 |
+
neighbors.push(other);
|
49 |
+
}
|
50 |
+
}
|
51 |
+
return neighbors;
|
52 |
+
};
|
53 |
+
|
54 |
+
const noise: number[] = [];
|
55 |
+
const addNoise = (i: number) => noise.push(i);
|
56 |
+
|
57 |
+
const clusters: number[][] = [];
|
58 |
+
const createCluster = () => clusters.push([]) - 1;
|
59 |
+
const addIndexToCluster = (c: number, i: number) => {
|
60 |
+
clusters[c].push(i);
|
61 |
+
markClustered(i);
|
62 |
+
};
|
63 |
+
|
64 |
+
const expandCluster = (c: number, neighbors: number[]) => {
|
65 |
+
for (let i = 0; i < neighbors.length; i += 1) {
|
66 |
+
const neighborIndex = neighbors[i];
|
67 |
+
if (!isVisited(neighborIndex)) {
|
68 |
+
markVisited(neighborIndex);
|
69 |
+
|
70 |
+
const secondaryNeighbors = findNeighbors(neighborIndex);
|
71 |
+
if (secondaryNeighbors.length >= minimumPoints) {
|
72 |
+
uniqueMerge(neighbors, secondaryNeighbors);
|
73 |
+
}
|
74 |
+
}
|
75 |
+
|
76 |
+
if (!isClustered(neighborIndex)) {
|
77 |
+
addIndexToCluster(c, neighborIndex);
|
78 |
+
}
|
79 |
+
}
|
80 |
+
};
|
81 |
+
|
82 |
+
dataset.forEach((_, index) => {
|
83 |
+
if (!isVisited(index)) {
|
84 |
+
markVisited(index);
|
85 |
+
|
86 |
+
const neighbors = findNeighbors(index);
|
87 |
+
if (neighbors.length < minimumPoints) {
|
88 |
+
addNoise(index);
|
89 |
+
} else {
|
90 |
+
const clusterIndex = createCluster();
|
91 |
+
addIndexToCluster(clusterIndex, index);
|
92 |
+
expandCluster(clusterIndex, neighbors);
|
93 |
+
}
|
94 |
+
}
|
95 |
+
});
|
96 |
+
|
97 |
+
return { clusters, noise };
|
98 |
+
};
|
99 |
+
|
100 |
+
// -----------
|
101 |
+
// Scraping implementation
|
102 |
+
|
103 |
+
const IgnoredTagsList = [
|
104 |
+
"footer",
|
105 |
+
"nav",
|
106 |
+
"aside",
|
107 |
+
"script",
|
108 |
+
"style",
|
109 |
+
"noscript",
|
110 |
+
"form",
|
111 |
+
"button",
|
112 |
+
];
|
113 |
+
const InlineTags = [
|
114 |
+
"a",
|
115 |
+
"abbrv",
|
116 |
+
"span",
|
117 |
+
"address",
|
118 |
+
"time",
|
119 |
+
"acronym",
|
120 |
+
"strong",
|
121 |
+
"b",
|
122 |
+
"br",
|
123 |
+
"sub",
|
124 |
+
"sup",
|
125 |
+
"tt",
|
126 |
+
"var",
|
127 |
+
"em",
|
128 |
+
"i",
|
129 |
+
];
|
130 |
+
|
131 |
+
type ReadableNode = HTMLElement;
|
132 |
+
type NodeWithRect = {
|
133 |
+
node: ReadableNode;
|
134 |
+
rect: DOMRect;
|
135 |
+
};
|
136 |
+
|
137 |
+
const isOnlyChild = (node: Node) => {
|
138 |
+
if (!node.parentElement) return true;
|
139 |
+
if (node.parentElement.nodeName === "body") return false;
|
140 |
+
if (node.parentElement.childNodes.length === 1) return true;
|
141 |
+
return false;
|
142 |
+
};
|
143 |
+
|
144 |
+
const hasValidInlineParent = (node: Node) => {
|
145 |
+
return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
|
146 |
+
};
|
147 |
+
|
148 |
+
const hasValidParent = (node: Node) => {
|
149 |
+
return node.parentElement && !node.parentElement.isSameNode(document.body);
|
150 |
+
};
|
151 |
+
|
152 |
+
const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
|
153 |
+
const possibleTableParents = Array.from(document.querySelectorAll("table"));
|
154 |
+
const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
|
155 |
+
/**
|
156 |
+
* We want to find the highest parent of text node in the cluster.
|
157 |
+
* For example in this case: <p><span>Text here</span></p>
|
158 |
+
* the P tag is highest parent.
|
159 |
+
*/
|
160 |
+
const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
|
161 |
+
// go up the tree until the parent is no longer an only child
|
162 |
+
let parent = node.parentElement;
|
163 |
+
// if the parent is an inline tag, then go up one more level
|
164 |
+
while (
|
165 |
+
parent &&
|
166 |
+
hasValidInlineParent(parent) &&
|
167 |
+
InlineTags.includes(parent?.tagName.toLowerCase())
|
168 |
+
) {
|
169 |
+
parent = parent.parentElement;
|
170 |
+
}
|
171 |
+
|
172 |
+
while (parent && isOnlyChild(parent)) {
|
173 |
+
if (!hasValidParent(parent)) break;
|
174 |
+
parent = parent.parentElement;
|
175 |
+
}
|
176 |
+
|
177 |
+
if (!parent) {
|
178 |
+
throw new Error(
|
179 |
+
"disconnected node found, this should not really be possible when traversing through the dom"
|
180 |
+
);
|
181 |
+
}
|
182 |
+
|
183 |
+
// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
|
184 |
+
if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
|
185 |
+
const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
186 |
+
if (hasParent) {
|
187 |
+
parent = hasParent;
|
188 |
+
}
|
189 |
+
}
|
190 |
+
|
191 |
+
// if the parent is a li tag check if there is a ul or ol tag above it
|
192 |
+
if (parent.nodeName.toLowerCase() === "li") {
|
193 |
+
const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
194 |
+
if (hasParent) {
|
195 |
+
parent = hasParent;
|
196 |
+
}
|
197 |
+
}
|
198 |
+
|
199 |
+
// if the parent is a td, th, tr tag check if there is a table tag above it
|
200 |
+
if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
|
201 |
+
const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
202 |
+
if (hasParent) {
|
203 |
+
parent = hasParent;
|
204 |
+
}
|
205 |
+
}
|
206 |
+
|
207 |
+
return parent;
|
208 |
+
};
|
209 |
+
const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));
|
210 |
+
|
211 |
+
const doesNodePassHeuristics = (node: Node) => {
|
212 |
+
if ((node.textContent ?? "").trim().length < 10) {
|
213 |
+
return false;
|
214 |
+
}
|
215 |
+
|
216 |
+
const parentNode = findHighestDirectParentOfReadableNode(node);
|
217 |
+
|
218 |
+
if (parentNode && parentNode instanceof Element) {
|
219 |
+
if (
|
220 |
+
!parentNode.checkVisibility({
|
221 |
+
checkOpacity: true,
|
222 |
+
checkVisibilityCSS: true,
|
223 |
+
})
|
224 |
+
)
|
225 |
+
return false;
|
226 |
+
|
227 |
+
const rect = parentNode.getBoundingClientRect();
|
228 |
+
// elements that are readable usually don't have really small height or width
|
229 |
+
if (rect.width < 4 || rect.height < 4) {
|
230 |
+
return false;
|
231 |
+
}
|
232 |
+
}
|
233 |
+
|
234 |
+
if (parentNode && parentNode instanceof Element) {
|
235 |
+
if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
|
236 |
+
return false;
|
237 |
+
}
|
238 |
+
}
|
239 |
+
|
240 |
+
return true;
|
241 |
+
};
|
242 |
+
|
243 |
+
const getAllReadableNodes = (): NodeWithRect[] => {
|
244 |
+
if (!document.body) throw new Error("Page failed to load");
|
245 |
+
const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
|
246 |
+
acceptNode(node) {
|
247 |
+
if (doesNodePassHeuristics(node)) {
|
248 |
+
return NodeFilter.FILTER_ACCEPT;
|
249 |
+
} else {
|
250 |
+
return NodeFilter.FILTER_SKIP;
|
251 |
+
}
|
252 |
+
},
|
253 |
+
});
|
254 |
+
|
255 |
+
const readableNodes = [];
|
256 |
+
|
257 |
+
while (treeWalker.nextNode()) {
|
258 |
+
readableNodes.push(treeWalker.currentNode as ReadableNode);
|
259 |
+
}
|
260 |
+
|
261 |
+
/*
|
262 |
+
* <table><p>hello</p><p>world</p></table>
|
263 |
+
* table is already included in the parent of the first p tag
|
264 |
+
*/
|
265 |
+
|
266 |
+
const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
|
267 |
+
const listWithOnlyParents: HTMLElement[] = [];
|
268 |
+
// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
|
269 |
+
for (let i = 0; i < parentsForReadableNodes.length; i++) {
|
270 |
+
const node = parentsForReadableNodes[i];
|
271 |
+
const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
|
272 |
+
if (i === idx) return false;
|
273 |
+
return otherNode.contains(node);
|
274 |
+
});
|
275 |
+
listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
|
276 |
+
}
|
277 |
+
|
278 |
+
const uniqueParents = Array.from(new Set(listWithOnlyParents));
|
279 |
+
|
280 |
+
return uniqueParents.map((node) => {
|
281 |
+
return {
|
282 |
+
node,
|
283 |
+
rect: node.getBoundingClientRect(),
|
284 |
+
};
|
285 |
+
});
|
286 |
+
};
|
287 |
+
|
288 |
+
const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
|
289 |
+
// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
|
290 |
+
// 1. rects are upright and not rotated
|
291 |
+
// 2. If two rects intersect, we assume distance to be 0
|
292 |
+
let dx = 0;
|
293 |
+
let dy = 0;
|
294 |
+
const rect1 = a.rect;
|
295 |
+
const rect2 = b.rect;
|
296 |
+
// Calculate the horizontal distance
|
297 |
+
if (rect1.x + rect1.width < rect2.x) {
|
298 |
+
dx = rect2.x - (rect1.x + rect1.width);
|
299 |
+
} else if (rect2.x + rect2.width < rect1.x) {
|
300 |
+
dx = rect1.x - (rect2.x + rect2.width);
|
301 |
+
}
|
302 |
+
|
303 |
+
// Calculate the vertical distance
|
304 |
+
if (rect1.y + rect1.height < rect2.y) {
|
305 |
+
dy = rect2.y - (rect1.y + rect1.height);
|
306 |
+
} else if (rect2.y + rect2.height < rect1.y) {
|
307 |
+
dy = rect1.y - (rect2.y + rect2.height);
|
308 |
+
}
|
309 |
+
|
310 |
+
const distance = Math.sqrt(dx * dx + dy * dy);
|
311 |
+
// Return the Euclidean distance
|
312 |
+
return distance;
|
313 |
+
};
|
314 |
+
/**
|
315 |
+
* Clusters nodes using dbscan
|
316 |
+
*/
|
317 |
+
const clusterReadableNodes = (nodes: NodeWithRect[]) => {
|
318 |
+
const { clusters } = DBSCAN({
|
319 |
+
dataset: nodes,
|
320 |
+
epsilon: 28,
|
321 |
+
minimumPoints: 1,
|
322 |
+
distanceFunction,
|
323 |
+
});
|
324 |
+
|
325 |
+
return clusters;
|
326 |
+
};
|
327 |
+
|
328 |
+
const totalTextLength = (cluster: number[]) => {
|
329 |
+
return cluster
|
330 |
+
.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}|\r\n|\n|\r/gm, ""))
|
331 |
+
.join("").length;
|
332 |
+
};
|
333 |
+
|
334 |
+
const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
|
335 |
+
return Math.abs(a - b) < epsilon;
|
336 |
+
};
|
337 |
+
|
338 |
+
const getClusterBounds = (cluster: number[]) => {
|
339 |
+
const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
|
340 |
+
const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
|
341 |
+
const rightMostPoint = Math.max(
|
342 |
+
...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
|
343 |
+
);
|
344 |
+
const bottomMostPoint = Math.max(
|
345 |
+
...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
|
346 |
+
);
|
347 |
+
return {
|
348 |
+
// left most element
|
349 |
+
x: leftMostPoint,
|
350 |
+
y: topMostPoint,
|
351 |
+
width: rightMostPoint - leftMostPoint,
|
352 |
+
height: bottomMostPoint - topMostPoint,
|
353 |
+
};
|
354 |
+
};
|
355 |
+
|
356 |
+
const round = (num: number, decimalPlaces = 2) => {
|
357 |
+
const factor = Math.pow(10, decimalPlaces);
|
358 |
+
return Math.round(num * factor) / factor;
|
359 |
+
};
|
360 |
+
|
361 |
+
/** minimum distance to center of the screen */
|
362 |
+
const clusterCentrality = (cluster: number[]) => {
|
363 |
+
const bounds = getClusterBounds(cluster);
|
364 |
+
const centerOfScreen = window.innerWidth / 2;
|
365 |
+
// the cluster contains the center of the screen
|
366 |
+
if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
|
367 |
+
return 0;
|
368 |
+
}
|
369 |
+
|
370 |
+
// the cluster is to the left of the screen
|
371 |
+
if (bounds.x + bounds.width < centerOfScreen) {
|
372 |
+
return centerOfScreen - (bounds.x + bounds.width);
|
373 |
+
}
|
374 |
+
|
375 |
+
// the cluster is to the right of the screen
|
376 |
+
return bounds.x - centerOfScreen;
|
377 |
+
};
|
378 |
+
/** measure of text share that belong to the cluster */
|
379 |
+
const percentageTextShare = (cluster: number[], totalLength: number) => {
|
380 |
+
// apply an exponentially increasing penalty for centrality per 100 pixels distance from center
|
381 |
+
|
382 |
+
return round((totalTextLength(cluster) / totalLength) * 100);
|
383 |
+
};
|
384 |
+
|
385 |
+
const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
|
386 |
+
const clusterABounds = getClusterBounds(clusterA);
|
387 |
+
const clusterBBounds = getClusterBounds(clusterB);
|
388 |
+
|
389 |
+
// A cluster is horizontally aligned if the x and width are roughly equal
|
390 |
+
const isHorizontallyAligned =
|
391 |
+
approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
|
392 |
+
approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);
|
393 |
+
|
394 |
+
if (!isHorizontallyAligned) return false;
|
395 |
+
|
396 |
+
// check the y gap between the clusters
|
397 |
+
const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
|
398 |
+
const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
|
399 |
+
const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);
|
400 |
+
|
401 |
+
if (approximatelyEqual(yGap, 0, 100)) return true;
|
402 |
+
};
|
403 |
+
|
404 |
+
const findCriticalClusters = (clusters: number[][]) => {
|
405 |
+
// merge the clusters that have similar widths and x position
|
406 |
+
|
407 |
+
let i = 0;
|
408 |
+
while (i < clusters.length) {
|
409 |
+
const cluster = clusters[i];
|
410 |
+
for (let j = i + 1; j < clusters.length; j++) {
|
411 |
+
const otherCluster = clusters[j];
|
412 |
+
if (shouldMergeClusters(cluster, otherCluster)) {
|
413 |
+
cluster.push(...otherCluster);
|
414 |
+
clusters.splice(j, 1);
|
415 |
+
j -= 1;
|
416 |
+
}
|
417 |
+
}
|
418 |
+
|
419 |
+
i++;
|
420 |
+
}
|
421 |
+
|
422 |
+
const totalText = totalTextLength(clusters.flat());
|
423 |
+
|
424 |
+
// sort in descending order of text share
|
425 |
+
const clusterWithMetrics = clusters.map((cluster) => {
|
426 |
+
const centrality = clusterCentrality(cluster);
|
427 |
+
return {
|
428 |
+
cluster,
|
429 |
+
centrality,
|
430 |
+
percentageTextShare: percentageTextShare(cluster, totalText),
|
431 |
+
};
|
432 |
+
});
|
433 |
+
|
434 |
+
// if there is a dominant cluster with more than 60% text share, return that
|
435 |
+
const dominantCluster = clusterWithMetrics[0].percentageTextShare > 60;
|
436 |
+
if (dominantCluster) return [clusterWithMetrics[0].cluster];
|
437 |
+
|
438 |
+
// clusters are sorted by text share after applying a penalty for centrality
|
439 |
+
const sortedClusters = clusterWithMetrics.sort((a, b) => {
|
440 |
+
const penaltyForA = Math.pow(0.9, a.centrality / 100);
|
441 |
+
const penaltyForB = Math.pow(0.9, b.centrality / 100);
|
442 |
+
const adjustedTextShareA = a.percentageTextShare * penaltyForA;
|
443 |
+
const adjustedTextShareB = b.percentageTextShare * penaltyForB;
|
444 |
+
|
445 |
+
return adjustedTextShareB - adjustedTextShareA;
|
446 |
+
});
|
447 |
+
|
448 |
+
// find all clusters that are similar to the largest cluster in terms of text share
|
449 |
+
// and see if they are enough to cover at least 60% of the text share
|
450 |
+
const largeTextShareClusters = sortedClusters.filter((c) =>
|
451 |
+
approximatelyEqual(c.percentageTextShare, sortedClusters[0].percentageTextShare, 10)
|
452 |
+
);
|
453 |
+
|
454 |
+
const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
|
455 |
+
(acc, cluster) => acc + cluster.percentageTextShare,
|
456 |
+
0
|
457 |
+
);
|
458 |
+
|
459 |
+
if (totalTextShareOfLargeClusters > 60) {
|
460 |
+
return largeTextShareClusters.map((c) => c.cluster);
|
461 |
+
}
|
462 |
+
|
463 |
+
// choose clusters till the text share is greater than 60%
|
464 |
+
let totalTextShare = 0;
|
465 |
+
const criticalClusters = [];
|
466 |
+
for (const cluster of sortedClusters) {
|
467 |
+
/** Ignore clusters with less than 2%*/
|
468 |
+
if (cluster.percentageTextShare < 2) continue;
|
469 |
+
if (totalTextShare > 60) break;
|
470 |
+
criticalClusters.push(cluster.cluster);
|
471 |
+
totalTextShare += cluster.percentageTextShare;
|
472 |
+
}
|
473 |
+
|
474 |
+
// if the total text share is less than 60% then return an empty array
|
475 |
+
// as this website should not be particularly useful for the web search anyways
|
476 |
+
// this should almost never happen on structured website with a lot of text
|
477 |
+
if (totalTextShare < 60) {
|
478 |
+
return [];
|
479 |
+
}
|
480 |
+
|
481 |
+
return criticalClusters;
|
482 |
+
};
|
483 |
+
|
484 |
+
const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
|
485 |
+
function serializeHTMLElement(node: Element): SerializedHTMLElement {
|
486 |
+
return {
|
487 |
+
tagName: node.tagName.toLowerCase(),
|
488 |
+
attributes: allowListedAttributes.reduce((acc, attr) => {
|
489 |
+
const value = node.getAttribute(attr);
|
490 |
+
if (value) {
|
491 |
+
acc[attr] = value;
|
492 |
+
}
|
493 |
+
return acc;
|
494 |
+
}, {} as Record<string, string>),
|
495 |
+
content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
|
496 |
+
};
|
497 |
+
}
|
498 |
+
|
499 |
+
function serializeNode(node: Node): SerializedHTMLElement | string {
|
500 |
+
if (node.nodeType === 1) return serializeHTMLElement(node as Element);
|
501 |
+
else if (node.nodeType === 3) return node.textContent ?? "";
|
502 |
+
else return "";
|
503 |
+
}
|
504 |
+
|
505 |
+
function getPageMetadata(): {
|
506 |
+
title: string;
|
507 |
+
siteName?: string;
|
508 |
+
author?: string;
|
509 |
+
description?: string;
|
510 |
+
createdAt?: string;
|
511 |
+
updatedAt?: string;
|
512 |
+
} {
|
513 |
+
const title = document.title ?? "";
|
514 |
+
const siteName =
|
515 |
+
document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
|
516 |
+
const author =
|
517 |
+
document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
|
518 |
+
const description =
|
519 |
+
document.querySelector("meta[name='description']")?.getAttribute("content") ??
|
520 |
+
document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
|
521 |
+
undefined;
|
522 |
+
const createdAt =
|
523 |
+
document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
|
524 |
+
document.querySelector("meta[name='date']")?.getAttribute("content") ??
|
525 |
+
undefined;
|
526 |
+
const updatedAt =
|
527 |
+
document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
|
528 |
+
undefined;
|
529 |
+
|
530 |
+
return { title, siteName, author, description, createdAt, updatedAt };
|
531 |
+
}
|
532 |
+
|
533 |
+
const readableNodes = getAllReadableNodes();
|
534 |
+
const clusters = clusterReadableNodes(readableNodes);
|
535 |
+
|
536 |
+
const criticalClusters = findCriticalClusters(clusters);
|
537 |
+
|
538 |
+
// filter readable nodes using the above information as well as heuristics
|
539 |
+
const filteredNodes = readableNodes.filter((_, idx) => {
|
540 |
+
return criticalClusters.some((cluster) => {
|
541 |
+
return cluster.includes(idx);
|
542 |
+
});
|
543 |
+
});
|
544 |
+
|
545 |
+
const elements = filteredNodes
|
546 |
+
.filter(
|
547 |
+
(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
|
548 |
+
)
|
549 |
+
.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
|
550 |
+
const metadata = getPageMetadata();
|
551 |
+
return { ...metadata, elements };
|
552 |
+
}
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import {
|
2 |
+
type BrowserContext,
|
3 |
+
chromium,
|
4 |
+
devices,
|
5 |
+
type Page,
|
6 |
+
type BrowserContextOptions,
|
7 |
+
} from "playwright";
|
8 |
+
import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
|
9 |
+
import { env } from "$env/dynamic/private";
|
10 |
+
|
11 |
+
// Singleton initialized by initPlaywrightService
|
12 |
+
let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }>;
|
13 |
+
|
14 |
+
async function initPlaywrightService() {
|
15 |
+
if (playwrightService) return playwrightService;
|
16 |
+
|
17 |
+
const browser = await chromium.launch({ headless: true });
|
18 |
+
|
19 |
+
process.on("SIGINT", () => browser.close());
|
20 |
+
|
21 |
+
const device = devices["Desktop Chrome"];
|
22 |
+
const options: BrowserContextOptions = {
|
23 |
+
...device,
|
24 |
+
// Increasing width improves spatial clustering accuracy
|
25 |
+
screen: {
|
26 |
+
width: 3840,
|
27 |
+
height: 1080,
|
28 |
+
},
|
29 |
+
viewport: {
|
30 |
+
width: 3840,
|
31 |
+
height: 1080,
|
32 |
+
},
|
33 |
+
reducedMotion: "reduce",
|
34 |
+
acceptDownloads: false,
|
35 |
+
timezoneId: "America/New_York",
|
36 |
+
locale: "en-US",
|
37 |
+
};
|
38 |
+
const ctx = await browser.newContext(options);
|
39 |
+
const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
|
40 |
+
const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
|
41 |
+
if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
|
42 |
+
return mostBlocked;
|
43 |
+
});
|
44 |
+
return Object.freeze({ ctx, blocker });
|
45 |
+
}
|
46 |
+
|
47 |
+
export async function loadPage(url: string): Promise<Page> {
|
48 |
+
if (!playwrightService) playwrightService = initPlaywrightService();
|
49 |
+
const { ctx, blocker } = await playwrightService;
|
50 |
+
|
51 |
+
const page = await ctx.newPage();
|
52 |
+
await blocker.enableBlockingInPage(page);
|
53 |
+
|
54 |
+
await page.goto(url, { waitUntil: "load", timeout: 2000 }).catch(() => {
|
55 |
+
console.warn(`Failed to load page within 2s: ${url}`);
|
56 |
+
});
|
57 |
+
|
58 |
+
return page;
|
59 |
+
}
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { AppendUpdate } from "../runWebSearch";
|
2 |
+
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
|
3 |
+
import { loadPage } from "./playwright";
|
4 |
+
|
5 |
+
import { spatialParser } from "./parser";
|
6 |
+
import { htmlToMarkdownTree } from "../markdown/tree";
|
7 |
+
import { timeout } from "$lib/utils/timeout";
|
8 |
+
|
9 |
+
export const scrape =
|
10 |
+
(appendUpdate: AppendUpdate, maxCharsPerElem: number) =>
|
11 |
+
async (source: WebSearchSource): Promise<WebSearchScrapedSource | undefined> => {
|
12 |
+
try {
|
13 |
+
const page = await scrapeUrl(source.link, maxCharsPerElem);
|
14 |
+
appendUpdate("Browsing webpage", [source.link]);
|
15 |
+
return { ...source, page };
|
16 |
+
} catch (e) {
|
17 |
+
const message = e instanceof Error ? e.message : String(e);
|
18 |
+
appendUpdate("Failed to parse webpage", [message, source.link], "error");
|
19 |
+
}
|
20 |
+
};
|
21 |
+
|
22 |
+
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
23 |
+
const page = await loadPage(url);
|
24 |
+
|
25 |
+
return timeout(page.evaluate(spatialParser), 2000)
|
26 |
+
.then(({ elements, ...parsed }) => ({
|
27 |
+
...parsed,
|
28 |
+
markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
|
29 |
+
}))
|
30 |
+
.catch((cause) => {
|
31 |
+
throw Error("Parsing failed", { cause });
|
32 |
+
})
|
33 |
+
.finally(() => page.close());
|
34 |
+
}
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * JSON-safe snapshot of a DOM element, produced inside the scraped page and
 * transferred back over the Playwright bridge (live DOM nodes themselves are
 * not serializable).
 */
export interface SerializedHTMLElement {
	// Lowercased tag name, e.g. "div"
	tagName: string;
	// Only allow-listed attributes (href, src, alt, title, class, id) are kept
	attributes: Record<string, string>;
	// Child nodes in document order: nested elements or raw text content
	content: (SerializedHTMLElement | string)[];
}
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { WebSearchProvider, type WebSearchSource } from "$lib/types/WebSearch";
|
2 |
+
import { env } from "$env/dynamic/private";
|
3 |
+
import searchSerper from "./endpoints/serper";
|
4 |
+
import searchSerpApi from "./endpoints/serpApi";
|
5 |
+
import searchSerpStack from "./endpoints/serpStack";
|
6 |
+
import searchYouApi from "./endpoints/youApi";
|
7 |
+
import searchWebLocal from "./endpoints/webLocal";
|
8 |
+
import searchSearxng from "./endpoints/searxng";
|
9 |
+
|
10 |
+
export function getWebSearchProvider() {
|
11 |
+
if (env.YDC_API_KEY) return WebSearchProvider.YOU;
|
12 |
+
if (env.SEARXNG_QUERY_URL) return WebSearchProvider.SEARXNG;
|
13 |
+
return WebSearchProvider.GOOGLE;
|
14 |
+
}
|
15 |
+
|
16 |
+
/** Searches the web using the first available provider, based on the env */
|
17 |
+
export async function searchWeb(query: string): Promise<WebSearchSource[]> {
|
18 |
+
if (env.USE_LOCAL_WEBSEARCH) return searchWebLocal(query);
|
19 |
+
if (env.SEARXNG_QUERY_URL) return searchSearxng(query);
|
20 |
+
if (env.SERPER_API_KEY) return searchSerper(query);
|
21 |
+
if (env.YDC_API_KEY) return searchYouApi(query);
|
22 |
+
if (env.SERPAPI_KEY) return searchSerpApi(query);
|
23 |
+
if (env.SERPSTACK_API_KEY) return searchSerpStack(query);
|
24 |
+
throw new Error(
|
25 |
+
"No configuration found for web search. Please set USE_LOCAL_WEBSEARCH, SEARXNG_QUERY_URL, SERPER_API_KEY, YDC_API_KEY, or SERPSTACK_API_KEY in your environment variables."
|
26 |
+
);
|
27 |
+
}
|
@@ -1,7 +1,9 @@
|
|
1 |
import { env } from "$env/dynamic/private";
|
2 |
import { logger } from "$lib/server/logger";
|
|
|
|
|
3 |
|
4 |
-
export async function searchSearxng(query: string) {
|
5 |
const abortController = new AbortController();
|
6 |
setTimeout(() => abortController.abort(), 10000);
|
7 |
|
@@ -20,7 +22,7 @@ export async function searchSearxng(query: string) {
|
|
20 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
21 |
.catch((error) => {
|
22 |
logger.error("Failed to fetch or parse JSON", error);
|
23 |
-
throw new Error("Failed to fetch or parse JSON");
|
24 |
});
|
25 |
|
26 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
@@ -31,5 +33,5 @@ export async function searchSearxng(query: string) {
|
|
31 |
}
|
32 |
|
33 |
// Map URLs to the correct object shape
|
34 |
-
return
|
35 |
}
|
|
|
1 |
import { env } from "$env/dynamic/private";
|
2 |
import { logger } from "$lib/server/logger";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
import { isURL } from "$lib/utils/isUrl";
|
5 |
|
6 |
+
export default async function searchSearxng(query: string): Promise<WebSearchSource[]> {
|
7 |
const abortController = new AbortController();
|
8 |
setTimeout(() => abortController.abort(), 10000);
|
9 |
|
|
|
22 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
23 |
.catch((error) => {
|
24 |
logger.error("Failed to fetch or parse JSON", error);
|
25 |
+
throw new Error("Failed to fetch or parse JSON", { cause: error });
|
26 |
});
|
27 |
|
28 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
|
|
33 |
}
|
34 |
|
35 |
// Map URLs to the correct object shape
|
36 |
+
return urls.filter(isURL).map((link) => ({ link }));
|
37 |
}
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { getJson, type GoogleParameters } from "serpapi";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
import { isURL } from "$lib/utils/isUrl";
|
5 |
+
|
6 |
+
type SerpApiResponse = {
|
7 |
+
organic_results: {
|
8 |
+
link: string;
|
9 |
+
}[];
|
10 |
+
};
|
11 |
+
|
12 |
+
export default async function searchWebSerpApi(query: string): Promise<WebSearchSource[]> {
|
13 |
+
const params = {
|
14 |
+
q: query,
|
15 |
+
hl: "en",
|
16 |
+
gl: "us",
|
17 |
+
google_domain: "google.com",
|
18 |
+
api_key: env.SERPAPI_KEY,
|
19 |
+
} satisfies GoogleParameters;
|
20 |
+
|
21 |
+
// Show result as JSON
|
22 |
+
const response = (await getJson("google", params)) as unknown as SerpApiResponse;
|
23 |
+
|
24 |
+
return response.organic_results.filter(({ link }) => isURL(link));
|
25 |
+
}
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
|
5 |
+
type SerpStackResponse = {
|
6 |
+
organic_results: {
|
7 |
+
title: string;
|
8 |
+
url: string;
|
9 |
+
snippet?: string;
|
10 |
+
}[];
|
11 |
+
error?: string;
|
12 |
+
};
|
13 |
+
|
14 |
+
export default async function searchSerpStack(query: string): Promise<WebSearchSource[]> {
|
15 |
+
const response = await fetch(
|
16 |
+
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
17 |
+
{ headers: { "Content-type": "application/json; charset=UTF-8" } }
|
18 |
+
);
|
19 |
+
|
20 |
+
const data = (await response.json()) as SerpStackResponse;
|
21 |
+
|
22 |
+
if (!response.ok) {
|
23 |
+
throw new Error(
|
24 |
+
data.error ?? `SerpStack API returned error code ${response.status} - ${response.statusText}`
|
25 |
+
);
|
26 |
+
}
|
27 |
+
|
28 |
+
return data.organic_results
|
29 |
+
.filter(({ url }) => isURL(url))
|
30 |
+
.map(({ title, url, snippet }) => ({
|
31 |
+
title,
|
32 |
+
link: url,
|
33 |
+
text: snippet ?? "",
|
34 |
+
}));
|
35 |
+
}
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
3 |
+
|
4 |
+
export default async function search(query: string): Promise<WebSearchSource[]> {
|
5 |
+
const params = {
|
6 |
+
q: query,
|
7 |
+
hl: "en",
|
8 |
+
gl: "us",
|
9 |
+
};
|
10 |
+
|
11 |
+
const response = await fetch("https://google.serper.dev/search", {
|
12 |
+
method: "POST",
|
13 |
+
body: JSON.stringify(params),
|
14 |
+
headers: {
|
15 |
+
"x-api-key": env.SERPER_API_KEY,
|
16 |
+
"Content-type": "application/json",
|
17 |
+
},
|
18 |
+
});
|
19 |
+
|
20 |
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
21 |
+
const data = (await response.json()) as Record<string, any>;
|
22 |
+
|
23 |
+
if (!response.ok) {
|
24 |
+
throw new Error(
|
25 |
+
data["message"] ??
|
26 |
+
`Serper API returned error code ${response.status} - ${response.statusText}`
|
27 |
+
);
|
28 |
+
}
|
29 |
+
|
30 |
+
return data["organic"] ?? [];
|
31 |
+
}
|
@@ -1,45 +1,35 @@
|
|
1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
|
|
|
|
2 |
|
3 |
-
export async function searchWebLocal(query: string) {
|
4 |
const abortController = new AbortController();
|
5 |
setTimeout(() => abortController.abort(), 10000);
|
6 |
|
7 |
-
const htmlString = await fetch(
|
8 |
-
|
9 |
-
|
|
|
10 |
.then((response) => response.text())
|
11 |
.catch();
|
12 |
|
13 |
const virtualConsole = new VirtualConsole();
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
// No-op to skip console errors.
|
17 |
-
});
|
18 |
-
|
19 |
-
// put the html string into a DOM
|
20 |
-
const dom = new JSDOM(htmlString ?? "", {
|
21 |
-
virtualConsole,
|
22 |
-
});
|
23 |
-
|
24 |
-
const { document } = dom.window;
|
25 |
-
// get all a documents with href tag
|
26 |
-
|
27 |
const links = document.querySelectorAll("a");
|
28 |
-
|
29 |
-
if (!links.length) {
|
30 |
-
throw new Error(`webpage doesn't have any "a" element`);
|
31 |
-
}
|
32 |
|
33 |
// take url that start wirth /url?q=
|
34 |
// and do not contain google.com links
|
35 |
// and strip them up to '&sa='
|
36 |
const linksHref = Array.from(links)
|
37 |
-
.
|
38 |
-
.
|
39 |
-
|
40 |
-
|
41 |
-
});
|
42 |
|
43 |
// remove duplicate links and map links to the correct object shape
|
44 |
-
return
|
45 |
}
|
|
|
1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
|
5 |
+
export default async function searchWebLocal(query: string): Promise<WebSearchSource[]> {
|
6 |
const abortController = new AbortController();
|
7 |
setTimeout(() => abortController.abort(), 10000);
|
8 |
|
9 |
+
const htmlString = await fetch(
|
10 |
+
"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
|
11 |
+
{ signal: abortController.signal }
|
12 |
+
)
|
13 |
.then((response) => response.text())
|
14 |
.catch();
|
15 |
|
16 |
const virtualConsole = new VirtualConsole();
|
17 |
+
virtualConsole.on("error", () => {}); // No-op to skip console errors.
|
18 |
+
const document = new JSDOM(htmlString ?? "", { virtualConsole }).window.document;
|
19 |
|
20 |
+
// get all links
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
const links = document.querySelectorAll("a");
|
22 |
+
if (!links.length) throw new Error(`webpage doesn't have any "a" element`);
|
|
|
|
|
|
|
23 |
|
24 |
// take url that start wirth /url?q=
|
25 |
// and do not contain google.com links
|
26 |
// and strip them up to '&sa='
|
27 |
const linksHref = Array.from(links)
|
28 |
+
.map((el) => el.href)
|
29 |
+
.filter((link) => link.startsWith("/url?q=") && !link.includes("google.com/"))
|
30 |
+
.map((link) => link.slice("/url?q=".length, link.indexOf("&sa=")))
|
31 |
+
.filter(isURL);
|
|
|
32 |
|
33 |
// remove duplicate links and map links to the correct object shape
|
34 |
+
return [...new Set(linksHref)].map((link) => ({ link }));
|
35 |
}
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
|
5 |
+
interface YouWebSearch {
|
6 |
+
hits: YouSearchHit[];
|
7 |
+
latency: number;
|
8 |
+
}
|
9 |
+
|
10 |
+
interface YouSearchHit {
|
11 |
+
url: string;
|
12 |
+
title: string;
|
13 |
+
description: string;
|
14 |
+
snippets: string[];
|
15 |
+
}
|
16 |
+
|
17 |
+
export default async function searchWebYouApi(query: string): Promise<WebSearchSource[]> {
|
18 |
+
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
19 |
+
method: "GET",
|
20 |
+
headers: {
|
21 |
+
"X-API-Key": env.YDC_API_KEY,
|
22 |
+
"Content-type": "application/json; charset=UTF-8",
|
23 |
+
},
|
24 |
+
});
|
25 |
+
|
26 |
+
if (!response.ok) {
|
27 |
+
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
28 |
+
}
|
29 |
+
|
30 |
+
const data = (await response.json()) as YouWebSearch;
|
31 |
+
const formattedResultsWithSnippets = data.hits
|
32 |
+
.filter(({ url }) => isURL(url))
|
33 |
+
.map(({ title, url, snippets }) => ({
|
34 |
+
title,
|
35 |
+
link: url,
|
36 |
+
text: snippets?.join("\n") || "",
|
37 |
+
}))
|
38 |
+
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
39 |
+
|
40 |
+
return formattedResultsWithSnippets;
|
41 |
+
}
|
@@ -1,6 +1,6 @@
|
|
1 |
import type { Message } from "$lib/types/Message";
|
2 |
import { format } from "date-fns";
|
3 |
-
import { generateFromDefaultEndpoint } from "
|
4 |
|
5 |
export async function generateQuery(messages: Message[]) {
|
6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
|
|
1 |
import type { Message } from "$lib/types/Message";
|
2 |
import { format } from "date-fns";
|
3 |
+
import { generateFromDefaultEndpoint } from "../../generateFromDefaultEndpoint";
|
4 |
|
5 |
export async function generateQuery(messages: Message[]) {
|
6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
2 |
+
import type { Message } from "$lib/types/Message";
|
3 |
+
import type { Assistant } from "$lib/types/Assistant";
|
4 |
+
import type { AppendUpdate } from "../runWebSearch";
|
5 |
+
import { getWebSearchProvider, searchWeb } from "./endpoints";
|
6 |
+
import { generateQuery } from "./generateQuery";
|
7 |
+
import { isURLStringLocal } from "$lib/server/isURLLocal";
|
8 |
+
import { isURL } from "$lib/utils/isUrl";
|
9 |
+
|
10 |
+
import z from "zod";
|
11 |
+
import JSON5 from "json5";
|
12 |
+
import { env } from "$env/dynamic/private";
|
13 |
+
|
14 |
+
const listSchema = z.array(z.string()).default([]);
|
15 |
+
const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST));
|
16 |
+
const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST));
|
17 |
+
|
18 |
+
export async function search(
|
19 |
+
messages: Message[],
|
20 |
+
ragSettings: Assistant["rag"] | undefined,
|
21 |
+
appendUpdate: AppendUpdate
|
22 |
+
): Promise<{ searchQuery: string; pages: WebSearchSource[] }> {
|
23 |
+
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
24 |
+
appendUpdate("Using links specified in Assistant");
|
25 |
+
return {
|
26 |
+
searchQuery: "",
|
27 |
+
pages: await directLinksToSource(ragSettings.allowedLinks).then(filterByBlockList),
|
28 |
+
};
|
29 |
+
}
|
30 |
+
|
31 |
+
const searchQuery = await generateQuery(messages);
|
32 |
+
appendUpdate(`Searching ${getWebSearchProvider()}`, [searchQuery]);
|
33 |
+
|
34 |
+
// handle the global and (optional) rag lists
|
35 |
+
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
36 |
+
appendUpdate("Filtering on specified domains");
|
37 |
+
}
|
38 |
+
const filters = buildQueryFromSiteFilters(
|
39 |
+
[...(ragSettings?.allowedDomains ?? []), ...allowList],
|
40 |
+
blockList
|
41 |
+
);
|
42 |
+
|
43 |
+
const searchQueryWithFilters = `${filters} ${searchQuery}`;
|
44 |
+
const searchResults = await searchWeb(searchQueryWithFilters).then(filterByBlockList);
|
45 |
+
|
46 |
+
return {
|
47 |
+
searchQuery: searchQueryWithFilters,
|
48 |
+
pages: searchResults,
|
49 |
+
};
|
50 |
+
}
|
51 |
+
|
52 |
+
// ----------
|
53 |
+
// Utils
|
54 |
+
function filterByBlockList(results: WebSearchSource[]): WebSearchSource[] {
|
55 |
+
return results.filter((result) => !blockList.some((blocked) => result.link.includes(blocked)));
|
56 |
+
}
|
57 |
+
|
58 |
+
function buildQueryFromSiteFilters(allow: string[], block: string[]) {
|
59 |
+
return (
|
60 |
+
allow.map((item) => "site:" + item).join(" OR ") +
|
61 |
+
" " +
|
62 |
+
block.map((item) => "-site:" + item).join(" ")
|
63 |
+
);
|
64 |
+
}
|
65 |
+
|
66 |
+
async function directLinksToSource(links: string[]): Promise<WebSearchSource[]> {
|
67 |
+
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
68 |
+
const localLinks = await Promise.all(links.map(isURLStringLocal));
|
69 |
+
links = links.filter((_, index) => !localLinks[index]);
|
70 |
+
}
|
71 |
+
|
72 |
+
return links.filter(isURL).map((link) => ({
|
73 |
+
link,
|
74 |
+
title: "",
|
75 |
+
text: [""],
|
76 |
+
}));
|
77 |
+
}
|
@@ -1,148 +0,0 @@
|
|
1 |
-
import type { YouWebSearch } from "../../types/WebSearch";
|
2 |
-
import { WebSearchProvider } from "../../types/WebSearch";
|
3 |
-
import { env } from "$env/dynamic/private";
|
4 |
-
import { getJson } from "serpapi";
|
5 |
-
import type { GoogleParameters } from "serpapi";
|
6 |
-
import { searchWebLocal } from "./searchWebLocal";
|
7 |
-
import { searchSearxng } from "./searchSearxng";
|
8 |
-
|
9 |
-
// get which SERP api is providing web results
|
10 |
-
export function getWebSearchProvider() {
|
11 |
-
if (env.YDC_API_KEY) {
|
12 |
-
return WebSearchProvider.YOU;
|
13 |
-
} else if (env.SEARXNG_QUERY_URL) {
|
14 |
-
return WebSearchProvider.SEARXNG;
|
15 |
-
} else {
|
16 |
-
return WebSearchProvider.GOOGLE;
|
17 |
-
}
|
18 |
-
}
|
19 |
-
|
20 |
-
// Show result as JSON
|
21 |
-
export async function searchWeb(query: string) {
|
22 |
-
if (env.USE_LOCAL_WEBSEARCH) {
|
23 |
-
return await searchWebLocal(query);
|
24 |
-
}
|
25 |
-
if (env.SEARXNG_QUERY_URL) {
|
26 |
-
return await searchSearxng(query);
|
27 |
-
}
|
28 |
-
if (env.SERPER_API_KEY) {
|
29 |
-
return await searchWebSerper(query);
|
30 |
-
}
|
31 |
-
if (env.YDC_API_KEY) {
|
32 |
-
return await searchWebYouApi(query);
|
33 |
-
}
|
34 |
-
if (env.SERPAPI_KEY) {
|
35 |
-
return await searchWebSerpApi(query);
|
36 |
-
}
|
37 |
-
if (env.SERPSTACK_API_KEY) {
|
38 |
-
return await searchSerpStack(query);
|
39 |
-
}
|
40 |
-
throw new Error("No You.com or Serper.dev or SerpAPI key found");
|
41 |
-
}
|
42 |
-
|
43 |
-
export async function searchWebSerper(query: string) {
|
44 |
-
const params = {
|
45 |
-
q: query,
|
46 |
-
hl: "en",
|
47 |
-
gl: "us",
|
48 |
-
};
|
49 |
-
|
50 |
-
const response = await fetch("https://google.serper.dev/search", {
|
51 |
-
method: "POST",
|
52 |
-
body: JSON.stringify(params),
|
53 |
-
headers: {
|
54 |
-
"x-api-key": env.SERPER_API_KEY,
|
55 |
-
"Content-type": "application/json; charset=UTF-8",
|
56 |
-
},
|
57 |
-
});
|
58 |
-
|
59 |
-
/* eslint-disable @typescript-eslint/no-explicit-any */
|
60 |
-
const data = (await response.json()) as Record<string, any>;
|
61 |
-
|
62 |
-
if (!response.ok) {
|
63 |
-
throw new Error(
|
64 |
-
data["message"] ??
|
65 |
-
`Serper API returned error code ${response.status} - ${response.statusText}`
|
66 |
-
);
|
67 |
-
}
|
68 |
-
|
69 |
-
return {
|
70 |
-
organic_results: data["organic"] ?? [],
|
71 |
-
};
|
72 |
-
}
|
73 |
-
|
74 |
-
export async function searchWebSerpApi(query: string) {
|
75 |
-
const params = {
|
76 |
-
q: query,
|
77 |
-
hl: "en",
|
78 |
-
gl: "us",
|
79 |
-
google_domain: "google.com",
|
80 |
-
api_key: env.SERPAPI_KEY,
|
81 |
-
} satisfies GoogleParameters;
|
82 |
-
|
83 |
-
// Show result as JSON
|
84 |
-
const response = await getJson("google", params);
|
85 |
-
|
86 |
-
return response;
|
87 |
-
}
|
88 |
-
|
89 |
-
export async function searchWebYouApi(query: string) {
|
90 |
-
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
91 |
-
method: "GET",
|
92 |
-
headers: {
|
93 |
-
"X-API-Key": env.YDC_API_KEY,
|
94 |
-
"Content-type": "application/json; charset=UTF-8",
|
95 |
-
},
|
96 |
-
});
|
97 |
-
|
98 |
-
if (!response.ok) {
|
99 |
-
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
100 |
-
}
|
101 |
-
|
102 |
-
const data = (await response.json()) as YouWebSearch;
|
103 |
-
const formattedResultsWithSnippets = data.hits
|
104 |
-
.map(({ title, url, snippets }) => ({
|
105 |
-
title,
|
106 |
-
link: url,
|
107 |
-
text: snippets?.join("\n") || "",
|
108 |
-
hostname: new URL(url).hostname,
|
109 |
-
}))
|
110 |
-
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
111 |
-
|
112 |
-
return {
|
113 |
-
organic_results: formattedResultsWithSnippets,
|
114 |
-
};
|
115 |
-
}
|
116 |
-
|
117 |
-
export async function searchSerpStack(query: string) {
|
118 |
-
const response = await fetch(
|
119 |
-
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
120 |
-
{
|
121 |
-
method: "GET",
|
122 |
-
headers: {
|
123 |
-
"Content-type": "application/json; charset=UTF-8",
|
124 |
-
},
|
125 |
-
}
|
126 |
-
);
|
127 |
-
|
128 |
-
const data = (await response.json()) as Record<string, any>;
|
129 |
-
|
130 |
-
if (!response.ok) {
|
131 |
-
throw new Error(
|
132 |
-
data["error"] ??
|
133 |
-
`SerpStack API returned error code ${response.status} - ${response.statusText}`
|
134 |
-
);
|
135 |
-
}
|
136 |
-
|
137 |
-
const resultsWithSnippets = data["organic_results"].map(
|
138 |
-
({ title, url, snippet }: { title: string; url: string; snippet: string | undefined }) => ({
|
139 |
-
title,
|
140 |
-
link: url,
|
141 |
-
text: snippet || "",
|
142 |
-
})
|
143 |
-
);
|
144 |
-
|
145 |
-
return {
|
146 |
-
organic_results: resultsWithSnippets ?? [],
|
147 |
-
};
|
148 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,6 +1,7 @@
|
|
1 |
import type { ObjectId } from "mongodb";
|
2 |
import type { Conversation } from "./Conversation";
|
3 |
import type { Timestamps } from "./Timestamps";
|
|
|
4 |
|
5 |
export interface WebSearch extends Timestamps {
|
6 |
_id?: ObjectId;
|
@@ -14,14 +15,24 @@ export interface WebSearch extends Timestamps {
|
|
14 |
}
|
15 |
|
16 |
export interface WebSearchSource {
|
17 |
-
title
|
18 |
link: string;
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
}
|
22 |
|
23 |
-
export interface WebSearchUsedSource extends
|
24 |
-
context:
|
25 |
}
|
26 |
|
27 |
export type WebSearchMessageSources = {
|
@@ -29,18 +40,6 @@ export type WebSearchMessageSources = {
|
|
29 |
sources: WebSearchSource[];
|
30 |
};
|
31 |
|
32 |
-
export interface YouWebSearch {
|
33 |
-
hits: YouSearchHit[];
|
34 |
-
latency: number;
|
35 |
-
}
|
36 |
-
|
37 |
-
interface YouSearchHit {
|
38 |
-
url: string;
|
39 |
-
title: string;
|
40 |
-
description: string;
|
41 |
-
snippets: string[];
|
42 |
-
}
|
43 |
-
|
44 |
// eslint-disable-next-line no-shadow
|
45 |
export enum WebSearchProvider {
|
46 |
GOOGLE = "Google",
|
|
|
1 |
import type { ObjectId } from "mongodb";
|
2 |
import type { Conversation } from "./Conversation";
|
3 |
import type { Timestamps } from "./Timestamps";
|
4 |
+
import type { HeaderElement } from "$lib/server/websearch/markdown/types";
|
5 |
|
6 |
export interface WebSearch extends Timestamps {
|
7 |
_id?: ObjectId;
|
|
|
15 |
}
|
16 |
|
17 |
/** A single result returned by a search provider. */
export interface WebSearchSource {
	title?: string;
	link: string;
}
/** A search result whose page has been fetched and parsed. */
export interface WebSearchScrapedSource extends WebSearchSource {
	page: WebSearchPage;
}
/** Parsed content and metadata of a scraped page. */
export interface WebSearchPage {
	title: string;
	siteName?: string;
	author?: string;
	description?: string;
	createdAt?: string;
	// NOTE(review): the page-metadata parser elsewhere in this change collects
	// this value under the name `updatedAt` — confirm the rename to
	// `modifiedAt` is applied consistently where pages are constructed.
	modifiedAt?: string;
	// Page content restructured into a markdown header tree for embedding
	markdownTree: HeaderElement;
}

/** A scraped source whose extracted context was fed to the model. */
export interface WebSearchUsedSource extends WebSearchScrapedSource {
	context: string;
}
|
37 |
|
38 |
export type WebSearchMessageSources = {
|
|
|
40 |
sources: WebSearchSource[];
|
41 |
};
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
// eslint-disable-next-line no-shadow
|
44 |
export enum WebSearchProvider {
|
45 |
GOOGLE = "Google",
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
export function isURL(url: string) {
|
2 |
+
try {
|
3 |
+
new URL(url);
|
4 |
+
return true;
|
5 |
+
} catch (e) {
|
6 |
+
return false;
|
7 |
+
}
|
8 |
+
}
|
@@ -1,6 +1,9 @@
|
|
1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
2 |
let timer: NodeJS.Timeout;
|
3 |
-
return Promise.race([
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
6 |
};
|
|
|
1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
2 |
let timer: NodeJS.Timeout;
|
3 |
+
return Promise.race([
|
4 |
+
prom,
|
5 |
+
new Promise<T>((_, reject) => {
|
6 |
+
timer = setTimeout(() => reject(new Error(`Timeout after ${time / 1000} seconds`)), time);
|
7 |
+
}),
|
8 |
+
]).finally(() => clearTimeout(timer));
|
9 |
};
|