Spaces:
Running
on
CPU Upgrade
Web Search: Playwright, spatial parsing, markdown (#1094)
Browse files* feat: playwright, spatial parsing, markdown for web search
Co-authored-by: Aaditya Sahay <[email protected]>
* feat: choose multiple clusters if necessary (#2)
* chore: resolve linting failures
* feat: improve paring performance and error messages
* feat: combine embeddable chunks together on cpu
* feat: reduce parsed pages from 10 to 8
* feat: disable javascript in playwright by default
* feat: embedding and parsing error messages
* feat: move isURL, fix type errors, misc
* feat: misc cleanup
* feat: change serializedHtmlElement to interface
* fix: isUrl filename
* fix: add playwright dependencies to docker
* feat: add playwright browsers to docker image
* feat: enable javascript by default
* feat: remove error message from console on failed page
---------
Co-authored-by: Aaditya Sahay <[email protected]>
Co-authored-by: Aaditya Sahay <[email protected]>
- .env +2 -1
- Dockerfile +6 -0
- README.md +2 -0
- package-lock.json +289 -27
- package.json +5 -0
- src/lib/components/chat/ChatMessage.svelte +3 -3
- src/lib/server/embeddingEndpoints/hfApi/embeddingHfApi.ts +6 -1
- src/lib/server/isURLLocal.ts +31 -19
- src/lib/server/preprocessMessages.ts +4 -6
- src/lib/server/sentenceSimilarity.ts +12 -21
- src/lib/server/websearch/embed/combine.ts +37 -0
- src/lib/server/websearch/embed/embed.ts +80 -0
- src/lib/server/websearch/embed/tree.ts +6 -0
- src/lib/server/websearch/markdown/fromHtml.ts +98 -0
- src/lib/server/websearch/markdown/tree.ts +63 -0
- src/lib/server/websearch/markdown/types.ts +55 -0
- src/lib/server/websearch/markdown/utils/chunk.ts +60 -0
- src/lib/server/websearch/markdown/utils/nlp.ts +11 -0
- src/lib/server/websearch/markdown/utils/stringify.ts +75 -0
- src/lib/server/websearch/parseWeb.ts +0 -41
- src/lib/server/websearch/runWebSearch.ts +69 -145
- src/lib/server/websearch/scrape/parser.ts +552 -0
- src/lib/server/websearch/scrape/playwright.ts +59 -0
- src/lib/server/websearch/scrape/scrape.ts +34 -0
- src/lib/server/websearch/scrape/types.ts +5 -0
- src/lib/server/websearch/search/endpoints.ts +27 -0
- src/lib/server/websearch/{searchSearxng.ts → search/endpoints/searxng.ts} +5 -3
- src/lib/server/websearch/search/endpoints/serpApi.ts +25 -0
- src/lib/server/websearch/search/endpoints/serpStack.ts +35 -0
- src/lib/server/websearch/search/endpoints/serper.ts +31 -0
- src/lib/server/websearch/{searchWebLocal.ts → search/endpoints/webLocal.ts} +16 -26
- src/lib/server/websearch/search/endpoints/youApi.ts +41 -0
- src/lib/server/websearch/{generateQuery.ts → search/generateQuery.ts} +1 -1
- src/lib/server/websearch/search/search.ts +77 -0
- src/lib/server/websearch/searchWeb.ts +0 -148
- src/lib/types/WebSearch.ts +16 -17
- src/lib/utils/isUrl.ts +8 -0
- src/lib/utils/timeout.ts +6 -3
@@ -27,6 +27,7 @@ SEARXNG_QUERY_URL=# where '<query>' will be replaced with query keywords see htt
|
|
27 |
|
28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
|
|
30 |
|
31 |
# Parameters to enable open id login
|
32 |
OPENID_CONFIG=`{
|
@@ -155,4 +156,4 @@ ALLOWED_USER_EMAILS=`[]` # if it's defined, only these emails will be allowed to
|
|
155 |
USAGE_LIMITS=`{}`
|
156 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
157 |
METRICS_PORT=
|
158 |
-
LOG_LEVEL=info
|
|
|
27 |
|
28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
30 |
+
WEBSEARCH_JAVASCRIPT=true # CPU usage reduces by 60% on average by disabling javascript. Enable to improve website compatibility
|
31 |
|
32 |
# Parameters to enable open id login
|
33 |
OPENID_CONFIG=`{
|
|
|
156 |
USAGE_LIMITS=`{}`
|
157 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
158 |
METRICS_PORT=
|
159 |
+
LOG_LEVEL=info
|
@@ -83,6 +83,12 @@ COPY --chown=1000 gcp-*.json /app/
|
|
83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
RUN chmod +x /app/entrypoint.sh
|
87 |
|
88 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
|
|
83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
85 |
|
86 |
+
RUN npx playwright install
|
87 |
+
|
88 |
+
USER root
|
89 |
+
RUN npx playwright install-deps
|
90 |
+
USER user
|
91 |
+
|
92 |
RUN chmod +x /app/entrypoint.sh
|
93 |
|
94 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
@@ -170,6 +170,8 @@ You can enable the web search through an API by adding `YDC_API_KEY` ([docs.you.
|
|
170 |
|
171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
172 |
|
|
|
|
|
173 |
### Custom models
|
174 |
|
175 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
|
|
170 |
|
171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
172 |
|
173 |
+
You can enable Javascript when parsing webpages to improve compatibility with `WEBSEARCH_JAVASCRIPT=true` at the cost of increased CPU usage. You'll want at least 4 cores when enabling.
|
174 |
+
|
175 |
### Custom models
|
176 |
|
177 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
@@ -8,9 +8,11 @@
|
|
8 |
"name": "chat-ui",
|
9 |
"version": "0.8.4",
|
10 |
"dependencies": {
|
|
|
11 |
"@huggingface/hub": "^0.5.1",
|
12 |
"@huggingface/inference": "^2.6.3",
|
13 |
"@iconify-json/bi": "^1.1.21",
|
|
|
14 |
"@resvg/resvg-js": "^2.6.0",
|
15 |
"@xenova/transformers": "^2.16.1",
|
16 |
"autoprefixer": "^10.4.14",
|
@@ -32,10 +34,12 @@
|
|
32 |
"parquetjs": "^0.11.2",
|
33 |
"pino": "^9.0.0",
|
34 |
"pino-pretty": "^11.0.0",
|
|
|
35 |
"postcss": "^8.4.31",
|
36 |
"saslprep": "^1.0.3",
|
37 |
"satori": "^0.10.11",
|
38 |
"satori-html": "^0.3.2",
|
|
|
39 |
"serpapi": "^1.1.1",
|
40 |
"sharp": "^0.33.2",
|
41 |
"tailwind-scrollbar": "^3.0.0",
|
@@ -55,6 +59,7 @@
|
|
55 |
"@types/jsdom": "^21.1.1",
|
56 |
"@types/minimist": "^1.2.5",
|
57 |
"@types/parquetjs": "^0.10.3",
|
|
|
58 |
"@types/uuid": "^9.0.8",
|
59 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
60 |
"@typescript-eslint/parser": "^6.x",
|
@@ -159,39 +164,54 @@
|
|
159 |
}
|
160 |
},
|
161 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
162 |
-
"version": "0.3.
|
163 |
-
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.
|
164 |
-
"integrity": "sha512-
|
165 |
"optional": true,
|
166 |
"dependencies": {
|
167 |
-
"@anthropic-ai/sdk": "
|
168 |
"google-auth-library": "^9.4.2"
|
169 |
}
|
170 |
},
|
171 |
-
"node_modules/@
|
172 |
-
"version": "
|
173 |
-
"resolved": "https://registry.npmjs.org/@
|
174 |
-
"integrity": "sha512
|
175 |
-
"optional": true,
|
176 |
"dependencies": {
|
177 |
-
"@
|
178 |
-
"@
|
179 |
-
"
|
180 |
-
"
|
181 |
-
"
|
182 |
-
"
|
183 |
-
"
|
184 |
-
"
|
185 |
-
"web-streams-polyfill": "^3.2.1"
|
186 |
}
|
187 |
},
|
188 |
-
"node_modules/@
|
189 |
-
"version": "
|
190 |
-
"resolved": "https://registry.npmjs.org/
|
191 |
-
"integrity": "sha512-
|
192 |
-
"
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
}
|
196 |
},
|
197 |
"node_modules/@cspotcode/source-map-support": {
|
@@ -1314,6 +1334,18 @@
|
|
1314 |
"node": ">=8.0.0"
|
1315 |
}
|
1316 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1317 |
"node_modules/@polka/url": {
|
1318 |
"version": "1.0.0-next.21",
|
1319 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
@@ -1374,6 +1406,43 @@
|
|
1374 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
1375 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
1376 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1377 |
"node_modules/@resvg/resvg-js": {
|
1378 |
"version": "2.6.0",
|
1379 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
@@ -2063,6 +2132,15 @@
|
|
2063 |
"@types/chai": "*"
|
2064 |
}
|
2065 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2066 |
"node_modules/@types/connect": {
|
2067 |
"version": "3.4.38",
|
2068 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
@@ -2108,6 +2186,29 @@
|
|
2108 |
"@types/send": "*"
|
2109 |
}
|
2110 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2111 |
"node_modules/@types/http-errors": {
|
2112 |
"version": "2.0.4",
|
2113 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
@@ -2216,6 +2317,12 @@
|
|
2216 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
2217 |
"dev": true
|
2218 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
2219 |
"node_modules/@types/semver": {
|
2220 |
"version": "7.5.3",
|
2221 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
@@ -3660,7 +3767,6 @@
|
|
3660 |
"version": "4.3.1",
|
3661 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
3662 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
3663 |
-
"dev": true,
|
3664 |
"engines": {
|
3665 |
"node": ">=0.10.0"
|
3666 |
}
|
@@ -3791,6 +3897,30 @@
|
|
3791 |
"node": ">=6.0.0"
|
3792 |
}
|
3793 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3794 |
"node_modules/domexception": {
|
3795 |
"version": "4.0.0",
|
3796 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
@@ -3802,6 +3932,33 @@
|
|
3802 |
"node": ">=12"
|
3803 |
}
|
3804 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3805 |
"node_modules/dotenv": {
|
3806 |
"version": "16.0.3",
|
3807 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
@@ -3940,7 +4097,6 @@
|
|
3940 |
"version": "4.0.0",
|
3941 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
3942 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
3943 |
-
"dev": true,
|
3944 |
"engines": {
|
3945 |
"node": ">=10"
|
3946 |
},
|
@@ -4924,6 +5080,24 @@
|
|
4924 |
"node": ">=12"
|
4925 |
}
|
4926 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4927 |
"node_modules/http-errors": {
|
4928 |
"version": "2.0.0",
|
4929 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
@@ -5194,6 +5368,14 @@
|
|
5194 |
"node": ">=8"
|
5195 |
}
|
5196 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5197 |
"node_modules/is-potential-custom-element-name": {
|
5198 |
"version": "1.0.1",
|
5199 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
@@ -6354,6 +6536,11 @@
|
|
6354 |
"hex-rgb": "^4.1.0"
|
6355 |
}
|
6356 |
},
|
|
|
|
|
|
|
|
|
|
|
6357 |
"node_modules/parse5": {
|
6358 |
"version": "7.1.2",
|
6359 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
@@ -6645,6 +6832,47 @@
|
|
6645 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
6646 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
6647 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6648 |
"node_modules/postcss": {
|
6649 |
"version": "8.4.35",
|
6650 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
@@ -7431,6 +7659,19 @@
|
|
7431 |
"rimraf": "bin.js"
|
7432 |
}
|
7433 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7434 |
"node_modules/saslprep": {
|
7435 |
"version": "1.0.3",
|
7436 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
@@ -7481,6 +7722,14 @@
|
|
7481 |
"node": ">=v12.22.7"
|
7482 |
}
|
7483 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7484 |
"node_modules/secure-json-parse": {
|
7485 |
"version": "2.7.0",
|
7486 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
@@ -8428,6 +8677,19 @@
|
|
8428 |
"node": ">=14.0.0"
|
8429 |
}
|
8430 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8431 |
"node_modules/to-regex-range": {
|
8432 |
"version": "5.0.1",
|
8433 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
|
|
8 |
"name": "chat-ui",
|
9 |
"version": "0.8.4",
|
10 |
"dependencies": {
|
11 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
12 |
"@huggingface/hub": "^0.5.1",
|
13 |
"@huggingface/inference": "^2.6.3",
|
14 |
"@iconify-json/bi": "^1.1.21",
|
15 |
+
"@playwright/browser-chromium": "^1.43.1",
|
16 |
"@resvg/resvg-js": "^2.6.0",
|
17 |
"@xenova/transformers": "^2.16.1",
|
18 |
"autoprefixer": "^10.4.14",
|
|
|
34 |
"parquetjs": "^0.11.2",
|
35 |
"pino": "^9.0.0",
|
36 |
"pino-pretty": "^11.0.0",
|
37 |
+
"playwright": "^1.40.0",
|
38 |
"postcss": "^8.4.31",
|
39 |
"saslprep": "^1.0.3",
|
40 |
"satori": "^0.10.11",
|
41 |
"satori-html": "^0.3.2",
|
42 |
+
"sbd": "^1.0.19",
|
43 |
"serpapi": "^1.1.1",
|
44 |
"sharp": "^0.33.2",
|
45 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
59 |
"@types/jsdom": "^21.1.1",
|
60 |
"@types/minimist": "^1.2.5",
|
61 |
"@types/parquetjs": "^0.10.3",
|
62 |
+
"@types/sbd": "^1.0.5",
|
63 |
"@types/uuid": "^9.0.8",
|
64 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
65 |
"@typescript-eslint/parser": "^6.x",
|
|
|
164 |
}
|
165 |
},
|
166 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
167 |
+
"version": "0.3.6",
|
168 |
+
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.6.tgz",
|
169 |
+
"integrity": "sha512-4pNVobcCsPCWLSaFJkT/XxwX5rmot+q2PE2LF5vfuRNFTWFjeTrsPgTB48D0Sce/c/2p4fddrFKGN6fdnn8zRg==",
|
170 |
"optional": true,
|
171 |
"dependencies": {
|
172 |
+
"@anthropic-ai/sdk": ">=0.14 <1",
|
173 |
"google-auth-library": "^9.4.2"
|
174 |
}
|
175 |
},
|
176 |
+
"node_modules/@cliqz/adblocker": {
|
177 |
+
"version": "1.27.2",
|
178 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker/-/adblocker-1.27.2.tgz",
|
179 |
+
"integrity": "sha512-sFjbx9xBGWaOsvVFVHVUNOrzCafGtjYDAp95KTeoJcNZbPs4D2RsabYZEeg4JkwPkfhcFseJqfnsMyJ4XsqVfQ==",
|
|
|
180 |
"dependencies": {
|
181 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
182 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2",
|
183 |
+
"@remusao/guess-url-type": "^1.2.1",
|
184 |
+
"@remusao/small": "^1.2.1",
|
185 |
+
"@remusao/smaz": "^1.9.1",
|
186 |
+
"@types/chrome": "^0.0.266",
|
187 |
+
"@types/firefox-webext-browser": "^120.0.0",
|
188 |
+
"tldts-experimental": "^6.0.14"
|
|
|
189 |
}
|
190 |
},
|
191 |
+
"node_modules/@cliqz/adblocker-content": {
|
192 |
+
"version": "1.27.2",
|
193 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-content/-/adblocker-content-1.27.2.tgz",
|
194 |
+
"integrity": "sha512-fzxsOt7r3YUgxoyW9GPCOShKOLNbB4n3gWtyMBFQ+lwHsQKfLehxN4Zxjg4Ad6AXJNW4gfIBq69ghnj2jHfviw==",
|
195 |
+
"dependencies": {
|
196 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2"
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"node_modules/@cliqz/adblocker-extended-selectors": {
|
200 |
+
"version": "1.27.2",
|
201 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.27.2.tgz",
|
202 |
+
"integrity": "sha512-HZ03U8pAOuEwTo1vZ9tv49kIC4riWqYvr5p3illZshxo+eCUi8CPbgYSyYCtgd1JpO1wNnCwEX95/twXfT8cnA=="
|
203 |
+
},
|
204 |
+
"node_modules/@cliqz/adblocker-playwright": {
|
205 |
+
"version": "1.27.2",
|
206 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-playwright/-/adblocker-playwright-1.27.2.tgz",
|
207 |
+
"integrity": "sha512-b+OoWKz/h787YItfCwjnhZ8l6/bv/DPTzaq1pyyY6Ovpdd+dGvVW1fehw+87FC6j/WQbTeuOdpLiwp8ouvrftg==",
|
208 |
+
"dependencies": {
|
209 |
+
"@cliqz/adblocker": "^1.27.2",
|
210 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
211 |
+
"tldts-experimental": "^6.0.14"
|
212 |
+
},
|
213 |
+
"peerDependencies": {
|
214 |
+
"playwright": "^1.x"
|
215 |
}
|
216 |
},
|
217 |
"node_modules/@cspotcode/source-map-support": {
|
|
|
1334 |
"node": ">=8.0.0"
|
1335 |
}
|
1336 |
},
|
1337 |
+
"node_modules/@playwright/browser-chromium": {
|
1338 |
+
"version": "1.43.1",
|
1339 |
+
"resolved": "https://registry.npmjs.org/@playwright/browser-chromium/-/browser-chromium-1.43.1.tgz",
|
1340 |
+
"integrity": "sha512-CBuHhRIF/VGyUnPvK7/4IUbm0AAOZZI5huHlr+qNr5cFQpQ6TXBqOwSMef/xUz9HcjxWOxDPION7br1kOlyV/A==",
|
1341 |
+
"hasInstallScript": true,
|
1342 |
+
"dependencies": {
|
1343 |
+
"playwright-core": "1.43.1"
|
1344 |
+
},
|
1345 |
+
"engines": {
|
1346 |
+
"node": ">=16"
|
1347 |
+
}
|
1348 |
+
},
|
1349 |
"node_modules/@polka/url": {
|
1350 |
"version": "1.0.0-next.21",
|
1351 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
|
|
1406 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
1407 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
1408 |
},
|
1409 |
+
"node_modules/@remusao/guess-url-type": {
|
1410 |
+
"version": "1.2.1",
|
1411 |
+
"resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.2.1.tgz",
|
1412 |
+
"integrity": "sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA=="
|
1413 |
+
},
|
1414 |
+
"node_modules/@remusao/small": {
|
1415 |
+
"version": "1.2.1",
|
1416 |
+
"resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.2.1.tgz",
|
1417 |
+
"integrity": "sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw=="
|
1418 |
+
},
|
1419 |
+
"node_modules/@remusao/smaz": {
|
1420 |
+
"version": "1.9.1",
|
1421 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.9.1.tgz",
|
1422 |
+
"integrity": "sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==",
|
1423 |
+
"dependencies": {
|
1424 |
+
"@remusao/smaz-compress": "^1.9.1",
|
1425 |
+
"@remusao/smaz-decompress": "^1.9.1"
|
1426 |
+
}
|
1427 |
+
},
|
1428 |
+
"node_modules/@remusao/smaz-compress": {
|
1429 |
+
"version": "1.9.1",
|
1430 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.9.1.tgz",
|
1431 |
+
"integrity": "sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==",
|
1432 |
+
"dependencies": {
|
1433 |
+
"@remusao/trie": "^1.4.1"
|
1434 |
+
}
|
1435 |
+
},
|
1436 |
+
"node_modules/@remusao/smaz-decompress": {
|
1437 |
+
"version": "1.9.1",
|
1438 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.9.1.tgz",
|
1439 |
+
"integrity": "sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A=="
|
1440 |
+
},
|
1441 |
+
"node_modules/@remusao/trie": {
|
1442 |
+
"version": "1.4.1",
|
1443 |
+
"resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.4.1.tgz",
|
1444 |
+
"integrity": "sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q=="
|
1445 |
+
},
|
1446 |
"node_modules/@resvg/resvg-js": {
|
1447 |
"version": "2.6.0",
|
1448 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
|
|
2132 |
"@types/chai": "*"
|
2133 |
}
|
2134 |
},
|
2135 |
+
"node_modules/@types/chrome": {
|
2136 |
+
"version": "0.0.266",
|
2137 |
+
"resolved": "https://registry.npmjs.org/@types/chrome/-/chrome-0.0.266.tgz",
|
2138 |
+
"integrity": "sha512-QSQWJTL7NjZElvq/6/E5C1+pHgEP8UAJzwoz7M4vSJ7AECt6NNehJ+tU6snnvuTqZOBjFCivvitYo5+8tNPmhg==",
|
2139 |
+
"dependencies": {
|
2140 |
+
"@types/filesystem": "*",
|
2141 |
+
"@types/har-format": "*"
|
2142 |
+
}
|
2143 |
+
},
|
2144 |
"node_modules/@types/connect": {
|
2145 |
"version": "3.4.38",
|
2146 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
|
|
2186 |
"@types/send": "*"
|
2187 |
}
|
2188 |
},
|
2189 |
+
"node_modules/@types/filesystem": {
|
2190 |
+
"version": "0.0.36",
|
2191 |
+
"resolved": "https://registry.npmjs.org/@types/filesystem/-/filesystem-0.0.36.tgz",
|
2192 |
+
"integrity": "sha512-vPDXOZuannb9FZdxgHnqSwAG/jvdGM8Wq+6N4D/d80z+D4HWH+bItqsZaVRQykAn6WEVeEkLm2oQigyHtgb0RA==",
|
2193 |
+
"dependencies": {
|
2194 |
+
"@types/filewriter": "*"
|
2195 |
+
}
|
2196 |
+
},
|
2197 |
+
"node_modules/@types/filewriter": {
|
2198 |
+
"version": "0.0.33",
|
2199 |
+
"resolved": "https://registry.npmjs.org/@types/filewriter/-/filewriter-0.0.33.tgz",
|
2200 |
+
"integrity": "sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g=="
|
2201 |
+
},
|
2202 |
+
"node_modules/@types/firefox-webext-browser": {
|
2203 |
+
"version": "120.0.3",
|
2204 |
+
"resolved": "https://registry.npmjs.org/@types/firefox-webext-browser/-/firefox-webext-browser-120.0.3.tgz",
|
2205 |
+
"integrity": "sha512-APbBSxOvFMbKwXy/4YrEVa5Di6N0C9yl4w0WA0xzdkOrChAfPQ/KlcC8QLyhemHCHpF1CB/zHy52+oUQurViOg=="
|
2206 |
+
},
|
2207 |
+
"node_modules/@types/har-format": {
|
2208 |
+
"version": "1.2.15",
|
2209 |
+
"resolved": "https://registry.npmjs.org/@types/har-format/-/har-format-1.2.15.tgz",
|
2210 |
+
"integrity": "sha512-RpQH4rXLuvTXKR0zqHq3go0RVXYv/YVqv4TnPH95VbwUxZdQlK1EtcMvQvMpDngHbt13Csh9Z4qT9AbkiQH5BA=="
|
2211 |
+
},
|
2212 |
"node_modules/@types/http-errors": {
|
2213 |
"version": "2.0.4",
|
2214 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
|
|
2317 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
2318 |
"dev": true
|
2319 |
},
|
2320 |
+
"node_modules/@types/sbd": {
|
2321 |
+
"version": "1.0.5",
|
2322 |
+
"resolved": "https://registry.npmjs.org/@types/sbd/-/sbd-1.0.5.tgz",
|
2323 |
+
"integrity": "sha512-60PxBBWhg0C3yb5bTP+wwWYGTKMcuB0S6mTEa1sedMC79tYY0Ei7YjU4qsWzGn++lWscLQde16SnElJrf5/aTw==",
|
2324 |
+
"dev": true
|
2325 |
+
},
|
2326 |
"node_modules/@types/semver": {
|
2327 |
"version": "7.5.3",
|
2328 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
|
|
3767 |
"version": "4.3.1",
|
3768 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
3769 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
|
|
3770 |
"engines": {
|
3771 |
"node": ">=0.10.0"
|
3772 |
}
|
|
|
3897 |
"node": ">=6.0.0"
|
3898 |
}
|
3899 |
},
|
3900 |
+
"node_modules/dom-serializer": {
|
3901 |
+
"version": "2.0.0",
|
3902 |
+
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
3903 |
+
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
3904 |
+
"dependencies": {
|
3905 |
+
"domelementtype": "^2.3.0",
|
3906 |
+
"domhandler": "^5.0.2",
|
3907 |
+
"entities": "^4.2.0"
|
3908 |
+
},
|
3909 |
+
"funding": {
|
3910 |
+
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
3911 |
+
}
|
3912 |
+
},
|
3913 |
+
"node_modules/domelementtype": {
|
3914 |
+
"version": "2.3.0",
|
3915 |
+
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
3916 |
+
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
3917 |
+
"funding": [
|
3918 |
+
{
|
3919 |
+
"type": "github",
|
3920 |
+
"url": "https://github.com/sponsors/fb55"
|
3921 |
+
}
|
3922 |
+
]
|
3923 |
+
},
|
3924 |
"node_modules/domexception": {
|
3925 |
"version": "4.0.0",
|
3926 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
|
|
3932 |
"node": ">=12"
|
3933 |
}
|
3934 |
},
|
3935 |
+
"node_modules/domhandler": {
|
3936 |
+
"version": "5.0.3",
|
3937 |
+
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
3938 |
+
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
3939 |
+
"dependencies": {
|
3940 |
+
"domelementtype": "^2.3.0"
|
3941 |
+
},
|
3942 |
+
"engines": {
|
3943 |
+
"node": ">= 4"
|
3944 |
+
},
|
3945 |
+
"funding": {
|
3946 |
+
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
3947 |
+
}
|
3948 |
+
},
|
3949 |
+
"node_modules/domutils": {
|
3950 |
+
"version": "3.1.0",
|
3951 |
+
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
|
3952 |
+
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
|
3953 |
+
"dependencies": {
|
3954 |
+
"dom-serializer": "^2.0.0",
|
3955 |
+
"domelementtype": "^2.3.0",
|
3956 |
+
"domhandler": "^5.0.3"
|
3957 |
+
},
|
3958 |
+
"funding": {
|
3959 |
+
"url": "https://github.com/fb55/domutils?sponsor=1"
|
3960 |
+
}
|
3961 |
+
},
|
3962 |
"node_modules/dotenv": {
|
3963 |
"version": "16.0.3",
|
3964 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
|
|
4097 |
"version": "4.0.0",
|
4098 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
4099 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
|
|
4100 |
"engines": {
|
4101 |
"node": ">=10"
|
4102 |
},
|
|
|
5080 |
"node": ">=12"
|
5081 |
}
|
5082 |
},
|
5083 |
+
"node_modules/htmlparser2": {
|
5084 |
+
"version": "8.0.2",
|
5085 |
+
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
5086 |
+
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
|
5087 |
+
"funding": [
|
5088 |
+
"https://github.com/fb55/htmlparser2?sponsor=1",
|
5089 |
+
{
|
5090 |
+
"type": "github",
|
5091 |
+
"url": "https://github.com/sponsors/fb55"
|
5092 |
+
}
|
5093 |
+
],
|
5094 |
+
"dependencies": {
|
5095 |
+
"domelementtype": "^2.3.0",
|
5096 |
+
"domhandler": "^5.0.3",
|
5097 |
+
"domutils": "^3.0.1",
|
5098 |
+
"entities": "^4.4.0"
|
5099 |
+
}
|
5100 |
+
},
|
5101 |
"node_modules/http-errors": {
|
5102 |
"version": "2.0.0",
|
5103 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
|
|
5368 |
"node": ">=8"
|
5369 |
}
|
5370 |
},
|
5371 |
+
"node_modules/is-plain-object": {
|
5372 |
+
"version": "5.0.0",
|
5373 |
+
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz",
|
5374 |
+
"integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
|
5375 |
+
"engines": {
|
5376 |
+
"node": ">=0.10.0"
|
5377 |
+
}
|
5378 |
+
},
|
5379 |
"node_modules/is-potential-custom-element-name": {
|
5380 |
"version": "1.0.1",
|
5381 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
|
|
6536 |
"hex-rgb": "^4.1.0"
|
6537 |
}
|
6538 |
},
|
6539 |
+
"node_modules/parse-srcset": {
|
6540 |
+
"version": "1.0.2",
|
6541 |
+
"resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz",
|
6542 |
+
"integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
|
6543 |
+
},
|
6544 |
"node_modules/parse5": {
|
6545 |
"version": "7.1.2",
|
6546 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
|
|
6832 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
6833 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
6834 |
},
|
6835 |
+
"node_modules/playwright": {
|
6836 |
+
"version": "1.43.1",
|
6837 |
+
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.43.1.tgz",
|
6838 |
+
"integrity": "sha512-V7SoH0ai2kNt1Md9E3Gwas5B9m8KR2GVvwZnAI6Pg0m3sh7UvgiYhRrhsziCmqMJNouPckiOhk8T+9bSAK0VIA==",
|
6839 |
+
"dependencies": {
|
6840 |
+
"playwright-core": "1.43.1"
|
6841 |
+
},
|
6842 |
+
"bin": {
|
6843 |
+
"playwright": "cli.js"
|
6844 |
+
},
|
6845 |
+
"engines": {
|
6846 |
+
"node": ">=16"
|
6847 |
+
},
|
6848 |
+
"optionalDependencies": {
|
6849 |
+
"fsevents": "2.3.2"
|
6850 |
+
}
|
6851 |
+
},
|
6852 |
+
"node_modules/playwright-core": {
|
6853 |
+
"version": "1.43.1",
|
6854 |
+
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.43.1.tgz",
|
6855 |
+
"integrity": "sha512-EI36Mto2Vrx6VF7rm708qSnesVQKbxEWvPrfA1IPY6HgczBplDx7ENtx+K2n4kJ41sLLkuGfmb0ZLSSXlDhqPg==",
|
6856 |
+
"bin": {
|
6857 |
+
"playwright-core": "cli.js"
|
6858 |
+
},
|
6859 |
+
"engines": {
|
6860 |
+
"node": ">=16"
|
6861 |
+
}
|
6862 |
+
},
|
6863 |
+
"node_modules/playwright/node_modules/fsevents": {
|
6864 |
+
"version": "2.3.2",
|
6865 |
+
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
6866 |
+
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
6867 |
+
"hasInstallScript": true,
|
6868 |
+
"optional": true,
|
6869 |
+
"os": [
|
6870 |
+
"darwin"
|
6871 |
+
],
|
6872 |
+
"engines": {
|
6873 |
+
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
6874 |
+
}
|
6875 |
+
},
|
6876 |
"node_modules/postcss": {
|
6877 |
"version": "8.4.35",
|
6878 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
|
|
7659 |
"rimraf": "bin.js"
|
7660 |
}
|
7661 |
},
|
7662 |
+
"node_modules/sanitize-html": {
|
7663 |
+
"version": "2.13.0",
|
7664 |
+
"resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.0.tgz",
|
7665 |
+
"integrity": "sha512-Xff91Z+4Mz5QiNSLdLWwjgBDm5b1RU6xBT0+12rapjiaR7SwfRdjw8f+6Rir2MXKLrDicRFHdb51hGOAxmsUIA==",
|
7666 |
+
"dependencies": {
|
7667 |
+
"deepmerge": "^4.2.2",
|
7668 |
+
"escape-string-regexp": "^4.0.0",
|
7669 |
+
"htmlparser2": "^8.0.0",
|
7670 |
+
"is-plain-object": "^5.0.0",
|
7671 |
+
"parse-srcset": "^1.0.2",
|
7672 |
+
"postcss": "^8.3.11"
|
7673 |
+
}
|
7674 |
+
},
|
7675 |
"node_modules/saslprep": {
|
7676 |
"version": "1.0.3",
|
7677 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
|
|
7722 |
"node": ">=v12.22.7"
|
7723 |
}
|
7724 |
},
|
7725 |
+
"node_modules/sbd": {
|
7726 |
+
"version": "1.0.19",
|
7727 |
+
"resolved": "https://registry.npmjs.org/sbd/-/sbd-1.0.19.tgz",
|
7728 |
+
"integrity": "sha512-b5RyZMGSrFuIB4AHdbv12uYHS8YGEJ36gtuvG3RflbJGY+T0dXmAL0E4vZjQqT2RsX0v+ZwVqhV2zsGr5aFK9w==",
|
7729 |
+
"dependencies": {
|
7730 |
+
"sanitize-html": "^2.3.2"
|
7731 |
+
}
|
7732 |
+
},
|
7733 |
"node_modules/secure-json-parse": {
|
7734 |
"version": "2.7.0",
|
7735 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
|
|
8677 |
"node": ">=14.0.0"
|
8678 |
}
|
8679 |
},
|
8680 |
+
"node_modules/tldts-core": {
|
8681 |
+
"version": "6.1.18",
|
8682 |
+
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.18.tgz",
|
8683 |
+
"integrity": "sha512-e4wx32F/7dMBSZyKAx825Yte3U0PQtZZ0bkWxYQiwLteRVnQ5zM40fEbi0IyNtwQssgJAk3GCr7Q+w39hX0VKA=="
|
8684 |
+
},
|
8685 |
+
"node_modules/tldts-experimental": {
|
8686 |
+
"version": "6.1.18",
|
8687 |
+
"resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-6.1.18.tgz",
|
8688 |
+
"integrity": "sha512-E9/pAIybo7/MPdsQSKcCDElgObk78Be1gFqO645LbfhL5HG597sOeRQ55EuvIHlTo1Ypyyl+F/V+p0CnrTu3uQ==",
|
8689 |
+
"dependencies": {
|
8690 |
+
"tldts-core": "^6.1.18"
|
8691 |
+
}
|
8692 |
+
},
|
8693 |
"node_modules/to-regex-range": {
|
8694 |
"version": "5.0.1",
|
8695 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
@@ -28,6 +28,7 @@
|
|
28 |
"@types/jsdom": "^21.1.1",
|
29 |
"@types/minimist": "^1.2.5",
|
30 |
"@types/parquetjs": "^0.10.3",
|
|
|
31 |
"@types/uuid": "^9.0.8",
|
32 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
33 |
"@typescript-eslint/parser": "^6.x",
|
@@ -52,9 +53,11 @@
|
|
52 |
},
|
53 |
"type": "module",
|
54 |
"dependencies": {
|
|
|
55 |
"@huggingface/hub": "^0.5.1",
|
56 |
"@huggingface/inference": "^2.6.3",
|
57 |
"@iconify-json/bi": "^1.1.21",
|
|
|
58 |
"@resvg/resvg-js": "^2.6.0",
|
59 |
"@xenova/transformers": "^2.16.1",
|
60 |
"autoprefixer": "^10.4.14",
|
@@ -76,10 +79,12 @@
|
|
76 |
"parquetjs": "^0.11.2",
|
77 |
"pino": "^9.0.0",
|
78 |
"pino-pretty": "^11.0.0",
|
|
|
79 |
"postcss": "^8.4.31",
|
80 |
"saslprep": "^1.0.3",
|
81 |
"satori": "^0.10.11",
|
82 |
"satori-html": "^0.3.2",
|
|
|
83 |
"serpapi": "^1.1.1",
|
84 |
"sharp": "^0.33.2",
|
85 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
28 |
"@types/jsdom": "^21.1.1",
|
29 |
"@types/minimist": "^1.2.5",
|
30 |
"@types/parquetjs": "^0.10.3",
|
31 |
+
"@types/sbd": "^1.0.5",
|
32 |
"@types/uuid": "^9.0.8",
|
33 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
34 |
"@typescript-eslint/parser": "^6.x",
|
|
|
53 |
},
|
54 |
"type": "module",
|
55 |
"dependencies": {
|
56 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
57 |
"@huggingface/hub": "^0.5.1",
|
58 |
"@huggingface/inference": "^2.6.3",
|
59 |
"@iconify-json/bi": "^1.1.21",
|
60 |
+
"@playwright/browser-chromium": "^1.43.1",
|
61 |
"@resvg/resvg-js": "^2.6.0",
|
62 |
"@xenova/transformers": "^2.16.1",
|
63 |
"autoprefixer": "^10.4.14",
|
|
|
79 |
"parquetjs": "^0.11.2",
|
80 |
"pino": "^9.0.0",
|
81 |
"pino-pretty": "^11.0.0",
|
82 |
+
"playwright": "^1.40.0",
|
83 |
"postcss": "^8.4.31",
|
84 |
"saslprep": "^1.0.3",
|
85 |
"satori": "^0.10.11",
|
86 |
"satori-html": "^0.3.2",
|
87 |
+
"sbd": "^1.0.19",
|
88 |
"serpapi": "^1.1.1",
|
89 |
"sharp": "^0.33.2",
|
90 |
"tailwind-scrollbar": "^3.0.0",
|
@@ -227,7 +227,7 @@
|
|
227 |
{#if webSearchSources?.length}
|
228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
229 |
<div class="text-gray-400">Sources:</div>
|
230 |
-
{#each webSearchSources as { link, title
|
231 |
<a
|
232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
233 |
href={link}
|
@@ -235,10 +235,10 @@
|
|
235 |
>
|
236 |
<img
|
237 |
class="h-3.5 w-3.5 rounded"
|
238 |
-
src="https://www.google.com/s2/favicons?sz=64&domain_url={hostname}"
|
239 |
alt="{title} favicon"
|
240 |
/>
|
241 |
-
<div>{hostname.replace(/^www\./, "")}</div>
|
242 |
</a>
|
243 |
{/each}
|
244 |
</div>
|
|
|
227 |
{#if webSearchSources?.length}
|
228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
229 |
<div class="text-gray-400">Sources:</div>
|
230 |
+
{#each webSearchSources as { link, title }}
|
231 |
<a
|
232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
233 |
href={link}
|
|
|
235 |
>
|
236 |
<img
|
237 |
class="h-3.5 w-3.5 rounded"
|
238 |
+
src="https://www.google.com/s2/favicons?sz=64&domain_url={new URL(link).hostname}"
|
239 |
alt="{title} favicon"
|
240 |
/>
|
241 |
+
<div>{new URL(link).hostname.replace(/^www\./, "")}</div>
|
242 |
</a>
|
243 |
{/each}
|
244 |
</div>
|
@@ -32,7 +32,12 @@ export async function embeddingEndpointHfApi(
|
|
32 |
"Content-Type": "application/json",
|
33 |
...(authorization ? { Authorization: authorization } : {}),
|
34 |
},
|
35 |
-
body: JSON.stringify({
|
|
|
|
|
|
|
|
|
|
|
36 |
});
|
37 |
|
38 |
if (!response.ok) {
|
|
|
32 |
"Content-Type": "application/json",
|
33 |
...(authorization ? { Authorization: authorization } : {}),
|
34 |
},
|
35 |
+
body: JSON.stringify({
|
36 |
+
inputs: {
|
37 |
+
source_sentence: batchInputs[0],
|
38 |
+
sentences: batchInputs.slice(1),
|
39 |
+
},
|
40 |
+
}),
|
41 |
});
|
42 |
|
43 |
if (!response.ok) {
|
@@ -1,26 +1,38 @@
|
|
1 |
import { Address6, Address4 } from "ip-address";
|
2 |
-
|
3 |
import dns from "node:dns";
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
dns.lookup(
|
8 |
-
if (err)
|
9 |
-
|
10 |
-
}
|
11 |
-
if (family === 4) {
|
12 |
-
const addr = new Address4(address);
|
13 |
-
resolve(addr.isInSubnet(new Address4("127.0.0.0/8")));
|
14 |
-
} else if (family === 6) {
|
15 |
-
const addr = new Address6(address);
|
16 |
-
resolve(
|
17 |
-
addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal()
|
18 |
-
);
|
19 |
-
} else {
|
20 |
-
reject(new Error("Unknown IP family"));
|
21 |
-
}
|
22 |
});
|
23 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
}
|
|
|
1 |
import { Address6, Address4 } from "ip-address";
|
|
|
2 |
import dns from "node:dns";
|
3 |
|
4 |
+
const dnsLookup = (hostname: string): Promise<{ address: string; family: number }> => {
|
5 |
+
return new Promise((resolve, reject) => {
|
6 |
+
dns.lookup(hostname, (err, address, family) => {
|
7 |
+
if (err) return reject(err);
|
8 |
+
resolve({ address, family });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
});
|
10 |
});
|
11 |
+
};
|
12 |
+
|
13 |
+
export async function isURLLocal(URL: URL): Promise<boolean> {
|
14 |
+
const { address, family } = await dnsLookup(URL.hostname);
|
15 |
+
|
16 |
+
if (family === 4) {
|
17 |
+
const addr = new Address4(address);
|
18 |
+
const localSubnet = new Address4("127.0.0.0/8");
|
19 |
+
return addr.isInSubnet(localSubnet);
|
20 |
+
}
|
21 |
+
|
22 |
+
if (family === 6) {
|
23 |
+
const addr = new Address6(address);
|
24 |
+
return addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal();
|
25 |
+
}
|
26 |
+
|
27 |
+
throw Error("Unknown IP family");
|
28 |
+
}
|
29 |
|
30 |
+
export function isURLStringLocal(url: string) {
|
31 |
+
try {
|
32 |
+
const urlObj = new URL(url);
|
33 |
+
return isURLLocal(urlObj);
|
34 |
+
} catch (e) {
|
35 |
+
// assume local if URL parsing fails
|
36 |
+
return true;
|
37 |
+
}
|
38 |
}
|
@@ -13,11 +13,9 @@ export async function preprocessMessages(
|
|
13 |
return await Promise.all(
|
14 |
structuredClone(messages).map(async (message, idx) => {
|
15 |
const webSearchContext = webSearch?.contextSources
|
16 |
-
.map(({ context }) => context)
|
17 |
-
.
|
18 |
-
|
19 |
-
.map(({ text }) => text)
|
20 |
-
.join(" ");
|
21 |
// start by adding websearch to the last message
|
22 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
23 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
@@ -27,7 +25,7 @@ export async function preprocessMessages(
|
|
27 |
.map((el) => el.content);
|
28 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
29 |
|
30 |
-
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
31 |
Today is ${currentDate} and here are the results:
|
32 |
=====================
|
33 |
${webSearchContext}
|
|
|
13 |
return await Promise.all(
|
14 |
structuredClone(messages).map(async (message, idx) => {
|
15 |
const webSearchContext = webSearch?.contextSources
|
16 |
+
.map(({ context }) => context.trim())
|
17 |
+
.join("\n\n----------\n\n");
|
18 |
+
|
|
|
|
|
19 |
// start by adding websearch to the last message
|
20 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
21 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
|
|
25 |
.map((el) => el.content);
|
26 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
27 |
|
28 |
+
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
29 |
Today is ${currentDate} and here are the results:
|
30 |
=====================
|
31 |
${webSearchContext}
|
@@ -3,40 +3,31 @@ import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
|
3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
4 |
|
5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
6 |
-
function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
7 |
return 1.0 - dot(embeddingA, embeddingB);
|
8 |
}
|
9 |
|
10 |
-
export async function
|
11 |
embeddingModel: EmbeddingBackendModel,
|
12 |
query: string,
|
13 |
-
sentences: string[]
|
14 |
-
|
15 |
-
): Promise<Embedding> {
|
16 |
const inputs = [
|
17 |
`${embeddingModel.preQuery}${query}`,
|
18 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
19 |
];
|
20 |
|
21 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
22 |
-
const output = await embeddingEndpoint({ inputs })
|
|
|
|
|
23 |
|
24 |
const queryEmbedding: Embedding = output[0];
|
25 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
};
|
33 |
-
}
|
34 |
-
);
|
35 |
-
|
36 |
-
distancesFromQuery.sort((a, b) => {
|
37 |
-
return a.distance - b.distance;
|
38 |
-
});
|
39 |
-
|
40 |
-
// Return the indexes of the closest topK sentences
|
41 |
-
return distancesFromQuery.slice(0, topK).map((item) => item.index);
|
42 |
}
|
|
|
3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
4 |
|
5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
6 |
+
export function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
7 |
return 1.0 - dot(embeddingA, embeddingB);
|
8 |
}
|
9 |
|
10 |
+
export async function getSentenceSimilarity(
|
11 |
embeddingModel: EmbeddingBackendModel,
|
12 |
query: string,
|
13 |
+
sentences: string[]
|
14 |
+
): Promise<{ distance: number; embedding: Embedding; idx: number }[]> {
|
|
|
15 |
const inputs = [
|
16 |
`${embeddingModel.preQuery}${query}`,
|
17 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
18 |
];
|
19 |
|
20 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
21 |
+
const output = await embeddingEndpoint({ inputs }).catch((err) => {
|
22 |
+
throw Error("Failed to generate embeddings for sentence similarity", { cause: err });
|
23 |
+
});
|
24 |
|
25 |
const queryEmbedding: Embedding = output[0];
|
26 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
27 |
|
28 |
+
return sentencesEmbeddings.map((sentenceEmbedding, idx) => ({
|
29 |
+
distance: innerProduct(queryEmbedding, sentenceEmbedding),
|
30 |
+
embedding: sentenceEmbedding,
|
31 |
+
idx,
|
32 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
}
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
2 |
+
import { getSentenceSimilarity } from "$lib/server/sentenceSimilarity";
|
3 |
+
|
4 |
+
/**
|
5 |
+
* Combines sentences together to reach the maximum character limit of the embedding model
|
6 |
+
* Improves performance considerably when using CPU embedding
|
7 |
+
*/
|
8 |
+
export async function getCombinedSentenceSimilarity(
|
9 |
+
embeddingModel: EmbeddingBackendModel,
|
10 |
+
query: string,
|
11 |
+
sentences: string[]
|
12 |
+
): ReturnType<typeof getSentenceSimilarity> {
|
13 |
+
const combinedSentences = sentences.reduce<{ text: string; indices: number[] }[]>(
|
14 |
+
(acc, sentence, idx) => {
|
15 |
+
const lastSentence = acc[acc.length - 1];
|
16 |
+
if (!lastSentence) return [{ text: sentence, indices: [idx] }];
|
17 |
+
if (lastSentence.text.length + sentence.length < embeddingModel.chunkCharLength) {
|
18 |
+
lastSentence.text += ` ${sentence}`;
|
19 |
+
lastSentence.indices.push(idx);
|
20 |
+
return acc;
|
21 |
+
}
|
22 |
+
return [...acc, { text: sentence, indices: [idx] }];
|
23 |
+
},
|
24 |
+
[]
|
25 |
+
);
|
26 |
+
|
27 |
+
const embeddings = await getSentenceSimilarity(
|
28 |
+
embeddingModel,
|
29 |
+
query,
|
30 |
+
combinedSentences.map(({ text }) => text)
|
31 |
+
);
|
32 |
+
|
33 |
+
return embeddings.flatMap((embedding, idx) => {
|
34 |
+
const { indices } = combinedSentences[idx];
|
35 |
+
return indices.map((i) => ({ ...embedding, idx: i }));
|
36 |
+
});
|
37 |
+
}
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
|
2 |
+
import type { EmbeddingBackendModel } from "../../embeddingModels";
|
3 |
+
import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
|
4 |
+
import { MarkdownElementType, type MarkdownElement } from "../markdown/types";
|
5 |
+
import { stringifyMarkdownElement } from "../markdown/utils/stringify";
|
6 |
+
import { getCombinedSentenceSimilarity } from "./combine";
|
7 |
+
import { flattenTree } from "./tree";
|
8 |
+
|
9 |
+
const MIN_CHARS = 3_000;
|
10 |
+
const SOFT_MAX_CHARS = 8_000;
|
11 |
+
|
12 |
+
export async function findContextSources(
|
13 |
+
sources: WebSearchScrapedSource[],
|
14 |
+
prompt: string,
|
15 |
+
embeddingModel: EmbeddingBackendModel
|
16 |
+
) {
|
17 |
+
const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
|
18 |
+
const markdownElems = sourcesMarkdownElems.flat();
|
19 |
+
|
20 |
+
// When using CPU embedding (transformersjs), join sentences together to the max character limit
|
21 |
+
// to reduce inference time
|
22 |
+
const embeddingFunc =
|
23 |
+
embeddingModel.endpoints[0].type === "transformersjs"
|
24 |
+
? getCombinedSentenceSimilarity
|
25 |
+
: getSentenceSimilarity;
|
26 |
+
|
27 |
+
const embeddings = await embeddingFunc(
|
28 |
+
embeddingModel,
|
29 |
+
prompt,
|
30 |
+
markdownElems
|
31 |
+
.map(stringifyMarkdownElement)
|
32 |
+
// Safety in case the stringified markdown elements are too long
|
33 |
+
// but chunking should have happened earlier
|
34 |
+
.map((elem) => elem.slice(0, embeddingModel.chunkCharLength))
|
35 |
+
);
|
36 |
+
|
37 |
+
const topEmbeddings = embeddings
|
38 |
+
.sort((a, b) => a.distance - b.distance)
|
39 |
+
.filter((embedding) => markdownElems[embedding.idx].type !== MarkdownElementType.Header);
|
40 |
+
|
41 |
+
let totalChars = 0;
|
42 |
+
const selectedMarkdownElems = new Set<MarkdownElement>();
|
43 |
+
const selectedEmbeddings: number[][] = [];
|
44 |
+
for (const embedding of topEmbeddings) {
|
45 |
+
const elem = markdownElems[embedding.idx];
|
46 |
+
|
47 |
+
// Ignore elements that are too similar to already selected elements
|
48 |
+
const tooSimilar = selectedEmbeddings.some(
|
49 |
+
(selectedEmbedding) => innerProduct(selectedEmbedding, embedding.embedding) < 0.01
|
50 |
+
);
|
51 |
+
if (tooSimilar) continue;
|
52 |
+
|
53 |
+
// Add element
|
54 |
+
if (!selectedMarkdownElems.has(elem)) {
|
55 |
+
selectedMarkdownElems.add(elem);
|
56 |
+
selectedEmbeddings.push(embedding.embedding);
|
57 |
+
totalChars += elem.content.length;
|
58 |
+
}
|
59 |
+
|
60 |
+
// Add element's parent (header)
|
61 |
+
if (elem.parent && !selectedMarkdownElems.has(elem.parent)) {
|
62 |
+
selectedMarkdownElems.add(elem.parent);
|
63 |
+
totalChars += elem.parent.content.length;
|
64 |
+
}
|
65 |
+
|
66 |
+
if (totalChars > SOFT_MAX_CHARS) break;
|
67 |
+
if (totalChars > MIN_CHARS && embedding.distance > 0.25) break;
|
68 |
+
}
|
69 |
+
|
70 |
+
const contextSources = sourcesMarkdownElems
|
71 |
+
.map<WebSearchUsedSource>((elems, idx) => {
|
72 |
+
const sourceSelectedElems = elems.filter((elem) => selectedMarkdownElems.has(elem));
|
73 |
+
const context = sourceSelectedElems.map(stringifyMarkdownElement).join("\n");
|
74 |
+
const source = sources[idx];
|
75 |
+
return { ...source, context };
|
76 |
+
})
|
77 |
+
.filter((contextSource) => contextSource.context.length > 0);
|
78 |
+
|
79 |
+
return contextSources;
|
80 |
+
}
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { MarkdownElement } from "../markdown/types";
|
2 |
+
|
3 |
+
export function flattenTree(elem: MarkdownElement): MarkdownElement[] {
|
4 |
+
if ("children" in elem) return [elem, ...elem.children.flatMap(flattenTree)];
|
5 |
+
return [elem];
|
6 |
+
}
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { collapseString, sanitizeString } from "./utils/nlp";
|
2 |
+
import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify";
|
3 |
+
import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types";
|
4 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
5 |
+
|
6 |
+
interface ConversionState {
|
7 |
+
defaultType:
|
8 |
+
| MarkdownElementType.Paragraph
|
9 |
+
| MarkdownElementType.BlockQuote
|
10 |
+
| MarkdownElementType.UnorderedListItem
|
11 |
+
| MarkdownElementType.OrderedListItem;
|
12 |
+
listDepth: number;
|
13 |
+
blockQuoteDepth: number;
|
14 |
+
}
|
15 |
+
export function htmlElementToMarkdownElements(
|
16 |
+
parent: HeaderElement,
|
17 |
+
elem: SerializedHTMLElement | string,
|
18 |
+
prevState: ConversionState = {
|
19 |
+
defaultType: MarkdownElementType.Paragraph,
|
20 |
+
listDepth: 0,
|
21 |
+
blockQuoteDepth: 0,
|
22 |
+
}
|
23 |
+
): MarkdownElement | MarkdownElement[] {
|
24 |
+
// Found text so create an element based on the previous state
|
25 |
+
if (typeof elem === "string") {
|
26 |
+
if (elem.trim().length === 0) return [];
|
27 |
+
if (
|
28 |
+
prevState.defaultType === MarkdownElementType.UnorderedListItem ||
|
29 |
+
prevState.defaultType === MarkdownElementType.OrderedListItem
|
30 |
+
) {
|
31 |
+
return {
|
32 |
+
parent,
|
33 |
+
type: prevState.defaultType,
|
34 |
+
content: elem,
|
35 |
+
depth: prevState.listDepth,
|
36 |
+
};
|
37 |
+
}
|
38 |
+
if (prevState.defaultType === MarkdownElementType.BlockQuote) {
|
39 |
+
return {
|
40 |
+
parent,
|
41 |
+
type: prevState.defaultType,
|
42 |
+
content: elem,
|
43 |
+
depth: prevState.blockQuoteDepth,
|
44 |
+
};
|
45 |
+
}
|
46 |
+
return { parent, type: prevState.defaultType, content: elem };
|
47 |
+
}
|
48 |
+
|
49 |
+
const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;
|
50 |
+
|
51 |
+
// Update the state based on the current element
|
52 |
+
const state: ConversionState = { ...prevState };
|
53 |
+
if (type === MarkdownElementType.UnorderedList || type === MarkdownElementType.OrderedList) {
|
54 |
+
state.listDepth += 1;
|
55 |
+
state.defaultType =
|
56 |
+
type === MarkdownElementType.UnorderedList
|
57 |
+
? MarkdownElementType.UnorderedListItem
|
58 |
+
: MarkdownElementType.OrderedListItem;
|
59 |
+
}
|
60 |
+
if (type === MarkdownElementType.BlockQuote) {
|
61 |
+
state.defaultType = MarkdownElementType.BlockQuote;
|
62 |
+
state.blockQuoteDepth += 1;
|
63 |
+
}
|
64 |
+
|
65 |
+
// Headers
|
66 |
+
if (type === MarkdownElementType.Header) {
|
67 |
+
return {
|
68 |
+
parent,
|
69 |
+
type,
|
70 |
+
level: Number(elem.tagName[1]),
|
71 |
+
content: collapseString(stringifyHTMLElements(elem.content)),
|
72 |
+
children: [],
|
73 |
+
};
|
74 |
+
}
|
75 |
+
|
76 |
+
// Code blocks
|
77 |
+
if (type === MarkdownElementType.CodeBlock) {
|
78 |
+
return {
|
79 |
+
parent,
|
80 |
+
type,
|
81 |
+
content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
|
82 |
+
};
|
83 |
+
}
|
84 |
+
|
85 |
+
// Typical case, we want to flatten the DOM and only create elements when we see text
|
86 |
+
return elem.content.flatMap((el) => htmlElementToMarkdownElements(parent, el, state));
|
87 |
+
}
|
88 |
+
|
89 |
+
export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
|
90 |
+
return elements.reduce<MarkdownElement[]>((acc, elem) => {
|
91 |
+
const last = acc[acc.length - 1];
|
92 |
+
if (last && last.type === MarkdownElementType.Paragraph && last.type === elem.type) {
|
93 |
+
last.content += elem.content;
|
94 |
+
return acc;
|
95 |
+
}
|
96 |
+
return [...acc, elem];
|
97 |
+
}, []);
|
98 |
+
}
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
2 |
+
import { htmlElementToMarkdownElements, mergeAdjacentElements } from "./fromHtml";
|
3 |
+
import type { HeaderElement, MarkdownElement } from "./types";
|
4 |
+
import { MarkdownElementType } from "./types";
|
5 |
+
import { chunkElements } from "./utils/chunk";
|
6 |
+
|
7 |
+
/**
|
8 |
+
* Converts HTML elements to Markdown elements and creates a tree based on header tags
|
9 |
+
* For example: h1 [h2 [p p blockquote] h2 [h3 [...] ] ]
|
10 |
+
**/
|
11 |
+
export function htmlToMarkdownTree(
|
12 |
+
title: string,
|
13 |
+
htmlElements: SerializedHTMLElement[],
|
14 |
+
maxCharsPerElem: number
|
15 |
+
): HeaderElement {
|
16 |
+
let parent: HeaderElement = {
|
17 |
+
type: MarkdownElementType.Header,
|
18 |
+
level: 1,
|
19 |
+
parent: null,
|
20 |
+
content: title,
|
21 |
+
children: [],
|
22 |
+
};
|
23 |
+
|
24 |
+
const markdownElements = chunkElements(
|
25 |
+
mergeAdjacentElements(
|
26 |
+
htmlElements.flatMap((elem) => htmlElementToMarkdownElements(parent, elem))
|
27 |
+
),
|
28 |
+
maxCharsPerElem
|
29 |
+
);
|
30 |
+
|
31 |
+
for (const elem of markdownElements) {
|
32 |
+
if (elem.type !== MarkdownElementType.Header) {
|
33 |
+
elem.parent = parent;
|
34 |
+
parent.children.push(elem);
|
35 |
+
continue;
|
36 |
+
}
|
37 |
+
|
38 |
+
// add 1 to current level to offset for the title being level 1
|
39 |
+
elem.level += 1;
|
40 |
+
|
41 |
+
// Pop up header levels until reaching the same level as the current header
|
42 |
+
// or until we reach the root
|
43 |
+
inner: while (parent !== null && parent.parent !== null) {
|
44 |
+
if (parent.level < elem.level) break inner;
|
45 |
+
parent = parent.parent;
|
46 |
+
}
|
47 |
+
parent.children.push(elem);
|
48 |
+
parent = elem;
|
49 |
+
}
|
50 |
+
|
51 |
+
// Pop up to the root
|
52 |
+
while (parent.parent !== null) {
|
53 |
+
parent = parent.parent;
|
54 |
+
}
|
55 |
+
return parent;
|
56 |
+
}
|
57 |
+
|
58 |
+
export function removeParents<T extends MarkdownElement>(elem: T): T {
|
59 |
+
if ("children" in elem) {
|
60 |
+
return { ...elem, parent: null, children: elem.children.map((child) => removeParents(child)) };
|
61 |
+
}
|
62 |
+
return { ...elem, parent: null };
|
63 |
+
}
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* eslint-disable-next-line no-shadow */
|
2 |
+
export enum MarkdownElementType {
|
3 |
+
Header = "HEADER",
|
4 |
+
Paragraph = "PARAGRAPH",
|
5 |
+
BlockQuote = "BLOCKQUOTE",
|
6 |
+
CodeBlock = "CODE_BLOCK",
|
7 |
+
|
8 |
+
UnorderedList = "UNORDERED_LIST",
|
9 |
+
OrderedList = "ORDERED_LIST",
|
10 |
+
UnorderedListItem = "UNORDERED_LIST_ITEM",
|
11 |
+
OrderedListItem = "ORDERED_LIST_ITEM",
|
12 |
+
}
|
13 |
+
|
14 |
+
interface BaseMarkdownElement<T = MarkdownElementType> {
|
15 |
+
type: T;
|
16 |
+
content: string;
|
17 |
+
parent: HeaderElement | null;
|
18 |
+
}
|
19 |
+
|
20 |
+
export interface HeaderElement extends BaseMarkdownElement<MarkdownElementType.Header> {
|
21 |
+
level: number;
|
22 |
+
children: MarkdownElement[];
|
23 |
+
}
|
24 |
+
type ListItem = MarkdownElementType.UnorderedListItem | MarkdownElementType.OrderedListItem;
|
25 |
+
interface ListItemElement extends BaseMarkdownElement<ListItem> {
|
26 |
+
depth: number;
|
27 |
+
}
|
28 |
+
interface BlockQuoteElement extends BaseMarkdownElement<MarkdownElementType.BlockQuote> {
|
29 |
+
depth: number;
|
30 |
+
}
|
31 |
+
interface ParagraphElement extends BaseMarkdownElement<MarkdownElementType.Paragraph> {}
|
32 |
+
interface CodeBlockElement extends BaseMarkdownElement<MarkdownElementType.CodeBlock> {}
|
33 |
+
|
34 |
+
export type MarkdownElement =
|
35 |
+
| HeaderElement
|
36 |
+
| ParagraphElement
|
37 |
+
| BlockQuoteElement
|
38 |
+
| CodeBlockElement
|
39 |
+
| ListItemElement;
|
40 |
+
|
41 |
+
export const tagNameMap: Record<string, MarkdownElementType> = {
|
42 |
+
h1: MarkdownElementType.Header,
|
43 |
+
h2: MarkdownElementType.Header,
|
44 |
+
h3: MarkdownElementType.Header,
|
45 |
+
h4: MarkdownElementType.Header,
|
46 |
+
h5: MarkdownElementType.Header,
|
47 |
+
h6: MarkdownElementType.Header,
|
48 |
+
div: MarkdownElementType.Paragraph,
|
49 |
+
p: MarkdownElementType.Paragraph,
|
50 |
+
blockquote: MarkdownElementType.BlockQuote,
|
51 |
+
pre: MarkdownElementType.CodeBlock,
|
52 |
+
ul: MarkdownElementType.UnorderedList,
|
53 |
+
ol: MarkdownElementType.OrderedList,
|
54 |
+
li: MarkdownElementType.UnorderedListItem,
|
55 |
+
};
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { sentences as splitBySentences } from "sbd";
|
2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
3 |
+
|
4 |
+
export function chunkElements(elements: MarkdownElement[], maxLength: number): MarkdownElement[] {
|
5 |
+
return elements.flatMap((elem) => {
|
6 |
+
// Can't split headers because it would break the tree, and this situation should be rare
|
7 |
+
// so we just cut off the end
|
8 |
+
if (elem.type === MarkdownElementType.Header) {
|
9 |
+
return { ...elem, content: elem.content.slice(0, maxLength) };
|
10 |
+
}
|
11 |
+
const contentChunks = enforceMaxLength(elem.content, maxLength);
|
12 |
+
return contentChunks.map<MarkdownElement>((content) => ({ ...elem, content }));
|
13 |
+
});
|
14 |
+
}
|
15 |
+
|
16 |
+
const delimitersByPriority = ["?", "!", ".", ";", ":", ",", "|", " - ", " ", "-"];
|
17 |
+
function enforceMaxLength(text: string, maxLength: number): string[] {
|
18 |
+
if (text.length <= maxLength) return [text].filter(Boolean);
|
19 |
+
return splitBySentences(text)
|
20 |
+
.flatMap((sentence) => {
|
21 |
+
if (sentence.length <= maxLength) return sentence;
|
22 |
+
|
23 |
+
// Discover all necessary split points to fit the sentence within the max length
|
24 |
+
const indices: [number, number][] = [];
|
25 |
+
while ((indices.at(-1)?.[1] ?? 0) < sentence.length) {
|
26 |
+
const prevIndex = indices.at(-1)?.[1] ?? 0;
|
27 |
+
|
28 |
+
// Remaining text fits within maxLength
|
29 |
+
if (prevIndex + maxLength >= sentence.length) {
|
30 |
+
indices.push([prevIndex, sentence.length]);
|
31 |
+
continue;
|
32 |
+
}
|
33 |
+
|
34 |
+
const bestDelimiter = delimitersByPriority.find(
|
35 |
+
(delimiter) => sentence.lastIndexOf(delimiter, prevIndex + maxLength) !== -1
|
36 |
+
);
|
37 |
+
// Fallback in the unusual case that no delimiter is found
|
38 |
+
if (!bestDelimiter) {
|
39 |
+
indices.push([prevIndex, prevIndex + maxLength]);
|
40 |
+
continue;
|
41 |
+
}
|
42 |
+
|
43 |
+
const closestDelimiter = sentence.lastIndexOf(bestDelimiter, prevIndex + maxLength);
|
44 |
+
indices.push([prevIndex, Math.max(prevIndex + 1, closestDelimiter)]);
|
45 |
+
}
|
46 |
+
|
47 |
+
return indices.map((sliceIndices) => sentence.slice(...sliceIndices));
|
48 |
+
})
|
49 |
+
.reduce<string[]>(
|
50 |
+
(chunks, sentence) => {
|
51 |
+
const lastChunk = chunks[chunks.length - 1];
|
52 |
+
if (lastChunk.length + sentence.length <= maxLength) {
|
53 |
+
return [...chunks.slice(0, -1), lastChunk + sentence];
|
54 |
+
}
|
55 |
+
return [...chunks, sentence];
|
56 |
+
},
|
57 |
+
[""]
|
58 |
+
)
|
59 |
+
.filter(Boolean);
|
60 |
+
}
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/** Remove excess whitespace and newlines */
|
2 |
+
export const sanitizeString = (str: string) =>
|
3 |
+
str
|
4 |
+
.split("\n")
|
5 |
+
.map((s) => s.trim())
|
6 |
+
.filter(Boolean)
|
7 |
+
.join("\n")
|
8 |
+
.replaceAll(/ +/g, " ");
|
9 |
+
|
10 |
+
/** Collapses a string into a single line */
|
11 |
+
export const collapseString = (str: string) => sanitizeString(str.replaceAll(/\n/g, " "));
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "../../scrape/types";
|
2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
3 |
+
|
4 |
+
// --- Markdown Elements ---
|
5 |
+
|
6 |
+
/** Converts markdown element to a string with formatting */
|
7 |
+
export function stringifyMarkdownElement(elem: MarkdownElement): string {
|
8 |
+
const content = elem.content.trim();
|
9 |
+
if (elem.type === MarkdownElementType.Header) return `${"#".repeat(elem.level)} ${content}\n\n`;
|
10 |
+
if (elem.type === MarkdownElementType.BlockQuote) {
|
11 |
+
return `${"> ".repeat(elem.depth)}${content}\n\n`;
|
12 |
+
}
|
13 |
+
if (elem.type === MarkdownElementType.CodeBlock) return `\`\`\`\n${content}\n\`\`\`\n\n`;
|
14 |
+
|
15 |
+
if (elem.type === MarkdownElementType.UnorderedListItem) return `- ${content}\n`;
|
16 |
+
if (elem.type === MarkdownElementType.OrderedListItem) {
|
17 |
+
const siblings = elem.parent?.children ?? [elem];
|
18 |
+
const currentIndex = siblings.indexOf(elem);
|
19 |
+
const lastAdjacentIndex = siblings
|
20 |
+
.slice(currentIndex + 1)
|
21 |
+
.findLastIndex((child) => child.type === MarkdownElementType.OrderedListItem);
|
22 |
+
const order = currentIndex - lastAdjacentIndex + 1;
|
23 |
+
return `${order}. ${content}\n`;
|
24 |
+
}
|
25 |
+
|
26 |
+
return `${content}\n\n`;
|
27 |
+
}
|
28 |
+
|
29 |
+
// ----- HTML Elements -----
|
30 |
+
|
31 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
32 |
+
export function stringifyHTMLElements(elems: (SerializedHTMLElement | string)[]): string {
|
33 |
+
return elems.map(stringifyHTMLElement).join("").trim();
|
34 |
+
}
|
35 |
+
|
36 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
37 |
+
export function stringifyHTMLElement(elem: SerializedHTMLElement | string): string {
|
38 |
+
if (typeof elem === "string") return elem;
|
39 |
+
if (elem.tagName === "br") return "\n";
|
40 |
+
|
41 |
+
const content = elem.content.map(stringifyHTMLElement).join("");
|
42 |
+
if (content.length === 0) return content;
|
43 |
+
|
44 |
+
if (elem.tagName === "strong" || elem.tagName === "b") return `**${content}**`;
|
45 |
+
if (elem.tagName === "em" || elem.tagName === "i") return `*${content}*`;
|
46 |
+
if (elem.tagName === "s" || elem.tagName === "strike") return `~~${content}~~`;
|
47 |
+
|
48 |
+
if (elem.tagName === "code" || elem.tagName === "var" || elem.tagName === "tt") {
|
49 |
+
return `\`${content}\``;
|
50 |
+
}
|
51 |
+
|
52 |
+
if (elem.tagName === "sup") return `<sup>${content}</sup>`;
|
53 |
+
if (elem.tagName === "sub") return `<sub>${content}</sub>`;
|
54 |
+
|
55 |
+
if (elem.tagName === "a" && content.trim().length > 0) {
|
56 |
+
const href = elem.attributes.href;
|
57 |
+
if (!href) return elem.content.map(stringifyHTMLElement).join("");
|
58 |
+
return `[${elem.content.map(stringifyHTMLElement).join("")}](${href})`;
|
59 |
+
}
|
60 |
+
|
61 |
+
return elem.content.map(stringifyHTMLElement).join("");
|
62 |
+
}
|
63 |
+
|
64 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
65 |
+
export function stringifyHTMLElementsUnformatted(
|
66 |
+
elems: (SerializedHTMLElement | string)[]
|
67 |
+
): string {
|
68 |
+
return elems.map(stringifyHTMLElementUnformatted).join("");
|
69 |
+
}
|
70 |
+
|
71 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
72 |
+
function stringifyHTMLElementUnformatted(elem: SerializedHTMLElement | string): string {
|
73 |
+
if (typeof elem === "string") return elem;
|
74 |
+
return elem.content.map(stringifyHTMLElementUnformatted).join("");
|
75 |
+
}
|
@@ -1,41 +0,0 @@
|
|
1 |
-
import { JSDOM, VirtualConsole } from "jsdom";
|
2 |
-
|
3 |
-
export async function parseWeb(url: string) {
|
4 |
-
const abortController = new AbortController();
|
5 |
-
setTimeout(() => abortController.abort(), 10000);
|
6 |
-
const r = await fetch(url, { signal: abortController.signal, credentials: "omit" }).catch();
|
7 |
-
|
8 |
-
if (r.headers.get("content-type")?.includes("text/html")) {
|
9 |
-
const virtualConsole = new VirtualConsole();
|
10 |
-
virtualConsole.on("error", () => {
|
11 |
-
// No-op to skip console errors.
|
12 |
-
});
|
13 |
-
|
14 |
-
// put the html string into a DOM
|
15 |
-
const dom = new JSDOM((await r.text()) ?? "", {
|
16 |
-
virtualConsole,
|
17 |
-
});
|
18 |
-
|
19 |
-
const { document } = dom.window;
|
20 |
-
const paragraphs = document.querySelectorAll("p, table, pre, ul, ol");
|
21 |
-
|
22 |
-
if (!paragraphs.length) {
|
23 |
-
throw new Error(`webpage doesn't have any parseable element`);
|
24 |
-
}
|
25 |
-
const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
|
26 |
-
|
27 |
-
// combine text contents from paragraphs and then remove newlines and multiple spaces
|
28 |
-
const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
|
29 |
-
|
30 |
-
return text;
|
31 |
-
} else if (
|
32 |
-
r.headers.get("content-type")?.includes("text/plain") ||
|
33 |
-
r.headers.get("content-type")?.includes("text/markdown")
|
34 |
-
) {
|
35 |
-
const text = await r.text();
|
36 |
-
// JSON.stringify is needed to turn string concatenation into a single string (ex: "Hello, " + "world!" -> "Hello, world!")
|
37 |
-
return JSON.stringify(text);
|
38 |
-
} else {
|
39 |
-
throw new Error("Unsupported content type");
|
40 |
-
}
|
41 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,179 +1,103 @@
|
|
1 |
-
import { searchWeb } from "$lib/server/websearch/searchWeb";
|
2 |
-
import { generateQuery } from "$lib/server/websearch/generateQuery";
|
3 |
-
import { parseWeb } from "$lib/server/websearch/parseWeb";
|
4 |
-
import { chunk } from "$lib/utils/chunk";
|
5 |
-
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
|
6 |
-
import { getWebSearchProvider } from "./searchWeb";
|
7 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
8 |
-
import { env } from "$env/dynamic/private";
|
9 |
|
10 |
import type { Conversation } from "$lib/types/Conversation";
|
11 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
12 |
import type { Message } from "$lib/types/Message";
|
13 |
-
import type { WebSearch,
|
14 |
import type { Assistant } from "$lib/types/Assistant";
|
15 |
|
16 |
-
import {
|
17 |
-
import
|
18 |
-
import {
|
|
|
19 |
|
20 |
-
const
|
21 |
-
const
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
27 |
|
28 |
export async function runWebSearch(
|
29 |
conv: Conversation,
|
30 |
messages: Message[],
|
31 |
updatePad: (upd: MessageUpdate) => void,
|
32 |
ragSettings?: Assistant["rag"]
|
33 |
-
) {
|
34 |
const prompt = messages[messages.length - 1].content;
|
35 |
-
const
|
36 |
-
|
37 |
-
|
38 |
-
results: [],
|
39 |
-
contextSources: [],
|
40 |
-
createdAt: new Date(),
|
41 |
-
updatedAt: new Date(),
|
42 |
-
};
|
43 |
-
|
44 |
-
function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
|
45 |
-
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
46 |
-
}
|
47 |
|
48 |
try {
|
49 |
-
// if the assistant specified direct links, skip the websearch
|
50 |
-
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
51 |
-
appendUpdate("Using links specified in Assistant");
|
52 |
-
|
53 |
-
let linksToUse = [...ragSettings.allowedLinks];
|
54 |
-
|
55 |
-
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
56 |
-
const localLinks = await Promise.all(
|
57 |
-
linksToUse.map(async (link) => {
|
58 |
-
try {
|
59 |
-
const url = new URL(link);
|
60 |
-
return await isURLLocal(url);
|
61 |
-
} catch (e) {
|
62 |
-
return true;
|
63 |
-
}
|
64 |
-
})
|
65 |
-
);
|
66 |
-
|
67 |
-
linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
|
68 |
-
}
|
69 |
-
|
70 |
-
webSearch.results = linksToUse.map((link) => {
|
71 |
-
return { link, hostname: new URL(link).hostname, title: "", text: "" };
|
72 |
-
});
|
73 |
-
} else {
|
74 |
-
webSearch.searchQuery = await generateQuery(messages);
|
75 |
-
const searchProvider = getWebSearchProvider();
|
76 |
-
appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
|
77 |
-
|
78 |
-
let filters = "";
|
79 |
-
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
80 |
-
appendUpdate("Filtering on specified domains");
|
81 |
-
filters += ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR ");
|
82 |
-
}
|
83 |
-
|
84 |
-
// handle the global lists
|
85 |
-
filters +=
|
86 |
-
allowList.map((item) => "site:" + item).join(" OR ") +
|
87 |
-
" " +
|
88 |
-
blockList.map((item) => "-site:" + item).join(" ");
|
89 |
-
|
90 |
-
webSearch.searchQuery = filters + " " + webSearch.searchQuery;
|
91 |
-
|
92 |
-
const results = await searchWeb(webSearch.searchQuery);
|
93 |
-
webSearch.results =
|
94 |
-
(results.organic_results &&
|
95 |
-
results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
|
96 |
-
try {
|
97 |
-
const { title, link, text } = el;
|
98 |
-
const { hostname } = new URL(link);
|
99 |
-
return { title, link, hostname, text };
|
100 |
-
} catch (e) {
|
101 |
-
// Ignore Errors
|
102 |
-
return null;
|
103 |
-
}
|
104 |
-
})) ??
|
105 |
-
[];
|
106 |
-
}
|
107 |
-
|
108 |
-
webSearch.results = webSearch.results.filter((value) => value !== null);
|
109 |
-
webSearch.results = webSearch.results
|
110 |
-
.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
|
111 |
-
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
|
112 |
-
|
113 |
-
// fetch the model
|
114 |
const embeddingModel =
|
115 |
embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
|
116 |
-
|
117 |
if (!embeddingModel) {
|
118 |
-
throw
|
119 |
}
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
paragraphChunks = nestedParagraphChunks.flat();
|
142 |
-
if (!paragraphChunks.length) {
|
143 |
-
throw new Error("No text found on the first 5 results");
|
144 |
-
}
|
145 |
-
} else {
|
146 |
-
throw new Error("No results found for this search query");
|
147 |
}
|
148 |
|
|
|
149 |
appendUpdate("Extracting relevant information");
|
150 |
-
const
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
const { source } = paragraphChunks[idx];
|
158 |
-
const contextWithId = { idx, text: texts[idx] };
|
159 |
-
const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
|
160 |
-
if (usedSource) {
|
161 |
-
usedSource.context.push(contextWithId);
|
162 |
-
} else {
|
163 |
-
webSearch.contextSources.push({ ...source, context: [contextWithId] });
|
164 |
-
}
|
165 |
-
}
|
166 |
updatePad({
|
167 |
type: "webSearch",
|
168 |
messageType: "sources",
|
169 |
message: "sources",
|
170 |
-
sources:
|
171 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
} catch (searchError) {
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
-
|
178 |
-
return webSearch;
|
179 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
|
|
2 |
|
3 |
import type { Conversation } from "$lib/types/Conversation";
|
4 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
5 |
import type { Message } from "$lib/types/Message";
|
6 |
+
import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
|
7 |
import type { Assistant } from "$lib/types/Assistant";
|
8 |
|
9 |
+
import { search } from "./search/search";
|
10 |
+
import { scrape } from "./scrape/scrape";
|
11 |
+
import { findContextSources } from "./embed/embed";
|
12 |
+
import { removeParents } from "./markdown/tree";
|
13 |
|
14 |
+
const MAX_N_PAGES_TO_SCRAPE = 8 as const;
|
15 |
+
const MAX_N_PAGES_TO_EMBED = 5 as const;
|
16 |
|
17 |
+
export type AppendUpdate = (message: string, args?: string[], type?: "error" | "update") => void;
|
18 |
+
const makeAppendUpdate =
|
19 |
+
(updatePad: (upd: MessageUpdate) => void): AppendUpdate =>
|
20 |
+
(message, args, type) =>
|
21 |
+
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
22 |
|
23 |
/**
 * Runs the full web-search pipeline for the latest user message:
 * search -> scrape -> embed/rank, streaming progress through `updatePad`.
 *
 * Never rejects: on any failure it emits an "error" update and resolves to an
 * empty WebSearch object so the chat flow can continue without results.
 *
 * @param conv        conversation; `conv.embeddingModel` selects the embedder
 * @param messages    chat history; the last entry is used as the search prompt
 * @param updatePad   sink for incremental MessageUpdate progress events
 * @param ragSettings optional assistant RAG config forwarded to `search`
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
): Promise<WebSearch> {
	// The most recent message is the prompt the search is grounded on
	const prompt = messages[messages.length - 1].content;
	const createdAt = new Date();
	const updatedAt = new Date();
	const appendUpdate = makeAppendUpdate(updatePad);

	try {
		// Fall back to the default embedding model if the conversation's is gone
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		// Search the web
		const { searchQuery, pages } = await search(messages, ragSettings, appendUpdate);
		if (pages.length === 0) throw Error("No results found for this search query");

		// Scrape pages
		appendUpdate("Browsing search results");

		// Scrape up to MAX_N_PAGES_TO_SCRAPE pages in parallel, drop pages that
		// failed or produced an empty markdown tree, then keep at most
		// MAX_N_PAGES_TO_EMBED of them for the embedding step.
		const scrapedPages = await Promise.all(
			pages
				.slice(0, MAX_N_PAGES_TO_SCRAPE)
				.map(scrape(appendUpdate, embeddingModel.chunkCharLength))
		).then((allScrapedPages) =>
			allScrapedPages
				.filter((p): p is WebSearchScrapedSource => Boolean(p))
				.filter((p) => p.page.markdownTree.children.length > 0)
				.slice(0, MAX_N_PAGES_TO_EMBED)
		);

		if (!scrapedPages.length) {
			throw Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
		}

		// Chunk the text of each of the elements and find the most similar chunks to the prompt
		appendUpdate("Extracting relevant information");
		// removeParents drops the upward links from each markdown node —
		// presumably so the tree is acyclic/serializable; confirm in markdown/tree.ts
		const contextSources = await findContextSources(scrapedPages, prompt, embeddingModel).then(
			(ctxSources) =>
				ctxSources.map((source) => ({
					...source,
					page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
				}))
		);
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: contextSources,
		});

		return {
			prompt,
			searchQuery,
			results: scrapedPages.map(({ page, ...source }) => ({
				...source,
				page: { ...page, markdownTree: removeParents(page.markdownTree) },
			})),
			contextSources,
			createdAt,
			updatedAt,
		};
	} catch (searchError) {
		const message = searchError instanceof Error ? searchError.message : String(searchError);
		console.error(message);
		appendUpdate("An error occurred", [JSON.stringify(message)], "error");
		// Swallow the error and return an empty result so the caller keeps going
		return {
			prompt,
			searchQuery: "",
			results: [],
			contextSources: [],
			createdAt,
			updatedAt,
		};
	}
}
|
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { SerializedHTMLElement } from "./types";
|
2 |
+
|
3 |
+
/** Configuration for the inlined DBSCAN clustering implementation below. */
interface DBSCANOptions<T> {
	// Points to cluster
	dataset: T[];
	// Neighborhood radius (the implementation defaults this to 1)
	epsilon?: number;
	// Predicate deciding whether a distance counts as "within epsilon"
	epsilonCompare?: (distance: number, epsilon: number) => boolean;
	// Minimum neighbor count for a point to seed/extend a cluster (defaults to 2)
	minimumPoints?: number;
	// Pairwise distance metric between two points
	distanceFunction: (a: T, b: T) => number;
}
|
10 |
+
|
11 |
+
export function spatialParser() {
|
12 |
+
/**
|
13 |
+
* Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
|
14 |
+
*/
|
15 |
+
const DBSCAN = <T>({
|
16 |
+
dataset,
|
17 |
+
epsilon = 1,
|
18 |
+
epsilonCompare = (dist, e) => dist < e,
|
19 |
+
minimumPoints = 2,
|
20 |
+
distanceFunction,
|
21 |
+
}: DBSCANOptions<T>) => {
|
22 |
+
const visitedIndices: Record<number, boolean> = {};
|
23 |
+
const isVisited = (i: number) => visitedIndices[i];
|
24 |
+
const markVisited = (i: number) => {
|
25 |
+
visitedIndices[i] = true;
|
26 |
+
};
|
27 |
+
|
28 |
+
const clusteredIndices: Record<number, boolean> = {};
|
29 |
+
const isClustered = (i: number) => clusteredIndices[i];
|
30 |
+
const markClustered = (i: number) => {
|
31 |
+
clusteredIndices[i] = true;
|
32 |
+
};
|
33 |
+
|
34 |
+
const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
|
35 |
+
for (let i = 0; i < sourceArray.length; i += 1) {
|
36 |
+
const item = sourceArray[i];
|
37 |
+
if (targetArray.indexOf(item) < 0) {
|
38 |
+
targetArray.push(item);
|
39 |
+
}
|
40 |
+
}
|
41 |
+
};
|
42 |
+
|
43 |
+
const findNeighbors = (index: number) => {
|
44 |
+
const neighbors = [];
|
45 |
+
for (let other = 0; other < dataset.length; other += 1) {
|
46 |
+
const distance = distanceFunction(dataset[index], dataset[other]);
|
47 |
+
if (epsilonCompare(distance, epsilon)) {
|
48 |
+
neighbors.push(other);
|
49 |
+
}
|
50 |
+
}
|
51 |
+
return neighbors;
|
52 |
+
};
|
53 |
+
|
54 |
+
const noise: number[] = [];
|
55 |
+
const addNoise = (i: number) => noise.push(i);
|
56 |
+
|
57 |
+
const clusters: number[][] = [];
|
58 |
+
const createCluster = () => clusters.push([]) - 1;
|
59 |
+
const addIndexToCluster = (c: number, i: number) => {
|
60 |
+
clusters[c].push(i);
|
61 |
+
markClustered(i);
|
62 |
+
};
|
63 |
+
|
64 |
+
const expandCluster = (c: number, neighbors: number[]) => {
|
65 |
+
for (let i = 0; i < neighbors.length; i += 1) {
|
66 |
+
const neighborIndex = neighbors[i];
|
67 |
+
if (!isVisited(neighborIndex)) {
|
68 |
+
markVisited(neighborIndex);
|
69 |
+
|
70 |
+
const secondaryNeighbors = findNeighbors(neighborIndex);
|
71 |
+
if (secondaryNeighbors.length >= minimumPoints) {
|
72 |
+
uniqueMerge(neighbors, secondaryNeighbors);
|
73 |
+
}
|
74 |
+
}
|
75 |
+
|
76 |
+
if (!isClustered(neighborIndex)) {
|
77 |
+
addIndexToCluster(c, neighborIndex);
|
78 |
+
}
|
79 |
+
}
|
80 |
+
};
|
81 |
+
|
82 |
+
dataset.forEach((_, index) => {
|
83 |
+
if (!isVisited(index)) {
|
84 |
+
markVisited(index);
|
85 |
+
|
86 |
+
const neighbors = findNeighbors(index);
|
87 |
+
if (neighbors.length < minimumPoints) {
|
88 |
+
addNoise(index);
|
89 |
+
} else {
|
90 |
+
const clusterIndex = createCluster();
|
91 |
+
addIndexToCluster(clusterIndex, index);
|
92 |
+
expandCluster(clusterIndex, neighbors);
|
93 |
+
}
|
94 |
+
}
|
95 |
+
});
|
96 |
+
|
97 |
+
return { clusters, noise };
|
98 |
+
};
|
99 |
+
|
100 |
+
// -----------
|
101 |
+
// Scraping implementation
|
102 |
+
|
103 |
+
const IgnoredTagsList = [
|
104 |
+
"footer",
|
105 |
+
"nav",
|
106 |
+
"aside",
|
107 |
+
"script",
|
108 |
+
"style",
|
109 |
+
"noscript",
|
110 |
+
"form",
|
111 |
+
"button",
|
112 |
+
];
|
113 |
+
const InlineTags = [
|
114 |
+
"a",
|
115 |
+
"abbrv",
|
116 |
+
"span",
|
117 |
+
"address",
|
118 |
+
"time",
|
119 |
+
"acronym",
|
120 |
+
"strong",
|
121 |
+
"b",
|
122 |
+
"br",
|
123 |
+
"sub",
|
124 |
+
"sup",
|
125 |
+
"tt",
|
126 |
+
"var",
|
127 |
+
"em",
|
128 |
+
"i",
|
129 |
+
];
|
130 |
+
|
131 |
+
type ReadableNode = HTMLElement;
|
132 |
+
type NodeWithRect = {
|
133 |
+
node: ReadableNode;
|
134 |
+
rect: DOMRect;
|
135 |
+
};
|
136 |
+
|
137 |
+
const isOnlyChild = (node: Node) => {
|
138 |
+
if (!node.parentElement) return true;
|
139 |
+
if (node.parentElement.nodeName === "body") return false;
|
140 |
+
if (node.parentElement.childNodes.length === 1) return true;
|
141 |
+
return false;
|
142 |
+
};
|
143 |
+
|
144 |
+
const hasValidInlineParent = (node: Node) => {
|
145 |
+
return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
|
146 |
+
};
|
147 |
+
|
148 |
+
const hasValidParent = (node: Node) => {
|
149 |
+
return node.parentElement && !node.parentElement.isSameNode(document.body);
|
150 |
+
};
|
151 |
+
|
152 |
+
const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
|
153 |
+
const possibleTableParents = Array.from(document.querySelectorAll("table"));
|
154 |
+
const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
|
155 |
+
/**
|
156 |
+
* We want to find the highest parent of text node in the cluster.
|
157 |
+
* For example in this case: <p><span>Text here</span></p>
|
158 |
+
* the P tag is highest parent.
|
159 |
+
*/
|
160 |
+
const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
|
161 |
+
// go up the tree until the parent is no longer an only child
|
162 |
+
let parent = node.parentElement;
|
163 |
+
// if the parent is an inline tag, then go up one more level
|
164 |
+
while (
|
165 |
+
parent &&
|
166 |
+
hasValidInlineParent(parent) &&
|
167 |
+
InlineTags.includes(parent?.tagName.toLowerCase())
|
168 |
+
) {
|
169 |
+
parent = parent.parentElement;
|
170 |
+
}
|
171 |
+
|
172 |
+
while (parent && isOnlyChild(parent)) {
|
173 |
+
if (!hasValidParent(parent)) break;
|
174 |
+
parent = parent.parentElement;
|
175 |
+
}
|
176 |
+
|
177 |
+
if (!parent) {
|
178 |
+
throw new Error(
|
179 |
+
"disconnected node found, this should not really be possible when traversing through the dom"
|
180 |
+
);
|
181 |
+
}
|
182 |
+
|
183 |
+
// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
|
184 |
+
if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
|
185 |
+
const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
186 |
+
if (hasParent) {
|
187 |
+
parent = hasParent;
|
188 |
+
}
|
189 |
+
}
|
190 |
+
|
191 |
+
// if the parent is a li tag check if there is a ul or ol tag above it
|
192 |
+
if (parent.nodeName.toLowerCase() === "li") {
|
193 |
+
const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
194 |
+
if (hasParent) {
|
195 |
+
parent = hasParent;
|
196 |
+
}
|
197 |
+
}
|
198 |
+
|
199 |
+
// if the parent is a td, th, tr tag check if there is a table tag above it
|
200 |
+
if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
|
201 |
+
const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
202 |
+
if (hasParent) {
|
203 |
+
parent = hasParent;
|
204 |
+
}
|
205 |
+
}
|
206 |
+
|
207 |
+
return parent;
|
208 |
+
};
|
209 |
+
const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));
|
210 |
+
|
211 |
+
const doesNodePassHeuristics = (node: Node) => {
|
212 |
+
if ((node.textContent ?? "").trim().length < 10) {
|
213 |
+
return false;
|
214 |
+
}
|
215 |
+
|
216 |
+
const parentNode = findHighestDirectParentOfReadableNode(node);
|
217 |
+
|
218 |
+
if (parentNode && parentNode instanceof Element) {
|
219 |
+
if (
|
220 |
+
!parentNode.checkVisibility({
|
221 |
+
checkOpacity: true,
|
222 |
+
checkVisibilityCSS: true,
|
223 |
+
})
|
224 |
+
)
|
225 |
+
return false;
|
226 |
+
|
227 |
+
const rect = parentNode.getBoundingClientRect();
|
228 |
+
// elements that are readable usually don't have really small height or width
|
229 |
+
if (rect.width < 4 || rect.height < 4) {
|
230 |
+
return false;
|
231 |
+
}
|
232 |
+
}
|
233 |
+
|
234 |
+
if (parentNode && parentNode instanceof Element) {
|
235 |
+
if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
|
236 |
+
return false;
|
237 |
+
}
|
238 |
+
}
|
239 |
+
|
240 |
+
return true;
|
241 |
+
};
|
242 |
+
|
243 |
+
const getAllReadableNodes = (): NodeWithRect[] => {
|
244 |
+
if (!document.body) throw new Error("Page failed to load");
|
245 |
+
const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
|
246 |
+
acceptNode(node) {
|
247 |
+
if (doesNodePassHeuristics(node)) {
|
248 |
+
return NodeFilter.FILTER_ACCEPT;
|
249 |
+
} else {
|
250 |
+
return NodeFilter.FILTER_SKIP;
|
251 |
+
}
|
252 |
+
},
|
253 |
+
});
|
254 |
+
|
255 |
+
const readableNodes = [];
|
256 |
+
|
257 |
+
while (treeWalker.nextNode()) {
|
258 |
+
readableNodes.push(treeWalker.currentNode as ReadableNode);
|
259 |
+
}
|
260 |
+
|
261 |
+
/*
|
262 |
+
* <table><p>hello</p><p>world</p></table>
|
263 |
+
* table is already included in the parent of the first p tag
|
264 |
+
*/
|
265 |
+
|
266 |
+
const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
|
267 |
+
const listWithOnlyParents: HTMLElement[] = [];
|
268 |
+
// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
|
269 |
+
for (let i = 0; i < parentsForReadableNodes.length; i++) {
|
270 |
+
const node = parentsForReadableNodes[i];
|
271 |
+
const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
|
272 |
+
if (i === idx) return false;
|
273 |
+
return otherNode.contains(node);
|
274 |
+
});
|
275 |
+
listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
|
276 |
+
}
|
277 |
+
|
278 |
+
const uniqueParents = Array.from(new Set(listWithOnlyParents));
|
279 |
+
|
280 |
+
return uniqueParents.map((node) => {
|
281 |
+
return {
|
282 |
+
node,
|
283 |
+
rect: node.getBoundingClientRect(),
|
284 |
+
};
|
285 |
+
});
|
286 |
+
};
|
287 |
+
|
288 |
+
const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
|
289 |
+
// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
|
290 |
+
// 1. rects are upright and not rotated
|
291 |
+
// 2. If two rects intersect, we assume distance to be 0
|
292 |
+
let dx = 0;
|
293 |
+
let dy = 0;
|
294 |
+
const rect1 = a.rect;
|
295 |
+
const rect2 = b.rect;
|
296 |
+
// Calculate the horizontal distance
|
297 |
+
if (rect1.x + rect1.width < rect2.x) {
|
298 |
+
dx = rect2.x - (rect1.x + rect1.width);
|
299 |
+
} else if (rect2.x + rect2.width < rect1.x) {
|
300 |
+
dx = rect1.x - (rect2.x + rect2.width);
|
301 |
+
}
|
302 |
+
|
303 |
+
// Calculate the vertical distance
|
304 |
+
if (rect1.y + rect1.height < rect2.y) {
|
305 |
+
dy = rect2.y - (rect1.y + rect1.height);
|
306 |
+
} else if (rect2.y + rect2.height < rect1.y) {
|
307 |
+
dy = rect1.y - (rect2.y + rect2.height);
|
308 |
+
}
|
309 |
+
|
310 |
+
const distance = Math.sqrt(dx * dx + dy * dy);
|
311 |
+
// Return the Euclidean distance
|
312 |
+
return distance;
|
313 |
+
};
|
314 |
+
/**
|
315 |
+
* Clusters nodes using dbscan
|
316 |
+
*/
|
317 |
+
const clusterReadableNodes = (nodes: NodeWithRect[]) => {
|
318 |
+
const { clusters } = DBSCAN({
|
319 |
+
dataset: nodes,
|
320 |
+
epsilon: 28,
|
321 |
+
minimumPoints: 1,
|
322 |
+
distanceFunction,
|
323 |
+
});
|
324 |
+
|
325 |
+
return clusters;
|
326 |
+
};
|
327 |
+
|
328 |
+
const totalTextLength = (cluster: number[]) => {
|
329 |
+
return cluster
|
330 |
+
.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}|\r\n|\n|\r/gm, ""))
|
331 |
+
.join("").length;
|
332 |
+
};
|
333 |
+
|
334 |
+
const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
|
335 |
+
return Math.abs(a - b) < epsilon;
|
336 |
+
};
|
337 |
+
|
338 |
+
const getClusterBounds = (cluster: number[]) => {
|
339 |
+
const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
|
340 |
+
const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
|
341 |
+
const rightMostPoint = Math.max(
|
342 |
+
...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
|
343 |
+
);
|
344 |
+
const bottomMostPoint = Math.max(
|
345 |
+
...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
|
346 |
+
);
|
347 |
+
return {
|
348 |
+
// left most element
|
349 |
+
x: leftMostPoint,
|
350 |
+
y: topMostPoint,
|
351 |
+
width: rightMostPoint - leftMostPoint,
|
352 |
+
height: bottomMostPoint - topMostPoint,
|
353 |
+
};
|
354 |
+
};
|
355 |
+
|
356 |
+
const round = (num: number, decimalPlaces = 2) => {
|
357 |
+
const factor = Math.pow(10, decimalPlaces);
|
358 |
+
return Math.round(num * factor) / factor;
|
359 |
+
};
|
360 |
+
|
361 |
+
/** minimum distance to center of the screen */
|
362 |
+
const clusterCentrality = (cluster: number[]) => {
|
363 |
+
const bounds = getClusterBounds(cluster);
|
364 |
+
const centerOfScreen = window.innerWidth / 2;
|
365 |
+
// the cluster contains the center of the screen
|
366 |
+
if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
|
367 |
+
return 0;
|
368 |
+
}
|
369 |
+
|
370 |
+
// the cluster is to the left of the screen
|
371 |
+
if (bounds.x + bounds.width < centerOfScreen) {
|
372 |
+
return centerOfScreen - (bounds.x + bounds.width);
|
373 |
+
}
|
374 |
+
|
375 |
+
// the cluster is to the right of the screen
|
376 |
+
return bounds.x - centerOfScreen;
|
377 |
+
};
|
378 |
+
/** measure of text share that belong to the cluster */
|
379 |
+
const percentageTextShare = (cluster: number[], totalLength: number) => {
|
380 |
+
// apply an exponentially increasing penalty for centrality per 100 pixels distance from center
|
381 |
+
|
382 |
+
return round((totalTextLength(cluster) / totalLength) * 100);
|
383 |
+
};
|
384 |
+
|
385 |
+
const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
|
386 |
+
const clusterABounds = getClusterBounds(clusterA);
|
387 |
+
const clusterBBounds = getClusterBounds(clusterB);
|
388 |
+
|
389 |
+
// A cluster is horizontally aligned if the x and width are roughly equal
|
390 |
+
const isHorizontallyAligned =
|
391 |
+
approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
|
392 |
+
approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);
|
393 |
+
|
394 |
+
if (!isHorizontallyAligned) return false;
|
395 |
+
|
396 |
+
// check the y gap between the clusters
|
397 |
+
const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
|
398 |
+
const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
|
399 |
+
const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);
|
400 |
+
|
401 |
+
if (approximatelyEqual(yGap, 0, 100)) return true;
|
402 |
+
};
|
403 |
+
|
404 |
+
const findCriticalClusters = (clusters: number[][]) => {
|
405 |
+
// merge the clusters that have similar widths and x position
|
406 |
+
|
407 |
+
let i = 0;
|
408 |
+
while (i < clusters.length) {
|
409 |
+
const cluster = clusters[i];
|
410 |
+
for (let j = i + 1; j < clusters.length; j++) {
|
411 |
+
const otherCluster = clusters[j];
|
412 |
+
if (shouldMergeClusters(cluster, otherCluster)) {
|
413 |
+
cluster.push(...otherCluster);
|
414 |
+
clusters.splice(j, 1);
|
415 |
+
j -= 1;
|
416 |
+
}
|
417 |
+
}
|
418 |
+
|
419 |
+
i++;
|
420 |
+
}
|
421 |
+
|
422 |
+
const totalText = totalTextLength(clusters.flat());
|
423 |
+
|
424 |
+
// sort in descending order of text share
|
425 |
+
const clusterWithMetrics = clusters.map((cluster) => {
|
426 |
+
const centrality = clusterCentrality(cluster);
|
427 |
+
return {
|
428 |
+
cluster,
|
429 |
+
centrality,
|
430 |
+
percentageTextShare: percentageTextShare(cluster, totalText),
|
431 |
+
};
|
432 |
+
});
|
433 |
+
|
434 |
+
// if there is a dominant cluster with more than 60% text share, return that
|
435 |
+
const dominantCluster = clusterWithMetrics[0].percentageTextShare > 60;
|
436 |
+
if (dominantCluster) return [clusterWithMetrics[0].cluster];
|
437 |
+
|
438 |
+
// clusters are sorted by text share after applying a penalty for centrality
|
439 |
+
const sortedClusters = clusterWithMetrics.sort((a, b) => {
|
440 |
+
const penaltyForA = Math.pow(0.9, a.centrality / 100);
|
441 |
+
const penaltyForB = Math.pow(0.9, b.centrality / 100);
|
442 |
+
const adjustedTextShareA = a.percentageTextShare * penaltyForA;
|
443 |
+
const adjustedTextShareB = b.percentageTextShare * penaltyForB;
|
444 |
+
|
445 |
+
return adjustedTextShareB - adjustedTextShareA;
|
446 |
+
});
|
447 |
+
|
448 |
+
// find all clusters that are similar to the largest cluster in terms of text share
|
449 |
+
// and see if they are enough to cover at least 60% of the text share
|
450 |
+
const largeTextShareClusters = sortedClusters.filter((c) =>
|
451 |
+
approximatelyEqual(c.percentageTextShare, sortedClusters[0].percentageTextShare, 10)
|
452 |
+
);
|
453 |
+
|
454 |
+
const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
|
455 |
+
(acc, cluster) => acc + cluster.percentageTextShare,
|
456 |
+
0
|
457 |
+
);
|
458 |
+
|
459 |
+
if (totalTextShareOfLargeClusters > 60) {
|
460 |
+
return largeTextShareClusters.map((c) => c.cluster);
|
461 |
+
}
|
462 |
+
|
463 |
+
// choose clusters till the text share is greater than 60%
|
464 |
+
let totalTextShare = 0;
|
465 |
+
const criticalClusters = [];
|
466 |
+
for (const cluster of sortedClusters) {
|
467 |
+
/** Ignore clusters with less than 2%*/
|
468 |
+
if (cluster.percentageTextShare < 2) continue;
|
469 |
+
if (totalTextShare > 60) break;
|
470 |
+
criticalClusters.push(cluster.cluster);
|
471 |
+
totalTextShare += cluster.percentageTextShare;
|
472 |
+
}
|
473 |
+
|
474 |
+
// if the total text share is less than 60% then return an empty array
|
475 |
+
// as this website should not be particularly useful for the web search anyways
|
476 |
+
// this should almost never happen on structured website with a lot of text
|
477 |
+
if (totalTextShare < 60) {
|
478 |
+
return [];
|
479 |
+
}
|
480 |
+
|
481 |
+
return criticalClusters;
|
482 |
+
};
|
483 |
+
|
484 |
+
const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
|
485 |
+
function serializeHTMLElement(node: Element): SerializedHTMLElement {
|
486 |
+
return {
|
487 |
+
tagName: node.tagName.toLowerCase(),
|
488 |
+
attributes: allowListedAttributes.reduce((acc, attr) => {
|
489 |
+
const value = node.getAttribute(attr);
|
490 |
+
if (value) {
|
491 |
+
acc[attr] = value;
|
492 |
+
}
|
493 |
+
return acc;
|
494 |
+
}, {} as Record<string, string>),
|
495 |
+
content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
|
496 |
+
};
|
497 |
+
}
|
498 |
+
|
499 |
+
function serializeNode(node: Node): SerializedHTMLElement | string {
|
500 |
+
if (node.nodeType === 1) return serializeHTMLElement(node as Element);
|
501 |
+
else if (node.nodeType === 3) return node.textContent ?? "";
|
502 |
+
else return "";
|
503 |
+
}
|
504 |
+
|
505 |
+
function getPageMetadata(): {
|
506 |
+
title: string;
|
507 |
+
siteName?: string;
|
508 |
+
author?: string;
|
509 |
+
description?: string;
|
510 |
+
createdAt?: string;
|
511 |
+
updatedAt?: string;
|
512 |
+
} {
|
513 |
+
const title = document.title ?? "";
|
514 |
+
const siteName =
|
515 |
+
document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
|
516 |
+
const author =
|
517 |
+
document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
|
518 |
+
const description =
|
519 |
+
document.querySelector("meta[name='description']")?.getAttribute("content") ??
|
520 |
+
document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
|
521 |
+
undefined;
|
522 |
+
const createdAt =
|
523 |
+
document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
|
524 |
+
document.querySelector("meta[name='date']")?.getAttribute("content") ??
|
525 |
+
undefined;
|
526 |
+
const updatedAt =
|
527 |
+
document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
|
528 |
+
undefined;
|
529 |
+
|
530 |
+
return { title, siteName, author, description, createdAt, updatedAt };
|
531 |
+
}
|
532 |
+
|
533 |
+
const readableNodes = getAllReadableNodes();
|
534 |
+
const clusters = clusterReadableNodes(readableNodes);
|
535 |
+
|
536 |
+
const criticalClusters = findCriticalClusters(clusters);
|
537 |
+
|
538 |
+
// filter readable nodes using the above information as well as heuristics
|
539 |
+
const filteredNodes = readableNodes.filter((_, idx) => {
|
540 |
+
return criticalClusters.some((cluster) => {
|
541 |
+
return cluster.includes(idx);
|
542 |
+
});
|
543 |
+
});
|
544 |
+
|
545 |
+
const elements = filteredNodes
|
546 |
+
.filter(
|
547 |
+
(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
|
548 |
+
)
|
549 |
+
.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
|
550 |
+
const metadata = getPageMetadata();
|
551 |
+
return { ...metadata, elements };
|
552 |
+
}
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import {
|
2 |
+
type BrowserContext,
|
3 |
+
chromium,
|
4 |
+
devices,
|
5 |
+
type Page,
|
6 |
+
type BrowserContextOptions,
|
7 |
+
} from "playwright";
|
8 |
+
import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
|
9 |
+
import { env } from "$env/dynamic/private";
|
10 |
+
|
11 |
+
// Singleton initialized by initPlaywrightService
|
12 |
+
let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }>;
|
13 |
+
|
14 |
+
async function initPlaywrightService() {
|
15 |
+
if (playwrightService) return playwrightService;
|
16 |
+
|
17 |
+
const browser = await chromium.launch({ headless: true });
|
18 |
+
|
19 |
+
process.on("SIGINT", () => browser.close());
|
20 |
+
|
21 |
+
const device = devices["Desktop Chrome"];
|
22 |
+
const options: BrowserContextOptions = {
|
23 |
+
...device,
|
24 |
+
// Increasing width improves spatial clustering accuracy
|
25 |
+
screen: {
|
26 |
+
width: 3840,
|
27 |
+
height: 1080,
|
28 |
+
},
|
29 |
+
viewport: {
|
30 |
+
width: 3840,
|
31 |
+
height: 1080,
|
32 |
+
},
|
33 |
+
reducedMotion: "reduce",
|
34 |
+
acceptDownloads: false,
|
35 |
+
timezoneId: "America/New_York",
|
36 |
+
locale: "en-US",
|
37 |
+
};
|
38 |
+
const ctx = await browser.newContext(options);
|
39 |
+
const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
|
40 |
+
const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
|
41 |
+
if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
|
42 |
+
return mostBlocked;
|
43 |
+
});
|
44 |
+
return Object.freeze({ ctx, blocker });
|
45 |
+
}
|
46 |
+
|
47 |
+
export async function loadPage(url: string): Promise<Page> {
|
48 |
+
if (!playwrightService) playwrightService = initPlaywrightService();
|
49 |
+
const { ctx, blocker } = await playwrightService;
|
50 |
+
|
51 |
+
const page = await ctx.newPage();
|
52 |
+
await blocker.enableBlockingInPage(page);
|
53 |
+
|
54 |
+
await page.goto(url, { waitUntil: "load", timeout: 2000 }).catch(() => {
|
55 |
+
console.warn(`Failed to load page within 2s: ${url}`);
|
56 |
+
});
|
57 |
+
|
58 |
+
return page;
|
59 |
+
}
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { AppendUpdate } from "../runWebSearch";
|
2 |
+
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
|
3 |
+
import { loadPage } from "./playwright";
|
4 |
+
|
5 |
+
import { spatialParser } from "./parser";
|
6 |
+
import { htmlToMarkdownTree } from "../markdown/tree";
|
7 |
+
import { timeout } from "$lib/utils/timeout";
|
8 |
+
|
9 |
+
export const scrape =
|
10 |
+
(appendUpdate: AppendUpdate, maxCharsPerElem: number) =>
|
11 |
+
async (source: WebSearchSource): Promise<WebSearchScrapedSource | undefined> => {
|
12 |
+
try {
|
13 |
+
const page = await scrapeUrl(source.link, maxCharsPerElem);
|
14 |
+
appendUpdate("Browsing webpage", [source.link]);
|
15 |
+
return { ...source, page };
|
16 |
+
} catch (e) {
|
17 |
+
const message = e instanceof Error ? e.message : String(e);
|
18 |
+
appendUpdate("Failed to parse webpage", [message, source.link], "error");
|
19 |
+
}
|
20 |
+
};
|
21 |
+
|
22 |
+
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
23 |
+
const page = await loadPage(url);
|
24 |
+
|
25 |
+
return timeout(page.evaluate(spatialParser), 2000)
|
26 |
+
.then(({ elements, ...parsed }) => ({
|
27 |
+
...parsed,
|
28 |
+
markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
|
29 |
+
}))
|
30 |
+
.catch((cause) => {
|
31 |
+
throw Error("Parsing failed", { cause });
|
32 |
+
})
|
33 |
+
.finally(() => page.close());
|
34 |
+
}
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * JSON-safe snapshot of a DOM element, produced inside the scraped page and
 * transferred back over the Playwright bridge (live DOM nodes themselves are
 * not serializable).
 */
export interface SerializedHTMLElement {
	// Lowercased tag name, e.g. "div"
	tagName: string;
	// Only allow-listed attributes (href, src, alt, title, class, id) are kept
	attributes: Record<string, string>;
	// Child nodes in document order: nested elements or raw text content
	content: (SerializedHTMLElement | string)[];
}
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { WebSearchProvider, type WebSearchSource } from "$lib/types/WebSearch";
|
2 |
+
import { env } from "$env/dynamic/private";
|
3 |
+
import searchSerper from "./endpoints/serper";
|
4 |
+
import searchSerpApi from "./endpoints/serpApi";
|
5 |
+
import searchSerpStack from "./endpoints/serpStack";
|
6 |
+
import searchYouApi from "./endpoints/youApi";
|
7 |
+
import searchWebLocal from "./endpoints/webLocal";
|
8 |
+
import searchSearxng from "./endpoints/searxng";
|
9 |
+
|
10 |
+
export function getWebSearchProvider() {
|
11 |
+
if (env.YDC_API_KEY) return WebSearchProvider.YOU;
|
12 |
+
if (env.SEARXNG_QUERY_URL) return WebSearchProvider.SEARXNG;
|
13 |
+
return WebSearchProvider.GOOGLE;
|
14 |
+
}
|
15 |
+
|
16 |
+
/** Searches the web using the first available provider, based on the env */
|
17 |
+
export async function searchWeb(query: string): Promise<WebSearchSource[]> {
|
18 |
+
if (env.USE_LOCAL_WEBSEARCH) return searchWebLocal(query);
|
19 |
+
if (env.SEARXNG_QUERY_URL) return searchSearxng(query);
|
20 |
+
if (env.SERPER_API_KEY) return searchSerper(query);
|
21 |
+
if (env.YDC_API_KEY) return searchYouApi(query);
|
22 |
+
if (env.SERPAPI_KEY) return searchSerpApi(query);
|
23 |
+
if (env.SERPSTACK_API_KEY) return searchSerpStack(query);
|
24 |
+
throw new Error(
|
25 |
+
"No configuration found for web search. Please set USE_LOCAL_WEBSEARCH, SEARXNG_QUERY_URL, SERPER_API_KEY, YDC_API_KEY, or SERPSTACK_API_KEY in your environment variables."
|
26 |
+
);
|
27 |
+
}
|
@@ -1,7 +1,9 @@
|
|
1 |
import { env } from "$env/dynamic/private";
|
2 |
import { logger } from "$lib/server/logger";
|
|
|
|
|
3 |
|
4 |
-
export async function searchSearxng(query: string) {
|
5 |
const abortController = new AbortController();
|
6 |
setTimeout(() => abortController.abort(), 10000);
|
7 |
|
@@ -20,7 +22,7 @@ export async function searchSearxng(query: string) {
|
|
20 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
21 |
.catch((error) => {
|
22 |
logger.error("Failed to fetch or parse JSON", error);
|
23 |
-
throw new Error("Failed to fetch or parse JSON");
|
24 |
});
|
25 |
|
26 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
@@ -31,5 +33,5 @@ export async function searchSearxng(query: string) {
|
|
31 |
}
|
32 |
|
33 |
// Map URLs to the correct object shape
|
34 |
-
return
|
35 |
}
|
|
|
1 |
import { env } from "$env/dynamic/private";
|
2 |
import { logger } from "$lib/server/logger";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
import { isURL } from "$lib/utils/isUrl";
|
5 |
|
6 |
+
export default async function searchSearxng(query: string): Promise<WebSearchSource[]> {
|
7 |
const abortController = new AbortController();
|
8 |
setTimeout(() => abortController.abort(), 10000);
|
9 |
|
|
|
22 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
23 |
.catch((error) => {
|
24 |
logger.error("Failed to fetch or parse JSON", error);
|
25 |
+
throw new Error("Failed to fetch or parse JSON", { cause: error });
|
26 |
});
|
27 |
|
28 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
|
|
33 |
}
|
34 |
|
35 |
// Map URLs to the correct object shape
|
36 |
+
return urls.filter(isURL).map((link) => ({ link }));
|
37 |
}
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { getJson, type GoogleParameters } from "serpapi";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
import { isURL } from "$lib/utils/isUrl";
|
5 |
+
|
6 |
+
type SerpApiResponse = {
|
7 |
+
organic_results: {
|
8 |
+
link: string;
|
9 |
+
}[];
|
10 |
+
};
|
11 |
+
|
12 |
+
export default async function searchWebSerpApi(query: string): Promise<WebSearchSource[]> {
|
13 |
+
const params = {
|
14 |
+
q: query,
|
15 |
+
hl: "en",
|
16 |
+
gl: "us",
|
17 |
+
google_domain: "google.com",
|
18 |
+
api_key: env.SERPAPI_KEY,
|
19 |
+
} satisfies GoogleParameters;
|
20 |
+
|
21 |
+
// Show result as JSON
|
22 |
+
const response = (await getJson("google", params)) as unknown as SerpApiResponse;
|
23 |
+
|
24 |
+
return response.organic_results.filter(({ link }) => isURL(link));
|
25 |
+
}
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
|
5 |
+
type SerpStackResponse = {
|
6 |
+
organic_results: {
|
7 |
+
title: string;
|
8 |
+
url: string;
|
9 |
+
snippet?: string;
|
10 |
+
}[];
|
11 |
+
error?: string;
|
12 |
+
};
|
13 |
+
|
14 |
+
export default async function searchSerpStack(query: string): Promise<WebSearchSource[]> {
|
15 |
+
const response = await fetch(
|
16 |
+
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
17 |
+
{ headers: { "Content-type": "application/json; charset=UTF-8" } }
|
18 |
+
);
|
19 |
+
|
20 |
+
const data = (await response.json()) as SerpStackResponse;
|
21 |
+
|
22 |
+
if (!response.ok) {
|
23 |
+
throw new Error(
|
24 |
+
data.error ?? `SerpStack API returned error code ${response.status} - ${response.statusText}`
|
25 |
+
);
|
26 |
+
}
|
27 |
+
|
28 |
+
return data.organic_results
|
29 |
+
.filter(({ url }) => isURL(url))
|
30 |
+
.map(({ title, url, snippet }) => ({
|
31 |
+
title,
|
32 |
+
link: url,
|
33 |
+
text: snippet ?? "",
|
34 |
+
}));
|
35 |
+
}
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
3 |
+
|
4 |
+
export default async function search(query: string): Promise<WebSearchSource[]> {
|
5 |
+
const params = {
|
6 |
+
q: query,
|
7 |
+
hl: "en",
|
8 |
+
gl: "us",
|
9 |
+
};
|
10 |
+
|
11 |
+
const response = await fetch("https://google.serper.dev/search", {
|
12 |
+
method: "POST",
|
13 |
+
body: JSON.stringify(params),
|
14 |
+
headers: {
|
15 |
+
"x-api-key": env.SERPER_API_KEY,
|
16 |
+
"Content-type": "application/json",
|
17 |
+
},
|
18 |
+
});
|
19 |
+
|
20 |
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
21 |
+
const data = (await response.json()) as Record<string, any>;
|
22 |
+
|
23 |
+
if (!response.ok) {
|
24 |
+
throw new Error(
|
25 |
+
data["message"] ??
|
26 |
+
`Serper API returned error code ${response.status} - ${response.statusText}`
|
27 |
+
);
|
28 |
+
}
|
29 |
+
|
30 |
+
return data["organic"] ?? [];
|
31 |
+
}
|
@@ -1,45 +1,35 @@
|
|
1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
|
|
|
|
2 |
|
3 |
-
export async function searchWebLocal(query: string) {
|
4 |
const abortController = new AbortController();
|
5 |
setTimeout(() => abortController.abort(), 10000);
|
6 |
|
7 |
-
const htmlString = await fetch(
|
8 |
-
|
9 |
-
|
|
|
10 |
.then((response) => response.text())
|
11 |
.catch();
|
12 |
|
13 |
const virtualConsole = new VirtualConsole();
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
// No-op to skip console errors.
|
17 |
-
});
|
18 |
-
|
19 |
-
// put the html string into a DOM
|
20 |
-
const dom = new JSDOM(htmlString ?? "", {
|
21 |
-
virtualConsole,
|
22 |
-
});
|
23 |
-
|
24 |
-
const { document } = dom.window;
|
25 |
-
// get all a documents with href tag
|
26 |
-
|
27 |
const links = document.querySelectorAll("a");
|
28 |
-
|
29 |
-
if (!links.length) {
|
30 |
-
throw new Error(`webpage doesn't have any "a" element`);
|
31 |
-
}
|
32 |
|
33 |
// take url that start wirth /url?q=
|
34 |
// and do not contain google.com links
|
35 |
// and strip them up to '&sa='
|
36 |
const linksHref = Array.from(links)
|
37 |
-
.
|
38 |
-
.
|
39 |
-
|
40 |
-
|
41 |
-
});
|
42 |
|
43 |
// remove duplicate links and map links to the correct object shape
|
44 |
-
return
|
45 |
}
|
|
|
1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
|
5 |
+
export default async function searchWebLocal(query: string): Promise<WebSearchSource[]> {
|
6 |
const abortController = new AbortController();
|
7 |
setTimeout(() => abortController.abort(), 10000);
|
8 |
|
9 |
+
const htmlString = await fetch(
|
10 |
+
"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
|
11 |
+
{ signal: abortController.signal }
|
12 |
+
)
|
13 |
.then((response) => response.text())
|
14 |
.catch();
|
15 |
|
16 |
const virtualConsole = new VirtualConsole();
|
17 |
+
virtualConsole.on("error", () => {}); // No-op to skip console errors.
|
18 |
+
const document = new JSDOM(htmlString ?? "", { virtualConsole }).window.document;
|
19 |
|
20 |
+
// get all links
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
const links = document.querySelectorAll("a");
|
22 |
+
if (!links.length) throw new Error(`webpage doesn't have any "a" element`);
|
|
|
|
|
|
|
23 |
|
24 |
// take url that start wirth /url?q=
|
25 |
// and do not contain google.com links
|
26 |
// and strip them up to '&sa='
|
27 |
const linksHref = Array.from(links)
|
28 |
+
.map((el) => el.href)
|
29 |
+
.filter((link) => link.startsWith("/url?q=") && !link.includes("google.com/"))
|
30 |
+
.map((link) => link.slice("/url?q=".length, link.indexOf("&sa=")))
|
31 |
+
.filter(isURL);
|
|
|
32 |
|
33 |
// remove duplicate links and map links to the correct object shape
|
34 |
+
return [...new Set(linksHref)].map((link) => ({ link }));
|
35 |
}
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { env } from "$env/dynamic/private";
|
2 |
+
import { isURL } from "$lib/utils/isUrl";
|
3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
4 |
+
|
5 |
+
interface YouWebSearch {
|
6 |
+
hits: YouSearchHit[];
|
7 |
+
latency: number;
|
8 |
+
}
|
9 |
+
|
10 |
+
interface YouSearchHit {
|
11 |
+
url: string;
|
12 |
+
title: string;
|
13 |
+
description: string;
|
14 |
+
snippets: string[];
|
15 |
+
}
|
16 |
+
|
17 |
+
export default async function searchWebYouApi(query: string): Promise<WebSearchSource[]> {
|
18 |
+
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
19 |
+
method: "GET",
|
20 |
+
headers: {
|
21 |
+
"X-API-Key": env.YDC_API_KEY,
|
22 |
+
"Content-type": "application/json; charset=UTF-8",
|
23 |
+
},
|
24 |
+
});
|
25 |
+
|
26 |
+
if (!response.ok) {
|
27 |
+
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
28 |
+
}
|
29 |
+
|
30 |
+
const data = (await response.json()) as YouWebSearch;
|
31 |
+
const formattedResultsWithSnippets = data.hits
|
32 |
+
.filter(({ url }) => isURL(url))
|
33 |
+
.map(({ title, url, snippets }) => ({
|
34 |
+
title,
|
35 |
+
link: url,
|
36 |
+
text: snippets?.join("\n") || "",
|
37 |
+
}))
|
38 |
+
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
39 |
+
|
40 |
+
return formattedResultsWithSnippets;
|
41 |
+
}
|
@@ -1,6 +1,6 @@
|
|
1 |
import type { Message } from "$lib/types/Message";
|
2 |
import { format } from "date-fns";
|
3 |
-
import { generateFromDefaultEndpoint } from "
|
4 |
|
5 |
export async function generateQuery(messages: Message[]) {
|
6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
|
|
1 |
import type { Message } from "$lib/types/Message";
|
2 |
import { format } from "date-fns";
|
3 |
+
import { generateFromDefaultEndpoint } from "../../generateFromDefaultEndpoint";
|
4 |
|
5 |
export async function generateQuery(messages: Message[]) {
|
6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
2 |
+
import type { Message } from "$lib/types/Message";
|
3 |
+
import type { Assistant } from "$lib/types/Assistant";
|
4 |
+
import type { AppendUpdate } from "../runWebSearch";
|
5 |
+
import { getWebSearchProvider, searchWeb } from "./endpoints";
|
6 |
+
import { generateQuery } from "./generateQuery";
|
7 |
+
import { isURLStringLocal } from "$lib/server/isURLLocal";
|
8 |
+
import { isURL } from "$lib/utils/isUrl";
|
9 |
+
|
10 |
+
import z from "zod";
|
11 |
+
import JSON5 from "json5";
|
12 |
+
import { env } from "$env/dynamic/private";
|
13 |
+
|
14 |
+
const listSchema = z.array(z.string()).default([]);
|
15 |
+
const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST));
|
16 |
+
const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST));
|
17 |
+
|
18 |
+
export async function search(
|
19 |
+
messages: Message[],
|
20 |
+
ragSettings: Assistant["rag"] | undefined,
|
21 |
+
appendUpdate: AppendUpdate
|
22 |
+
): Promise<{ searchQuery: string; pages: WebSearchSource[] }> {
|
23 |
+
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
24 |
+
appendUpdate("Using links specified in Assistant");
|
25 |
+
return {
|
26 |
+
searchQuery: "",
|
27 |
+
pages: await directLinksToSource(ragSettings.allowedLinks).then(filterByBlockList),
|
28 |
+
};
|
29 |
+
}
|
30 |
+
|
31 |
+
const searchQuery = await generateQuery(messages);
|
32 |
+
appendUpdate(`Searching ${getWebSearchProvider()}`, [searchQuery]);
|
33 |
+
|
34 |
+
// handle the global and (optional) rag lists
|
35 |
+
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
36 |
+
appendUpdate("Filtering on specified domains");
|
37 |
+
}
|
38 |
+
const filters = buildQueryFromSiteFilters(
|
39 |
+
[...(ragSettings?.allowedDomains ?? []), ...allowList],
|
40 |
+
blockList
|
41 |
+
);
|
42 |
+
|
43 |
+
const searchQueryWithFilters = `${filters} ${searchQuery}`;
|
44 |
+
const searchResults = await searchWeb(searchQueryWithFilters).then(filterByBlockList);
|
45 |
+
|
46 |
+
return {
|
47 |
+
searchQuery: searchQueryWithFilters,
|
48 |
+
pages: searchResults,
|
49 |
+
};
|
50 |
+
}
|
51 |
+
|
52 |
+
// ----------
|
53 |
+
// Utils
|
54 |
+
function filterByBlockList(results: WebSearchSource[]): WebSearchSource[] {
|
55 |
+
return results.filter((result) => !blockList.some((blocked) => result.link.includes(blocked)));
|
56 |
+
}
|
57 |
+
|
58 |
+
function buildQueryFromSiteFilters(allow: string[], block: string[]) {
|
59 |
+
return (
|
60 |
+
allow.map((item) => "site:" + item).join(" OR ") +
|
61 |
+
" " +
|
62 |
+
block.map((item) => "-site:" + item).join(" ")
|
63 |
+
);
|
64 |
+
}
|
65 |
+
|
66 |
+
async function directLinksToSource(links: string[]): Promise<WebSearchSource[]> {
|
67 |
+
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
68 |
+
const localLinks = await Promise.all(links.map(isURLStringLocal));
|
69 |
+
links = links.filter((_, index) => !localLinks[index]);
|
70 |
+
}
|
71 |
+
|
72 |
+
return links.filter(isURL).map((link) => ({
|
73 |
+
link,
|
74 |
+
title: "",
|
75 |
+
text: [""],
|
76 |
+
}));
|
77 |
+
}
|
@@ -1,148 +0,0 @@
|
|
1 |
-
import type { YouWebSearch } from "../../types/WebSearch";
|
2 |
-
import { WebSearchProvider } from "../../types/WebSearch";
|
3 |
-
import { env } from "$env/dynamic/private";
|
4 |
-
import { getJson } from "serpapi";
|
5 |
-
import type { GoogleParameters } from "serpapi";
|
6 |
-
import { searchWebLocal } from "./searchWebLocal";
|
7 |
-
import { searchSearxng } from "./searchSearxng";
|
8 |
-
|
9 |
-
// get which SERP api is providing web results
|
10 |
-
export function getWebSearchProvider() {
|
11 |
-
if (env.YDC_API_KEY) {
|
12 |
-
return WebSearchProvider.YOU;
|
13 |
-
} else if (env.SEARXNG_QUERY_URL) {
|
14 |
-
return WebSearchProvider.SEARXNG;
|
15 |
-
} else {
|
16 |
-
return WebSearchProvider.GOOGLE;
|
17 |
-
}
|
18 |
-
}
|
19 |
-
|
20 |
-
// Show result as JSON
|
21 |
-
export async function searchWeb(query: string) {
|
22 |
-
if (env.USE_LOCAL_WEBSEARCH) {
|
23 |
-
return await searchWebLocal(query);
|
24 |
-
}
|
25 |
-
if (env.SEARXNG_QUERY_URL) {
|
26 |
-
return await searchSearxng(query);
|
27 |
-
}
|
28 |
-
if (env.SERPER_API_KEY) {
|
29 |
-
return await searchWebSerper(query);
|
30 |
-
}
|
31 |
-
if (env.YDC_API_KEY) {
|
32 |
-
return await searchWebYouApi(query);
|
33 |
-
}
|
34 |
-
if (env.SERPAPI_KEY) {
|
35 |
-
return await searchWebSerpApi(query);
|
36 |
-
}
|
37 |
-
if (env.SERPSTACK_API_KEY) {
|
38 |
-
return await searchSerpStack(query);
|
39 |
-
}
|
40 |
-
throw new Error("No You.com or Serper.dev or SerpAPI key found");
|
41 |
-
}
|
42 |
-
|
43 |
-
export async function searchWebSerper(query: string) {
|
44 |
-
const params = {
|
45 |
-
q: query,
|
46 |
-
hl: "en",
|
47 |
-
gl: "us",
|
48 |
-
};
|
49 |
-
|
50 |
-
const response = await fetch("https://google.serper.dev/search", {
|
51 |
-
method: "POST",
|
52 |
-
body: JSON.stringify(params),
|
53 |
-
headers: {
|
54 |
-
"x-api-key": env.SERPER_API_KEY,
|
55 |
-
"Content-type": "application/json; charset=UTF-8",
|
56 |
-
},
|
57 |
-
});
|
58 |
-
|
59 |
-
/* eslint-disable @typescript-eslint/no-explicit-any */
|
60 |
-
const data = (await response.json()) as Record<string, any>;
|
61 |
-
|
62 |
-
if (!response.ok) {
|
63 |
-
throw new Error(
|
64 |
-
data["message"] ??
|
65 |
-
`Serper API returned error code ${response.status} - ${response.statusText}`
|
66 |
-
);
|
67 |
-
}
|
68 |
-
|
69 |
-
return {
|
70 |
-
organic_results: data["organic"] ?? [],
|
71 |
-
};
|
72 |
-
}
|
73 |
-
|
74 |
-
export async function searchWebSerpApi(query: string) {
|
75 |
-
const params = {
|
76 |
-
q: query,
|
77 |
-
hl: "en",
|
78 |
-
gl: "us",
|
79 |
-
google_domain: "google.com",
|
80 |
-
api_key: env.SERPAPI_KEY,
|
81 |
-
} satisfies GoogleParameters;
|
82 |
-
|
83 |
-
// Show result as JSON
|
84 |
-
const response = await getJson("google", params);
|
85 |
-
|
86 |
-
return response;
|
87 |
-
}
|
88 |
-
|
89 |
-
export async function searchWebYouApi(query: string) {
|
90 |
-
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
91 |
-
method: "GET",
|
92 |
-
headers: {
|
93 |
-
"X-API-Key": env.YDC_API_KEY,
|
94 |
-
"Content-type": "application/json; charset=UTF-8",
|
95 |
-
},
|
96 |
-
});
|
97 |
-
|
98 |
-
if (!response.ok) {
|
99 |
-
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
100 |
-
}
|
101 |
-
|
102 |
-
const data = (await response.json()) as YouWebSearch;
|
103 |
-
const formattedResultsWithSnippets = data.hits
|
104 |
-
.map(({ title, url, snippets }) => ({
|
105 |
-
title,
|
106 |
-
link: url,
|
107 |
-
text: snippets?.join("\n") || "",
|
108 |
-
hostname: new URL(url).hostname,
|
109 |
-
}))
|
110 |
-
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
111 |
-
|
112 |
-
return {
|
113 |
-
organic_results: formattedResultsWithSnippets,
|
114 |
-
};
|
115 |
-
}
|
116 |
-
|
117 |
-
export async function searchSerpStack(query: string) {
|
118 |
-
const response = await fetch(
|
119 |
-
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
120 |
-
{
|
121 |
-
method: "GET",
|
122 |
-
headers: {
|
123 |
-
"Content-type": "application/json; charset=UTF-8",
|
124 |
-
},
|
125 |
-
}
|
126 |
-
);
|
127 |
-
|
128 |
-
const data = (await response.json()) as Record<string, any>;
|
129 |
-
|
130 |
-
if (!response.ok) {
|
131 |
-
throw new Error(
|
132 |
-
data["error"] ??
|
133 |
-
`SerpStack API returned error code ${response.status} - ${response.statusText}`
|
134 |
-
);
|
135 |
-
}
|
136 |
-
|
137 |
-
const resultsWithSnippets = data["organic_results"].map(
|
138 |
-
({ title, url, snippet }: { title: string; url: string; snippet: string | undefined }) => ({
|
139 |
-
title,
|
140 |
-
link: url,
|
141 |
-
text: snippet || "",
|
142 |
-
})
|
143 |
-
);
|
144 |
-
|
145 |
-
return {
|
146 |
-
organic_results: resultsWithSnippets ?? [],
|
147 |
-
};
|
148 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,6 +1,7 @@
|
|
1 |
import type { ObjectId } from "mongodb";
|
2 |
import type { Conversation } from "./Conversation";
|
3 |
import type { Timestamps } from "./Timestamps";
|
|
|
4 |
|
5 |
export interface WebSearch extends Timestamps {
|
6 |
_id?: ObjectId;
|
@@ -14,14 +15,24 @@ export interface WebSearch extends Timestamps {
|
|
14 |
}
|
15 |
|
16 |
export interface WebSearchSource {
|
17 |
-
title
|
18 |
link: string;
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
}
|
22 |
|
23 |
-
export interface WebSearchUsedSource extends
|
24 |
-
context:
|
25 |
}
|
26 |
|
27 |
export type WebSearchMessageSources = {
|
@@ -29,18 +40,6 @@ export type WebSearchMessageSources = {
|
|
29 |
sources: WebSearchSource[];
|
30 |
};
|
31 |
|
32 |
-
export interface YouWebSearch {
|
33 |
-
hits: YouSearchHit[];
|
34 |
-
latency: number;
|
35 |
-
}
|
36 |
-
|
37 |
-
interface YouSearchHit {
|
38 |
-
url: string;
|
39 |
-
title: string;
|
40 |
-
description: string;
|
41 |
-
snippets: string[];
|
42 |
-
}
|
43 |
-
|
44 |
// eslint-disable-next-line no-shadow
|
45 |
export enum WebSearchProvider {
|
46 |
GOOGLE = "Google",
|
|
|
1 |
import type { ObjectId } from "mongodb";
|
2 |
import type { Conversation } from "./Conversation";
|
3 |
import type { Timestamps } from "./Timestamps";
|
4 |
+
import type { HeaderElement } from "$lib/server/websearch/markdown/types";
|
5 |
|
6 |
export interface WebSearch extends Timestamps {
|
7 |
_id?: ObjectId;
|
|
|
15 |
}
|
16 |
|
17 |
/** A single result returned by a search provider. */
export interface WebSearchSource {
	title?: string;
	link: string;
}
/** A search result whose page has been fetched and parsed. */
export interface WebSearchScrapedSource extends WebSearchSource {
	page: WebSearchPage;
}
/** Parsed content and metadata of a scraped page. */
export interface WebSearchPage {
	title: string;
	siteName?: string;
	author?: string;
	description?: string;
	createdAt?: string;
	// NOTE(review): the page-metadata parser elsewhere in this change collects
	// this value under the name `updatedAt` — confirm the rename to
	// `modifiedAt` is applied consistently where pages are constructed.
	modifiedAt?: string;
	// Page content restructured into a markdown header tree for embedding
	markdownTree: HeaderElement;
}

/** A scraped source whose extracted context was fed to the model. */
export interface WebSearchUsedSource extends WebSearchScrapedSource {
	context: string;
}
|
37 |
|
38 |
export type WebSearchMessageSources = {
|
|
|
40 |
sources: WebSearchSource[];
|
41 |
};
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
// eslint-disable-next-line no-shadow
|
44 |
export enum WebSearchProvider {
|
45 |
GOOGLE = "Google",
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
export function isURL(url: string) {
|
2 |
+
try {
|
3 |
+
new URL(url);
|
4 |
+
return true;
|
5 |
+
} catch (e) {
|
6 |
+
return false;
|
7 |
+
}
|
8 |
+
}
|
@@ -1,6 +1,9 @@
|
|
1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
2 |
let timer: NodeJS.Timeout;
|
3 |
-
return Promise.race([
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
6 |
};
|
|
|
1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
2 |
let timer: NodeJS.Timeout;
|
3 |
+
return Promise.race([
|
4 |
+
prom,
|
5 |
+
new Promise<T>((_, reject) => {
|
6 |
+
timer = setTimeout(() => reject(new Error(`Timeout after ${time / 1000} seconds`)), time);
|
7 |
+
}),
|
8 |
+
]).finally(() => clearTimeout(timer));
|
9 |
};
|