File size: 971 Bytes
68051dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from typing import List

from .search import get_search_results
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents.base import Document
from pydantic import StrictStr


def crawl(query: str) -> List[StrictStr]:
    """Collect candidate URLs for *query*.

    Fetches the search-result links for the query via ``get_search_results``,
    loads each link with ``RecursiveUrlLoader`` and gathers the ``source``
    metadata entry of every document the loader returns. A link whose load
    raises is reported on stdout and skipped by the crawler; it is still part
    of the result because the search-result links are always included.

    Args:
        query: Search query forwarded to ``get_search_results``.

    Returns:
        A de-duplicated list of URLs: the sources of the loaded documents
        followed by the search-result links, each kept at its first
        occurrence.
    """
    # get links from search results
    links_search_engine: List[StrictStr] = get_search_results(query=query)
    links_crawler: List[StrictStr] = []
    # load the documents
    for link in links_search_engine:
        try:
            # NOTE(review): max_depth=1 presumably restricts the loader to the
            # start page itself, so few (if any) new links are discovered --
            # confirm against the RecursiveUrlLoader documentation.
            html_loader = RecursiveUrlLoader(url=link, max_depth=1, timeout=5)
            docs: List[Document] = html_loader.load()
        except Exception as e:
            # Best-effort crawl: one unreachable page must not abort the rest.
            print(f"Error: {e}")
            continue
        for doc in docs:
            source = doc.metadata.get("source")
            # ``get`` yields None when the key is absent; never let a
            # missing/empty/non-string source leak into the URL list.
            if isinstance(source, str) and source:
                links_crawler.append(source)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set would give an arbitrary order).
    return list(dict.fromkeys(links_crawler + links_search_engine))


if __name__ == "__main__":
    # Manual smoke run: crawl a sample query and print the collected links.
    sample_links = crawl("What is the capital of France")
    print(sample_links)