File size: 1,064 Bytes
1d91f23
 
 
 
 
3bb97ab
1d91f23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c03a29
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests as req
from bs4 import BeautifulSoup
from urllib.parse import urljoin,urlparse

# Shared chunker: splits crawled page text into ~1000-char pieces with 300-char
# overlap so downstream embedding/retrieval keeps context across boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300,length_function=len)

def get_base(url):
    """Return the scheme+netloc origin of *url*, e.g. 'https://example.com'."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"

def crawl(start, limit):
    """Breadth-first crawl of same-domain pages starting at *start*.

    Visits up to *limit* distinct pages whose scheme+netloc match the start
    URL, concatenates their visible text, and returns the result chunked by
    the module-level ``text_splitter``.

    :param start: fully-qualified URL to begin crawling from.
    :param limit: maximum number of pages to fetch.
    :returns: list of overlapping text chunks (strings).
    """
    base_domain = get_base(start)

    queue = [start]
    visited = set()   # set, not list: O(1) membership + no double-count bug
    pages = []        # collect per-page text; join once (avoids quadratic +=)

    while queue and len(visited) < limit:
        url = queue.pop(0)
        # A URL can be enqueued more than once before its first visit;
        # skip duplicates here rather than fetching the page twice.
        if url in visited:
            continue
        visited.add(url)

        # Best-effort fetch: a dead link or slow host should not abort
        # the whole crawl.
        try:
            response = req.get(url, timeout=10)
            response.raise_for_status()
        except req.RequestException:
            continue

        soup = BeautifulSoup(response.text, "lxml")
        for anchor in soup.find_all("a", href=True):
            sublink = urljoin(url, anchor["href"])
            # Stay on the start URL's domain; re-queued duplicates are
            # filtered at pop time above.
            if sublink not in visited and get_base(sublink) == base_domain:
                queue.append(sublink)

        pages.append(soup.get_text(" ", True))

    return text_splitter.split_text("\n\n".join(pages) + "\n\n" if pages else "")