Spaces:
Sleeping
Sleeping
File size: 1,064 Bytes
1d91f23 3bb97ab 1d91f23 3c03a29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests as req
from bs4 import BeautifulSoup
from urllib.parse import urljoin,urlparse
# Shared splitter for the crawled page text: 1000-character chunks with a
# 300-character overlap so sentences straddling a boundary appear in both chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300,length_function=len)
def get_base(url):
    """Return the scheme-and-host root of *url*, e.g. 'https://example.com'.

    Path, query, and fragment are discarded; the port (if any) is kept
    because it is part of ``netloc``.
    """
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc
def crawl(start, limit):
    """Breadth-first crawl of same-domain pages starting at *start*.

    Fetches up to *limit* pages, follows only links whose scheme+host match
    the start URL's, concatenates each page's visible text, and returns the
    combined text split into overlapping chunks by the module-level
    ``text_splitter``.

    Parameters:
        start: absolute URL to begin crawling from.
        limit: maximum number of pages to fetch.

    Returns:
        list[str]: text chunks produced by ``text_splitter.split_text``.
    """
    base_domain = get_base(start)
    queue = [start]
    # A set gives O(1) membership tests; the original list made each
    # "already seen?" check O(n).
    visited = set()
    page_texts = []
    while queue and len(visited) < limit:
        url = queue.pop(0)
        if url in visited:
            # The same URL may have been queued from several pages before
            # it was ever fetched — skip duplicates here.
            continue
        # BUG FIX: the original appended to `visited` twice per page (once
        # before fetching, once after the link loop), so the limit check
        # stopped at roughly half the requested page count.
        visited.add(url)
        try:
            # Timeout so one unresponsive server cannot hang the crawl;
            # a failed fetch skips the page instead of aborting everything.
            html = req.get(url, timeout=10).text
        except req.RequestException:
            continue
        soup = BeautifulSoup(html, "lxml")
        for anchor in soup.find_all("a", href=True):
            # Resolve relative hrefs against the page they appeared on.
            sublink = urljoin(url, anchor["href"])
            if sublink not in visited and get_base(sublink) == base_domain:
                queue.append(sublink)
        # get_text(" ", True): space-separated, whitespace-stripped text.
        page_texts.append(soup.get_text(" ", True))
    # "".join over per-page pieces avoids the quadratic `txt +=` pattern.
    combined = "".join(text + "\n\n" for text in page_texts)
    return text_splitter.split_text(combined)