app files
- crawl_the_site.py +38 -0
- create_database.py +28 -0
- main.py +41 -0
crawl_the_site.py
ADDED
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests as req
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import sys

# Small, overlapping chunks keep each piece within the embedding model's
# comfort zone while preserving some surrounding context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100, length_function=len)

def get_base(url):
    """Return the scheme and domain of a URL, e.g. https://example.com."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}"

def crawl(start, limit):
    """Breadth-first crawl of up to `limit` pages on the same domain as
    `start`; returns the collected page text split into chunks."""
    base_domain = get_base(start)

    links = [start]    # queue of pages still to fetch
    queued = {start}   # every URL ever queued, to avoid duplicates
    visited = []       # pages actually fetched
    txt = ""

    while links and len(visited) < limit:
        to_visit = links.pop(0)
        visited.append(to_visit)

        html = req.get(to_visit).text
        soup = BeautifulSoup(html, "lxml")

        # Queue every same-domain link that has not been seen before.
        for anchor in soup.find_all("a", href=True):
            sublink = urljoin(to_visit, anchor["href"])
            if sublink not in queued and get_base(sublink) == base_domain:
                links.append(sublink)
                queued.add(sublink)

        txt += soup.get_text(" ", True) + "\n\n"
        sys.stdout.write(f"\rCrawling the site: {round((len(visited) / limit) * 100)}% done.")
        sys.stdout.flush()

    return text_splitter.split_text(txt)
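The crawler can be smoke-tested on its own before wiring it into the rest of the app. A minimal sketch, assuming the dependencies above (langchain, requests, beautifulsoup4, lxml) are installed; the URL and page limit are placeholders:

import crawl_the_site

# Placeholder target and limit; substitute the documentation site to index.
chunks = crawl_the_site.crawl("https://docs.python.org/3/", 5)

print(f"\ncollected {len(chunks)} chunks")
print(chunks[0])  # each chunk is a plain string of roughly 300 characters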
create_database.py
ADDED
from sentence_transformers import SentenceTransformer
import chromadb

def split_list(list_, chunk_size):
    """Split a list into consecutive batches of at most `chunk_size` items."""
    return [list_[i:i + chunk_size] for i in range(0, len(list_), chunk_size)]

def create_database(txt):
    """Embed the text chunks and store them in an in-memory Chroma collection."""

    # Chroma expects an embedding function whose __call__ takes a parameter
    # named `input` and returns one embedding vector per document.
    class EmbeddingFn:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)

        def __call__(self, input):
            return self.model.encode(input).tolist()

    embedding_fn = EmbeddingFn("sentence-transformers/all-mpnet-base-v2")

    ids = [str(i) for i in range(len(txt))]

    chroma_cli = chromadb.Client()
    collection = chroma_cli.create_collection("chat-with-docs", embedding_function=embedding_fn)

    # Chroma caps how many records can be added per call, so add the
    # documents in batches of 5000 at a time.
    for txt_chunk, ids_chunk in zip(split_list(txt, 5000), split_list(ids, 5000)):
        collection.add(documents=txt_chunk, ids=ids_chunk)

    return collection
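The returned collection can also be queried directly, which is handy for checking retrieval quality before involving the LLM. A minimal sketch using stand-in documents rather than real crawl output:

import create_database

# Stand-in chunks; in the app these come from crawl_the_site.crawl.
chunks = ["Chroma stores documents alongside their embeddings.",
          "Collections are queried with query_texts."]

collection = create_database.create_database(chunks)

# Retrieve the 2 chunks whose embeddings are closest to the question.
results = collection.query(query_texts=["How do I query a collection?"], n_results=2)
print(results["documents"][0])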
main.py
ADDED
from openai import OpenAI
import crawl_the_site
import create_database

api_key = input("enter your OpenRouter api key : ")
start = input("enter the documentation link : ")
limit = int(input("enter the limit for crawling (how many links before stopping) : "))

print("")
# Crawl the documentation site, then embed the chunks into Chroma.
txt = crawl_the_site.crawl(start, limit)
collection = create_database.create_database(txt)

# OpenRouter exposes an OpenAI-compatible API, so the OpenAI client works
# once it is pointed at the OpenRouter base URL.
openai_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)

prompt_template = """Answer the question only according to the information provided below. Answer only the user's question, don't give additional information.

## Information:
{}

## Question:
{}"""

while True:
    q = input("prompt : ")

    # Retrieve the 5 chunks most similar to the question.
    results = collection.query(query_texts=[q], n_results=5)
    infos = results["documents"][0]

    # Join the retrieved chunks with separators so the model can tell them apart.
    info_text = "\n---\n".join(infos)

    prompt = prompt_template.format(info_text, q)

    completion = openai_client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[{"role": "user", "content": prompt}])

    print(completion.choices[0].message.content)
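The retrieve-then-ask step inside the loop could also be factored into a helper, which would make it reusable outside the REPL. A sketch of one way to do it, meant to sit in main.py below the prompt_template definition (the helper name is hypothetical):

def answer(q, n_results=5):
    """Retrieve the chunks closest to `q` and ask the model to answer from them."""
    results = collection.query(query_texts=[q], n_results=n_results)
    info_text = "\n---\n".join(results["documents"][0])
    completion = openai_client.chat.completions.create(
        model="deepseek/deepseek-r1:free",
        messages=[{"role": "user", "content": prompt_template.format(info_text, q)}])
    return completion.choices[0].message.content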