mehmet0001 committed
Commit 3c03a29 · verified · 1 Parent(s): 496f2f7
Files changed (3)
  1. crawl_the_site.py +41 -0
  2. create_database.py +31 -0
  3. main.py +38 -0
crawl_the_site.py ADDED
@@ -0,0 +1,41 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import requests as req
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import sys
+
+ # Split the crawled text into small overlapping chunks suitable for embedding.
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100, length_function=len)
+
+ def get_base(url):
+     """Return the scheme and domain of a URL, e.g. https://example.com."""
+     parsed = urlparse(url)
+     return f"{parsed.scheme}://{parsed.netloc}"
+
+ def crawl(start, limit):
+     """Breadth-first crawl of the site at `start`, fetching at most `limit` pages."""
+     base_domain = get_base(start)
+
+     links = [start]  # queue of URLs still to fetch
+     visited = []     # URLs already fetched
+     txt = ""
+
+     while links and len(visited) < limit:
+         to_visit = links.pop(0)
+         visited.append(to_visit)
+
+         html = req.get(to_visit).text
+         soup = BeautifulSoup(html, "lxml")
+
+         # Queue every same-domain link that has not been visited or queued yet.
+         for anchor in soup.find_all("a", href=True):
+             sublink = urljoin(to_visit, anchor["href"])
+             if sublink not in visited and sublink not in links and get_base(sublink) == base_domain:
+                 links.append(sublink)
+
+         txt += soup.get_text(" ", True) + "\n\n"
+         sys.stdout.write("\r" + f"Crawling the site: {round((len(visited) / limit) * 100)}% done.")
+         sys.stdout.flush()
+
+     # Return the collected text as a list of chunks.
+     return text_splitter.split_text(txt)
create_database.py ADDED
@@ -0,0 +1,31 @@
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+
+ def split_list(list_, chunk_size):
+     """Split a list into consecutive chunks of at most `chunk_size` items."""
+     return [list_[i:i + chunk_size] for i in range(0, len(list_), chunk_size)]
+
+ def create_database(txt):
+     """Embed the text chunks and store them in an in-memory Chroma collection."""
+     class EmbeddingFn:
+         def __init__(self, model_name):
+             self.model = SentenceTransformer(model_name)
+
+         def __call__(self, input):
+             return self.model.encode(input).tolist()
+
+     embedding_fn = EmbeddingFn("sentence-transformers/all-mpnet-base-v2")
+
+     ids = [str(i) for i in range(len(txt))]
+
+     chroma_cli = chromadb.Client()
+     collection = chroma_cli.create_collection("chat-with-docs", embedding_function=embedding_fn)
+
+     # Chroma limits how many records a single add() call can take, so insert in batches.
+     txt = split_list(txt, 5000)
+     ids = split_list(ids, 5000)
+
+     for txt_chunk, ids_chunk in zip(txt, ids):
+         collection.add(documents=txt_chunk, ids=ids_chunk)
+
+     return collection
main.py ADDED
@@ -0,0 +1,38 @@
+ from openai import OpenAI
+ import crawl_the_site
+ import create_database
+
+ api_key = input("enter your OpenRouter api key : ")
+ start = input("enter the documentation link : ")
+ limit = int(input("enter the limit for crawling (how many links before stopping) : "))
+
+ print("")
+ txt = crawl_the_site.crawl(start, limit)
+ collection = create_database.create_database(txt)
+
+ # OpenRouter exposes an OpenAI-compatible endpoint, so the OpenAI client is pointed at it.
+ openai_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
+
+ prompt_template = """Answer the question only according to the information provided below. Answer only the user's question, don't give additional information.
+ ## Information :
+ {}
+
+ ## Question :
+ {}"""
+
+ while True:
+     q = input("prompt : ")
+
+     # Retrieve the five chunks most similar to the question.
+     results = collection.query(query_texts=[q], n_results=5)
+     infos = results["documents"][0]
+
+     info_text = "\n---\n".join(infos)
+
+     prompt = prompt_template.format(info_text, q)
+
+     completion = openai_client.chat.completions.create(
+         model="deepseek/deepseek-r1:free",
+         messages=[{"role": "user", "content": prompt}])
+
+     print(completion.choices[0].message.content)