mehmet0001 committed
Commit 3c03a29 · verified · 1 Parent(s): 496f2f7
Files changed (3)
  1. crawl_the_site.py +41 -0
  2. create_database.py +31 -0
  3. main.py +38 -0
crawl_the_site.py ADDED
@@ -0,0 +1,41 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import requests as req
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import sys
+
+ # Split the crawled text into small overlapping chunks suitable for embedding.
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100, length_function=len)
+
+ def get_base(url):
+     """Return the scheme and domain of a URL, e.g. https://example.com."""
+     parsed = urlparse(url)
+     return f"{parsed.scheme}://{parsed.netloc}"
+
+ def crawl(start, limit):
+     """Breadth-first crawl of the site at `start`, fetching at most `limit` pages."""
+     base_domain = get_base(start)
+
+     links = [start]  # queue of URLs still to fetch
+     visited = []     # URLs already fetched
+     txt = ""
+
+     while links and len(visited) < limit:
+         to_visit = links.pop(0)
+         visited.append(to_visit)
+
+         html = req.get(to_visit).text
+         soup = BeautifulSoup(html, "lxml")
+
+         # Queue every same-domain link that has not been visited or queued yet.
+         for anchor in soup.find_all("a", href=True):
+             sublink = urljoin(to_visit, anchor["href"])
+             if sublink not in visited and sublink not in links and get_base(sublink) == base_domain:
+                 links.append(sublink)
+
+         txt += soup.get_text(" ", True) + "\n\n"
+         sys.stdout.write("\r" + f"Crawling the site: {round((len(visited) / limit) * 100)}% done.")
+         sys.stdout.flush()
+
+     # Return the collected text as a list of chunks.
+     return text_splitter.split_text(txt)
create_database.py ADDED
@@ -0,0 +1,31 @@
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+
+ def split_list(list_, chunk_size):
+     """Split a list into consecutive chunks of at most `chunk_size` items."""
+     return [list_[i:i + chunk_size] for i in range(0, len(list_), chunk_size)]
+
+ def create_database(txt):
+     """Embed the text chunks and store them in an in-memory Chroma collection."""
+     class EmbeddingFn:
+         def __init__(self, model_name):
+             self.model = SentenceTransformer(model_name)
+
+         def __call__(self, input):
+             return self.model.encode(input).tolist()
+
+     embedding_fn = EmbeddingFn("sentence-transformers/all-mpnet-base-v2")
+
+     ids = [str(i) for i in range(len(txt))]
+
+     chroma_cli = chromadb.Client()
+     collection = chroma_cli.create_collection("chat-with-docs", embedding_function=embedding_fn)
+
+     # Chroma limits how many records a single add() call can take, so insert in batches.
+     txt = split_list(txt, 5000)
+     ids = split_list(ids, 5000)
+
+     for txt_chunk, ids_chunk in zip(txt, ids):
+         collection.add(documents=txt_chunk, ids=ids_chunk)
+
+     return collection
main.py ADDED
@@ -0,0 +1,38 @@
+ from openai import OpenAI
+ import crawl_the_site
+ import create_database
+
+ api_key = input("enter your OpenRouter api key : ")
+ start = input("enter the documentation link : ")
+ limit = int(input("enter the limit for crawling (how many links before stopping) : "))
+
+ print("")
+ txt = crawl_the_site.crawl(start, limit)
+ collection = create_database.create_database(txt)
+
+ # OpenRouter exposes an OpenAI-compatible endpoint, so the OpenAI client is pointed at it.
+ openai_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
+
+ prompt_template = """Answer the question only according to the information provided below. Answer only the user's question, don't give additional information.
+ ## Information :
+ {}
+
+ ## Question :
+ {}"""
+
+ while True:
+     q = input("prompt : ")
+
+     # Retrieve the five chunks most similar to the question.
+     results = collection.query(query_texts=[q], n_results=5)
+     infos = results["documents"][0]
+
+     info_text = "\n---\n".join(infos)
+
+     prompt = prompt_template.format(info_text, q)
+
+     completion = openai_client.chat.completions.create(
+         model="deepseek/deepseek-r1:free",
+         messages=[{"role": "user", "content": prompt}])
+
+     print(completion.choices[0].message.content)