Spaces:

thejagstudio
/

narayangpt

Sleeping

App Files Files Community

thejagstudio committed on Aug 8, 2024

Commit

ba5136e

verified ·

1 Parent(s): 6557919

Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +1 -0
Dockerfile +11 -0
README.md +9 -11
chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/data_level0.bin +3 -0
chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/header.bin +3 -0
chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/index_metadata.pickle +3 -0
chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/length.bin +3 -0
chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/link_lists.bin +3 -0
chroma/chroma.sqlite3 +3 -0
databaseCreator.py +151 -0
encodingGen.py +42 -0
main.py +232 -0
requirements.txt +12 -0
templates/index.html +1 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Base image: official Python 3.9.
+FROM python:3.9
+# All following paths are relative to /code inside the image.
+WORKDIR /code
+# Copy only the requirements file first so the dependency layer can be cached
+# when just the application source changes.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Copy the rest of the project into the working directory.
+COPY . .
+# Start the application; main.py serves it with waitress on port 7860.
+CMD ["python", "main.py"]

README.md CHANGED Viewed

@@ -1,11 +1,9 @@
----
-title: Narayangpt
-emoji: 🏢
-colorFrom: blue
-colorTo: yellow
-sdk: docker
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Narayangpt
+emoji: 😻
+colorFrom: gray
+colorTo: green
+sdk: docker
+pinned: false
+license: cc-by-3.0
+---

chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:998e0cca15fc892538d911e028c1661336ba6c465037bba4619908939edcd98b
+size 29652000

chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee810497fc5b8c99f0b6ffea36b49f5aaa802fb1fdd969845acc766e4cc33727
+size 100

chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7bb2921a6158c0c3d90cae154d9915b3029b3ba1cea6b2d2ab909c58579f63ca
+size 466769

chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:869697aa4f1bec42bd2dd030b8f950ec956cfc77ff973e28a600915f446b1a5d
+size 28000

chroma/468d1e28-05e8-41cd-a9e6-27b3066ef48a/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0b0384bec817cb17e10361b7a9f530011d5d25a83b9b56a16290ce9b1315b9d
+size 62408

chroma/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:460fccfb79271f6ad2c8d8906810f61f495ff33fd0f9ae5a5827870747aab6f2
+size 124534784

databaseCreator.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import argparse
+import os
+import shutil
+from langchain_community.document_loaders import PyPDFDirectoryLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.schema.document import Document
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings.bedrock import BedrockEmbeddings
+import json
+import requests
+from chromadb import Documents, EmbeddingFunction, Embeddings
+# Directory used as the Chroma persist_directory (and removed by --reset).
+CHROMA_PATH = "chroma"
+# Directory handed to PyPDFDirectoryLoader as the source of PDF files.
+DATA_PATH = "pdfs"
+class MyEmbeddingFunction(EmbeddingFunction):
+    def embed_documents(self, input: Documents) -> Embeddings:
+        for i in range(5):
+            try:
+                embeddings = []
+                url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
+                payload = json.dumps({
+                    "inputs": input
+                })
+                headers = {
+                    'Accept': 'application/json, text/plain, */*',
+                    'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+                    'Connection': 'keep-alive',
+                    'Content-Type': 'application/json',
+                    'Origin': 'https://deepinfra.com',
+                    'Referer': 'https://deepinfra.com/',
+                    'Sec-Fetch-Dest': 'empty',
+                    'Sec-Fetch-Mode': 'cors',
+                    'Sec-Fetch-Site': 'same-site',
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+                    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+                response = requests.request("POST", url, headers=headers, data=payload)
+                return response.json()["embeddings"]
+            except:
+                pass
+def main():
+    # Check if the database should be cleared (using the --clear flag).
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--reset", action="store_true", help="Reset the database.")
+    args = parser.parse_args()
+    if args.reset:
+        print("✨ Clearing Database")
+        clear_database()
+    # Create (or update) the data store.
+    documents = load_documents()
+    chunks = split_documents(documents)
+    add_to_chroma(chunks)
+def load_documents():
+    print("📚 Loading Documents")
+    document_loader = PyPDFDirectoryLoader(DATA_PATH)
+    return document_loader.load()
+def split_documents(documents: list[Document]):
+    print("🔪 Splitting Documents")
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=4000,
+        chunk_overlap=100,
+        length_function=len,
+        is_separator_regex=True
+    )
+    return text_splitter.split_documents(documents)
+def add_to_chroma(chunks: list[Document]):
+    print("🔗 Adding to Chroma")
+    # Load the existing database.
+    custom_embeddings = MyEmbeddingFunction()
+    db = Chroma(
+        persist_directory=CHROMA_PATH, embedding_function=custom_embeddings
+    )
+    # Calculate Page IDs.
+    chunks_with_ids = calculate_chunk_ids(chunks)
+    # Add or Update the documents.
+    existing_items = db.get(include=[])  # IDs are always included by default
+    existing_ids = set(existing_items["ids"])
+    print(f"Number of existing documents in DB: {len(existing_ids)}")
+    # Only add documents that don't exist in the DB.
+    new_chunks = []
+    for chunk in chunks_with_ids:
+        if chunk.metadata["id"] not in existing_ids:
+            new_chunks.append(chunk)
+    if len(new_chunks):
+        print(f"👉 Adding new documents: {len(new_chunks)}")
+        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
+        for i in range(0, len(new_chunks), 100):
+            try:
+                db.add_documents(new_chunks[i:i+100], ids=new_chunk_ids[i:i+100])
+                db.persist()
+                print(f"Added {i+100} documents")
+            except:
+                pass
+    else:
+        print("✅ No new documents to add")
+def calculate_chunk_ids(chunks):
+    last_page_id = None
+    current_chunk_index = 0
+    for chunk in chunks:
+        source = chunk.metadata.get("source")
+        page = chunk.metadata.get("page")
+        current_page_id = f"{source}:{page}"
+        # If the page ID is the same as the last one, increment the index.
+        if current_page_id == last_page_id:
+            current_chunk_index += 1
+        else:
+            current_chunk_index = 0
+        # Calculate the chunk ID.
+        chunk_id = f"{current_page_id}:{current_chunk_index}"
+        last_page_id = current_page_id
+        # Add it to the page meta-data.
+        chunk.metadata["id"] = chunk_id
+    return chunks
+def clear_database():
+    if os.path.exists(CHROMA_PATH):
+        shutil.rmtree(CHROMA_PATH)
+# Run the database build only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()

encodingGen.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import requests
+import json
+with open("embeddingData.json", "r") as f:
+    data = json.loads(f.read())
+for i in range(0,len(data),10):
+    newData = []
+    for j in range(i,i+10):
+        try:
+            newData.append(data[j]["text"])
+        except:
+            pass
+    url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
+    payload = json.dumps({
+        "inputs": newData
+    })
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/json',
+        'Origin': 'https://deepinfra.com',
+        'Referer': 'https://deepinfra.com/',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-site',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+    for j in range(len(response.json()["embeddings"])):
+        data[i+j]["embedding"] = response.json()["embeddings"][j]
+        print(data[i+j]["text"])
+with open("embeddingData.json", "w") as f:
+    f.write(json.dumps(data, indent=4))

main.py ADDED Viewed

	@@ -0,0 +1,232 @@

+from flask import Flask, request, jsonify, render_template, Response
+import os
+import requests
+import json
+from scipy import spatial
+from flask_cors import CORS
+import random
+import numpy as np
+from langchain_chroma import Chroma
+from chromadb import Documents, EmbeddingFunction, Embeddings
+# Flask application object; the @app.route decorators in this file register on it.
+app = Flask(__name__)
+# Wrap the app with flask_cors.CORS using its default settings.
+CORS(app)
+class MyEmbeddingFunction(EmbeddingFunction):
+    def embed_documents(self, input: Documents) -> Embeddings:
+        for i in range(5):
+            try:
+                embeddings = []
+                url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
+                payload = json.dumps({
+                    "inputs": input
+                })
+                headers = {
+                    'Accept': 'application/json, text/plain, */*',
+                    'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+                    'Connection': 'keep-alive',
+                    'Content-Type': 'application/json',
+                    'Origin': 'https://deepinfra.com',
+                    'Referer': 'https://deepinfra.com/',
+                    'Sec-Fetch-Dest': 'empty',
+                    'Sec-Fetch-Mode': 'cors',
+                    'Sec-Fetch-Site': 'same-site',
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+                    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+                response = requests.request("POST", url, headers=headers, data=payload)
+                return response.json()["embeddings"]
+            except:
+                pass
+    def embed_query(self, input: Documents) -> Embeddings:
+        print(input)
+        for i in range(5):
+            try:
+                embeddings = []
+                url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
+                payload = json.dumps({
+                    "inputs": [input]
+                })
+                headers = {
+                    'Accept': 'application/json, text/plain, */*',
+                    'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+                    'Connection': 'keep-alive',
+                    'Content-Type': 'application/json',
+                    'Origin': 'https://deepinfra.com',
+                    'Referer': 'https://deepinfra.com/',
+                    'Sec-Fetch-Dest': 'empty',
+                    'Sec-Fetch-Mode': 'cors',
+                    'Sec-Fetch-Site': 'same-site',
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+                    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+                response = requests.request("POST", url, headers=headers, data=payload)
+                return response.json()["embeddings"][0]
+            except:
+                pass
+# Directory used as the Chroma persist_directory.
+CHROMA_PATH = "chroma"
+# Embedding client handed to the vector store below.
+custom_embeddings = MyEmbeddingFunction()
+# Module-level Chroma handle created at import time; /api/getContext queries it.
+db = Chroma(
+    persist_directory=CHROMA_PATH, embedding_function=custom_embeddings
+)
+def embeddingGen(query):
+    url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
+    payload = json.dumps({
+        "inputs": [query]
+    })
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/json',
+        'Origin': 'https://deepinfra.com',
+        'Referer': 'https://deepinfra.com/',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-site',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+    return response.json()
+def strings_ranked_by_relatedness(query, df, top_n=5):
+    def relatedness_fn(x, y):
+        x_norm = np.linalg.norm(x)
+        y_norm = np.linalg.norm(y)
+        return np.dot(x, y) / (x_norm * y_norm)
+    query_embedding_response = embeddingGen(query)
+    query_embedding = query_embedding_response["embeddings"][0]
+    strings_and_relatednesses = [
+        (row["text"], relatedness_fn(query_embedding, row["embedding"])) for row in df
+    ]
+    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+    strings, relatednesses = zip(*strings_and_relatednesses)
+    return strings[:top_n], relatednesses[:top_n]
+@app.route("/api/gpt", methods=["POST"])
+def gptRes():
+    data = request.get_json()
+    messages = data["messages"]
+    def inference():
+        url = "https://api.deepinfra.com/v1/openai/chat/completions"
+        payload = json.dumps({
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "messages": messages,
+            "stream": True,
+            "max_tokens": 1024,
+        })
+        headers = {
+            'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/json',
+            'Origin': 'https://deepinfra.com',
+            'Referer': 'https://deepinfra.com/',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-site',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
+            'X-Deepinfra-Source': 'web-page',
+            'accept': 'text/event-stream',
+            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"'
+        }
+        response = requests.request("POST", url, headers=headers, data=payload, stream=True)
+        for line in response.iter_lines(decode_unicode=True):
+            if line:
+                # try:
+                #     line = line.split("data:")[1]
+                #     line = json.loads(line)
+                #     yield line["choices"][0]["delta"]["content"]
+                # except:
+                #     yield ""
+                yield line
+    return Response(inference(), content_type='text/event-stream')
+@app.route("/", methods=["GET"])
+def index():
+    # Serve the front-end entry page by rendering the "index.html" template.
+    return render_template("index.html")
+@app.route("/api/getAPI", methods=["POST"])
+def getAPI():
+    return jsonify({"API":  random.choice(apiKeys)})
+@app.route("/api/getContext", methods=["POST"])
+def getContext():
+    global db
+    question = request.form["question"]
+    results = db.similarity_search_with_score(question, k=5)
+    context = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+    sources = [doc.metadata.get("id", None) for doc, _score in results]
+    return jsonify({"context": context, "sources": sources})
+@app.route("/api/audioGenerate", methods=["POST"])
+def audioGenerate():
+    answer = request.form["answer"]
+    audio = []
+    for i in answer.split("\n"):
+        url = "https://deepgram.com/api/ttsAudioGeneration"
+        payload = json.dumps({
+            "text": i,
+            "model": "aura-asteria-en",
+            "demoType": "landing-page",
+            "params": "tag=landingpage-product-texttospeech"
+        })
+        headers = {
+            'accept': '*/*',
+            'accept-language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
+            'content-type': 'application/json',
+            'origin': 'https://deepgram.com',
+            'priority': 'u=1, i',
+            'referer': 'https://deepgram.com/',
+            'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+        }
+        response = requests.request("POST", url, headers=headers, data=payload)
+        audio.append(response.json()["data"])
+    return jsonify({"audio": audio})
+# Start the server only when this file is run as a script (the Dockerfile's
+# CMD runs `python main.py`).
+if __name__ == "__main__":
+    # Alternative using Flask's built-in server with debug mode enabled:
+    # app.run(debug=True)
+    # waitress is imported lazily so it is only needed when serving this way.
+    from waitress import serve
+    # Listen on all interfaces, port 7860.
+    serve(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Flask
+scipy
+requests
+Flask-Cors
+pypdf
+langchain
+chromadb
+pytest
+langchain-community
+langchain_chroma
+waitress
+uvicorn

templates/index.html ADDED Viewed

	@@ -0,0 +1 @@

+ <!doctype html><html lang="en"><head><meta charset="utf-8"><link rel="icon" href="/images/logo.png"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="theme-color" content="#000000"><meta name="description" content="Web site created using create-react-app"><link rel="apple-touch-icon" href="/logo192.png"><link rel="manifest" href="/manifest.json"><title>NarayanGPT</title><script defer="defer" src="/static/js/main.99ba6a2a.js"></script><link href="/static/css/main.3346b154.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>