Commit e1cda2e (parent: ef67833): revert

Files changed:
- .gitignore +3 -0
- README.md +2 -1
- app.py +30 -62
- configs/.env +1 -0
- configs/config.py +45 -0
- llm_setup/llm_setup.py +75 -0
- processing/documents.py +45 -0
- processing/texts.py +6 -0
- requirements.txt +153 -1
- services/scraper.py +26 -0
- stores/chroma.py +23 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+venv
+configs/.env
+.idea
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: AIVI Bot
 emoji: 💬
 colorFrom: yellow
 colorTo: purple
@@ -7,6 +7,7 @@ sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
 pinned: false
+license: mit
 ---

 An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,63 +1,31 @@
+import logging
 import gradio as gr
-… (old lines 2-30 not shown) …
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
+import configs.config as config
+import services.scraper
+import stores.chroma
+from llm_setup.llm_setup import LLMService
+
+logger = logging.getLogger()  # Create a logger object
+logger.setLevel(logging.INFO)  # Set the logging level to INFO
+
+config.set_envs()  # Set environment variables using the config module
+
+store = stores.chroma.ChromaDB(config.EMBEDDINGS)
+service = services.scraper.Service(store)
+
+# Scrape data and get the store vector retriever
+service.scrape_and_get_store_vector_retriever(config.URLS)
+
+# Initialize the LLMService with logger, prompt, and store vector retriever
+llm_svc = LLMService(logger, config.SYSTEM_PROMPT, store.get_chroma_instance().as_retriever())
+
+
+def respond(user_input, history):
+    response = llm_svc.conversational_rag_chain().invoke(user_input)
+
+    return response
+
+
+if __name__ == '__main__':
+    logging.info("Starting AIVIz Bot")
+    gr.ChatInterface(respond).launch(share=True)
configs/.env ADDED
@@ -0,0 +1 @@
+GOOGLE_API_KEY=""
configs/config.py ADDED
@@ -0,0 +1,45 @@
+import getpass as getpass
+import os
+
+from dotenv import load_dotenv
+from langchain_huggingface import HuggingFaceEmbeddings
+
+load_dotenv()
+
+URLS = ["https://aisviz.gitbook.io/documentation", "https://aisviz.gitbook.io/documentation/default-start/quick-start",
+        "https://aisviz.gitbook.io/documentation/default-start/sql-database",
+        "https://aisviz.gitbook.io/documentation/default-start/ais-hardware",
+        "https://aisviz.gitbook.io/documentation/default-start/compile-aisdb",
+        "https://aisviz.gitbook.io/documentation/tutorials/database-loading",
+        "https://aisviz.gitbook.io/documentation/tutorials/data-querying",
+        "https://aisviz.gitbook.io/documentation/tutorials/data-cleaning",
+        "https://aisviz.gitbook.io/documentation/tutorials/data-visualization",
+        "https://aisviz.gitbook.io/documentation/tutorials/track-interpolation",
+        "https://aisviz.gitbook.io/documentation/tutorials/haversine-distance",
+        "https://aisviz.gitbook.io/documentation/tutorials/vessel-speed",
+        "https://aisviz.gitbook.io/documentation/tutorials/coast-shore-and-ports",
+        "https://aisviz.gitbook.io/documentation/tutorials/vessel-metadata",
+        "https://aisviz.gitbook.io/documentation/tutorials/using-your-ais-data",
+        "https://aisviz.gitbook.io/documentation/tutorials/ais-data-to-csv",
+        "https://aisviz.gitbook.io/documentation/tutorials/bathymetric-data",
+        "https://aisviz.gitbook.io/documentation/machine-learning/seq2seq-in-pytorch",
+        "https://aisviz.gitbook.io/documentation/machine-learning/autoencoders-in-keras"]
+CHUNK_SIZE = 2400
+CHUNK_OVERLAP = 200
+TOTAL_RESULTS = 2389
+MAX_SIZE = 100
+EMBEDDINGS = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/all-mpnet-base-v2",
+    model_kwargs={"device": "cpu"},
+)
+
+SYSTEM_PROMPT = """
+You are a chatbot to assist users asking about Automatic Identification systems (AIS) from the context given to you.
+Use this Context: {context}. If the question is beyond the context, just tell you don't know the answer.
+Give scenario based answer that can clearly explain it to humans.
+Now, answer for this user's question in a descriptive manner: {question}."""
+
+
+def set_envs():
+    if "GOOGLE_API_KEY" not in os.environ:
+        os.environ["GOOGLE_API_KEY"] = getpass.getpass(os.getenv("GOOGLE_API_KEY"))
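The `{context}` and `{question}` placeholders in `SYSTEM_PROMPT` are filled at query time by the prompt template built in llm_setup/llm_setup.py. A minimal sketch of that substitution outside the chain (the sample context and question strings are illustrative only, not part of the commit):

```python
from langchain_core.prompts import PromptTemplate

from configs.config import SYSTEM_PROMPT

# Build the same template the RAG chain uses and render it by hand.
prompt = PromptTemplate(input_variables=["context", "question"], template=SYSTEM_PROMPT)
rendered = prompt.format(
    context="AIS transponders broadcast vessel position reports...",  # stand-in for retrieved chunks
    question="What is AIS?",                                          # stand-in for the user's message
)
print(rendered)
```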
llm_setup/llm_setup.py ADDED
@@ -0,0 +1,75 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    PromptTemplate,
+    HumanMessagePromptTemplate
+)
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_google_genai import ChatGoogleGenerativeAI
+
+from processing.documents import format_documents
+
+
+def _initialize_llm() -> ChatGoogleGenerativeAI:
+    """
+    Initializes the LLM instance.
+    """
+    llm = ChatGoogleGenerativeAI(model="gemini-pro")
+    return llm
+
+
+class LLMService:
+    """
+    Service for managing LLM interactions and conversational RAG chain.
+
+    Args:
+        logger: Logger instance for logging.
+        system_prompt: The prompt for the QA system.
+        web_retriever: A VectorStoreRetriever instance for retrieving web documents.
+    """
+
+    def __init__(self, logger, system_prompt: str, web_retriever: VectorStoreRetriever):
+        self._conversational_rag_chain = None
+        self._logger = logger
+        self.system_prompt = system_prompt
+        self._web_retriever = web_retriever
+
+        self.llm = _initialize_llm()
+
+        self._initialize_conversational_rag_chain()
+
+    def _initialize_conversational_rag_chain(self):
+        """
+        Initializes the conversational RAG chain.
+        """
+        # Initialize RAG (Retrieval-Augmented Generation) chain
+        prompt = ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(
+            prompt=PromptTemplate(input_variables=['context', 'question'], template=self.system_prompt))])
+
+        # Initialize conversational RAG chain
+        self._conversational_rag_chain = (
+                {"context": self._web_retriever | format_documents, "question": RunnablePassthrough()}
+                | prompt
+                | self.llm
+                | StrOutputParser()
+        )
+
+    def conversational_rag_chain(self):
+        """
+        Returns the initialized conversational RAG chain.
+
+        Returns:
+            The conversational RAG chain instance.
+        """
+        return self._conversational_rag_chain
+
+    def get_llm(self) -> ChatGoogleGenerativeAI:
+        """
+        Returns the LLM instance.
+        """
+
+        if self.llm is None:
+            raise Exception("llm is not initialized")
+
+        return self.llm
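The dict at the head of the pipeline in `_initialize_conversational_rag_chain` is LCEL's parallel-map idiom: for a single question string, the retriever branch produces formatted context while `RunnablePassthrough()` forwards the question unchanged. A self-contained sketch of the same composition with a stubbed retriever and a stand-in for the LLM call, showing what each stage receives (everything here is illustrative, not part of the commit):

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Stub "retriever": returns canned context text instead of hitting Chroma.
fake_retriever = RunnableLambda(lambda q: f"(docs retrieved for: {q})")

prompt = PromptTemplate.from_template("Context: {context}\nQuestion: {question}")

chain = (
    {"context": fake_retriever, "question": RunnablePassthrough()}
    | prompt
    | RunnableLambda(lambda p: p.to_string().upper())  # stand-in for the Gemini call
    | StrOutputParser()
)

print(chain.invoke("What is AIS?"))
```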
processing/documents.py ADDED
@@ -0,0 +1,45 @@
+from langchain_community.document_loaders import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from typing import Iterable
+
+
+def load_documents(website: str) -> list[Document]:
+    """
+    Loads documents from a given website.
+
+    Args:
+        website (str): The URL of the website to load documents from.
+
+    Returns:
+        list[Document]: A list of loaded documents.
+    """
+    loader = WebBaseLoader(website)
+    return loader.load()
+
+
+def format_documents(docs: list[Document]) -> str:
+    """
+    Formats a list of documents into a single string.
+
+    Args:
+        docs (list[Document]): The list of documents to format.
+
+    Returns:
+        str: The formatted documents as a single string.
+    """
+    return "\n\n".join(doc.page_content for doc in docs)
+
+
+def split_documents(documents: Iterable[Document]) -> list[Document]:
+    """
+    Splits documents into smaller chunks.
+
+    Args:
+        documents (Iterable[Document]): The documents to split.
+
+    Returns:
+        list[Document]: A list of split documents.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    return text_splitter.split_documents(documents)
processing/texts.py ADDED
@@ -0,0 +1,6 @@
+def clean_text(text: str) -> str:
+    """
+    Clean the text by removing unwanted characters and formatting.
+    """
+    cleaned_text = text.replace("\n", " ").strip()
+    return cleaned_text
requirements.txt CHANGED
@@ -1 +1,153 @@
-huggingface_hub
+huggingface_hub
+aiofiles
+aiohappyeyeballs
+aiohttp
+aiosignal
+annotated-types
+anyio
+asgiref
+attrs
+backoff
+bcrypt
+build
+cachetools
+certifi
+charset-normalizer
+chroma-hnswlib
+chromadb
+click
+colorama
+coloredlogs
+contourpy
+cycler
+dataclasses-json
+Deprecated
+fastapi
+ffmpy
+filelock
+flatbuffers
+fonttools
+frozenlist
+fsspec
+google-ai-generativelanguage
+google-api-core
+google-api-python-client
+google-auth
+google-auth-httplib2
+google-generativeai
+googleapis-common-protos
+gradio
+gradio_client
+greenlet
+grpcio
+grpcio-status
+h11
+httpcore
+httplib2
+httptools
+httpx
+huggingface-hub
+humanfriendly
+idna
+importlib_metadata
+importlib_resources
+Jinja2
+joblib
+jsonpatch
+jsonpointer
+kiwisolver
+kubernetes
+langchain
+langchain-chroma
+langchain-community
+langchain-core
+langchain-google-genai
+langchain-huggingface
+langchain-text-splitters
+langsmith
+markdown-it-py
+MarkupSafe
+marshmallow
+matplotlib
+mdurl
+mmh3
+monotonic
+mpmath
+multidict
+mypy-extensions
+networkx
+numpy
+oauthlib
+onnxruntime
+opentelemetry-api
+opentelemetry-exporter-otlp-proto-common
+opentelemetry-exporter-otlp-proto-grpc
+opentelemetry-instrumentation
+opentelemetry-instrumentation-asgi
+opentelemetry-instrumentation-fastapi
+opentelemetry-proto
+opentelemetry-sdk
+opentelemetry-semantic-conventions
+opentelemetry-util-http
+orjson
+overrides
+packaging
+pandas
+pillow
+posthog
+proto-plus
+protobuf
+pyasn1
+pyasn1_modules
+pydantic
+pydantic-settings
+pydantic_core
+pydub
+Pygments
+pyparsing
+PyPika
+pyproject_hooks
+pyreadline3
+python-dateutil
+python-dotenv
+python-multipart
+pytz
+PyYAML
+regex
+requests
+requests-oauthlib
+rich
+rsa
+ruff
+safetensors
+scikit-learn
+scipy
+semantic-version
+sentence-transformers
+setuptools
+shellingham
+six
+sniffio
+SQLAlchemy
+starlette
+sympy
+tenacity
+threadpoolctl
+tokenizers
+tomlkit
+torch
+tqdm
+transformers
+typer
+typing-inspect
+typing_extensions
+tzdata
+uritemplate
+urllib3
+uvicorn
+watchfiles
+websocket-client
+websockets
+wrapt
+yarl
+zipp
services/scraper.py ADDED
@@ -0,0 +1,26 @@
+from langchain.schema import Document
+
+from processing.documents import load_documents, format_documents, split_documents
+from processing.texts import clean_text
+
+
+class Service:
+    def __init__(self, store):
+        self.store = store
+
+    def scrape_and_get_store_vector_retriever(self, urls: list[str]):
+        """
+        Scrapes website content from fetched schemes and creates a VectorStore retriever.
+        """
+        documents: list[Document] = []
+
+        for url in urls:
+            try:
+                website_documents = load_documents(url)
+                formatted_content = format_documents(website_documents)
+                cleaned_content = clean_text(formatted_content)
+                documents.append(Document(page_content=cleaned_content))
+            except Exception as e:
+                raise Exception(f"Error processing {url}: {e}")
+
+        self.store.store_embeddings(split_documents(documents))
stores/chroma.py ADDED
@@ -0,0 +1,23 @@
+from langchain.schema import Document
+from langchain_chroma import Chroma
+
+
+class ChromaDB:
+    def __init__(self, embeddings):
+        self._persistent_directory = "embeddings"
+        model_name = "sentence-transformers/all-mpnet-base-v2"
+        model_kwargs = {'device': 'cpu'}
+        encode_kwargs = {'normalize_embeddings': False}
+        self.embeddings = embeddings
+
+        self.chroma = Chroma(persist_directory=self._persistent_directory, embedding_function=self.embeddings)
+
+    def get_chroma_instance(self) -> Chroma:
+        return self.chroma
+
+    def store_embeddings(self, documents: list[Document]):
+        """
+        Store embeddings for the documents using HuggingFace embeddings and Chroma vectorstore.
+        """
+        self.chroma.add_documents(documents=documents, embeddings=self.embeddings,
+                                  persist_directory=self._persistent_directory)