mariemerenc committed on
Commit 8da6fbf · verified · 1 Parent(s): f347ed6

Upload 7 files

Files changed (7)
  1. .env +3 -0
  2. config.py +8 -0
  3. llm_setup.py +18 -0
  4. main.py +60 -0
  5. query_processing.py +36 -0
  6. vector_db.py +44 -0
  7. web_scrapping.py +79 -0
.env ADDED
@@ -0,0 +1,3 @@
+ GROQ_API_KEY = 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
+ SERPER_API_KEY = b585b73c65056ae09f0685e4f9b44aff7e04739e
+ GEMINI = AIzaSyCQUkG-WbFlWnzk9H_EM_91NqBjBENfAoQ
config.py ADDED
@@ -0,0 +1,8 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
+ GEMINI = os.getenv("GEMINI")
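A minimal sanity check for config.py, assuming the .env shown above sits next to it; the snippet is illustrative and not part of the commit:

# Hypothetical check (not part of this commit): confirm the three keys were loaded from .env
from config import GROQ_API_KEY, SERPER_API_KEY, GEMINI

missing = [name for name, value in [
    ("GROQ_API_KEY", GROQ_API_KEY),
    ("SERPER_API_KEY", SERPER_API_KEY),
    ("GEMINI", GEMINI),
] if not value]
if missing:
    print("Missing keys:", missing)
else:
    print("All keys loaded.")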
llm_setup.py ADDED
@@ -0,0 +1,18 @@
+ from langchain_groq import ChatGroq
+ from crewai import LLM
+ from config import GROQ_API_KEY, GEMINI
+
+ llm = ChatGroq(
+     model="llama-3.3-70b-specdec",
+     temperature=0,
+     max_tokens=500,
+     timeout=None,
+     max_retries=2,
+ )
+
+ crew_llm = LLM(
+     model="gemini/gemini-1.5-flash",
+     api_key=GEMINI,
+     max_tokens=500,
+     temperature=0.7
+ )
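ChatGroq also picks up GROQ_API_KEY from the environment populated by load_dotenv() in config.py, so the seemingly unused import is harmless. A small smoke test could look like the sketch below; it assumes a valid key and that the llama-3.3-70b-specdec model is still served by Groq (the prompt text is illustrative only):

# Hypothetical smoke test for the Groq chat model (not part of this commit)
from llm_setup import llm

response = llm.invoke([("human", "Reply with the single word: ok")])
print(response.content)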
main.py ADDED
@@ -0,0 +1,60 @@
+ import streamlit as st
+ from vector_db import setup_vector_db, get_local_content
+ from query_processing import process_query
+ from web_scrapping import get_web_content
+ import tempfile
+ import os
+
+ def main():
+     # Streamlit page configuration
+     st.set_page_config(page_title="Agentic RAG Query System", page_icon="🤖", layout="wide")
+     st.title("Agentic RAG Query System")
+     st.markdown("""
+     Bem-vindo ao sistema de consulta baseado em RAG (Retrieval-Augmented Generation)!
+     Faça uma pergunta e o sistema buscará informações em documentos locais ou na web para fornecer uma resposta.
+     """)
+
+     # PDF upload by the user
+     uploaded_file = st.file_uploader("Carregue seu arquivo PDF", type="pdf")
+
+     # Vector database initialization
+     if uploaded_file is not None:
+         if 'vector_db' not in st.session_state:
+             with st.spinner("Configurando o banco de dados vetorial..."):
+                 # Save the uploaded file temporarily
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                     tmp_file.write(uploaded_file.getvalue())
+                     tmp_file_path = tmp_file.name
+
+                 # Pass the temporary file path to setup_vector_db
+                 st.session_state.vector_db = setup_vector_db(tmp_file_path)
+                 st.session_state.local_context = get_local_content(st.session_state.vector_db, "")
+
+                 # Remove the temporary file after use
+                 os.unlink(tmp_file_path)
+
+                 st.success("Banco de dados vetorial configurado com sucesso!")
+     else:
+         st.warning("Por favor, carregue um arquivo PDF para continuar.")
+
+     # User input
+     query = st.text_input("Faça sua pergunta:", placeholder="Ex: O que é Agentic RAG?")
+
+     # Button to process the query
+     if st.button("Buscar Resposta"):
+         if query and 'vector_db' in st.session_state:
+             with st.spinner("Processando sua consulta..."):
+                 try:
+                     # Process the query
+                     result = process_query(query, st.session_state.vector_db, st.session_state.local_context)
+
+                     # Display the result
+                     st.subheader("Resposta:")
+                     st.write(result)
+                 except Exception as e:
+                     st.error(f"Ocorreu um erro ao processar sua consulta: {e}")
+         else:
+             st.warning("Por favor, insira uma pergunta e carregue um PDF.")
+
+ if __name__ == "__main__":
+     main()
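Assuming a standard Streamlit installation, the app is normally started from the command line with streamlit run main.py; the wrapper below is only an illustrative Python equivalent, not part of the commit:

# Hypothetical launcher (not part of this commit); same effect as running: streamlit run main.py
import subprocess

subprocess.run(["streamlit", "run", "main.py"], check=True)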
query_processing.py ADDED
@@ -0,0 +1,36 @@
+ from llm_setup import llm
+ from vector_db import get_local_content, check_local_knowledge
+ from web_scrapping import get_web_content
+
+ def generate_final_answer(context, query):
+     """Generate final answer using LLM"""
+     messages = [
+         (
+             "system",
+             "You are a helpful assistant. Use the provided context to answer the query accurately.",
+         ),
+         ("system", f"Context: {context}"),
+         ("human", query),
+     ]
+     response = llm.invoke(messages)
+     return response.content
+
+ def process_query(query, vector_db, local_context):
+     """Main function to process user query"""
+     print(f"Processing query: {query}")
+
+     # Step 1: Check whether the query can be answered with local knowledge
+     can_answer_locally = check_local_knowledge(query, vector_db)  # check_local_knowledge expects the vector store, not raw text
+     print(f"Can answer locally: {can_answer_locally}")
+
+     # Step 2: Get context from the local vector database or from the web
+     if can_answer_locally:
+         context = get_local_content(vector_db, query)
+         print("Retrieved context from local documents")
+     else:
+         context = get_web_content(query)
+         print("Retrieved context from web scraping")
+
+     # Step 3: Generate the final answer
+     answer = generate_final_answer(context, query)
+     return answer
vector_db.py ADDED
@@ -0,0 +1,44 @@
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ def setup_vector_db(pdf_path):
+     """Setup vector database from PDF"""
+     # Load the PDF and split it into chunks
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=50
+     )
+     chunks = text_splitter.split_documents(documents)
+
+     # Create the vector database
+     embeddings = HuggingFaceEmbeddings(
+         model_name="Snowflake/snowflake-arctic-embed-l-v2.0"
+     )
+     vector_db = FAISS.from_documents(chunks, embeddings)
+
+     return vector_db
+
+ def get_local_content(vector_db, query):
+     """Get content from vector database"""
+     docs = vector_db.similarity_search(query, k=5)
+     return " ".join([doc.page_content for doc in docs])
+
+ def check_local_knowledge(query, vector_db, threshold=0.7):
+     """
+     Check whether the query can be answered from local knowledge.
+     Returns True if the vector database holds relevant documents (the threshold parameter is currently unused).
+     """
+     try:
+         # Retrieve relevant documents from the database
+         docs = vector_db.similarity_search(query, k=1)
+         if docs:
+             return True  # relevant documents found
+         return False  # no relevant documents
+     except Exception as e:
+         print(f"Error checking local knowledge: {e}")
+         return False
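A hedged end-to-end sketch of the local retrieval path; the PDF path and question are illustrative, and it assumes the Snowflake embedding model can be downloaded from Hugging Face on first use:

# Hypothetical usage (not part of this commit): build the FAISS index and query it locally
from vector_db import setup_vector_db, get_local_content, check_local_knowledge

vector_db = setup_vector_db("example.pdf")  # illustrative path
question = "What is Agentic RAG?"
if check_local_knowledge(question, vector_db):
    print(get_local_content(vector_db, question))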
web_scrapping.py ADDED
@@ -0,0 +1,79 @@
+ from crewai import Agent, Task, Crew
+ from llm_setup import crew_llm
+ from crewai_tools import (
+     SerperDevTool,
+     ScrapeWebsiteTool
+ )
+
+ # Function provided by DataCamp
+ def setup_web_scraping_agent():
+     """Setup the web scraping agent and related components"""
+     search_tool = SerperDevTool()  # Tool for performing web searches
+     scrape_website = ScrapeWebsiteTool()  # Tool for extracting data from websites
+
+     # Define the web search agent
+     web_search_agent = Agent(
+         role="Expert Web Search Agent",
+         goal="Identify and retrieve relevant web data for user queries",
+         backstory="An expert in identifying valuable web sources for the user's needs",
+         allow_delegation=False,
+         verbose=True,
+         llm=crew_llm
+     )
+
+     # Define the web scraping agent
+     web_scraper_agent = Agent(
+         role="Expert Web Scraper Agent",
+         goal="Extract and analyze content from specific web pages identified by the search agent",
+         backstory="A highly skilled web scraper, capable of analyzing and summarizing website content accurately",
+         allow_delegation=False,
+         verbose=True,
+         llm=crew_llm
+     )
+
+     # Define the web search task
+     search_task = Task(
+         description=(
+             "Identify the most relevant web page or article for the topic: '{topic}'. "
+             "Use all available tools to search for and provide a link to a web page "
+             "that contains valuable information about the topic. Keep your response concise."
+         ),
+         expected_output=(
+             "A concise summary of the most relevant web page or article for '{topic}', "
+             "including the link to the source and key points from the content."
+         ),
+         tools=[search_tool],
+         agent=web_search_agent,
+     )
+
+     # Define the web scraping task
+     scraping_task = Task(
+         description=(
+             "Extract and analyze data from the given web page or website. Focus on the key sections "
+             "that provide insights into the topic: '{topic}'. Use all available tools to retrieve the content, "
+             "and summarize the key findings in a concise manner."
+         ),
+         expected_output=(
+             "A detailed summary of the content from the given web page or website, highlighting the key insights "
+             "and explaining their relevance to the topic: '{topic}'. Ensure clarity and conciseness."
+         ),
+         tools=[scrape_website],
+         agent=web_scraper_agent,
+     )
+
+     # Define the crew to manage agents and tasks
+     crew = Crew(
+         agents=[web_search_agent, web_scraper_agent],
+         tasks=[search_task, scraping_task],
+         verbose=1,
+         memory=False,
+     )
+     return crew
+
+
+ # Function provided by DataCamp
+ def get_web_content(query):
+     """Get content from web scraping"""
+     crew = setup_web_scraping_agent()
+     result = crew.kickoff(inputs={"topic": query})
+     return result.raw
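For completeness, a hedged sketch of the web fallback path; it assumes SERPER_API_KEY and GEMINI are set in .env (SerperDevTool reads SERPER_API_KEY from the environment) and that network access is available. The query string is illustrative only:

# Hypothetical usage (not part of this commit): run the CrewAI search-and-scrape pipeline directly
from web_scrapping import get_web_content

print(get_web_content("Agentic RAG"))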