Upload 7 files
- .env +3 -0
- config.py +8 -0
- llm_setup.py +18 -0
- main.py +60 -0
- query_processing.py +36 -0
- vector_db.py +44 -0
- web_scrapping.py +79 -0
.env
ADDED
@@ -0,0 +1,3 @@
GROQ_API_KEY = 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
SERPER_API_KEY = b585b73c65056ae09f0685e4f9b44aff7e04739e
GEMINI = AIzaSyCQUkG-WbFlWnzk9H_EM_91NqBjBENfAoQ
config.py
ADDED
@@ -0,0 +1,8 @@
import os
from dotenv import load_dotenv

load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
GEMINI = os.getenv("GEMINI")
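Since load_dotenv() only populates the process environment, a caller can fail fast when a key is absent. A minimal sketch using the names defined above (illustrative, not part of the uploaded files):

# Minimal sketch: fail fast if any key expected in .env is missing
from config import GROQ_API_KEY, SERPER_API_KEY, GEMINI

_required = {"GROQ_API_KEY": GROQ_API_KEY, "SERPER_API_KEY": SERPER_API_KEY, "GEMINI": GEMINI}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    raise RuntimeError(f"Missing API keys in .env: {', '.join(_missing)}")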
llm_setup.py
ADDED
@@ -0,0 +1,18 @@
from langchain_groq import ChatGroq
from crewai import LLM
from config import GROQ_API_KEY, GEMINI

# Groq-hosted Llama model used for answer generation
llm = ChatGroq(
    model="llama-3.3-70b-specdec",
    api_key=GROQ_API_KEY,  # pass the key loaded in config.py explicitly (ChatGroq can also read GROQ_API_KEY from the environment)
    temperature=0,
    max_tokens=500,
    timeout=None,
    max_retries=2,
)

# Gemini model used by the CrewAI web search/scraping agents
crew_llm = LLM(
    model="gemini/gemini-1.5-flash",
    api_key=GEMINI,
    max_tokens=500,
    temperature=0.7
)
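For a quick sanity check of the Groq handle, something like the sketch below can be run (the prompt is illustrative); crew_llm is exercised indirectly by the crew defined in web_scrapping.py.

# Illustrative smoke test for the Groq chat model configured above
from llm_setup import llm

response = llm.invoke("Reply with the single word: ok")
print(response.content)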
main.py
ADDED
@@ -0,0 +1,60 @@
import streamlit as st
from vector_db import setup_vector_db, get_local_content
from query_processing import process_query
from web_scrapping import get_web_content
import tempfile
import os

def main():
    # Streamlit page configuration
    st.set_page_config(page_title="Agentic RAG Query System", page_icon="🤖", layout="wide")
    st.title("Agentic RAG Query System")
    st.markdown("""
    Welcome to the RAG (Retrieval-Augmented Generation) query system!
    Ask a question and the system will search local documents or the web to provide an answer.
    """)

    # PDF upload by the user
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")

    # Vector database initialization
    if uploaded_file is not None:
        if 'vector_db' not in st.session_state:
            with st.spinner("Setting up the vector database..."):
                # Save the uploaded file temporarily
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                # Pass the temporary file path to setup_vector_db
                st.session_state.vector_db = setup_vector_db(tmp_file_path)
                st.session_state.local_context = get_local_content(st.session_state.vector_db, "")

                # Remove the temporary file after use
                os.unlink(tmp_file_path)

                st.success("Vector database set up successfully!")
    else:
        st.warning("Please upload a PDF file to continue.")

    # User input
    query = st.text_input("Ask your question:", placeholder="E.g.: What is Agentic RAG?")

    # Button to process the query
    if st.button("Search for Answer"):
        if query and 'vector_db' in st.session_state:
            with st.spinner("Processing your query..."):
                try:
                    # Process the query
                    result = process_query(query, st.session_state.vector_db, st.session_state.local_context)

                    # Display the result
                    st.subheader("Answer:")
                    st.write(result)
                except Exception as e:
                    st.error(f"An error occurred while processing your query: {e}")
        else:
            st.warning("Please enter a question and upload a PDF.")

if __name__ == "__main__":
    main()
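As with any Streamlit script, the app is started with streamlit run main.py; the vector database is built once per browser session and reused through st.session_state.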
query_processing.py
ADDED
@@ -0,0 +1,36 @@
from llm_setup import llm
from vector_db import get_local_content, check_local_knowledge
from web_scrapping import get_web_content

def generate_final_answer(context, query):
    """Generate final answer using LLM"""
    messages = [
        (
            "system",
            "You are a helpful assistant. Use the provided context to answer the query accurately.",
        ),
        ("system", f"Context: {context}"),
        ("human", query),
    ]
    response = llm.invoke(messages)
    return response.content

def process_query(query, vector_db, local_context):
    """Main function to process user query"""
    print(f"Processing query: {query}")

    # Step 1: Check whether the query can be answered with local knowledge
    # (check_local_knowledge, as defined in vector_db.py, expects the vector store, not the pre-fetched context)
    can_answer_locally = check_local_knowledge(query, vector_db)
    print(f"Can answer locally: {can_answer_locally}")

    # Step 2: Get context from the local database or from the web
    if can_answer_locally:
        context = get_local_content(vector_db, query)
        print("Retrieved context from local documents")
    else:
        context = get_web_content(query)
        print("Retrieved context from web scraping")

    # Step 3: Generate the final answer
    answer = generate_final_answer(context, query)
    return answer
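The routing pipeline can also be exercised outside Streamlit. A minimal sketch, assuming a local file named sample.pdf (a placeholder path):

# Illustrative command-line driver for the pipeline above (sample.pdf is a placeholder)
from vector_db import setup_vector_db, get_local_content
from query_processing import process_query

vector_db = setup_vector_db("sample.pdf")
local_context = get_local_content(vector_db, "")  # same bootstrap as in main.py
print(process_query("What is Agentic RAG?", vector_db, local_context))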
vector_db.py
ADDED
@@ -0,0 +1,44 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def setup_vector_db(pdf_path):
    """Setup vector database from PDF"""
    # Load the PDF and split it into chunks
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50
    )
    chunks = text_splitter.split_documents(documents)

    # Create the vector database
    embeddings = HuggingFaceEmbeddings(
        model_name="Snowflake/snowflake-arctic-embed-l-v2.0"
    )
    vector_db = FAISS.from_documents(chunks, embeddings)

    return vector_db

def get_local_content(vector_db, query):
    """Get content from vector database"""
    docs = vector_db.similarity_search(query, k=5)
    return " ".join([doc.page_content for doc in docs])

def check_local_knowledge(query, vector_db, threshold=0.7):
    """
    Check whether the query can be answered from local knowledge.
    Returns True if there are relevant documents in the vector database.
    (The threshold parameter is currently unused.)
    """
    try:
        # Retrieve relevant documents from the database
        docs = vector_db.similarity_search(query, k=1)
        if docs:
            return True  # there are relevant documents
        return False  # no relevant documents
    except Exception as e:
        print(f"Error checking local knowledge: {e}")
        return False
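The threshold parameter of check_local_knowledge is accepted but never applied: any single hit routes the query locally. If score-based gating is wanted, one option is FAISS's similarity_search_with_score, as sketched below; FAISS returns a distance here (lower means more similar), and the cut-off value is only illustrative.

# Illustrative score-gated variant (not the version used above)
def check_local_knowledge_scored(query, vector_db, max_distance=0.7):
    """Route locally only when the closest chunk is within max_distance."""
    try:
        results = vector_db.similarity_search_with_score(query, k=1)
        if not results:
            return False
        _doc, distance = results[0]
        return distance <= max_distance  # smaller distance is closer; the cut-off needs tuning
    except Exception as e:
        print(f"Error checking local knowledge: {e}")
        return False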
web_scrapping.py
ADDED
@@ -0,0 +1,79 @@
from crewai import Agent, Task, Crew
from llm_setup import crew_llm
from crewai_tools import (
    SerperDevTool,
    ScrapeWebsiteTool
)

# Function provided by DataCamp
def setup_web_scraping_agent():
    """Setup the web scraping agent and related components"""
    search_tool = SerperDevTool()  # Tool for performing web searches
    scrape_website = ScrapeWebsiteTool()  # Tool for extracting data from websites

    # Define the web search agent
    web_search_agent = Agent(
        role="Expert Web Search Agent",
        goal="Identify and retrieve relevant web data for user queries",
        backstory="An expert in identifying valuable web sources for the user's needs",
        allow_delegation=False,
        verbose=True,
        llm=crew_llm
    )

    # Define the web scraping agent
    web_scraper_agent = Agent(
        role="Expert Web Scraper Agent",
        goal="Extract and analyze content from specific web pages identified by the search agent",
        backstory="A highly skilled web scraper, capable of analyzing and summarizing website content accurately",
        allow_delegation=False,
        verbose=True,
        llm=crew_llm
    )

    # Define the web search task
    search_task = Task(
        description=(
            "Identify the most relevant web page or article for the topic: '{topic}'. "
            "Use all available tools to search for and provide a link to a web page "
            "that contains valuable information about the topic. Keep your response concise."
        ),
        expected_output=(
            "A concise summary of the most relevant web page or article for '{topic}', "
            "including the link to the source and key points from the content."
        ),
        tools=[search_tool],
        agent=web_search_agent,
    )

    # Define the web scraping task
    scraping_task = Task(
        description=(
            "Extract and analyze data from the given web page or website. Focus on the key sections "
            "that provide insights into the topic: '{topic}'. Use all available tools to retrieve the content, "
            "and summarize the key findings in a concise manner."
        ),
        expected_output=(
            "A detailed summary of the content from the given web page or website, highlighting the key insights "
            "and explaining their relevance to the topic: '{topic}'. Ensure clarity and conciseness."
        ),
        tools=[scrape_website],
        agent=web_scraper_agent,
    )

    # Define the crew to manage agents and tasks
    crew = Crew(
        agents=[web_search_agent, web_scraper_agent],
        tasks=[search_task, scraping_task],
        verbose=1,
        memory=False,
    )
    return crew


# Function provided by DataCamp
def get_web_content(query):
    """Get content from web scraping"""
    crew = setup_web_scraping_agent()
    result = crew.kickoff(inputs={"topic": query})
    return result.raw
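The module can also be invoked directly for testing, provided SERPER_API_KEY (read from the environment by SerperDevTool) and GEMINI are available; a minimal sketch:

# Illustrative direct call of the web pipeline above
from web_scrapping import get_web_content

print(get_web_content("Agentic RAG"))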