mariemerenc committed on
Commit 8da6fbf · verified · 1 Parent(s): f347ed6

Upload 7 files

Files changed (7)
  1. .env +3 -0
  2. config.py +8 -0
  3. llm_setup.py +18 -0
  4. main.py +60 -0
  5. query_processing.py +36 -0
  6. vector_db.py +44 -0
  7. web_scrapping.py +79 -0
.env ADDED
@@ -0,0 +1,3 @@
+ GROQ_API_KEY = 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
+ SERPER_API_KEY = b585b73c65056ae09f0685e4f9b44aff7e04739e
+ GEMINI = AIzaSyCQUkG-WbFlWnzk9H_EM_91NqBjBENfAoQ
config.py ADDED
@@ -0,0 +1,8 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
+ GEMINI = os.getenv("GEMINI")
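A minimal sanity check for config.py, assuming the .env shown above sits next to it; the snippet is illustrative and not part of the commit:

# Hypothetical check (not part of this commit): confirm the three keys were loaded from .env
from config import GROQ_API_KEY, SERPER_API_KEY, GEMINI

missing = [name for name, value in [
    ("GROQ_API_KEY", GROQ_API_KEY),
    ("SERPER_API_KEY", SERPER_API_KEY),
    ("GEMINI", GEMINI),
] if not value]
if missing:
    print("Missing keys:", missing)
else:
    print("All keys loaded.")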
llm_setup.py ADDED
@@ -0,0 +1,18 @@
+ from langchain_groq import ChatGroq
+ from crewai import LLM
+ from config import GROQ_API_KEY, GEMINI
+
+ llm = ChatGroq(
+     model="llama-3.3-70b-specdec",
+     temperature=0,
+     max_tokens=500,
+     timeout=None,
+     max_retries=2,
+ )
+
+ crew_llm = LLM(
+     model="gemini/gemini-1.5-flash",
+     api_key=GEMINI,
+     max_tokens=500,
+     temperature=0.7
+ )
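ChatGroq also picks up GROQ_API_KEY from the environment populated by load_dotenv() in config.py, so the seemingly unused import is harmless. A small smoke test could look like the sketch below; it assumes a valid key and that the llama-3.3-70b-specdec model is still served by Groq (the prompt text is illustrative only):

# Hypothetical smoke test for the Groq chat model (not part of this commit)
from llm_setup import llm

response = llm.invoke([("human", "Reply with the single word: ok")])
print(response.content)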
main.py ADDED
@@ -0,0 +1,60 @@
+ import streamlit as st
+ from vector_db import setup_vector_db, get_local_content
+ from query_processing import process_query
+ from web_scrapping import get_web_content
+ import tempfile
+ import os
+
+ def main():
+     # Streamlit page configuration
+     st.set_page_config(page_title="Agentic RAG Query System", page_icon="🤖", layout="wide")
+     st.title("Agentic RAG Query System")
+     st.markdown("""
+     Bem-vindo ao sistema de consulta baseado em RAG (Retrieval-Augmented Generation)!
+     Faça uma pergunta e o sistema buscará informações em documentos locais ou na web para fornecer uma resposta.
+     """)
+
+     # PDF upload by the user
+     uploaded_file = st.file_uploader("Carregue seu arquivo PDF", type="pdf")
+
+     # Vector database initialization
+     if uploaded_file is not None:
+         if 'vector_db' not in st.session_state:
+             with st.spinner("Configurando o banco de dados vetorial..."):
+                 # Save the uploaded file temporarily
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                     tmp_file.write(uploaded_file.getvalue())
+                     tmp_file_path = tmp_file.name
+
+                 # Pass the temporary file path to setup_vector_db
+                 st.session_state.vector_db = setup_vector_db(tmp_file_path)
+                 st.session_state.local_context = get_local_content(st.session_state.vector_db, "")
+
+                 # Remove the temporary file after use
+                 os.unlink(tmp_file_path)
+
+                 st.success("Banco de dados vetorial configurado com sucesso!")
+     else:
+         st.warning("Por favor, carregue um arquivo PDF para continuar.")
+
+     # User input
+     query = st.text_input("Faça sua pergunta:", placeholder="Ex: O que é Agentic RAG?")
+
+     # Button to process the query
+     if st.button("Buscar Resposta"):
+         if query and 'vector_db' in st.session_state:
+             with st.spinner("Processando sua consulta..."):
+                 try:
+                     # Process the query
+                     result = process_query(query, st.session_state.vector_db, st.session_state.local_context)
+
+                     # Display the result
+                     st.subheader("Resposta:")
+                     st.write(result)
+                 except Exception as e:
+                     st.error(f"Ocorreu um erro ao processar sua consulta: {e}")
+         else:
+             st.warning("Por favor, insira uma pergunta e carregue um PDF.")
+
+ if __name__ == "__main__":
+     main()
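Assuming a standard Streamlit installation, the app is normally started from the command line with streamlit run main.py; the wrapper below is only an illustrative Python equivalent, not part of the commit:

# Hypothetical launcher (not part of this commit); same effect as running: streamlit run main.py
import subprocess

subprocess.run(["streamlit", "run", "main.py"], check=True)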
query_processing.py ADDED
@@ -0,0 +1,36 @@
+ from llm_setup import llm
+ from vector_db import get_local_content, check_local_knowledge
+ from web_scrapping import get_web_content
+
+ def generate_final_answer(context, query):
+     """Generate final answer using LLM"""
+     messages = [
+         (
+             "system",
+             "You are a helpful assistant. Use the provided context to answer the query accurately.",
+         ),
+         ("system", f"Context: {context}"),
+         ("human", query),
+     ]
+     response = llm.invoke(messages)
+     return response.content
+
+ def process_query(query, vector_db, local_context):
+     """Main function to process user query"""
+     print(f"Processing query: {query}")
+
+     # Step 1: Check whether the query can be answered with local knowledge
+     can_answer_locally = check_local_knowledge(query, vector_db)  # check_local_knowledge expects the vector store, not raw text
+     print(f"Can answer locally: {can_answer_locally}")
+
+     # Step 2: Get context from the local vector database or from the web
+     if can_answer_locally:
+         context = get_local_content(vector_db, query)
+         print("Retrieved context from local documents")
+     else:
+         context = get_web_content(query)
+         print("Retrieved context from web scraping")
+
+     # Step 3: Generate the final answer
+     answer = generate_final_answer(context, query)
+     return answer
vector_db.py ADDED
@@ -0,0 +1,44 @@
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ def setup_vector_db(pdf_path):
+     """Setup vector database from PDF"""
+     # Load the PDF and split it into chunks
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=50
+     )
+     chunks = text_splitter.split_documents(documents)
+
+     # Create the vector database
+     embeddings = HuggingFaceEmbeddings(
+         model_name="Snowflake/snowflake-arctic-embed-l-v2.0"
+     )
+     vector_db = FAISS.from_documents(chunks, embeddings)
+
+     return vector_db
+
+ def get_local_content(vector_db, query):
+     """Get content from vector database"""
+     docs = vector_db.similarity_search(query, k=5)
+     return " ".join([doc.page_content for doc in docs])
+
+ def check_local_knowledge(query, vector_db, threshold=0.7):
+     """
+     Check whether the query can be answered from local knowledge.
+     Returns True if the vector database holds relevant documents (the threshold parameter is currently unused).
+     """
+     try:
+         # Retrieve relevant documents from the database
+         docs = vector_db.similarity_search(query, k=1)
+         if docs:
+             return True  # relevant documents found
+         return False  # no relevant documents
+     except Exception as e:
+         print(f"Error checking local knowledge: {e}")
+         return False
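A hedged end-to-end sketch of the local retrieval path; the PDF path and question are illustrative, and it assumes the Snowflake embedding model can be downloaded from Hugging Face on first use:

# Hypothetical usage (not part of this commit): build the FAISS index and query it locally
from vector_db import setup_vector_db, get_local_content, check_local_knowledge

vector_db = setup_vector_db("example.pdf")  # illustrative path
question = "What is Agentic RAG?"
if check_local_knowledge(question, vector_db):
    print(get_local_content(vector_db, question))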
web_scrapping.py ADDED
@@ -0,0 +1,79 @@
+ from crewai import Agent, Task, Crew
+ from llm_setup import crew_llm
+ from crewai_tools import (
+     SerperDevTool,
+     ScrapeWebsiteTool
+ )
+
+ # Function provided by DataCamp
+ def setup_web_scraping_agent():
+     """Setup the web scraping agent and related components"""
+     search_tool = SerperDevTool()  # Tool for performing web searches
+     scrape_website = ScrapeWebsiteTool()  # Tool for extracting data from websites
+
+     # Define the web search agent
+     web_search_agent = Agent(
+         role="Expert Web Search Agent",
+         goal="Identify and retrieve relevant web data for user queries",
+         backstory="An expert in identifying valuable web sources for the user's needs",
+         allow_delegation=False,
+         verbose=True,
+         llm=crew_llm
+     )
+
+     # Define the web scraping agent
+     web_scraper_agent = Agent(
+         role="Expert Web Scraper Agent",
+         goal="Extract and analyze content from specific web pages identified by the search agent",
+         backstory="A highly skilled web scraper, capable of analyzing and summarizing website content accurately",
+         allow_delegation=False,
+         verbose=True,
+         llm=crew_llm
+     )
+
+     # Define the web search task
+     search_task = Task(
+         description=(
+             "Identify the most relevant web page or article for the topic: '{topic}'. "
+             "Use all available tools to search for and provide a link to a web page "
+             "that contains valuable information about the topic. Keep your response concise."
+         ),
+         expected_output=(
+             "A concise summary of the most relevant web page or article for '{topic}', "
+             "including the link to the source and key points from the content."
+         ),
+         tools=[search_tool],
+         agent=web_search_agent,
+     )
+
+     # Define the web scraping task
+     scraping_task = Task(
+         description=(
+             "Extract and analyze data from the given web page or website. Focus on the key sections "
+             "that provide insights into the topic: '{topic}'. Use all available tools to retrieve the content, "
+             "and summarize the key findings in a concise manner."
+         ),
+         expected_output=(
+             "A detailed summary of the content from the given web page or website, highlighting the key insights "
+             "and explaining their relevance to the topic: '{topic}'. Ensure clarity and conciseness."
+         ),
+         tools=[scrape_website],
+         agent=web_scraper_agent,
+     )
+
+     # Define the crew to manage agents and tasks
+     crew = Crew(
+         agents=[web_search_agent, web_scraper_agent],
+         tasks=[search_task, scraping_task],
+         verbose=1,
+         memory=False,
+     )
+     return crew
+
+
+ # Function provided by DataCamp
+ def get_web_content(query):
+     """Get content from web scraping"""
+     crew = setup_web_scraping_agent()
+     result = crew.kickoff(inputs={"topic": query})
+     return result.raw
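For completeness, a hedged sketch of the web fallback path; it assumes SERPER_API_KEY and GEMINI are set in .env (SerperDevTool reads SERPER_API_KEY from the environment) and that network access is available. The query string is illustrative only:

# Hypothetical usage (not part of this commit): run the CrewAI search-and-scrape pipeline directly
from web_scrapping import get_web_content

print(get_web_content("Agentic RAG"))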