Update sherlock2.py
sherlock2.py  CHANGED  (+46 -25)
@@ -66,13 +66,14 @@ def generate_embeddings_from_documents(extracted_text):
 
 
 # Web scraping and Wikipedia search function
-def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
+def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, react_model='gemini-pro'):
     """
-    Searches and scrapes Wikipedia
+    Searches and scrapes Wikipedia using the ReAct prompting method to find information relevant to the provided keywords.
+
     Args:
         keywords (list): A list of keywords to search for on Wikipedia.
         max_topics_per_query (int, optional): The maximum number of Wikipedia topics to explore for each query. Defaults to 3.
-        mining_model (str, optional): The name of the generative model to use.
+        react_model (str, optional): The name of the generative model to use with ReAct prompting.
             Defaults to 'gemini-pro'.
     Returns:
         list: A list of dictionaries, where each dictionary represents a relevant piece of information, with keys:
@@ -82,31 +83,54 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
         - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
     """
 
+    # ReAct Prompt Template (similar to the provided example)
+    react_prompt_template = """
+    **Question:** {question}
+
+    **Thought 1:** I need to search Wikipedia for information related to "{question}".
+
+    **Action 1:** <search>{question}</search>
+
+    **Observation 1:** {observation} # This will be filled in during the process
+
+    # ... (Further Thought-Action-Observation steps as needed)
+
+    **Action N:** <finish>{answer}</finish> # The final answer will be extracted from here
+    """
+
     search_history = set()  # Keep track of explored topics to avoid redundancy
     wikipedia_info = []
-    mining_model = genai.GenerativeModel(mining_model)  # Initialize the generative model
+    react_model = genai.GenerativeModel(react_model)  # Initialize the generative model
 
     for query in keywords:
         search_terms = wikipedia.search(query, results=max_topics_per_query, suggestion=False)  # Search Wikipedia
 
         for search_term in search_terms:
             if search_term in search_history:
                 continue
 
             search_history.add(search_term)
 
             try:
-                ...
-                """))
+                # Construct the initial ReAct prompt
+                react_prompt = react_prompt_template.format(question=search_term, observation="")
+
+                # Perform ReAct-based search and extraction
+                while True:
+                    response = react_model.generate_content([react_prompt], stop_sequences=["</finish>"])
+
+                    # Extract action and observation from the response
+                    action, observation = re.findall(r"<(.*?)>(.*?)</\1>", response.text)[-1]  # Get the last action and observation
+
+                    # Update the ReAct prompt with the observation
+                    react_prompt = react_prompt.replace("{observation}", observation.strip(), 1)  # Replace only the first occurrence
+
+                    if action == "finish":
+                        answer = observation.strip()  # Extract the final answer
+                        break  # Exit the loop when </finish> is encountered
+
+                page = wikipedia.page(search_term, auto_suggest=False)
+                url = page.url
 
                 additional_sources = []
                 if response.candidates[0].citation_metadata:
@@ -114,16 +138,16 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
 
             wikipedia_info.append({
                 "topic": search_term,
-                "summary":
+                "summary": answer,  # Use the extracted answer as the summary
                 "url": url,
                 "additional_sources": additional_sources
             })
 
         except wikipedia.exceptions.DisambiguationError:
             print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
         except wikipedia.exceptions.PageError:
             print(f"No Wikipedia page found for '{search_term}', skipping.")
         except Exception as e:
             st.error(f"Error searching Wikipedia: {e}")
 
     return wikipedia_info
@@ -290,15 +314,12 @@ def investigate():
     with st.expander("Sherlock's Analysis and Suggestions:"):
         st.write(response.text)
 
-    # Initialize wikipedia_info with an empty list
-    wikipedia_info = []
-
     search_options = st.multiselect("Search for additional clues:", ["Wikipedia", "Internet"], default=["Wikipedia"])
     if st.button("Search"):
         with st.spinner("Searching for clues..."):
             web_search_results = []
             if "Wikipedia" in search_options:
-                wikipedia_info = search_and_scrape_wikipedia(keywords)
+                wikipedia_info = search_and_scrape_wikipedia(keywords)  # Use the new ReAct-based function
                 st.subheader("Wikipedia Findings:")
                 for info in wikipedia_info:
                     st.write(f"**Topic:** {info['topic']}")
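A note on the prompt template: str.format(question=search_term, observation="") fills every brace placeholder at once, so the unfilled {answer} slot raises a KeyError, and {observation} is replaced by an empty string before the loop's later replace("{observation}", ...) can find it. Below is a minimal sketch of an incremental fill that keeps later slots intact by substituting one placeholder at a time with str.replace; the question and observation strings are hypothetical, for illustration only.

react_prompt_template = """\
**Question:** {question}

**Thought 1:** I need to search Wikipedia for information related to "{question}".

**Action 1:** <search>{question}</search>

**Observation 1:** {observation}

**Action N:** <finish>{answer}</finish>
"""

# Fill placeholders one at a time so "{answer}" survives for the model.
prompt = react_prompt_template.replace("{question}", "Baskerville Hall")
prompt = prompt.replace("{observation}", "Baskerville Hall is a country house in Powys, Wales.", 1)
print(prompt)  # "{answer}" is still present in the output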
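The extraction step hinges on a backreference pattern: re.findall(r"<(.*?)>(.*?)</\1>", text) returns a (tag, body) tuple for every matched <tag>body</tag> pair, and the diff keeps the last one. Here is a small, self-contained demonstration on a hypothetical ReAct transcript. Note that the pattern only matches when the closing tag is present, so if stop_sequences=["</finish>"] trims </finish> from the model's output, the final pair will not match as written; the transcript below includes the closing tag explicitly.

import re

# Hypothetical ReAct transcript for illustration.
transcript = (
    "**Action 1:** <search>Baskerville Hall</search>\n"
    "**Observation 1:** Baskerville Hall is a country house in Powys, Wales.\n"
    "**Action 2:** <finish>Baskerville Hall is a country house in Powys, Wales.</finish>"
)

pairs = re.findall(r"<(.*?)>(.*?)</\1>", transcript)
print(pairs)
# [('search', 'Baskerville Hall'),
#  ('finish', 'Baskerville Hall is a country house in Powys, Wales.')]

action, observation = pairs[-1]  # keep the last action, as the updated code does
print(action)       # finish
print(observation)  # Baskerville Hall is a country house in Powys, Wales.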
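For reference, a hypothetical call to the updated function. It assumes google.generativeai has been configured with an API key and that the wikipedia package is installed; the keywords are illustrative, not from the app.

import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # placeholder; supply a real key

# Illustrative keywords only; any list of case-related terms would do.
results = search_and_scrape_wikipedia(["Baskerville Hall", "Dartmoor"], max_topics_per_query=2)

for info in results:
    print(f"{info['topic']}: {info['url']}")
    print(info["summary"])
    if info["additional_sources"]:
        print("Sources:", ", ".join(info["additional_sources"]))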