jayash391 committed on
Commit
74b353b
·
verified ·
1 Parent(s): dbec8c8

Update sherlock2.py

Browse files
Files changed (1) hide show
  1. sherlock2.py +46 -25
sherlock2.py CHANGED
@@ -66,13 +66,14 @@ def generate_embeddings_from_documents(extracted_text):
66
 
67
 
68
  # Web scraping and Wikipedia search function
69
- def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
70
  """
71
- Searches and scrapes Wikipedia for information relevant to the provided keywords.
 
72
  Args:
73
  keywords (list): A list of keywords to search for on Wikipedia.
74
  max_topics_per_query (int, optional): The maximum number of Wikipedia topics to explore for each query. Defaults to 3.
75
- mining_model (str, optional): The name of the generative model to use for extracting relevant information.
76
  Defaults to 'gemini-pro'.
77
  Returns:
78
  list: A list of dictionaries, where each dictionary represents a relevant piece of information, with keys:
@@ -82,31 +83,54 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='
82
  - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
83
  """
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  search_history = set() # Keep track of explored topics to avoid redundancy
86
  wikipedia_info = []
87
- mining_model = genai.GenerativeModel(mining_model) # Initialize the generative model
88
 
89
  for query in keywords:
90
- search_terms = wikipedia.search(query, results=max_topics_per_query, suggestion=False) # Search Wikipedia using the keyword
91
 
92
- for search_term in search_terms: # Explore top results
93
  if search_term in search_history:
94
- continue # Skip if the topic has already been explored
95
 
96
  search_history.add(search_term)
97
 
98
  try:
99
- page = wikipedia.page(search_term, auto_suggest=False) # Get the Wikipedia page
100
- url = page.url
101
- page_content = page.content
 
 
 
 
 
 
 
 
 
102
 
103
- # Extract Relevant Information using the Generative Model
104
- response = mining_model.generate_content(textwrap.dedent(f"""\
105
- Extract relevant information related to the keyword "{query}"
106
- from the following Wikipedia page content:
107
- {page_content}
108
- Note: Do not summarize the entire page. Only extract and return the information relevant to the keyword.
109
- """))
110
 
111
  additional_sources = []
112
  if response.candidates[0].citation_metadata:
@@ -114,16 +138,16 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='
114
 
115
  wikipedia_info.append({
116
  "topic": search_term,
117
- "summary": response.text,
118
  "url": url,
119
  "additional_sources": additional_sources
120
  })
121
 
122
- except wikipedia.exceptions.DisambiguationError: # Handle ambiguous search results
123
  print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
124
- except wikipedia.exceptions.PageError: # Handle cases where no Wikipedia page is found
125
  print(f"No Wikipedia page found for '{search_term}', skipping.")
126
- except Exception as e: # Handle other exceptions
127
  st.error(f"Error searching Wikipedia: {e}")
128
 
129
  return wikipedia_info
@@ -290,15 +314,12 @@ def investigate():
290
  with st.expander("Sherlock's Analysis and Suggestions:"):
291
  st.write(response.text)
292
 
293
- # Initialize wikipedia_info with an empty list
294
- wikipedia_info = []
295
-
296
  search_options = st.multiselect("Search for additional clues:", ["Wikipedia", "Internet"], default=["Wikipedia"])
297
  if st.button("Search"):
298
  with st.spinner("Searching for clues..."):
299
  web_search_results = []
300
  if "Wikipedia" in search_options:
301
- wikipedia_info = search_and_scrape_wikipedia(keywords)
302
  st.subheader("Wikipedia Findings:")
303
  for info in wikipedia_info:
304
  st.write(f"**Topic:** {info['topic']}")
 
66
 
67
 
68
# Web scraping and Wikipedia search function
def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, react_model='gemini-pro'):
    """
    Search and scrape Wikipedia using ReAct-style prompting to extract
    information relevant to the provided keywords.

    Args:
        keywords (list): A list of keywords to search for on Wikipedia.
        max_topics_per_query (int, optional): The maximum number of Wikipedia
            topics to explore for each query. Defaults to 3.
        react_model (str, optional): The name of the generative model to use
            with ReAct prompting. Defaults to 'gemini-pro'.

    Returns:
        list: A list of dictionaries, one per relevant finding, with keys:
            - "topic": the Wikipedia page title that was explored.
            - "summary": the answer extracted by the model (falls back to the
              page's own summary if the ReAct loop never finishes).
            - "url": the Wikipedia page URL.
            - "additional_sources": (optional) source URLs taken from the
              model response's citation metadata.
    """
    import re  # local import: needed to parse ReAct action tags; top of file not visible here

    # ReAct seed prompt. Observations are NOT a format placeholder here:
    # they are appended to the transcript as real tool results each step,
    # which fixes the original bug where `.format(..., observation="")`
    # consumed "{observation}" and the later .replace() became a no-op.
    react_prompt_template = textwrap.dedent("""\
        **Question:** {question}

        **Thought 1:** I need to search Wikipedia for information related to "{question}".

        **Action 1:** <search>{question}</search>
        """)

    search_history = set()  # Keep track of explored topics to avoid redundancy
    wikipedia_info = []
    model = genai.GenerativeModel(react_model)  # Initialize the generative model
    max_react_steps = 5  # hard cap so a model that never emits <finish> cannot loop forever

    for query in keywords:
        # Search Wikipedia using the keyword
        search_terms = wikipedia.search(query, results=max_topics_per_query, suggestion=False)

        for search_term in search_terms:
            if search_term in search_history:
                continue  # Skip topics that were already explored
            search_history.add(search_term)

            try:
                # Resolve the page up front so DisambiguationError/PageError
                # are raised before any model calls are spent on it.
                page = wikipedia.page(search_term, auto_suggest=False)
                url = page.url

                transcript = react_prompt_template.format(question=search_term)
                answer = None
                response = None

                for _ in range(max_react_steps):
                    response = model.generate_content(
                        [transcript], stop_sequences=["</search>", "</finish>"]
                    )
                    text = response.text

                    # The stop sequence is stripped from the model output, so the
                    # closing tag is missing; match the open tag to end-of-text
                    # instead of requiring a matched </tag> pair (the original
                    # regex could never match and indexed an empty findall()).
                    match = re.search(r"<(search|finish)>(.*)$", text, re.DOTALL)
                    if match is None:
                        break  # no recognizable action; abandon the ReAct loop
                    action, payload = match.group(1), match.group(2).strip()

                    if action == "finish":
                        answer = payload  # final answer extracted from <finish>
                        break

                    # Execute the <search> action for real and feed the result
                    # back as the observation (the original loop never ran the
                    # tool, so the model had nothing genuine to observe).
                    observation = page.content[:4000]  # truncate to keep the prompt bounded
                    transcript += f"{text}</{action}>\n\n**Observation:** {observation}\n\n"

                if answer is None:
                    # ReAct never produced a <finish>; fall back to the page summary.
                    answer = page.summary

                additional_sources = []
                # NOTE(review): citation extraction reconstructed from a partial
                # diff hunk — confirm field names against the original file.
                if response is not None and response.candidates[0].citation_metadata:
                    additional_sources = [
                        source.uri
                        for source in response.candidates[0].citation_metadata.citation_sources
                    ]

                wikipedia_info.append({
                    "topic": search_term,
                    "summary": answer,  # Use the extracted answer as the summary
                    "url": url,
                    "additional_sources": additional_sources
                })

            except wikipedia.exceptions.DisambiguationError:
                print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
            except wikipedia.exceptions.PageError:
                print(f"No Wikipedia page found for '{search_term}', skipping.")
            except Exception as e:
                st.error(f"Error searching Wikipedia: {e}")

    return wikipedia_info
 
314
  with st.expander("Sherlock's Analysis and Suggestions:"):
315
  st.write(response.text)
316
 
 
 
 
317
  search_options = st.multiselect("Search for additional clues:", ["Wikipedia", "Internet"], default=["Wikipedia"])
318
  if st.button("Search"):
319
  with st.spinner("Searching for clues..."):
320
  web_search_results = []
321
  if "Wikipedia" in search_options:
322
+ wikipedia_info = search_and_scrape_wikipedia(keywords) # Use the new ReAct-based function
323
  st.subheader("Wikipedia Findings:")
324
  for info in wikipedia_info:
325
  st.write(f"**Topic:** {info['topic']}")