jayash391 committed on
Commit
b2b9111
·
verified ·
1 Parent(s): e626369

Update sherlock2.py

Browse files
Files changed (1) hide show
  1. sherlock2.py +101 -64
sherlock2.py CHANGED
@@ -3,6 +3,7 @@ import google.ai.generativelanguage as glm
3
  import streamlit as st
4
  from bs4 import BeautifulSoup
5
  import wikipedia
 
6
  import os
7
  import re
8
  import requests
@@ -83,74 +84,110 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, react_model='g
83
  - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
84
  """
85
 
86
- # ReAct Prompt Template (similar to the provided example)
87
- react_prompt_template = """
88
- **Question:** {question}
89
-
90
- **Thought 1:** I need to search Wikipedia for information related to "{question}".
91
-
92
- **Action 1:**
93
-
94
- **Observation 1:** {observation} # This will be filled in during the process
95
-
96
- # ... (Further Thought-Action-Observation steps as needed)
97
-
98
- **Action N:** # The final answer will be extracted from here
99
  """
100
 
101
- search_history = set() # Keep track of explored topics to avoid redundancy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  wikipedia_info = []
103
- react_model = genai.GenerativeModel(react_model) # Initialize the generative model
104
-
105
  for query in keywords:
106
- # Search Wikipedia (modified line)
107
- search_terms = wikipedia.search(query, results=max_topics_per_query, suggestion=False, srsearch=query)
108
-
109
- for search_term in search_terms:
110
- if search_term in search_history:
111
- continue
112
-
113
- search_history.add(search_term)
114
-
115
- try:
116
- # Construct the initial ReAct prompt
117
- react_prompt = react_prompt_template.format(question=search_term, observation="")
118
-
119
- # Perform ReAct-based search and extraction
120
- while True:
121
- response = react_model.generate_content([react_prompt], stop_sequences=[""])
122
-
123
- # Extract action and observation from the response
124
- action, observation = re.findall(r"<(.*?)>(.*?)", response.text)[-1] # Get the last action and observation
125
-
126
- # Update the ReAct prompt with the observation
127
- react_prompt = react_prompt.replace("{observation}", observation.strip(), 1) # Replace only the first occurrence
128
-
129
- if action == "finish":
130
- answer = observation.strip() # Extract the final answer
131
- break # Exit the loop when is encountered
132
-
133
- # Get Wikipedia page and URL (modified line)
134
- page = wikipedia.page(search_term, auto_suggest=False)
135
- url = page.url
136
-
137
- additional_sources = []
138
- if response.candidates[0].citation_metadata:
139
- additional_sources = [source.url for source in response.candidates[0].citation_metadata.citation_sources]
140
-
141
- wikipedia_info.append({
142
- "topic": search_term,
143
- "summary": answer, # Use the extracted answer as the summary
144
- "url": url,
145
- "additional_sources": additional_sources
146
- })
147
-
148
- except wikipedia.exceptions.DisambiguationError:
149
- print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
150
- except wikipedia.exceptions.PageError:
151
- print(f"No Wikipedia page found for '{search_term}', skipping.")
152
- except Exception as e:
153
- st.error(f"Error searching Wikipedia: {e}")
154
 
155
  return wikipedia_info
156
 
 
3
  import streamlit as st
4
  from bs4 import BeautifulSoup
5
  import wikipedia
6
+ from wikipedia.exceptions import DisambiguationError, PageError
7
  import os
8
  import re
9
  import requests
 
84
  - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
85
  """
86
 
87
# ReAct prompt instructions. NOTE(review): the <search>/<lookup>/<finish> tags
# and the trailing "{question}" placeholder were stripped by the HTML diff
# renderer; restored here following the standard Gemini ReAct example prompt.
# __call__ does self._prompt.format(question=...), so {question} must exist.
model_instructions = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, Observation is understanding relevant information from an Action's output and Action can be of three types:
(1) <search>entity</search>, which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search and you can try to search the information from those topics.
(2) <lookup>keyword</lookup>, which returns the next sentence containing keyword in the current context. This only does exact matches, so keep your searches short.
(3) <finish>answer</finish>, which returns the answer and finishes the task.

Question: {question}
"""
92
 
93
# Define tools for ReAct (search, lookup, finish)
class ReAct:
    """Minimal ReAct (Reason + Act) agent driving a Gemini chat session.

    The model emits Thought/Action/Observation traces in which actions are
    encoded as XML-like tags (<search>, <lookup>, <finish>).  The closing
    tags serve as stop sequences so each model turn halts right after one
    action, which this class then dispatches to the matching method.

    NOTE(review): the tag strings in this block were stripped by the HTML
    diff renderer (they appeared as empty strings); restored here following
    the standard Gemini ReAct example.
    """

    def __init__(self, model: str, react_prompt: str):
        # `genai` (google.generativeai) is imported by the enclosing module.
        self.model = genai.GenerativeModel(model)
        self.chat = self.model.start_chat(history=[])
        self.should_continue_prompting = True
        # Parallel lists: topics successfully searched, and their page URLs.
        # lookup() and the caller read the last entries of these.
        self._search_history: list[str] = []
        self._search_urls: list[str] = []
        self._prompt = react_prompt

    @classmethod
    def add_method(cls, func):
        """Attach *func* to the class under its own name (decorator helper)."""
        setattr(cls, func.__name__, func)

    @staticmethod
    def clean(text: str):
        """Collapse newlines so an observation fits on one prompt line."""
        text = text.replace("\n", " ")
        return text

    def search(self, query: str):
        """<search> action: return a 4-sentence Wikipedia summary for *query*.

        On disambiguation or a missing page, returns a hint listing similar
        titles instead of raising, so the model can retry with another term.
        """
        observation = None
        query = query.strip()
        try:
            observation = wikipedia.summary(query, sentences=4, auto_suggest=False)
            wiki_url = wikipedia.page(query, auto_suggest=False).url
            observation = self.clean(observation)
            # Record only successful lookups.
            self._search_history.append(query)
            self._search_urls.append(wiki_url)
        except (DisambiguationError, PageError):
            observation = f'Could not find ["{query}"]. Similar: {wikipedia.search(query)}.'
        return observation

    def lookup(self, phrase: str, context_length=200):
        """<lookup> action: return *phrase* plus surrounding context from the
        most recently searched page.  Exact-match only.

        NOTE(review): if *phrase* is absent, str.find returns -1 and the
        slice silently yields text near the page start — TODO confirm intended.
        """
        page = wikipedia.page(self._search_history[-1], auto_suggest=False).content
        page = self.clean(page)
        start_index = page.find(phrase)
        result = page[max(0, start_index - context_length):start_index + len(phrase) + context_length]
        return result

    def finish(self, _):
        """<finish> action: signal __call__ to stop the prompting loop."""
        self.should_continue_prompting = False

    def __call__(self, user_question, max_calls: int = 8, **generation_kwargs):
        """Run up to *max_calls* Thought-Action-Observation rounds.

        The first call seeds the chat with the full ReAct prompt; later
        calls send the question (or observation feedback) directly.
        """
        if len(self.chat.history) == 0:
            model_prompt = self._prompt.format(question=user_question)
        else:
            model_prompt = user_question

        # Closing tags double as stop sequences, so generation halts
        # immediately after the model emits a single action.
        callable_entities = ['</search>', '</lookup>', '</finish>']
        generation_kwargs.update({'stop_sequences': callable_entities})

        self.should_continue_prompting = True
        for idx in range(max_calls):
            self.response = self.chat.send_message(content=[model_prompt],
                                                   generation_config=generation_kwargs, stream=False)
            response_cmd = self.chat.history[-1].parts[-1].text
            try:
                # The last opening tag names the method to dispatch to;
                # the text after it is that action's argument.
                cmd = re.findall(r'<(.*)>', response_cmd)[-1]
                query = response_cmd.split(f'<{cmd}>')[-1].strip()
                observation = self.__getattribute__(cmd)(query)

                if not self.should_continue_prompting:
                    break

                # Feed the observation back for the next round.
                model_prompt = f"\nObservation {idx + 1}\n{observation}"

            except (IndexError, AttributeError):
                # No (or unknown) tag found: nudge the model back on track.
                model_prompt = "Please try to generate thought-action-observation traces."
162
+ # Initialize ReAct with model and instructions
163
+ react_agent = ReAct(model=react_model, react_prompt=model_instructions)
164
+
165
+ search_history = set()
166
  wikipedia_info = []
 
 
167
  for query in keywords:
168
+ # Use ReAct to search and extract information
169
+ react_agent(query)
170
+ response_text = react_agent.response.text
171
+
172
+ # Process response_text to extract information
173
+ observations = []
174
+ for line in response_text.strip().split('\n'):
175
+ if line.startswith("Observation"):
176
+ observations.append(line.split(':')[-1].strip())
177
+
178
+ # Assuming the last observation is the final answer/summary
179
+ summary = observations[-1]
180
+
181
+ # Get URL from search history (assuming successful search)
182
+ url = react_agent._search_urls[-1]
183
+
184
+ # Create a dictionary with the extracted information
185
+ wikipedia_info.append({
186
+ "topic": query, # Using the original query as the topic
187
+ "summary": summary,
188
+ "url": url
189
+ # "additional_sources": # Add this if you implement additional source extraction
190
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  return wikipedia_info
193