Update sherlock2.py
sherlock2.py  CHANGED  (+46 -25)
@@ -66,13 +66,14 @@ def generate_embeddings_from_documents(extracted_text):
 
 
 # Web scraping and Wikipedia search function
-def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
+def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, react_model='gemini-pro'):
     """
-    Searches and scrapes Wikipedia
+    Searches and scrapes Wikipedia using the ReAct prompting method to find information relevant to the provided keywords.
+
     Args:
         keywords (list): A list of keywords to search for on Wikipedia.
         max_topics_per_query (int, optional): The maximum number of Wikipedia topics to explore for each query. Defaults to 3.
-        mining_model (str, optional): The name of the generative model to use.
+        react_model (str, optional): The name of the generative model to use with ReAct prompting.
             Defaults to 'gemini-pro'.
     Returns:
         list: A list of dictionaries, where each dictionary represents a relevant piece of information, with keys:
@@ -82,31 +83,54 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
         - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
     """
 
+    # ReAct Prompt Template (similar to the provided example)
+    react_prompt_template = """
+    **Question:** {question}
+
+    **Thought 1:** I need to search Wikipedia for information related to "{question}".
+
+    **Action 1:** <search>{question}</search>
+
+    **Observation 1:** {observation} # This will be filled in during the process
+
+    # ... (Further Thought-Action-Observation steps as needed)
+
+    **Action N:** <finish>{answer}</finish> # The final answer will be extracted from here
+    """
+
     search_history = set()  # Keep track of explored topics to avoid redundancy
     wikipedia_info = []
-    mining_model = genai.GenerativeModel(mining_model)  # Initialize the generative model
+    react_model = genai.GenerativeModel(react_model)  # Initialize the generative model
 
     for query in keywords:
         search_terms = wikipedia.search(query, results=max_topics_per_query, suggestion=False)  # Search Wikipedia
 
         for search_term in search_terms:
             if search_term in search_history:
                 continue
 
             search_history.add(search_term)
 
             try:
-                ...
-                """))
+                # Construct the initial ReAct prompt
+                react_prompt = react_prompt_template.format(question=search_term, observation="")
+
+                # Perform ReAct-based search and extraction
+                while True:
+                    response = react_model.generate_content([react_prompt], stop_sequences=["</finish>"])
+
+                    # Extract action and observation from the response
+                    action, observation = re.findall(r"<(.*?)>(.*?)</\1>", response.text)[-1]  # Get the last action and observation
+
+                    # Update the ReAct prompt with the observation
+                    react_prompt = react_prompt.replace("{observation}", observation.strip(), 1)  # Replace only the first occurrence
+
+                    if action == "finish":
+                        answer = observation.strip()  # Extract the final answer
+                        break  # Exit the loop when </finish> is encountered
+
+                page = wikipedia.page(search_term, auto_suggest=False)
+                url = page.url
 
                 additional_sources = []
                 if response.candidates[0].citation_metadata:
@@ -114,16 +138,16 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, mining_model='gemini-pro'):
 
             wikipedia_info.append({
                 "topic": search_term,
-                "summary":
+                "summary": answer,  # Use the extracted answer as the summary
                 "url": url,
                 "additional_sources": additional_sources
             })
 
         except wikipedia.exceptions.DisambiguationError:
             print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
         except wikipedia.exceptions.PageError:
             print(f"No Wikipedia page found for '{search_term}', skipping.")
         except Exception as e:
             st.error(f"Error searching Wikipedia: {e}")
 
     return wikipedia_info
@@ -290,15 +314,12 @@ def investigate():
     with st.expander("Sherlock's Analysis and Suggestions:"):
         st.write(response.text)
 
-    # Initialize wikipedia_info with an empty list
-    wikipedia_info = []
-
     search_options = st.multiselect("Search for additional clues:", ["Wikipedia", "Internet"], default=["Wikipedia"])
     if st.button("Search"):
         with st.spinner("Searching for clues..."):
             web_search_results = []
             if "Wikipedia" in search_options:
-                wikipedia_info = search_and_scrape_wikipedia(keywords)
+                wikipedia_info = search_and_scrape_wikipedia(keywords)  # Use the new ReAct-based function
                 st.subheader("Wikipedia Findings:")
                 for info in wikipedia_info:
                     st.write(f"**Topic:** {info['topic']}")
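A note on the prompt template: str.format(question=search_term, observation="") fills every brace placeholder at once, so the unfilled {answer} slot raises a KeyError, and {observation} is replaced by an empty string before the loop's later replace("{observation}", ...) can find it. Below is a minimal sketch of an incremental fill that keeps later slots intact by substituting one placeholder at a time with str.replace; the question and observation strings are hypothetical, for illustration only.

react_prompt_template = """\
**Question:** {question}

**Thought 1:** I need to search Wikipedia for information related to "{question}".

**Action 1:** <search>{question}</search>

**Observation 1:** {observation}

**Action N:** <finish>{answer}</finish>
"""

# Fill placeholders one at a time so "{answer}" survives for the model.
prompt = react_prompt_template.replace("{question}", "Baskerville Hall")
prompt = prompt.replace("{observation}", "Baskerville Hall is a country house in Powys, Wales.", 1)
print(prompt)  # "{answer}" is still present in the output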
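The extraction step hinges on a backreference pattern: re.findall(r"<(.*?)>(.*?)</\1>", text) returns a (tag, body) tuple for every matched <tag>body</tag> pair, and the diff keeps the last one. Here is a small, self-contained demonstration on a hypothetical ReAct transcript. Note that the pattern only matches when the closing tag is present, so if stop_sequences=["</finish>"] trims </finish> from the model's output, the final pair will not match as written; the transcript below includes the closing tag explicitly.

import re

# Hypothetical ReAct transcript for illustration.
transcript = (
    "**Action 1:** <search>Baskerville Hall</search>\n"
    "**Observation 1:** Baskerville Hall is a country house in Powys, Wales.\n"
    "**Action 2:** <finish>Baskerville Hall is a country house in Powys, Wales.</finish>"
)

pairs = re.findall(r"<(.*?)>(.*?)</\1>", transcript)
print(pairs)
# [('search', 'Baskerville Hall'),
#  ('finish', 'Baskerville Hall is a country house in Powys, Wales.')]

action, observation = pairs[-1]  # keep the last action, as the updated code does
print(action)       # finish
print(observation)  # Baskerville Hall is a country house in Powys, Wales.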
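For reference, a hypothetical call to the updated function. It assumes google.generativeai has been configured with an API key and that the wikipedia package is installed; the keywords are illustrative, not from the app.

import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # placeholder; supply a real key

# Illustrative keywords only; any list of case-related terms would do.
results = search_and_scrape_wikipedia(["Baskerville Hall", "Dartmoor"], max_topics_per_query=2)

for info in results:
    print(f"{info['topic']}: {info['url']}")
    print(info["summary"])
    if info["additional_sources"]:
        print("Sources:", ", ".join(info["additional_sources"]))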