# NOTE: The lines originally here were hosting-page residue captured together
# with the file (the status text "Spaces: Paused", "File size: 14,581 Bytes",
# the identifier "4279593", and a 1-330 line-number gutter). They are not part
# of the module.
import os
import json
import asyncio
from typing import List, Dict, Any, Optional
from langchain.prompts import ChatPromptTemplate
from pathlib import Path
from dotenv import load_dotenv
import time
from langchain_community.tools import BraveSearch
from src.utils.api_key_manager import with_api_manager
from src.helpers.helper import remove_markdown
class SearchEngine:
    """Brave-backed web search with LLM-assisted query rewriting and re-ranking.

    The class wraps three steps: rewriting a user query into a search-friendly
    form (`generate_optimized_query`), fetching paginated results from Brave
    (`search`), and asking an LLM to re-rank/trim those results (`filter_urls`).
    """

    # Pagination limits used by `search`: at most 20 results per request and a
    # page offset no larger than 9.
    _MAX_RESULTS_PER_PAGE = 20
    _MAX_PAGE_OFFSET = 9
    # Upper bound on consecutive HTTP 429 retries for a single page, so a
    # persistent rate limit cannot spin the fetch loop forever.
    _MAX_RATE_LIMIT_RETRIES = 5

    def __init__(
        self,
        brave_api_key: Optional[str] = None,
    ):
        """Store the Brave API key.

        Args:
            brave_api_key: Explicit API key. When ``None``, the value of the
                ``BRAVE_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no key is passed and ``BRAVE_API_KEY`` is unset.
        """
        if brave_api_key is None:
            brave_api_key = os.getenv("BRAVE_API_KEY")
            if brave_api_key is None:
                raise ValueError("BRAVE_API_KEY is not set")
        self.brave_api_key = brave_api_key

    @with_api_manager()
    async def generate_optimized_query(self, user_query: str, context: Optional[str] = None, *, llm) -> str:
        """Ask the LLM to rewrite ``user_query`` into a search-optimized query.

        Args:
            user_query: The user's current question or statement.
            context: Optional past context. When falsy, a prompt variant that
                does not mention past context is used.
            llm: Chat model exposing ``ainvoke``.
                NOTE(review): presumably injected by ``with_api_manager`` (the
                script entry point calls this method without it) -- confirm.

        Returns:
            The model's reply text with surrounding whitespace stripped.
        """
        if context:
            template = """Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query and relevant past context.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Inputs:
- User Query: This is the current question or statement provided by the user.
- Past Context: This includes any relevant previous interactions, preferences, or information that can inform the understanding of the user's intent.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Combine the insights from the user query and past context.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- Past Context:
'User has previously shown interest in healthy eating and sustainability.'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- Past Context:
'User has a preference for classic science fiction films. Previous searches include "Blade Runner" and "2001: A Space Odyssey."'
- SEO-Optimized Search Query:
'What are the top classic science fiction movies to watch that are similar to Blade Runner and 2001: A Space Odyssey?'
Input:
- User Query:
{user_query}
- Past Context:
{context}
Output:
(The generated SEO-friendly query based on the inputs in plain text format without any markdown)"""
            template_vars = {"user_query": user_query, "context": context}
        else:
            template = """Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Input:
- User Query: This is the current question or statement provided by the user.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Utilize the insights from the user query.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- SEO-Optimized Search Query:
'What are the top science fiction movies to watch?'
Input:
- User Query:
{user_query}
Output:
(The generated SEO-friendly query based on the input in plain text format without any markdown)"""
            # This variant declares no {context} placeholder, so only the
            # variable it actually uses is supplied.
            template_vars = {"user_query": user_query}

        prompt_template = ChatPromptTemplate.from_template(template)
        # Send chat messages (as filter_urls does) rather than the flattened
        # string from `.format(...)`, which embeds a "Human: " role label in
        # the text handed to the model.
        messages = prompt_template.format_messages(**template_vars)
        optimized_query = await llm.ainvoke(messages)
        return optimized_query.content.strip()

    async def search(
        self,
        query: str,
        num_results: int = 10,
        gl: str = 'us',
        hl: str = 'en',
        safe: str = 'off',
        exclude_filetypes: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Fetch up to ``num_results`` Brave results for ``query``.

        Results are fetched page by page (page offsets 0..9, at most 20 results
        per request) with a one-second pause after every page.

        Args:
            query: Search query text.
            num_results: Maximum number of results to return.
            gl: Value sent as the ``country`` search parameter.
            hl: Value sent as the ``search_lang`` search parameter.
            safe: Value sent as the ``safesearch`` search parameter.
            exclude_filetypes: File extensions to exclude; each one is appended
                to the query as ``NOT filetype:<ext>``.

        Returns:
            The parsed result entries, truncated to ``num_results``. Returns an
            empty list when ``num_results`` is not positive.

        Raises:
            Exception: Any error raised by the Brave tool other than a
                rate-limit (429) error, or a rate-limit error that persists
                after the retry budget is exhausted.
            json.JSONDecodeError: If the tool output is not valid JSON.
        """
        # Construct exclusion string for filetypes (maintaining compatibility)
        exclusion = ''
        if exclude_filetypes:
            exclusion = ' ' + ' '.join(f"NOT filetype:{ft}" for ft in exclude_filetypes)
        modified_query = f"{query}{exclusion}"
        print(f"Performing search with query: '{modified_query}', num_results: {num_results}, gl: {gl}, hl: {hl}, safe: {safe}")

        all_results: List[Dict[str, Any]] = []
        remaining_results = num_results
        offset = 0
        rate_limit_retries = 0
        # Brave's `offset` counts pages of size `count`, so the page size must
        # stay constant across requests; shrinking `count` on the last page
        # would re-fetch an overlapping window. Any surplus is trimmed below.
        page_size = min(num_results, self._MAX_RESULTS_PER_PAGE)

        while remaining_results > 0 and offset <= self._MAX_PAGE_OFFSET:
            # A new tool instance per page because the offset is part of its
            # search_kwargs.
            brave_search = BraveSearch.from_api_key(
                api_key=self.brave_api_key,
                search_kwargs={
                    "count": page_size,
                    "offset": offset,
                    "country": gl,
                    "search_lang": hl,
                    "safesearch": safe
                }
            )
            try:
                # The tool's run() is blocking, so keep it off the event loop.
                results_str = await asyncio.to_thread(brave_search.run, modified_query)
            except Exception as e:
                # Rate limit error: retry the same page a bounded number of times.
                if "429" in str(e) and rate_limit_retries < self._MAX_RATE_LIMIT_RETRIES:
                    rate_limit_retries += 1
                    print("Brave API rate limit hit, waiting 1 second...")
                    await asyncio.sleep(1)
                    continue
                raise
            rate_limit_retries = 0

            # SECURITY/CORRECTNESS: the tool returns a JSON-encoded list. The
            # previous eval() executed web-derived text as Python and failed on
            # JSON literals such as null/true/false; json.loads is the parser
            # for this format.
            page_results = json.loads(results_str)
            if not page_results:  # No more results available
                break

            all_results.extend(page_results)
            remaining_results -= len(page_results)
            offset += 1
            # Add a delay to avoid hitting the rate limit
            await asyncio.sleep(1)

        print(f"Total results fetched: {len(all_results)}")
        return all_results[:num_results]  # Ensure we don't return more than requested

    @with_api_manager()
    async def filter_urls(
        self,
        query: str,
        category: str,
        search_results: List[Dict[str, Any]],
        num_results: int = 3,
        *,
        llm
    ) -> List[Dict[str, str]]:
        """Re-rank and trim ``search_results`` with the LLM.

        Args:
            query: The query the results should be relevant to.
            category: Query category name inserted into the prompt; the prompt
                maps categories to a maximum number of results to keep.
            search_results: Result dicts; only entries with truthy ``link``,
                ``title`` and ``snippet`` values are considered.
            num_results: Number of results returned by the fallback path when
                the LLM reply cannot be used.
            llm: Chat model exposing ``ainvoke``.
                NOTE(review): presumably injected by ``with_api_manager`` --
                confirm.

        Returns:
            The JSON array produced by the LLM. If the reply is not valid JSON
            or is not a JSON array, the first ``num_results`` usable input
            results are returned instead. Returns ``[]`` when no input result
            is usable.
        """
        link_info = {}
        for result in search_results:
            link = result.get("link")
            title = result.get("title")
            snippet = result.get("snippet")
            if link and title and snippet:
                link_info[link] = {"title": title, "snippet": snippet}

        # Nothing to rank: skip the LLM call instead of prompting with an
        # empty dictionary.
        if not link_info:
            return []

        template = """[IMPORTANT]
This is a very important task.
Please take a deep breath, read the instructions VERY carefully, and think step-by-step before responding.
[PROMPT]
You are an expert at determining the relevance of search results to a given query.
Your task is to re-rank the given search results based on their relevance to the original query.
Use a hybrid of semantic and keyword matching to determine relevance
Consider factors such as:
1. How well the title and snippet match the query intent
3. The credibility and authority of the source
4. The recency of the information (if applicable)
Rules:
1. Rerank the URLs based on their relevance to the query according to the criteria listed above, from best match to worst match.
2. Once reranked, select the top best matched results according to the category of the query as defined below:
- Simple External Lookup: Select upto 3 top best matched results
- Complex Moderate Decomposition: Select upto 4 top best matched results
- Complex Advanced Decomposition: Select upto 5 top best matched results
- Extensive Research Dynamic Structuring: Select upto 6 top best matched results
3. [IMPORTANT] Select the MINIMUM number of results (based on the categories above) that are required to answer the query.
4. The response should only contain a JSON array of objects, each containing 'link', 'title' and 'snippet' keys after reranking and filtering.
Note: Do not include ANY markdown in your response.
[INPUT]
Query Category:
{category}
Query:
{query}
Dictionary Containing Link, Titles and Snippets:
{link_info}
Ranked URLs (JSON array of objects):"""
        prompt = ChatPromptTemplate.from_template(template)
        response = await llm.ainvoke(prompt.format_messages(category=category, query=query, link_info=link_info))
        cleaned_response = remove_markdown(response.content.strip())

        try:
            ranked_links = json.loads(cleaned_response)
        except json.JSONDecodeError:
            ranked_links = None

        # Valid JSON that is not an array (e.g. a single object or a string)
        # is as unusable to callers as malformed JSON, so both take the
        # fallback path.
        if not isinstance(ranked_links, list):
            print("Error decoding JSON response from LLM")
            return [
                {"link": link, "title": info["title"], "snippet": info["snippet"]}
                for link, info in list(link_info.items())[:num_results]
            ]

        print(f"Number of search results after reranking and filtering: {len(ranked_links)}")
        return ranked_links
if __name__ == "__main__":
    # Manual smoke test: optimise each sample query, run a small Brave search
    # for it, and print the timing and the results.

    # Load environment variables
    load_dotenv()

    required_env_vars = ["BRAVE_API_KEY"]
    missing_vars = [var for var in required_env_vars if os.getenv(var) is None]
    if missing_vars:
        print(f"Environment variables are not set: {missing_vars}")
        # Exit with a failing status; the bare `exit()` helper comes from the
        # `site` module and reported success (status 0) here.
        raise SystemExit(1)
    print("All environment variables are set!")

    search_engine = SearchEngine()

    queries = [
        "Compare the benefits and drawbacks of AI in healthcare",
        "What is the impact of AI on healthcare?",
        "How is AI used in healthcare?",
        "What are the ethical considerations of AI in healthcare?",
        "What are the economic and social impacts of artificial intelligence on the job market?",
        "How can cold fusion be achieved without violating the laws of thermodynamics? And how can AGI help with that?",
        "What are the major obstacles to achieving carbon neutrality in heavy industries like steel and cement? What are the potential solutions?"
    ]

    async def main(queries: List[str]) -> None:
        """Run the optimise-then-search flow sequentially for every query."""
        for query in queries:
            optimized_query = await search_engine.generate_optimized_query(query)
            print(f"\nOriginal Query: {query}")
            print(f"Optimized Query: {optimized_query}\n")

            start = time.perf_counter()
            search_results = await search_engine.search(optimized_query, num_results=2, exclude_filetypes=["pdf"])
            end = time.perf_counter()
            print(f"Time taken to fetch search results: {end - start:.2f} seconds")

            print("Search Results:")
            for result in search_results:
                print(f"- {result['title']}: {result['link']}: {result['snippet']}")
            print("-"*20)

    asyncio.run(main(queries))