File size: 17,429 Bytes
fd8b571 486a9e9 68b7136 486a9e9 b5407c0 486a9e9 b5407c0 203605e 486a9e9 203605e fd8b571 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 b5407c0 203605e 486a9e9 203605e 486a9e9 203605e fd8b571 486a9e9 203605e 486a9e9 b5407c0 486a9e9 b5407c0 203605e 2b57935 203605e b5407c0 486a9e9 203605e b5407c0 486a9e9 3a20bdf 203605e 486a9e9 203605e b5407c0 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 68b7136 486a9e9 68b7136 203605e 68b7136 b5407c0 68b7136 203605e 68b7136 fd8b571 68b7136 fd8b571 68b7136 fd8b571 486a9e9 68b7136 fd8b571 68b7136 fd8b571 68b7136 fd8b571 68b7136 fd8b571 68b7136 fd8b571 68b7136 b5407c0 68b7136 b5407c0 68b7136 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 |
def google_search_company_info(company_name: str) -> str:
    """
    Search the web for basic company information to help with NAICS classification.

    Issues a small number of Google queries, downloads each result page and
    collects paragraph elements that are longer than 100 characters and
    mention the company name. Collection stops as soon as a little more than
    500 characters have been gathered. Search and fetch errors are printed
    and skipped so one bad page never aborts the lookup.

    Args:
        company_name: Name of the company to look up.

    Returns:
        The collected paragraphs separated by blank lines, or "" when the
        name is blank, nothing relevant was found, or the search failed.
    """
    # A blank name would match every paragraph (the empty string is a
    # substring of any text), so bail out before touching the network.
    if not company_name or not company_name.strip():
        return ""

    # Search queries focused on company information
    queries = [
        f"what is {company_name} company",
        f"{company_name} company about us",
        f"{company_name} business description",
        f"{company_name} company profile",
        f"what does {company_name} company do"
    ]
    max_queries = 2          # only the first queries are issued, to save time
    min_paragraph_len = 100  # shorter paragraphs are treated as boilerplate
    target_chars = 500       # stop collecting once this much text is gathered

    needle = company_name.strip().lower()
    snippets = []
    collected = 0

    try:
        # Imported lazily: bs4 is only needed by this function.
        from bs4 import BeautifulSoup

        print(f"🔍 Searching for information about '{company_name}'...")
        for query in queries[:max_queries]:
            try:
                for result_url in search(query, stop=2, pause=2):
                    try:
                        response = requests.get(result_url, timeout=5)
                        if response.status_code != 200:
                            continue
                        soup = BeautifulSoup(response.text, 'html.parser')
                        for paragraph in soup.find_all('p'):
                            text = paragraph.get_text().strip()
                            if len(text) > min_paragraph_len and needle in text.lower():
                                snippets.append(text)
                                # +2 accounts for the blank-line separator
                                collected += len(text) + 2
                                if collected > target_chars:
                                    return "\n\n".join(snippets)
                    except Exception as e:
                        print(f" ⚠️ Error fetching {result_url}: {e}")
            except Exception as e:
                print(f" ⚠️ Error with query '{query}': {e}")
                continue
        return "\n\n".join(snippets)
    except Exception as e:
        print(f"❌ Error searching for company info: {str(e)}")
        return ""


import os
import re
import json
import requests
from typing import List, Dict, Optional, Tuple
import gradio as gr
from googlesearch import search
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
def initialize_gemini(api_key: str):
    """Configure the Gemini client and build the model used for classification.

    Args:
        api_key: Google Gemini API key passed to `genai.configure`.

    Returns:
        A `genai.GenerativeModel` for "gemini-1.5-flash" created with the
        sampling parameters and safety settings defined below.
    """
    genai.configure(api_key=api_key)

    # Every listed harm category is set to BLOCK_NONE.
    harm_categories = (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
    unblocked = {category: HarmBlockThreshold.BLOCK_NONE for category in harm_categories}

    # Sampling parameters and output length cap for each generation call.
    sampling = {
        "temperature": 0.2,
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": 1024,
    }

    return genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=sampling,
        safety_settings=unblocked,
    )
def google_search_naics(company_name: str) -> List[str]:
    """
    Find potential NAICS codes for a company using multiple targeted Google searches.

    Each query's result pages are downloaded and scanned for standalone
    6-digit numbers. Only numbers whose first two digits are a valid NAICS
    sector are kept, and codes are ranked by how often they were seen so the
    returned list really is the most common ones.

    Args:
        company_name: Name of the company to search for.

    Returns:
        Up to 10 candidate NAICS codes, most frequently seen first. Empty
        when the name is blank, nothing was found, or the search failed.
    """
    if not company_name or not company_name.strip():
        return []

    from collections import Counter

    # The first two digits of a NAICS code identify its sector; anything else
    # is just an unrelated 6-digit number found in the page (IDs, dates, ...).
    valid_sectors = {
        "11", "21", "22", "23", "31", "32", "33", "42", "44", "45", "48", "49",
        "51", "52", "53", "54", "55", "56", "61", "62", "71", "72", "81", "92",
    }
    code_pattern = re.compile(r'\b\d{6}\b')
    code_counts = Counter()

    # Multiple search queries for better results
    queries = [
        f"2022 NAICS code for {company_name}",
        f"NAICS 2022 classification for {company_name}",
        f"{company_name} business NAICS 2022 code",
        f"{company_name} industry NAICS code 2022",
        f"what is {company_name} company NAICS code"
    ]

    try:
        print(f"🔍 Searching Google for NAICS codes for '{company_name}'...")
        for query in queries:
            print(f" Query: {query}")
            try:
                # Search with each query, limiting to 3 results per query
                for result_url in search(query, stop=3, pause=2):
                    try:
                        response = requests.get(result_url, timeout=5)
                        if response.status_code != 200:
                            continue
                        found_codes = [
                            code
                            for code in code_pattern.findall(response.text)
                            if code[:2] in valid_sectors
                        ]
                        code_counts.update(found_codes)
                        if found_codes:
                            print(f" Found codes in {result_url}: {sorted(set(found_codes))}")
                    except Exception as e:
                        print(f" ⚠️ Error fetching {result_url}: {e}")
            except Exception as e:
                print(f" ⚠️ Error with query '{query}': {e}")
                continue
        # Unique codes, limited to the 10 most common
        return [code for code, _ in code_counts.most_common(10)]
    except Exception as e:
        print(f"❌ Error performing Google search: {str(e)}")
        return []
def _build_naics_prompt(company_name: str, context: str, candidates: List[str]) -> str:
    """Build the Gemini prompt, with a research step when candidate codes exist."""
    if candidates:
        lines = [
            "",
            "You are a NAICS code classification expert. Based on the company information provided and the NAICS code candidates found from Google search, determine the most appropriate NAICS code.",
            f"Company Name: {company_name}",
            f"Context Information: {context}",
            f"NAICS Code Candidates from Google Search: {candidates}",
            "First, research what these NAICS codes represent:",
            "1. For each NAICS code candidate, briefly explain what industry or business activity it corresponds to.",
            "2. Then explain which industry classification best matches this company based on the name and context provided.",
            "3. Finally, select the single most appropriate NAICS code from the candidates, or suggest a different one if none match.",
            "Your response should be in this format:",
            "RESEARCH: [Brief explanation of what each NAICS code represents]",
            "REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company]",
            "NAICS_CODE: [6-digit NAICS code]",
            "",
        ]
    else:
        # No candidates were found from Google search
        lines = [
            "",
            "You are a NAICS code classification expert. Based on the company information provided, determine the most appropriate NAICS code.",
            f"Company Name: {company_name}",
            f"Context Information: {context}",
            "First, analyze what industry this company likely belongs to based on its name and the provided context.",
            "Consider standard business classifications and determine the most appropriate category.",
            "Then provide the single most appropriate 6-digit NAICS code.",
            "Your response should be in this format:",
            "REASONING: [Your detailed reasoning about the company's industry classification, including what business activities it likely performs]",
            "NAICS_CODE: [6-digit NAICS code]",
            "",
        ]
    return "\n".join(lines)


def _parse_naics_response(response_text: str) -> dict:
    """Extract the research, reasoning and NAICS code sections from a model reply."""
    flags = re.DOTALL | re.IGNORECASE
    result = {}

    # Research is optional (only requested when candidates were supplied)
    research_match = re.search(r'RESEARCH:(.*?)REASONING:', response_text, flags)
    if research_match:
        result["research"] = research_match.group(1).strip()

    # Accept end-of-text as a terminator so the reasoning is not lost when
    # the model forgets the NAICS_CODE marker.
    reasoning_match = re.search(r'REASONING:(.*?)(?:NAICS_CODE:|\Z)', response_text, flags)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
    result["reasoning"] = reasoning or "No reasoning provided."

    # Word boundaries keep us from slicing six digits out of a longer number
    code_match = re.search(r'NAICS_CODE:.*?\b(\d{6})\b', response_text, flags)
    if code_match is None:
        # Fall back to any standalone 6-digit code in the response
        code_match = re.search(r'\b(\d{6})\b', response_text)
    result["naics_code"] = code_match.group(1) if code_match else "000000"
    return result


def get_naics_classification(model, company_name: str, context: str, candidates: List[str]) -> dict:
    """
    Use Gemini AI to determine the most appropriate NAICS code for a company.

    Company information found online is appended to the caller's context,
    then the model is asked for its reasoning followed by a 6-digit code.

    Args:
        model: Object exposing `generate_content(prompt)` whose result has a
            `.text` attribute (the model returned by `initialize_gemini`).
        company_name: Name of the company to classify.
        context: Caller-supplied description of the company; may be empty.
        candidates: Candidate NAICS codes from the web search; may be empty.

    Returns:
        Dict with "naics_code" and "reasoning", plus "research" when the
        reply contained that section. On any error "naics_code" is "000000"
        and "reasoning" carries the error message.
    """
    try:
        print("🤖 AI is analyzing NAICS classification...")

        # Get additional company information from Google
        company_info = google_search_company_info(company_name)
        if company_info:
            print(f"📝 Found additional company information:\n{company_info[:200]}...")
            # Add the found information to the context
            if context:
                context = f"{context}\n\nAdditional information found online:\n{company_info}"
            else:
                context = f"Information found online:\n{company_info}"

        prompt = _build_naics_prompt(company_name, context, candidates)
        response = model.generate_content(prompt)
        return _parse_naics_response(response.text.strip())
    except Exception as e:
        print(f"❌ Error getting NAICS classification: {str(e)}")
        return {
            "naics_code": "000000",
            "reasoning": f"Error analyzing company: {str(e)}"
        }
def find_naics_code(company_name: str, context: str = "", api_key: Optional[str] = None) -> Dict:
    """
    Core function to find the NAICS code for a company; callable from any interface.

    Args:
        company_name: Name of the company
        context: Brief description of the company (optional)
        api_key: Google Gemini API key (if None, will try to get from environment variable)

    Returns:
        Dictionary with "naics_code", "reasoning" and optional "research",
        plus the metadata keys "company_name", "context" and "candidates".
        Failures are reported through an additional "error" key with
        "naics_code" set to "000000"; the metadata keys are present on error
        results too, so callers can read them unconditionally.
    """
    # Metadata shared by every early (error) return; no search has run yet.
    metadata = {
        "company_name": company_name,
        "context": context,
        "candidates": [],
    }

    if not company_name or not company_name.strip():
        return {
            "error": "Company name is required.",
            "naics_code": "000000",
            "reasoning": "Error: company name missing",
            **metadata,
        }

    # Get API key from environment if not provided
    if not api_key:
        api_key = os.environ.get('GEMINI_API_KEY')
    if not api_key:
        return {
            "error": "No API key provided. Set GEMINI_API_KEY environment variable or pass as parameter.",
            "naics_code": "000000",
            "reasoning": "Error: API key missing",
            **metadata,
        }

    # Initialize Gemini model
    try:
        model = initialize_gemini(api_key)
    except Exception as e:
        return {
            "error": f"Failed to initialize Gemini API: {str(e)}",
            "naics_code": "000000",
            "reasoning": f"Error: {str(e)}",
            **metadata,
        }

    # Find NAICS code candidates via Google search
    naics_candidates = google_search_naics(company_name)
    if naics_candidates:
        print(f"Found {len(naics_candidates)} NAICS candidates: {naics_candidates}")
    else:
        print("No NAICS codes found from Google search.")

    # Get classification from Gemini (an empty candidate list is valid)
    result = get_naics_classification(model, company_name, context, naics_candidates)

    # Add metadata
    result["company_name"] = company_name
    result["context"] = context
    result["candidates"] = naics_candidates
    return result
# Gradio interface function
def classify_company(company_name: str, company_description: str, api_key: Optional[str] = None) -> Tuple[str, str, str]:
    """Process inputs from Gradio and return formatted results.

    Args:
        company_name: Name of the company to classify (required).
        company_description: Optional free-text context about the company.
        api_key: Gemini API key; when empty, `find_naics_code` falls back to
            the GEMINI_API_KEY environment variable.

    Returns:
        A (naics_code, research, reasoning) tuple of Markdown strings. When
        the company name is missing or blank, the first element is an error
        message and the other two are empty.
    """
    if not company_name or not company_name.strip():
        return "Error: Company name is required", "", ""

    result = find_naics_code(company_name, company_description, api_key)

    # Format the NAICS code output
    naics_code = f"**NAICS Code: {result.get('naics_code', '000000')}**"

    # Format the research output (only present for some results)
    research = ""
    if result.get("research"):
        research = f"## Research on NAICS Codes\n\n{result['research']}"

    # Format the reasoning output
    reasoning = f"## Analysis\n\n{result.get('reasoning', 'No reasoning provided.')}"
    return naics_code, research, reasoning
# Create the Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI for the NAICS Code Finder.

    The API key textbox is only shown when GEMINI_API_KEY is not set in the
    environment; otherwise a hidden, empty textbox keeps the handler's input
    list the same shape.

    Returns:
        The `gr.Blocks` app, ready to be launched.
    """
    # Check if API key is set in environment
    has_api_key = bool(os.environ.get('GEMINI_API_KEY'))

    with gr.Blocks(title="NAICS Code Finder") as demo:
        gr.Markdown("# NAICS Code Finder")
        gr.Markdown("Enter a company name to find its appropriate NAICS code. The tool will search for information about the company and relevant NAICS codes online.")

        with gr.Row():
            with gr.Column():
                company_name = gr.Textbox(label="Company Name", placeholder="Enter company name")
                company_description = gr.Textbox(label="Additional Context (optional)", placeholder="Any additional information about the company")
                # Only show API key input if not set in environment
                if not has_api_key:
                    api_key = gr.Textbox(
                        label="Gemini API Key (required)",
                        placeholder="Enter your Google Gemini API key",
                        type="password"
                    )
                else:
                    api_key = gr.Textbox(visible=False, value="")
                submit_btn = gr.Button("Find NAICS Code", variant="primary")

            with gr.Column():
                status_output = gr.Markdown(label="Status")
                naics_output = gr.Markdown(label="NAICS Code")
                with gr.Accordion("Company Information", open=False):
                    company_info_output = gr.Markdown()
                with gr.Accordion("NAICS Codes Research", open=False):
                    research_output = gr.Markdown()
                with gr.Accordion("Classification Reasoning", open=True):
                    reasoning_output = gr.Markdown()

        # Functions for the interface
        def process_company(company_name, company_description, api_key):
            """Stream progress and results for one lookup.

            This is a generator: only yielded 5-tuples (status, NAICS code,
            company info, research, reasoning) reach the UI, so every
            outcome, including early exits and the final result, is yielded
            and never returned.
            """
            if not company_name or not company_name.strip():
                yield "Please enter a company name", "", "", "", ""
                return

            # Use API key from input or environment
            key_to_use = api_key if api_key else os.environ.get('GEMINI_API_KEY')
            if not key_to_use:
                yield "No API key provided. Please enter your Gemini API key.", "", "", "", ""
                return

            status_md = "🔍 Searching for company information...\n\n"
            yield status_md, "", "", "", ""

            # Get company info first so it can be shown while classification
            # runs. NOTE: find_naics_code performs its own company lookup as
            # part of classification.
            company_info = google_search_company_info(company_name)
            if company_info:
                company_info_md = f"## Information found about {company_name}\n\n{company_info}"
                status_md += "✅ Found company information\n\n"
            else:
                company_info_md = f"No detailed information found for {company_name}"
                status_md += "⚠️ No company information found\n\n"
            yield status_md, "", company_info_md, "", ""

            # Get NAICS candidates
            status_md += "🔍 Searching for NAICS codes...\n\n"
            yield status_md, "", company_info_md, "", ""

            # Run the core functionality
            result = find_naics_code(company_name, company_description, key_to_use)
            if "candidates" in result and result["candidates"]:
                status_md += f"✅ Found {len(result['candidates'])} potential NAICS codes\n\n"
            else:
                status_md += "⚠️ No specific NAICS codes found in search results\n\n"
            status_md += "🤖 Analyzing classification...\n\n"
            yield status_md, "", company_info_md, "", ""

            # Format the NAICS code output
            naics_code_md = f"## NAICS Code: {result['naics_code']}"

            # Format the research output
            research_md = ""
            if "research" in result and result["research"]:
                research_md = f"## Research on NAICS Codes\n\n{result['research']}"

            # Format the reasoning output
            reasoning_md = f"## Analysis\n\n{result['reasoning']}"

            status_md += "✅ Classification complete!"
            yield status_md, naics_code_md, company_info_md, research_md, reasoning_md

        submit_btn.click(
            process_company,
            inputs=[company_name, company_description, api_key],
            outputs=[status_output, naics_output, company_info_output, research_output, reasoning_output]
        )

        gr.Examples(
            [
                ["Apple Inc", "Tech company"],
                ["Walmart", "Retail store"],
                ["Goldman Sachs", "Investment bank"],
                ["Ford Motor Company", "Automobile manufacturer"]
            ],
            inputs=[company_name, company_description]
        )

    return demo
# Create and launch the interface
# `demo` is built at import time so it exists as a module-level name.
demo = create_gradio_interface()

# For Spaces deployment
if __name__ == "__main__":
    demo.launch()