File size: 13,489 Bytes
b8de561 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 203605e be75490 486a9e9 be75490 486a9e9 be75490 203605e 486a9e9 be75490 203605e be75490 203605e be75490 203605e be75490 203605e be75490 486a9e9 203605e 486a9e9 be75490 486a9e9 be75490 b5407c0 be75490 486a9e9 be75490 486a9e9 203605e be75490 fd8b571 be75490 fd8b571 be75490 486a9e9 be75490 b5407c0 486a9e9 be75490 b5407c0 be75490 2b57935 be75490 b5407c0 486a9e9 be75490 203605e b5407c0 486a9e9 be75490 3a20bdf be75490 203605e 486a9e9 203605e b5407c0 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 203605e 486a9e9 68b7136 be75490 68b7136 be75490 203605e be75490 68b7136 be75490 68b7136 b5407c0 68b7136 fd8b571 68b7136 be75490 68b7136 fd8b571 486a9e9 68b7136 fd8b571 68b7136 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 fd8b571 be75490 68b7136 fd8b571 68b7136 be75490 68b7136 fd8b571 68b7136 b5407c0 68b7136 b5407c0 68b7136 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 |
import os
import re
import json
import requests
from typing import List, Dict, Optional, Tuple
import gradio as gr
from googlesearch import search
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from bs4 import BeautifulSoup
def initialize_gemini(api_key: str):
    """Configure the Gemini client and build the model used for classification.

    Args:
        api_key: Google Gemini API key handed to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for ``gemini-1.5-flash`` carrying the
        generation settings and safety thresholds defined below.
    """
    genai.configure(api_key=api_key)

    # Sampling parameters passed through as the model's generation_config.
    sampling = dict(
        temperature=0.2,
        top_p=0.8,
        top_k=40,
        max_output_tokens=1024,
    )

    # Every category listed here is mapped to BLOCK_NONE.
    unblocked_categories = (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
    thresholds = {
        category: HarmBlockThreshold.BLOCK_NONE
        for category in unblocked_categories
    }

    return genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=sampling,
        safety_settings=thresholds,
    )
def _company_paragraphs(html: str, name_lower: str):
    """Yield the text of ``<p>`` elements in *html* that are longer than 80
    characters and contain *name_lower* (compared case-insensitively)."""
    soup = BeautifulSoup(html, 'html.parser')
    for paragraph in soup.find_all('p'):
        text = paragraph.get_text().strip()
        if len(text) > 80 and name_lower in text.lower():
            yield text


def combined_google_search(company_name: str) -> Tuple[str, List[str]]:
    """Search the web for a company description and candidate NAICS codes.

    Runs three descriptive queries and three NAICS-specific queries through
    ``search`` (called with ``stop=3, pause=2``), downloads each result page
    with a 5 second timeout, and harvests two things from every page that
    answers with HTTP 200:

    * every standalone 6-digit number in the raw page text, kept as a NAICS
      candidate (nothing here checks it against the real NAICS table);
    * paragraphs that mention the company name, collected until a little
      over 1000 characters have been gathered.

    The lookup is best-effort: a page or query that raises is reported with
    ``print`` and skipped.

    Args:
        company_name: Name of the company to look up. A blank name returns
            an empty result without searching, because an empty needle would
            match every paragraph on every page.

    Returns:
        ``(company_info, naics_code_candidates)``: the collected paragraphs
        separated by blank lines, and at most 10 distinct candidate codes in
        the order they were first seen. ``("", [])`` when nothing was found
        or the search failed as a whole.
    """
    if not company_name or not company_name.strip():
        return "", []

    # Create comprehensive search queries
    info_queries = [
        f"what is {company_name} company business industry sector",
        f"{company_name} company about us business description",
        f"{company_name} company profile what they do",
    ]
    naics_queries = [
        f"2022 NAICS code for {company_name} company",
        f"{company_name} NAICS 2022 classification",
        f"what is {company_name} industry NAICS code 2022",
    ]

    six_digit = re.compile(r'\b\d{6}\b')
    name_lower = company_name.lower()

    info_parts: List[str] = []
    info_len = 0  # running length of the joined info_parts
    # A dict is used as an ordered set so the returned candidates are the
    # first ones discovered rather than an arbitrary slice of a hash set.
    codes: Dict[str, None] = {}

    try:
        print(f"🔍 Searching for information about '{company_name}'...")
        for query in info_queries + naics_queries:
            print(f" Query: {query}")
            try:
                for result_url in search(query, stop=3, pause=2):
                    try:
                        response = requests.get(result_url, timeout=5)
                        if response.status_code == 200:
                            page = response.text

                            # Extract NAICS codes
                            found_codes = six_digit.findall(page)
                            if found_codes:
                                codes.update(dict.fromkeys(found_codes))
                                print(f" Found codes in {result_url}: {found_codes}")

                            # Extract company information, only while more is needed
                            if info_len < 1000:
                                for text in _company_paragraphs(page, name_lower):
                                    info_parts.append(text + "\n\n")
                                    info_len += len(text) + 2
                                    if info_len > 1000:
                                        break
                    except Exception as e:
                        # Best-effort: one unreachable page must not end the search.
                        print(f" ⚠️ Error fetching {result_url}: {e}")

                    # If we have enough information, move to the next query
                    if info_len > 1000 and codes:
                        break
            except Exception as e:
                print(f" ⚠️ Error with query '{query}': {e}")
                continue

        # Return company info and NAICS codes
        return "".join(info_parts).strip(), list(codes)[:10]
    except Exception as e:
        print(f"❌ Error during Google search: {str(e)}")
        return "", []
# Prompt used when the web search produced candidate codes.
_NAICS_PROMPT_WITH_CANDIDATES = """
You are a NAICS code classification expert. Based on the company information provided and any NAICS code candidates found from online research, determine the most appropriate NAICS code.
Company Name: {company_name}
Information about the company: {combined_context}
NAICS Code Candidates found in research: {naics_candidates}
First, analyze what these NAICS codes represent and which industry this company belongs to based on the information provided.
Then select the single most appropriate 6-digit NAICS code.
Your response should be in this format:
REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company, including what business activities it performs]
NAICS_CODE: [6-digit NAICS code]
"""

# Prompt used when no candidate codes are available.
_NAICS_PROMPT_NO_CANDIDATES = """
You are a NAICS code classification expert. Based on the company information provided, determine the most appropriate NAICS code.
Company Name: {company_name}
Information about the company: {combined_context}
Analyze what industry this company likely belongs to based on its name and the provided information.
Consider standard business classifications and determine the most appropriate category.
Then provide the single most appropriate 6-digit NAICS code.
Your response should be in this format:
REASONING: [Your detailed reasoning about the company's industry classification, including what business activities it likely performs]
NAICS_CODE: [6-digit NAICS code]
"""


def analyze_naics_code(model, company_name: str, context: str, company_info: str, naics_candidates: List[str]) -> dict:
    """Use Gemini AI to determine the most appropriate NAICS code.

    Builds a prompt from the company name, the caller-supplied context, any
    text found online and any candidate codes, sends it to
    ``model.generate_content`` and parses the ``REASONING:`` /
    ``NAICS_CODE:`` sections out of the reply.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        company_name: Company being classified.
        context: Free-text context supplied by the caller (may be empty).
        company_info: Text gathered from the web (may be empty).
        naics_candidates: Candidate codes from the web search (may be empty).

    Returns:
        A dict with ``"reasoning"`` and ``"naics_code"``. ``"naics_code"``
        is ``"000000"`` when the reply contains no 6-digit code or when any
        step raises; in the latter case ``"reasoning"`` carries the error
        message.
    """
    try:
        print("🤖 AI is analyzing NAICS classification...")

        # Combine provided context with discovered company info
        if company_info:
            if context:
                combined_context = f"{context}\n\nAdditional information found online:\n{company_info}"
            else:
                combined_context = f"Information found online:\n{company_info}"
        else:
            combined_context = context

        # Create the prompt based on whether we have candidate codes
        if naics_candidates:
            prompt = _NAICS_PROMPT_WITH_CANDIDATES.format(
                company_name=company_name,
                combined_context=combined_context,
                naics_candidates=naics_candidates,
            )
        else:
            prompt = _NAICS_PROMPT_NO_CANDIDATES.format(
                company_name=company_name,
                combined_context=combined_context,
            )

        response = model.generate_content(prompt)
        response_text = response.text.strip()

        # Extract reasoning
        reasoning_match = re.search(
            r'REASONING:(.*?)NAICS_CODE:', response_text, re.DOTALL | re.IGNORECASE
        )
        reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."

        # Extract NAICS code. The label is matched case-insensitively, like
        # the reasoning label above, so a reply such as "naics_code: 334220"
        # is not sent to the fallback, which would otherwise pick up the
        # first code quoted in the reasoning. Word boundaries keep a longer
        # digit run from being truncated to its first six digits.
        code_match = re.search(
            r'NAICS_CODE:.*?\b(\d{6})\b', response_text, re.DOTALL | re.IGNORECASE
        )
        if code_match is None:
            # Try to find any 6-digit code in the response
            code_match = re.search(r'\b(\d{6})\b', response_text)
        naics_code = code_match.group(1) if code_match else "000000"

        return {"reasoning": reasoning, "naics_code": naics_code}
    except Exception as e:
        # Boundary handler: the caller always gets a well-formed dict.
        print(f"❌ Error getting NAICS classification: {str(e)}")
        return {
            "naics_code": "000000",
            "reasoning": f"Error analyzing company: {str(e)}"
        }
def find_naics_code(company_name: str, context: str = "", api_key: Optional[str] = None) -> Dict:
    """Look up the NAICS code for a company end to end.

    Resolves the API key, initializes the Gemini model, runs the combined
    web search and asks the model to classify the company.

    Args:
        company_name: Company to classify.
        context: Optional free-text description supplied by the caller.
        api_key: Gemini API key; when falsy, the ``GEMINI_API_KEY``
            environment variable is used instead.

    Returns:
        On success, the analysis dict extended with ``company_name``,
        ``context``, ``company_info`` and ``candidates``. When the key is
        missing or the model cannot be initialized, a dict with ``error``,
        ``naics_code`` set to ``"000000"`` and ``reasoning``.
    """
    # Fall back to the environment only when no key was passed in.
    resolved_key = api_key or os.environ.get('GEMINI_API_KEY')
    if not resolved_key:
        return {
            "error": "No API key provided. Set GEMINI_API_KEY environment variable or pass as parameter.",
            "naics_code": "000000",
            "reasoning": "Error: API key missing"
        }

    # Initialize Gemini model
    try:
        model = initialize_gemini(resolved_key)
    except Exception as e:
        failure = str(e)
        return {
            "error": f"Failed to initialize Gemini API: {failure}",
            "naics_code": "000000",
            "reasoning": f"Error: {failure}"
        }

    # Search first, then let the model pick among whatever was found.
    company_info, naics_candidates = combined_google_search(company_name)
    result = analyze_naics_code(
        model, company_name, context, company_info, naics_candidates
    )

    # Attach the inputs and intermediate findings as metadata.
    result.update(
        company_name=company_name,
        context=context,
        company_info=company_info,
        candidates=naics_candidates,
    )
    return result
# Create the Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI for the NAICS code finder.

    The left column holds the company name, optional context, an API key
    field (shown only when ``GEMINI_API_KEY`` is not set in the environment)
    and the submit button. The right column holds the status, the chosen
    NAICS code, the company information found online and the model's
    reasoning.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    # Check if API key is set in environment
    has_api_key = bool(os.environ.get('GEMINI_API_KEY'))

    with gr.Blocks(title="NAICS Code Finder") as demo:
        gr.Markdown("# NAICS Code Finder")
        gr.Markdown("Enter a company name to find its appropriate NAICS code. The tool will search for information about the company and find the most appropriate classification.")

        with gr.Row():
            with gr.Column():
                company_name = gr.Textbox(label="Company Name", placeholder="Enter company name")
                company_description = gr.Textbox(label="Additional Context (optional)", placeholder="Any additional information about the company")

                # Only show API key input if not set in environment
                if not has_api_key:
                    api_key = gr.Textbox(
                        label="Gemini API Key (required)",
                        placeholder="Enter your Google Gemini API key",
                        type="password"
                    )
                else:
                    # Hidden placeholder so the click handler always
                    # receives three inputs.
                    api_key = gr.Textbox(visible=False, value="")

                submit_btn = gr.Button("Find NAICS Code", variant="primary")

            with gr.Column():
                status_output = gr.Markdown(label="Status")
                naics_output = gr.Markdown(label="NAICS Code")
                with gr.Accordion("Company Information", open=False):
                    company_info_output = gr.Markdown()
                with gr.Accordion("Classification Reasoning", open=True):
                    reasoning_output = gr.Markdown()

        # Functions for the interface
        def process_company(company_name, company_description, api_key):
            """Stream (status, naics, company info, reasoning) updates to the UI.

            This is a generator, so every update, including validation
            errors and the final result, must be delivered with ``yield``.
            A value passed to ``return`` inside a generator only ends up on
            ``StopIteration`` and would never be rendered.
            """
            company_name = (company_name or "").strip()
            if not company_name:
                yield "Please enter a company name", "", "", ""
                return

            # Use API key from input or environment
            key_to_use = api_key if api_key else os.environ.get('GEMINI_API_KEY')
            if not key_to_use:
                yield "No API key provided. Please enter your Gemini API key.", "", "", ""
                return

            status_md = "🔍 Searching for company information and NAICS codes...\n\n"
            yield status_md, "", "", ""

            # Run the core functionality
            result = find_naics_code(company_name, company_description, key_to_use)

            # Update status based on results
            if result.get("company_info"):
                status_md += "✅ Found company information\n\n"
                company_info_md = f"## Information found about {company_name}\n\n{result['company_info']}"
            else:
                status_md += "⚠️ Limited company information found\n\n"
                company_info_md = f"Limited information found for {company_name}"

            if result.get("candidates"):
                status_md += f"✅ Found {len(result['candidates'])} potential NAICS codes: {', '.join(result['candidates'])}\n\n"
            else:
                status_md += "⚠️ No specific NAICS codes found in search results\n\n"

            status_md += "🤖 Analyzing classification...\n\n"
            yield status_md, "", company_info_md, ""

            # Format the NAICS code output
            naics_code_md = f"## NAICS Code: {result['naics_code']}"
            # Format the reasoning output
            reasoning_md = f"## Analysis\n\n{result['reasoning']}"

            status_md += "✅ Classification complete!"
            yield status_md, naics_code_md, company_info_md, reasoning_md

        submit_btn.click(
            process_company,
            inputs=[company_name, company_description, api_key],
            outputs=[status_output, naics_output, company_info_output, reasoning_output]
        )

        gr.Examples(
            [
                ["Apple Inc", "Tech company"],
                ["Walmart", "Retail store"],
                ["Goldman Sachs", "Investment bank"],
                ["Ford Motor Company", "Automobile manufacturer"]
            ],
            inputs=[company_name, company_description]
        )

    return demo
# Create the interface at import time so `demo` exists as a module-level name.
demo = create_gradio_interface()

# For Spaces deployment: launch only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()