jzou1995 committed on
Commit
203605e
·
verified ·
1 Parent(s): 3a20bdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -167
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import re
3
  import json
4
- import gradio as gr
5
  import requests
6
  from typing import List, Dict
7
  from googlesearch import search
@@ -31,67 +30,82 @@ def initialize_gemini(api_key: str):
31
  )
32
  return model
33
 
34
- def google_search_naics(company_name: str, company_description: str = "") -> List[str]:
35
- """Find potential NAICS codes for a company using Google search with enhanced context"""
36
- # Create a more effective search query with company description if available
37
- if company_description:
38
- query = f"2022 NAICS code for \"{company_name}\" {company_description} industry classification"
39
- else:
40
- query = f"2022 NAICS code for \"{company_name}\" company industry classification"
41
-
42
  naics_codes = set()
43
 
 
 
 
 
 
 
 
 
 
44
  try:
45
- search_results = search(query, stop=5, pause=2)
46
 
47
- for result_url in search_results:
 
48
  try:
49
- response = requests.get(result_url, timeout=5)
50
- if response.status_code == 200:
51
- # Extract 6-digit NAICS codes
52
- found_codes = re.findall(r'\b\d{6}\b', response.text)
53
- naics_codes.update(found_codes)
 
 
 
 
 
 
 
 
 
 
 
54
  except Exception as e:
55
- print(f"Error fetching {result_url}: {e}")
 
56
 
57
- return list(naics_codes)[:5] # Return up to 5 extracted NAICS codes
 
58
  except Exception as e:
59
- print(f"Error performing Google search: {str(e)}")
60
  return []
61
 
62
- def get_naics_classification(model, company_name: str, context: str, candidates: List[str], search_query: str) -> dict:
63
  """
64
  Use Gemini AI to determine the most appropriate NAICS code from candidates
65
- First provides reasoning, then multiple possibilities with confidence levels
66
  """
67
  try:
 
 
68
  # If we have candidate codes from Google search
69
  if candidates:
 
70
  prompt = f"""
71
  You are a NAICS code classification expert. Based on the company information provided and the NAICS code candidates found from Google search, determine the most appropriate NAICS code.
72
 
73
  Company Name: {company_name}
74
  Context Information: {context}
75
- Google Search Query Used: {search_query}
76
- NAICS Code Candidates from Google Search: {candidates}
77
 
78
- First, start with a section titled "GOOGLE_FINDINGS:" where you describe what the Google search results suggest about this company based on the NAICS codes found.
79
-
80
- Then, in a section titled "REASONING:", explain your reasoning for which industry this company belongs to.
81
 
82
- Then list 3 potential NAICS classifications with confidence percentages (must add up to 100%).
83
- Finally, provide your final conclusion.
 
 
84
 
85
  Your response should be in this format:
86
- GOOGLE_FINDINGS: [Describe what the Google search results suggest about this company based on the NAICS codes found]
87
-
88
- REASONING: [Your detailed reasoning about the company's industry classification]
89
-
90
- POSSIBILITY_1: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
91
- POSSIBILITY_2: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
92
- POSSIBILITY_3: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
93
-
94
- CONCLUSION: I am [XX]% confident this company is [industry description] which is NAICS code [6-digit code]
95
  """
96
  # If no candidates were found from Google search
97
  else:
@@ -100,157 +114,86 @@ You are a NAICS code classification expert. Based on the company information pro
100
 
101
  Company Name: {company_name}
102
  Context Information: {context}
103
- Google Search Query Used: {search_query}
104
-
105
- First, start with a section titled "GOOGLE_FINDINGS:" where you acknowledge that the Google search did not return any specific NAICS codes for this company.
106
 
107
- Then, in a section titled "REASONING:", explain your reasoning for which industry this company belongs to based on the limited information available.
108
-
109
- Then list 3 potential NAICS classifications with confidence percentages (must add up to 100%).
110
- Finally, provide your final conclusion.
111
 
112
  Your response should be in this format:
113
- GOOGLE_FINDINGS: No specific NAICS codes were found in the Google search results using the query "{search_query}".
114
-
115
- REASONING: [Your detailed reasoning about the company's industry classification based on the limited information available]
116
-
117
- POSSIBILITY_1: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
118
- POSSIBILITY_2: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
119
- POSSIBILITY_3: [Industry name] - NAICS Code [6-digit code] - [XX]% confidence
120
-
121
- CONCLUSION: I am [XX]% confident this company is [industry description] which is NAICS code [6-digit code]
122
  """
123
  response = model.generate_content(prompt)
124
  response_text = response.text.strip()
125
 
126
- # Extract Google findings
127
- google_findings_match = re.search(r'GOOGLE_FINDINGS:(.*?)REASONING:', response_text, re.DOTALL | re.IGNORECASE)
128
- google_findings = google_findings_match.group(1).strip() if google_findings_match else "No Google findings provided."
129
-
130
- # Extract reasoning
131
- reasoning_match = re.search(r'REASONING:(.*?)POSSIBILITY_1:', response_text, re.DOTALL | re.IGNORECASE)
132
- reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."
133
-
134
- # Extract possibilities
135
- possibilities = []
136
 
137
- # Try to extract possibility 1
138
- poss1_match = re.search(r'POSSIBILITY_1:(.*?)POSSIBILITY_2:', response_text, re.DOTALL | re.IGNORECASE)
139
- if poss1_match:
140
- possibilities.append(poss1_match.group(1).strip())
141
-
142
- # Try to extract possibility 2
143
- poss2_match = re.search(r'POSSIBILITY_2:(.*?)POSSIBILITY_3:', response_text, re.DOTALL | re.IGNORECASE)
144
- if poss2_match:
145
- possibilities.append(poss2_match.group(1).strip())
146
-
147
- # Try to extract possibility 3
148
- poss3_match = re.search(r'POSSIBILITY_3:(.*?)CONCLUSION:', response_text, re.DOTALL | re.IGNORECASE)
149
- if poss3_match:
150
- possibilities.append(poss3_match.group(1).strip())
151
 
152
- # Extract conclusion
153
- conclusion_match = re.search(r'CONCLUSION:(.*?)$', response_text, re.DOTALL | re.IGNORECASE)
154
- conclusion = conclusion_match.group(1).strip() if conclusion_match else "No conclusion provided."
155
 
156
- # Extract final NAICS code from conclusion
157
- naics_match = re.search(r'NAICS code (\d{6})', conclusion)
158
  if naics_match:
159
- naics_code = naics_match.group(1)
160
  else:
161
- # Try to find any 6-digit code in the conclusion
162
- code_match = re.search(r'\b(\d{6})\b', conclusion)
163
- naics_code = code_match.group(1) if code_match else "000000"
164
-
165
- return {
166
- "naics_code": naics_code,
167
- "google_findings": google_findings,
168
- "reasoning": reasoning,
169
- "possibilities": possibilities,
170
- "conclusion": conclusion
171
- }
172
  except Exception as e:
173
- print(f"Error getting NAICS classification: {str(e)}")
174
  return {
175
  "naics_code": "000000",
176
- "google_findings": "Error occurred during Google search.",
177
- "reasoning": f"Error analyzing company: {str(e)}",
178
- "possibilities": [],
179
- "conclusion": "Error in analysis"
180
  }
181
 
182
- def find_naics_code(api_key, company_name, company_description):
183
- """Main function to find NAICS code that will be called by Gradio"""
184
- if not api_key or not company_name:
185
- return "Please provide both API key and company name."
186
-
187
- try:
188
- # Initialize Gemini API
189
- model = initialize_gemini(api_key)
190
-
191
- # Search for NAICS candidates with company description for better context
192
- naics_candidates = google_search_naics(company_name, company_description)
193
- # Store search query for reporting
194
- search_query = f"2022 NAICS code for \"{company_name}\" {company_description} industry classification" if company_description else f"2022 NAICS code for \"{company_name}\" company industry classification"
 
 
195
 
196
- # Get classification
 
 
197
  if not naics_candidates:
198
- result = get_naics_classification(model, company_name, company_description, [], search_query)
 
 
199
  else:
200
- result = get_naics_classification(model, company_name, company_description, naics_candidates, search_query)
201
-
202
- # Format the output with NAICS code at the end
203
- output = f"## Analysis for {company_name}\n\n"
204
-
205
- # Display search query prominently at the top
206
- output += f"**Google Search Query Used:**\n`{search_query}`\n\n"
207
-
208
- # Add Google findings first
209
- if 'google_findings' in result and result['google_findings']:
210
- output += f"**Google Search Findings:**\n{result['google_findings']}\n\n"
211
-
212
- # Then reasoning
213
- output += f"**Reasoning:**\n{result['reasoning']}\n\n"
214
-
215
- # Add possibilities section
216
- if 'possibilities' in result and result['possibilities']:
217
- output += f"**Possible Classifications:**\n\n"
218
- for i, possibility in enumerate(result['possibilities'], 1):
219
- output += f"{i}. {possibility}\n\n"
220
-
221
- # Add conclusion
222
- if 'conclusion' in result and result['conclusion']:
223
- output += f"**Conclusion:**\n{result['conclusion']}\n\n"
224
-
225
- # Add final NAICS code at the very end
226
- output += f"**FINAL NAICS CODE: {result['naics_code']}**"
227
-
228
- return output
229
-
230
- except Exception as e:
231
- return f"Error: {str(e)}"
232
 
233
- # Create Gradio Interface
234
- with gr.Blocks(title="NAICS Code Finder") as app:
235
- gr.Markdown("# NAICS Code Finder")
236
- gr.Markdown("This app helps you find the appropriate NAICS code for a company based on its name and description.")
237
-
238
- with gr.Row():
239
- with gr.Column():
240
- api_key = gr.Textbox(label="Google Gemini API Key", placeholder="Enter your Gemini API key here", type="password")
241
- company_name = gr.Textbox(label="Company Name", placeholder="Enter the company name")
242
- company_description = gr.Textbox(label="Company Description", placeholder="Enter a brief description of the company", lines=5)
243
 
244
- submit_btn = gr.Button("Find NAICS Code")
245
-
246
- with gr.Column():
247
- output = gr.Markdown(label="Result")
248
-
249
- submit_btn.click(
250
- fn=find_naics_code,
251
- inputs=[api_key, company_name, company_description],
252
- outputs=output
253
- )
254
 
255
  if __name__ == "__main__":
256
- app.launch()
 
1
  import os
2
  import re
3
  import json
 
4
  import requests
5
  from typing import List, Dict
6
  from googlesearch import search
 
30
  )
31
  return model
32
 
33
+ def google_search_naics(company_name: str) -> List[str]:
34
+ """
35
+ Find potential NAICS codes for a company using multiple targeted Google searches
36
+ Uses more specific search queries to improve results
37
+ """
 
 
 
38
  naics_codes = set()
39
 
40
+ # Create multiple search queries for better results
41
+ queries = [
42
+ f"NAICS code for {company_name}",
43
+ f"what is {company_name} company NAICS code",
44
+ f"{company_name} business entity NAICS classification",
45
+ f"{company_name} industry classification NAICS",
46
+ f"{company_name} company information NAICS"
47
+ ]
48
+
49
  try:
50
+ print(f"🔎 Searching Google for NAICS codes for '{company_name}'...")
51
 
52
+ for query in queries:
53
+ print(f" Query: {query}")
54
  try:
55
+ # Search with each query, limiting to 3 results per query
56
+ search_results = search(query, stop=3, pause=2)
57
+
58
+ for result_url in search_results:
59
+ try:
60
+ response = requests.get(result_url, timeout=5)
61
+ if response.status_code == 200:
62
+ # Extract 6-digit NAICS codes
63
+ found_codes = re.findall(r'\b\d{6}\b', response.text)
64
+ naics_codes.update(found_codes)
65
+
66
+ # If we find codes, print them
67
+ if found_codes:
68
+ print(f" Found codes in {result_url}: {found_codes}")
69
+ except Exception as e:
70
+ print(f" ⚠️ Error fetching {result_url}: {e}")
71
  except Exception as e:
72
+ print(f" ⚠️ Error with query '{query}': {e}")
73
+ continue
74
 
75
+ # Return unique codes, limited to 10 most common
76
+ return list(naics_codes)[:10]
77
  except Exception as e:
78
+ print(f"❌ Error performing Google search: {str(e)}")
79
  return []
80
 
81
+ def get_naics_classification(model, company_name: str, context: str, candidates: List[str]) -> dict:
82
  """
83
  Use Gemini AI to determine the most appropriate NAICS code from candidates
84
+ First provides reasoning, then returns the NAICS code and explanation
85
  """
86
  try:
87
+ print("🤖 AI is analyzing NAICS classification...")
88
+
89
  # If we have candidate codes from Google search
90
  if candidates:
91
+ # Create a prompt that asks for research on the candidates
92
  prompt = f"""
93
  You are a NAICS code classification expert. Based on the company information provided and the NAICS code candidates found from Google search, determine the most appropriate NAICS code.
94
 
95
  Company Name: {company_name}
96
  Context Information: {context}
 
 
97
 
98
+ NAICS Code Candidates from Google Search: {candidates}
 
 
99
 
100
+ First, research what these NAICS codes represent:
101
+ 1. For each NAICS code candidate, briefly explain what industry or business activity it corresponds to.
102
+ 2. Then explain which industry classification best matches this company based on the name and context provided.
103
+ 3. Finally, select the single most appropriate NAICS code from the candidates, or suggest a different one if none match.
104
 
105
  Your response should be in this format:
106
+ RESEARCH: [Brief explanation of what each NAICS code represents]
107
+ REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company]
108
+ NAICS_CODE: [6-digit NAICS code]
 
 
 
 
 
 
109
  """
110
  # If no candidates were found from Google search
111
  else:
 
114
 
115
  Company Name: {company_name}
116
  Context Information: {context}
 
 
 
117
 
118
+ First, analyze what industry this company likely belongs to based on its name and the provided context.
119
+ Consider standard business classifications and determine the most appropriate category.
120
+ Then provide the single most appropriate 6-digit NAICS code.
 
121
 
122
  Your response should be in this format:
123
+ REASONING: [Your detailed reasoning about the company's industry classification, including what business activities it likely performs]
124
+ NAICS_CODE: [6-digit NAICS code]
 
 
 
 
 
 
 
125
  """
126
  response = model.generate_content(prompt)
127
  response_text = response.text.strip()
128
 
129
+ # Create result dictionary
130
+ result = {}
 
 
 
 
 
 
 
 
131
 
132
+ # Extract research if available
133
+ if "RESEARCH:" in response_text:
134
+ research_match = re.search(r'RESEARCH:(.*?)REASONING:', response_text, re.DOTALL | re.IGNORECASE)
135
+ if research_match:
136
+ result["research"] = research_match.group(1).strip()
 
 
 
 
 
 
 
 
 
137
 
138
+ # Extract reasoning
139
+ reasoning_match = re.search(r'REASONING:(.*?)NAICS_CODE:', response_text, re.DOTALL | re.IGNORECASE)
140
+ result["reasoning"] = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."
141
 
142
+ # Extract NAICS code
143
+ naics_match = re.search(r'NAICS_CODE:(.*?)(\d{6})', response_text, re.DOTALL)
144
  if naics_match:
145
+ result["naics_code"] = naics_match.group(2)
146
  else:
147
+ # Try to find any 6-digit code in the response
148
+ code_match = re.search(r'\b(\d{6})\b', response_text)
149
+ result["naics_code"] = code_match.group(1) if code_match else "000000"
150
+
151
+ return result
 
 
 
 
 
 
152
  except Exception as e:
153
+ print(f"❌ Error getting NAICS classification: {str(e)}")
154
  return {
155
  "naics_code": "000000",
156
+ "reasoning": f"Error analyzing company: {str(e)}"
 
 
 
157
  }
158
 
159
+ def main():
160
+ """Main function to run the NAICS classifier"""
161
+ print("🚀 NAICS Code Finder\n")
162
+
163
+ # Step 1: Get API Key
164
+ api_key = input("Enter your Google Gemini API Key: ")
165
+ model = initialize_gemini(api_key)
166
+
167
+ while True:
168
+ # Step 2: Get Company Info
169
+ company_name = input("\nEnter the company name (or 'exit' to quit): ")
170
+ if company_name.lower() == 'exit':
171
+ break
172
+
173
+ context = input("Enter a brief description of the company (or press Enter for none): ")
174
 
175
+ # Step 3: Find NAICS Code Candidates
176
+ naics_candidates = google_search_naics(company_name)
177
+
178
  if not naics_candidates:
179
+ print("❌ No NAICS codes found from Google search.")
180
+ # Ask Gemini to suggest a code even without candidates
181
+ result = get_naics_classification(model, company_name, context, [])
182
  else:
183
+ print(f"✅ Found {len(naics_candidates)} NAICS candidates: {naics_candidates}")
184
+ # Use Gemini to select the best code
185
+ result = get_naics_classification(model, company_name, context, naics_candidates)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
+ # Display research findings if available
188
+ if "research" in result:
189
+ print(f"\n📊 NAICS Code Research:\n{result['research']}")
190
+
191
+ # Display reasoning
192
+ print(f"\n🧠 Reasoning:\n{result['reasoning']}")
 
 
 
 
193
 
194
+ # Output the NAICS code
195
+ print(f"\n🏆 NAICS Code: {result['naics_code']}")
196
+ print("-" * 80)
 
 
 
 
 
 
 
197
 
198
  if __name__ == "__main__":
199
+ main()