Update app.py
app.py
CHANGED
@@ -12,8 +12,8 @@ import io
 
 def fetch_detail(cert_number, main_data, headers):
     """
-    For a given certification number, call the URAC detail API and return a list of rows
-
+    For a given certification number, call the URAC detail API and return a list of rows.
+    If no site records are returned, a row with blank site fields is returned.
     """
     detail_rows = []
     url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
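The docstring above describes one detail call per certification number. As a rough sketch of that request pattern (the use of `requests`, the timeout value, and `raise_for_status()` are assumptions, not taken from this diff):

import requests

def get_detail_json(cert_number, headers):
    # Hypothetical helper mirroring the URL pattern shown above.
    url = (
        "https://accreditnet.urac.org/api/urac/rest/directoryInfo/"
        f"{cert_number}/certificationEntityInfo/type/Accreditation"
    )
    response = requests.get(url, headers=headers, timeout=30)  # timeout is an assumption
    response.raise_for_status()
    return response.json()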
@@ -23,7 +23,6 @@ def fetch_detail(cert_number, main_data, headers):
         detail_data = response.json()
         entities = detail_data.get("certificationEntities", [])
         if not entities:
-            # No site records: return row with blank site fields.
             row = main_data.copy()
             row.update({
                 "Site Name": None,
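The comment removed here documented the fallback that the surrounding lines still implement: when the detail response contains no certificationEntities, the organization is emitted as a single row whose site columns are blank. A minimal sketch of that pattern, with the extra site field names invented for illustration:

def rows_for_org(main_data, entities):
    # Sketch only; "Site Address" and "Site City" are invented column names.
    if not entities:
        row = main_data.copy()
        row.update({"Site Name": None, "Site Address": None, "Site City": None})
        return [row]
    return [dict(main_data, **{"Site Name": e.get("name")}) for e in entities]

print(rows_for_org({"Organization Name": "Acme"}, []))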
@@ -71,12 +70,12 @@ def fetch_detail(cert_number, main_data, headers):
         st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
     return detail_rows
 
-def scrape_urac():
+def scrape_urac(progress_bar=None):
     """
     Scrape URAC accreditation data:
     1. Call the main filter API.
     2. Parse organization details.
-    3. For each organization, call the detail API
+    3. For each organization, call the detail API in parallel to get one row per site address.
     Returns a pandas DataFrame.
     """
     organizations = []
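The new progress_bar=None parameter lets the page hand an st.progress widget into the scraper while keeping the function callable without one. A minimal sketch of that optional-widget pattern (the loop body is a stand-in for the real API calls, and the snippet assumes it runs under streamlit run):

import time
import streamlit as st

def scrape_demo(progress_bar=None):
    items = list(range(5))        # stand-in for the organizations to process
    total = len(items)
    results = []
    for done, item in enumerate(items, start=1):
        time.sleep(0.1)           # stand-in for a detail API call
        results.append(item)
        if progress_bar is not None and total > 0:
            progress_bar.progress(min(100, int(100 * done / total)))
    return results

rows = scrape_demo(progress_bar=st.progress(0))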
@@ -143,7 +142,7 @@ def scrape_urac():
         st.write("Error processing URAC main API:", e)
         return pd.DataFrame()
 
-    # Parse
+    # Parse organization items.
     for item in data.get('items', []):
         entity = item.get('entity', {})
         org_name = entity.get('name', None)
@@ -170,7 +169,6 @@ def scrape_urac():
                 state = value
             elif label == 'ZipCode':
                 zipcode = value
-        # Get certification number.
         cert_number = item.get("primaryCertification", {}).get("certificationNumber")
         if not cert_number:
             cert_number = item.get("certificationNumber")
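The lookup above prefers primaryCertification.certificationNumber and falls back to a top-level certificationNumber. The same fallback can be written as a single expression; this is purely illustrative and not part of the diff:

def get_cert_number(item):
    # `or` falls through to the top-level key when the nested value is missing or empty.
    return (
        item.get("primaryCertification", {}).get("certificationNumber")
        or item.get("certificationNumber")
    )

print(get_cert_number({"certificationNumber": "C-123"}))                           # C-123
print(get_cert_number({"primaryCertification": {"certificationNumber": "P-9"}}))   # P-9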
@@ -189,7 +187,7 @@ def scrape_urac():
         }
         organizations.append(org_data)
 
-    #
+    # Fetch detail API calls in parallel and update the progress bar.
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
         future_to_org = {
             executor.submit(fetch_detail, org["Certification Number"], org, headers): org
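The new comment describes the fan-out the following lines implement: one fetch_detail call per organization, submitted to a thread pool and keyed by a future-to-organization map. A condensed, self-contained sketch of that pattern, with a toy fetch function standing in for fetch_detail:

import concurrent.futures

def fetch(cert_number):                      # stand-in for fetch_detail
    return [{"Certification Number": cert_number}]

orgs = [{"Certification Number": n} for n in ("A1", "B2", "C3")]
all_rows = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_org = {executor.submit(fetch, o["Certification Number"]): o for o in orgs}
    for future in concurrent.futures.as_completed(future_to_org):
        try:
            all_rows.extend(future.result())
        except Exception as exc:
            print(f"Error fetching detail for {future_to_org[future]}: {exc}")
print(len(all_rows))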
@@ -205,6 +203,8 @@ def scrape_urac():
                 org = future_to_org[future]
                 st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
             completed += 1
+            if progress_bar is not None and total > 0:
+                progress_bar.progress(min(100, int(100 * completed / total)))
     return pd.DataFrame(all_rows)
 
 def _parse_accreditation_blocks(detail_soup):
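The two added lines turn the completed/total counters into a 0-100 integer for st.progress, clamping with min() and guarding against an empty work list. The same arithmetic in isolation:

def progress_value(completed, total):
    # Mirrors the guard and clamp used above; returns an int in [0, 100].
    if total <= 0:
        return 0
    return min(100, int(100 * completed / total))

assert progress_value(5, 10) == 50
assert progress_value(12, 10) == 100   # never exceeds 100 even if counts drift
assert progress_value(0, 0) == 0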
@@ -298,7 +298,7 @@ def _fetch_detail_for_company(company, base_url, headers, cookies):
         st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
     return rows
 
-def scrape_achc():
+def scrape_achc(progress_bar=None):
     """
     Scrape ACHC data:
     1. Call the main API to get HTML.
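As the docstring says, the ACHC main API returns HTML (wrapped in JSON) rather than structured records, which is why BeautifulSoup does the parsing further down. A small sketch of lifting markup out of a 'response_html' field; the sample payload and the 'list_box' class selector are assumptions for illustration:

from bs4 import BeautifulSoup

main_json = {   # invented sample payload; the real one comes from the ACHC endpoint
    "response_html": "<div class='list_box'><b class='company_name'>Acme Home Care</b>"
                     "<p>1 Main St</p><p>Springfield, IL 62701</p></div>"
}
soup = BeautifulSoup(main_json.get("response_html", ""), "html.parser")
for box in soup.find_all("div", class_="list_box"):       # class name assumed
    name_tag = box.find("b", class_="company_name")
    print(name_tag.get_text(strip=True) if name_tag else "")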
@@ -334,7 +334,7 @@ def scrape_achc():
         main_json = main_resp.json()
     except Exception as e:
         st.write(f"Error fetching ACHC main API: {e}")
-        return pd.DataFrame()
+        return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name'])
 
     main_html = main_json.get('response_html', '')
     main_soup = BeautifulSoup(main_html, 'html.parser')
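The changed error-path return matters for the merge at the bottom of the file: pd.merge(..., on="Organization Name") raises if either frame lacks that column, so an empty frame that still carries the merge key keeps the rest of the pipeline alive. A small demonstration with toy data; pd.DataFrame(columns=[...]) is used here as a simpler stand-in for the keyed empty frame:

import pandas as pd

urac_df = pd.DataFrame({"Organization Name": ["Acme"], "State": ["TX"]})
empty_plain = pd.DataFrame()                                  # old error-path return
empty_keyed = pd.DataFrame(columns=["Organization Name"])     # keyed empty frame

try:
    pd.merge(urac_df, empty_plain, on="Organization Name", how="outer")
except KeyError as exc:
    print("merge with a plain empty frame fails:", exc)

print(pd.merge(urac_df, empty_keyed, on="Organization Name", how="outer"))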
@@ -346,7 +346,6 @@ def scrape_achc():
             continue
         org_tag = list_box.find('b', class_='company_name')
         org_name = org_tag.get_text(strip=True) if org_tag else ''
-        # Join all <p> texts for the address.
         address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
         address = ' '.join(address_parts)
         parsed = usaddress.parse(address)
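The removed comment described the step the code still performs: every <p> text in a listing is joined into one address string, which usaddress then labels token by token. A standalone sketch with an invented address (requires the usaddress package):

import usaddress

address_parts = ["4400 West Main Street", "Suite 210", "Richmond, VA 23230"]   # invented
address = " ".join(address_parts)
parsed = usaddress.parse(address)    # list of (token, label) tuples
state = next((tok for tok, label in parsed if label == "StateName"), "")
zipcode = next((tok for tok, label in parsed if label == "ZipCode"), "")
print(state, zipcode)                # expected: VA 23230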
@@ -389,6 +388,8 @@ def scrape_achc():
             except Exception as exc:
                 st.write(f"Error fetching ACHC detail: {exc}")
             completed += 1
+            if progress_bar is not None and total > 0:
+                progress_bar.progress(min(100, int(100 * completed / total)))
     df = pd.DataFrame(detail_rows_all, columns=[
         "Organization Name",
         "Start Date",
@@ -411,28 +412,28 @@ st.title("Accreditation Data Scraper")
 st.write("Click the button below to start scraping and generate an Excel file.")
 
 def run_scraper():
-
-
+    # Scrape URAC data with its own progress bar.
     with st.spinner("Scraping URAC data..."):
-
-
-
+        urac_progress = st.progress(0)
+        urac_df = scrape_urac(progress_bar=urac_progress)
+    # Scrape ACHC data with its own progress bar.
     with st.spinner("Scraping ACHC data..."):
-
-
-
+        achc_progress = st.progress(0)
+        achc_df = scrape_achc(progress_bar=achc_progress)
+    # Merge data and write to an in-memory Excel file.
     with st.spinner("Merging data and generating Excel..."):
-        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
+        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                              suffixes=("_URAC", "_ACHC"))
-        # Write to an in-memory bytes buffer.
         output = io.BytesIO()
         with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-            urac_df.
-
-
-
+            if not urac_df.empty:
+                urac_df.to_excel(writer, sheet_name="URAC", index=False)
+            if not achc_df.empty:
+                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
+            if not urac_df.empty and not achc_df.empty:
+                merged_df.to_excel(writer, sheet_name="Merged", index=False)
+            # writer.save()
         output.seek(0)
-        progress_bar.progress(100)
         return output
 
 if st.button("Start Scraping"):
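The rewritten run_scraper joins the two sources on "Organization Name" with an outer merge (organizations present in only one source are kept, and clashing column names get _URAC/_ACHC suffixes), then writes each frame to its own sheet of an in-memory workbook. A compact sketch of that output step with toy frames; it assumes the xlsxwriter engine is installed, and it relies on the ExcelWriter context manager saving the workbook on exit, which is why the explicit writer.save() is left commented out in the new code:

import io
import pandas as pd

urac_df = pd.DataFrame({"Organization Name": ["Acme"], "State": ["TX"]})                   # toy data
achc_df = pd.DataFrame({"Organization Name": ["Acme", "Beta"], "Program": ["HH", "HOS"]})  # toy data

merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                     suffixes=("_URAC", "_ACHC"))

output = io.BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:   # saves on exiting the block
    if not urac_df.empty:
        urac_df.to_excel(writer, sheet_name="URAC", index=False)
    if not achc_df.empty:
        achc_df.to_excel(writer, sheet_name="ACHC", index=False)
    if not urac_df.empty and not achc_df.empty:
        merged_df.to_excel(writer, sheet_name="Merged", index=False)
output.seek(0)   # rewind so the buffer can be handed to st.download_button
print(len(output.getvalue()), "bytes written")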