import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io
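# Third-party dependencies assumed installed: streamlit, pandas, requests,
# usaddress, beautifulsoup4, and xlsxwriter (used by pd.ExcelWriter below).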
# --- Scraper functions (adapted from your scraper.py) ---

def fetch_detail(cert_number, main_data, headers):
    """
    For a given certification number, call the URAC detail API and return a list of rows.
    If no site records are returned, a single row with blank site fields is returned.
    Exceptions propagate to the caller, which reports them from the main thread
    (st.write is unreliable inside worker threads, which lack Streamlit's script-run context).
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    detail_data = response.json()
    entities = detail_data.get("certificationEntities", [])
    if not entities:
        row = main_data.copy()
        row.update({
            "Site Name": None,
            "Site Address": None,
            "Site Street": None,
            "Site City": None,
            "Site State": None,
            "Site ZipCode": None
        })
        detail_rows.append(row)
    else:
        for entity_item in entities:
            site_entity = entity_item.get("entity", {})
            site_name = site_entity.get("name")
            # Combine the site address parts into one comma-separated string.
            site_address_parts = []
            for key in ('line1', 'line2', 'city', 'stateName', 'zipcode'):
                part = site_entity.get(key)
                if part:
                    site_address_parts.append(part)
            site_address = ', '.join(site_address_parts)
            # Split the combined address back into components with usaddress.
            parsed_site = usaddress.parse(site_address)
            site_street, site_city, site_state, site_zipcode = '', '', '', ''
            for value, label in parsed_site:
                value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
                if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                    site_street += f' {value}'
                elif label == 'PlaceName':
                    site_city = value
                elif label == 'StateName':
                    site_state = value
                elif label == 'ZipCode':
                    site_zipcode = value
            row = main_data.copy()
            row.update({
                "Site Name": site_name,
                "Site Address": site_address,
                "Site Street": site_street.strip(),
                "Site City": site_city,
                "Site State": site_state,
                "Site ZipCode": site_zipcode
            })
            detail_rows.append(row)
    return detail_rows
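# Illustrative sketch of what usaddress.parse returns: a list of (token, label)
# pairs, with punctuation left attached to tokens (exact tagging can vary by input):
#   usaddress.parse('123 Main St, Springfield, IL 62704')
#   -> [('123', 'AddressNumber'), ('Main', 'StreetName'), ('St,', 'StreetNamePostType'),
#       ('Springfield,', 'PlaceName'), ('IL', 'StateName'), ('62704', 'ZipCode')]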
def scrape_urac(progress_bar=None):
    """
    Scrape URAC accreditation data:
    1. Call the main filter API.
    2. Parse organization details.
    3. For each organization, call the detail API in parallel to get one row per site address.
    Returns a pandas DataFrame.
    """
    organizations = []
    all_rows = []
    # Request headers copied from a browser session; the customerid value appears
    # to be required by the directory API.
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Filter payload: restrict results to "Accreditation Program" decisions for
    # the "Specialty Pharmacy" program.
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
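    # A note on the filter semantics above, inferred from the field names rather
    # than official API docs: 'allParts' entries appear to be ANDed together
    # ('anyParts' would be ORed, 'notParts' negated), and comparator 0 with
    # valueType 0 appears to request an exact text match on the named field.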
    try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()
    # Parse organization items.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name')
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        status = outcome if outcome is not None else item.get('effectiveStatusName')
        start_date = item.get('issuedDate')
        exp_date = item.get('expirationDate')
        program = item.get('certificateType', {}).get('displayName')
        address_parts = []
        for key in ('line1', 'line2', 'city', 'stateName', 'zipcode'):
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # Prefer the primary certification number; fall back to the item-level one.
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": start_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)
    # Fetch the detail API calls in parallel, updating the progress bar as futures complete.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        total = len(future_to_org)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(all_rows)
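# The pattern above (one future per organization, drained with as_completed()
# while ticking the progress bar) is reused for the ACHC scraper below.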
def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts.
    """
    results = []
    blocks = detail_soup.find_all('div', class_='main_cont_det')
    for block in blocks:
        start_date, expiration_date = '', ''
        site_program, site_service = '', ''
        for p in block.find_all('p'):
            text = p.get_text(strip=True)
            if 'Date:' in text:
                m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if m:
                    start_date = m.group(1)
                    expiration_date = m.group(2)
            elif 'Program:' in text:
                site_program = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                site_service = text.split('Service:')[-1].strip()
        results.append({
            "Start Date": start_date,
            "Expiration Date": expiration_date,
            "SiteProgram": site_program,
            "SiteService": site_service
        })
    return results
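# The date regex above expects paragraph text shaped like this hypothetical
# example: "Date: 01/15/2023 Through 01/15/2026", which yields
# start_date = '01/15/2023' and expiration_date = '01/15/2026'.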
def _extract_original_program(detail_soup):
    """
    Extract the original Program value from the detail soup.
    """
    program = ''
    for p in detail_soup.find_all('p'):
        if 'Program:' in p.get_text():
            program = p.get_text(strip=True).split('Program:')[-1].strip()
            break
    return program
def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    For a given company from the ACHC main API, fetch the detail API,
    parse the HTML detail, and return one or more rows.
    Exceptions propagate to the caller, which reports them from the main thread.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    detail_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                data=detail_payload, timeout=30)
    detail_resp.raise_for_status()
    detail_json = detail_resp.json()
    detail_html = detail_json.get('response_html', '')
    detail_soup = BeautifulSoup(detail_html, 'html.parser')
    original_program = _extract_original_program(detail_soup)
    acc_blocks = _parse_accreditation_blocks(detail_soup)
    if not acc_blocks:
        # No accreditation blocks: emit a single row with blank date/site fields.
        rows.append({
            "Organization Name": company["org_name"],
            "Start Date": '',
            "Expiration Date": '',
            "Accreditation Status": "N/A",
            "Program": original_program,
            "SiteProgram": '',
            "SiteService": '',
            "Address": company["address"],
            "Street": company["street"],
            "City": company["city"],
            "State": company["state"],
            "ZipCode": company["zipcode"]
        })
    else:
        # One row per accreditation block.
        for block in acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": block["Start Date"],
                "Expiration Date": block["Expiration Date"],
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": block["SiteProgram"],
                "SiteService": block["SiteService"],
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
    return rows
def scrape_achc(progress_bar=None):
    """
    Scrape ACHC data:
    1. Call the main API to get HTML.
    2. Parse each company's info.
    3. In parallel, call the detail API to get accreditation details.
    Returns a pandas DataFrame.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # This cookie appears to come from the site's popup plugin and simply
    # marks a promotional popup as already shown.
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
    try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                  data=main_payload, timeout=30)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        # Return an empty frame with the join key so downstream code still works.
        return pd.DataFrame(columns=["Organization Name"])
    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # The "view more" element carries the company ID needed for the detail call.
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })
    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Map futures back to their companies so errors can name the company.
        future_to_company = {
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies): comp
            for comp in companies
        }
        total = len(future_to_company)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_company):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                comp = future_to_company[future]
                st.write(f"Error fetching ACHC detail for company ID {comp['company_id']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    # Passing an explicit column list keeps the schema stable even when no rows were scraped.
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df
# --- Streamlit UI ---
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
def run_scraper():
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)
    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)
    # Merge data and write to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        output = io.BytesIO()
        # The ExcelWriter context manager saves the workbook on exit; no explicit save() is needed.
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if not urac_df.empty:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if not achc_df.empty:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            # Only merge when both sources returned data; otherwise the join key may be missing.
            if not urac_df.empty and not achc_df.empty:
                merged_df = pd.merge(urac_df, achc_df, on="Organization Name",
                                     how="outer", suffixes=("_URAC", "_ACHC"))
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
    return output
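# A minimal sketch of the in-memory Excel pattern used above (the names here
# are illustrative, not part of the app):
#   buf = io.BytesIO()
#   with pd.ExcelWriter(buf, engine='xlsxwriter') as writer:
#       pd.DataFrame({"a": [1]}).to_excel(writer, sheet_name="Sheet1", index=False)
#   buf.seek(0)  # rewind so st.download_button can read the bytes from the start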
if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )