import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io
# --- Scraper functions (adapted from your scraper.py) ---
def fetch_detail(cert_number, main_data, headers):
"""
For a given certification number, call the URAC detail API and return a list of rows.
If no site records are returned, a row with blank site fields is returned.
"""
detail_rows = []
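    # certificationEntityInfo endpoint: appears to return the site/entity records tied to this certification.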
url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
try:
        response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
detail_data = response.json()
entities = detail_data.get("certificationEntities", [])
if not entities:
row = main_data.copy()
row.update({
"Site Name": None,
"Site Address": None,
"Site Street": None,
"Site City": None,
"Site State": None,
"Site ZipCode": None
})
detail_rows.append(row)
else:
for entity_item in entities:
site_entity = entity_item.get("entity", {})
site_name = site_entity.get("name", None)
# Combine the site address parts.
site_address_parts = []
for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
part = site_entity.get(key)
if part:
site_address_parts.append(part)
site_address = ', '.join(site_address_parts)
# Parse the site address using usaddress.
parsed_site = usaddress.parse(site_address)
site_street, site_city, site_state, site_zipcode = '', '', '', ''
for value, label in parsed_site:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
site_street += f' {value}'
elif label == 'PlaceName':
site_city = value
elif label == 'StateName':
site_state = value
elif label == 'ZipCode':
site_zipcode = value
row = main_data.copy()
row.update({
"Site Name": site_name,
"Site Address": site_address,
"Site Street": site_street.strip(),
"Site City": site_city,
"Site State": site_state,
"Site ZipCode": site_zipcode
})
detail_rows.append(row)
    except Exception as e:
        # st.write from a worker thread has no Streamlit script context and will not render;
        # re-raise so the main thread collecting as_completed() results reports the error.
        raise RuntimeError(f"Error fetching detail for cert_number {cert_number}: {e}") from e
return detail_rows
def scrape_urac(progress_bar=None):
"""
Scrape URAC accreditation data:
1. Call the main filter API.
2. Parse organization details.
3. For each organization, call the detail API in parallel to get one row per site address.
Returns a pandas DataFrame.
"""
organizations = []
all_rows = []
headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'content-type': 'application/json',
'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
'origin': 'https://accreditnet.urac.org',
'priority': 'u=1, i',
'referer': 'https://accreditnet.urac.org/directory/',
'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'sec-gpc': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
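    # Filter payload: restrict the directory to accredited "Specialty Pharmacy" programs.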
json_data = {
'filter': {
'allParts': [
{
'name': 'completedApplicationDecisionItem.typeDisplay.value',
'comparator': 0,
'valueType': 0,
'textValue': 'Accreditation Program',
'integerValue': None,
'decimalValue': None,
'dateTimeValue': None,
'booleanValue': None,
'innerFilter': None,
},
{
'name': 'certificateType.programName',
'comparator': 0,
'valueType': 0,
'textValue': 'Specialty Pharmacy',
'integerValue': None,
'decimalValue': None,
'dateTimeValue': None,
'booleanValue': None,
'innerFilter': None,
},
],
'anyParts': [],
'notParts': [],
},
'orderBy': 'certificationNumber',
'pageSize': 15,
'limit': 100,
}
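    # Query the URAC directory filter endpoint for the matching organizations.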
try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
response.raise_for_status()
data = response.json()
except Exception as e:
st.write("Error processing URAC main API:", e)
return pd.DataFrame()
# Parse organization items.
for item in data.get('items', []):
entity = item.get('entity', {})
org_name = entity.get('name', None)
decision = item.get('completedApplicationDecisionItem', {})
outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
status = outcome if outcome is not None else item.get('effectiveStatusName', None)
srt_date = item.get('issuedDate', None)
exp_date = item.get('expirationDate', None)
program = item.get('certificateType', {}).get('displayName', None)
address_parts = []
for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
part = entity.get(key)
if part:
address_parts.append(part)
address = ', '.join(address_parts)
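        # Split the combined address into street/city/state/zip components with usaddress.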
parsed_address = usaddress.parse(address)
street, city, state, zipcode = '', '', '', ''
for value, label in parsed_address:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
street += f' {value}'
elif label == 'PlaceName':
city = value
elif label == 'StateName':
state = value
elif label == 'ZipCode':
zipcode = value
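        # Prefer the primary certification number; fall back to the item-level field if absent.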
cert_number = item.get("primaryCertification", {}).get("certificationNumber")
if not cert_number:
cert_number = item.get("certificationNumber")
org_data = {
"Organization Name": org_name,
"Accreditation Status": status,
"Start Date": srt_date,
"Expiration Date": exp_date,
"Program": program,
"Address": address,
"Street": street.strip(),
"City": city,
"State": state,
"ZipCode": zipcode,
"Certification Number": cert_number
}
organizations.append(org_data)
# Fetch detail API calls in parallel and update the progress bar.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
future_to_org = {
executor.submit(fetch_detail, org["Certification Number"], org, headers): org
for org in organizations if org["Certification Number"]
}
total = len(future_to_org)
completed = 0
for future in concurrent.futures.as_completed(future_to_org):
try:
detail_rows = future.result()
all_rows.extend(detail_rows)
except Exception as exc:
org = future_to_org[future]
st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
completed += 1
if progress_bar is not None and total > 0:
progress_bar.progress(min(100, int(100 * completed / total)))
return pd.DataFrame(all_rows)
def _parse_accreditation_blocks(detail_soup):
"""
    Parse accreditation blocks (div elements with class "main_cont_det")
    and return a list of dicts with start/expiration dates, program, and service.
"""
results = []
blocks = detail_soup.find_all('div', class_='main_cont_det')
for block in blocks:
start_date, expiration_date = '', ''
site_program, site_service = '', ''
for p in block.find_all('p'):
text = p.get_text(strip=True)
if 'Date:' in text:
m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
if m:
start_date = m.group(1)
expiration_date = m.group(2)
elif 'Program:' in text:
site_program = text.split('Program:')[-1].strip()
elif 'Service:' in text:
site_service = text.split('Service:')[-1].strip()
results.append({
"Start Date": start_date,
"Expiration Date": expiration_date,
"SiteProgram": site_program,
"SiteService": site_service
})
return results
def _extract_original_program(detail_soup):
"""
Extract the original Program value from the detail soup.
"""
program = ''
for p in detail_soup.find_all('p'):
if 'Program:' in p.get_text():
program = p.get_text(strip=True).split('Program:')[-1].strip()
break
return program
def _fetch_detail_for_company(company, base_url, headers, cookies):
"""
For a given company from the ACHC main API, fetch the detail API,
parse the HTML detail, and return one or more rows.
"""
rows = []
company_id = company["company_id"]
detail_payload = f'action=view_provider_details&data_company_id={company_id}'
try:
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies, data=detail_payload, timeout=30)
detail_resp.raise_for_status()
detail_json = detail_resp.json()
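        # The JSON payload wraps rendered HTML in 'response_html'; parse it with BeautifulSoup.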
detail_html = detail_json.get('response_html', '')
detail_soup = BeautifulSoup(detail_html, 'html.parser')
original_program = _extract_original_program(detail_soup)
acc_blocks = _parse_accreditation_blocks(detail_soup)
if not acc_blocks:
rows.append({
"Organization Name": company["org_name"],
"Start Date": '',
"Expiration Date": '',
"Accreditation Status": "N/A",
"Program": original_program,
"SiteProgram": '',
"SiteService": '',
"Address": company["address"],
"Street": company["street"],
"City": company["city"],
"State": company["state"],
"ZipCode": company["zipcode"]
})
else:
for block in acc_blocks:
rows.append({
"Organization Name": company["org_name"],
"Start Date": block["Start Date"],
"Expiration Date": block["Expiration Date"],
"Accreditation Status": "N/A",
"Program": original_program,
"SiteProgram": block["SiteProgram"],
"SiteService": block["SiteService"],
"Address": company["address"],
"Street": company["street"],
"City": company["city"],
"State": company["state"],
"ZipCode": company["zipcode"]
})
    except Exception as e:
        # st.write from a worker thread has no Streamlit script context and will not render;
        # re-raise so the main thread reports the error.
        raise RuntimeError(f"Error fetching ACHC detail for company ID {company_id}: {e}") from e
return rows
def scrape_achc(progress_bar=None):
"""
Scrape ACHC data:
1. Call the main API to get HTML.
2. Parse each company’s info.
3. In parallel, call the detail API to get accreditation details.
Returns a pandas DataFrame.
"""
headers = {
'accept': 'application/json, text/javascript, */*; q=0.01',
'accept-language': 'en-US,en;q=0.8',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'origin': 'https://www.achc.org',
'priority': 'u=1, i',
'referer': 'https://www.achc.org/find-a-provider/',
'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'sec-gpc': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
cookies = {
'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
}
base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
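    # ACHC serves both the listing and the per-company detail through WordPress admin-ajax,
    # using form-encoded "action" payloads.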
main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies, data=main_payload, timeout=30)
main_resp.raise_for_status()
main_json = main_resp.json()
except Exception as e:
st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame(columns=["Organization Name"])  # empty frame that still carries the merge join column
main_html = main_json.get('response_html', '')
main_soup = BeautifulSoup(main_html, 'html.parser')
company_items = main_soup.find_all('li')
companies = []
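    # Pull the name, address, and data-company-id (needed for the detail call) from each listing.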
for item in company_items:
list_box = item.find('div', class_='list_cont_box')
if not list_box:
continue
org_tag = list_box.find('b', class_='company_name')
org_name = org_tag.get_text(strip=True) if org_tag else ''
address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
address = ' '.join(address_parts)
parsed = usaddress.parse(address)
street, city, state, zipcode = '', '', '', ''
for value, label in parsed:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
street += f' {value}'
elif label == 'PlaceName':
city = value
elif label == 'StateName':
state = value
elif label == 'ZipCode':
zipcode = value
view_more = item.find('p', class_='view_more_eye')
if not view_more or not view_more.has_attr('data-company-id'):
continue
company_id = view_more['data-company-id']
companies.append({
"company_id": company_id,
"org_name": org_name,
"address": address,
"street": street.strip(),
"city": city,
"state": state,
"zipcode": zipcode
})
detail_rows_all = []
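    # Fetch accreditation details for each company in parallel and update the progress bar.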
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
for comp in companies
]
total = len(futures)
completed = 0
for future in concurrent.futures.as_completed(futures):
try:
rows = future.result()
detail_rows_all.extend(rows)
except Exception as exc:
st.write(f"Error fetching ACHC detail: {exc}")
completed += 1
if progress_bar is not None and total > 0:
progress_bar.progress(min(100, int(100 * completed / total)))
df = pd.DataFrame(detail_rows_all, columns=[
"Organization Name",
"Start Date",
"Expiration Date",
"Accreditation Status",
"Program",
"SiteProgram",
"SiteService",
"Address",
"Street",
"City",
"State",
"ZipCode"
])
return df
# --- Streamlit UI ---
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
def run_scraper():
# Scrape URAC data with its own progress bar.
with st.spinner("Scraping URAC data..."):
urac_progress = st.progress(0)
urac_df = scrape_urac(progress_bar=urac_progress)
# Scrape ACHC data with its own progress bar.
with st.spinner("Scraping ACHC data..."):
achc_progress = st.progress(0)
achc_df = scrape_achc(progress_bar=achc_progress)
# Merge data and write to an in-memory Excel file.
with st.spinner("Merging data and generating Excel..."):
        # Merge only when both frames have data; an empty frame may lack the join column.
        if not urac_df.empty and not achc_df.empty:
            merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                                 suffixes=("_URAC", "_ACHC"))
        else:
            merged_df = pd.DataFrame()
output = io.BytesIO()
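        # Write each source to its own sheet; the merged sheet is added only when both scrapes returned data.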
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
if not urac_df.empty:
urac_df.to_excel(writer, sheet_name="URAC", index=False)
if not achc_df.empty:
achc_df.to_excel(writer, sheet_name="ACHC", index=False)
if not urac_df.empty and not achc_df.empty:
merged_df.to_excel(writer, sheet_name="Merged", index=False)
output.seek(0)
return output
if st.button("Start Scraping"):
excel_data = run_scraper()
st.success("Scraping completed!")
st.download_button(
label="Download Excel File",
data=excel_data,
file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)