import streamlit as st import pandas as pd import requests import usaddress import concurrent.futures import re from bs4 import BeautifulSoup from datetime import datetime import io # --- Scraper functions (adapted from your scraper.py) --- def fetch_detail(cert_number, main_data, headers): """ For a given certification number, call the URAC detail API and return a list of rows. If no site records are returned, a row with blank site fields is returned. """ detail_rows = [] url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation" try: response = requests.get(url, headers=headers) response.raise_for_status() detail_data = response.json() entities = detail_data.get("certificationEntities", []) if not entities: row = main_data.copy() row.update({ "Site Name": None, "Site Address": None, "Site Street": None, "Site City": None, "Site State": None, "Site ZipCode": None }) detail_rows.append(row) else: for entity_item in entities: site_entity = entity_item.get("entity", {}) site_name = site_entity.get("name", None) # Combine the site address parts. site_address_parts = [] for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']: part = site_entity.get(key) if part: site_address_parts.append(part) site_address = ', '.join(site_address_parts) # Parse the site address using usaddress. parsed_site = usaddress.parse(site_address) site_street, site_city, site_state, site_zipcode = '', '', '', '' for value, label in parsed_site: if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'): site_street += f' {value}' elif label == 'PlaceName': site_city = value elif label == 'StateName': site_state = value elif label == 'ZipCode': site_zipcode = value row = main_data.copy() row.update({ "Site Name": site_name, "Site Address": site_address, "Site Street": site_street.strip(), "Site City": site_city, "Site State": site_state, "Site ZipCode": site_zipcode }) detail_rows.append(row) except Exception as e: st.write(f"Error fetching detail for cert_number {cert_number}: {e}") return detail_rows def scrape_urac(progress_bar=None): """ Scrape URAC accreditation data: 1. Call the main filter API. 2. Parse organization details. 3. For each organization, call the detail API in parallel to get one row per site address. Returns a pandas DataFrame. """ organizations = [] all_rows = [] headers = { 'accept': '*/*', 'accept-language': 'en-US,en;q=0.9', 'content-type': 'application/json', 'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C', 'origin': 'https://accreditnet.urac.org', 'priority': 'u=1, i', 'referer': 'https://accreditnet.urac.org/directory/', 'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'sec-gpc': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' } json_data = { 'filter': { 'allParts': [ { 'name': 'completedApplicationDecisionItem.typeDisplay.value', 'comparator': 0, 'valueType': 0, 'textValue': 'Accreditation Program', 'integerValue': None, 'decimalValue': None, 'dateTimeValue': None, 'booleanValue': None, 'innerFilter': None, }, { 'name': 'certificateType.programName', 'comparator': 0, 'valueType': 0, 'textValue': 'Specialty Pharmacy', 'integerValue': None, 'decimalValue': None, 'dateTimeValue': None, 'booleanValue': None, 'innerFilter': None, }, ], 'anyParts': [], 'notParts': [], }, 'orderBy': 'certificationNumber', 'pageSize': 15, 'limit': 100, } try: response = requests.post( 'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter', headers=headers, json=json_data ) response.raise_for_status() data = response.json() except Exception as e: st.write("Error processing URAC main API:", e) return pd.DataFrame() # Parse organization items. for item in data.get('items', []): entity = item.get('entity', {}) org_name = entity.get('name', None) decision = item.get('completedApplicationDecisionItem', {}) outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value') status = outcome if outcome is not None else item.get('effectiveStatusName', None) srt_date = item.get('issuedDate', None) exp_date = item.get('expirationDate', None) program = item.get('certificateType', {}).get('displayName', None) address_parts = [] for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']: part = entity.get(key) if part: address_parts.append(part) address = ', '.join(address_parts) parsed_address = usaddress.parse(address) street, city, state, zipcode = '', '', '', '' for value, label in parsed_address: if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'): street += f' {value}' elif label == 'PlaceName': city = value elif label == 'StateName': state = value elif label == 'ZipCode': zipcode = value cert_number = item.get("primaryCertification", {}).get("certificationNumber") if not cert_number: cert_number = item.get("certificationNumber") org_data = { "Organization Name": org_name, "Accreditation Status": status, "Start Date": srt_date, "Expiration Date": exp_date, "Program": program, "Address": address, "Street": street.strip(), "City": city, "State": state, "ZipCode": zipcode, "Certification Number": cert_number } organizations.append(org_data) # Fetch detail API calls in parallel and update the progress bar. with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: future_to_org = { executor.submit(fetch_detail, org["Certification Number"], org, headers): org for org in organizations if org["Certification Number"] } total = len(future_to_org) completed = 0 for future in concurrent.futures.as_completed(future_to_org): try: detail_rows = future.result() all_rows.extend(detail_rows) except Exception as exc: org = future_to_org[future] st.write(f"Error fetching detail for {org['Organization Name']}: {exc}") completed += 1 if progress_bar is not None and total > 0: progress_bar.progress(min(100, int(100 * completed / total))) return pd.DataFrame(all_rows) def _parse_accreditation_blocks(detail_soup): """ Parse accreditation blocks (
) and return a list of dicts. """ results = [] blocks = detail_soup.find_all('div', class_='main_cont_det') for block in blocks: start_date, expiration_date = '', '' site_program, site_service = '', '' for p in block.find_all('p'): text = p.get_text(strip=True) if 'Date:' in text: m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text) if m: start_date = m.group(1) expiration_date = m.group(2) elif 'Program:' in text: site_program = text.split('Program:')[-1].strip() elif 'Service:' in text: site_service = text.split('Service:')[-1].strip() results.append({ "Start Date": start_date, "Expiration Date": expiration_date, "SiteProgram": site_program, "SiteService": site_service }) return results def _extract_original_program(detail_soup): """ Extract the original Program value from the detail soup. """ program = '' for p in detail_soup.find_all('p'): if 'Program:' in p.get_text(): program = p.get_text(strip=True).split('Program:')[-1].strip() break return program def _fetch_detail_for_company(company, base_url, headers, cookies): """ For a given company from the ACHC main API, fetch the detail API, parse the HTML detail, and return one or more rows. """ rows = [] company_id = company["company_id"] detail_payload = f'action=view_provider_details&data_company_id={company_id}' try: detail_resp = requests.post(base_url, headers=headers, cookies=cookies, data=detail_payload) detail_resp.raise_for_status() detail_json = detail_resp.json() detail_html = detail_json.get('response_html', '') detail_soup = BeautifulSoup(detail_html, 'html.parser') original_program = _extract_original_program(detail_soup) acc_blocks = _parse_accreditation_blocks(detail_soup) if not acc_blocks: rows.append({ "Organization Name": company["org_name"], "Start Date": '', "Expiration Date": '', "Accreditation Status": "N/A", "Program": original_program, "SiteProgram": '', "SiteService": '', "Address": company["address"], "Street": company["street"], "City": company["city"], "State": company["state"], "ZipCode": company["zipcode"] }) else: for block in acc_blocks: rows.append({ "Organization Name": company["org_name"], "Start Date": block["Start Date"], "Expiration Date": block["Expiration Date"], "Accreditation Status": "N/A", "Program": original_program, "SiteProgram": block["SiteProgram"], "SiteService": block["SiteService"], "Address": company["address"], "Street": company["street"], "City": company["city"], "State": company["state"], "ZipCode": company["zipcode"] }) except Exception as e: st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}") return rows def scrape_achc(progress_bar=None): """ Scrape ACHC data: 1. Call the main API to get HTML. 2. Parse each company’s info. 3. In parallel, call the detail API to get accreditation details. Returns a pandas DataFrame. """ headers = { 'accept': 'application/json, text/javascript, */*; q=0.01', 'accept-language': 'en-US,en;q=0.8', 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 'origin': 'https://www.achc.org', 'priority': 'u=1, i', 'referer': 'https://www.achc.org/find-a-provider/', 'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'sec-gpc': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' } cookies = { 'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}' } base_url = 'https://www.achc.org/wp-admin/admin-ajax.php' main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search=' try: main_resp = requests.post(base_url, headers=headers, cookies=cookies, data=main_payload) main_resp.raise_for_status() main_json = main_resp.json() except Exception as e: st.write(f"Error fetching ACHC main API: {e}") return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name']) main_html = main_json.get('response_html', '') main_soup = BeautifulSoup(main_html, 'html.parser') company_items = main_soup.find_all('li') companies = [] for item in company_items: list_box = item.find('div', class_='list_cont_box') if not list_box: continue org_tag = list_box.find('b', class_='company_name') org_name = org_tag.get_text(strip=True) if org_tag else '' address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')] address = ' '.join(address_parts) parsed = usaddress.parse(address) street, city, state, zipcode = '', '', '', '' for value, label in parsed: if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'): street += f' {value}' elif label == 'PlaceName': city = value elif label == 'StateName': state = value elif label == 'ZipCode': zipcode = value view_more = item.find('p', class_='view_more_eye') if not view_more or not view_more.has_attr('data-company-id'): continue company_id = view_more['data-company-id'] companies.append({ "company_id": company_id, "org_name": org_name, "address": address, "street": street.strip(), "city": city, "state": state, "zipcode": zipcode }) detail_rows_all = [] with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: futures = [ executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies) for comp in companies ] total = len(futures) completed = 0 for future in concurrent.futures.as_completed(futures): try: rows = future.result() detail_rows_all.extend(rows) except Exception as exc: st.write(f"Error fetching ACHC detail: {exc}") completed += 1 if progress_bar is not None and total > 0: progress_bar.progress(min(100, int(100 * completed / total))) df = pd.DataFrame(detail_rows_all, columns=[ "Organization Name", "Start Date", "Expiration Date", "Accreditation Status", "Program", "SiteProgram", "SiteService", "Address", "Street", "City", "State", "ZipCode" ]) return df # --- Streamlit UI --- st.title("Accreditation Data Scraper") st.write("Click the button below to start scraping and generate an Excel file.") def run_scraper(): # Scrape URAC data with its own progress bar. with st.spinner("Scraping URAC data..."): urac_progress = st.progress(0) urac_df = scrape_urac(progress_bar=urac_progress) # Scrape ACHC data with its own progress bar. with st.spinner("Scraping ACHC data..."): achc_progress = st.progress(0) achc_df = scrape_achc(progress_bar=achc_progress) # Merge data and write to an in-memory Excel file. with st.spinner("Merging data and generating Excel..."): merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer", suffixes=("_URAC", "_ACHC")) output = io.BytesIO() with pd.ExcelWriter(output, engine='xlsxwriter') as writer: if not urac_df.empty: urac_df.to_excel(writer, sheet_name="URAC", index=False) if not achc_df.empty: achc_df.to_excel(writer, sheet_name="ACHC", index=False) if not urac_df.empty and not achc_df.empty: merged_df.to_excel(writer, sheet_name="Merged", index=False) # writer.save() output.seek(0) return output if st.button("Start Scraping"): excel_data = run_scraper() st.success("Scraping completed!") st.download_button( label="Download Excel File", data=excel_data, file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" )