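"""
Streamlit app that scrapes accreditation directories from URAC and ACHC,
merges the two result sets on organization name, and serves the combined
data as a downloadable Excel workbook.
"""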
import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io
# --- Scraper functions (adapted from scraper.py) ---
def fetch_detail(cert_number, main_data, headers):
"""
For a given certification number, call the URAC detail API and return a list of rows.
If no site records are returned, a row with blank site fields is returned.
"""
detail_rows = []
url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
try:
        response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
detail_data = response.json()
entities = detail_data.get("certificationEntities", [])
if not entities:
row = main_data.copy()
row.update({
"Site Name": None,
"Site Address": None,
"Site Street": None,
"Site City": None,
"Site State": None,
"Site ZipCode": None
})
detail_rows.append(row)
else:
for entity_item in entities:
site_entity = entity_item.get("entity", {})
site_name = site_entity.get("name", None)
# Combine the site address parts.
site_address_parts = []
for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
part = site_entity.get(key)
if part:
site_address_parts.append(part)
site_address = ', '.join(site_address_parts)
# Parse the site address using usaddress.
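                # usaddress.parse() returns (token, label) pairs; parsing an address
                # like "123 Main St, Springfield, IL, 62704" (illustrative) yields
                # roughly [('123', 'AddressNumber'), ('Main', 'StreetName'),
                # ('St,', 'StreetNamePostType'), ('Springfield,', 'PlaceName'),
                # ('IL,', 'StateName'), ('62704', 'ZipCode')], which the loop below
                # stitches back into street/city/state/zip fields.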
parsed_site = usaddress.parse(site_address)
site_street, site_city, site_state, site_zipcode = '', '', '', ''
for value, label in parsed_site:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
site_street += f' {value}'
elif label == 'PlaceName':
site_city = value
elif label == 'StateName':
site_state = value
elif label == 'ZipCode':
site_zipcode = value
row = main_data.copy()
row.update({
"Site Name": site_name,
"Site Address": site_address,
"Site Street": site_street.strip(),
"Site City": site_city,
"Site State": site_state,
"Site ZipCode": site_zipcode
})
detail_rows.append(row)
    except Exception as e:
        # Re-raise so the caller (running in the main Streamlit thread) can report
        # the failure; st.* calls made from worker threads may not render.
        raise RuntimeError(f"Error fetching detail for cert_number {cert_number}: {e}") from e
return detail_rows
def scrape_urac(progress_bar=None):
"""
Scrape URAC accreditation data:
1. Call the main filter API.
2. Parse organization details.
3. For each organization, call the detail API in parallel to get one row per site address.
Returns a pandas DataFrame.
"""
organizations = []
all_rows = []
headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'content-type': 'application/json',
'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
'origin': 'https://accreditnet.urac.org',
'priority': 'u=1, i',
'referer': 'https://accreditnet.urac.org/directory/',
'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'sec-gpc': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
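    # Filter payload for the directory search: restrict results to
    # 'Accreditation Program' decisions for the 'Specialty Pharmacy' program,
    # ordered by certification number.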
json_data = {
'filter': {
'allParts': [
{
'name': 'completedApplicationDecisionItem.typeDisplay.value',
'comparator': 0,
'valueType': 0,
'textValue': 'Accreditation Program',
'integerValue': None,
'decimalValue': None,
'dateTimeValue': None,
'booleanValue': None,
'innerFilter': None,
},
{
'name': 'certificateType.programName',
'comparator': 0,
'valueType': 0,
'textValue': 'Specialty Pharmacy',
'integerValue': None,
'decimalValue': None,
'dateTimeValue': None,
'booleanValue': None,
'innerFilter': None,
},
],
'anyParts': [],
'notParts': [],
},
'orderBy': 'certificationNumber',
'pageSize': 15,
'limit': 100,
}
try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
response.raise_for_status()
data = response.json()
except Exception as e:
st.write("Error processing URAC main API:", e)
return pd.DataFrame()
# Parse organization items.
for item in data.get('items', []):
entity = item.get('entity', {})
org_name = entity.get('name', None)
decision = item.get('completedApplicationDecisionItem', {})
outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        start_date = item.get('issuedDate', None)
exp_date = item.get('expirationDate', None)
program = item.get('certificateType', {}).get('displayName', None)
address_parts = []
for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
part = entity.get(key)
if part:
address_parts.append(part)
address = ', '.join(address_parts)
parsed_address = usaddress.parse(address)
street, city, state, zipcode = '', '', '', ''
for value, label in parsed_address:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
street += f' {value}'
elif label == 'PlaceName':
city = value
elif label == 'StateName':
state = value
elif label == 'ZipCode':
zipcode = value
cert_number = item.get("primaryCertification", {}).get("certificationNumber")
if not cert_number:
cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": start_date,
"Expiration Date": exp_date,
"Program": program,
"Address": address,
"Street": street.strip(),
"City": city,
"State": state,
"ZipCode": zipcode,
"Certification Number": cert_number
}
organizations.append(org_data)
# Fetch detail API calls in parallel and update the progress bar.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
future_to_org = {
executor.submit(fetch_detail, org["Certification Number"], org, headers): org
for org in organizations if org["Certification Number"]
}
total = len(future_to_org)
completed = 0
for future in concurrent.futures.as_completed(future_to_org):
try:
detail_rows = future.result()
all_rows.extend(detail_rows)
except Exception as exc:
org = future_to_org[future]
st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
completed += 1
if progress_bar is not None and total > 0:
progress_bar.progress(min(100, int(100 * completed / total)))
return pd.DataFrame(all_rows)
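# Hypothetical ad-hoc use outside the Streamlit UI: scrape_urac() returns one row
# per organization/site-address pair, e.g.
#   df = scrape_urac()
#   df[["Organization Name", "Site Name", "Site City", "Site State"]].head()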
def _parse_accreditation_blocks(detail_soup):
"""
Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts.
"""
results = []
blocks = detail_soup.find_all('div', class_='main_cont_det')
for block in blocks:
start_date, expiration_date = '', ''
site_program, site_service = '', ''
for p in block.find_all('p'):
text = p.get_text(strip=True)
if 'Date:' in text:
m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
if m:
start_date = m.group(1)
expiration_date = m.group(2)
elif 'Program:' in text:
site_program = text.split('Program:')[-1].strip()
elif 'Service:' in text:
site_service = text.split('Service:')[-1].strip()
results.append({
"Start Date": start_date,
"Expiration Date": expiration_date,
"SiteProgram": site_program,
"SiteService": site_service
})
return results
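# Example (illustrative values): a main_cont_det block whose <p> tags read
# "Date: 01/01/2024 Through 01/01/2027", "Program: Specialty Pharmacy" and
# "Service: Mail Order" is returned as
#   {"Start Date": "01/01/2024", "Expiration Date": "01/01/2027",
#    "SiteProgram": "Specialty Pharmacy", "SiteService": "Mail Order"}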
def _extract_original_program(detail_soup):
"""
Extract the original Program value from the detail soup.
"""
program = ''
for p in detail_soup.find_all('p'):
if 'Program:' in p.get_text():
program = p.get_text(strip=True).split('Program:')[-1].strip()
break
return program
def _fetch_detail_for_company(company, base_url, headers, cookies):
"""
For a given company from the ACHC main API, fetch the detail API,
parse the HTML detail, and return one or more rows.
"""
rows = []
company_id = company["company_id"]
detail_payload = f'action=view_provider_details&data_company_id={company_id}'
try:
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies, data=detail_payload, timeout=30)
detail_resp.raise_for_status()
detail_json = detail_resp.json()
detail_html = detail_json.get('response_html', '')
detail_soup = BeautifulSoup(detail_html, 'html.parser')
original_program = _extract_original_program(detail_soup)
acc_blocks = _parse_accreditation_blocks(detail_soup)
if not acc_blocks:
rows.append({
"Organization Name": company["org_name"],
"Start Date": '',
"Expiration Date": '',
"Accreditation Status": "N/A",
"Program": original_program,
"SiteProgram": '',
"SiteService": '',
"Address": company["address"],
"Street": company["street"],
"City": company["city"],
"State": company["state"],
"ZipCode": company["zipcode"]
})
else:
for block in acc_blocks:
rows.append({
"Organization Name": company["org_name"],
"Start Date": block["Start Date"],
"Expiration Date": block["Expiration Date"],
"Accreditation Status": "N/A",
"Program": original_program,
"SiteProgram": block["SiteProgram"],
"SiteService": block["SiteService"],
"Address": company["address"],
"Street": company["street"],
"City": company["city"],
"State": company["state"],
"ZipCode": company["zipcode"]
})
    except Exception as e:
        # Re-raise so the caller (running in the main Streamlit thread) can report
        # the failure; st.* calls made from worker threads may not render.
        raise RuntimeError(f"Error fetching ACHC detail for company ID {company_id}: {e}") from e
return rows
def scrape_achc(progress_bar=None):
"""
Scrape ACHC data:
1. Call the main API to get HTML.
2. Parse each company’s info.
3. In parallel, call the detail API to get accreditation details.
Returns a pandas DataFrame.
"""
headers = {
'accept': 'application/json, text/javascript, */*; q=0.01',
'accept-language': 'en-US,en;q=0.8',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'origin': 'https://www.achc.org',
'priority': 'u=1, i',
'referer': 'https://www.achc.org/find-a-provider/',
'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'sec-gpc': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
cookies = {
'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
}
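    # ACHC serves its provider directory through the WordPress admin-ajax endpoint:
    # the 'filter_provider_data' action returns the listing HTML, and the
    # 'view_provider_details' action (see _fetch_detail_for_company) returns the
    # per-company accreditation detail HTML.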
base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies, data=main_payload, timeout=30)
main_resp.raise_for_status()
main_json = main_resp.json()
except Exception as e:
st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame(columns=["Organization Name"])
main_html = main_json.get('response_html', '')
main_soup = BeautifulSoup(main_html, 'html.parser')
company_items = main_soup.find_all('li')
companies = []
for item in company_items:
list_box = item.find('div', class_='list_cont_box')
if not list_box:
continue
org_tag = list_box.find('b', class_='company_name')
org_name = org_tag.get_text(strip=True) if org_tag else ''
address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
address = ' '.join(address_parts)
parsed = usaddress.parse(address)
street, city, state, zipcode = '', '', '', ''
for value, label in parsed:
if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
street += f' {value}'
elif label == 'PlaceName':
city = value
elif label == 'StateName':
state = value
elif label == 'ZipCode':
zipcode = value
view_more = item.find('p', class_='view_more_eye')
if not view_more or not view_more.has_attr('data-company-id'):
continue
company_id = view_more['data-company-id']
companies.append({
"company_id": company_id,
"org_name": org_name,
"address": address,
"street": street.strip(),
"city": city,
"state": state,
"zipcode": zipcode
})
detail_rows_all = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
for comp in companies
]
total = len(futures)
completed = 0
for future in concurrent.futures.as_completed(futures):
try:
rows = future.result()
detail_rows_all.extend(rows)
except Exception as exc:
st.write(f"Error fetching ACHC detail: {exc}")
completed += 1
if progress_bar is not None and total > 0:
progress_bar.progress(min(100, int(100 * completed / total)))
df = pd.DataFrame(detail_rows_all, columns=[
"Organization Name",
"Start Date",
"Expiration Date",
"Accreditation Status",
"Program",
"SiteProgram",
"SiteService",
"Address",
"Street",
"City",
"State",
"ZipCode"
])
return df
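# As with scrape_urac(), scrape_achc() can be run on its own (hypothetical):
#   achc_df = scrape_achc()
#   achc_df[["Organization Name", "Program", "SiteProgram", "City", "State"]].head()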
# --- Streamlit UI ---
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
def run_scraper():
# Scrape URAC data with its own progress bar.
with st.spinner("Scraping URAC data..."):
urac_progress = st.progress(0)
urac_df = scrape_urac(progress_bar=urac_progress)
# Scrape ACHC data with its own progress bar.
with st.spinner("Scraping ACHC data..."):
achc_progress = st.progress(0)
achc_df = scrape_achc(progress_bar=achc_progress)
# Merge data and write to an in-memory Excel file.
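    # Note: the outer merge on "Organization Name" keeps organizations found by only
    # one scraper and, where both sides hold multiple rows for the same organization,
    # produces one merged row per URAC-row/ACHC-row pair.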
with st.spinner("Merging data and generating Excel..."):
merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
suffixes=("_URAC", "_ACHC"))
output = io.BytesIO()
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
if not urac_df.empty:
urac_df.to_excel(writer, sheet_name="URAC", index=False)
if not achc_df.empty:
achc_df.to_excel(writer, sheet_name="ACHC", index=False)
if not urac_df.empty and not achc_df.empty:
merged_df.to_excel(writer, sheet_name="Merged", index=False)
output.seek(0)
return output
if st.button("Start Scraping"):
excel_data = run_scraper()
st.success("Scraping completed!")
st.download_button(
label="Download Excel File",
data=excel_data,
file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)