import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io
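# Third-party dependencies assumed installed: streamlit, pandas, requests,
# usaddress, beautifulsoup4, and xlsxwriter (used by pd.ExcelWriter below).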
# --- Scraper functions (adapted from your scraper.py) ---

def fetch_detail(cert_number, main_data, headers):
    """
    For a given certification number, call the URAC detail API and return a list of rows.
    If no site records are returned, a single row with blank site fields is returned.
    Exceptions propagate to the caller, which reports them from the main thread
    (st.write is unreliable inside worker threads, which lack Streamlit's script-run context).
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    detail_data = response.json()
    entities = detail_data.get("certificationEntities", [])
    if not entities:
        row = main_data.copy()
        row.update({
            "Site Name": None,
            "Site Address": None,
            "Site Street": None,
            "Site City": None,
            "Site State": None,
            "Site ZipCode": None
        })
        detail_rows.append(row)
    else:
        for entity_item in entities:
            site_entity = entity_item.get("entity", {})
            site_name = site_entity.get("name")
            # Combine the site address parts into one comma-separated string.
            site_address_parts = []
            for key in ('line1', 'line2', 'city', 'stateName', 'zipcode'):
                part = site_entity.get(key)
                if part:
                    site_address_parts.append(part)
            site_address = ', '.join(site_address_parts)
            # Split the combined address back into components with usaddress.
            parsed_site = usaddress.parse(site_address)
            site_street, site_city, site_state, site_zipcode = '', '', '', ''
            for value, label in parsed_site:
                value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
                if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                    site_street += f' {value}'
                elif label == 'PlaceName':
                    site_city = value
                elif label == 'StateName':
                    site_state = value
                elif label == 'ZipCode':
                    site_zipcode = value
            row = main_data.copy()
            row.update({
                "Site Name": site_name,
                "Site Address": site_address,
                "Site Street": site_street.strip(),
                "Site City": site_city,
                "Site State": site_state,
                "Site ZipCode": site_zipcode
            })
            detail_rows.append(row)
    return detail_rows
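# Illustrative sketch of what usaddress.parse returns: a list of (token, label)
# pairs, with punctuation left attached to tokens (exact tagging can vary by input):
#   usaddress.parse('123 Main St, Springfield, IL 62704')
#   -> [('123', 'AddressNumber'), ('Main', 'StreetName'), ('St,', 'StreetNamePostType'),
#       ('Springfield,', 'PlaceName'), ('IL', 'StateName'), ('62704', 'ZipCode')]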
def scrape_urac(progress_bar=None):
    """
    Scrape URAC accreditation data:
    1. Call the main filter API.
    2. Parse organization details.
    3. For each organization, call the detail API in parallel to get one row per site address.
    Returns a pandas DataFrame.
    """
    organizations = []
    all_rows = []
    # Request headers copied from a browser session; the customerid value appears
    # to be required by the directory API.
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Filter payload: restrict results to "Accreditation Program" decisions for
    # the "Specialty Pharmacy" program.
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
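    # A note on the filter semantics above, inferred from the field names rather
    # than official API docs: 'allParts' entries appear to be ANDed together
    # ('anyParts' would be ORed, 'notParts' negated), and comparator 0 with
    # valueType 0 appears to request an exact text match on the named field.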
    try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()
    # Parse organization items.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name')
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        status = outcome if outcome is not None else item.get('effectiveStatusName')
        start_date = item.get('issuedDate')
        exp_date = item.get('expirationDate')
        program = item.get('certificateType', {}).get('displayName')
        address_parts = []
        for key in ('line1', 'line2', 'city', 'stateName', 'zipcode'):
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # Prefer the primary certification number; fall back to the item-level one.
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": start_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)
    # Fetch the detail API calls in parallel, updating the progress bar as futures complete.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        total = len(future_to_org)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(all_rows)
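# The pattern above (one future per organization, drained with as_completed()
# while ticking the progress bar) is reused for the ACHC scraper below.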
def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts.
    """
    results = []
    blocks = detail_soup.find_all('div', class_='main_cont_det')
    for block in blocks:
        start_date, expiration_date = '', ''
        site_program, site_service = '', ''
        for p in block.find_all('p'):
            text = p.get_text(strip=True)
            if 'Date:' in text:
                m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if m:
                    start_date = m.group(1)
                    expiration_date = m.group(2)
            elif 'Program:' in text:
                site_program = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                site_service = text.split('Service:')[-1].strip()
        results.append({
            "Start Date": start_date,
            "Expiration Date": expiration_date,
            "SiteProgram": site_program,
            "SiteService": site_service
        })
    return results
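# The date regex above expects paragraph text shaped like this hypothetical
# example: "Date: 01/15/2023 Through 01/15/2026", which yields
# start_date = '01/15/2023' and expiration_date = '01/15/2026'.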
def _extract_original_program(detail_soup):
    """
    Extract the original Program value from the detail soup.
    """
    program = ''
    for p in detail_soup.find_all('p'):
        if 'Program:' in p.get_text():
            program = p.get_text(strip=True).split('Program:')[-1].strip()
            break
    return program
def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    For a given company from the ACHC main API, fetch the detail API,
    parse the HTML detail, and return one or more rows.
    Exceptions propagate to the caller, which reports them from the main thread.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    detail_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                data=detail_payload, timeout=30)
    detail_resp.raise_for_status()
    detail_json = detail_resp.json()
    detail_html = detail_json.get('response_html', '')
    detail_soup = BeautifulSoup(detail_html, 'html.parser')
    original_program = _extract_original_program(detail_soup)
    acc_blocks = _parse_accreditation_blocks(detail_soup)
    if not acc_blocks:
        # No accreditation blocks: emit a single row with blank date/site fields.
        rows.append({
            "Organization Name": company["org_name"],
            "Start Date": '',
            "Expiration Date": '',
            "Accreditation Status": "N/A",
            "Program": original_program,
            "SiteProgram": '',
            "SiteService": '',
            "Address": company["address"],
            "Street": company["street"],
            "City": company["city"],
            "State": company["state"],
            "ZipCode": company["zipcode"]
        })
    else:
        # One row per accreditation block.
        for block in acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": block["Start Date"],
                "Expiration Date": block["Expiration Date"],
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": block["SiteProgram"],
                "SiteService": block["SiteService"],
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
    return rows
def scrape_achc(progress_bar=None):
    """
    Scrape ACHC data:
    1. Call the main API to get HTML.
    2. Parse each company's info.
    3. In parallel, call the detail API to get accreditation details.
    Returns a pandas DataFrame.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # This cookie appears to come from the site's popup plugin and simply
    # marks a promotional popup as already shown.
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
    try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                  data=main_payload, timeout=30)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        # Return an empty frame with the join key so downstream code still works.
        return pd.DataFrame(columns=["Organization Name"])
    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            value = value.rstrip(',')  # usaddress leaves trailing punctuation on tokens
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # The "view more" element carries the company ID needed for the detail call.
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })
    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Map futures back to their companies so errors can name the company.
        future_to_company = {
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies): comp
            for comp in companies
        }
        total = len(future_to_company)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_company):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                comp = future_to_company[future]
                st.write(f"Error fetching ACHC detail for company ID {comp['company_id']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    # Passing an explicit column list keeps the schema stable even when no rows were scraped.
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df
# --- Streamlit UI ---
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
def run_scraper():
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)
    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)
    # Merge data and write to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        output = io.BytesIO()
        # The ExcelWriter context manager saves the workbook on exit; no explicit save() is needed.
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if not urac_df.empty:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if not achc_df.empty:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            # Only merge when both sources returned data; otherwise the join key may be missing.
            if not urac_df.empty and not achc_df.empty:
                merged_df = pd.merge(urac_df, achc_df, on="Organization Name",
                                     how="outer", suffixes=("_URAC", "_ACHC"))
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
    return output
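# A minimal sketch of the in-memory Excel pattern used above (the names here
# are illustrative, not part of the app):
#   buf = io.BytesIO()
#   with pd.ExcelWriter(buf, engine='xlsxwriter') as writer:
#       pd.DataFrame({"a": [1]}).to_excel(writer, sheet_name="Sheet1", index=False)
#   buf.seek(0)  # rewind so st.download_button can read the bytes from the start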
if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )