Spaces:

salmanmapkar
/

Random_Stuff

Sleeping

File size: 17,787 Bytes

import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io

# --- Scraper functions (adapted from your scraper.py) ---

def fetch_detail(cert_number, main_data, headers):
    """
    For a given certification number, call the URAC detail API and return a list of rows.
    If no site records are returned, a row with blank site fields is returned.
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        detail_data = response.json()
        entities = detail_data.get("certificationEntities", [])
        if not entities:
            row = main_data.copy()
            row.update({
                "Site Name": None,
                "Site Address": None,
                "Site Street": None,
                "Site City": None,
                "Site State": None,
                "Site ZipCode": None
            })
            detail_rows.append(row)
        else:
            for entity_item in entities:
                site_entity = entity_item.get("entity", {})
                site_name = site_entity.get("name", None)
                # Combine the site address parts.
                site_address_parts = []
                for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
                    part = site_entity.get(key)
                    if part:
                        site_address_parts.append(part)
                site_address = ', '.join(site_address_parts)
                # Parse the site address using usaddress.
                parsed_site = usaddress.parse(site_address)
                site_street, site_city, site_state, site_zipcode = '', '', '', ''
                for value, label in parsed_site:
                    if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                        site_street += f' {value}'
                    elif label == 'PlaceName':
                        site_city = value
                    elif label == 'StateName':
                        site_state = value
                    elif label == 'ZipCode':
                        site_zipcode = value
                row = main_data.copy()
                row.update({
                    "Site Name": site_name,
                    "Site Address": site_address,
                    "Site Street": site_street.strip(),
                    "Site City": site_city,
                    "Site State": site_state,
                    "Site ZipCode": site_zipcode
                })
                detail_rows.append(row)
    except Exception as e:
        st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
    return detail_rows

def scrape_urac(progress_bar=None):
    """
    Scrape URAC accreditation data:
      1. Call the main filter API.
      2. Parse organization details.
      3. For each organization, call the detail API in parallel to get one row per site address.
    Returns a pandas DataFrame.
    """
    organizations = []
    all_rows = []
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
    try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()
    
    # Parse organization items.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name', None)
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        srt_date = item.get('issuedDate', None)
        exp_date = item.get('expirationDate', None)
        program = item.get('certificateType', {}).get('displayName', None)
        address_parts = []
        for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": srt_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)
    
    # Fetch detail API calls in parallel and update the progress bar.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        total = len(future_to_org)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(all_rows)

def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts.
    """
    results = []
    blocks = detail_soup.find_all('div', class_='main_cont_det')
    for block in blocks:
        start_date, expiration_date = '', ''
        site_program, site_service = '', ''
        for p in block.find_all('p'):
            text = p.get_text(strip=True)
            if 'Date:' in text:
                m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if m:
                    start_date = m.group(1)
                    expiration_date = m.group(2)
            elif 'Program:' in text:
                site_program = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                site_service = text.split('Service:')[-1].strip()
        results.append({
            "Start Date": start_date,
            "Expiration Date": expiration_date,
            "SiteProgram": site_program,
            "SiteService": site_service
        })
    return results

def _extract_original_program(detail_soup):
    """
    Extract the original Program value from the detail soup.
    """
    program = ''
    for p in detail_soup.find_all('p'):
        if 'Program:' in p.get_text():
            program = p.get_text(strip=True).split('Program:')[-1].strip()
            break
    return program

def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    For a given company from the ACHC main API, fetch the detail API,
    parse the HTML detail, and return one or more rows.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    try:
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies, data=detail_payload)
        detail_resp.raise_for_status()
        detail_json = detail_resp.json()
        detail_html = detail_json.get('response_html', '')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        
        original_program = _extract_original_program(detail_soup)
        acc_blocks = _parse_accreditation_blocks(detail_soup)
        if not acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": '',
                "Expiration Date": '',
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": '',
                "SiteService": '',
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
        else:
            for block in acc_blocks:
                rows.append({
                    "Organization Name": company["org_name"],
                    "Start Date": block["Start Date"],
                    "Expiration Date": block["Expiration Date"],
                    "Accreditation Status": "N/A",
                    "Program": original_program,
                    "SiteProgram": block["SiteProgram"],
                    "SiteService": block["SiteService"],
                    "Address": company["address"],
                    "Street": company["street"],
                    "City": company["city"],
                    "State": company["state"],
                    "ZipCode": company["zipcode"]
                })
    except Exception as e:
        st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
    return rows

def scrape_achc(progress_bar=None):
    """
    Scrape ACHC data:
      1. Call the main API to get HTML.
      2. Parse each company’s info.
      3. In parallel, call the detail API to get accreditation details.
    Returns a pandas DataFrame.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='

    try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies, data=main_payload)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name'])

    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })

    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
            for comp in companies
        ]
        total = len(futures)
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                st.write(f"Error fetching ACHC detail: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df

# --- Streamlit UI ---

st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")

def run_scraper():
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)
    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)
    # Merge data and write to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                             suffixes=("_URAC", "_ACHC"))
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if not urac_df.empty:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if not achc_df.empty:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            if not urac_df.empty and not achc_df.empty:
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
            # writer.save()
        output.seek(0)
    return output

if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )