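"""Streamlit app that scrapes the URAC and ACHC accreditation directories and
exports the combined results as an Excel workbook.

Run locally (assuming this file is saved as app.py):
    streamlit run app.py
"""
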
import streamlit as st
import pandas as pd
import requests
import usaddress
import concurrent.futures
import re
from bs4 import BeautifulSoup
from datetime import datetime
import io
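# Third-party dependencies (assumed install):
#   pip install streamlit pandas requests usaddress beautifulsoup4 xlsxwriter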

# --- Scraper functions (adapted from the original scraper.py) ---

def fetch_detail(cert_number, main_data, headers):
    """
    For a given certification number, call the URAC detail API and return a list of rows.
    If no site records are returned, a row with blank site fields is returned.
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        detail_data = response.json()
        entities = detail_data.get("certificationEntities", [])
        if not entities:
            row = main_data.copy()
            row.update({
                "Site Name": None,
                "Site Address": None,
                "Site Street": None,
                "Site City": None,
                "Site State": None,
                "Site ZipCode": None
            })
            detail_rows.append(row)
        else:
            for entity_item in entities:
                site_entity = entity_item.get("entity", {})
                site_name = site_entity.get("name", None)
                # Combine the site address parts.
                site_address_parts = []
                for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
                    part = site_entity.get(key)
                    if part:
                        site_address_parts.append(part)
                site_address = ', '.join(site_address_parts)
                # Parse the site address using usaddress.
                parsed_site = usaddress.parse(site_address)
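                # usaddress.parse returns (token, label) pairs; the street is rebuilt
                # below from its number, name, and post-type components.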
                site_street, site_city, site_state, site_zipcode = '', '', '', ''
                for value, label in parsed_site:
                    if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                        site_street += f' {value}'
                    elif label == 'PlaceName':
                        site_city = value
                    elif label == 'StateName':
                        site_state = value
                    elif label == 'ZipCode':
                        site_zipcode = value
                row = main_data.copy()
                row.update({
                    "Site Name": site_name,
                    "Site Address": site_address,
                    "Site Street": site_street.strip(),
                    "Site City": site_city,
                    "Site State": site_state,
                    "Site ZipCode": site_zipcode
                })
                detail_rows.append(row)
    except Exception as e:
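        # Note: st.write from a worker thread may not render in the Streamlit UI
        # (no script run context); errors could instead be collected and reported
        # from the main thread.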
        st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
    return detail_rows

def scrape_urac(progress_bar=None):
    """
    Scrape URAC accreditation data:
      1. Call the main filter API.
      2. Parse organization details.
      3. For each organization, call the detail API in parallel to get one row per site address.
    Returns a pandas DataFrame.
    """
    organizations = []
    all_rows = []
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
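    # The filter payload restricts results to 'Specialty Pharmacy' accreditation
    # programs and orders them by certification number.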
    try:
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()
    
    # Parse organization items.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name', None)
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
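        # Prefer the decision outcome when present; otherwise fall back to the
        # item's effective status name.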
        status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        srt_date = item.get('issuedDate', None)
        exp_date = item.get('expirationDate', None)
        program = item.get('certificateType', {}).get('displayName', None)
        address_parts = []
        for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
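        # Prefer the certification number on the primary certification record,
        # falling back to the item's top-level certificationNumber when absent.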
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": srt_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)
    
    # Fetch detail API calls in parallel and update the progress bar.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        total = len(future_to_org)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(all_rows)

def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts.
    """
    results = []
    blocks = detail_soup.find_all('div', class_='main_cont_det')
    for block in blocks:
        start_date, expiration_date = '', ''
        site_program, site_service = '', ''
        for p in block.find_all('p'):
            text = p.get_text(strip=True)
            if 'Date:' in text:
                m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if m:
                    start_date = m.group(1)
                    expiration_date = m.group(2)
            elif 'Program:' in text:
                site_program = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                site_service = text.split('Service:')[-1].strip()
        results.append({
            "Start Date": start_date,
            "Expiration Date": expiration_date,
            "SiteProgram": site_program,
            "SiteService": site_service
        })
    return results

def _extract_original_program(detail_soup):
    """
    Extract the original Program value from the detail soup.
    """
    program = ''
    for p in detail_soup.find_all('p'):
        if 'Program:' in p.get_text():
            program = p.get_text(strip=True).split('Program:')[-1].strip()
            break
    return program

def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    For a given company from the ACHC main API, fetch the detail API,
    parse the HTML detail, and return one or more rows.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    try:
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies, data=detail_payload)
        detail_resp.raise_for_status()
        detail_json = detail_resp.json()
        detail_html = detail_json.get('response_html', '')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
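        # The detail endpoint returns an HTML fragment; the overall Program line and
        # the per-accreditation 'main_cont_det' blocks are parsed out of it below.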
        
        original_program = _extract_original_program(detail_soup)
        acc_blocks = _parse_accreditation_blocks(detail_soup)
        if not acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": '',
                "Expiration Date": '',
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": '',
                "SiteService": '',
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
        else:
            for block in acc_blocks:
                rows.append({
                    "Organization Name": company["org_name"],
                    "Start Date": block["Start Date"],
                    "Expiration Date": block["Expiration Date"],
                    "Accreditation Status": "N/A",
                    "Program": original_program,
                    "SiteProgram": block["SiteProgram"],
                    "SiteService": block["SiteService"],
                    "Address": company["address"],
                    "Street": company["street"],
                    "City": company["city"],
                    "State": company["state"],
                    "ZipCode": company["zipcode"]
                })
    except Exception as e:
        st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
    return rows

def scrape_achc(progress_bar=None):
    """
    Scrape ACHC data:
      1. Call the main API to get HTML.
      2. Parse each company’s info.
      3. In parallel, call the detail API to get accreditation details.
    Returns a pandas DataFrame.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
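    # provider_id=6 is the provider-type filter carried over from the original
    # scraper (assumed to be the specialty pharmacy category); the remaining
    # filter fields are left blank so every matching provider is returned.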

    try:
        main_resp = requests.post(base_url, headers=headers, cookies=cookies, data=main_payload)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame(columns=["Organization Name"])

    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
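    # Each <li> that contains a 'list_cont_box' div is one provider card; items
    # without it, or without a data-company-id, are skipped.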
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })

    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
            for comp in companies
        ]
        total = len(futures)
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                st.write(f"Error fetching ACHC detail: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df

# --- Streamlit UI ---

st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")

def run_scraper():
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)
    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)
    # Merge data and write to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        # Only merge when both frames have data; an empty frame may lack the
        # "Organization Name" join column and would make pd.merge raise.
        if not urac_df.empty and not achc_df.empty:
            merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                                 suffixes=("_URAC", "_ACHC"))
        else:
            merged_df = pd.DataFrame()
        output = io.BytesIO()
        # The context manager closes (and saves) the workbook, so no explicit
        # writer.save() call is needed.
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if not urac_df.empty:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if not achc_df.empty:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            if not merged_df.empty:
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
    return output

if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )