salmanmapkar committed
Commit c7bccba · verified · 1 Parent(s): 8fd8293

Update app.py

Files changed (1)
  1. app.py +27 -26
app.py CHANGED
@@ -12,8 +12,8 @@ import io
 
 def fetch_detail(cert_number, main_data, headers):
     """
-    For a given certification number, call the URAC detail API and return a list of rows,
-    one per site address. If no site records are returned, a row with blank site fields is returned.
+    For a given certification number, call the URAC detail API and return a list of rows.
+    If no site records are returned, a row with blank site fields is returned.
     """
     detail_rows = []
     url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
@@ -23,7 +23,6 @@ def fetch_detail(cert_number, main_data, headers):
         detail_data = response.json()
         entities = detail_data.get("certificationEntities", [])
         if not entities:
-            # No site records: return row with blank site fields.
             row = main_data.copy()
             row.update({
                 "Site Name": None,
@@ -71,12 +70,12 @@ def fetch_detail(cert_number, main_data, headers):
         st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
     return detail_rows
 
-def scrape_urac():
+def scrape_urac(progress_bar=None):
     """
     Scrape URAC accreditation data:
       1. Call the main filter API.
       2. Parse organization details.
-      3. For each organization, call the detail API (in parallel) to get one row per site address.
+      3. For each organization, call the detail API in parallel to get one row per site address.
     Returns a pandas DataFrame.
     """
     organizations = []
@@ -143,7 +142,7 @@ def scrape_urac():
         st.write("Error processing URAC main API:", e)
         return pd.DataFrame()
 
-    # Parse each organization item.
+    # Parse organization items.
     for item in data.get('items', []):
         entity = item.get('entity', {})
         org_name = entity.get('name', None)
@@ -170,7 +169,6 @@ def scrape_urac():
                 state = value
             elif label == 'ZipCode':
                 zipcode = value
-        # Get certification number.
         cert_number = item.get("primaryCertification", {}).get("certificationNumber")
         if not cert_number:
             cert_number = item.get("certificationNumber")
@@ -189,7 +187,7 @@ def scrape_urac():
         }
         organizations.append(org_data)
 
-    # Use a thread pool to fetch details in parallel.
+    # Fetch detail API calls in parallel and update the progress bar.
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
         future_to_org = {
             executor.submit(fetch_detail, org["Certification Number"], org, headers): org
@@ -205,6 +203,8 @@ def scrape_urac():
             org = future_to_org[future]
             st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
         completed += 1
+        if progress_bar is not None and total > 0:
+            progress_bar.progress(min(100, int(100 * completed / total)))
     return pd.DataFrame(all_rows)
 
 def _parse_accreditation_blocks(detail_soup):
@@ -298,7 +298,7 @@ def _fetch_detail_for_company(company, base_url, headers, cookies):
         st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
     return rows
 
-def scrape_achc():
+def scrape_achc(progress_bar=None):
     """
     Scrape ACHC data:
       1. Call the main API to get HTML.
@@ -334,7 +334,7 @@ def scrape_achc():
         main_json = main_resp.json()
     except Exception as e:
         st.write(f"Error fetching ACHC main API: {e}")
-        return pd.DataFrame()
+        return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name'])
 
     main_html = main_json.get('response_html', '')
     main_soup = BeautifulSoup(main_html, 'html.parser')
@@ -346,7 +346,6 @@ def scrape_achc():
             continue
         org_tag = list_box.find('b', class_='company_name')
         org_name = org_tag.get_text(strip=True) if org_tag else ''
-        # Join all <p> texts for the address.
         address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
         address = ' '.join(address_parts)
         parsed = usaddress.parse(address)
@@ -389,6 +388,8 @@ def scrape_achc():
         except Exception as exc:
             st.write(f"Error fetching ACHC detail: {exc}")
         completed += 1
+        if progress_bar is not None and total > 0:
+            progress_bar.progress(min(100, int(100 * completed / total)))
     df = pd.DataFrame(detail_rows_all, columns=[
         "Organization Name",
         "Start Date",
@@ -411,28 +412,28 @@ st.title("Accreditation Data Scraper")
 st.write("Click the button below to start scraping and generate an Excel file.")
 
 def run_scraper():
-    progress_bar = st.progress(0)
-
+    # Scrape URAC data with its own progress bar.
     with st.spinner("Scraping URAC data..."):
-        urac_df = scrape_urac()
-        progress_bar.progress(33)
-
+        urac_progress = st.progress(0)
+        urac_df = scrape_urac(progress_bar=urac_progress)
+    # Scrape ACHC data with its own progress bar.
     with st.spinner("Scraping ACHC data..."):
-        achc_df = scrape_achc()
-        progress_bar.progress(66)
-
+        achc_progress = st.progress(0)
+        achc_df = scrape_achc(progress_bar=achc_progress)
+    # Merge data and write to an in-memory Excel file.
     with st.spinner("Merging data and generating Excel..."):
-        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
+        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                              suffixes=("_URAC", "_ACHC"))
-        # Write to an in-memory bytes buffer.
         output = io.BytesIO()
         with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-            urac_df.to_excel(writer, sheet_name="URAC", index=False)
-            achc_df.to_excel(writer, sheet_name="ACHC", index=False)
-            merged_df.to_excel(writer, sheet_name="Merged", index=False)
-            writer.save()
+            if not urac_df.empty:
+                urac_df.to_excel(writer, sheet_name="URAC", index=False)
+            if not achc_df.empty:
+                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
+            if not urac_df.empty and not achc_df.empty:
+                merged_df.to_excel(writer, sheet_name="Merged", index=False)
+            # writer.save()
         output.seek(0)
-        progress_bar.progress(100)
         return output
 
 if st.button("Start Scraping"):
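
The common thread in the two scrape functions above is that the single progress bar advanced at fixed checkpoints (33, 66, 100) is replaced by an optional progress_bar argument that each function advances as its thread-pool futures complete. Below is a minimal sketch of that pattern, assuming a Streamlit context; run_parallel, fetch_item, and items are illustrative names, not code from app.py:

import concurrent.futures

import streamlit as st

def fetch_item(item):
    # Stand-in for a per-item network call such as fetch_detail() (illustrative only).
    return item

def run_parallel(items, progress_bar=None, max_workers=10):
    # Submit one future per item and advance the bar as each future finishes.
    results = []
    total = len(items)
    completed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_item, item) for item in items]
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
            except Exception as exc:
                st.write(f"Error fetching item: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                # st.progress accepts an int from 0 to 100; min() caps rounding overshoot.
                progress_bar.progress(min(100, int(100 * completed / total)))
    return results

# Usage, mirroring the reworked run_scraper():
#     bar = st.progress(0)
#     rows = run_parallel(list(range(50)), progress_bar=bar)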
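The ACHC error path now returns pd.DataFrame({"Organization Name":[]}, columns=['Organization Name']) rather than a bare pd.DataFrame(), most likely so the later outer merge on "Organization Name" still finds its key column when the ACHC fetch fails. A small stand-alone illustration with made-up frames (the empty key column is pinned to object dtype so the merge keys stay compatible):

import pandas as pd

urac_df = pd.DataFrame({"Organization Name": ["Org A"], "City": ["Austin"]})
# Shape of the fallback return: the key column exists but holds no rows.
achc_df = pd.DataFrame({"Organization Name": pd.Series(dtype="object")})

# An outer merge keeps every organization from either source; it would raise
# a KeyError if the right-hand frame lacked the "Organization Name" column.
merged = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                  suffixes=("_URAC", "_ACHC"))
print(merged)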
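Finally, writer.save() is commented out rather than called because pd.ExcelWriter, used as a context manager, finalizes the workbook when the with block exits, and recent pandas releases dropped the explicit save() method. A short sketch of the in-memory pattern, with placeholder data and sheet name:

import io

import pandas as pd

df = pd.DataFrame({"Organization Name": ["Example Org"]})

output = io.BytesIO()
# Exiting the with block closes the writer and saves the workbook; no writer.save() needed.
# engine='xlsxwriter' requires the xlsxwriter package, as in app.py.
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
    if not df.empty:
        df.to_excel(writer, sheet_name="URAC", index=False)
output.seek(0)  # rewind so the buffer can be handed to st.download_button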