Update app.py
app.py
CHANGED
@@ -12,8 +12,8 @@ import io
 
 def fetch_detail(cert_number, main_data, headers):
     """
-    For a given certification number, call the URAC detail API and return a list of rows
-
+    For a given certification number, call the URAC detail API and return a list of rows.
+    If no site records are returned, a row with blank site fields is returned.
     """
     detail_rows = []
     url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
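The docstring above describes one detail call per certification number. As a rough sketch of that request pattern (the use of `requests`, the timeout value, and `raise_for_status()` are assumptions, not taken from this diff):

import requests

def get_detail_json(cert_number, headers):
    # Hypothetical helper mirroring the URL pattern shown above.
    url = (
        "https://accreditnet.urac.org/api/urac/rest/directoryInfo/"
        f"{cert_number}/certificationEntityInfo/type/Accreditation"
    )
    response = requests.get(url, headers=headers, timeout=30)  # timeout is an assumption
    response.raise_for_status()
    return response.json()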
@@ -23,7 +23,6 @@ def fetch_detail(cert_number, main_data, headers):
         detail_data = response.json()
         entities = detail_data.get("certificationEntities", [])
         if not entities:
-            # No site records: return row with blank site fields.
             row = main_data.copy()
             row.update({
                 "Site Name": None,
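The comment removed here documented the fallback that the surrounding lines still implement: when the detail response contains no certificationEntities, the organization is emitted as a single row whose site columns are blank. A minimal sketch of that pattern, with the extra site field names invented for illustration:

def rows_for_org(main_data, entities):
    # Sketch only; "Site Address" and "Site City" are invented column names.
    if not entities:
        row = main_data.copy()
        row.update({"Site Name": None, "Site Address": None, "Site City": None})
        return [row]
    return [dict(main_data, **{"Site Name": e.get("name")}) for e in entities]

print(rows_for_org({"Organization Name": "Acme"}, []))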
@@ -71,12 +70,12 @@ def fetch_detail(cert_number, main_data, headers):
         st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
     return detail_rows
 
-def scrape_urac():
+def scrape_urac(progress_bar=None):
     """
     Scrape URAC accreditation data:
     1. Call the main filter API.
     2. Parse organization details.
-    3. For each organization, call the detail API
+    3. For each organization, call the detail API in parallel to get one row per site address.
     Returns a pandas DataFrame.
     """
     organizations = []
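The new progress_bar=None parameter lets the page hand an st.progress widget into the scraper while keeping the function callable without one. A minimal sketch of that optional-widget pattern (the loop body is a stand-in for the real API calls, and the snippet assumes it runs under streamlit run):

import time
import streamlit as st

def scrape_demo(progress_bar=None):
    items = list(range(5))        # stand-in for the organizations to process
    total = len(items)
    results = []
    for done, item in enumerate(items, start=1):
        time.sleep(0.1)           # stand-in for a detail API call
        results.append(item)
        if progress_bar is not None and total > 0:
            progress_bar.progress(min(100, int(100 * done / total)))
    return results

rows = scrape_demo(progress_bar=st.progress(0))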
@@ -143,7 +142,7 @@ def scrape_urac():
         st.write("Error processing URAC main API:", e)
         return pd.DataFrame()
 
-    # Parse
+    # Parse organization items.
     for item in data.get('items', []):
         entity = item.get('entity', {})
         org_name = entity.get('name', None)
@@ -170,7 +169,6 @@ def scrape_urac():
                 state = value
             elif label == 'ZipCode':
                 zipcode = value
-        # Get certification number.
         cert_number = item.get("primaryCertification", {}).get("certificationNumber")
         if not cert_number:
             cert_number = item.get("certificationNumber")
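The lookup above prefers primaryCertification.certificationNumber and falls back to a top-level certificationNumber. The same fallback can be written as a single expression; this is purely illustrative and not part of the diff:

def get_cert_number(item):
    # `or` falls through to the top-level key when the nested value is missing or empty.
    return (
        item.get("primaryCertification", {}).get("certificationNumber")
        or item.get("certificationNumber")
    )

print(get_cert_number({"certificationNumber": "C-123"}))                           # C-123
print(get_cert_number({"primaryCertification": {"certificationNumber": "P-9"}}))   # P-9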
@@ -189,7 +187,7 @@ def scrape_urac():
         }
         organizations.append(org_data)
 
-    #
+    # Fetch detail API calls in parallel and update the progress bar.
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
         future_to_org = {
             executor.submit(fetch_detail, org["Certification Number"], org, headers): org
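The new comment describes the fan-out the following lines implement: one fetch_detail call per organization, submitted to a thread pool and keyed by a future-to-organization map. A condensed, self-contained sketch of that pattern, with a toy fetch function standing in for fetch_detail:

import concurrent.futures

def fetch(cert_number):                      # stand-in for fetch_detail
    return [{"Certification Number": cert_number}]

orgs = [{"Certification Number": n} for n in ("A1", "B2", "C3")]
all_rows = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_org = {executor.submit(fetch, o["Certification Number"]): o for o in orgs}
    for future in concurrent.futures.as_completed(future_to_org):
        try:
            all_rows.extend(future.result())
        except Exception as exc:
            print(f"Error fetching detail for {future_to_org[future]}: {exc}")
print(len(all_rows))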
@@ -205,6 +203,8 @@ def scrape_urac():
                 org = future_to_org[future]
                 st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
             completed += 1
+            if progress_bar is not None and total > 0:
+                progress_bar.progress(min(100, int(100 * completed / total)))
     return pd.DataFrame(all_rows)
 
 def _parse_accreditation_blocks(detail_soup):
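The two added lines turn the completed/total counters into a 0-100 integer for st.progress, clamping with min() and guarding against an empty work list. The same arithmetic in isolation:

def progress_value(completed, total):
    # Mirrors the guard and clamp used above; returns an int in [0, 100].
    if total <= 0:
        return 0
    return min(100, int(100 * completed / total))

assert progress_value(5, 10) == 50
assert progress_value(12, 10) == 100   # never exceeds 100 even if counts drift
assert progress_value(0, 0) == 0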
@@ -298,7 +298,7 @@ def _fetch_detail_for_company(company, base_url, headers, cookies):
         st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
     return rows
 
-def scrape_achc():
+def scrape_achc(progress_bar=None):
     """
     Scrape ACHC data:
     1. Call the main API to get HTML.
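As the docstring says, the ACHC main API returns HTML (wrapped in JSON) rather than structured records, which is why BeautifulSoup does the parsing further down. A small sketch of lifting markup out of a 'response_html' field; the sample payload and the 'list_box' class selector are assumptions for illustration:

from bs4 import BeautifulSoup

main_json = {   # invented sample payload; the real one comes from the ACHC endpoint
    "response_html": "<div class='list_box'><b class='company_name'>Acme Home Care</b>"
                     "<p>1 Main St</p><p>Springfield, IL 62701</p></div>"
}
soup = BeautifulSoup(main_json.get("response_html", ""), "html.parser")
for box in soup.find_all("div", class_="list_box"):       # class name assumed
    name_tag = box.find("b", class_="company_name")
    print(name_tag.get_text(strip=True) if name_tag else "")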
@@ -334,7 +334,7 @@ def scrape_achc():
         main_json = main_resp.json()
     except Exception as e:
         st.write(f"Error fetching ACHC main API: {e}")
-        return pd.DataFrame()
+        return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name'])
 
     main_html = main_json.get('response_html', '')
     main_soup = BeautifulSoup(main_html, 'html.parser')
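The changed error-path return matters for the merge at the bottom of the file: pd.merge(..., on="Organization Name") raises if either frame lacks that column, so an empty frame that still carries the merge key keeps the rest of the pipeline alive. A small demonstration with toy data; pd.DataFrame(columns=[...]) is used here as a simpler stand-in for the keyed empty frame:

import pandas as pd

urac_df = pd.DataFrame({"Organization Name": ["Acme"], "State": ["TX"]})
empty_plain = pd.DataFrame()                                  # old error-path return
empty_keyed = pd.DataFrame(columns=["Organization Name"])     # keyed empty frame

try:
    pd.merge(urac_df, empty_plain, on="Organization Name", how="outer")
except KeyError as exc:
    print("merge with a plain empty frame fails:", exc)

print(pd.merge(urac_df, empty_keyed, on="Organization Name", how="outer"))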
@@ -346,7 +346,6 @@ def scrape_achc():
             continue
         org_tag = list_box.find('b', class_='company_name')
         org_name = org_tag.get_text(strip=True) if org_tag else ''
-        # Join all <p> texts for the address.
         address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
         address = ' '.join(address_parts)
         parsed = usaddress.parse(address)
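The removed comment described the step the code still performs: every <p> text in a listing is joined into one address string, which usaddress then labels token by token. A standalone sketch with an invented address (requires the usaddress package):

import usaddress

address_parts = ["4400 West Main Street", "Suite 210", "Richmond, VA 23230"]   # invented
address = " ".join(address_parts)
parsed = usaddress.parse(address)    # list of (token, label) tuples
state = next((tok for tok, label in parsed if label == "StateName"), "")
zipcode = next((tok for tok, label in parsed if label == "ZipCode"), "")
print(state, zipcode)                # expected: VA 23230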
@@ -389,6 +388,8 @@ def scrape_achc():
             except Exception as exc:
                 st.write(f"Error fetching ACHC detail: {exc}")
             completed += 1
+            if progress_bar is not None and total > 0:
+                progress_bar.progress(min(100, int(100 * completed / total)))
     df = pd.DataFrame(detail_rows_all, columns=[
         "Organization Name",
         "Start Date",
@@ -411,28 +412,28 @@ st.title("Accreditation Data Scraper")
 st.write("Click the button below to start scraping and generate an Excel file.")
 
 def run_scraper():
-
-
+    # Scrape URAC data with its own progress bar.
     with st.spinner("Scraping URAC data..."):
-
-
-
+        urac_progress = st.progress(0)
+        urac_df = scrape_urac(progress_bar=urac_progress)
+    # Scrape ACHC data with its own progress bar.
     with st.spinner("Scraping ACHC data..."):
-
-
-
+        achc_progress = st.progress(0)
+        achc_df = scrape_achc(progress_bar=achc_progress)
+    # Merge data and write to an in-memory Excel file.
     with st.spinner("Merging data and generating Excel..."):
-        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
+        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                              suffixes=("_URAC", "_ACHC"))
-        # Write to an in-memory bytes buffer.
         output = io.BytesIO()
         with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-            urac_df.
-
-
-
+            if not urac_df.empty:
+                urac_df.to_excel(writer, sheet_name="URAC", index=False)
+            if not achc_df.empty:
+                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
+            if not urac_df.empty and not achc_df.empty:
+                merged_df.to_excel(writer, sheet_name="Merged", index=False)
+            # writer.save()
         output.seek(0)
-        progress_bar.progress(100)
         return output
 
 if st.button("Start Scraping"):
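The rewritten run_scraper joins the two sources on "Organization Name" with an outer merge (organizations present in only one source are kept, and clashing column names get _URAC/_ACHC suffixes), then writes each frame to its own sheet of an in-memory workbook. A compact sketch of that output step with toy frames; it assumes the xlsxwriter engine is installed, and it relies on the ExcelWriter context manager saving the workbook on exit, which is why the explicit writer.save() is left commented out in the new code:

import io
import pandas as pd

urac_df = pd.DataFrame({"Organization Name": ["Acme"], "State": ["TX"]})                   # toy data
achc_df = pd.DataFrame({"Organization Name": ["Acme", "Beta"], "Program": ["HH", "HOS"]})  # toy data

merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                     suffixes=("_URAC", "_ACHC"))

output = io.BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:   # saves on exiting the block
    if not urac_df.empty:
        urac_df.to_excel(writer, sheet_name="URAC", index=False)
    if not achc_df.empty:
        achc_df.to_excel(writer, sheet_name="ACHC", index=False)
    if not urac_df.empty and not achc_df.empty:
        merged_df.to_excel(writer, sheet_name="Merged", index=False)
output.seek(0)   # rewind so the buffer can be handed to st.download_button
print(len(output.getvalue()), "bytes written")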