Spaces:

tferhan
/

data_gov_ma

Sleeping

App Files Files Community

tferhan committed on Jun 4, 2024

Commit

1ab54f4

verified ·

1 Parent(s): 165eb5e

Delete document_scrapped.py

Browse files

Files changed (1) hide show

document_scrapped.py +0 -161

document_scrapped.py DELETED Viewed

@@ -1,161 +0,0 @@
-# -*- coding: utf-8 -*-
-"""document_scrapped.ipynb
-Automatically generated by Colab.
-Original file is located at
-    https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
-"""
-# NOTE: the commands below are IPython/Colab shell escapes (leading "!").
-# They are not valid Python syntax, so they are kept here as comments only;
-# run them in a shell or a notebook cell before importing this module.
-# NOTE(review): `from docx import Document` is normally provided by the
-# `python-docx` distribution; confirm whether the separate `docx`
-# distribution listed below is actually required.
-# !pip install -r requirements.txt
-# !pip install python-docx
-# !pip install docx
-# !pip install PyMuPDF
-# !pip install python-pptx
-from bs4 import BeautifulSoup
-import requests
-import json
-import io
-import fitz
-from pptx import Presentation
-import chardet
-from docx import Document
-import pandas as pd
-def downl(url, timeout=30):
-    """Return the href of the last link in the page's last dropdown menu.
-
-    Fetches *url*, parses the HTML with the lxml parser, takes the last
-    ``<ul class="dropdown-menu">`` element, and returns the ``href`` of the
-    ``<a>`` inside that list's last ``<li>``. Callers in this file use the
-    result as the URL of the file to download.
-
-    Args:
-        url: Address of the HTML page to inspect.
-        timeout: Seconds passed to ``requests.get`` so that a stalled server
-            cannot block the caller indefinitely.
-
-    Returns:
-        The ``href`` value (may be ``None`` if the anchor has no ``href``),
-        or ``None`` when the request fails, the status code is not 200, or
-        the expected markup is missing.
-    """
-    try:
-        rq = requests.get(url, timeout=timeout)
-        if rq.status_code != 200:
-            return None
-        soup = BeautifulSoup(rq.text, features='lxml')
-        menus = soup.find_all('ul', class_='dropdown-menu')
-        if not menus:
-            return None
-        items = menus[-1].find_all('li')
-        if not items:
-            return None
-        anchor = items[-1].find('a')
-        if anchor is None:
-            return None
-        return anchor.get('href')
-    except Exception:
-        # Deliberately best-effort: network or parser failures mean "no link".
-        return None
-def excel(link):
-    """Download the Excel file linked from *link* and return rows as records.
-
-    The download URL is resolved with ``downl(link)``. The workbook is read
-    with ``pandas.read_excel``; when it has more than 50 rows a fixed random
-    sample of 50 rows (``random_state=42``) is used, otherwise every row is
-    used.
-
-    Args:
-        link: Page URL handed to ``downl`` to locate the file.
-
-    Returns:
-        A list of dicts (one per row, as produced by
-        ``DataFrame.to_json(orient='records')``), or ``None`` when the
-        download fails, the sheet is empty, or any error occurs.
-    """
-    try:
-        ls = downl(link)
-        response = requests.get(ls)
-        if response.status_code != 200:
-            print("Failed to download file")
-            return None
-        # Wrap the raw bytes in a file-like object, as csv() does.
-        df = pd.read_excel(io.BytesIO(response.content))
-        if df.empty:
-            print("The DataFrame is empty.")
-            return None
-        # Previously the result was only built for sheets with more than 50
-        # rows, so smaller sheets always ended up returning None.
-        if df.shape[0] > 50:
-            sample_df = df.sample(n=50, random_state=42)
-        else:
-            sample_df = df
-        return json.loads(sample_df.to_json(orient='records'))
-    except Exception:
-        # Best-effort, matching the other loaders: any failure yields None.
-        return None
-def csv(link):
-    """Download the CSV file linked from *link* and return rows as records.
-
-    The download URL comes from ``downl(link)`` and is printed. The payload's
-    encoding is guessed with ``chardet`` and the file is parsed with ``;`` as
-    the separator. Frames with more than 50 rows are reduced to a fixed
-    random sample of 50 rows (``random_state=42``).
-
-    Returns:
-        A list of dicts (one per row), or ``None`` when the status code is
-        not 200, the frame is empty, or any error occurs.
-    """
-    try:
-        download_url = downl(link)
-        print(download_url)
-        reply = requests.get(download_url)
-        if reply.status_code != 200:
-            return None
-        raw = reply.content
-        guessed = chardet.detect(raw)['encoding']
-        frame = pd.read_csv(io.BytesIO(raw), encoding=guessed, sep=';')
-        if frame.empty:
-            print("The DataFrame is empty.")
-            return None
-        rows = frame.sample(n=50, random_state=42) if frame.shape[0] > 50 else frame
-        return json.loads(rows.to_json(orient='records'))
-    except Exception:
-        return None
-def docx(url):
-    """Download the .docx file linked from *url* and return its text.
-
-    The download URL is resolved with ``downl(url)``; the document is opened
-    from memory and the text of every paragraph is joined with newlines.
-
-    Returns:
-        The joined paragraph text, or ``None`` after printing the error when
-        anything fails (including a non-success HTTP status).
-    """
-    try:
-        download_url = downl(url)
-        reply = requests.get(download_url)
-        # Turn HTTP error statuses into exceptions handled below.
-        reply.raise_for_status()
-        document = Document(io.BytesIO(reply.content))
-        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
-    except Exception as err:
-        print(f"An error occurred: {err}")
-        return None
-def pdf(url):
-    """Download the PDF linked from *url* and return its text.
-
-    The download URL is resolved with ``downl(url)``; the PDF is opened from
-    memory with PyMuPDF (``fitz``) and the text of every page is joined with
-    newlines.
-
-    Returns:
-        The joined page text, or ``None`` after printing the error when
-        anything fails (including a non-success HTTP status).
-    """
-    try:
-        ls = downl(url)
-        response = requests.get(ls)
-        response.raise_for_status()  # Turn HTTP error statuses into exceptions
-        file_stream = io.BytesIO(response.content)
-        # The context manager closes the document even if text extraction
-        # raises; previously the document was never closed.
-        with fitz.open(stream=file_stream, filetype='pdf') as pdf_document:
-            return '\n'.join(page.get_text() for page in pdf_document)
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        return None
-def pptx(url):
-    """Download the .pptx file linked from *url* and return its text.
-
-    The download URL is resolved with ``downl(url)``; the presentation is
-    opened from memory and the ``text`` of every shape that has such an
-    attribute is collected slide by slide and joined with newlines.
-
-    Returns:
-        The joined shape text, or ``None`` after printing the error when
-        anything fails (including a non-success HTTP status).
-    """
-    try:
-        download_url = downl(url)
-        reply = requests.get(download_url)
-        reply.raise_for_status()
-        deck = Presentation(io.BytesIO(reply.content))
-        texts = [
-            shape.text
-            for slide in deck.slides
-            for shape in slide.shapes
-            if hasattr(shape, "text")
-        ]
-        return '\n'.join(texts)
-    except Exception as err:
-        print(f"An error occurred: {err}")
-        return None