data_gov_ma / document_scrapped.py
tferhan's picture
Upload document_scrapped.py
4d0ed30 verified
raw
history blame
3.94 kB
# -*- coding: utf-8 -*-
"""document_scrapped.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
"""
!pip install -r requirements.txt
!pip install python-docx
!pip install docx
!pip install PyMuPDF
!pip install python-pptx
from bs4 import BeautifulSoup
import requests
import json
import io
import fitz
from pptx import Presentation
import chardet
from docx import Document
import pandas as pd
def downl(url):
    """Scrape a dataset page and return the direct download link it exposes.

    The page is expected to contain one or more ``<ul class="dropdown-menu">``
    elements; the href of the last ``<li>``'s anchor inside the last such menu
    is taken as the resource download URL.

    Parameters
    ----------
    url : str
        URL of the dataset page to scrape.

    Returns
    -------
    str or None
        The extracted download link, or None on any failure (HTTP error,
        network problem, or unexpected page structure).
    """
    try:
        # A timeout prevents a dead server from hanging the caller forever.
        rq = requests.get(url, timeout=30)
        if rq.status_code != 200:
            return None
        bs = BeautifulSoup(rq.text, features='lxml')
        # Guard explicitly against a missing menu/anchor instead of relying
        # on IndexError/AttributeError being swallowed by the except below.
        menus = bs.find_all('ul', class_='dropdown-menu')
        if not menus:
            return None
        items = menus[-1].find_all('li')
        if not items:
            return None
        anchor = items[-1].find('a')
        return anchor.get('href') if anchor else None
    except Exception:
        # Broad catch is deliberate: any scraping failure maps to None,
        # which callers treat as "no downloadable resource found".
        return None
def excel(link):
    """Download the Excel resource behind a dataset page and return sample rows.

    Resolves the direct file URL via ``downl``, downloads it, loads it with
    pandas, and returns at most 50 rows as a list of record dicts.

    Parameters
    ----------
    link : str
        URL of the dataset page (not the file itself).

    Returns
    -------
    list[dict] or None
        Up to 50 rows (sampled with a fixed seed when the sheet is larger),
        or None on any failure.
    """
    try:
        ls = downl(link)
        response = requests.get(ls, timeout=30)
        if response.status_code == 200:
            file_content = response.content
            # Wrap the raw bytes in BytesIO: pandas expects a path or a
            # file-like object, not a bytes payload.
            df = pd.read_excel(io.BytesIO(file_content))
            if df.shape[0] > 50:
                sample_size = 50
                sample_df = df.sample(n=sample_size, random_state=42)
            else:
                # Bug fix: previously sample_df was never assigned for sheets
                # with <= 50 rows, so a NameError was raised and swallowed,
                # making small files always return None. Small files are now
                # returned in full.
                sample_df = df
            json_data = sample_df.to_json(orient='records')
            js = json.loads(json_data)
            return js
        else:
            print("Failed to download file")
    except Exception as e:
        return None
def csv(link):
    """Download the CSV resource behind a dataset page and return sample rows.

    Resolves the direct file URL via ``downl``, downloads it, sniffs the
    encoding with chardet, loads it with pandas, and returns at most 50 rows
    as a list of record dicts.

    Parameters
    ----------
    link : str
        URL of the dataset page (not the file itself).

    Returns
    -------
    list[dict] or None
        Up to 50 rows (sampled with a fixed seed when the file is larger),
        or None on any failure or an empty file.
    """
    try:
        ls = downl(link)
        # NOTE: the stray debug print(ls) was removed.
        # A timeout prevents a dead server from hanging the caller forever.
        response = requests.get(ls, timeout=30)
        if response.status_code == 200:
            file_content = response.content
            # Encoding is sniffed because the portal serves files in mixed
            # encodings (latin-1, utf-8, ...).
            detected_encoding = chardet.detect(file_content)['encoding']
            # assumes the portal's CSVs are semicolon-separated — TODO confirm
            df = pd.read_csv(io.BytesIO(file_content), encoding=detected_encoding, sep=';')
            if df.empty:
                print("The DataFrame is empty.")
                return None
            if df.shape[0] > 50:
                sample_size = 50
                sample_df = df.sample(n=sample_size, random_state=42)
            else:
                sample_df = df
            json_data = sample_df.to_json(orient='records')
            js = json.loads(json_data)
            return js
    except Exception as e:
        return None
def docx(url):
    """Fetch the .docx resource linked from a dataset page and return its text.

    Resolves the direct file URL via ``downl``, downloads the document, and
    joins the text of every paragraph with newlines.

    Parameters
    ----------
    url : str
        URL of the dataset page (not the file itself).

    Returns
    -------
    str or None
        The document's paragraph text, or None if anything goes wrong.
    """
    try:
        file_link = downl(url)
        resp = requests.get(file_link)
        resp.raise_for_status()  # surface HTTP errors as exceptions
        # python-docx reads from any file-like object.
        document = Document(io.BytesIO(resp.content))
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        return '\n'.join(paragraph_texts)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def pdf(url):
    """Fetch the PDF resource linked from a dataset page and return its text.

    Resolves the direct file URL via ``downl``, downloads the PDF, and joins
    the extracted text of every page with newlines.

    Parameters
    ----------
    url : str
        URL of the dataset page (not the file itself).

    Returns
    -------
    str or None
        The concatenated page text, or None if anything goes wrong.
    """
    try:
        file_link = downl(url)
        resp = requests.get(file_link)
        resp.raise_for_status()  # surface HTTP errors as exceptions
        document = fitz.open(stream=io.BytesIO(resp.content), filetype='pdf')
        # A fitz Document iterates over its pages in order, which is
        # equivalent to load_page(0..len-1).
        page_texts = [page.get_text() for page in document]
        return '\n'.join(page_texts)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def pptx(url):
    """Fetch the .pptx resource linked from a dataset page and return its text.

    Resolves the direct file URL via ``downl``, downloads the deck, and joins
    the text of every text-bearing shape on every slide with newlines.

    Parameters
    ----------
    url : str
        URL of the dataset page (not the file itself).

    Returns
    -------
    str or None
        The concatenated shape text, or None if anything goes wrong.
    """
    try:
        file_link = downl(url)
        resp = requests.get(file_link)
        resp.raise_for_status()  # surface HTTP errors as exceptions
        deck = Presentation(io.BytesIO(resp.content))
        # Only shapes that actually carry text expose a .text attribute.
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return '\n'.join(shape_texts)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None