Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""document_scrapped.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
"""
!pip install -r requirements.txt | |
!pip install python-docx | |
!pip install docx | |
!pip install PyMuPDF | |
!pip install python-pptx | |
from bs4 import BeautifulSoup | |
import requests | |
import json | |
import io | |
import fitz | |
from pptx import Presentation | |
import chardet | |
from docx import Document | |
import pandas as pd | |
def downl(url):
    """Scrape *url* and return the href of the last <a> inside the last
    <li> of the last ``ul.dropdown-menu`` on the page.

    Returns None on any failure (HTTP error, timeout, missing markup).

    NOTE(review): assumes the download link always lives in the final
    dropdown entry of the page -- confirm against the scraped site.
    """
    try:
        # Timeout added: a dead host would otherwise hang this call forever.
        rq = requests.get(url, timeout=30)
        if rq.status_code != 200:
            return None
        bs = BeautifulSoup(rq.text, features='lxml')
        menus = bs.find_all('ul', class_='dropdown-menu')
        if not menus:
            # Page layout changed / no dropdown present: fail explicitly
            # instead of relying on IndexError being swallowed below.
            return None
        lis = menus[-1].find_all('li')
        link = lis[-1].find('a').get('href')
        return link
    except Exception:
        # Best-effort scraper: any network/parse error yields None.
        return None
def excel(link):
    """Download the Excel file resolved from *link* via downl() and return
    up to 50 rows as a list of record dicts, or None on failure.

    Rows are subsampled with a fixed seed so repeated calls on the same
    file return the same sample.
    """
    try:
        ls = downl(link)
        if ls is None:  # upstream scrape failed
            return None
        response = requests.get(ls)
        if response.status_code != 200:
            print("Failed to download file")
            return None
        # Wrap the raw bytes in BytesIO: read_excel expects a path or
        # file-like object, not a bytes blob.
        df = pd.read_excel(io.BytesIO(response.content))
        # Bug fix: previously sample_df was never assigned for frames with
        # <= 50 rows, so the NameError was swallowed and small files
        # silently returned None. Mirrors the logic already used by csv().
        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df
        json_data = sample_df.to_json(orient='records')
        return json.loads(json_data)
    except Exception:
        return None
def csv(link):
    """Download the CSV file resolved from *link* via downl(), sniff its
    encoding, and return up to 50 rows as a list of record dicts, or None
    on failure (or when the file parses to an empty frame).

    NOTE(review): this function shadows the stdlib ``csv`` module name;
    the file never imports that module so it is safe here, but renaming
    would be clearer.
    """
    try:
        ls = downl(link)
        if ls is None:  # upstream scrape failed (requests.get(None) would raise)
            return None
        # Stray debug print(ls) removed.
        response = requests.get(ls)
        if response.status_code != 200:
            return None
        file_content = response.content
        # Source encodings vary, so detect before decoding.
        detected_encoding = chardet.detect(file_content)['encoding']
        # NOTE(review): separator is hard-coded to ';' -- confirm every
        # scraped CSV really uses semicolons.
        df = pd.read_csv(io.BytesIO(file_content), encoding=detected_encoding, sep=';')
        if df.empty:
            print("The DataFrame is empty.")
            return None
        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df
        json_data = sample_df.to_json(orient='records')
        return json.loads(json_data)
    except Exception:
        return None
def docx(url):
    """Fetch the .docx file resolved from *url* via downl() and return its
    paragraph text joined with newlines, or None if anything fails."""
    try:
        resolved = downl(url)
        response = requests.get(resolved)
        response.raise_for_status()  # surface HTTP errors as exceptions
        # python-docx reads from any file-like object.
        document = Document(io.BytesIO(response.content))
        paragraphs = [paragraph.text for paragraph in document.paragraphs]
        return '\n'.join(paragraphs)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def pdf(url):
    """Fetch the PDF file resolved from *url* via downl() and return the
    text of all its pages joined with newlines, or None on failure."""
    try:
        ls = downl(url)
        response = requests.get(ls)
        response.raise_for_status()  # surface HTTP errors as exceptions
        file_stream = io.BytesIO(response.content)
        pdf_document = fitz.open(stream=file_stream, filetype='pdf')
        try:
            # A fitz Document iterates over its pages directly.
            full_text = [page.get_text() for page in pdf_document]
        finally:
            # Fix: the original never closed the document handle (leak).
            pdf_document.close()
        return '\n'.join(full_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def pptx(url):
    """Fetch the .pptx file resolved from *url* via downl() and return the
    text of every text-bearing shape on every slide, newline-joined, or
    None if anything fails."""
    try:
        resolved = downl(url)
        response = requests.get(resolved)
        response.raise_for_status()
        deck = Presentation(io.BytesIO(response.content))
        # Not every shape carries text (pictures, tables), hence the guard.
        texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return '\n'.join(texts)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None