tferhan committed on
Commit
1ab54f4
·
verified ·
1 Parent(s): 165eb5e

Delete document_scrapped.py

Browse files
Files changed (1) hide show
  1. document_scrapped.py +0 -161
document_scrapped.py DELETED
@@ -1,161 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """document_scrapped.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
8
- """
9
-
10
# Dependencies. The "!pip ..." notebook magics exported by Colab are not valid
# Python syntax in a .py module, so they are kept here as shell instructions
# (run them in a terminal, or prefix them with "!" inside a notebook cell):
#
#   pip install -r requirements.txt
#   pip install python-docx PyMuPDF python-pptx
#
# NOTE: the PyPI distribution named "docx" is deliberately not installed. It is
# a separate legacy package that provides the same `docx` import name as
# python-docx, so installing both makes `from docx import Document` unreliable.
19
-
20
- from bs4 import BeautifulSoup
21
- import requests
22
- import json
23
- import io
24
- import fitz
25
- from pptx import Presentation
26
- import chardet
27
- from docx import Document
28
- import pandas as pd
29
-
30
-
31
def downl(url, timeout=30):
    """Return the download link advertised on the page at *url*, or None.

    The page is fetched and parsed with BeautifulSoup. The link is the ``href``
    of the first ``<a>`` inside the last ``<li>`` of the last
    ``<ul class="dropdown-menu">`` found in the document.

    Args:
        url: Address of the HTML page to inspect.
        timeout: Seconds to wait on the server before giving up, so that an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The link as a string (relative hrefs are resolved against the URL the
        page was actually served from), or None when the page cannot be
        fetched, does not answer with HTTP 200, or lacks the expected
        menu/list/anchor structure.
    """
    from urllib.parse import urljoin

    try:
        rq = requests.get(url, timeout=timeout)
        if rq.status_code != 200:
            return None

        bs = BeautifulSoup(rq.text, features='lxml')

        # Check each level explicitly instead of relying on an IndexError or
        # AttributeError being swallowed further down.
        menus = bs.find_all('ul', class_='dropdown-menu')
        if not menus:
            return None
        items = menus[-1].find_all('li')
        if not items:
            return None
        anchor = items[-1].find('a')
        if anchor is None:
            return None
        href = anchor.get('href')
        if not href:
            return None

        # An absolute href passes through unchanged; a relative one becomes a
        # URL that can be handed straight to requests.get().
        return urljoin(rq.url, href)
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print(f"An error occurred: {e}")
        return None
42
-
43
-
44
-
45
def excel(link, timeout=30):
    """Download the Excel file linked from *link* and return its rows as records.

    The download URL is resolved with ``downl(link)``. Sheets with more than 50
    rows are reduced to a reproducible random sample of 50 rows
    (``random_state=42``); smaller sheets are returned in full.

    Args:
        link: Address of the page that advertises the Excel file.
        timeout: Seconds to wait on the file server before giving up.

    Returns:
        A list of dicts, one per row (the ``orient='records'`` JSON layout), or
        None when the file cannot be downloaded or parsed, or has no rows.
    """
    try:
        ls = downl(link)
        if ls is None:
            print("Failed to download file")
            return None

        response = requests.get(ls, timeout=timeout)
        if response.status_code != 200:
            print("Failed to download file")
            return None

        # Wrap the payload in a file-like object rather than passing raw bytes.
        df = pd.read_excel(io.BytesIO(response.content))
        if df.empty:
            print("The DataFrame is empty.")
            return None

        # Always bind sample_df: small sheets are used as-is. Previously the
        # name was only assigned for sheets with more than 50 rows, so every
        # smaller sheet ended in a NameError and a None result.
        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df

        return json.loads(sample_df.to_json(orient='records'))
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
63
-
64
-
65
def csv(link, sep=';', timeout=30):
    """Download the CSV file linked from *link* and return its rows as records.

    The download URL is resolved with ``downl(link)``. The text encoding is
    guessed from the downloaded bytes with ``chardet``. Files with more than 50
    rows are reduced to a reproducible random sample of 50 rows
    (``random_state=42``); smaller files are returned in full.

    Args:
        link: Address of the page that advertises the CSV file.
        sep: Field separator passed to ``pandas.read_csv``.
        timeout: Seconds to wait on the file server before giving up.

    Returns:
        A list of dicts, one per row (the ``orient='records'`` JSON layout), or
        None when the file cannot be downloaded or parsed, or has no rows.
    """
    try:
        ls = downl(link)
        if ls is None:
            print("Failed to download file")
            return None

        response = requests.get(ls, timeout=timeout)
        if response.status_code != 200:
            print("Failed to download file")
            return None

        file_content = response.content
        detected_encoding = chardet.detect(file_content)['encoding']
        df = pd.read_csv(
            io.BytesIO(file_content),
            encoding=detected_encoding,
            sep=sep,
        )
        if df.empty:
            print("The DataFrame is empty.")
            return None

        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df

        return json.loads(sample_df.to_json(orient='records'))
    except Exception as e:
        # Report the failure like the other loaders instead of hiding it.
        print(f"An error occurred: {e}")
        return None
91
-
92
-
93
def docx(url):
    """Return the paragraph text of the .docx file linked from *url*.

    The download URL is resolved with ``downl(url)``, the file is fetched and
    opened in memory, and the text of every paragraph is joined with newlines.

    Returns:
        The document text as a single string, or None if any step fails (the
        error is printed).
    """
    try:
        download_link = downl(url)

        # Fetch the document; an HTTP error status is turned into an exception.
        reply = requests.get(download_link)
        reply.raise_for_status()

        # Parse straight from memory, no temporary file needed.
        document = Document(io.BytesIO(reply.content))

        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    except Exception as err:
        print(f"An error occurred: {err}")
        return None
113
-
114
-
115
def pdf(url, timeout=30):
    """Return the text of the PDF file linked from *url*.

    The download URL is resolved with ``downl(url)``, the file is fetched and
    opened in memory with PyMuPDF, and the text of every page is joined with
    newlines.

    Args:
        url: Address of the page that advertises the PDF file.
        timeout: Seconds to wait on the file server before giving up.

    Returns:
        The document text as a single string, or None if any step fails (the
        error is printed).
    """
    try:
        ls = downl(url)

        # Download the PDF file
        response = requests.get(ls, timeout=timeout)
        response.raise_for_status()  # Ensure we notice bad responses

        # Read the PDF file from memory. The context manager closes the
        # document even if text extraction fails part-way through.
        file_stream = io.BytesIO(response.content)
        with fitz.open(stream=file_stream, filetype='pdf') as pdf_document:
            # Documents iterate over their pages in order.
            return '\n'.join(page.get_text() for page in pdf_document)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
136
-
137
-
138
-
139
-
140
def pptx(url):
    """Return the text of the .pptx presentation linked from *url*.

    The download URL is resolved with ``downl(url)``, the file is fetched and
    opened in memory, and the text of every shape that exposes a ``text``
    attribute is collected slide by slide and joined with newlines.

    Returns:
        The presentation text as a single string, or None if any step fails
        (the error is printed).
    """
    try:
        download_link = downl(url)
        reply = requests.get(download_link)
        reply.raise_for_status()

        # Parse straight from memory, no temporary file needed.
        deck = Presentation(io.BytesIO(reply.content))

        collected = []
        for slide in deck.slides:
            # Shapes without a text attribute are skipped.
            collected.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )

        return '\n'.join(collected)
    except Exception as err:
        print(f"An error occurred: {err}")
        return None