tferhan committed on
Commit 4d0ed30 · verified · 1 Parent(s): 0f970b7

Upload document_scrapped.py

Files changed (1)
  1. document_scrapped.py +161 -0
document_scrapped.py ADDED
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""document_scrapped.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
"""

# python-docx provides the `docx` module imported below; PyMuPDF provides `fitz`.
!pip install -r requirements.txt
!pip install python-docx
!pip install PyMuPDF
!pip install python-pptx

from bs4 import BeautifulSoup
import requests
import json
import io
import fitz  # PyMuPDF
from pptx import Presentation
import chardet
from docx import Document
import pandas as pd


def downl(url):
    """Return the href of the last <li> in the page's last 'dropdown-menu' <ul> (assumed to be the resource download link)."""
    try:
        rq = requests.get(url)
        if rq.status_code != 200:
            return None
        bs = BeautifulSoup(rq.text, features='lxml')
        lis = bs.find_all('ul', class_='dropdown-menu')[-1].find_all('li')
        link = lis[-1].find('a').get('href')
        return link
    except Exception:
        return None

def excel(link):
    """Download the Excel resource linked from `link` and return at most 50 rows as JSON records."""
    try:
        ls = downl(link)
        response = requests.get(ls)

        if response.status_code == 200:
            file_content = response.content
            df = pd.read_excel(io.BytesIO(file_content))
            if df.shape[0] > 50:
                sample_df = df.sample(n=50, random_state=42)
            else:
                # Keep the whole sheet when it has 50 rows or fewer.
                sample_df = df
            json_data = sample_df.to_json(orient='records')
            js = json.loads(json_data)
            return js
        else:
            print("Failed to download file")
            return None
    except Exception:
        return None

def csv(link):
    """Download the CSV resource linked from `link` (semicolon-separated) and return at most 50 rows as JSON records."""
    try:
        ls = downl(link)
        print(ls)
        response = requests.get(ls)

        if response.status_code == 200:
            file_content = response.content
            # Detect the encoding before parsing; these files are not always UTF-8.
            detected_encoding = chardet.detect(file_content)['encoding']
            df = pd.read_csv(io.BytesIO(file_content), encoding=detected_encoding, sep=';')
            if df.empty:
                print("The DataFrame is empty.")
                return None

            if df.shape[0] > 50:
                sample_df = df.sample(n=50, random_state=42)
            else:
                sample_df = df

            json_data = sample_df.to_json(orient='records')
            js = json.loads(json_data)
            return js

    except Exception:
        return None

def docx(url):
    """Download the .docx resource linked from `url` and return its paragraph text."""
    try:
        ls = downl(url)
        # Download the .docx file
        response = requests.get(ls)
        response.raise_for_status()  # Ensure we notice bad responses

        # Read the .docx file
        file_stream = io.BytesIO(response.content)
        doc = Document(file_stream)

        # Extract text
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)

        return '\n'.join(full_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def pdf(url):
    """Download the PDF resource linked from `url` and return the text of every page."""
    try:
        ls = downl(url)
        # Download the PDF file
        response = requests.get(ls)
        response.raise_for_status()  # Ensure we notice bad responses

        # Read the PDF file
        file_stream = io.BytesIO(response.content)
        pdf_document = fitz.open(stream=file_stream, filetype='pdf')

        # Extract text
        full_text = []
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            full_text.append(page.get_text())

        return '\n'.join(full_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def pptx(url):
    """Download the .pptx resource linked from `url` and return the text of every shape on every slide."""
    try:
        ls = downl(url)
        response = requests.get(ls)
        response.raise_for_status()

        # Read the .pptx file
        file_stream = io.BytesIO(response.content)
        presentation = Presentation(file_stream)

        # Extract text
        full_text = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    full_text.append(shape.text)

        return '\n'.join(full_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
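
For reference, a minimal usage sketch, not part of the commit: it assumes the functions above are importable from document_scrapped and that the caller already knows the resource's file extension. The dispatcher name, the extension mapping, and the example URL are placeholders, not part of the uploaded file.

# Minimal usage sketch (assumption: document_scrapped.py is on the import path).
from document_scrapped import excel, csv, docx, pdf, pptx

# Map file extensions to the parser functions defined in document_scrapped.py.
PARSERS = {
    '.xlsx': excel,
    '.xls': excel,
    '.csv': csv,
    '.docx': docx,
    '.pdf': pdf,
    '.pptx': pptx,
}

def scrape(page_url, extension):
    """Dispatch a dataset page URL to the matching parser; returns JSON records or text, or None on failure."""
    parser = PARSERS.get(extension.lower())
    if parser is None:
        return None
    # Each parser resolves the actual download link itself via downl(page_url).
    return parser(page_url)

# Hypothetical call (placeholder URL):
# records = scrape("https://example.org/dataset-page", ".csv")

Each parser takes the dataset page URL rather than a direct file link, since downl() is responsible for locating the download href on that page.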