tferhan committed on
Commit
b7d3039
·
verified ·
1 Parent(s): f1288ad

Upload document_scrapped.py

Browse files
Files changed (1) hide show
  1. document_scrapped.py +205 -0
document_scrapped.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """document_scrapped.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
8
+ """
9
+
10
+ import re
11
def select_words_until_char_limit(s, char_limit):
    """Return the longest prefix of whole words from *s* whose joined
    length does not exceed *char_limit*.

    Punctuation is stripped before splitting (whitespace is preserved as
    word separators); the selected words are re-joined with single spaces.
    """
    # Remove punctuation but keep whitespace so word boundaries survive.
    s_no_punct = re.sub(r'[^\w\s]', '', s)
    words = s_no_punct.split()
    selected_words = []
    total_chars = 0
    for word in words:
        # A separating space is only needed BETWEEN words, not before the
        # first one. The original charged +1 for every word, which both
        # undershot the budget by one and rejected a single word whose
        # length equals char_limit exactly.
        sep = 1 if selected_words else 0
        if total_chars + sep + len(word) <= char_limit:
            selected_words.append(word)
            total_chars += sep + len(word)
        else:
            break
    return ' '.join(selected_words)
23
+
24
+ from bs4 import BeautifulSoup
25
+ import requests
26
+ import json
27
+ import io
28
+ import fitz
29
+ from pptx import Presentation
30
+ import chardet
31
+ from docx import Document
32
+ import pandas as pd
33
+ from io import BytesIO
34
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
35
+ from pdfminer.converter import TextConverter
36
+ from io import StringIO
37
+ from pdfminer.layout import LAParams
38
+ from pdfminer.pdfpage import PDFPage
39
+ from langchain.agents import (
40
+ create_react_agent,
41
+ AgentExecutor,
42
+ tool,
43
+ )
44
+ from langchain import hub
45
+ from langchain.tools.base import StructuredTool
46
+ from langchain.agents import initialize_agent
47
+ from langchain.prompts import StringPromptTemplate
48
+ from langchain_core.output_parsers import StrOutputParser
49
+ from unidecode import unidecode
50
+ from langchain_huggingface import HuggingFaceEndpoint
51
+ import os
52
+
53
def downl(url):
    """Scrape *url* for the last <li> link inside the page's last
    'dropdown-menu' <ul> and return its href.

    Returns "" on any failure (bad status, network error, or the
    expected markup not being present).
    """
    try:
        # Bound the request: the original had no timeout and could hang
        # the caller indefinitely on an unresponsive host.
        rq = requests.get(url, timeout=30)
        if rq.status_code != 200:
            return ""
        bs = BeautifulSoup(rq.text, features='lxml')
        lis = bs.find_all('ul', class_='dropdown-menu')[-1].find_all('li')
        link = lis[-1].find('a').get('href')
        print(link)
        return link
    except Exception:
        # Best-effort scraper: any parse/network error yields "".
        return ""
65
+
66
+
67
def pdf(url):
    """Download the PDF at *url* and return up to 30 000 characters of
    its extracted text (whole words only).

    Returns "" when the download fails; pdfminer parse errors still
    propagate, but the converter/buffer are closed either way.
    """
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        # Fail soft on bad responses, consistent with the other fetchers.
        return ""

    # Wrap the raw bytes in a file-like object for pdfminer.
    pdf_file = BytesIO(response.content)

    resource_manager = PDFResourceManager()
    fake_file_handle = StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    try:
        for page in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # The original leaked both handles when pdfminer raised mid-parse.
        converter.close()
        fake_file_handle.close()

    return select_words_until_char_limit(text, 30000)
89
+
90
+
91
def excel(link : str) -> str:
    """Download an Excel workbook from *link* and return up to 50 sampled
    rows rendered as a string of JSON records.

    Returns an error string (never raises) when the download or parse
    fails.
    """
    try:
        response = requests.get(link, timeout=60)
        if response.status_code != 200:
            print("Failed to download file")
            # Typo fixed ("No dat ..."), kept consistent with siblings.
            return "No data avaible error"
        # Wrap the payload: pandas expects a path or file-like object;
        # passing raw bytes to read_excel is deprecated/removed in
        # recent pandas versions.
        df = pd.read_excel(BytesIO(response.content))
        # Cap the payload at 50 rows; a fixed seed keeps the sample stable.
        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df
        js = json.loads(sample_df.to_json(orient='records'))
        return f"{js}"
    except Exception as e:
        print(e)
        return "No data avaible"
111
+
112
+
113
def csv(link : str) -> str:
    """Download a semicolon-separated CSV from *link* and return up to 50
    sampled rows rendered as a string of JSON records.

    Returns an error string (never raises) on download/parse failure.
    """
    try:
        response = requests.get(link, timeout=60)

        if response.status_code != 200:
            # BUG FIX: the original fell through here and implicitly
            # returned None, violating the declared -> str contract.
            return 'No data avaible'

        file_content = response.content
        # Sniff the encoding: source files are not consistently UTF-8.
        detected_encoding = chardet.detect(file_content)['encoding']
        df = pd.read_csv(io.BytesIO(file_content), encoding=detected_encoding, sep=';')
        if df.empty:
            print("The DataFrame is empty.")
            return 'The data frame is empty'

        # Cap the payload at 50 rows; a fixed seed keeps the sample stable.
        if df.shape[0] > 50:
            sample_df = df.sample(n=50, random_state=42)
        else:
            sample_df = df

        js = json.loads(sample_df.to_json(orient='records'))
        return f"{js}"

    except Exception:
        return 'No data avaible'
137
+
138
+
139
def docx(url : str) -> str:
    """Fetch a .docx document from *url* and return its paragraph text,
    truncated to 30 000 characters of whole words.

    Returns 'No data avaible' on any failure (never raises).
    """
    try:
        resp = requests.get(url)
        resp.raise_for_status()  # surface bad HTTP statuses as exceptions

        # python-docx reads from any file-like object.
        document = Document(io.BytesIO(resp.content))

        # One line per paragraph, in document order.
        body = "\n".join(paragraph.text for paragraph in document.paragraphs)
        return select_words_until_char_limit(body, 30000)
    except Exception as e:
        print(f"An error occurred: {e}")
        return 'No data avaible'
159
+
160
+
161
+
162
+
163
def pptx(url : str) -> str:
    """Fetch a .pptx presentation from *url* and return the text of every
    shape on every slide, truncated to 30 000 characters of whole words.

    Returns 'No data avaible' on any failure (never raises).
    """
    try:
        resp = requests.get(url)
        resp.raise_for_status()

        deck = Presentation(io.BytesIO(resp.content))

        # Walk slides in order; shapes without a text frame (pictures,
        # charts, ...) expose no `.text` attribute and are skipped.
        chunks = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return select_words_until_char_limit("\n".join(chunks), 30000)
    except Exception as e:
        print(f"An error occurred: {e}")
        return 'No data avaible'
185
+
186
def get_data(url):
    """Resolve *url* to its latest downloadable file (via ``downl``) and
    dispatch to the extractor matching the file extension.

    Returns the extracted text/records, or "No data returned" when no
    link could be scraped or the extension is unsupported.
    """
    link = downl(url)
    if not link:
        # downl() returns "" on failure; bail out before dispatching.
        return "No data returned"
    # Normalise case so 'XLSX', 'PDF', ... still dispatch correctly
    # (the original comparisons were case-sensitive).
    ext = link.split(".")[-1].lower()
    print(ext)
    # Extension -> extractor dispatch table replaces the if/elif chain.
    handlers = {
        'xlsx': excel, 'xls': excel, 'xlsm': excel,
        'pdf': pdf,
        'docx': docx, 'doc': docx,
        'csv': csv,
        'pptx': pptx, 'ppt': pptx,
    }
    handler = handlers.get(ext)
    if handler is None:
        return "No data returned"
    return handler(link)