Spaces:
Sleeping
Sleeping
Update document_scrapped.py
Browse files- document_scrapped.py +20 -26
document_scrapped.py
CHANGED
@@ -1,26 +1,4 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
"""document_scrapped.ipynb
|
3 |
-
|
4 |
-
Automatically generated by Colab.
|
5 |
-
|
6 |
-
Original file is located at
|
7 |
-
https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
|
8 |
-
"""
|
9 |
-
|
10 |
import re
|
11 |
-
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* that fit within *char_limit* characters.

    Every character that is neither a word character nor whitespace is
    removed, the remaining text is split on whitespace, and whole words are
    taken from the front until the next one would make the space-joined
    result longer than ``char_limit``.

    Args:
        s: Input text.
        char_limit: Maximum length, in characters, of the returned string.

    Returns:
        The selected words joined by single spaces, or an empty string when
        not even the first word fits.
    """
    words = re.sub(r'[^\w\s]', '', s).split()  # strip punctuation, keep spaces
    selected_words = []
    total_chars = 0
    for word in words:
        # A separating space is only needed between words; charging one for
        # the first word would reject results that fit the limit exactly.
        cost = len(word) + (1 if selected_words else 0)
        if total_chars + cost > char_limit:
            break
        selected_words.append(word)
        total_chars += cost
    return ' '.join(selected_words)
|
23 |
-
|
24 |
from bs4 import BeautifulSoup
|
25 |
import requests
|
26 |
import json
|
@@ -50,6 +28,22 @@ from unidecode import unidecode
|
|
50 |
from langchain_huggingface import HuggingFaceEndpoint
|
51 |
import os
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def downl(url):
|
54 |
try:
|
55 |
rq = requests.get(url)
|
@@ -183,10 +177,10 @@ def pptx(url : str) -> str:
|
|
183 |
print(f"An error occurred: {e}")
|
184 |
return 'No data avaible'
|
185 |
|
186 |
-
def get_data(url
|
187 |
-
|
|
|
188 |
ext = jo.split(".")[-1]
|
189 |
-
print(ext)
|
190 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
191 |
rs = excel(jo)
|
192 |
return rs
|
@@ -202,4 +196,4 @@ def get_data(url : str) -> str:
|
|
202 |
elif ext == 'pptx' or ext == 'ppt':
|
203 |
rs = pptx(jo)
|
204 |
return rs
|
205 |
-
return "No data returned"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
import requests
|
4 |
import json
|
|
|
28 |
from langchain_huggingface import HuggingFaceEndpoint
|
29 |
import os
|
30 |
|
31 |
+
|
32 |
+
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* that fit within *char_limit* characters.

    Every character that is neither a word character nor whitespace is
    removed, the remaining text is split on whitespace, and whole words are
    taken from the front until the next one would make the space-joined
    result longer than ``char_limit``.

    Args:
        s: Input text.
        char_limit: Maximum length, in characters, of the returned string.

    Returns:
        The selected words joined by single spaces, or an empty string when
        not even the first word fits.
    """
    words = re.sub(r'[^\w\s]', '', s).split()  # strip punctuation, keep spaces
    selected_words = []
    total_chars = 0
    for word in words:
        # A separating space is only needed between words; charging one for
        # the first word would reject results that fit the limit exactly.
        cost = len(word) + (1 if selected_words else 0)
        if total_chars + cost > char_limit:
            break
        selected_words.append(word)
        total_chars += cost
    return ' '.join(selected_words)
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
def downl(url):
|
48 |
try:
|
49 |
rq = requests.get(url)
|
|
|
177 |
print(f"An error occurred: {e}")
|
178 |
return 'No data avaible'
|
179 |
|
180 |
+
def get_data(url):
|
181 |
+
ki = url.replace('\nObservation', '').replace('"\nObservation', '')
|
182 |
+
jo = downl(ki)
|
183 |
ext = jo.split(".")[-1]
|
|
|
184 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
185 |
rs = excel(jo)
|
186 |
return rs
|
|
|
196 |
elif ext == 'pptx' or ext == 'ppt':
|
197 |
rs = pptx(jo)
|
198 |
return rs
|
199 |
+
return "No data returned"
|