tferhan committed on
Commit
3e4b286
·
verified ·
1 Parent(s): bf95cbd

Update document_scrapped.py

Browse files
Files changed (1) hide show
  1. document_scrapped.py +20 -26
document_scrapped.py CHANGED
@@ -1,26 +1,4 @@
1
- # -*- coding: utf-8 -*-
2
- """document_scrapped.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
8
- """
9
-
10
  import re
11
- def select_words_until_char_limit(s, char_limit):
12
- s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
13
- words = s_no_punct.split()
14
- selected_words = []
15
- total_chars = 0
16
- for word in words:
17
- if total_chars + len(word) + 1 <= char_limit:
18
- selected_words.append(word)
19
- total_chars += len(word) + 1 # add 1 for the space
20
- else:
21
- break
22
- return ' '.join(selected_words)
23
-
24
  from bs4 import BeautifulSoup
25
  import requests
26
  import json
@@ -50,6 +28,22 @@ from unidecode import unidecode
50
  from langchain_huggingface import HuggingFaceEndpoint
51
  import os
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def downl(url):
54
  try:
55
  rq = requests.get(url)
@@ -183,10 +177,10 @@ def pptx(url : str) -> str:
183
  print(f"An error occurred: {e}")
184
  return 'No data avaible'
185
 
186
- def get_data(url : str) -> str:
187
- jo = downl(url)
 
188
  ext = jo.split(".")[-1]
189
- print(ext)
190
  if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
191
  rs = excel(jo)
192
  return rs
@@ -202,4 +196,4 @@ def get_data(url : str) -> str:
202
  elif ext == 'pptx' or ext == 'ppt':
203
  rs = pptx(jo)
204
  return rs
205
- return "No data returned"
 
 
 
 
 
 
 
 
 
 
1
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from bs4 import BeautifulSoup
3
  import requests
4
  import json
 
28
  from langchain_huggingface import HuggingFaceEndpoint
29
  import os
30
 
31
+
32
+ def select_words_until_char_limit(s, char_limit):
33
+ s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
34
+ words = s_no_punct.split()
35
+ selected_words = []
36
+ total_chars = 0
37
+ for word in words:
38
+ if total_chars + len(word) + 1 <= char_limit:
39
+ selected_words.append(word)
40
+ total_chars += len(word) + 1 # add 1 for the space
41
+ else:
42
+ break
43
+ return ' '.join(selected_words)
44
+
45
+
46
+
47
  def downl(url):
48
  try:
49
  rq = requests.get(url)
 
177
  print(f"An error occurred: {e}")
178
  return 'No data avaible'
179
 
180
+ def get_data(url):
181
+ ki = url.replace('\nObservation', '').replace('"\nObservation', '')
182
+ jo = downl(ki)
183
  ext = jo.split(".")[-1]
 
184
  if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
185
  rs = excel(jo)
186
  return rs
 
196
  elif ext == 'pptx' or ext == 'ppt':
197
  rs = pptx(jo)
198
  return rs
199
+ return "No data returned"