Spaces:
Sleeping
Sleeping
Update document_scrapped.py
Browse files- document_scrapped.py +14 -4
document_scrapped.py
CHANGED
@@ -27,7 +27,15 @@ from langchain_core.output_parsers import StrOutputParser
|
|
27 |
from unidecode import unidecode
|
28 |
from langchain_huggingface import HuggingFaceEndpoint
|
29 |
import os
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def select_words_until_char_limit(s, char_limit):
|
33 |
s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
|
@@ -40,7 +48,8 @@ def select_words_until_char_limit(s, char_limit):
|
|
40 |
total_chars += len(word) + 1 # add 1 for the space
|
41 |
else:
|
42 |
break
|
43 |
-
|
|
|
44 |
|
45 |
|
46 |
|
@@ -95,7 +104,8 @@ def excel(link : str) -> str:
|
|
95 |
sample_df = df
|
96 |
json_data = sample_df.to_json(orient='records')
|
97 |
js = json.loads(json_data)
|
98 |
-
|
|
|
99 |
else:
|
100 |
print("Failed to download file")
|
101 |
return "No dat avaible error"
|
@@ -124,7 +134,7 @@ def csv(link : str) -> str:
|
|
124 |
|
125 |
json_data = sample_df.to_json(orient='records')
|
126 |
js = json.loads(json_data)
|
127 |
-
|
128 |
|
129 |
except Exception as e:
|
130 |
return 'No data avaible'
|
|
|
27 |
from unidecode import unidecode
|
28 |
from langchain_huggingface import HuggingFaceEndpoint
|
29 |
import os
|
30 |
+
def trim_input_words(input_str, max_new_tokens=512, max_total_tokens=32768):
    """Trim *input_str* to a whitespace-separated word budget.

    The budget is ``max_total_tokens - max_new_tokens`` words; words are
    used here as a rough stand-in for tokens. Runs of whitespace are
    collapsed to single spaces in the returned text.

    Args:
        input_str: Text to trim. Non-string values (for example the parsed
            JSON records produced by ``json.loads``) are serialized to a
            JSON string first instead of failing on ``.split()``. ``None``
            is treated as empty text.
        max_new_tokens: Number of words reserved for the generated answer.
        max_total_tokens: Total word budget shared by input and answer.

    Returns:
        The first ``max_total_tokens - max_new_tokens`` words of the input
        joined by single spaces, or ``''`` when the budget is not positive.
    """
    if input_str is None:
        return ''

    if not isinstance(input_str, str):
        import json  # local import keeps this function self-contained

        try:
            input_str = json.dumps(input_str, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serializable: fall back to the plain text form.
            input_str = str(input_str)

    # Clamp at zero: a negative bound would make the slice below drop words
    # from the END of the text instead of returning nothing.
    max_input_tokens = max(max_total_tokens - max_new_tokens, 0)

    # The previous "len(words) > max_input_tokens - 100" guard still sliced
    # to the full max_input_tokens, so it never changed the result; an
    # unconditional slice is equivalent and simpler.
    words = input_str.split()
    return ' '.join(words[:max_input_tokens])
|
39 |
|
40 |
def select_words_until_char_limit(s, char_limit):
|
41 |
s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
|
|
|
48 |
total_chars += len(word) + 1 # add 1 for the space
|
49 |
else:
|
50 |
break
|
51 |
+
f = trim_input_words(' '.join(selected_words))
|
52 |
+
return f
|
53 |
|
54 |
|
55 |
|
|
|
104 |
sample_df = df
|
105 |
json_data = sample_df.to_json(orient='records')
|
106 |
js = json.loads(json_data)
|
107 |
+
rs = trim_input_words(js)
|
108 |
+
return rs
|
109 |
else:
|
110 |
print("Failed to download file")
|
111 |
return "No dat avaible error"
|
|
|
134 |
|
135 |
json_data = sample_df.to_json(orient='records')
|
136 |
js = json.loads(json_data)
|
137 |
+
rs = trim_input_words(js)
|
138 |
|
139 |
except Exception as e:
|
140 |
return 'No data avaible'
|