tferhan committed on
Commit
406d45c
·
verified ·
1 Parent(s): 632df39

Update document_scrapped.py

Browse files
Files changed (1) hide show
  1. document_scrapped.py +14 -4
document_scrapped.py CHANGED
@@ -27,7 +27,15 @@ from langchain_core.output_parsers import StrOutputParser
27
  from unidecode import unidecode
28
  from langchain_huggingface import HuggingFaceEndpoint
29
  import os
30
-
 
 
 
 
 
 
 
 
31
 
32
  def select_words_until_char_limit(s, char_limit):
33
  s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
@@ -40,7 +48,8 @@ def select_words_until_char_limit(s, char_limit):
40
  total_chars += len(word) + 1 # add 1 for the space
41
  else:
42
  break
43
- return ' '.join(selected_words)
 
44
 
45
 
46
 
@@ -95,7 +104,8 @@ def excel(link : str) -> str:
95
  sample_df = df
96
  json_data = sample_df.to_json(orient='records')
97
  js = json.loads(json_data)
98
- return f"{js}"
 
99
  else:
100
  print("Failed to download file")
101
  return "No dat avaible error"
@@ -124,7 +134,7 @@ def csv(link : str) -> str:
124
 
125
  json_data = sample_df.to_json(orient='records')
126
  js = json.loads(json_data)
127
- return f"{js}"
128
 
129
  except Exception as e:
130
  return 'No data avaible'
 
27
  from unidecode import unidecode
28
  from langchain_huggingface import HuggingFaceEndpoint
29
  import os
30
def trim_input_words(input_str, max_new_tokens=512, max_total_tokens=32768,
                     safety_margin=100) -> str:
    """Trim *input_str* to a whitespace-delimited word budget.

    The budget is ``max_total_tokens - max_new_tokens - safety_margin`` words
    (never below zero). Words are used as a rough stand-in for tokens, so the
    margin leaves headroom for the difference between the two.

    Args:
        input_str: Text to trim. Non-string values (for example the list
            produced by ``json.loads``) are converted with ``str()`` first;
            ``None`` is treated as an empty string.
        max_new_tokens: Number of tokens reserved for the generated output.
        max_total_tokens: Total budget shared by input and output.
        safety_margin: Extra words held back from the input budget.

    Returns:
        The words of the input joined by single spaces, cut to the budget.
    """
    if input_str is None:
        return ''
    if not isinstance(input_str, str):
        # Callers pass parsed JSON (lists/dicts); .split() only exists on str.
        input_str = str(input_str)

    words = input_str.split()
    # Clamp at zero: a negative slice bound would drop words from the end
    # instead of producing an empty result.
    word_budget = max(0, max_total_tokens - max_new_tokens - safety_margin)

    # Slice to the same limit that defines "too long", so the margin is
    # actually applied. Slicing a shorter list is a no-op.
    return ' '.join(words[:word_budget])
39
 
40
  def select_words_until_char_limit(s, char_limit):
41
  s_no_punct = re.sub(r'[^\w\s]', '', s) # remove punctuation, but leave spaces
 
48
  total_chars += len(word) + 1 # add 1 for the space
49
  else:
50
  break
51
+ f = trim_input_words(' '.join(selected_words))
52
+ return f
53
 
54
 
55
 
 
104
  sample_df = df
105
  json_data = sample_df.to_json(orient='records')
106
  js = json.loads(json_data)
107
+ rs = trim_input_words(js)
108
+ return rs
109
  else:
110
  print("Failed to download file")
111
  return "No dat avaible error"
 
134
 
135
  json_data = sample_df.to_json(orient='records')
136
  js = json.loads(json_data)
137
+ rs = trim_input_words(js)
138
 
139
  except Exception as e:
140
  return 'No data avaible'