Tihsrah-CD committed on
Commit
3c0e9ce
·
1 Parent(s): f756937
Files changed (1) hide show
  1. app_test.py +0 -283
app_test.py DELETED
@@ -1,283 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pickle
4
- from tqdm import tqdm
5
- from Levenshtein import distance as lev
6
- import joblib
7
- from googletrans import Translator
8
- from indictrans import Transliterator
9
- from pyphonetics import RefinedSoundex
10
- import enchant
11
- from bs4 import BeautifulSoup
12
- import re
13
-
14
def _clean_tweet(tweet):
    """Return *tweet* with mentions, HTML markup, URLs and non-letters removed.

    Every character that is not an ASCII letter is replaced by a space, then
    the standalone tokens ``RT`` and ``nan`` are blanked out.
    """
    # NOTE(review): the mention and first URL pattern contain literal spaces
    # ('@ ', 'https (//)'); they are kept exactly as found -- confirm they are
    # intended and not an artefact of how this file was copied.
    text = re.sub(r'@ [A-Za-z0-9\']+', '', tweet)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+', '', text)
    text = re.sub(r'https[A-Za-z0-9/. ]*', '', text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = re.sub(r'\bRT\b', ' ', text)
    text = re.sub(r'\bnan\b', ' ', text)
    return text


def _build_variant_lookup(vocab):
    """Invert ``{canonical: [variant, ...]}`` into ``{variant: canonical}``.

    When several canonical keys list the same variant, the first key in the
    vocab's iteration order wins, which is the result a first-match linear
    scan over the vocab would give.
    """
    lookup = {}
    for canonical, variants in vocab.items():
        for variant in variants:
            lookup.setdefault(variant, canonical)
    return lookup


def _closest_candidates(word, candidate_pool, soundex):
    """Return pool entries within phonetic distance 1 and edit distance 2 of *word*.

    Candidates for which either distance cannot be computed are skipped
    instead of being judged on a value left over from a previous candidate.
    """
    matches = []
    for candidate in candidate_pool:
        try:
            is_close = (soundex.distance(word, candidate) <= 1
                        and lev(word, candidate) <= 2)
        except Exception:
            # Presumably raised for input the phonetic encoder cannot handle;
            # such a candidate simply does not qualify.
            continue
        if is_close:
            matches.append(candidate)
    return matches


def _fuzzy_correct(word, candidate_pool, soundex, spell_dict):
    """Return a dictionary-suggested replacement for *word*, or *word* itself.

    The closest vocab candidate (smallest Levenshtein distance, first one on
    ties) is handed to ``spell_dict.suggest``; the suggestion closest to that
    candidate is returned. If there is no candidate or no suggestion the word
    is returned unchanged.
    """
    candidates = _closest_candidates(word, candidate_pool, soundex)
    if not candidates:
        return word
    best = min(candidates, key=lambda cand: lev(word, cand))
    suggestions = spell_dict.suggest(best)
    if not suggestions:
        return word
    return min(suggestions, key=lambda sugg: lev(best, sugg))


def main():
    """Streamlit entry point: normalise and translate the text typed by the user.

    Pipeline, applied word by word with the original word order preserved:

    1. clean the raw text and split it on whitespace;
    2. replace words listed as variants in the English vocab by their key;
    3. do the same with the Hinglish vocab for words not matched in step 2;
    4. fuzzy-correct words matched by neither vocab;
    5. translate words the classifier labels ``'en'`` to Hindi;
    6. transliterate every word (reported only, see the note below);
    7. translate the re-joined sentence from Hindi to English.

    Each stage is printed to the console and, at the end, written to the page
    from its own snapshot so every label shows the data of its own stage.
    """
    st.title('Text Processing App')

    dictn = enchant.Dict("en_US")
    rs = RefinedSoundex()
    translator = Translator()
    trn = Transliterator(source='eng', target='hin')

    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only ship vocab files that come from a trusted source.
    with open(r'./english_vocab.pkl', "rb") as fp:
        english_vocab = pickle.load(fp)
    with open(r'./hinglish_vocab.pkl', "rb") as fp:
        hinglish_vocab = pickle.load(fp)

    english_vocab['and'] = ['and']
    english_vocab['is'] = ['is']

    input_text = st.text_area("Enter the text:")
    if not st.button('Process'):
        # Nothing to show until the user asks for processing; returning here
        # guarantees the result widgets below never see undefined values.
        return

    cleaned_text = _clean_tweet(input_text)
    st.write("Input Text:", [cleaned_text])

    tokens = cleaned_text.split()
    if not tokens:
        st.write("No words left to process after cleaning.")
        return

    # matched[pos] is True once the word at that position was found in a vocab.
    matched = [False] * len(tokens)

    # --- English vocab normalisation ---------------------------------------
    english_lookup = _build_variant_lookup(english_vocab)
    for pos, word in enumerate(tokens):
        if word in english_lookup:
            tokens[pos] = english_lookup[word]
            matched[pos] = True
    english_normalized = list(tokens)
    print("English Normalized String : ", english_normalized)

    # --- Hinglish vocab normalisation (only words English did not match) ----
    hinglish_lookup = _build_variant_lookup(hinglish_vocab)
    for pos, word in enumerate(tokens):
        if not matched[pos] and word in hinglish_lookup:
            tokens[pos] = hinglish_lookup[word]
            matched[pos] = True
    hinglish_normalized = list(tokens)
    print("Hinglish Normalized String : ", hinglish_normalized)

    # --- Phonetic + Levenshtein + dictionary correction of the rest ---------
    # English keys first, then Hinglish keys, so ties resolve in that order.
    candidate_pool = list(english_vocab) + list(hinglish_vocab)
    for pos, word in enumerate(tokens):
        if matched[pos]:
            continue
        try:
            tokens[pos] = _fuzzy_correct(word, candidate_pool, rs, dictn)
        except Exception as exc:
            # Correction is best-effort: keep the word, but say why.
            print("Fuzzy correction skipped for", repr(word), ":", exc)
    normalized_string_final = list(tokens)
    print("Phoneme levenshtein Distionary suggestion Normalized String : ",
          normalized_string_final)

    # --- Per-word language tagging and English -> Hindi translation ---------
    classifier = joblib.load(r"./classifer.joblib")
    labels = [classifier(word)[0].get("label") for word in tokens]
    for pos, label in enumerate(labels):
        if label != 'en':
            continue
        try:
            tokens[pos] = translator.translate(
                tokens[pos], src='en', dest='hi').text
        except Exception:
            # "delete" is the placeholder the pipeline uses for failed calls.
            tokens[pos] = "delete"
    hindi_translated = list(tokens)
    print("English -> Hindi Translated String : ", hindi_translated)

    # --- Transliteration -----------------------------------------------------
    transliterated = [trn.transform(word) for word in tokens]
    print("Hinglish -> Hindi Transliterated String : ", transliterated)

    # --- Sentence-level Hindi -> English translation -------------------------
    # NOTE(review): as before, the sentence is built from the translated words,
    # not from the transliterated ones -- confirm that this is intended.
    sentence = " ".join(hindi_translated)
    try:
        translated = [translator.translate(sentence, src='hi', dest='en').text]
    except Exception:
        translated = ["delete"]
    print("Hindi -> English Translated String : ", translated)

    st.write("English Normalized String:", english_normalized)
    st.write("Hinglish Normalized String:", hinglish_normalized)
    st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:",
             normalized_string_final)
    st.write("English -> Hindi Translated String:", hindi_translated)
    st.write("Hinglish -> Hindi Transliterated String:", transliterated)
    st.write("Hindi -> English Translated String:", translated)
282
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()