Update app.py
Browse files
app.py
CHANGED
@@ -40,22 +40,16 @@ for uploaded_file in uploaded_files:
|
|
40 |
data = pd.Series(text_data, name = 'Text')
|
41 |
st.dataframe(data)
|
42 |
frames = [job, data]
|
43 |
-
|
|
|
|
|
|
|
44 |
st.dataframe(result)
|
45 |
|
46 |
-
import re
|
47 |
-
def preprocess_text(text):
|
48 |
-
text = text.lower() # Lowercase text
|
49 |
-
text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
|
50 |
-
text = " ".join(text.split()) # Remove extra spaces, tabs, and new lines
|
51 |
-
|
52 |
-
return text
|
53 |
|
54 |
-
result['Text']= result['Text'].map(preprocess_text)
|
55 |
-
st.dataframe(result['Text'])
|
56 |
|
57 |
vectorizer = TfidfVectorizer()
|
58 |
-
tfidf_matrix = vectorizer.fit_transform(result
|
59 |
|
60 |
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
61 |
st.subheader("TF-IDF Values:")
|
|
|
40 |
data = pd.Series(text_data, name = 'Text')
|
41 |
st.dataframe(data)
|
42 |
frames = [job, data]
|
43 |
+
result1 = pd.concat(frames)
|
44 |
+
st.dataframe(result1)
|
45 |
+
|
46 |
+
result = result1['Text'].drop_duplicates().to_list()
|
47 |
st.dataframe(result)
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
50 |
|
51 |
vectorizer = TfidfVectorizer()
|
52 |
+
tfidf_matrix = vectorizer.fit_transform(result)
|
53 |
|
54 |
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
55 |
st.subheader("TF-IDF Values:")
|