sergiomar73 committed on
Commit
f3bcc60
·
1 Parent(s): 02d6d31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -47
app.py CHANGED
@@ -48,23 +48,36 @@ def process_categories(categories):
48
  return df_category_list
49
 
50
  def compare_text(transcript, categories):
51
- df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
52
- for idx, sentence in enumerate(transcript_to_sentences(transcript)):
53
- embeddings = calculate_embeddings_with_roberta(sentence)
54
- # Create new row
55
- new_row = {
56
- 'line': idx + 1,
57
- 'sentence': sentence,
58
- 'embedding': embeddings
59
- }
60
- df_sentences = df_sentences.append(new_row, ignore_index=True)
61
- # print(df_sentences.shape)
62
- # df_sentences.head()
63
- return df_sentences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- targets = np.array([ np.array(value[0]) for value in df_phrases[["embedding"]].values ])
66
- # print(f"targets:{targets.shape}")
67
- df_cosines = pd.DataFrame(columns=['line'])
68
  for i, row in df_sentences.iterrows():
69
  line = f'{row["line"]:03}'
70
  # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
@@ -132,37 +145,9 @@ def compare_text(transcript, categories):
132
 
133
  return res, fig, details
134
 
135
- df_category_list = process_categories(categories)
136
- sentences = transcript_to_sentences(transcript)
137
- print(f"{len(sentences)} sentences")
138
- df_results = pd.DataFrame(sentences, columns=['Sentence'])
139
- embeddings = model.encode(sentences, convert_to_tensor=True)
140
- for _, df_category in enumerate(df_category_list):
141
- phrases_list = df_category["embeddings"].values.tolist()
142
- phrases = torch.stack(phrases_list)
143
- # Compute cosine-similarities
144
- cosine_scores = util.cos_sim(embeddings, phrases).numpy()
145
- max_scores = np.max(cosine_scores, axis=1)
146
- df_results_plot[df_category.iloc[0,2]] = max_scores
147
- df_results_grid[df_category.iloc[0,2]] = max_scores
148
- df_results_plot = df_results_plot.round(decimals = 2)
149
- df_results_grid = df_results_grid.round(decimals = 3)
150
-
151
- df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
152
- for idx, sentence in enumerate(transcript_to_sentences(transcript)):
153
- embeddings = calculate_embeddings_with_roberta(sentence)
154
- # Create new row
155
- new_row = {
156
- 'line': idx + 1,
157
- 'sentence': sentence,
158
- 'embedding': embeddings
159
- }
160
- df_sentences = df_sentences.append(new_row, ignore_index=True)
161
- # print(df_sentences.shape)
162
- # df_sentences.head()
163
- return df_sentences
164
- #return res, fig, details
165
-
166
 
167
  doc = nlp(transcript)
168
  sentences = [ sentence.text for sentence in list(doc.sents) ]
 
48
  return df_category_list
49
 
50
  def compare_text(transcript, categories):
51
+ # Sentences
52
+ # df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
53
+ sentences = transcript_to_sentences(transcript)
54
+ embeddings = model.encode(sentences, convert_to_tensor=True)
55
+ #for idx, sentence in enumerate(sentences):
56
+ # embeddings = calculate_embeddings_with_roberta(sentence)
57
+ # # Create new row
58
+ # new_row = {
59
+ # 'line': idx + 1,
60
+ # 'sentence': sentence,
61
+ # 'embedding': embeddings
62
+ # }
63
+ # df_sentences = df_sentences.append(new_row, ignore_index=True)
64
+ # Categories
65
+ df_category_list = process_categories(categories)
66
+ df_cosines = pd.DataFrame(data=range(len(sentences)),columns=['line'])
67
+ return df_cosines
68
+ for _, df_category in enumerate(df_category_list):
69
+ phrases_list = df_category["embeddings"].values.tolist()
70
+ phrases = torch.stack(phrases_list)
71
+ # Compute cosine-similarities
72
+ cosine_scores = util.cos_sim(embeddings, phrases).numpy()
73
+ max_scores = np.max(cosine_scores, axis=1)
74
+
75
+
76
+ df_results_plot[df_category.iloc[0,2]] = max_scores
77
+ df_results_grid[df_category.iloc[0,2]] = max_scores
78
+
79
+
80
 
 
 
 
81
  for i, row in df_sentences.iterrows():
82
  line = f'{row["line"]:03}'
83
  # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
 
145
 
146
  return res, fig, details
147
 
148
+ #*********************************************************
149
+
150
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  doc = nlp(transcript)
153
  sentences = [ sentence.text for sentence in list(doc.sents) ]