sergiomar73 committed on
Commit
02d6d31
·
1 Parent(s): fc4f2ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -10
app.py CHANGED
@@ -48,20 +48,139 @@ def process_categories(categories):
48
  return df_category_list
49
 
50
  def compare_text(transcript, categories):
51
- df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
52
- for idx, sentence in enumerate(transcript_to_sentences(transcript)):
53
- embedding = calculate_embeddings_with_roberta(sentence)
54
- # Create new row
55
- new_row = {
56
- 'line': idx + 1,
57
- 'sentence': sentence,
58
- 'embedding': embedding
59
- }
60
- df_sentences = df_sentences.append(new_row, ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # print(df_sentences.shape)
62
  # df_sentences.head()
63
  return df_sentences
64
  #return res, fig, details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
67
  What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences
 
48
  return df_category_list
49
 
50
  def compare_text(transcript, categories):
51
+ df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
52
+ for idx, sentence in enumerate(transcript_to_sentences(transcript)):
53
+ embeddings = calculate_embeddings_with_roberta(sentence)
54
+ # Create new row
55
+ new_row = {
56
+ 'line': idx + 1,
57
+ 'sentence': sentence,
58
+ 'embedding': embeddings
59
+ }
60
+ df_sentences = df_sentences.append(new_row, ignore_index=True)
61
+ # print(df_sentences.shape)
62
+ # df_sentences.head()
63
+ return df_sentences
64
+
65
+ targets = np.array([ np.array(value[0]) for value in df_phrases[["embedding"]].values ])
66
+ # print(f"targets:{targets.shape}")
67
+ df_cosines = pd.DataFrame(columns=['line'])
68
+ for i, row in df_sentences.iterrows():
69
+ line = f'{row["line"]:03}'
70
+ # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
71
+ source = np.array(row["embedding"])
72
+ cosine = np.dot(targets,source)/(np.linalg.norm(targets, axis=1)*np.linalg.norm(source))
73
+ # Create new row
74
+ new_row = dict([(f"Cosine{f'{key:02}'}", value) for key, value in enumerate(cosine.flatten(), 1)])
75
+ new_row["line"] = row["line"]
76
+ df_cosines = df_cosines.append(new_row, ignore_index=True)
77
+
78
+ df_cosines['line'] = df_cosines['line'].astype('int')
79
+ # print(df_cosines.shape)
80
+ # df_cosines.head(3)
81
+ df_comparison = df_cosines #[(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
82
+ # print(df_comparison.shape)
83
+ # df_comparison.head(3)
84
+
85
+ threshold = threshold / 100
86
+
87
+ df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
88
+
89
+ for i, row in df_comparison.iterrows():
90
+ for n in range(1,64+1):
91
+ col = f"Cosine{f'{n:02}'}"
92
+ # if row[col] > threshold:
93
+ phrase = df_phrases.loc[[ n - 1 ]]
94
+ new_row = {
95
+ 'line': row["line"],
96
+ 'sentence': df_sentences.at[int(row["line"])-1,"sentence"],
97
+ 'phrase': df_phrases.at[n-1,"example"],
98
+ 'category': df_phrases.at[n-1,"category"],
99
+ 'tag': df_phrases.at[n-1,"label"],
100
+ 'similarity': row[col]
101
+ }
102
+ df_results = df_results.append(new_row, ignore_index=True)
103
+ df_results['line'] = df_cosines['line'].astype('int')
104
+ # print(df_results.shape)
105
+ # df_results.head(3)
106
+
107
+ df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
108
+ df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
109
+ # df_summary
110
+
111
+ fig = px.bar(
112
+ df_summary,
113
+ y='similarity',
114
+ color='ok',
115
+ color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
116
+ text='similarity',
117
+ text_auto='.3f',
118
+ labels={'tag': 'Category', 'similarity': 'Similarity'},
119
+ title = f"{transcript[:200]}..."
120
+ )
121
+ fig.add_shape( # add a horizontal "target" line
122
+ type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
123
+ x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
124
+ )
125
+ fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
126
+ fig.update_yaxes(range=[0, 1])
127
+ # fig.show()
128
+
129
+ details = df_results.drop(labels='line',axis=1).sort_values(['tag','similarity'],ascending=[True,False]).groupby('tag').head(3).reset_index() .drop(labels='index',axis=1)
130
+
131
+ res = df_summary['similarity'].to_dict()
132
+
133
+ return res, fig, details
134
+
135
+ df_category_list = process_categories(categories)
136
+ sentences = transcript_to_sentences(transcript)
137
+ print(f"{len(sentences)} sentences")
138
+ df_results = pd.DataFrame(sentences, columns=['Sentence'])
139
+ embeddings = model.encode(sentences, convert_to_tensor=True)
140
+ for _, df_category in enumerate(df_category_list):
141
+ phrases_list = df_category["embeddings"].values.tolist()
142
+ phrases = torch.stack(phrases_list)
143
+ # Compute cosine-similarities
144
+ cosine_scores = util.cos_sim(embeddings, phrases).numpy()
145
+ max_scores = np.max(cosine_scores, axis=1)
146
+ df_results_plot[df_category.iloc[0,2]] = max_scores
147
+ df_results_grid[df_category.iloc[0,2]] = max_scores
148
+ df_results_plot = df_results_plot.round(decimals = 2)
149
+ df_results_grid = df_results_grid.round(decimals = 3)
150
+
151
+ df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
152
+ for idx, sentence in enumerate(transcript_to_sentences(transcript)):
153
+ embeddings = calculate_embeddings_with_roberta(sentence)
154
+ # Create new row
155
+ new_row = {
156
+ 'line': idx + 1,
157
+ 'sentence': sentence,
158
+ 'embedding': embeddings
159
+ }
160
+ df_sentences = df_sentences.append(new_row, ignore_index=True)
161
  # print(df_sentences.shape)
162
  # df_sentences.head()
163
  return df_sentences
164
  #return res, fig, details
165
+
166
+
167
+ doc = nlp(transcript)
168
+ sentences = [ sentence.text for sentence in list(doc.sents) ]
169
+ embeddings = model.encode(sentences, convert_to_tensor=True)
170
+ print(f"{len(sentences)} sentences")
171
+ sentences_mini = [ s[:50] for s in sentences ]
172
+ df_results_grid = pd.DataFrame(sentences, columns=['Sentence'])
173
+ df_results_plot = pd.DataFrame(index=sentences_mini)
174
+ for _, df_category in enumerate(df_category_list):
175
+ phrases_list = df_category["embeddings"].values.tolist()
176
+ phrases = torch.stack(phrases_list)
177
+ # Compute cosine-similarities
178
+ cosine_scores = util.cos_sim(embeddings, phrases).numpy()
179
+ max_scores = np.max(cosine_scores, axis=1)
180
+ df_results_plot[df_category.iloc[0,2]] = max_scores
181
+ df_results_grid[df_category.iloc[0,2]] = max_scores
182
+ df_results_plot = df_results_plot.round(decimals = 2)
183
+ df_results_grid = df_results_grid.round(decimals = 3)
184
 
185
  categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
186
  What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences