Commit 02d6d31 · 1 Parent(s): fc4f2ad
Update app.py

app.py CHANGED
@@ -48,20 +48,139 @@ def process_categories(categories):
     return df_category_list
 
 def compare_text(transcript, categories):
-
-
-
-
-
-
-
-
-
-
+    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
+    for idx, sentence in enumerate(transcript_to_sentences(transcript)):
+        embeddings = calculate_embeddings_with_roberta(sentence)
+        # Create new row
+        new_row = {
+            'line': idx + 1,
+            'sentence': sentence,
+            'embedding': embeddings
+        }
+        df_sentences = df_sentences.append(new_row, ignore_index=True)
+    # print(df_sentences.shape)
+    # df_sentences.head()
+    return df_sentences
+
+    targets = np.array([ np.array(value[0]) for value in df_phrases[["embedding"]].values ])
+    # print(f"targets:{targets.shape}")
+    df_cosines = pd.DataFrame(columns=['line'])
+    for i, row in df_sentences.iterrows():
+        line = f'{row["line"]:03}'
+        # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
+        source = np.array(row["embedding"])
+        cosine = np.dot(targets,source)/(np.linalg.norm(targets, axis=1)*np.linalg.norm(source))
+        # Create new row
+        new_row = dict([(f"Cosine{f'{key:02}'}", value) for key, value in enumerate(cosine.flatten(), 1)])
+        new_row["line"] = row["line"]
+        df_cosines = df_cosines.append(new_row, ignore_index=True)
+
+    df_cosines['line'] = df_cosines['line'].astype('int')
+    # print(df_cosines.shape)
+    # df_cosines.head(3)
+    df_comparison = df_cosines #[(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
+    # print(df_comparison.shape)
+    # df_comparison.head(3)
+
+    threshold = threshold / 100
+
+    df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
+
+    for i, row in df_comparison.iterrows():
+        for n in range(1,64+1):
+            col = f"Cosine{f'{n:02}'}"
+            # if row[col] > threshold:
+            phrase = df_phrases.loc[[ n - 1 ]]
+            new_row = {
+                'line': row["line"],
+                'sentence': df_sentences.at[int(row["line"])-1,"sentence"],
+                'phrase': df_phrases.at[n-1,"example"],
+                'category': df_phrases.at[n-1,"category"],
+                'tag': df_phrases.at[n-1,"label"],
+                'similarity': row[col]
+            }
+            df_results = df_results.append(new_row, ignore_index=True)
+    df_results['line'] = df_cosines['line'].astype('int')
+    # print(df_results.shape)
+    # df_results.head(3)
+
+    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
+    df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
+    # df_summary
+
+    fig = px.bar(
+        df_summary,
+        y='similarity',
+        color='ok',
+        color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
+        text='similarity',
+        text_auto='.3f',
+        labels={'tag': 'Category', 'similarity': 'Similarity'},
+        title = f"{transcript[:200]}..."
+    )
+    fig.add_shape( # add a horizontal "target" line
+        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
+        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
+    )
+    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
+    fig.update_yaxes(range=[0, 1])
+    # fig.show()
+
+    details = df_results.drop(labels='line',axis=1).sort_values(['tag','similarity'],ascending=[True,False]).groupby('tag').head(3).reset_index().drop(labels='index',axis=1)
+
+    res = df_summary['similarity'].to_dict()
+
+    return res, fig, details
+
+    df_category_list = process_categories(categories)
+    sentences = transcript_to_sentences(transcript)
+    print(f"{len(sentences)} sentences")
+    df_results = pd.DataFrame(sentences, columns=['Sentence'])
+    embeddings = model.encode(sentences, convert_to_tensor=True)
+    for _, df_category in enumerate(df_category_list):
+        phrases_list = df_category["embeddings"].values.tolist()
+        phrases = torch.stack(phrases_list)
+        # Compute cosine-similarities
+        cosine_scores = util.cos_sim(embeddings, phrases).numpy()
+        max_scores = np.max(cosine_scores, axis=1)
+        df_results_plot[df_category.iloc[0,2]] = max_scores
+        df_results_grid[df_category.iloc[0,2]] = max_scores
+    df_results_plot = df_results_plot.round(decimals = 2)
+    df_results_grid = df_results_grid.round(decimals = 3)
+
+    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
+    for idx, sentence in enumerate(transcript_to_sentences(transcript)):
+        embeddings = calculate_embeddings_with_roberta(sentence)
+        # Create new row
+        new_row = {
+            'line': idx + 1,
+            'sentence': sentence,
+            'embedding': embeddings
+        }
+        df_sentences = df_sentences.append(new_row, ignore_index=True)
     # print(df_sentences.shape)
     # df_sentences.head()
     return df_sentences
     #return res, fig, details
+
+
+    doc = nlp(transcript)
+    sentences = [ sentence.text for sentence in list(doc.sents) ]
+    embeddings = model.encode(sentences, convert_to_tensor=True)
+    print(f"{len(sentences)} sentences")
+    sentences_mini = [ s[:50] for s in sentences ]
+    df_results_grid = pd.DataFrame(sentences, columns=['Sentence'])
+    df_results_plot = pd.DataFrame(index=sentences_mini)
+    for _, df_category in enumerate(df_category_list):
+        phrases_list = df_category["embeddings"].values.tolist()
+        phrases = torch.stack(phrases_list)
+        # Compute cosine-similarities
+        cosine_scores = util.cos_sim(embeddings, phrases).numpy()
+        max_scores = np.max(cosine_scores, axis=1)
+        df_results_plot[df_category.iloc[0,2]] = max_scores
+        df_results_grid[df_category.iloc[0,2]] = max_scores
+    df_results_plot = df_results_plot.round(decimals = 2)
+    df_results_grid = df_results_grid.round(decimals = 3)
 
 categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
 What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences
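
Note on the added code: the repeated `DataFrame.append(new_row, ignore_index=True)` calls rely on a method that was deprecated in pandas 1.4 and removed in pandas 2.0, so this hunk only runs on older pandas. Also, as committed, `compare_text` hits the first `return df_sentences` early, so the rest of the function body after that line never executes. A minimal sketch of the same row-building step without `append`, assuming the `transcript_to_sentences` and `calculate_embeddings_with_roberta` helpers defined elsewhere in app.py:

    import pandas as pd

    def build_sentence_frame(transcript):
        # Collect plain dicts and build the frame once, instead of calling
        # DataFrame.append inside the loop (removed in pandas 2.0).
        rows = []
        for idx, sentence in enumerate(transcript_to_sentences(transcript)):
            rows.append({
                'line': idx + 1,
                'sentence': sentence,
                'embedding': calculate_embeddings_with_roberta(sentence),
            })
        return pd.DataFrame(rows, columns=['line', 'sentence', 'embedding'])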
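
The per-sentence expression in the hunk, `np.dot(targets, source) / (np.linalg.norm(targets, axis=1) * np.linalg.norm(source))`, is the cosine similarity of one sentence embedding against every phrase embedding. A minimal NumPy sketch, with illustrative shapes and random data rather than values from app.py, showing that the same scores can be produced for all sentences in a single matrix product:

    import numpy as np

    rng = np.random.default_rng(0)
    sentences = rng.random((5, 768))   # n_sentences x embedding_dim (illustrative)
    phrases = rng.random((64, 768))    # n_phrases x embedding_dim (illustrative)

    # Row-normalise both sides; one matrix product then yields every
    # sentence-phrase cosine similarity at once.
    s_unit = sentences / np.linalg.norm(sentences, axis=1, keepdims=True)
    p_unit = phrases / np.linalg.norm(phrases, axis=1, keepdims=True)
    cosines = s_unit @ p_unit.T        # shape (n_sentences, n_phrases)

    # Matches the per-row expression used inside the loop in the diff.
    row0 = np.dot(phrases, sentences[0]) / (
        np.linalg.norm(phrases, axis=1) * np.linalg.norm(sentences[0]))
    assert np.allclose(cosines[0], row0)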
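
The later part of the hunk scores each sentence against a category's example phrases with `util.cos_sim` and keeps the best match per sentence. A self-contained sketch of that reduction, using an illustrative sentence-transformers model name rather than whatever app.py actually loads as `model`:

    import numpy as np
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer('all-MiniLM-L6-v2')  # illustrative model choice

    sentences = ["Hello, how are you doing today?", "We help teams sell more."]
    category_phrases = ["Hi, everybody", "help them sell more"]

    emb_sentences = model.encode(sentences, convert_to_tensor=True)
    emb_phrases = model.encode(category_phrases, convert_to_tensor=True)

    # Cosine similarity of every sentence against every phrase, then the best
    # phrase per sentence -- the same max-over-phrases step as in the diff.
    scores = util.cos_sim(emb_sentences, emb_phrases).cpu().numpy()
    best_per_sentence = np.max(scores, axis=1)
    print(best_per_sentence.round(3))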