Spaces:

sergiomar73
/

qc-nlp-004-transcription-classifier-with-roberta

Build error

App Files Community

sergiomar73 committed on Oct 4, 2022

Commit

02d6d31

1 Parent(s): fc4f2ad

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -10

app.py CHANGED Viewed

@@ -48,20 +48,139 @@ def process_categories(categories):
     return df_category_list
 def compare_text(transcript, categories):
-  df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
-  for idx, sentence in enumerate(transcript_to_sentences(transcript)):
-    embedding = calculate_embeddings_with_roberta(sentence)
-    # Create new row
-    new_row = {
-      'line': idx + 1,
-      'sentence': sentence,
-      'embedding': embedding
-    }
-    df_sentences = df_sentences.append(new_row, ignore_index=True)
     # print(df_sentences.shape)
     # df_sentences.head()
     return df_sentences
     #return res, fig, details
 categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
 What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences

     return df_category_list
 def compare_text(transcript, categories):
+    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
+    for idx, sentence in enumerate(transcript_to_sentences(transcript)):
+        embeddings = calculate_embeddings_with_roberta(sentence)
+        # Create new row
+        new_row = {
+            'line': idx + 1,
+            'sentence': sentence,
+            'embedding': embeddings
+        }
+        df_sentences = df_sentences.append(new_row, ignore_index=True)
+    # print(df_sentences.shape)
+    # df_sentences.head()
+    return df_sentences
+    targets = np.array([ np.array(value[0]) for value in df_phrases[["embedding"]].values ])
+    # print(f"targets:{targets.shape}")
+    df_cosines = pd.DataFrame(columns=['line'])
+    for i, row in df_sentences.iterrows():
+        line = f'{row["line"]:03}'
+        # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
+        source = np.array(row["embedding"])
+        cosine = np.dot(targets,source)/(np.linalg.norm(targets, axis=1)*np.linalg.norm(source))
+        # Create new row
+        new_row = dict([(f"Cosine{f'{key:02}'}", value) for key, value in enumerate(cosine.flatten(), 1)])
+        new_row["line"] = row["line"]
+        df_cosines = df_cosines.append(new_row, ignore_index=True)
+    df_cosines['line'] = df_cosines['line'].astype('int')
+    # print(df_cosines.shape)
+    # df_cosines.head(3)
+    df_comparison = df_cosines #[(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
+    # print(df_comparison.shape)
+    # df_comparison.head(3)
+    threshold = threshold / 100
+    df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
+    for i, row in df_comparison.iterrows():
+        for n in range(1,64+1):
+            col = f"Cosine{f'{n:02}'}"
+            # if row[col] > threshold:
+            phrase = df_phrases.loc[[ n - 1 ]]
+            new_row = {
+                'line': row["line"],
+                'sentence': df_sentences.at[int(row["line"])-1,"sentence"],
+                'phrase': df_phrases.at[n-1,"example"],
+                'category': df_phrases.at[n-1,"category"],
+                'tag': df_phrases.at[n-1,"label"],
+                'similarity': row[col]
+            }
+            df_results = df_results.append(new_row, ignore_index=True)
+    df_results['line'] = df_cosines['line'].astype('int')
+    # print(df_results.shape)
+    # df_results.head(3)
+    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
+    df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
+    # df_summary
+    fig = px.bar(
+        df_summary,
+        y='similarity',
+        color='ok',
+        color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
+        text='similarity',
+        text_auto='.3f',
+        labels={'tag': 'Category', 'similarity': 'Similarity'},
+        title = f"{transcript[:200]}..."
+    )
+    fig.add_shape( # add a horizontal "target" line
+        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
+        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
+    )
+    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
+    fig.update_yaxes(range=[0, 1])
+     # fig.show()
+    details = df_results.drop(labels='line',axis=1).sort_values(['tag','similarity'],ascending=[True,False]).groupby('tag').head(3).reset_index()    .drop(labels='index',axis=1)
+    res = df_summary['similarity'].to_dict()
+    return res, fig, details
+    df_category_list = process_categories(categories)
+    sentences = transcript_to_sentences(transcript)
+    print(f"{len(sentences)} sentences")
+    df_results = pd.DataFrame(sentences, columns=['Sentence'])
+    embeddings = model.encode(sentences, convert_to_tensor=True)
+    for _, df_category in enumerate(df_category_list):
+        phrases_list = df_category["embeddings"].values.tolist()
+        phrases = torch.stack(phrases_list)
+        # Compute cosine-similarities
+        cosine_scores = util.cos_sim(embeddings, phrases).numpy()
+        max_scores = np.max(cosine_scores, axis=1)
+        df_results_plot[df_category.iloc[0,2]] = max_scores
+        df_results_grid[df_category.iloc[0,2]] = max_scores
+    df_results_plot = df_results_plot.round(decimals = 2)
+    df_results_grid = df_results_grid.round(decimals = 3)
+    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
+    for idx, sentence in enumerate(transcript_to_sentences(transcript)):
+        embeddings = calculate_embeddings_with_roberta(sentence)
+        # Create new row
+        new_row = {
+            'line': idx + 1,
+            'sentence': sentence,
+            'embedding': embeddings
+        }
+        df_sentences = df_sentences.append(new_row, ignore_index=True)
     # print(df_sentences.shape)
     # df_sentences.head()
     return df_sentences
     #return res, fig, details
+    doc = nlp(transcript)
+    sentences = [ sentence.text for sentence in list(doc.sents) ]
+    embeddings = model.encode(sentences, convert_to_tensor=True)
+    print(f"{len(sentences)} sentences")
+    sentences_mini = [ s[:50] for s in sentences ]
+    df_results_grid = pd.DataFrame(sentences, columns=['Sentence'])
+    df_results_plot = pd.DataFrame(index=sentences_mini)
+    for _, df_category in enumerate(df_category_list):
+        phrases_list = df_category["embeddings"].values.tolist()
+        phrases = torch.stack(phrases_list)
+        # Compute cosine-similarities
+        cosine_scores = util.cos_sim(embeddings, phrases).numpy()
+        max_scores = np.max(cosine_scores, axis=1)
+        df_results_plot[df_category.iloc[0,2]] = max_scores
+        df_results_grid[df_category.iloc[0,2]] = max_scores
+    df_results_plot = df_results_plot.round(decimals = 2)
+    df_results_grid = df_results_grid.round(decimals = 3)
 categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
 What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences