import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    BertTokenizerFast,
    pipeline,
)

# Models: base tokenizer, named-entity recognition, and part-of-speech tagging
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # loaded for completeness; not used directly below
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
pos_model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
pos_tokenizer = BertTokenizerFast.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
pos_pipeline = pipeline("token-classification", model=pos_model, tokenizer=pos_tokenizer)

# Zero-shot classifier used to infer search intent
intent_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
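
# The token-classification pipelines return one dict per (sub)word token, with
# keys such as "word", "entity", and "score"; analyze_keyword below matches the
# "word" field against whitespace-split tokens.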

def get_token_colors(token_type):
    """Map a token type to its display color."""
    colors = {
        "prefix": "#D8BFD8",
        "suffix": "#AEDAA4",
        "stem": "#A4C2F4",
        "compound_first": "#FFCC80",
        "compound_second": "#FFCC80",
        "word": "#E5E5E5"
    }
    return colors.get(token_type, "#E5E5E5")

def simulate_historical_data(token):
    """Generate simulated historical usage data for a token"""
    eras = ["1900s", "1950s", "1980s", "2000s", "2010s", "Present"]

    if len(token) > 8:
        # Long tokens: treat as recent coinages with sharp growth
        values = [10, 20, 30, 60, 85, 95]
    elif token.startswith(("un", "re", "de", "pre")):
        # Prefixed tokens: steady, established growth
        values = [45, 50, 60, 70, 75, 80]
    else:
        # Otherwise: a noisy upward trend seeded from the token's hash
        base = 50 + (hash(token) % 30)
        noise = np.random.normal(0, 5, 6)
        values = [max(5, min(95, base + i * 5 + n)) for i, n in enumerate(noise)]

    return list(zip(eras, values))
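
# Example: tokens longer than eight characters take the deterministic
# fast-growth branch, e.g. simulate_historical_data("preprocessing") returns
# [("1900s", 10), ("1950s", 20), ("1980s", 30), ("2000s", 60), ("2010s", 85), ("Present", 95)].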

def generate_origin_data(token):
    """Generate simulated origin/etymology data for a token"""
    origins = [
        {"era": "Ancient", "language": "Latin"},
        {"era": "Ancient", "language": "Greek"},
        {"era": "Medieval", "language": "Old English"},
        {"era": "16th century", "language": "French"},
        {"era": "18th century", "language": "Germanic"},
        {"era": "19th century", "language": "Anglo-Saxon"},
        {"era": "20th century", "language": "Modern English"}
    ]

    index = hash(token) % len(origins)
    origin = origins[index]

    origin["note"] = f"First appeared in {origin['era']} texts derived from {origin['language']}."

    return origin
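
# Example (illustrative only: hash() of a string is randomized per interpreter
# run, so the selected origin varies between sessions):
# generate_origin_data("transformer")
# -> {"era": "Ancient", "language": "Greek",
#     "note": "First appeared in Ancient texts derived from Greek."}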

def analyze_token_types(tokens):
    """Identify token types (prefix, suffix, compound, etc.) with simple heuristics."""
    processed_tokens = []

    prefixes = ["un", "re", "de", "pre", "post", "anti", "pro", "inter", "sub", "super"]
    suffixes = ["ing", "ed", "ly", "ment", "tion", "able", "ible", "ness", "ful", "less"]

    for token in tokens:
        token_text = token.lower()
        token_type = "word"

        # Prefix check: the length guard ensures the token is more than the prefix itself
        for prefix in prefixes:
            if token_text.startswith(prefix) and len(token_text) > len(prefix) + 2:
                token_type = "prefix"
                break

        # Suffix check, only if no prefix matched
        if token_type == "word":
            for suffix in suffixes:
                if token_text.endswith(suffix) and len(token_text) > len(suffix) + 2:
                    token_type = "suffix"
                    break

        # Long unmatched tokens are treated as (the first part of) a compound
        if token_type == "word" and len(token_text) > 8:
            token_type = "compound_first"

        processed_tokens.append({
            "text": token_text,
            "type": token_type
        })

    return processed_tokens
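
# Example: "preprocessing" matches the "pre" prefix heuristic:
# analyze_token_types(["preprocessing"]) -> [{"text": "preprocessing", "type": "prefix"}]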

def plot_historical_data(historical_data):
    """Create a plot of historical usage data"""
    eras = [item[0] for item in historical_data]
    values = [item[1] for item in historical_data]

    plt.figure(figsize=(8, 3))
    plt.bar(eras, values, color='skyblue')
    plt.title('Historical Usage')
    plt.xlabel('Era')
    plt.ylabel('Usage Level')
    plt.ylim(0, 100)
    plt.xticks(rotation=45)
    plt.tight_layout()

    return plt

def analyze_keyword(keyword):
    """Run the full token, NER, POS, and intent analysis for a keyword.

    Returns five values: token-visualization HTML, full-analysis HTML, raw JSON
    results, the evolution chart, and the raw per-token analysis.
    """
    if not keyword.strip():
        return None, None, None, None, None

    # Simple whitespace tokenization
    words = keyword.strip().lower().split()

    token_analysis = analyze_token_types(words)

    # Model-based annotations
    ner_results = ner_pipeline(keyword)
    pos_results = pos_pipeline(keyword)

    full_token_analysis = []
    for token in token_analysis:
        # POS tag: default to NOUN when the pipeline's subword tokens don't match
        pos_tag = "NOUN"
        for pos_result in pos_results:
            if pos_result["word"].lower() == token["text"]:
                pos_tag = pos_result["entity"]
                break

        # Entity type, if the NER model tagged this token
        entity_type = None
        for ner_result in ner_results:
            if ner_result["word"].lower() == token["text"]:
                entity_type = ner_result["entity"]
                break

        # Simulated signals
        historical_data = simulate_historical_data(token["text"])
        origin = generate_origin_data(token["text"])

        # Importance: a simple length-based score, capped at 95
        importance = min(95, 60 + (len(token["text"]) * 2))

        # Placeholder related terms
        related_terms = [f"{token['text']}-related-1", f"{token['text']}-related-2"]

        full_token_analysis.append({
            "token": token["text"],
            "type": token["type"],
            "posTag": pos_tag,
            "entityType": entity_type,
            "importance": importance,
            "historicalData": historical_data,
            "origin": origin,
            "relatedTerms": related_terms
        })

    # Zero-shot intent classification
    intent_result = intent_classifier(
        keyword,
        candidate_labels=["informational", "navigational", "transactional"]
    )

    intent_analysis = {
        "type": intent_result["labels"][0].capitalize(),
        "strength": round(intent_result["scores"][0] * 100),
        "mutations": [
            f"{intent_result['labels'][0]}-variation-1",
            f"{intent_result['labels'][0]}-variation-2"
        ]
    }

    evolution_potential = min(95, 65 + (len(keyword) % 30))

    trends = [
        "Voice search adaptation",
        "Visual search integration"
    ]

    # Static illustrative data for the evolution chart
    evolution_data = [
        {"month": "Jan", "searchVolume": 1000, "competitionScore": 45, "intentClarity": 80},
        {"month": "Feb", "searchVolume": 1200, "competitionScore": 48, "intentClarity": 82},
        {"month": "Mar", "searchVolume": 1100, "competitionScore": 52, "intentClarity": 85},
        {"month": "Apr", "searchVolume": 1400, "competitionScore": 55, "intentClarity": 88},
        {"month": "May", "searchVolume": 1800, "competitionScore": 58, "intentClarity": 90},
        {"month": "Jun", "searchVolume": 2200, "competitionScore": 60, "intentClarity": 92}
    ]

    evolution_chart = create_evolution_chart(evolution_data)

    token_viz_html = generate_token_visualization_html(token_analysis, full_token_analysis)

    analysis_html = generate_full_analysis_html(
        keyword,
        full_token_analysis,
        intent_analysis,
        evolution_potential,
        trends
    )

    json_results = {
        "keyword": keyword,
        "tokenAnalysis": full_token_analysis,
        "intentAnalysis": intent_analysis,
        "evolutionPotential": evolution_potential,
        "predictedTrends": trends
    }

    return token_viz_html, analysis_html, json_results, evolution_chart, full_token_analysis

def create_evolution_chart(data):
    """Create an evolution chart from data"""
    df = pd.DataFrame(data)

    plt.figure(figsize=(10, 5))
    plt.plot(df['month'], df['searchVolume'], marker='o', label='Search Volume')
    # Competition score and intent clarity are 0-100 values; scale them by 20 so
    # they are visible on the same axis as search volume
    plt.plot(df['month'], df['competitionScore'] * 20, marker='s', label='Competition Score (x20)')
    plt.plot(df['month'], df['intentClarity'] * 20, marker='^', label='Intent Clarity (x20)')

    plt.title('Predicted Evolution')
    plt.xlabel('Month')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    return plt
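
# NOTE (assumption): the chart helpers return the pyplot module and rely on
# gr.Plot picking up the current figure; if a given Gradio version rejects the
# module, returning plt.gcf() is the explicit alternative.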

def generate_token_visualization_html(token_analysis, full_analysis):
    """Generate HTML for token visualization"""
    html = """
    <div style="font-family: Arial, sans-serif; padding: 20px; border: 1px solid #ddd; border-radius: 8px;">
        <h2 style="margin-top: 0;">Token Visualization</h2>

        <div style="margin-bottom: 20px; padding: 15px; background-color: #f8f9fa; border-radius: 6px;">
            <div style="margin-bottom: 8px; font-weight: bold; color: #4a5568;">Human View:</div>
            <div style="display: flex; flex-wrap: wrap; gap: 8px;">
    """

    for token in token_analysis:
        html += f"""
        <div style="padding: 6px 12px; background-color: white; border: 1px solid #cbd5e0; border-radius: 4px;">
            {token['text']}
        </div>
        """

    html += """
            </div>
        </div>

        <div style="text-align: center; margin: 15px 0;">
            <span style="font-size: 20px;">↓</span>
        </div>

        <div style="padding: 15px; background-color: #f0fff4; border-radius: 6px;">
            <div style="margin-bottom: 8px; font-weight: bold; color: #2f855a;">Machine View:</div>
            <div style="display: flex; flex-wrap: wrap; gap: 8px;">
    """

    for token in full_analysis:
        bg_color = get_token_colors(token["type"])
        html += f"""
        <div style="padding: 6px 12px; background-color: {bg_color}; border: 1px solid #a0aec0; border-radius: 4px; font-family: monospace;">
            {token['token']}
            <span style="font-size: 10px; opacity: 0.7; display: block;">{token['type']}</span>
        </div>
        """

    html += """
            </div>
        </div>

        <div style="margin-top: 20px; display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; text-align: center;">
    """

    # Both views are built from the same whitespace tokens, so this ratio is 1.0
    # unless the two analyses ever diverge (e.g. if subword tokens are added)
    word_count = len(token_analysis)
    token_count = len(full_analysis)
    ratio = round(token_count / max(1, word_count), 2)

    html += f"""
    <div style="background-color: #ebf8ff; padding: 10px; border-radius: 6px;">
        <div style="font-size: 24px; font-weight: bold; color: #3182ce;">{word_count}</div>
        <div style="font-size: 14px; color: #4299e1;">Words</div>
    </div>

    <div style="background-color: #f0fff4; padding: 10px; border-radius: 6px;">
        <div style="font-size: 24px; font-weight: bold; color: #38a169;">{token_count}</div>
        <div style="font-size: 14px; color: #48bb78;">Tokens</div>
    </div>

    <div style="background-color: #faf5ff; padding: 10px; border-radius: 6px;">
        <div style="font-size: 24px; font-weight: bold; color: #805ad5;">{ratio}</div>
        <div style="font-size: 14px; color: #9f7aea;">Tokens per Word</div>
    </div>
    """

    html += """
        </div>
    </div>
    """

    return html

def generate_full_analysis_html(keyword, token_analysis, intent_analysis, evolution_potential, trends):
    """Generate HTML for full keyword analysis"""
    html = f"""
    <div style="font-family: Arial, sans-serif; padding: 20px; border: 1px solid #ddd; border-radius: 8px;">
        <h2 style="margin-top: 0;">Keyword DNA Analysis for: {keyword}</h2>

        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin-bottom: 20px;">
            <div style="padding: 15px; border: 1px solid #e2e8f0; border-radius: 6px;">
                <h3 style="margin-top: 0; font-size: 16px;">Intent Gene</h3>
                <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
                    <span>Type:</span>
                    <span>{intent_analysis['type']}</span>
                </div>
                <div style="display: flex; justify-content: space-between; align-items: center;">
                    <span>Strength:</span>
                    <div style="width: 120px; height: 8px; background-color: #edf2f7; border-radius: 4px; overflow: hidden;">
                        <div style="height: 100%; background-color: #48bb78; width: {intent_analysis['strength']}%;"></div>
                    </div>
                </div>
            </div>

            <div style="padding: 15px; border: 1px solid #e2e8f0; border-radius: 6px;">
                <h3 style="margin-top: 0; font-size: 16px;">Evolution Potential</h3>
                <div style="display: flex; justify-content: center; align-items: center; height: 100px;">
                    <div style="position: relative; width: 100px; height: 100px;">
                        <div style="position: absolute; inset: 0; display: flex; align-items: center; justify-content: center;">
                            <span style="font-size: 24px; font-weight: bold;">{evolution_potential}</span>
                        </div>
                        <svg width="100" height="100" viewBox="0 0 36 36">
                            <path
                                d="M18 2.0845 a 15.9155 15.9155 0 0 1 0 31.831 a 15.9155 15.9155 0 0 1 0 -31.831"
                                fill="none"
                                stroke="#4CAF50"
                                stroke-width="3"
                                stroke-dasharray="{evolution_potential}, 100"
                            />
                        </svg>
                    </div>
                </div>
            </div>
        </div>

        <div style="padding: 15px; border: 1px solid #e2e8f0; border-radius: 6px; margin-bottom: 20px;">
            <h3 style="margin-top: 0; font-size: 16px;">Future Mutations</h3>
            <div style="display: flex; flex-direction: column; gap: 8px;">
    """

    for trend in trends:
        html += f"""
        <div style="display: flex; align-items: center; gap: 8px;">
            <span style="color: #48bb78;">↗</span>
            <span>{trend}</span>
        </div>
        """

    html += """
            </div>
        </div>

        <h3 style="margin-bottom: 10px;">Token Details & Historical Analysis</h3>
    """

    for token in token_analysis:
        html += f"""
        <div style="padding: 15px; border: 1px solid #e2e8f0; border-radius: 6px; margin-bottom: 15px;">
            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
                <div style="display: flex; align-items: center; gap: 8px;">
                    <span style="font-size: 18px; font-weight: medium;">{token['token']}</span>
                    <span style="padding: 2px 8px; background-color: #edf2f7; border-radius: 4px; font-size: 12px;">{token['posTag']}</span>
        """

        if token['entityType']:
            html += f"""
            <span style="padding: 2px 8px; background-color: #ebf8ff; color: #3182ce; border-radius: 4px; font-size: 12px; display: flex; align-items: center;">
                ⓘ {token['entityType']}
            </span>
            """

        html += f"""
                </div>
                <div style="display: flex; align-items: center; gap: 4px;">
                    <span style="font-size: 12px; color: #718096;">Importance:</span>
                    <div style="width: 64px; height: 8px; background-color: #edf2f7; border-radius: 4px; overflow: hidden;">
                        <div style="height: 100%; background-color: #4299e1; width: {token['importance']}%;"></div>
                    </div>
                </div>
            </div>

            <div style="margin-top: 15px;">
                <div style="font-size: 12px; color: #718096; margin-bottom: 4px;">Historical Relevance:</div>
                <div style="border: 1px solid #e2e8f0; border-radius: 4px; padding: 10px; background-color: #f7fafc;">
                    <div style="font-size: 12px; margin-bottom: 8px;">
                        <span style="font-weight: 500;">Origin: </span>
                        <span>{token['origin']['era']}, </span>
                        <span style="font-style: italic;">{token['origin']['language']}</span>
                    </div>
                    <div style="font-size: 12px; margin-bottom: 12px;">{token['origin']['note']}</div>

                    <div style="display: flex; align-items: flex-end; height: 50px; gap: 4px; margin-top: 8px;">
        """

        # enumerate replaces the fragile list.index() lookup (which breaks on
        # duplicate (period, value) pairs) while keeping the old-to-recent fade
        for i, (period, value) in enumerate(token['historicalData']):
            opacity = 0.3 + (i * 0.1)
            html += f"""
            <div style="display: flex; flex-direction: column; align-items: center; flex: 1;">
                <div style="width: 100%; background-color: rgba(66, 153, 225, {opacity}); border-radius: 2px 2px 0 0; height: {max(4, value)}%;"></div>
                <div style="font-size: 9px; margin-top: 4px; color: #718096; transform: rotate(45deg); transform-origin: top left; white-space: nowrap;">
                    {period}
                </div>
            </div>
            """

        html += """
                    </div>
                </div>
            </div>
        </div>
        """

    html += """
    </div>
    """

    return html

with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Keyword DNA Analyzer")
    gr.Markdown("Analyze the linguistic DNA of your keywords to understand their structure, intent, and potential.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Enter keyword to analyze", placeholder="e.g. artificial intelligence")
            analyze_btn = gr.Button("Analyze DNA", variant="primary")

            with gr.Row():
                example_keywords = ["preprocessing", "breakdown", "artificial intelligence", "transformer model", "machine learning"]
                example_btns = [gr.Button(example) for example in example_keywords]

        with gr.Column():
            with gr.Tabs():
                with gr.Tab("Token Visualization"):
                    token_viz_html = gr.HTML()

                with gr.Tab("Full Analysis"):
                    analysis_html = gr.HTML()

                with gr.Tab("Evolution Chart"):
                    evolution_chart = gr.Plot()

                with gr.Tab("Raw Data"):
                    json_output = gr.JSON()

    # Hidden state to receive the fifth return value (the raw token analysis);
    # Gradio does not accept None as an output target
    token_analysis_state = gr.State()

    analyze_btn.click(
        analyze_keyword,
        inputs=[input_text],
        outputs=[token_viz_html, analysis_html, json_output, evolution_chart, token_analysis_state]
    )

    # Buttons are not input components, so bind each example string through a
    # lambda default argument instead of passing the button itself as an input
    for btn, example in zip(example_btns, example_keywords):
        btn.click(
            lambda example=example: example,
            inputs=None,
            outputs=[input_text]
        ).then(
            analyze_keyword,
            inputs=[input_text],
            outputs=[token_viz_html, analysis_html, json_output, evolution_chart, token_analysis_state]
        )

demo.launch()