# v3/modules/studentact/current_situation_analysis.py

import streamlit as st
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
from collections import Counter
from itertools import combinations
import numpy as np
import matplotlib.patches as patches
import logging

logger = logging.getLogger(__name__)
def analyze_text_dimensions(doc):
    """
    Analyzes the main dimensions of the text.

    Args:
        doc: Document processed by spaCy

    Returns:
        dict: Analysis metrics
    """
    try:
        # Vocabulary analysis (analyze_vocabulary_diversity returns (score, details))
        vocab_score, _ = analyze_vocabulary_diversity(doc)
        vocab_normalized = normalize_score(
            value=vocab_score,
            optimal_connections=len(doc) * 0.4  # 40% of the total words as the optimal number of connections
        )

        # Structure analysis
        struct_score = analyze_structure(doc)
        struct_normalized = normalize_score(
            value=struct_score,
            optimal_length=20  # Optimal average sentence length
        )

        # Cohesion analysis
        cohesion_score = analyze_cohesion(doc)
        cohesion_normalized = normalize_score(
            value=cohesion_score,
            optimal_value=0.7  # 70% cohesion as the optimal value
        )

        # Clarity analysis (analyze_clarity returns (score, details))
        clarity_score, _ = analyze_clarity(doc)
        clarity_normalized = normalize_score(
            value=clarity_score,
            optimal_value=0.8  # 80% clarity as the optimal value
        )

        return {
            'vocabulary': {
                'raw_score': vocab_score,
                'normalized_score': vocab_normalized
            },
            'structure': {
                'raw_score': struct_score,
                'normalized_score': struct_normalized
            },
            'cohesion': {
                'raw_score': cohesion_score,
                'normalized_score': cohesion_normalized
            },
            'clarity': {
                'raw_score': clarity_score,
                'normalized_score': clarity_normalized
            }
        }

    except Exception as e:
        logger.error(f"Error en analyze_text_dimensions: {str(e)}")
        raise
def analyze_clarity(doc):
    """
    Analyzes text clarity considering multiple factors:
    - Sentence length and variation
    - Use of connectors
    - Structural complexity
    - Referential clarity
    - Lexical density
    """
    try:
        # 1. Sentence analysis
        sentences = list(doc.sents)
        if not sentences:
            return 0.0, {}

        # Sentence lengths
        sentence_lengths = [len(sent) for sent in sentences]
        avg_length = sum(sentence_lengths) / len(sentences)
        length_variation = np.std(sentence_lengths) if len(sentences) > 1 else 0

        # Penalize sentences that are too short or too long
        length_score = normalize_score(
            avg_length,
            optimal_length=20,   # Optimal length
            range_factor=1.5     # Tolerance factor
        )

        # 2. Connector analysis
        connector_count = 0
        connector_types = {
            'CCONJ': 0.8,  # Coordinating conjunctions
            'SCONJ': 1.0,  # Subordinating conjunctions
            'ADV': 0.6     # Connective adverbs
        }
        for token in doc:
            if token.pos_ in connector_types and token.dep_ in ['cc', 'mark', 'advmod']:
                connector_count += connector_types[token.pos_]
        connector_score = min(1.0, connector_count / (len(sentences) * 0.8))

        # 3. Structural complexity
        clause_count = 0
        for sent in sentences:
            verbs = [token for token in sent if token.pos_ == 'VERB']
            clause_count += len(verbs)
        complexity_score = normalize_score(
            clause_count / len(sentences),
            optimal_value=2.0,  # Optimal average number of clauses per sentence
            range_factor=1.5
        )

        # 4. Referential clarity
        reference_score = analyze_reference_clarity(doc)

        # 5. Lexical density
        content_words = len([token for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']])
        function_words = len([token for token in doc if token.pos_ not in ['NOUN', 'VERB', 'ADJ', 'ADV']])
        density_score = normalize_score(
            content_words / (content_words + function_words) if (content_words + function_words) > 0 else 0,
            optimal_value=0.6,  # 60% content words is optimal
            range_factor=1.5
        )

        # Weights for each factor
        weights = {
            'length': 0.2,
            'connectors': 0.2,
            'complexity': 0.2,
            'reference': 0.2,
            'density': 0.2
        }

        # Weighted final score
        clarity_score = (
            weights['length'] * length_score +
            weights['connectors'] * connector_score +
            weights['complexity'] * complexity_score +
            weights['reference'] * reference_score +
            weights['density'] * density_score
        )

        # Detailed information for diagnostics
        details = {
            'length_score': length_score,
            'connector_score': connector_score,
            'complexity_score': complexity_score,
            'reference_score': reference_score,
            'density_score': density_score,
            'avg_sentence_length': avg_length,
            'length_variation': length_variation,
            'connectors_per_sentence': connector_count / len(sentences)
        }

        return clarity_score, details

    except Exception as e:
        logger.error(f"Error en analyze_clarity: {str(e)}")
        return 0.0, {}
def analyze_reference_clarity(doc):
    """
    Analyzes the clarity of references in the text
    """
    try:
        # Count anaphoric references
        reference_count = 0
        unclear_references = 0

        for token in doc:
            # Detect pronouns and determiners
            if token.pos_ in ['PRON', 'DET']:
                reference_count += 1

                # Check whether it has a clear antecedent
                has_antecedent = False
                for ancestor in token.ancestors:
                    if ancestor.pos_ == 'NOUN':
                        has_antecedent = True
                        break
                if not has_antecedent:
                    unclear_references += 1

        # Compute the score
        if reference_count == 0:
            return 1.0  # No references = maximum clarity

        clarity = 1.0 - (unclear_references / reference_count)
        return max(0.0, min(1.0, clarity))

    except Exception as e:
        logger.error(f"Error en analyze_reference_clarity: {str(e)}")
        return 0.0
def analyze_vocabulary_diversity(doc):
    """Improved analysis of vocabulary diversity and quality"""
    try:
        # 1. Basic diversity analysis
        unique_lemmas = {token.lemma_ for token in doc if token.is_alpha}
        total_words = len([token for token in doc if token.is_alpha])
        basic_diversity = len(unique_lemmas) / total_words if total_words > 0 else 0

        # 2. Register analysis
        academic_words = 0
        narrative_words = 0
        technical_terms = 0

        # Classify words by register
        for token in doc:
            if token.is_alpha:
                # Detect academic/technical terms
                if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
                    if any(parent.pos_ == 'NOUN' for parent in token.ancestors):
                        technical_terms += 1
                # Detect narrative words
                if token.pos_ in ['VERB', 'ADV'] and token.dep_ in ['ROOT', 'advcl']:
                    narrative_words += 1

        # 3. Syntactic complexity analysis
        sentences = list(doc.sents)
        avg_sentence_length = (
            sum(len(sent) for sent in sentences) / len(sentences) if sentences else 0
        )

        # 4. Weighted score
        weights = {
            'diversity': 0.3,
            'technical': 0.3,
            'narrative': 0.2,
            'complexity': 0.2
        }
        scores = {
            'diversity': basic_diversity,
            'technical': technical_terms / total_words if total_words > 0 else 0,
            'narrative': narrative_words / total_words if total_words > 0 else 0,
            'complexity': min(1.0, avg_sentence_length / 20)  # Normalized against 20 words
        }

        # Weighted final score
        final_score = sum(weights[key] * scores[key] for key in weights)

        # Additional information for diagnostics
        details = {
            'text_type': 'narrative' if scores['narrative'] > scores['technical'] else 'academic',
            'scores': scores
        }

        return final_score, details

    except Exception as e:
        logger.error(f"Error en analyze_vocabulary_diversity: {str(e)}")
        return 0.0, {}
def analyze_cohesion(doc):
    """Analyzes textual cohesion"""
    try:
        sentences = list(doc.sents)
        if len(sentences) < 2:
            logger.warning("Texto demasiado corto para análisis de cohesión")
            return 0.0

        connections = 0
        for i in range(len(sentences) - 1):
            sent1_words = {token.lemma_ for token in sentences[i]}
            sent2_words = {token.lemma_ for token in sentences[i + 1]}
            connections += len(sent1_words.intersection(sent2_words))

        # Make sure there are connections before normalizing
        if connections == 0:
            logger.warning("No se encontraron conexiones entre oraciones")
            return 0.0

        return normalize_score(connections, optimal_connections=max(5, len(sentences) * 0.2))

    except Exception as e:
        logger.error(f"Error en analyze_cohesion: {str(e)}")
        return 0.0
def analyze_structure(doc):
    """Analyzes structural complexity"""
    try:
        if len(doc) == 0:
            logger.warning("Documento vacío")
            return 0.0

        root_distances = []
        for token in doc:
            if token.dep_ == 'ROOT':
                depths = get_dependency_depths(token)
                root_distances.extend(depths)

        if not root_distances:
            logger.warning("No se encontraron estructuras de dependencia")
            return 0.0

        avg_depth = sum(root_distances) / len(root_distances)
        return normalize_score(avg_depth, optimal_depth=max(3, len(doc) * 0.1))

    except Exception as e:
        logger.error(f"Error en analyze_structure: {str(e)}")
        return 0.0
# Auxiliary analysis functions
def get_dependency_depths(token, depth=0):
    """Returns the dependency depths of a token and all of its descendants"""
    depths = [depth]
    for child in token.children:
        depths.extend(get_dependency_depths(child, depth + 1))
    return depths
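
# A minimal sketch of how the helper behaves (the parse tree here is
# illustrative, not taken from a real spaCy analysis): for a ROOT verb with
# one subject noun that in turn has one determiner child,
# get_dependency_depths(root) returns [0, 1, 2] — one entry per token, each
# holding its distance from the root. analyze_structure() then averages these
# values over the whole document.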
def normalize_score(value, optimal_value=1.0, range_factor=2.0, optimal_length=None,
                    optimal_connections=None, optimal_depth=None):
    """
    Normalizes a value to a 0-1 scale, handling edge cases.

    Args:
        value: Value to normalize
        optimal_value: Optimal reference value
        range_factor: Factor used to widen or narrow the tolerated range
        optimal_length: Optimal length (optional)
        optimal_connections: Optimal number of connections (optional)
        optimal_depth: Optimal structure depth (optional)

    Returns:
        float: Normalized value between 0 and 1
    """
    try:
        # Reject negative values
        if value < 0:
            logger.warning(f"Valor negativo recibido: {value}")
            return 0.0

        # Handle the case where the value is zero
        if value == 0:
            logger.warning("Valor cero recibido")
            return 0.0

        # Pick the reference value to use
        if optimal_depth is not None:
            reference = optimal_depth
        elif optimal_connections is not None:
            reference = optimal_connections
        elif optimal_length is not None:
            reference = optimal_length
        else:
            reference = optimal_value

        # Validate the reference value
        if reference <= 0:
            logger.warning(f"Valor de referencia inválido: {reference}")
            return 0.0

        # Compute the difference and the maximum allowed difference
        diff = abs(value - reference)
        max_diff = reference * range_factor

        # Validate max_diff
        if max_diff <= 0:
            logger.warning(f"Máxima diferencia inválida: {max_diff}")
            return 0.0

        # Compute the normalized score
        score = 1.0 - min(diff / max_diff, 1.0)

        # Clamp the result to [0, 1]
        return max(0.0, min(1.0, score))

    except Exception as e:
        logger.error(f"Error en normalize_score: {str(e)}")
        return 0.0
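
# Worked example of the normalization above (values chosen purely for
# illustration): with optimal_length=20 and range_factor=1.5 the tolerated
# band is 20 * 1.5 = 30, so an average sentence length of 30 scores
# 1.0 - |30 - 20| / 30 ≈ 0.667, a length of exactly 20 scores 1.0, and
# anything further than 30 units from the reference is clamped to 0.0.
#
#   normalize_score(20, optimal_length=20, range_factor=1.5)  # -> 1.0
#   normalize_score(30, optimal_length=20, range_factor=1.5)  # -> ~0.667
#   normalize_score(55, optimal_length=20, range_factor=1.5)  # -> 0.0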
# Graph generation functions
def generate_sentence_graphs(doc):
    """Generates sentence-structure visualizations"""
    fig, ax = plt.subplots(figsize=(10, 6))
    # TODO: implement visualization
    plt.close()
    return fig

def generate_word_connections(doc):
    """Generates a word-connection network"""
    fig, ax = plt.subplots(figsize=(10, 6))
    # TODO: implement visualization
    plt.close()
    return fig

def generate_connection_paths(doc):
    """Generates connection patterns"""
    fig, ax = plt.subplots(figsize=(10, 6))
    # TODO: implement visualization
    plt.close()
    return fig
def create_vocabulary_network(doc):
    """
    Builds the vocabulary network graph.
    """
    G = nx.Graph()

    # Create nodes for meaningful words
    words = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]
    word_freq = Counter(words)

    # Add nodes sized by frequency
    for word, freq in word_freq.items():
        G.add_node(word, size=freq)

    # Create edges based on co-occurrence within a sliding window
    window_size = 5
    for i in range(len(words) - window_size + 1):
        window = words[i:i + window_size]
        for w1, w2 in combinations(set(window), 2):
            if G.has_edge(w1, w2):
                G[w1][w2]['weight'] += 1
            else:
                G.add_edge(w1, w2, weight=1)

    # Build the visualization
    fig, ax = plt.subplots(figsize=(12, 8))
    pos = nx.spring_layout(G)

    # Draw nodes
    nx.draw_networkx_nodes(G, pos,
                           node_size=[G.nodes[node]['size'] * 100 for node in G.nodes],
                           node_color='lightblue',
                           alpha=0.7)

    # Draw edges
    nx.draw_networkx_edges(G, pos,
                           width=[G[u][v]['weight'] * 0.5 for u, v in G.edges],
                           alpha=0.5)

    # Add labels
    nx.draw_networkx_labels(G, pos)

    plt.title("Red de Vocabulario")
    plt.axis('off')
    return fig
def create_syntax_complexity_graph(doc):
    """
    Builds the syntactic-complexity arc diagram.
    Shows the dependency structure with colors based on depth.
    """
    try:
        # Prepare data for the visualization
        sentences = list(doc.sents)
        if not sentences:
            return None

        # Create the figure
        fig, ax = plt.subplots(figsize=(12, len(sentences) * 2))

        # Colors for the different depth levels
        depth_colors = plt.cm.viridis(np.linspace(0, 1, 6))

        y_offset = 0
        max_x = 0

        for sent in sentences:
            words = [token.text for token in sent]
            x_positions = range(len(words))
            max_x = max(max_x, len(words))

            # Draw the words
            plt.plot(x_positions, [y_offset] * len(words), 'k-', alpha=0.2)
            plt.scatter(x_positions, [y_offset] * len(words), alpha=0)

            # Add the text
            for i, word in enumerate(words):
                plt.annotate(word, (i, y_offset), xytext=(0, -10),
                             textcoords='offset points', ha='center')

            # Draw the dependency arcs
            for token in sent:
                if token.dep_ != "ROOT":
                    # Compute the dependency depth
                    depth = 0
                    current = token
                    while current.head != current:
                        depth += 1
                        current = current.head

                    # Determine the arc endpoints
                    start = token.i - sent[0].i
                    end = token.head.i - sent[0].i

                    # Arc height based on the distance between words
                    height = 0.5 * abs(end - start)

                    # Color based on depth
                    color = depth_colors[min(depth, len(depth_colors) - 1)]

                    # Create the arc
                    arc = patches.Arc((min(start, end) + abs(end - start) / 2, y_offset),
                                      width=abs(end - start),
                                      height=height,
                                      angle=0,
                                      theta1=0,
                                      theta2=180,
                                      color=color,
                                      alpha=0.6)
                    ax.add_patch(arc)

            y_offset -= 2

        # Configure the plot
        plt.xlim(-1, max_x)
        plt.ylim(y_offset - 1, 1)
        plt.axis('off')
        plt.title("Complejidad Sintáctica")
        return fig

    except Exception as e:
        logger.error(f"Error en create_syntax_complexity_graph: {str(e)}")
        return None
def create_cohesion_heatmap(doc):
    """Builds a heatmap showing the cohesion between paragraphs/sentences."""
    try:
        sentences = list(doc.sents)
        n_sentences = len(sentences)

        if n_sentences < 2:
            return None

        similarity_matrix = np.zeros((n_sentences, n_sentences))

        # Jaccard similarity between the content lemmas of every pair of sentences
        for i in range(n_sentences):
            for j in range(n_sentences):
                sent1_lemmas = {token.lemma_ for token in sentences[i]
                                if token.is_alpha and not token.is_stop}
                sent2_lemmas = {token.lemma_ for token in sentences[j]
                                if token.is_alpha and not token.is_stop}

                if sent1_lemmas and sent2_lemmas:
                    intersection = len(sent1_lemmas & sent2_lemmas)
                    union = len(sent1_lemmas | sent2_lemmas)
                    similarity_matrix[i, j] = intersection / union if union > 0 else 0

        # Build the visualization
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(similarity_matrix,
                    cmap='YlOrRd',
                    square=True,
                    xticklabels=False,
                    yticklabels=False,
                    cbar_kws={'label': 'Cohesión'},
                    ax=ax)

        plt.title("Mapa de Cohesión Textual")
        plt.xlabel("Oraciones")
        plt.ylabel("Oraciones")
        plt.tight_layout()
        return fig

    except Exception as e:
        logger.error(f"Error en create_cohesion_heatmap: {str(e)}")
        return None
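
# Minimal usage sketch (not part of the module's public API). It assumes a
# Spanish spaCy pipeline such as "es_core_news_sm" is installed — that model
# choice is an assumption; swap in whatever pipeline the application loads.
if __name__ == "__main__":
    import spacy

    nlp = spacy.load("es_core_news_sm")  # hypothetical model choice
    sample = (
        "La cohesión textual depende de cómo se conectan las oraciones. "
        "Cada oración debería retomar ideas de la anterior."
    )
    doc = nlp(sample)

    metrics = analyze_text_dimensions(doc)
    for dimension, scores in metrics.items():
        print(f"{dimension}: raw={scores['raw_score']:.3f}, "
              f"normalized={scores['normalized_score']:.3f}")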