Update seo_analyzer.py
seo_analyzer.py  CHANGED  (+27, -85)
@@ -26,7 +26,6 @@ import matplotlib.pyplot as plt
 
 from utils import sanitize_filename
 
-# Configuración de logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
@@ -37,10 +36,6 @@ class SEOSpaceAnalyzer:
     def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
         """
         Inicializa la sesión HTTP, carga modelos NLP y prepara el directorio de almacenamiento.
-
-        Args:
-            max_urls: Número máximo de URLs a procesar por análisis.
-            max_workers: Número de hilos para la ejecución concurrente.
         """
         self.max_urls = max_urls
         self.max_workers = max_workers
@@ -66,23 +61,6 @@ class SEOSpaceAnalyzer:
         except Exception as e:
             logger.error(f"Error cargando modelos: {e}")
             raise
-    def plot_internal_links(self, links_data: dict) -> any:
-        """Genera un gráfico de barras horizontales mostrando los 20 principales enlaces internos."""
-        internal_links = links_data.get('internal_links', {})
-        if not internal_links:
-            # Crear una figura que indique que no hay datos
-            fig, ax = plt.subplots()
-            ax.text(0.5, 0.5, 'No hay enlaces internos', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
-            ax.axis('off')
-            return fig
-        fig, ax = plt.subplots()
-        names = list(internal_links.keys())
-        counts = list(internal_links.values())
-        ax.barh(names, counts)
-        ax.set_xlabel("Cantidad de enlaces")
-        ax.set_title("Top 20 Enlaces Internos")
-        plt.tight_layout()
-        return fig
 
     def _configure_session(self) -> requests.Session:
         """Configura una sesión HTTP con reintentos y headers personalizados."""
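For reference, the bar-chart pattern used by the removed helper can be reproduced on its own; the link counts below are made-up sample data, and saving to a PNG is just one way to consume the returned figure:

import matplotlib.pyplot as plt

# Made-up sample mirroring links_data['internal_links'] ({path: count})
internal_links = {'/inicio': 12, '/blog': 7, '/contacto': 3}

fig, ax = plt.subplots()
ax.barh(list(internal_links.keys()), list(internal_links.values()))
ax.set_xlabel("Cantidad de enlaces")
ax.set_title("Top 20 Enlaces Internos")
plt.tight_layout()
fig.savefig("internal_links.png")  # or plt.show() in an interactive session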
@@ -104,18 +82,7 @@ class SEOSpaceAnalyzer:
 
     def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict, List[Dict]]:
         """
-        Procesa el sitemap: extrae URLs, analiza cada página
-
-        Args:
-            sitemap_url: URL del sitemap XML.
-
-        Returns:
-            Una tupla con 5 elementos:
-              - Estadísticas generales (dict)
-              - Recomendaciones SEO (lista de strings)
-              - Análisis de contenido agregado (dict)
-              - Análisis de enlaces (dict)
-              - Detalle individual de cada URL procesada (lista de dicts)
+        Procesa el sitemap: extrae URLs, analiza cada página y devuelve datos agregados.
         """
         try:
             urls = self._parse_sitemap(sitemap_url)
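As a usage sketch (not part of this diff), a caller would unpack the five-element tuple declared in the signature above, in the order listed by the removed docstring; the sitemap URL is a placeholder:

analyzer = SEOSpaceAnalyzer(max_urls=20, max_workers=4)
stats, recommendations, content_analysis, links_analysis, page_details = analyzer.analyze_sitemap(
    "https://example.com/sitemap.xml"  # placeholder URL
)
print(stats)             # estadísticas generales (dict)
for rec in recommendations:
    print(rec)           # recomendaciones SEO (list of strings)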
@@ -163,15 +130,18 @@ class SEOSpaceAnalyzer:
                 result.update({'type': 'unknown', 'content': '', 'word_count': 0})
             self._save_content(url, response.content)
             return result
-        except requests.exceptions.
-            logger.
-            return {'url': url, 'status': 'error', 'error':
+        except requests.exceptions.Timeout as e:
+            logger.error(f"Timeout al procesar {url}: {e}")
+            return {'url': url, 'status': 'error', 'error': "Timeout"}
+        except requests.exceptions.HTTPError as e:
+            logger.error(f"HTTPError al procesar {url}: {e}")
+            return {'url': url, 'status': 'error', 'error': "HTTP Error"}
         except Exception as e:
-            logger.error(f"Error inesperado en {url}: {
+            logger.error(f"Error inesperado en {url}: {e}")
             return {'url': url, 'status': 'error', 'error': str(e)}
 
     def _process_html(self, html: str, base_url: str) -> Dict:
-        """Extrae y limpia el contenido HTML, metadatos y enlaces
+        """Extrae y limpia el contenido HTML, metadatos y enlaces."""
         soup = BeautifulSoup(html, 'html.parser')
         clean_text = self._clean_text(soup.get_text())
         return {
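The narrowed handlers above use only documented requests exception classes; a self-contained sketch of the same pattern, with fetch_page standing in for the method around it:

import logging
import requests

logger = logging.getLogger(__name__)

def fetch_page(url: str) -> dict:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
        return {'url': url, 'status': 'success', 'content': response.text}
    except requests.exceptions.Timeout as e:
        logger.error(f"Timeout al procesar {url}: {e}")
        return {'url': url, 'status': 'error', 'error': "Timeout"}
    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTPError al procesar {url}: {e}")
        return {'url': url, 'status': 'error', 'error': "HTTP Error"}
    except Exception as e:
        logger.error(f"Error inesperado en {url}: {e}")
        return {'url': url, 'status': 'error', 'error': str(e)}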
@@ -198,9 +168,12 @@ class SEOSpaceAnalyzer:
                 'word_count': len(clean_text.split()),
                 'page_count': len(reader.pages)
             }
-        except PyPDF2.PdfReadError as e:
+        except PyPDF2.errors.PdfReadError as e:
             logger.error(f"Error leyendo PDF: {e}")
             return {'type': 'pdf', 'error': str(e)}
+        except Exception as e:
+            logger.error(f"Error procesando PDF: {e}")
+            return {'type': 'pdf', 'error': str(e)}
 
     def _clean_text(self, text: str) -> str:
         """Limpia y normaliza el texto removiendo espacios y caracteres especiales."""
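A minimal standalone sketch of the PDF branch, assuming a PyPDF2 release that exposes PdfReader and PyPDF2.errors.PdfReadError (the class named by the new except clause); extract_pdf_info is an illustrative helper name, not a method from the file:

import io
import logging
import PyPDF2

logger = logging.getLogger(__name__)

def extract_pdf_info(raw: bytes) -> dict:
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(raw))
        text = " ".join((page.extract_text() or "") for page in reader.pages)
        return {'type': 'pdf', 'word_count': len(text.split()), 'page_count': len(reader.pages)}
    except PyPDF2.errors.PdfReadError as e:
        logger.error(f"Error leyendo PDF: {e}")
        return {'type': 'pdf', 'error': str(e)}
    except Exception as e:
        logger.error(f"Error procesando PDF: {e}")
        return {'type': 'pdf', 'error': str(e)}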
@@ -210,7 +183,7 @@ class SEOSpaceAnalyzer:
         return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
 
     def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
-        """Extrae metadatos relevantes
+        """Extrae metadatos relevantes de la página."""
         metadata = {'title': '', 'description': '', 'keywords': [], 'og': {}}
         if soup.title and soup.title.string:
             metadata['title'] = soup.title.string.strip()[:200]
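The regex in the context line above keeps word characters, whitespace, and Spanish accented letters, swapping everything else for spaces; a quick illustration:

import re

sample = "¡Hola, mundo! Más info: https://example.com"
print(re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', sample).strip())
# Punctuation and URL separators become spaces; accented letters are preserved.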
@@ -227,7 +200,7 @@ class SEOSpaceAnalyzer:
         return metadata
 
     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
-        """Extrae
+        """Extrae y clasifica los enlaces internos y externos."""
         links: List[Dict] = []
         base_netloc = urlparse(base_url).netloc
         for tag in soup.find_all('a', href=True):
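The body that classifies each link is not part of this hunk; a hedged sketch, assuming the split is done by comparing each resolved href's netloc against the base_netloc set above:

from urllib.parse import urljoin, urlparse

base_url = "https://example.com/blog/"
base_netloc = urlparse(base_url).netloc
for href in ["/contacto", "https://example.com/inicio", "https://otro-sitio.com/"]:
    absolute = urljoin(base_url, href)
    kind = 'internal' if urlparse(absolute).netloc == base_netloc else 'external'
    print(kind, absolute)
# -> internal https://example.com/contacto
# -> internal https://example.com/inicio
# -> external https://otro-sitio.com/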
@@ -249,12 +222,12 @@ class SEOSpaceAnalyzer:
         return links
 
     def _get_file_type(self, path: str) -> str:
-        """Determina el tipo de archivo
+        """Determina el tipo de archivo basado en la extensión."""
         ext = Path(path).suffix.lower()
         return ext[1:] if ext else 'html'
 
     def _parse_sitemap(self, sitemap_url: str) -> List[str]:
-        """Parsea un sitemap XML
+        """Parsea un sitemap XML para extraer URLs."""
         try:
             response = self.session.get(sitemap_url, timeout=10)
             response.raise_for_status()
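The two-line body of _get_file_type is fully visible, so its behaviour can be checked in isolation:

from pathlib import Path

def file_type(path: str) -> str:
    ext = Path(path).suffix.lower()
    return ext[1:] if ext else 'html'

print(file_type("/docs/informe.PDF"))  # -> pdf
print(file_type("/blog/entrada"))      # -> html (no extension, default)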
@@ -284,10 +257,11 @@ class SEOSpaceAnalyzer:
         try:
             parsed = urlparse(url)
             domain_dir = self.base_dir / parsed.netloc
-
-
-
-
+            raw_path = parsed.path.lstrip('/')
+            # Si la ruta está vacía o termina en '/', asigna 'index.html'
+            if not raw_path or raw_path.endswith('/'):
+                raw_path = os.path.join(raw_path, 'index.html') if raw_path else 'index.html'
+            safe_path = sanitize_filename(raw_path)
             save_path = domain_dir / safe_path
             save_path.parent.mkdir(parents=True, exist_ok=True)
             new_hash = hashlib.md5(content).hexdigest()
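A sketch of the path normalization added in this hunk; sanitize_filename lives in utils and its behaviour is not shown in the diff, so it is stubbed as an identity function here:

import os
from pathlib import Path
from urllib.parse import urlparse

def sanitize_filename(path: str) -> str:
    return path  # placeholder for utils.sanitize_filename

def save_location(base_dir: Path, url: str) -> Path:
    parsed = urlparse(url)
    domain_dir = base_dir / parsed.netloc
    raw_path = parsed.path.lstrip('/')
    # Empty paths or paths ending in '/' fall back to index.html
    if not raw_path or raw_path.endswith('/'):
        raw_path = os.path.join(raw_path, 'index.html') if raw_path else 'index.html'
    return domain_dir / sanitize_filename(raw_path)

print(save_location(Path("content"), "https://example.com/"))       # content/example.com/index.html
print(save_location(Path("content"), "https://example.com/blog/"))  # content/example.com/blog/index.html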
@@ -319,7 +293,9 @@ class SEOSpaceAnalyzer:
         }
 
     def _analyze_content(self, results: List[Dict]) -> Dict:
-        """
+        """
+        Analiza el contenido agregado usando TF-IDF para extraer las palabras clave principales y muestras.
+        """
         successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
         texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
         if not texts:
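The TF-IDF computation referenced by the new docstring sits outside this hunk; as an assumption-laden sketch, the top keywords could be derived with scikit-learn's TfidfVectorizer along these lines:

from sklearn.feature_extraction.text import TfidfVectorizer

def top_keywords(texts, n=10):
    vectorizer = TfidfVectorizer(max_features=1000)
    matrix = vectorizer.fit_transform(texts)   # rows: documents, columns: terms
    scores = matrix.sum(axis=0).A1             # aggregate TF-IDF weight per term
    terms = vectorizer.get_feature_names_out()
    ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:n]]

print(top_keywords(["análisis seo del sitio", "enlaces internos del sitio", "contenido del blog"], n=5))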
@@ -338,7 +314,7 @@ class SEOSpaceAnalyzer:
         return {'top_keywords': top_keywords, 'content_samples': samples}
 
     def _analyze_links(self, results: List[Dict]) -> Dict:
-        """Genera un análisis de enlaces internos
+        """Genera un análisis de enlaces internos y externos."""
         all_links = []
         for result in results:
             if result.get('links'):
@@ -354,38 +330,4 @@ class SEOSpaceAnalyzer:
         }
 
     def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
-        """Genera recomendaciones SEO en
-        successful = [r for r in results if r.get('status') == 'success']
-        if not successful:
-            return ["No se pudo analizar ningún contenido exitosamente"]
-        recs = []
-        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
-        if missing_titles:
-            recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
-        short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
-        if short_descriptions:
-            recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
-        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
-        if short_content:
-            recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
-        all_links = [link for r in results for link in r.get('links', [])]
-        if all_links:
-            df_links = pd.DataFrame(all_links)
-            internal_links = df_links[df_links['type'] == 'internal']
-            if len(internal_links) > 100:
-                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
-        return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
-
-    def plot_internal_links(self, links_data: Dict) -> Any:
-        """Genera un gráfico de barras horizontales mostrando los 20 principales enlaces internos."""
-        internal_links = links_data.get('internal_links', {})
-        if not internal_links:
-            return {}
-        fig, ax = plt.subplots()
-        names = list(internal_links.keys())
-        counts = list(internal_links.values())
-        ax.barh(names, counts)
-        ax.set_xlabel("Cantidad de enlaces")
-        ax.set_title("Top 20 Enlaces Internos")
-        plt.tight_layout()
-        return fig
+        """Genera recomendaciones SEO basadas en las deficiencias encontradas."""
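The removed body above is plain counting over the per-page results; a tiny hand-made results list shows the kind of input it expects and the checks it performs:

results = [
    {'status': 'success', 'metadata': {'title': 'Inicio', 'description': ''}, 'word_count': 120},
    {'status': 'success', 'metadata': {'title': '', 'description': 'Acerca de'}, 'word_count': 800},
]
successful = [r for r in results if r.get('status') == 'success']
missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
print(missing_titles, short_descriptions, short_content)  # -> 1 1 1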