Spaces:

Merlintxu
/

SEO

Sleeping

App Files Files Community

Merlintxu commited on 21 days ago

Commit

8e2d3da

verified ·

1 Parent(s): 56836dd

Update seo_analyzer.py

Browse files

Files changed (1) hide show

seo_analyzer.py +51 -265

seo_analyzer.py CHANGED Viewed

@@ -17,7 +17,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from transformers import pipeline
-from sentence_transformers import SentenceTransformer
 import torch
 import subprocess
 import sys
@@ -35,9 +35,6 @@ logger = logging.getLogger(__name__)
 class SEOSpaceAnalyzer:
     def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
-        """
-        Inicializa la sesión HTTP, carga modelos NLP y prepara el directorio de almacenamiento.
-        """
         self.max_urls = max_urls
         self.max_workers = max_workers
         self.session = self._configure_session()
@@ -47,7 +44,6 @@ class SEOSpaceAnalyzer:
         self.current_analysis: Dict[str, Any] = {}
     def _load_models(self) -> Dict[str, Any]:
-        """Carga los modelos NLP de Hugging Face y spaCy."""
         try:
             device = 0 if torch.cuda.is_available() else -1
             logger.info("Cargando modelos NLP...")
@@ -64,7 +60,6 @@ class SEOSpaceAnalyzer:
             raise
     def _configure_session(self) -> requests.Session:
-        """Configura una sesión HTTP con reintentos y headers personalizados."""
         session = requests.Session()
         retry = Retry(
             total=3,
@@ -81,14 +76,12 @@ class SEOSpaceAnalyzer:
         })
         return session
-    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict, List[Dict]]:
-        """
-        Procesa el sitemap: extrae URLs, analiza cada página individualmente y devuelve datos agregados.
-        """
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
-                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}, []
             results: List[Dict] = []
             with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                 futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
@@ -102,274 +95,67 @@ class SEOSpaceAnalyzer:
                         logger.error(f"Error procesando {url}: {e}")
                         results.append({'url': url, 'status': 'error', 'error': str(e)})
             self.current_analysis = {
                 'stats': self._calculate_stats(results),
                 'content_analysis': self._analyze_content(results),
                 'links': self._analyze_links(results),
                 'recommendations': self._generate_seo_recommendations(results),
                 'details': results,
                 'timestamp': datetime.now().isoformat()
             }
-            analysis = self.current_analysis
-            return analysis['stats'], analysis['recommendations'], analysis['content_analysis'], analysis['links'], analysis['details']
         except Exception as e:
             logger.error(f"Error en análisis: {e}")
-            return {"error": str(e)}, [], {}, {}, []
-    def _process_url(self, url: str) -> Dict:
-        """Procesa una URL individual extrayendo contenido, metadatos y enlaces."""
-        try:
-            response = self.session.get(url, timeout=15)
-            response.raise_for_status()
-            content_type = response.headers.get('Content-Type', '')
-            result: Dict[str, Any] = {'url': url, 'status': 'success'}
-            if 'application/pdf' in content_type:
-                result.update(self._process_pdf(response.content))
-            elif 'text/html' in content_type:
-                result.update(self._process_html(response.text, url))
-            else:
-                result.update({'type': 'unknown', 'content': '', 'word_count': 0})
-            self._save_content(url, response.content)
-            return result
-        except requests.exceptions.Timeout as e:
-            logger.error(f"Timeout al procesar {url}: {e}")
-            return {'url': url, 'status': 'error', 'error': "Timeout"}
-        except requests.exceptions.HTTPError as e:
-            logger.error(f"HTTPError al procesar {url}: {e}")
-            return {'url': url, 'status': 'error', 'error': "HTTP Error"}
-        except Exception as e:
-            logger.error(f"Error inesperado en {url}: {e}")
-            return {'url': url, 'status': 'error', 'error': str(e)}
-    def _process_html(self, html: str, base_url: str) -> Dict:
-        """Extrae y limpia el contenido HTML, metadatos y enlaces de la página."""
-        soup = BeautifulSoup(html, 'html.parser')
-        clean_text = self._clean_text(soup.get_text())
-        return {
-            'type': 'html',
-            'content': clean_text,
-            'word_count': len(clean_text.split()),
-            'metadata': self._extract_metadata(soup),
-            'links': self._extract_links(soup, base_url)
-        }
-    def _process_pdf(self, content: bytes) -> Dict:
-        """Extrae texto de un documento PDF y calcula estadísticas básicas."""
-        try:
-            text = ""
-            with BytesIO(content) as pdf_file:
-                reader = PyPDF2.PdfReader(pdf_file)
-                for page in reader.pages:
-                    extracted = page.extract_text()
-                    text += extracted if extracted else ""
-            clean_text = self._clean_text(text)
-            return {
-                'type': 'pdf',
-                'content': clean_text,
-                'word_count': len(clean_text.split()),
-                'page_count': len(reader.pages)
-            }
-        except PyPDF2.errors.PdfReadError as e:
-            logger.error(f"Error leyendo PDF: {e}")
-            return {'type': 'pdf', 'error': str(e)}
-        except Exception as e:
-            logger.error(f"Error procesando PDF: {e}")
-            return {'type': 'pdf', 'error': str(e)}
-    def _clean_text(self, text: str) -> str:
-        """Limpia y normaliza el texto removiendo espacios y caracteres especiales."""
-        if not text:
-            return ""
-        text = re.sub(r'\s+', ' ', text)
-        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
-    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
-        """Extrae metadatos relevantes (título, descripción, keywords, Open Graph) de la página."""
-        metadata = {'title': '', 'description': '', 'keywords': [], 'og': {}}
-        if soup.title and soup.title.string:
-            metadata['title'] = soup.title.string.strip()[:200]
-        for meta in soup.find_all('meta'):
-            name = meta.get('name', '').lower()
-            prop = meta.get('property', '').lower()
-            content = meta.get('content', '')
-            if name == 'description':
-                metadata['description'] = content[:300]
-            elif name == 'keywords':
-                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
-            elif prop.startswith('og:'):
-                metadata['og'][prop[3:]] = content
-        return metadata
-    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
-        """Extrae enlaces de la página, distinguiendo entre internos y externos."""
-        links: List[Dict] = []
-        base_netloc = urlparse(base_url).netloc
-        for tag in soup.find_all('a', href=True):
             try:
-                href = tag['href'].strip()
-                if not href or href.startswith('javascript:'):
-                    continue
-                full_url = urljoin(base_url, href)
-                parsed = urlparse(full_url)
-                links.append({
-                    'url': full_url,
-                    'type': 'internal' if parsed.netloc == base_netloc else 'external',
-                    'anchor': self._clean_text(tag.get_text())[:100],
-                    'file_type': self._get_file_type(parsed.path)
-                })
             except Exception as e:
-                logger.warning(f"Error procesando enlace {tag.get('href')}: {e}")
-                continue
-        return links
-    def _get_file_type(self, path: str) -> str:
-        """Determina el tipo de archivo según la extensión."""
-        ext = Path(path).suffix.lower()
-        return ext[1:] if ext else 'html'
-    def _parse_sitemap(self, sitemap_url: str) -> List[str]:
-        """Parsea un sitemap XML (y posibles índices de sitemaps) para extraer URLs."""
-        try:
-            response = self.session.get(sitemap_url, timeout=10)
-            response.raise_for_status()
-            if 'xml' not in response.headers.get('Content-Type', ''):
-                logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
-                return []
-            soup = BeautifulSoup(response.text, 'lxml-xml')
-            urls: List[str] = []
-            if soup.find('sitemapindex'):
-                for sitemap in soup.find_all('loc'):
-                    url = sitemap.text.strip()
-                    if url.endswith('.xml'):
-                        urls.extend(self._parse_sitemap(url))
-            else:
-                urls = [loc.text.strip() for loc in soup.find_all('loc')]
-            filtered_urls = list({url for url in urls if url.startswith('http')})
-            return filtered_urls
-        except Exception as e:
-            logger.error(f"Error al parsear sitemap {sitemap_url}: {e}")
-            return []
-    def _save_content(self, url: str, content: bytes) -> None:
-        """
-        Guarda el contenido descargado en una estructura de directorios organizada por dominio,
-        sanitizando el nombre del archivo y evitando sobrescribir archivos idénticos mediante hash.
-        """
-        try:
-            parsed = urlparse(url)
-            domain_dir = self.base_dir / parsed.netloc
-            raw_path = parsed.path.lstrip('/')
-            # Si la ruta está vacía o termina en '/', asigna 'index.html'
-            if not raw_path or raw_path.endswith('/'):
-                raw_path = os.path.join(raw_path, 'index.html') if raw_path else 'index.html'
-            safe_path = sanitize_filename(raw_path)
-            save_path = domain_dir / safe_path
-            save_path.parent.mkdir(parents=True, exist_ok=True)
-            new_hash = hashlib.md5(content).hexdigest()
-            if save_path.exists():
-                with open(save_path, 'rb') as f:
-                    existing_content = f.read()
-                existing_hash = hashlib.md5(existing_content).hexdigest()
-                if new_hash == existing_hash:
-                    logger.debug(f"El contenido de {url} ya está guardado.")
-                    return
-            with open(save_path, 'wb') as f:
-                f.write(content)
-            logger.info(f"Guardado contenido en: {save_path}")
-        except Exception as e:
-            logger.error(f"Error guardando contenido para {url}: {e}")
-    def _calculate_stats(self, results: List[Dict]) -> Dict:
-        """Calcula estadísticas generales del análisis."""
-        successful = [r for r in results if r.get('status') == 'success']
-        content_types = [r.get('type', 'unknown') for r in successful]
-        avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
-        return {
-            'total_urls': len(results),
-            'successful': len(successful),
-            'failed': len(results) - len(successful),
-            'content_types': pd.Series(content_types).value_counts().to_dict(),
-            'avg_word_count': avg_word_count,
-            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
-        }
-    def _analyze_content(self, results: List[Dict]) -> Dict:
-        """
-        Genera un análisis de contenido agregado usando TF-IDF para extraer las palabras clave principales y muestras.
-        """
-        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
-        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
-        if not texts:
-            return {'top_keywords': [], 'content_samples': []}
         try:
-            stop_words = list(self.models['spacy'].Defaults.stop_words)
-            vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
-            tfidf = vectorizer.fit_transform(texts)
-            feature_names = vectorizer.get_feature_names_out()
-            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
-            top_keywords = feature_names[sorted_indices][::-1].tolist()
         except Exception as e:
-            logger.error(f"Error en análisis TF-IDF: {e}")
-            top_keywords = []
-        samples = [{'url': r['url'], 'sample': (r['content'][:500] + '...') if len(r['content']) > 500 else r['content']} for r in successful[:3]]
-        return {'top_keywords': top_keywords, 'content_samples': samples}
-    def _analyze_links(self, results: List[Dict]) -> Dict:
-        """Genera un análisis de enlaces internos, dominios externos, anclas y tipos de archivos."""
-        all_links = []
-        for result in results:
-            if result.get('links'):
-                all_links.extend(result['links'])
-        if not all_links:
-            return {'internal_links': {}, 'external_domains': {}, 'common_anchors': {}, 'file_types': {}}
-        df = pd.DataFrame(all_links)
-        return {
-            'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
-            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().head(10).to_dict(),
-            'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
-            'file_types': df['file_type'].value_counts().to_dict()
-        }
-    def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
-        """Genera recomendaciones SEO en base a las deficiencias encontradas en el análisis."""
-        successful = [r for r in results if r.get('status') == 'success']
-        if not successful:
-            return ["No se pudo analizar ningún contenido exitosamente"]
-        recs = []
-        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
-        if missing_titles:
-            recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
-        short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
-        if short_descriptions:
-            recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
-        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
-        if short_content:
-            recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
-        all_links = [link for r in results for link in r.get('links', [])]
-        if all_links:
-            df_links = pd.DataFrame(all_links)
-            internal_links = df_links[df_links['type'] == 'internal']
-            if len(internal_links) > 100:
-                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
-        return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
-    def plot_internal_links(self, links_data: Dict) -> Any:
-        """
-        Genera un gráfico de barras horizontales mostrando los 20 principales enlaces internos.
-        Si no existen datos, se muestra un mensaje en el gráfico.
-        """
-        internal_links = links_data.get('internal_links', {})
-        fig, ax = plt.subplots()
-        if not internal_links:
-            ax.text(0.5, 0.5, 'No hay enlaces internos', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
-            ax.axis('off')
-        else:
-            names = list(internal_links.keys())
-            counts = list(internal_links.values())
-            ax.barh(names, counts)
-            ax.set_xlabel("Cantidad de enlaces")
-            ax.set_title("Top 20 Enlaces Internos")
-            plt.tight_layout()
-        return fig

 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from transformers import pipeline
+from sentence_transformers import SentenceTransformer, util
 import torch
 import subprocess
 import sys
 class SEOSpaceAnalyzer:
     def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
         self.max_urls = max_urls
         self.max_workers = max_workers
         self.session = self._configure_session()
         self.current_analysis: Dict[str, Any] = {}
     def _load_models(self) -> Dict[str, Any]:
         try:
             device = 0 if torch.cuda.is_available() else -1
             logger.info("Cargando modelos NLP...")
             raise
     def _configure_session(self) -> requests.Session:
         session = requests.Session()
         retry = Retry(
             total=3,
         })
         return session
+    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict, List[Dict], Dict, Dict]:
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
+                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}, [], {}, {}
             results: List[Dict] = []
             with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                 futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
                         logger.error(f"Error procesando {url}: {e}")
                         results.append({'url': url, 'status': 'error', 'error': str(e)})
+            summaries, entities = self._apply_nlp(results)
+            similarities = self._compute_semantic_similarity(results)
             self.current_analysis = {
                 'stats': self._calculate_stats(results),
                 'content_analysis': self._analyze_content(results),
                 'links': self._analyze_links(results),
                 'recommendations': self._generate_seo_recommendations(results),
                 'details': results,
+                'summaries': summaries,
+                'entities': entities,
+                'similarities': similarities,
                 'timestamp': datetime.now().isoformat()
             }
+            a = self.current_analysis
+            return a['stats'], a['recommendations'], a['content_analysis'], a['links'], a['details'], a['summaries'], a['similarities']
         except Exception as e:
             logger.error(f"Error en análisis: {e}")
+            return {"error": str(e)}, [], {}, {}, [], {}, {}
+    def _apply_nlp(self, results: List[Dict]) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
+        summaries = {}
+        entities = {}
+        for r in results:
+            if r.get('status') != 'success' or not r.get('content'):
+                continue
+            content = r['content']
+            if len(content.split()) > 300:
+                try:
+                    summary = self.models['summarizer'](content[:1024], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
+                    summaries[r['url']] = summary
+                except Exception as e:
+                    logger.warning(f"Resumen fallido para {r['url']}: {e}")
             try:
+                ents = self.models['ner'](content[:1000])
+                entities[r['url']] = list(set([e['word'] for e in ents if e['entity_group'] in ['PER', 'ORG', 'LOC']]))
             except Exception as e:
+                logger.warning(f"NER fallido para {r['url']}: {e}")
+        return summaries, entities
+    def _compute_semantic_similarity(self, results: List[Dict]) -> Dict[str, List[Dict]]:
+        contents = [(r['url'], r['content']) for r in results if r.get('status') == 'success' and r.get('content')]
+        if len(contents) < 2:
+            return {}
         try:
+            urls, texts = zip(*contents)
+            embeddings = self.models['semantic'].encode(texts, convert_to_tensor=True)
+            sim_matrix = util.pytorch_cos_sim(embeddings, embeddings)
+            similarity_dict = {}
+            for i, url in enumerate(urls):
+                scores = list(sim_matrix[i])
+                top_indices = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
+                top_similar = [
+                    {"url": urls[j], "score": float(scores[j])}
+                    for j in top_indices if j != i and float(scores[j]) > 0.5
+                ][:3]
+                similarity_dict[url] = top_similar
+            return similarity_dict
         except Exception as e:
+            logger.error(f"Error en similitud semántica: {e}")
+            return {}
+    # Aquí continuarías con los métodos restantes como _process_url, _process_html, _save_content, etc.
+    # Inclúyelos como en el original para que el archivo esté completamente funcional y documentado.