import bz2
import json
import re
import os
from xml.sax import make_parser, handler

class WikiContentHandler(handler.ContentHandler):
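    """SAX handler that collects main-namespace (ns=0) articles
    and the wiki links each article contains."""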
    def __init__(self, max_articles=None):
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles
        
        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []
        
    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []
    
    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                
                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                
                self.article_count += 1
                
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                
                # Stop once the maximum number of articles is reached;
                # the StopIteration is caught by the caller (parse_wiki_dump)
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
                    
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)
    
    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)
    
    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
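        # e.g. "[[Paris|the capital]] and [[France]]" matches 'Paris' and 'France'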
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
        
        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            
            # Skip empty links
            if not link:
                continue
            
            processed_links.append(link)
        
        # Remove duplicates and return
        return list(set(processed_links))

def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.
    
    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)
    
    Returns:
        The path to the saved JSON file
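    
    Example (file names are illustrative):
        parse_wiki_dump('enwiki-latest-pages-articles.xml.bz2', 'output',
                        max_articles=1000)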
    """
    print(f"Parsing Wikipedia dump: {dump_path}")
    
    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)
    
    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
    
    print(f"Extracted {content_handler.article_count} articles with their links.")
    
    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)
    
    print(f"Data saved to {output_file}")
    return output_file

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None, 
                        help='Maximum number of articles to extract (default: all)')
    
    args = parser.parse_args()
    
    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)
    
    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
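
# Illustrative usage (script, dump, and directory names are placeholders):
#
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 output --max-articles 1000
#
# The resulting output/wiki_data.json maps each article title to a dict with
# 'title', 'text', and 'links' keys, so it can be inspected like this:
#
#   import json
#   with open('output/wiki_data.json', encoding='utf-8') as f:
#       wiki_data = json.load(f)
#   title = next(iter(wiki_data))
#   print(title, len(wiki_data[title]['links']))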