import bz2
import json
import os
import re
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    def __init__(self, max_articles=None):
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles

        # State for the elements currently being parsed
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links,
                }
                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories, which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))


def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """Parse the Wikipedia XML dump and extract articles with their links.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")

    print(f"Extracted {content_handler.article_count} articles with their links.")

    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)

    print(f"Data saved to {output_file}")
    return output_file


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
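
# Example invocation (a sketch: the script filename and dump filename below are
# assumptions, not taken from this script; substitute your own paths):
#
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 ./wiki_output --max-articles 1000
#
# This would extract up to 1,000 main-namespace articles from the compressed
# dump and write them, with their outgoing links, to ./wiki_output/wiki_data.json.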