import bz2
import json
import re
import os
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    """SAX handler that collects main-namespace articles and their outgoing links."""

    def __init__(self, max_articles=None):
        super().__init__()
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles
        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main-namespace articles (ns == 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                self.article_count += 1
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        # Accumulate character data for whichever element is currently open
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext."""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories, which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            # Remove any section anchor (the part after #)
            link = link.split('#')[0].strip()
            # Skip empty links
            if not link:
                continue
            processed_links.append(link)
        # Remove duplicates and return
        return list(set(processed_links))
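
# For illustration: given wikitext such as
#   "[[Python (programming language)|Python]], [[Category:Languages]], [[Syntax#History]], [[File:Example.png|thumb]]"
# extract_links keeps "Python (programming language)", "Category:Languages", and
# "Syntax" (the section anchor is dropped), while "File:Example.png" is skipped
# because of its non-Category namespace prefix. Duplicates collapse via set(), so
# the order of the returned list is not deterministic.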

def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.

    Args:
        dump_path: Path to the bz2-compressed Wikipedia dump
        output_path: Directory in which to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")
    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)
    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
    print(f"Extracted {content_handler.article_count} articles with their links.")
    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)
    print(f"Data saved to {output_file}")
    return output_file
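
# A minimal sketch of reading the output back in; load_wiki_data is an illustrative
# helper (name assumed, not used elsewhere in this script). The saved JSON maps each
# article title to a dict with 'title', 'text', and 'links' keys, as built above.
def load_wiki_data(json_path):
    """Load the wiki_data.json file written by parse_wiki_dump."""
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# e.g. data = load_wiki_data('wiki_data.json'); print(data['Some title']['links'])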


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Directory in which to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)
    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
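
# Example invocation (file and script names are placeholders):
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 ./output --max-articles 500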