# wikihop-server/db/wiki_parser_json.py
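"""Parse a bz2-compressed Wikipedia XML dump with a streaming SAX handler,
keep main-namespace (ns=0) articles, extract the targets of their [[wiki links]],
and write the result to a single JSON file (wiki_data.json)."""
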
import bz2
import json
import re
import os
from pathlib import Path
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    """SAX content handler that collects main-namespace articles and their links."""

    def __init__(self, max_articles=None):
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Check if we've reached the maximum number of articles;
                # raising here aborts the SAX parse and is caught in parse_wiki_dump()
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        # Accumulate character data for whichever element we are currently inside
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))
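
    # Illustrative example (made-up wikitext): for the input
    #   "See [[Python (programming language)|Python]] and [[Help:Editing]] or [[Lists#Syntax]]."
    # extract_links() returns, in arbitrary order, ['Python (programming language)', 'Lists']:
    # the piped label is dropped, the "Help:" namespace link is skipped, and the
    # "#Syntax" section anchor is stripped.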


def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        with bz2.BZ2File(dump_path) as dump_file:
            parser.parse(dump_file)
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")

    print(f"Extracted {content_handler.article_count} articles with their links.")

    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)

    print(f"Data saved to {output_file}")
    return output_file
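

# Example call from Python (file and directory names are illustrative):
#   parse_wiki_dump("enwiki-latest-pages-articles.xml.bz2", "data/", max_articles=1000)
# The resulting wiki_data.json maps each article title to
#   {"title": ..., "text": ..., "links": [...]}.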


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
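
    # Example invocation (dump file name is illustrative):
    #   python wiki_parser_json.py enwiki-latest-pages-articles.xml.bz2 ./wiki_output --max-articles 5000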