# wikihop-server/db/wiki_parser_json.py
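"""Parse a bz2-compressed Wikipedia XML dump with a streaming SAX handler,
keep main-namespace (ns=0) articles, extract the targets of their [[wiki links]],
and write the result to a single JSON file (wiki_data.json)."""
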
import bz2
import json
import re
import os
from pathlib import Path
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    """SAX content handler that collects main-namespace articles and their links."""

    def __init__(self, max_articles=None):
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Check if we've reached the maximum number of articles;
                # raising here aborts the SAX parse and is caught in parse_wiki_dump()
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        # Accumulate character data for whichever element we are currently inside
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))
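
    # Illustrative example (made-up wikitext): for the input
    #   "See [[Python (programming language)|Python]] and [[Help:Editing]] or [[Lists#Syntax]]."
    # extract_links() returns, in arbitrary order, ['Python (programming language)', 'Lists']:
    # the piped label is dropped, the "Help:" namespace link is skipped, and the
    # "#Syntax" section anchor is stripped.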


def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        with bz2.BZ2File(dump_path) as dump_file:
            parser.parse(dump_file)
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")

    print(f"Extracted {content_handler.article_count} articles with their links.")

    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)

    print(f"Data saved to {output_file}")
    return output_file
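

# Example call from Python (file and directory names are illustrative):
#   parse_wiki_dump("enwiki-latest-pages-articles.xml.bz2", "data/", max_articles=1000)
# The resulting wiki_data.json maps each article title to
#   {"title": ..., "text": ..., "links": [...]}.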


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
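
    # Example invocation (dump file name is illustrative):
    #   python wiki_parser_json.py enwiki-latest-pages-articles.xml.bz2 ./wiki_output --max-articles 5000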