import bz2
import json
import os
import re
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    def __init__(self, max_articles=None):
        super().__init__()
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles
        # Current element state
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                self.article_count += 1
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        # SAX may deliver character data in several chunks, so accumulate it
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            # Skip empty links
            if not link:
                continue
            processed_links.append(link)
        # Remove duplicates and return
        return list(set(processed_links))
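
# Illustrative sketch (not part of the original code): what extract_links
# returns for a small, made-up piece of wikitext.
#
#   handler = WikiContentHandler()
#   sample = "See [[Python (programming language)|Python]] and [[CWI]], plus [[File:Logo.png]]."
#   handler.extract_links(sample)
#   # -> ['Python (programming language)', 'CWI']  (order may vary; 'File:Logo.png' is skipped)
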

def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the saved JSON file
    """
    print(f"Parsing Wikipedia dump: {dump_path}")
    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)
    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
    print(f"Extracted {content_handler.article_count} articles with their links.")
    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)
    print(f"Data saved to {output_file}")
    return output_file
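
# Hypothetical programmatic use (the dump filename is a placeholder, not something
# referenced by the original script; the output directory is assumed to exist):
#
#   parse_wiki_dump("simplewiki-latest-pages-articles.xml.bz2", "data", max_articles=500)
#   # -> "data/wiki_data.json" mapping each title to {'title', 'text', 'links'}
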

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
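
# Example invocation (illustrative; the script and dump file names are placeholders):
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 ./output --max-articles 1000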