import bz2
import os
import re
import sqlite3
import time
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )

                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )

                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()

                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))

    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )

        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )

        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []
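
# Illustration only (not executed by the pipeline): what extract_links yields for a
# small piece of wikitext. The sample text and titles below are made up; the result
# order may vary because duplicates are removed via a set.
#
#   h = WikiContentHandler(sqlite3.connect(":memory:"))
#   h.extract_links("See [[Python (programming language)|Python]] and [[Guido van Rossum#Early life]].")
#   # -> ['Python (programming language)', 'Guido van Rossum']  (order may vary)
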
def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()

    # Create articles table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            title TEXT PRIMARY KEY,
            text TEXT
        )
    ''')

    # Create links table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_title TEXT,
            target_title TEXT,
            FOREIGN KEY (source_title) REFERENCES articles (title),
            UNIQUE (source_title, target_title)
        )
    ''')

    # Create index on links for faster queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')

    db_conn.commit()


def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links into SQLite database.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)

    # Create schema
    create_db_schema(db_conn)

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()

    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")

    return db_path


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
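
# Example queries against the resulting database (a sketch; "wiki.db" and the article
# title are placeholders, but the table and column names match the schema created above):
#
#   import sqlite3
#   conn = sqlite3.connect("wiki.db")
#   cur = conn.cursor()
#   cur.execute("SELECT COUNT(*) FROM articles")
#   print(cur.fetchone()[0])                     # number of extracted articles
#   cur.execute("SELECT target_title FROM links WHERE source_title = ?", ("Albert Einstein",))
#   print([row[0] for row in cur.fetchall()])    # outgoing links from one article
#   conn.close()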