Commit e91ced9
Parent: 2c2bab6

sqlite backend
Files changed:
- README.md +3 -0
- db/wiki_db_sqlite.py +79 -0
- db/{wiki_parser.py → wiki_parser_json.py} +0 -0
- db/wiki_parser_sqlite.py +238 -0
- engine.py +6 -1
README.md CHANGED
@@ -1 +1,4 @@
 wget https://dumps.wikimedia.org/simplewiki/20250420/simplewiki-20250420-pages-articles-multistream.xml.bz2
+
+
+python db/wiki_parser_sqlite.py simplewiki-20250420-pages-articles-multistream.xml.bz2 db/data/wikihop.db --batch-size 10000
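For a quick smoke test before parsing the full dump, the parser's --max-articles flag (added below in db/wiki_parser_sqlite.py) caps extraction; the test database path here is only illustrative:

python db/wiki_parser_sqlite.py simplewiki-20250420-pages-articles-multistream.xml.bz2 db/data/wikihop-test.db --max-articles 1000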
db/wiki_db_sqlite.py ADDED
@@ -0,0 +1,79 @@
import sqlite3

class WikiDBSqlite:
    def __init__(self, db_path):
        """Initialize the database with path to SQLite database"""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row
        self.cursor = self.conn.cursor()
        self._article_count = self._get_article_count()
        print(f"Connected to SQLite database with {self._article_count} articles")

    def __del__(self):
        """Close database connection when object is destroyed"""
        if hasattr(self, 'conn') and self.conn:
            self.conn.close()

    def _get_article_count(self):
        """Get the number of articles in the database"""
        self.cursor.execute("SELECT COUNT(*) FROM articles")
        return self.cursor.fetchone()[0]

    def get_article_count(self):
        """Return the number of articles in the database"""
        return self._article_count

    def get_all_article_titles(self):
        """Return a list of all article titles"""
        self.cursor.execute("SELECT title FROM articles")
        return [row[0] for row in self.cursor.fetchall()]

    def get_article(self, title):
        """Get article data by title"""
        self.cursor.execute(
            "SELECT title, text FROM articles WHERE title = ?",
            (title,)
        )
        article = self.cursor.fetchone()

        if not article:
            return {}

        # Get links for this article
        self.cursor.execute(
            "SELECT target_title FROM links WHERE source_title = ?",
            (title,)
        )
        links = [row[0] for row in self.cursor.fetchall()]

        return {
            'title': article['title'],
            'text': article['text'],
            'links': links
        }

    def article_exists(self, title):
        """Check if an article exists in the database"""
        self.cursor.execute(
            "SELECT 1 FROM articles WHERE title = ? LIMIT 1",
            (title,)
        )
        return bool(self.cursor.fetchone())

    def get_article_text(self, title):
        """Get the text of an article"""
        self.cursor.execute(
            "SELECT text FROM articles WHERE title = ?",
            (title,)
        )
        result = self.cursor.fetchone()
        return result['text'] if result else ''

    def get_article_links(self, title):
        """Get the links of an article"""
        self.cursor.execute(
            "SELECT target_title FROM links WHERE source_title = ?",
            (title,)
        )
        return [row[0] for row in self.cursor.fetchall()]
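A minimal usage sketch of the read API added here, against the database the README builds at db/data/wikihop.db (the article title queried is just an example and may not exist in the dump):

from db.wiki_db_sqlite import WikiDBSqlite

db = WikiDBSqlite("db/data/wikihop.db")  # prints the article count on connect
print(db.get_article_count())
if db.article_exists("Python (programming language)"):  # example title
    article = db.get_article("Python (programming language)")
    print(article['links'][:10])  # outgoing links extracted from the wikitext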
db/{wiki_parser.py → wiki_parser_json.py} RENAMED
File without changes
db/wiki_parser_sqlite.py ADDED
@@ -0,0 +1,238 @@
import bz2
import re
import os
import sqlite3
from pathlib import Path
from xml.sax import make_parser, handler
import time

class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )

                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )

                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()

                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")

        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))

    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )

        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )

        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []

def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()

    # Create articles table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            title TEXT PRIMARY KEY,
            text TEXT
        )
    ''')

    # Create links table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_title TEXT,
            target_title TEXT,
            FOREIGN KEY (source_title) REFERENCES articles (title),
            UNIQUE (source_title, target_title)
        )
    ''')

    # Create index on links for faster queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')

    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')

    db_conn.commit()

def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links into SQLite database.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)

    # Create schema
    create_db_schema(db_conn)

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()

    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")
    return db_path

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')

    args = parser.parse_args()

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
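A note on the early-exit design: endElement raises StopIteration once max_articles is reached, which aborts the SAX stream; parse_wiki_dump catches it and treats it as normal completion. The function can also be driven from Python instead of the CLI; a sketch, where the sample output path is hypothetical:

from db.wiki_parser_sqlite import parse_wiki_dump

# Small capped run to validate the schema and link extraction end to end
parse_wiki_dump(
    "simplewiki-20250420-pages-articles-multistream.xml.bz2",  # dump from the README
    "db/data/wikihop-sample.db",  # hypothetical sample database path
    batch_size=1000,
    max_articles=500,
)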
engine.py CHANGED
@@ -4,12 +4,17 @@
 # 3. Navigation between articles
 
 import random
+from db.wiki_db_sqlite import WikiDBSqlite
 from db.wiki_db_json import WikiDBJson
 
 class WikiRunEnvironment:
     def __init__(self, wiki_data_path):
         """Initialize with path to Wikipedia data"""
-        self.db = WikiDBJson(wiki_data_path)
+        if wiki_data_path.endswith('.json'):
+            self.db = WikiDBJson(wiki_data_path)
+        else:
+            self.db = WikiDBSqlite(wiki_data_path)
+
         self.current_article = None
         self.target_article = None
         self.path_taken = []
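After this change the environment selects its database backend from the data path's extension. A minimal sketch of both code paths; db/data/wikihop.db matches the README build step, while the .json path name is illustrative:

from engine import WikiRunEnvironment

env_sqlite = WikiRunEnvironment("db/data/wikihop.db")    # non-.json path -> WikiDBSqlite
env_json = WikiRunEnvironment("db/data/wikihop.json")    # .json path -> WikiDBJson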