import bz2
import json
import re
import os
from xml.sax import make_parser, handler

class WikiContentHandler(handler.ContentHandler):
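    """SAX handler that collects main-namespace (ns=0) articles
    and the wiki links each article contains."""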
    def __init__(self, max_articles=None):
        self.wiki_data = {}
        self.article_count = 0
        self.max_articles = max_articles
        
        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []
        
    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []
    
    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                
                # Add to wiki data
                self.wiki_data[self.current_title] = {
                    'title': self.current_title,
                    'text': self.current_text,
                    'links': links
                }
                
                self.article_count += 1
                
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                
                # Stop once the maximum number of articles is reached;
                # the StopIteration is caught by the caller (parse_wiki_dump)
                if self.max_articles and self.article_count >= self.max_articles:
                    raise StopIteration("Reached maximum number of articles")
                    
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)
    
    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)
    
    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
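        # e.g. "[[Paris|the capital]] and [[France]]" matches 'Paris' and 'France'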
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
        
        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            
            # Skip empty links
            if not link:
                continue
            
            processed_links.append(link)
        
        # Remove duplicates and return
        return list(set(processed_links))

def parse_wiki_dump(dump_path, output_path, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links.
    
    Args:
        dump_path: Path to the bz2 Wikipedia dump
        output_path: Path to save the extracted data
        max_articles: Maximum number of articles to extract (None for all)
    
    Returns:
        The path to the saved JSON file
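    
    Example (file names are illustrative):
        parse_wiki_dump('enwiki-latest-pages-articles.xml.bz2', 'output',
                        max_articles=1000)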
    """
    print(f"Parsing Wikipedia dump: {dump_path}")
    
    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(max_articles)
    parser.setContentHandler(content_handler)
    
    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
    
    print(f"Extracted {content_handler.article_count} articles with their links.")
    
    # Save data to JSON file
    output_file = os.path.join(output_path, 'wiki_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(content_handler.wiki_data, f)
    
    print(f"Data saved to {output_file}")
    return output_file

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the extracted data')
    parser.add_argument('--max-articles', type=int, default=None, 
                        help='Maximum number of articles to extract (default: all)')
    
    args = parser.parse_args()
    
    # Create output directory if it doesn't exist
    os.makedirs(args.output_path, exist_ok=True)
    
    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)
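
# Illustrative usage (script, dump, and directory names are placeholders):
#
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 output --max-articles 1000
#
# The resulting output/wiki_data.json maps each article title to a dict with
# 'title', 'text', and 'links' keys, so it can be inspected like this:
#
#   import json
#   with open('output/wiki_data.json', encoding='utf-8') as f:
#       wiki_data = json.load(f)
#   title = next(iter(wiki_data))
#   print(title, len(wiki_data[title]['links']))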