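"""Reddit scraper built on the public .json endpoints (no API credentials).

Pipeline, with each stage persisted to CSV:
  1. fetch_subreddits   -- page through /subreddits/popular.json
  2. fetch_top_posts    -- top all-time posts for each subreddit
  3. fetch_top_comments -- highest-scoring comments for each post
"""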
import time
import requests
import pandas as pd
from datetime import datetime
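
# Browser-style User-Agent, shared by every request below: Reddit's public
# JSON endpoints tend to return 429 for the default python-requests UA.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
}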

def extract_comment_data(comment, post_info):
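    """Flatten one raw comment node plus its parent post's fields into a flat row."""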
    return {
        'subreddit': post_info['subreddit'],
        'post_title': post_info['title'],
        'post_score': post_info['score'],
        'post_created_utc': post_info['created_utc'],
        'comment_id': comment['data'].get('id'),
        'comment_author': comment['data'].get('author'),
        'comment_body': comment['data'].get('body'),
        'comment_score': comment['data'].get('score', 0),
        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
        'post_url': post_info['url'],
        # permalink is already a full URL (see fetch_top_posts), so don't re-prefix it
        'comment_url': f"{post_info['permalink']}{comment['data'].get('id')}",
    }

def fetch_top_comments(post_df, num_comments=2):
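    """Return a DataFrame with the `num_comments` highest-scoring comments per post.

    `post_df` rows must carry: subreddit, title, score, num_comments,
    created_utc, url, and permalink (stored as a full URL).
    """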
    all_comments = []
    total_posts = len(post_df)
    headers = HEADERS
    
    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
    
    for idx, post in post_df.iterrows():
        print(f"\nProcessing post {idx + 1}/{total_posts}")
        print(f"Title: {post['title'][:100]}...")
        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
        
        try:
            # permalink is stored as a full URL; swap its trailing slash for .json
            url = post['permalink'].rstrip('/') + '.json'
            
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            data = response.json()
            
            # data[0] is the post listing itself; data[1] holds the comment tree
            if len(data) > 1:
                comments_data = data[1]['data']['children']
                
                # Filter out non-comment entries and extract scores
                valid_comments = [
                    comment for comment in comments_data 
                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
                ]
                
                # Sort comments by score (upvotes) in descending order
                sorted_comments = sorted(
                    valid_comments,
                    key=lambda x: x['data'].get('score', 0),
                    reverse=True
                )
                
                # Take only the top N comments
                top_comments = sorted_comments[:num_comments]
                
                # Print comment scores for verification
                print("\nTop comment scores for this post:")
                for i, comment in enumerate(top_comments, 1):
                    score = comment['data'].get('score', 0)
                    print(f"Comment {i}: {score} upvotes")
                
                # Add to main list
                for comment in top_comments:
                    all_comments.append(extract_comment_data(comment, post))
            
            # Long pause between posts to stay under Reddit's unauthenticated rate limits
            time.sleep(20)
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching comments for post {idx + 1}: {e}")
            continue
            
    # Create DataFrame and sort
    comments_df = pd.DataFrame(all_comments)
    
    if not comments_df.empty:
        # Verify sorting by showing top comments for each post
        print("\nVerification of comment sorting:")
        for post_title in comments_df['post_title'].unique():
            post_comments = comments_df[comments_df['post_title'] == post_title]
            print(f"\nPost: {post_title[:100]}...")
            print("Comment scores:", post_comments['comment_score'].tolist())
    
    return comments_df


def fetch_subreddits(limit=10, min_subscribers=1000):
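    """Collect popular subreddits with at least `min_subscribers` subscribers,
    following the `after` pagination cursor until `limit` rows are gathered
    (the final 100-item page may overshoot; the result is trimmed to `limit`).
    """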
    headers = HEADERS
    subreddits_data = []
    after = None
    
    while len(subreddits_data) < limit:
        try:
            url = 'https://www.reddit.com/subreddits/popular.json?limit=100'
            if after:
                url += f'&after={after}'
            
            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            data = response.json()
            
            for subreddit in data['data']['children']:
                subreddit_data = subreddit['data']
                    
                if subreddit_data.get('subscribers', 0) >= min_subscribers:
                    sub_info = {
                        'display_name': subreddit_data.get('display_name'),
                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
                        'title': subreddit_data.get('title'),
                        'subscribers': subreddit_data.get('subscribers', 0),
                        'active_users': subreddit_data.get('active_user_count', 0),
                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
                        'description': subreddit_data.get('description'),
                        'subreddit_type': subreddit_data.get('subreddit_type'),
                        'over18': subreddit_data.get('over18', False),
                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
                    }
                    subreddits_data.append(sub_info)
            
            after = data['data'].get('after')
            if not after:
                print("Reached end of listings")
                break
                
            time.sleep(2)
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break
            
    # Trim any overshoot from the final page so the result honors `limit`
    return pd.DataFrame(subreddits_data[:limit])

def fetch_top_posts(subreddit, limit=5):
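    """Fetch the top `limit` all-time posts from one subreddit as a DataFrame."""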
    posts_data = []
    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
    headers = HEADERS
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        for post in data['data']['children']:
            post_data = post['data']
            posts_data.append({
                'subreddit': subreddit,
                'title': post_data.get('title'),
                'score': post_data.get('score'),
                'num_comments': post_data.get('num_comments'),
                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
                'url': post_data.get('url'),
                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
            })
        
        time.sleep(2)
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching posts from r/{subreddit}: {e}")
    
    return pd.DataFrame(posts_data)

def main():
    # Step 1: Fetch Subreddits
    print("Fetching subreddits...")
    subreddits_df = fetch_subreddits(limit=10, min_subscribers=1000)
    print(f"Fetched {len(subreddits_df)} subreddits.")
    subreddits_df.to_csv("subreddits.csv", index=False)

    # Step 2: Fetch Top Posts for each subreddit
    all_posts_data = []
    for subreddit in subreddits_df['display_name']:
        print(f"\nFetching top posts for subreddit: {subreddit}...")
        posts_df = fetch_top_posts(subreddit, limit=5)
        all_posts_data.append(posts_df)

    # Combine all posts into a single DataFrame
    posts_df = pd.concat(all_posts_data, ignore_index=True)
    print(f"Fetched {len(posts_df)} top posts.")
    posts_df.to_csv("posts.csv", index=False)

    # Re-read from disk so Step 3 can also run standalone from the saved CSV
    posts_df = pd.read_csv("posts.csv")

    # Step 3: Fetch Top Comments for each post
    comments_df = pd.DataFrame()  # empty fallback so to_csv below never hits a bare list
    if not posts_df.empty:
        comments_df = fetch_top_comments(posts_df, num_comments=2)
        print(f"Fetched {len(comments_df)} top comments.")
    comments_df.to_csv("comments.csv", index=False)

if __name__ == "__main__":
    main()