# Scraping helpers that read Twitter profiles and timelines through a Nitter instance.
from flask import Markup
from numerize import numerize
from bs4 import BeautifulSoup
import urllib.request  # urllib.request must be imported explicitly; `import urllib` alone does not expose it
import datetime
import json
##########################
#### Config variables ####
##########################
with open('yotter-config.json') as f:
    config = json.load(f)
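
# Expected shape of yotter-config.json (an assumption inferred from the keys
# read in this module; only 'nitterInstance' is used here):
#   {"nitterInstance": "https://nitter.net/"}
# The value must end in a trailing slash, since usernames and scraped relative
# hrefs (with their leading '/' stripped) are appended to it directly.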

def get_user_info(username):
    # Scrape the user's Nitter profile card; return False when the instance
    # serves an error panel (unknown user, suspended account, etc.).
    response = urllib.request.urlopen(f'{config["nitterInstance"]}{username}').read()
    #rssFeed = feedparser.parse(response.content)
    html = BeautifulSoup(str(response), "lxml")
    if html.body.find('div', attrs={'class': 'error-panel'}):
        return False
    else:
        html = html.body.find('div', attrs={'class': 'profile-card'})

        if html.find('a', attrs={'class': 'profile-card-fullname'}):
            # Round-trip through latin-1 and unicode_escape to undo the
            # escaped-byte mojibake in the scraped text.
            fullName = html.find('a', attrs={'class': 'profile-card-fullname'}).getText().encode('latin1').decode('unicode_escape').encode('latin1').decode('utf8')
        else:
            fullName = None

        if html.find('div', attrs={'class': 'profile-bio'}):
            profileBio = html.find('div', attrs={'class': 'profile-bio'}).getText().encode('latin1').decode('unicode_escape').encode('latin1').decode('utf8')
        else:
            profileBio = None

        # The profile-stat-num spans appear in a fixed order:
        # tweets, following, followers, likes.
        user = {
            "profileFullName": fullName,
            "profileUsername": html.find('a', attrs={'class': 'profile-card-username'}).string.encode('latin_1').decode('unicode_escape').encode('latin_1').decode('utf8'),
            "profileBio": profileBio,
            "tweets": html.find_all('span', attrs={'class': 'profile-stat-num'})[0].string,
            "following": html.find_all('span', attrs={'class': 'profile-stat-num'})[1].string,
            "followers": numerize.numerize(int(html.find_all('span', attrs={'class': 'profile-stat-num'})[2].string.replace(",", ""))),
            "likes": html.find_all('span', attrs={'class': 'profile-stat-num'})[3].string,
            # The scraped href starts with '/', so strip it before joining
            # with the instance base URL.
            "profilePic": config['nitterInstance'] + html.find('a', attrs={'class': 'profile-card-avatar'})['href'][1:],
        }
        return user
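
# Usage sketch (illustrative; the account name and a reachable instance are
# assumptions):
#   info = get_user_info('TwitterDev')
#   if info:
#       print(info['profileUsername'], info['followers'])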

def get_tweets(user, page=1):
    # Fetch the first page of the user's timeline; for page == 2, follow the
    # "show more" cursor link once and return that page instead.
    feed = urllib.request.urlopen(f'{config["nitterInstance"]}{user}').read()
    # Gather feedPosts
    res = feed.decode('utf-8')
    html = BeautifulSoup(res, "html.parser")
    feedPosts = get_feed_tweets(html)

    if page == 2:
        nextPage = html.find('div', attrs={'class': 'show-more'}).find('a')['href']
        url = f'{config["nitterInstance"]}{user}{nextPage}'
        print(url)
        feed = urllib.request.urlopen(url).read()
        res = feed.decode('utf-8')
        html = BeautifulSoup(res, "html.parser")
        feedPosts = get_feed_tweets(html)
    return feedPosts
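
# Usage sketch (illustrative; the account name is an assumption):
#   posts = get_tweets('TwitterDev')            # first page of the timeline
#   older = get_tweets('TwitterDev', page=2)    # follows "show more" once
#   if isinstance(posts, list):                 # non-list returns signal empty or protected feeds
#       for t in posts:
#           print(t['timeStamp'], t['username'], t['url'])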

def yotterify(text):
    # Strip links to external services from tweet HTML. str.replace returns
    # a new string, so the result has to be reassigned.
    URLS = ['https://youtube.com']
    text = str(text)
    for url in URLS:
        text = text.replace(url, "")
    return text

def get_feed_tweets(html):
    # Parse a Nitter timeline page into a list of tweet dicts.
    feedPosts = []
    if 'No items found' in str(html.body):
        return 'Empty feed'
    if "This account's tweets are protected." in str(html.body):
        return 'Protected feed'

    userFeed = html.find_all('div', attrs={'class': 'timeline-item'})
    if userFeed != []:
        # The last timeline-item is the "Load more" control, so skip it.
        for post in userFeed[:-1]:
            if 'show-more' in str(post):
                continue

            date_time_str = post.find('span', attrs={'class': 'tweet-date'}).find('a')['title'].replace(",", "")

            # Skip pinned tweets so the feed stays chronological.
            if post.find('div', attrs={'class': 'pinned'}):
                if post.find('div', attrs={'class': 'pinned'}).find('span', attrs={'class': 'icon-pin'}):
                    continue

            tweet = {}
            tweet['op'] = post.find('a', attrs={'class': 'username'}).text
            tweet['twitterName'] = post.find('a', attrs={'class': 'fullname'}).text
            tweet['timeStamp'] = str(datetime.datetime.strptime(date_time_str, '%d/%m/%Y %H:%M:%S'))
            tweet['date'] = post.find('span', attrs={'class': 'tweet-date'}).find('a').text
            tweet['content'] = Markup(yotterify(post.find('div', attrs={'class': 'tweet-content'}).decode_contents().replace("\n", "<br>")))

            # Retweets carry a retweet-header naming the retweeting account.
            if post.find('div', attrs={'class': 'retweet-header'}):
                tweet['username'] = post.find('div', attrs={'class': 'retweet-header'}).find('div', attrs={'class': 'icon-container'}).text
                tweet['isRT'] = True
            else:
                tweet['username'] = tweet['op']
                tweet['isRT'] = False

            tweet['profilePic'] = config['nitterInstance'] + post.find('a', attrs={'class': 'tweet-avatar'}).find('img')['src'][1:]
            tweet['url'] = config['nitterInstance'] + post.find('a', attrs={'class': 'tweet-link'})['href'][1:]

            # Is quoting another tweet
            if post.find('div', attrs={'class': 'quote'}):
                tweet['isReply'] = True
                quote = post.find('div', attrs={'class': 'quote'})
                if 'unavailable' in str(quote):
                    tweet['unavailableReply'] = True
                else:
                    tweet['unavailableReply'] = False

                if not tweet['unavailableReply']:
                    if quote.find('div', attrs={'class': 'quote-text'}):
                        # Convert the quoted tweet's inner HTML the same way
                        # as tweet['content'] above.
                        tweet['replyingTweetContent'] = Markup(quote.find('div', attrs={'class': 'quote-text'}).decode_contents().replace("\n", "<br>"))

                    if quote.find('a', attrs={'class': 'still-image'}):
                        tweet['replyAttachedImages'] = []
                        images = quote.find_all('a', attrs={'class': 'still-image'})
                        for img in images:
                            img = BeautifulSoup(str(img), "lxml")
                            url = config['nitterInstance'] + img.find('a')['href'][1:]
                            tweet['replyAttachedImages'].append(url)

                    tweet['replyingUser'] = quote.find('a', attrs={'class': 'username'}).text
                # Remove the quote block so it is not rendered twice.
                post.find('div', attrs={'class': 'quote'}).decompose()
            else:
                tweet['isReply'] = False

            # Has attachments
            if post.find('div', attrs={'class': 'attachments'}):
                # Images
                if post.find('div', attrs={'class': 'attachments'}).find('a', attrs={'class': 'still-image'}):
                    tweet['attachedImages'] = []
                    images = post.find('div', attrs={'class': 'attachments'}).find_all('a', attrs={'class': 'still-image'})
                    for img in images:
                        img = BeautifulSoup(str(img), 'lxml')
                        url = config['nitterInstance'] + img.find('a')['href'][1:]
                        tweet['attachedImages'].append(url)
                else:
                    tweet['attachedImages'] = False
                # Videos
                if post.find('div', attrs={'class': 'attachments'}).find('div', attrs={'class': 'gallery-video'}):
                    tweet['attachedVideo'] = True
                else:
                    tweet['attachedVideo'] = False
            else:
                tweet['attachedVideo'] = False
                tweet['attachedImages'] = False

            if post.find('div', attrs={'class': 'tweet-stats'}):
                stats = post.find('div', attrs={'class': 'tweet-stats'}).find_all('span', attrs={'class': 'tweet-stat'})
                for stat in stats:
                    if 'comment' in str(stat):
                        tweet['comments'] = stat.find('div', attrs={'class': 'icon-container'}).text
                    elif 'retweet' in str(stat):
                        tweet['retweets'] = stat.find('div', attrs={'class': 'icon-container'}).text
                    elif 'heart' in str(stat):
                        tweet['likes'] = stat.find('div', attrs={'class': 'icon-container'}).text
                    else:
                        # The remaining stat icon is the quote count.
                        tweet['quotes'] = stat.find('div', attrs={'class': 'icon-container'}).text
            feedPosts.append(tweet)
    else:
        return {"emptyFeed": True}
    return feedPosts
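
# get_feed_tweets expects an already-parsed timeline page; a minimal sketch of
# calling it directly (the account name is an assumption for illustration):
#   raw = urllib.request.urlopen(f'{config["nitterInstance"]}TwitterDev').read()
#   posts = get_feed_tweets(BeautifulSoup(raw.decode('utf-8'), "html.parser"))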