create the scraper class, reorganizing
This commit is contained in:
@@ -1,72 +0,0 @@
|
||||
from time import sleep
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from dotenv import dotenv_values
|
||||
import pytz
|
||||
|
||||
from tweety import Twitter
|
||||
from tweety.types import *
|
||||
|
||||
# Load settings through python-dotenv (dotenv_values() called with no arguments).
creds = dotenv_values()

# Module-level tweety client, created under the session name "session" and
# signed in with the "username" / "password" entries loaded above.
# get_tweets_from_user() below reads this global.
app = Twitter("session")
app.sign_in(creds["username"], creds["password"])
|
||||
|
||||
def url(t: Tweet):
    """Return the twitter.com status link built from the tweet's author username and id."""
    return 'https://twitter.com/{}/status/{}'.format(t.author.username, t.id)
|
||||
|
||||
def print_tweets(tweets: list):
    """Print a count line, then one line per tweet; threads are printed recursively.

    ``Tweet`` entries print their date, link and retweet flag. ``TweetThread``
    entries are wrapped in marker lines and their ``tweets`` are printed by a
    recursive call. Entries of any other type are ignored.
    """
    print(f'{len(tweets)} tweets:')
    for entry in tweets:
        if isinstance(entry, Tweet):
            print(f'{entry.date} : {url(entry)} : RT? {entry.is_retweet}')
            continue
        if isinstance(entry, TweetThread):
            print('-----------TTd----------')
            print_tweets(entry.tweets)
            print('-----------end----------')
|
||||
|
||||
def get_tweets_from_user(uid: int | str, since: datetime = None) -> list:
    """Collect a user's tweets page by page until one dated at or before ``since`` is seen.

    Args:
        uid: numeric user id, or a username string that is resolved to an id
            through the module-level ``app``.
        since: cut-off date. Defaults to seven days before the current UTC
            time (timezone-aware). NOTE(review): tweet dates are compared
            directly with this value, so pass a timezone-aware datetime.

    Returns:
        The collected tweets sorted by ascending ``id``. Only retweets and
        tweets whose author id equals ``uid`` are kept.
    """
    reached_backdate = False
    tweets: list[Tweet] = []

    if since is None:
        since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
        print(f'Grabbing tweets since 7 days ago ({since.date()})')

    if isinstance(uid, str):
        name = uid
        uid = app._get_user_id(uid)
        print(f"{name} = {uid}")

    def add_tweet(tweet: Tweet):
        """Keep the tweet if it qualifies and flag when the cut-off date is reached."""
        nonlocal reached_backdate
        try:
            if tweet.is_retweet or tweet.author.id == uid:
                tweets.append(tweet)
            if not reached_backdate and tweet.date <= since:
                print("reached backdate")
                reached_backdate = True
        except AttributeError:
            # Entries missing the expected attributes are reported and skipped.
            print(f"skipping malformed tweet: {tweet}")
            return

    uts = app.get_tweets(uid, replies=True)
    while not reached_backdate:
        cur_page = uts.tweets
        print(f'obtained {len(cur_page)} tweets')

        # An empty page means there is nothing further to fetch.
        if len(cur_page) == 0:
            break

        for e in cur_page:
            if isinstance(e, Tweet):
                add_tweet(e)
            elif isinstance(e, TweetThread):
                for t in e.tweets:
                    add_tweet(t)

        uts.get_next_page()

    tweets.sort(key=lambda t: t.id)
    return tweets
|
||||
|
||||
# The cut-off must be timezone-aware: get_tweets_from_user() compares it with
# tweet dates, and its own default cut-off is built as an aware UTC datetime.
tweets = get_tweets_from_user("ninakosaka", since=datetime(2023, 7, 1, tzinfo=pytz.utc))
print_tweets(tweets)
|
||||
@@ -0,0 +1,99 @@
|
||||
from os.path import exists
|
||||
from time import sleep
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from dotenv import dotenv_values
|
||||
import pytz
|
||||
|
||||
from tweety import Twitter
|
||||
from tweety.types import *
|
||||
|
||||
from tweety_utils import *
|
||||
from talenttweet import *
|
||||
from talent_lists import is_niji, is_holo
|
||||
|
||||
class Scraper:
|
||||
def __init__(self):
|
||||
creds = dotenv_values()
|
||||
self.app = Twitter("session")
|
||||
if exists("session.json"):
|
||||
self.app.connect()
|
||||
else:
|
||||
self.app.sign_in(creds["scraper_username"], creds["scraper_password"])
|
||||
|
||||
# since MUST BE TIMEZONE AWARE
|
||||
# usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
|
||||
def get_tweets_from_user(self, uid: int | str, since: datetime = None) -> list:
|
||||
reached_backdate = False
|
||||
tweets: list[Tweet] = []
|
||||
|
||||
if since == None:
|
||||
since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
|
||||
print(f'Grabbing tweets since 7 days ago ({since.date()})')
|
||||
|
||||
if isinstance(uid, str):
|
||||
name = uid
|
||||
uid = self.app._get_user_id(uid)
|
||||
print(f"{name} = {uid}")
|
||||
|
||||
def add_tweet(tweet: Tweet):
|
||||
nonlocal reached_backdate
|
||||
try:
|
||||
tweet.author
|
||||
tweets.append(tweet)
|
||||
if not reached_backdate and tweet.date <= since:
|
||||
print("reached backdate")
|
||||
reached_backdate = True
|
||||
except AttributeError:
|
||||
print("skipping malformed tweet: {tweet}")
|
||||
return
|
||||
|
||||
uts = self.app.get_tweets(uid, replies=True)
|
||||
while not reached_backdate:
|
||||
cur_page = uts.tweets
|
||||
print(f'obtained {len(cur_page)} tweets')
|
||||
|
||||
if len(cur_page) == 0: break
|
||||
|
||||
for e in cur_page:
|
||||
if isinstance(e, Tweet):
|
||||
add_tweet(e)
|
||||
elif isinstance(e, TweetThread):
|
||||
# FIXME: rework when replied_to is fixed (currently only user_mentions works)
|
||||
t = e[-1] # latest tweet in thread = og author's reply
|
||||
t.replied_to = e[-2]
|
||||
add_tweet(t)
|
||||
print(f"adding thread latest: {t.id}")
|
||||
|
||||
uts = self.app.get_tweets(uid, replies=True, cursor=uts.cursor)
|
||||
|
||||
tweets.sort(key=lambda t: t.id)
|
||||
return tweets
|
||||
|
||||
def get_cross_ttweets_from_user(self, uid: int | str, since: datetime = None):
|
||||
tweets = self.get_tweets_from_user(uid, since)
|
||||
ret: [TalentTweet] = []
|
||||
for t in tweets:
|
||||
is_niji = is_niji(int(t.author.id))
|
||||
is_cross = False
|
||||
|
||||
# cross-rt?
|
||||
|
||||
# rt mentions cross-company?
|
||||
|
||||
# cross-qrt?
|
||||
|
||||
# cross-reply?
|
||||
if t.replied_to is not None:
|
||||
if is_niji == is_holo(int(t.replied_to.author.id)):
|
||||
is_cross = True
|
||||
|
||||
# cross-mention? in-thread?
|
||||
for u in t.user_mentions:
|
||||
if is_niji == is_holo(int(u.id)):
|
||||
is_cross = True
|
||||
|
||||
if __name__ == '__main__':
    # Manual run: fetch one account's tweets with the default cut-off and print them.
    app = Scraper()
    print_tweets(app.get_tweets_from_user("pomurainpuff"))
|
||||
@@ -47,6 +47,9 @@ def is_niji(id: int) -> bool:
|
||||
def is_holo(id: int) -> bool:
    """Return True when ``id`` is contained in ``holo_en`` or in ``holo_id``."""
    if id in holo_en:
        return True
    return id in holo_id
|
||||
|
||||
def is_cross_company(id1: int, id2: int):
    """Return True when one id is a niji account and the other a holo account.

    Both directions are checked explicitly. Comparing ``is_niji(id1)`` with
    ``is_holo(id2)`` for equality would also return True when neither id is a
    listed talent (``False == False``).
    """
    return (is_niji(id1) and is_holo(id2)) or (is_holo(id1) and is_niji(id2))
|
||||
|
||||
# For filtered stream
|
||||
# DEPRECATED: thx elon
|
||||
def get_twitter_rules():
|
||||
|
||||
+25
-36
@@ -1,14 +1,33 @@
|
||||
import datetime
|
||||
from datetime import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
import platform
|
||||
|
||||
import pytz
|
||||
|
||||
import twapi
|
||||
import talent_lists
|
||||
import util
|
||||
|
||||
class TalentTweet:
|
||||
# Serialized one-liner format:
# {tweet} {author} {time in seconds since epoch} m {mention set} r {reply to author} q {quote tweet author}
# or, for a retweet:
# {tweet} {author} {time in seconds since epoch} rt {retweeted tweet's id} {retweeted tweet's author}
def serialize(self):
    """Return the space-separated one-line form of this tweet.

    Fields are collected in a list and joined, so there is no trailing
    separator to strip. Stripping the last character unconditionally would
    cut the final digit off ``rt_author_id`` in the retweet form.
    """
    fields = [f'{self.tweet_id}', f'{self.author_id}', f'{self.date_time.timestamp()}']

    if self.rt_target is not None and self.rt_author_id is not None:
        # stop here since retweets can't have other info
        fields += ['rt', f'{self.rt_target}', f'{self.rt_author_id}']
        return ' '.join(fields)

    if len(self.mentions) > 0:
        fields.append('m')
        fields.extend(f'{mention_id}' for mention_id in self.mentions)
    if self.reply_to:
        fields += ['r', f'{self.reply_to}']
    if self.quote_retweeted:
        fields += ['q', f'{self.quote_retweeted}']

    return ' '.join(fields)
|
||||
|
||||
@staticmethod
|
||||
def deserialize(serialized_str: str):
|
||||
tokens = serialized_str.split()
|
||||
@@ -16,7 +35,7 @@ class TalentTweet:
|
||||
raise ValueError('not enough tokens to reconstruct a TalentTweet')
|
||||
|
||||
tweet_id, author_id = int(tokens[0]), int(tokens[1])
|
||||
date_time = datetime.datetime.fromtimestamp(float(tokens[2]), tz=pytz.utc)
|
||||
date_time = datetime.fromtimestamp(float(tokens[2]), tz=pytz.utc)
|
||||
|
||||
mentions = set()
|
||||
reply_to = None
|
||||
@@ -43,27 +62,7 @@ class TalentTweet:
|
||||
date_time=date_time, mrq=(mentions, reply_to, quote_retweeted)
|
||||
)
|
||||
|
||||
# Serialized one-liner format:
# {tweet} {author} {time in seconds since epoch} m {mention_set} r {reply_to_author} q {quote_retweet_author}
# or, for a retweet:
# {tweet} {author} {time in seconds since epoch} rt {retweet_target} {retweet_author}
def serialize(self):
    """Return the space-separated one-line form of this tweet.

    The pieces are joined with single spaces instead of appending a trailing
    space and slicing it off. The retweet form has no trailing space, so
    slicing would drop the last digit of ``rt_author_id``.
    """
    pieces = [f'{self.tweet_id}', f'{self.author_id}', f'{self.date_time.timestamp()}']

    if self.rt_target is not None and self.rt_author_id is not None:
        # stop here since retweets can't have other info
        pieces.extend(['rt', f'{self.rt_target}', f'{self.rt_author_id}'])
        return ' '.join(pieces)

    if len(self.mentions) > 0:
        pieces.append('m')
        pieces.extend(f'{mention_id}' for mention_id in self.mentions)
    if self.reply_to:
        pieces.extend(['r', f'{self.reply_to}'])
    if self.quote_retweeted:
        pieces.extend(['q', f'{self.quote_retweeted}'])

    return ' '.join(pieces)
|
||||
|
||||
def __init__(self, tweet_id: int, author_id: int, date_time: datetime.datetime, mrq: tuple, rt_target: int=None, rt_author_id: int=None):
|
||||
def __init__(self, tweet_id: int, author_id: int, date_time: datetime, mrq: tuple, rt_target: int=None, rt_author_id: int=None):
|
||||
self.tweet_id, self.author_id = tweet_id, author_id
|
||||
self.date_time = date_time
|
||||
self.mentions = tuple(int(x) for x in mrq[0])
|
||||
@@ -97,18 +96,8 @@ class TalentTweet:
|
||||
|
||||
def is_cross_company(self):
    """Return True when the author and any other party sit on opposite talent lists.

    A tweet counts as cross-company when the author is in a holo list
    (``holo_en`` / ``holo_id``) and another party is in a niji list
    (``niji_en`` / ``niji_exid``), or the reverse. The membership tests are
    written out here, so the result does not depend on a helper that compares
    the two membership flags for equality; that comparison is also true when
    neither id is a listed talent.
    """
    holo_groups = (talent_lists.holo_en, talent_lists.holo_id)
    niji_groups = (talent_lists.niji_en, talent_lists.niji_exid)

    def in_any(talent_id, groups):
        return any(talent_id in group for group in groups)

    author_is_holo = in_any(self.author_id, holo_groups)
    author_is_niji = in_any(self.author_id, niji_groups)

    for other_id in self.all_parties:
        if author_is_holo and in_any(other_id, niji_groups):
            return True
        if author_is_niji and in_any(other_id, holo_groups):
            return True
    return False
|
||||
|
||||
def get_all_parties_usernames(self):
|
||||
|
||||
+1
-44
@@ -124,35 +124,7 @@ class TwAPI:
|
||||
# else:
|
||||
# print('Saul Gone')
|
||||
|
||||
def get_all_tweet_ids_from_user(self, user_id):
    """Fetch every page of a user's tweets and return them as one list.

    Pages of up to 100 tweets are requested through
    ``self.client.get_users_tweets`` until the response metadata carries no
    ``next_token``. Despite the method name, the list holds the tweet objects
    found in ``resp.data``, not bare ids.
    """
    next_page_token = None
    tokens_retrieved = 0
    tweets_retrieved = 0
    tweets = list()
    while True:
        print(f'Retrieved {tokens_retrieved} tokens so far...')
        resp = self.client.get_users_tweets(
            user_id, max_results=100, pagination_token=next_page_token,
            media_fields=TwAPI.TWEET_MEDIA_FIELDS,
            tweet_fields=TwAPI.TWEET_FIELDS,
            expansions=TwAPI.TWEET_EXPANSIONS
        )

        # NOTE(review): assumes a tweepy-style response whose ``data`` is None
        # when the page holds no tweets; guard so an empty page does not raise.
        tweets.extend(resp.data or [])

        # update counters and pagination token
        tweets_retrieved += resp.meta['result_count']
        try:
            next_page_token = resp.meta['next_token']
            tokens_retrieved += 1
        except KeyError:
            print("next_token wasn't provided; we've reached the end!")
            break  # reached end of user's tweets

    print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
    return tweets
|
||||
|
||||
# DEPRECATED: thx elon
|
||||
async def get_tweet_response(self, id, attempt = 0):
|
||||
try:
|
||||
twt = TwAPI.instance.client.get_tweet(
|
||||
@@ -275,18 +247,3 @@ class TwAPI:
|
||||
else:
|
||||
raise e
|
||||
return True
|
||||
|
||||
def post_ttweet_by_id(self, tweet_id, is_catchup=False, dry_run=False):
    """Build a TalentTweet from ``tweet_id`` and post it if it is cross-company.

    The tweet's mentions, reply target and quote target are printed first. A
    cross-company tweet is posted with the given ``is_catchup`` / ``dry_run``
    flags and then recorded as finished in the TalentTweetQueue instance;
    anything else is only reported.
    """
    ttweet = asyncio.run(tt.TalentTweet.create_from_id(tweet_id))
    print(f'm({ttweet.mentions}), r({ttweet.reply_to}), q({ttweet.quote_retweeted})')

    if not ttweet.is_cross_company():
        print(f'Tweet {tweet_id} is not cross-company.')
        return

    print(f'Tweet {ttweet.tweet_id} is cross-company! Creating post...')
    posting = self.post_ttweet(ttweet, is_catchup=is_catchup, dry_run=dry_run)
    asyncio.run(posting)
    ttq.TalentTweetQueue.instance.add_finished_tweet(ttweet.tweet_id)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
from tweety.types import *
|
||||
|
||||
def url(t: Tweet):
    """Return the twitter.com status link for ``t`` (author username plus tweet id)."""
    username = t.author.username
    return f'https://twitter.com/{username}/status/{t.id}'
|
||||
|
||||
def print_tweets(tweets: list[Tweet | TweetThread]):
    """Print a count line, then a one-line summary per tweet; threads are printed recursively.

    Each ``Tweet`` line shows date, link and retweet flag, then the username
    it replies to (when ``replied_to`` is set) and the comma-separated
    usernames it mentions. ``TweetThread`` entries are wrapped in marker
    lines with their ``tweets`` printed by a recursive call.
    """
    print(f'{len(tweets)} tweets:')
    for item in tweets:
        if isinstance(item, Tweet):
            print(f'{item.date} : {url(item)} : RT? {item.is_retweet} ', end=' ')

            parent = item.replied_to
            if parent is not None:
                print(f'reply to {parent.author.username}', end=' ')

            mentioned = ",".join(user.username for user in item.user_mentions)
            print("m=" + mentioned)
        elif isinstance(item, TweetThread):
            print('-----------TTd----------')
            print_tweets(item.tweets)
            print('-----------end----------')
|
||||
+1
-1
@@ -8,7 +8,7 @@ from datetime import datetime
|
||||
import tweepy
|
||||
import pytz
|
||||
import twint
|
||||
import twapi
|
||||
#import twapi
|
||||
from tweetcapture import TweetCapture
|
||||
|
||||
from recrop import fix_aspect_ratio
|
||||
|
||||
Reference in New Issue
Block a user