Files
NijiHolo_EN_ID_Bot/src/catchup.py
T

197 lines
7.4 KiB
Python
Raw Normal View History

2022-09-24 17:56:58 -07:00
## The bot's catch-up mode
# Scan all accounts for cross-company interactions.
# Terminates when finished scanning and posting.
#
# We should post, at the fastest, one tweet per minute.
import traceback
2022-09-26 14:44:46 -07:00
import datetime
2022-09-24 17:56:58 -07:00
import twint
from util import *
from talent_lists import *
2022-09-25 18:31:50 -07:00
from twapi import TwAPI
2022-09-24 17:56:58 -07:00
import talenttweet as tt
2022-09-27 02:49:03 -07:00
# Program arguments shared across this module; assigned by run() and read by
# process_queue() (which checks its `announce_catchup` attribute).
PROGRAM_ARGS = None
2022-09-28 02:20:06 -07:00
# Module-wide flag: get_cross_talent_tweets() sets it to False when scraping
# hits an error, and run() refuses to post from the queue while it is False.
safe_to_post_tweets = True
2022-09-26 14:44:46 -07:00
2022-09-27 02:49:03 -07:00
def write_user_timestamp(user_id, file, timestamp=None, error=False):
    """Write one per-account status line to the queue file.

    The line has the form ``# <user_id> <timestamp>``. When ``error`` is
    true, ``-1`` is written in place of the timestamp.

    Args:
        user_id: Account ID to record.
        file: Writable text file object the line is written to.
        timestamp: UNIX timestamp to record; defaults to the current time.
        error: If true, record ``-1`` instead of the timestamp.
    """
    stamp = datetime.datetime.now().timestamp() if timestamp is None else timestamp
    recorded = '-1' if error else stamp
    file.write(f'# {user_id} {recorded}\n')
2022-09-27 02:49:03 -07:00
def get_queue_path():
    """Return the path of queue.txt inside the project directory."""
    project_dir = util.get_project_dir()
    return f'{project_dir}/queue.txt'
2022-09-24 17:56:58 -07:00
def get_local_queue():
    """Placeholder for reading the local queue; not implemented yet.

    Returns:
        None. The function currently performs no work.
    """
    # TODO: open the queue file, e.g.
    #   open(os.path.join(get_project_dir(), 'queue.txt'))
    return None
def get_user_tweets(id, since_timestamp=None, limit=None):
    """Scrape a user's tweets with twint and return the collected tweet objects.

    Args:
        id: Twitter user ID to scrape. (The name shadows the builtin but is
            kept so existing callers keep working.)
        since_timestamp: Only scrape from this UNIX timestamp onward; when
            ``None`` no lower bound is given to twint.
        limit: Passed through as twint's ``Config.Limit``.

    Returns:
        The list that twint filled with tweet objects. If the scrape raises,
        the error is logged and whatever was collected so far is returned.
    """
    tweets = []
    c = twint.Config()
    c.User_id = id
    c.Limit = limit
    c.Store_object = True
    # twint appends every scraped tweet object to this list.
    c.Store_object_tweets_list = tweets
    c.Hide_output = True
    c.Since = '' if since_timestamp is None else util.timestamp_to_tdate(since_timestamp)

    user_str = f'@{util.get_username_local(id)}'
    print(f'Scraping tweets from {user_str} since {"forever ago" if c.Since == "" else c.Since}...')
    try:
        twint.run.Search(c)
    except Exception:
        # Best effort: log and fall through with the tweets gathered so far.
        # KeyboardInterrupt/SystemExit are intentionally not swallowed here.
        # NOTE(review): the caller cannot tell a failed scrape from a clean
        # one from the return value alone -- consider re-raising.
        print(f'Had trouble getting tweets from {user_str}')
        traceback.print_exc()

    # A tweet counts as a quote tweet when it carries a non-empty quote URL.
    qrt_count = sum(
        1 for twt in tweets
        if isinstance(twt.quote_url, str) and twt.quote_url != ''
    )
    print(f'Scraped {len(tweets)} tweets, {qrt_count} of which are quote tweets.')
    return tweets
2022-09-27 02:49:03 -07:00
def get_finished_user_timestamps(queue_file):
    """Return the recorded scrape time of every account that caught up cleanly.

    Header lines have the form ``# <user_id> <UNIX_timestamp>``; a timestamp
    of ``-1`` marks an account whose scrape failed. Blank lines are skipped
    and reading stops at the first line whose first token does not start
    with ``#``.

    Args:
        queue_file: Iterable of text lines, e.g. an open queue.txt.

    Returns:
        dict mapping int user ID to float timestamp. Accounts recorded with
        ``-1`` and malformed header lines are left out.
    """
    results = {}
    for line in queue_file:
        tokens = line.split()
        if not tokens:
            continue
        if not tokens[0].startswith('#'):
            print(f'{line} is our stopper!')
            # reached end of accounts list
            break
        if len(tokens) < 3:
            # Incomplete header line (e.g. a cut-off write): there is no
            # timestamp to read, so ignore it instead of raising IndexError.
            continue
        if tokens[2] != '-1':
            results[int(tokens[1])] = float(tokens[2])
    return results
def get_user_timestamps_str(queue_file):
    """Return the leading per-account header lines of the queue as one string.

    Lines are consumed until the first one that is not a three-token line
    whose first token starts with ``#`` (a blank line also ends the header).

    Args:
        queue_file: Iterable of text lines, e.g. an open queue.txt.

    Returns:
        The header lines joined by single newlines, with no trailing
        newline; an empty string when there are no header lines.
    """
    header_lines = []
    for line in queue_file:
        tokens = line.split()
        if len(tokens) != 3 or tokens[0][0] != '#':
            # reached end of accounts list
            break
        # Lines read from a file already end in a newline; strip it so the
        # join below does not produce blank lines between entries.
        header_lines.append(line.rstrip('\r\n'))
    return '\n'.join(header_lines)
async def get_cross_talent_tweets(queue_path):
    """Merge the tweets queued on disk with freshly scraped ones and rewrite the queue file.

    Steps:
      1. If the queue file exists, read the per-account timestamps and the
         already-queued TalentTweets from it.
      2. Reopen the file for writing and scrape every account in
         ``talent_lists.talents`` (from its recorded timestamp when there is
         one), keeping new tweets for which ``is_cross_company()`` is true.
         One status line per account is written as the scan proceeds.
      3. Write a blank separator line followed by every queued TalentTweet,
         sorted by tweet ID. This happens even when the scan aborts, so
         tweets that are already queued are not lost.

    Any error clears the module-level ``safe_to_post_tweets`` flag.

    Args:
        queue_path: Path of the queue file to read and rewrite.

    Returns:
        dict mapping tweet ID to TalentTweet (sorted by ID when the dump
        step was reached).
    """
    global safe_to_post_tweets
    finished_user_timestamps = {}
    ttweets_dict = {}
    # TODO: track already-posted tweet IDs and don't add a TTweet to
    # ttweets_dict if its id has been posted.

    # Populate structures with existing data from queue.txt
    try:
        with open(queue_path, 'r') as f:
            finished_user_timestamps = get_finished_user_timestamps(f)
        # Reopen so the second pass starts from the top of the file.
        with open(queue_path, 'r') as f:
            # Get existing queued TalentTweets
            for line in f:
                tokens = line.split()
                if not tokens or tokens[0][0] == '#':
                    continue
                ttweet = tt.TalentTweet.deserialize(line)
                ttweets_dict[ttweet.tweet_id] = ttweet
        print(f'Found {len(finished_user_timestamps)} scraped accounts and {len(ttweets_dict)} tweets.')
    except FileNotFoundError:
        print('queue.txt not found.')

    # Begin getting tweets from online
    with open(queue_path, 'w') as f:
        print('Pulling tweets from online!')
        try:
            try:
                for i, (talent_id, _username) in enumerate(talent_lists.talents.items()):
                    print(f'[{i+1}/{len(talent_lists.talents)}]{util.get_username(talent_id)}----------------------------')
                    try:
                        tweets = get_user_tweets(talent_id, since_timestamp=finished_user_timestamps.get(talent_id, None))
                        for tweet in tweets:
                            if tweet.id not in ttweets_dict:
                                ttweet = await tt.TalentTweet.create_from_twint_tweet(tweet)
                                if ttweet.is_cross_company():
                                    ttweets_dict[ttweet.tweet_id] = ttweet
                    except Exception:
                        # One account failing must not stop the scan: record
                        # it as errored (-1) and move on. Interrupts are not
                        # caught here so they abort the whole scan instead.
                        print('Error occurred processing tweet data.')
                        safe_to_post_tweets = False
                        print(traceback.format_exc())
                        write_user_timestamp(user_id=talent_id, file=f, error=True)
                    else:
                        write_user_timestamp(user_id=talent_id, file=f)
            finally:
                # The file was truncated when it was opened for writing, so
                # the queue must be written back even if the scan aborted.
                f.write('\n')
                ttweets_dict = dict(sorted(ttweets_dict.items()))
                for ttweet in ttweets_dict.values():
                    f.write(f'{ttweet.serialize()}\n')
        except BaseException:
            # As broad as a bare `except:`; the failure is reported through
            # safe_to_post_tweets rather than by raising.
            print('Unhandled error occurred while pulling tweets.')
            traceback.print_exc()
            print('Saving queue.txt and exiting.')
            safe_to_post_tweets = False
    return ttweets_dict
2022-09-28 02:20:06 -07:00
async def process_queue(ttweets_dict: dict) -> int:
    """Post the queued TalentTweets, then rewrite queue.txt with whatever is left.

    Entries are taken from the front of ``ttweets_dict`` and removed from it
    once ``TwAPI.instance.post_ttweet`` has returned, whether or not it
    reported success. If posting raises, the loop stops and the remaining
    entries are written back to the queue file.

    Args:
        ttweets_dict: Queue of TalentTweets keyed by tweet ID; mutated in
            place as entries are processed.

    Returns:
        Number of TalentTweets for which ``post_ttweet`` returned a truthy
        value. An empty queue returns 0 without touching the queue file.
    """
    ttweets_posted = 0

    if not ttweets_dict:
        return ttweets_posted

    if PROGRAM_ARGS.announce_catchup:
        # Awaited like the other TwAPI calls in this function; an un-awaited
        # coroutine would never actually run.
        await TwAPI.instance.post_tweet(text=f'Starting to catch up through {len(ttweets_dict)} logged tweets.')

    try:
        while ttweets_dict:
            # First key in insertion order, without copying all the keys.
            key = next(iter(ttweets_dict))
            ttweet = ttweets_dict[key]
            if await TwAPI.instance.post_ttweet(ttweet, is_catchup=True):
                ttweets_posted += 1
            ttweets_dict.pop(key)
            # TODO: add ttweet.tweet_id to some success list
    except BaseException:
        # As broad as a bare `except:` so the remaining queue is still
        # written back below, whatever stopped the posting loop.
        print('Unhandled error occurred while posting tweets from queue.')
        traceback.print_exc()
    else:
        if PROGRAM_ARGS.announce_catchup:
            await TwAPI.instance.post_tweet('Finished with catch-up tweets!')

    print('Updating what\'s left in ttweet_dict to queue.txt.')
    # Keep the per-account header lines and replace the tweet section.
    with open(get_queue_path(), 'r') as f:
        user_timestamps_str = get_user_timestamps_str(f)
    with open(get_queue_path(), 'w') as f:
        f.write(user_timestamps_str + '\n\n')
        for ttweet in ttweets_dict.values():
            f.write(f'{ttweet.serialize()}\n')
    return ttweets_posted
2022-09-28 02:20:06 -07:00
async def run(program_args):
    """Run catch-up mode: scrape, post the queue, and repeat until nothing new is posted.

    Args:
        program_args: Stored in the module-level ``PROGRAM_ARGS`` for the
            other functions in this module to read.

    Returns:
        True when a pass posted no new tweets (caught up); False as soon as
        a scrape pass leaves ``safe_to_post_tweets`` cleared.
    """
    global PROGRAM_ARGS
    global safe_to_post_tweets
    PROGRAM_ARGS = program_args
    # NOTE(review): safe_to_post_tweets is never set back to True in this
    # module, so a failure stays in effect for later run() calls in the same
    # process -- confirm whether that is intended.
    path = get_queue_path()
    while True:
        queued = await get_cross_talent_tweets(path)
        print(f'found {len(queued)} cross-company tweets')
        if not safe_to_post_tweets:
            print('Tweets were not retrieved cleanly.')
            return False
        posted = await process_queue(queued)
        if posted == 0:
            print('Posted no new tweets; we\'re caught up!')
            return True