From c06e712e0627b7d237cf118b58c49823b9bf5e3d Mon Sep 17 00:00:00 2001 From: muskit <15199219+muskit@users.noreply.github.com> Date: Mon, 26 Sep 2022 14:44:46 -0700 Subject: [PATCH] fix code, implement twint stuff --- src/catchup.py | 57 ++++++++++++++++++++++++++++++++++++--------- src/talent_lists.py | 7 ++++++ src/talenttweet.py | 32 ++++++++++++++----------- src/util.py | 7 ++++++ 4 files changed, 79 insertions(+), 24 deletions(-) diff --git a/src/catchup.py b/src/catchup.py index fd8ec3b..2d26bad 100644 --- a/src/catchup.py +++ b/src/catchup.py @@ -5,18 +5,25 @@ # We should post, at the fastest, one tweet per minute. import traceback +import datetime import sys import os import asyncio import twint -import tweepy from util import * from talent_lists import * from twapi import TwAPI import talenttweet as tt +def write_user_date(user_id, file, date_str = None, error = False): + if date_str is None: + date_str = util.datetime_to_tdate(datetime.datetime.now()) + + file.write(f'# {user_id} {date_str if not error else "-1"}\n') + pass + def get_queue_file(): return f'{util.get_project_dir()}/queue.txt' @@ -25,7 +32,7 @@ def get_local_queue(): pass ## Returns the ID of all tweets (up to limit) from a user ID. -def get_user_tweets(id, limit=None): +def get_user_tweets(id, since_date='', limit=None): tweets = list() c = twint.Config() c.User_id = id @@ -33,6 +40,7 @@ def get_user_tweets(id, limit=None): c.Store_object = True c.Store_object_tweets_list = tweets c.Hide_output = True + c.Since = since_date user_str = f'{util.get_username(id)}' print(f'Scraping tweets from {user_str}...') @@ -47,23 +55,24 @@ def get_user_tweets(id, limit=None): # If queue.txt doesn't exist, creates and populates it. 
# Returns a list of sorted and filtered TalentTweets (should # be equivalent to queue.txt) -async def get_cross_talent_tweets(queue_file): - finished_users = set() +async def get_cross_talent_tweets(queue_path): + finished_user_tdates = dict() ttweets_dict = dict() # Populate structures with existing data from queue.txt try: print('Processing existing data in queue.txt...') - with open(queue_file, 'r') as f: + with open(queue_path, 'r') as f: # Check for finished and incomplete accounts - # LINE FORMAT: "# {user_id} {status_num}" + # LINE FORMAT: "# {user_id} {status_num}" (TODO: use date of retrieval YYYY-MM-DD) for line in f: tokens = line.split() if len(tokens) != 3 or tokens[0][0] != '#': # reached end of accounts list break - if tokens[2] == 0: - finished_users.add(tokens[1]) + + if tokens[2] != '-1': + finished_user_tdates[int(tokens[1])] = tokens[2] # Add existing serialized TalentTweets into ttweets for line in f: @@ -75,7 +84,34 @@ async def get_cross_talent_tweets(queue_file): except FileNotFoundError: print('Couldn\'t find queue.txt.') - # TODO: implement ordered cross-company ttweets dict creation using twint + # Pull tweets from twint + with open(queue_path, 'w') as f: + # for talent_id in talent_lists.talents: + for talent_id in talent_lists.test_talents: + print('using test_talents') + if talent_id not in finished_user_tdates or \ + finished_user_tdates[talent_id] != util.datetime_to_tdate(datetime.datetime.today()): + try: + tweets = get_user_tweets(talent_id, since_date=finished_user_tdates.get(talent_id, None)) + for tweet in tweets: + ttweet = await tt.TalentTweet.create_from_twint_tweet(tweet) + if ttweet.is_cross_company(): + ttweets_dict[ttweet.tweet_id] = ttweet + except: + print('Error occurred processing tweet data. 
Traceback:') + print(traceback.format_exc()) + write_user_date(user_id=talent_id, file=f, error=True) + else: + write_user_date(user_id=talent_id, file=f) + else: + print(f'Skipping already completed {util.get_username(talent_id)}') + write_user_date(user_id=talent_id, file=f, date_str=finished_user_tdates[talent_id]) + f.write('\n') + ttweets_dict = dict(sorted(ttweets_dict.items())) + for ttweet in ttweets_dict.values(): + f.write(f'{ttweet.serialize()}\n') + + return ttweets_dict def process_queue(file): print('TODO: implement process_queue') @@ -101,5 +137,4 @@ async def run(): queue_path = get_queue_file() ttweet_dict = await get_cross_talent_tweets(queue_path) - for ttweet in ttweet_dict.values(): - print(ttweet) \ No newline at end of file + print(f'got {len(ttweet_dict)} tweets') \ No newline at end of file diff --git a/src/talent_lists.py b/src/talent_lists.py index 875bad8..6a6ef75 100644 --- a/src/talent_lists.py +++ b/src/talent_lists.py @@ -6,6 +6,8 @@ niji_en = dict() niji_exid = dict() talents = dict() +test_talents = dict() + def __create_dict(file, _dict): print(f'Initializing talents\' account list from {file}...') global talents @@ -23,6 +25,7 @@ def init(): global holo_id global niji_en global niji_exid + global test_talents # holoEN __create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en) @@ -33,3 +36,7 @@ def init(): # nijiexID __create_dict(f'{util.get_project_dir()}/lists/nijiexid.txt', niji_exid) + test_talents = { + 1390637197167038464: 'PomuRainpuff' + } + diff --git a/src/talenttweet.py b/src/talenttweet.py index 0c499b8..22451e6 100644 --- a/src/talenttweet.py +++ b/src/talenttweet.py @@ -43,24 +43,30 @@ class TalentTweet: ) @staticmethod - def create_from_twint_tweet(tweet): + async def create_from_twint_tweet(tweet): # qrt - if tweet.quote_url != '': - return TalentTweet(tweet_id=tweet.id) + # -- COMMENTED OUT FOR TESTING PURPOSES -- + # TODO: uncomment + # if tweet.quote_url != '': + # api_ttweet = await 
TalentTweet.create_from_id(tweet.id) + # return api_ttweet # MRQ (Q is guaranteed to be None) mentions = set() reply_to = None # reply_to/mentions - is_reply = tweet.id == int(tweet.conversation_id) - if is_reply: - reply_to = tweet.reply_to[0] - mentions = set(tweet.reply_to[1:]) - mentions.add(*tweet.mentions) + is_reply = tweet.id != int(tweet.conversation_id) + mentions = set([x['id'] for x in tweet.mentions]) + if is_reply and len(tweet.reply_to) > 0: # FIXME: QRT = is_reply and len(tweet.reply_to) == 0? + reply_to = tweet.reply_to[0]['id'] + reply_others = [x['id'] for x in tweet.reply_to[1:]] + mentions.update(reply_others) + try: mentions.remove(reply_to) + except: pass - datetime = datetime.strptime(tweet.datetime, '%Y-%m-%d %H:%M:%S %Z') - return TalentTweet(tweet_id=tweet.id, author_id=tweet.user_id, date_time=datetime, mrq=(mentions, reply_to, None)) + date_time = datetime.strptime(tweet.datetime, '%Y-%m-%d %H:%M:%S %Z') + return TalentTweet(tweet_id=tweet.id, author_id=tweet.user_id, date_time=date_time, mrq=(mentions, reply_to, None)) @staticmethod @@ -79,9 +85,9 @@ class TalentTweet: def __init__(self, tweet_id: int, author_id: int,date_time: datetime, mrq: tuple): self.tweet_id, self.author_id = tweet_id, author_id self.date_time = date_time - self.mentions = mrq[0] - self.reply_to = mrq[1] - self.quote_retweeted = mrq[2] + self.mentions = tuple(int(x) for x in mrq[0]) + self.reply_to = int(mrq[1]) if mrq[1] is not None else None + self.quote_retweeted = int(mrq[2]) if mrq[2] is not None else None # all users involved, except for the author self.all_parties = {self.reply_to, self.quote_retweeted} diff --git a/src/util.py b/src/util.py index 6fd4742..455a5f4 100644 --- a/src/util.py +++ b/src/util.py @@ -1,5 +1,6 @@ ## Shared utility functions. 
+import datetime import os import twint @@ -16,6 +17,12 @@ def get_project_dir(): def clamp(n, smallest, largest): return max(smallest, min(n, largest)) +def datetime_to_tdate(date_time: datetime.datetime): + return date_time.strftime("%Y-%m-%d") + +def tdate_to_datetime(tdate: str): + return datetime.datetime.strptime(tdate, "%Y-%m-%d") + async def create_ttweet_image(ttweet): tc = TweetCapture() filename = 'img.png'