diff --git a/.gitignore b/.gitignore index 3bbb790..b1c8472 100644 --- a/.gitignore +++ b/.gitignore @@ -143,4 +143,5 @@ cython_debug/ .vscode # project-specific -/secrets.ini \ No newline at end of file +/secrets.ini +/queue.txt \ No newline at end of file diff --git a/queue.txt b/queue.txt deleted file mode 100644 index e69de29..0000000 diff --git a/src/catchup.py b/src/catchup.py index 7cf0396..8c1ecf3 100644 --- a/src/catchup.py +++ b/src/catchup.py @@ -4,10 +4,13 @@ # # We should post, at the fastest, one tweet per minute. +import traceback +import sys import os import asyncio import twint +import tweepy from util import * from talent_lists import * @@ -31,17 +34,100 @@ def get_user_tweet_ids(id, limit=None): c.Store_object_tweets_list = tweets c.Hide_output = True - user_str = f'{id} ({util.get_username(id)})' - print(f'Finding tweets from {user_str})') + user_str = f'{util.get_username(id)}' + print(f'Scraping tweets from {user_str}...') try: twint.run.Search(c) - return [x.id for x in tweets] except: print(f'Had trouble getting tweets from {user_str}') - return list() + + ret = [x.id for x in tweets] + print(f'Scraped {len(ret)} tweets') + return ret -def work_on_queue(file): - print('TODO: implement work_on_queue') +# Produce tweet IDs from talent_list.talents for the producer/consumer model. +# Put lists of tweet IDs as we create them. +# Put None to queue to indicate end. 
async def produce_ids_from_talents(queue: asyncio.Queue, finished_users):
    """Scrape tweet IDs for each talent and hand them to the consumer.

    For every key of ``talents`` that is not in ``finished_users``, the
    talent's tweet IDs are fetched with ``get_user_tweet_ids`` (called
    synchronously) and one ``(talent_id, tweet_ids)`` pair is put on
    ``queue``. The talent id travels with the batch so the consumer can
    record a per-account status line. A final ``None`` marks the end of
    the stream.

    Args:
        queue: Queue shared with ``consume_ids_into_ttweets``.
        finished_users: Container of talent ids to skip; membership is
            tested with the same key type as ``talents``.
    """
    def debug(msg):
        print(f'[prd] {msg}')

    for talent_id in talents:
        username = util.get_username(talent_id)
        if talent_id in finished_users:
            debug(f'@{username} already done, skipping...')
            continue

        tweet_ids = get_user_tweet_ids(talent_id)
        debug(f'adding {username}\'s tweets to queue')
        await queue.put((talent_id, tweet_ids))
        # put() on a queue that is not full returns without suspending, so
        # without an explicit yield the consumer task would not get to run
        # until every talent had been scraped.
        await asyncio.sleep(0)

    await queue.put(None)


async def consume_ids_into_ttweets(queue: asyncio.Queue, queue_file: str):
    """Turn queued tweet IDs into cross-company TalentTweets and save them.

    Reads ``(talent_id, tweet_ids)`` pairs from ``queue`` until ``None``
    arrives. Each tweet ID is resolved with
    ``tt.TalentTweet.create_from_id``; tweets for which
    ``is_cross_company()`` is true are kept, keyed by their ``tweet_id``.

    Layout written to ``queue_file``:
        * one ``# {talent_id} {status}`` line per processed batch, where
          status ``0`` means success and ``1`` means error/incomplete,
        * one empty line,
        * one ``serialize()`` line per kept tweet, in ascending key order.

    The first batch that raises stops consumption; whatever was collected
    up to that point is still written and returned.

    Args:
        queue: Queue fed by ``produce_ids_from_talents``.
        queue_file: Path of the file to (re)write.

    Returns:
        dict mapping tweet id to TalentTweet, sorted by tweet id.
    """
    def debug(msg):
        print(f'[con] {msg}')

    ttweets_dict = dict()
    # NOTE(review): mode 'w' truncates queue_file on open, so anything that
    # was stored there before is lost unless the caller kept it in memory.
    with open(queue_file, 'w') as f:
        while True:
            item = await queue.get()
            if item is None:
                break

            talent_id, tweet_ids = item
            tweet_id = None  # keeps the error message valid if iteration itself fails
            try:
                for tweet_id in tweet_ids:
                    ttweet = await tt.TalentTweet.create_from_id(id=tweet_id)
                    if ttweet.is_cross_company():
                        # Key by the real id; a constant key would keep
                        # only the most recent tweet.
                        ttweets_dict[ttweet.tweet_id] = ttweet
            except Exception:
                # Exception rather than a bare except, so task cancellation
                # and KeyboardInterrupt still propagate.
                debug(traceback.format_exc())
                debug(f'Error retrieving Tweet #{tweet_id} from api!')
                f.write(f'# {talent_id} 1\n')  # 1 = error/incomplete
                break
            else:
                f.write(f'# {talent_id} 0\n')  # 0 = success

        f.write('\n')
        ttweets_dict = dict(sorted(ttweets_dict.items()))
        for ttweet in ttweets_dict.values():
            f.write(f'{ttweet.serialize()}\n')
    return ttweets_dict


# If queue.txt doesn't exist, creates and populates it.
+# Returns a list of sorted and filtered TalentTweets (should +# be equivalent to queue.txt) +async def get_cross_talent_tweets(queue_file): + finished_users = set() + ttweets_dict = dict() + + # Populate structures with existing data from queue.txt + try: + print('Processing existing data in queue.txt...') + with open(queue_file, 'r') as f: + # Check for finished and incomplete accounts + # LINE FORMAT: "# {user_id} {status_num}" + for line in f: + tokens = line.split() + if len(tokens) != 3 or tokens[0][0] != '#': + # reached end of accounts list + break + if tokens[2] == 0: + finished_users.add(tokens[1]) + + # Add existing serialized TalentTweets into ttweets + for line in f: + tokens = line.split() + if len(tokens) == 0 or tokens[0][0] == '#': + continue + ttweet = tt.TalentTweet.deserialize(line) + ttweets_dict[ttweet.tweet_id] = ttweet + except FileNotFoundError: + print('Couldn\'t find queue.txt.') + + async_queue = asyncio.Queue() + consumer = asyncio.create_task(consume_ids_into_ttweets(queue=async_queue, queue_file=queue_file)) + await produce_ids_from_talents(queue=async_queue, finished_users=finished_users) + ttweets_dict = await consumer + return ttweets_dict + +def process_queue(file): + print('TODO: implement process_queue') # while Queue.txt has lines present # attempt to deserialize first line of Queue.txt # exit program if failed, stating error @@ -52,41 +138,17 @@ def work_on_queue(file): # we're done! post tweet announcing done with archives pass -# If queue.txt doesn't exist, creates and populates it. 
-# Returns a list of sorted and filtered TalentTweets (should -# be equivalent to queue.txt) -def create_ttweets_queue(path) -> list: - print('Creating ttweets queue') - if not os.path.exists(path): - ttweets = list() - with open(path, 'x') as f: - for talent_id in talents.keys(): - tweet_ids = get_user_tweet_ids(talent_id) - print(f'retrieved {len(tweet_ids)} tweets') - for tweet_id in tweet_ids: - ttweet = tt.TalentAPITweet(tweet_id) - if ttweet.is_cross_company(): - ttweets.append(ttweet) - - ttweets.sort(key=lambda ttweet: ttweet.tweet_id) - for ttweet in ttweets: - f.write(f'{ttweet.serialize()}\n') - return ttweets - else: - return list() - - async def run(): # if Queue.txt exists # work through the tweets in Queue.txt # else # look through every talent's tweets, saving only cross-company tweets into a list # sort the list by tweet_id - # create Queue.txt and save all tweets through there + # create Queue.txt and save all tweets (serialized) there # post a tweet announcing archival intent # work through the tweets in Queue.txt + queue_path = get_queue_file() - if os.path.exists(queue_path): - work_on_queue(queue_path) - else: - ttweets = create_ttweets_queue(queue_path) \ No newline at end of file + ttweet_dict = await get_cross_talent_tweets(queue_path) + for ttweet in ttweet_dict.values(): + print(ttweet) \ No newline at end of file diff --git a/src/talent_lists.py b/src/talent_lists.py index f5c249a..875bad8 100644 --- a/src/talent_lists.py +++ b/src/talent_lists.py @@ -15,7 +15,7 @@ def __create_dict(file, _dict): if len(words) == 2 and line[0] != '#': name, id = line.split() talents[int(id)] = name - name = util.get_username_online(id) # attempt to get updated name + # name = util.get_username_online(id) # attempt to get updated name talents[int(id)] = name _dict[int(id)] = name def init(): diff --git a/src/talenttweet.py b/src/talenttweet.py index 9917366..4e14dd7 100644 --- a/src/talenttweet.py +++ b/src/talenttweet.py @@ -40,7 +40,19 @@ class 
TalentTweet: tweet_id=tweet_id, author_id=author_id, date_time=date_time, mrq=(mentions, reply_to, quote_retweeted) ) - + + @staticmethod + async def create_from_id(id): + resp = await TwAPI.instance.get_tweet_response(id) + tweet = resp.data + mrq = TwAPI.get_mrq(tweet, resp) + + return TalentTweet( + tweet_id=tweet.id, + author_id=tweet.author_id, + date_time=tweet.created_at, + mrq=mrq + ) def __init__(self, tweet_id: int, author_id: int,date_time: datetime, mrq: tuple): self.tweet_id, self.author_id = tweet_id, author_id @@ -113,40 +125,4 @@ class TalentTweet: def get_datetime_str(self): unpad = '#' if platform.system() == 'Windows' else '-' - return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)') - - -class TalentAPITweet(TalentTweet): - def __init__(self, tweet_id=None, tweet=None, mrq: tuple=None): - if tweet and mrq: - self.tweet = tweet - elif tweet_id: - tweet_id = int(tweet_id) - resp = TwAPI.instance.get_tweet_response(tweet_id) - self.tweet = resp.data - mrq = TwAPI.get_mrq(self.tweet, resp) - else: - raise ValueError('did not supply sufficient tweet information') - - TalentTweet.__init__( - self, - tweet_id=self.tweet.id, - author_id=self.tweet.author_id, - date_time=self.tweet.created_at, - mrq=mrq - ) - - def __repr__(self) -> str: - return ( - f'{self.tweet_id} from {util.get_username(self.author_id)}:\n' - f'{self.tweet.text}\n' - f'------------------------------------------------------\n' - f'{self.get_datetime_str()}\n' - f'{self.get_all_parties_usernames()}\n' - f'mentions: {self.mentions}\n' - f'reply_to: {self.reply_to}\n' - f'quote_retweeted: {self.quote_retweeted}\n' - f'{self.serialize()}\n' - f'Cross-company: {self.is_cross_company()}\n' - f'======================================================' - ) \ No newline at end of file + return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)') \ No newline at end of file diff --git a/src/twapi.py b/src/twapi.py index 42dd6e9..7acedb7 100644 --- a/src/twapi.py +++ 
b/src/twapi.py @@ -1,5 +1,6 @@ import asyncio from math import inf +from time import time import tweepy from tweetcapture import TweetCapture @@ -61,13 +62,18 @@ class TwAPI: access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret() ) - def get_tweet_response(self, id): - return TwAPI.instance.client.get_tweet( - id, - media_fields=TwAPI.TWEET_MEDIA_FIELDS, - tweet_fields=TwAPI.TWEET_FIELDS, - expansions=TwAPI.TWEET_EXPANSIONS - ) + async def get_tweet_response(self, id, attempt = 0): + try: + return TwAPI.instance.client.get_tweet( + id, + media_fields=TwAPI.TWEET_MEDIA_FIELDS, + tweet_fields=TwAPI.TWEET_FIELDS, + expansions=TwAPI.TWEET_EXPANSIONS + ) + except tweepy.TooManyRequests: + print(f'[{attempt}]get_tweet_response({id}):\n\ttoo many API requests -- trying again in 1 minute...') + await asyncio.sleep(60) + return await self.get_tweet_response(id, attempt=attempt+1) # Create a post that showcases given tweet and its mentions set. # Try do do this without retireving Tweet data. diff --git a/src/util.py b/src/util.py index 28bc8f7..6fd4742 100644 --- a/src/util.py +++ b/src/util.py @@ -51,9 +51,9 @@ def get_username_online(user_id): c.Store_object = True c.Hide_output = True try: + twint.output.users_list.clear() twint.run.Lookup(c) user = twint.output.users_list[0] - twint.output.users_list.clear() return user.username except: return f'#{user_id}' \ No newline at end of file