implemented producer/consumer system with asyncio

2022-09-26 02:44:26 -07:00
parent 0038439728
commit 4575466874
7 changed files with 128 additions and 83 deletions
@@ -143,4 +143,5 @@ cython_debug/
 .vscode
 # project-specific
-/secrets.ini
+/secrets.ini
 /queue.txt
@@ -4,10 +4,13 @@
 #
 # We should post, at the fastest, one tweet per minute.
 import traceback
 import sys
 import os
 import asyncio
 import twint
 import tweepy
 from util import *
 from talent_lists import *
@@ -31,17 +34,100 @@ def get_user_tweet_ids(id, limit=None):
    c.Store_object_tweets_list = tweets
    c.Hide_output = True
-    user_str = f'{id} ({util.get_username(id)})'
+    user_str = f'{util.get_username(id)}'
-    print(f'Finding tweets from {user_str})')
+    print(f'Scraping tweets from {user_str}...')
    try:
        twint.run.Search(c)
        return [x.id for x in tweets]
    except:
        print(f'Had trouble getting tweets from {user_str}')
-        return list()
+    
    ret = [x.id for x in tweets]
    print(f'Scraped {len(ret)} tweets')
    return ret
-def work_on_queue(file):
+# Produce tweet IDs from talent_list.talents for the producer/consumer model.
-    print('TODO: implement work_on_queue')
+# Put lists of tweet IDs as we create them.
 # Put None to queue to indicate end.
 async def produce_ids_from_talents(queue: asyncio.Queue, finished_users):
    """Producer half of the producer/consumer pipeline.

    Walks every key of ``talents``; for each one not contained in
    ``finished_users`` it calls ``get_user_tweet_ids`` and puts the whole
    result on ``queue`` as a single item. After the last talent a ``None``
    sentinel is queued to signal the end of the stream.
    """
    def log(message):
        print(f'[prd] {message}')

    for account_id in talents.keys():
        if account_id in finished_users:
            log(f'@{util.get_username(account_id)} already done, skipping...')
            continue
        scraped_ids = get_user_tweet_ids(account_id)
        log(f'adding {util.get_username(account_id)}\'s tweets to queue')
        await queue.put(scraped_ids)

    # Sentinel: tells the consumer that nothing more is coming.
    await queue.put(None)
 async def consume_ids_into_ttweets(queue: asyncio.Queue, queue_file: str):
    """Consumer half of the producer/consumer pipeline.

    Takes batches of tweet IDs from ``queue`` until a ``None`` sentinel
    arrives, turns every ID into a ``TalentTweet`` and keeps the ones for
    which ``is_cross_company()`` is true. ``queue_file`` is opened in ``'w'``
    mode (so any previous content is replaced) and receives:

    * one status line per batch: ``0`` on success, ``1`` on error,
    * an empty separator line,
    * one serialized TalentTweet per line, ordered by tweet ID.

    Processing stops at the first batch that raises.

    Returns:
        dict mapping tweet ID -> TalentTweet, sorted by tweet ID.
    """
    def debug(message):
        print(f'[con] {message}')

    ttweets_dict = dict()
    with open(queue_file, 'w') as f:
        while True:
            tweet_ids = await queue.get()
            if tweet_ids is None:
                break
            # Pre-bind so the error message below never references an unbound
            # name when the failure happens before the first iteration.
            tweet_id = None
            try:
                for tweet_id in tweet_ids:
                    ttweet = await tt.TalentTweet.create_from_id(id=tweet_id)
                    if ttweet.is_cross_company():
                        # Key by the tweet's own ID; a constant key would make
                        # every stored tweet overwrite the previous one.
                        ttweets_dict[ttweet.tweet_id] = ttweet
            except Exception:
                # `Exception` (not a bare except) lets task cancellation and
                # KeyboardInterrupt propagate instead of being logged away.
                debug(traceback.format_exc())
                debug(f'Error retrieving Tweet #{tweet_id} from api!')
                f.write('1\n')  # 1 = error/incomplete
                break
            else:
                f.write('0\n')  # 0 = success
        f.write('\n')
        ttweets_dict = dict(sorted(ttweets_dict.items()))
        for ttweet in ttweets_dict.values():
            f.write(f'{ttweet.serialize()}\n')
    return ttweets_dict
 # Loads whatever queue.txt already holds, then scrapes the remaining talents.
 # Returns a dict of filtered TalentTweets (previously saved entries plus
 # newly fetched ones).
 async def get_cross_talent_tweets(queue_file):
    """Collect cross-company TalentTweets, reusing data saved in ``queue_file``.

    The file is read first. Leading lines of the form
    ``# {user_id} {status_num}`` form the accounts list; a status of ``0``
    marks that user as finished. Every later non-empty line that does not
    start with ``#`` is deserialized as a TalentTweet. A missing file is not
    an error.

    A consumer task is then started and the producer is run for every talent
    that is not finished.

    NOTE(review): the accounts-list format parsed here has to match what the
    consumer writes to the file -- confirm the two agree.

    Returns:
        dict of TalentTweets: entries loaded from the file (keyed by tweet
        ID), updated with the dict returned by the consumer.
    """
    finished_users = set()
    ttweets_dict = dict()
    # Populate structures with existing data from queue.txt
    try:
        print('Processing existing data in queue.txt...')
        with open(queue_file, 'r') as f:
            in_accounts_list = True
            for line in f:
                tokens = line.split()
                if in_accounts_list:
                    # LINE FORMAT: "# {user_id} {status_num}"
                    if len(tokens) == 3 and tokens[0].startswith('#'):
                        # split() yields strings, so the status has to be
                        # compared with '0', not the integer 0.
                        if tokens[2] == '0':
                            try:
                                # presumably `talents` is keyed by int user
                                # ids, so store ints for the membership test
                                # in the producer -- verify against talent_lists
                                finished_users.add(int(tokens[1]))
                            except ValueError:
                                # Malformed id: leave the account unfinished.
                                pass
                        continue
                    # Reached end of accounts list. Fall through so this very
                    # line is still considered as a serialized TalentTweet.
                    in_accounts_list = False
                if not tokens or tokens[0].startswith('#'):
                    continue
                ttweet = tt.TalentTweet.deserialize(line)
                ttweets_dict[ttweet.tweet_id] = ttweet
    except FileNotFoundError:
        print('Couldn\'t find queue.txt.')
    async_queue = asyncio.Queue()
    consumer = asyncio.create_task(consume_ids_into_ttweets(queue=async_queue, queue_file=queue_file))
    await produce_ids_from_talents(queue=async_queue, finished_users=finished_users)
    # Merge instead of overwrite: tweets of skipped (finished) users exist
    # only in the data loaded above.
    ttweets_dict.update(await consumer)
    return ttweets_dict
 def process_queue(file):
    print('TODO: implement process_queue')
    # while Queue.txt has lines present
    #   attempt to deserialize first line of Queue.txt
    #     exit program if failed, stating error
@@ -52,41 +138,17 @@ def work_on_queue(file):
    # we're done! post tweet announcing done with archives
    pass
 # If queue.txt doesn't exist, creates and populates it.
 # Returns a list of sorted and filtered TalentTweets (should
 # be equivalent to queue.txt)
 def create_ttweets_queue(path) -> list:
    """Build the queue file at ``path`` unless it already exists.

    When ``path`` exists nothing is scraped and an empty list is returned.
    Otherwise the file is created exclusively (``'x'`` mode), every talent's
    tweet IDs are fetched, each ID is wrapped in a ``TalentAPITweet`` and the
    cross-company ones are kept, ordered by ``tweet_id``, written one
    serialized tweet per line, and returned.
    """
    print('Creating ttweets queue')
    if os.path.exists(path):
        return []

    with open(path, 'x') as out:
        kept = []
        for talent_id in talents.keys():
            scraped_ids = get_user_tweet_ids(talent_id)
            print(f'retrieved {len(scraped_ids)} tweets')
            for scraped_id in scraped_ids:
                candidate = tt.TalentAPITweet(scraped_id)
                if candidate.is_cross_company():
                    kept.append(candidate)
        ordered = sorted(kept, key=lambda entry: entry.tweet_id)
        out.writelines(f'{entry.serialize()}\n' for entry in ordered)
    return ordered
 async def run():
    # if Queue.txt exists
    #   work through the tweets in Queue.txt
    # else
    #   look through every talent's tweets, saving only cross-company tweets into a list
    #   sort the list by tweet_id
-    #   create Queue.txt and save all tweets through there
+    #   create Queue.txt and save all tweets (serialized) there
    #   post a tweet announcing archival intent
    #   work through the tweets in Queue.txt
    queue_path = get_queue_file()
-    if os.path.exists(queue_path):
+    ttweet_dict = await get_cross_talent_tweets(queue_path)
-        work_on_queue(queue_path)
+    for ttweet in ttweet_dict.values():
-    else:
+        print(ttweet)
        ttweets = create_ttweets_queue(queue_path)
@@ -15,7 +15,7 @@ def __create_dict(file, _dict):
            if len(words) == 2 and line[0] != '#':
                name, id = line.split()
                talents[int(id)] = name
-                name = util.get_username_online(id) # attempt to get updated name
+                # name = util.get_username_online(id) # attempt to get updated name
                talents[int(id)] = name
                _dict[int(id)] = name
 def init():
@@ -40,7 +40,19 @@ class TalentTweet:
            tweet_id=tweet_id, author_id=author_id,
            date_time=date_time, mrq=(mentions, reply_to, quote_retweeted)
        )
-        
+    
    @staticmethod
    async def create_from_id(id):
        """Build a ``TalentTweet`` for the tweet with the given ``id``.

        Awaits ``TwAPI.instance.get_tweet_response(id)``, derives the
        mentions/reply/quote tuple with ``TwAPI.get_mrq`` and forwards the
        tweet's ``id``, ``author_id`` and ``created_at`` to the constructor.
        """
        response = await TwAPI.instance.get_tweet_response(id)
        payload = response.data
        relations = TwAPI.get_mrq(payload, response)
        init_kwargs = dict(
            tweet_id=payload.id,
            author_id=payload.author_id,
            date_time=payload.created_at,
            mrq=relations,
        )
        return TalentTweet(**init_kwargs)
    def __init__(self, tweet_id: int, author_id: int,date_time: datetime, mrq: tuple):
        self.tweet_id, self.author_id = tweet_id, author_id
@@ -113,40 +125,4 @@ class TalentTweet:
    def get_datetime_str(self):
        unpad = '#' if platform.system() == 'Windows' else '-'
-        return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
+        return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
 class TalentAPITweet(TalentTweet):
    """A ``TalentTweet`` that also keeps the API tweet object in ``self.tweet``."""

    def __init__(self, tweet_id=None, tweet=None, mrq: tuple=None):
        """Initialise from a ready tweet object or by fetching ``tweet_id``.

        If both ``tweet`` and ``mrq`` are truthy they are used as given.
        Otherwise a truthy ``tweet_id`` is converted to ``int`` and looked up
        through ``TwAPI``; ``mrq`` is then derived from the response.

        Raises:
            ValueError: when neither option provides enough information.
        """
        supplied = bool(tweet and mrq)
        if not supplied and not tweet_id:
            raise ValueError('did not supply sufficient tweet information')

        if supplied:
            self.tweet = tweet
        else:
            # NOTE(review): the response is used without awaiting, so this
            # expects get_tweet_response to be a plain synchronous call -- confirm.
            response = TwAPI.instance.get_tweet_response(int(tweet_id))
            self.tweet = response.data
            mrq = TwAPI.get_mrq(self.tweet, response)

        TalentTweet.__init__(
            self,
            tweet_id=self.tweet.id,
            author_id=self.tweet.author_id,
            date_time=self.tweet.created_at,
            mrq=mrq,
        )

    def __repr__(self) -> str:
        """Return a multi-line, human-readable dump of the tweet."""
        report_lines = [
            f'{self.tweet_id} from {util.get_username(self.author_id)}:',
            f'{self.tweet.text}',
            '------------------------------------------------------',
            f'{self.get_datetime_str()}',
            f'{self.get_all_parties_usernames()}',
            f'mentions: {self.mentions}',
            f'reply_to: {self.reply_to}',
            f'quote_retweeted: {self.quote_retweeted}',
            f'{self.serialize()}',
            f'Cross-company: {self.is_cross_company()}',
            '======================================================',
        ]
        return '\n'.join(report_lines)
@@ -1,5 +1,6 @@
 import asyncio
 from math import inf
 from time import time
 import tweepy
 from tweetcapture import TweetCapture
@@ -61,13 +62,18 @@ class TwAPI:
            access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
        )
-    def get_tweet_response(self, id):
+    async def get_tweet_response(self, id, attempt = 0):
-        return TwAPI.instance.client.get_tweet(
+        try:
-            id,
+            return TwAPI.instance.client.get_tweet(
-            media_fields=TwAPI.TWEET_MEDIA_FIELDS,
+                id,
-            tweet_fields=TwAPI.TWEET_FIELDS,
+                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
-            expansions=TwAPI.TWEET_EXPANSIONS
+                tweet_fields=TwAPI.TWEET_FIELDS,
-        )
+                expansions=TwAPI.TWEET_EXPANSIONS
            )
        except tweepy.TooManyRequests:
            print(f'[{attempt}]get_tweet_response({id}):\n\ttoo many API requests -- trying again in 1 minute...')
            await asyncio.sleep(60)
            return await self.get_tweet_response(id, attempt=attempt+1)
    # Create a post that showcases given tweet and its mentions set.
    # Try to do this without retrieving Tweet data.
@@ -51,9 +51,9 @@ def get_username_online(user_id):
    c.Store_object = True
    c.Hide_output = True
    try:
        twint.output.users_list.clear()
        twint.run.Lookup(c)
        user = twint.output.users_list[0]
        twint.output.users_list.clear()
        return user.username
    except:
        return f'#{user_id}'