# src/catchup.py

## The bot's catch-up mode
# Scan all accounts for cross-company interactions.
# Terminates when finished scanning and posting.
#
# We should post, at the fastest, one tweet per minute.

import traceback
import sys
import os
import asyncio

import twint
import tweepy

from util import *
from talent_lists import *
from twapi import TwAPI
import talenttweet as tt

def get_queue_file():
    """Return the path of queue.txt inside the project directory.

    The result is whatever ``util.get_project_dir()`` returns, followed by
    ``/queue.txt``.
    """
    # NOTE(review): this relies on a module-level name `util`; the visible
    # imports are star-imports, so confirm `util` is actually bound here.
    project_dir = util.get_project_dir()
    return f'{project_dir}/queue.txt'

def get_local_queue():
    """Placeholder for loading the local queue; does nothing and returns None."""
    # Sketch left for the eventual implementation:
    # f = open(os.path.join(get_project_dir(), 'queue.txt'))
    return None

def get_user_tweets(id, limit=None):
    """Scrape a user's tweets with twint and return the collected results.

    Args:
        id: Twitter user ID; assigned to ``twint.Config.User_id``.
        limit: Assigned to ``twint.Config.Limit`` as-is (``None`` by default).

    Returns:
        The list that twint was configured to store its results in. If the
        scrape fails, the list holds whatever was stored before the failure
        (possibly nothing).
    """
    tweets = list()
    c = twint.Config()
    c.User_id = id
    c.Limit = limit
    c.Store_object = True
    # twint appends its results to this list in place.
    c.Store_object_tweets_list = tweets
    c.Hide_output = True

    user_str = f'{util.get_username(id)}'
    print(f'Scraping tweets from {user_str}...')
    try:
        twint.run.Search(c)
    except Exception:
        # Best effort: report the failure and keep going with what we have.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # still stop the program.
        print(f'Had trouble getting tweets from {user_str}')
        traceback.print_exc()

    print(f'Scraped {len(tweets)} tweets')
    return tweets

async def get_cross_talent_tweets(queue_file):
    """Load the TalentTweets already saved in the queue file.

    The file is expected to start with account status lines of the form
    ``# {user_id} {status_num}``, followed by serialized TalentTweets (one
    per line). Blank lines and lines starting with ``#`` in the tweet section
    are skipped.

    Creating and populating the queue file when it is missing is not
    implemented yet (see the TODO below).

    Args:
        queue_file: Path of the queue file to read.

    Returns:
        A dict mapping ``tweet_id`` to the deserialized TalentTweet for every
        tweet found in the file; empty if the file does not exist.
    """
    # User IDs whose status is "0" (treated as finished). Not used yet; kept
    # for the scraping step described in the TODO below.
    finished_users = set()
    ttweets_dict = dict()

    # Populate structures with existing data from queue.txt
    try:
        print('Processing existing data in queue.txt...')
        with open(queue_file, 'r') as f:
            in_accounts = True
            for line in f:
                tokens = line.split()

                if in_accounts:
                    # LINE FORMAT: "# {user_id} {status_num}"
                    if len(tokens) == 3 and tokens[0].startswith('#'):
                        # Tokens are strings, so compare against '0', not 0.
                        if tokens[2] == '0':
                            finished_users.add(tokens[1])
                        continue
                    # Reached the end of the accounts list. Fall through so
                    # this line is still examined as a possible tweet rather
                    # than being silently dropped.
                    in_accounts = False

                if not tokens or tokens[0].startswith('#'):
                    continue
                ttweet = tt.TalentTweet.deserialize(line)
                ttweets_dict[ttweet.tweet_id] = ttweet
    except FileNotFoundError:
        print('Couldn\'t find queue.txt.')

    # TODO: implement ordered cross-company ttweets dict creation using twint

    return ttweets_dict

def process_queue(file):
    """Placeholder for posting the queued tweets; only prints a notice for now.

    Planned behaviour (not implemented):
      - while Queue.txt has lines present:
          - attempt to deserialize the first line of Queue.txt,
            exiting the program with an error message on failure
          - retry posting the tweet until it succeeds
          - delete the serialized line from Queue.txt and save the file
      - once the queue is empty, post a tweet announcing that the archive
        run is done
    """
    print('TODO: implement process_queue')
    return None

async def run():
    """Entry point of catch-up mode: load the queued tweets and print them.

    Intended flow (only partly implemented):
      - if Queue.txt exists, work through the tweets in it
      - otherwise:
          - look through every talent's tweets, keeping only cross-company
            tweets
          - sort them by tweet_id
          - create Queue.txt and save all tweets (serialized) there
          - post a tweet announcing archival intent
          - work through the tweets in Queue.txt
    """
    queue_path = get_queue_file()
    ttweet_dict = await get_cross_talent_tweets(queue_path)
    if ttweet_dict is None:
        # The loader is still under construction and may finish without
        # returning anything; treat that as "no tweets" instead of crashing
        # on None.values().
        ttweet_dict = {}
    for ttweet in ttweet_dict.values():
        print(ttweet)
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00			`## The bot's catch-up mode`
			`# Scan all accounts for cross-company interactions.`
			`# Terminates when finished scanning and posting.`
			`#`
			`# We should post, at the fastest, one tweet per minute.`

implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`import traceback`
			`import sys`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00			`import os`
a couple libraries and even MORE restructuring 2022-09-25 03:39:15 -07:00			`import asyncio`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00
			`import twint`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`import tweepy`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00
			`from util import *`
			`from talent_lists import *`
fundamental progression, fixes 2022-09-25 18:31:50 -07:00			`from twapi import TwAPI`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00			`import talenttweet as tt`

fundamental progression, fixes 2022-09-25 18:31:50 -07:00			`def get_queue_file():`
			`return f'{util.get_project_dir()}/queue.txt'`

added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00			`def get_local_queue():`
			`# f = open(os.path.join(get_project_dir(), 'queue.txt'))`
			`pass`

			`## Returns the ID of all tweets (up to limit) from a user ID.`
transition to using twint for ttweet construction 2022-09-26 03:50:11 -07:00			`def get_user_tweets(id, limit=None):`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00			`tweets = list()`
			`c = twint.Config()`
			`c.User_id = id`
			`c.Limit = limit`
			`c.Store_object = True`
			`c.Store_object_tweets_list = tweets`
a couple libraries and even MORE restructuring 2022-09-25 03:39:15 -07:00			`c.Hide_output = True`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`user_str = f'{util.get_username(id)}'`
			`print(f'Scraping tweets from {user_str}...')`
fundamental progression, fixes 2022-09-25 18:31:50 -07:00			`try:`
			`twint.run.Search(c)`
			`except:`
			`print(f'Had trouble getting tweets from {user_str}')`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00
transition to using twint for ttweet construction 2022-09-26 03:50:11 -07:00			`print(f'Scraped {len(tweets)} tweets')`
			`return tweets`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00
			`# If queue.txt doesn't exist, creates and populates it.`
			`# Returns a list of sorted and filtered TalentTweets (should`
			`# be equivalent to queue.txt)`
			`async def get_cross_talent_tweets(queue_file):`
			`finished_users = set()`
			`ttweets_dict = dict()`
added twint (scraper), restructuring 2022-09-24 17:56:58 -07:00
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`# Populate structures with existing data from queue.txt`
			`try:`
			`print('Processing existing data in queue.txt...')`
			`with open(queue_file, 'r') as f:`
			`# Check for finished and incomplete accounts`
			`# LINE FORMAT: "# {user_id} {status_num}"`
			`for line in f:`
			`tokens = line.split()`
			`if len(tokens) != 3 or tokens[0][0] != '#':`
			`# reached end of accounts list`
			`break`
			`if tokens[2] == 0:`
			`finished_users.add(tokens[1])`

			`# Add existing serialized TalentTweets into ttweets`
			`for line in f:`
			`tokens = line.split()`
			`if len(tokens) == 0 or tokens[0][0] == '#':`
			`continue`
			`ttweet = tt.TalentTweet.deserialize(line)`
			`ttweets_dict[ttweet.tweet_id] = ttweet`
			`except FileNotFoundError:`
			`print('Couldn\'t find queue.txt.')`

transition to using twint for ttweet construction 2022-09-26 03:50:11 -07:00			`# TODO: implement ordered cross-company ttweets dict creation using twint`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00
			`def process_queue(file):`
			`print('TODO: implement process_queue')`
pseudocode for catch runtime 2022-09-25 04:39:43 -07:00			`# while Queue.txt has lines present`
			`# attempt to deserialize first line of Queue.txt`
			`# exit program if failed, stating error`
			`# while post isn't successful`
			`# attempt to post tweet`
			`# delete serialized line from Queue.txt, save it`
			`#`
			`# we're done! post tweet announcing done with archives`
			`pass`
a couple libraries and even MORE restructuring 2022-09-25 03:39:15 -07:00
pseudocode for catch runtime 2022-09-25 04:39:43 -07:00			`async def run():`
			`# if Queue.txt exists`
			`# work through the tweets in Queue.txt`
			`# else`
			`# look through every talent's tweets, saving only cross-company tweets into a list`
			`# sort the list by tweet_id`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`# create Queue.txt and save all tweets (serialized) there`
pseudocode for catch runtime 2022-09-25 04:39:43 -07:00			`# post a tweet announcing archival intent`
fundamental progression, fixes 2022-09-25 18:31:50 -07:00			`# work through the tweets in Queue.txt`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00
fundamental progression, fixes 2022-09-25 18:31:50 -07:00			`queue_path = get_queue_file()`
implemented producer/consumer system with asyncio 2022-09-26 02:44:26 -07:00			`ttweet_dict = await get_cross_talent_tweets(queue_path)`
			`for ttweet in ttweet_dict.values():`
			`print(ttweet)`