implemented producer/consumer system with asyncio
This commit is contained in:
+2
-1
@@ -143,4 +143,5 @@ cython_debug/
|
|||||||
.vscode
|
.vscode
|
||||||
|
|
||||||
# project-specific
|
# project-specific
|
||||||
/secrets.ini
|
/secrets.ini
|
||||||
|
/queue.txt
|
||||||
+97
-35
@@ -4,10 +4,13 @@
|
|||||||
#
|
#
|
||||||
# We should post, at the fastest, one tweet per minute.
|
# We should post, at the fastest, one tweet per minute.
|
||||||
|
|
||||||
|
import traceback
|
||||||
|
import sys
|
||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
import twint
|
import twint
|
||||||
|
import tweepy
|
||||||
|
|
||||||
from util import *
|
from util import *
|
||||||
from talent_lists import *
|
from talent_lists import *
|
||||||
@@ -31,17 +34,100 @@ def get_user_tweet_ids(id, limit=None):
|
|||||||
c.Store_object_tweets_list = tweets
|
c.Store_object_tweets_list = tweets
|
||||||
c.Hide_output = True
|
c.Hide_output = True
|
||||||
|
|
||||||
user_str = f'{id} ({util.get_username(id)})'
|
user_str = f'{util.get_username(id)}'
|
||||||
print(f'Finding tweets from {user_str})')
|
print(f'Scraping tweets from {user_str}...')
|
||||||
try:
|
try:
|
||||||
twint.run.Search(c)
|
twint.run.Search(c)
|
||||||
return [x.id for x in tweets]
|
|
||||||
except:
|
except:
|
||||||
print(f'Had trouble getting tweets from {user_str}')
|
print(f'Had trouble getting tweets from {user_str}')
|
||||||
return list()
|
|
||||||
|
ret = [x.id for x in tweets]
|
||||||
|
print(f'Scraped {len(ret)} tweets')
|
||||||
|
return ret
|
||||||
|
|
||||||
def work_on_queue(file):
|
# Produce tweet IDs from talent_list.talents for the producer/consumer model.
|
||||||
print('TODO: implement work_on_queue')
|
# Put lists of tweet IDs as we create them.
|
||||||
|
# Put None to queue to indicate end.
|
||||||
|
async def produce_ids_from_talents(queue: asyncio.Queue, finished_users):
    """Put each unfinished talent's tweet IDs on *queue* for the consumer.

    One list of tweet IDs is put per talent in ``talents``; talents whose
    ID is in *finished_users* are skipped. A single ``None`` is always put
    last to mark the end of the stream.

    Args:
        queue: Queue shared with the consumer coroutine.
        finished_users: Container of talent IDs that need no further work.
    """
    def debug(message):
        print(f'[prd] {message}')

    try:
        for talent_id in talents:
            if talent_id in finished_users:
                debug(f'@{util.get_username(talent_id)} already done, skipping...')
                continue

            # NOTE(review): get_user_tweet_ids is a plain (blocking) function,
            # so nothing else runs on the event loop while it scrapes.
            tweet_ids = get_user_tweet_ids(talent_id)
            debug(f'adding {util.get_username(talent_id)}\'s tweets to queue')
            await queue.put(tweet_ids)

            # put() on a queue that is not full returns without suspending,
            # so explicitly hand control to the consumer between talents.
            await asyncio.sleep(0)
    finally:
        # Send the end marker even if scraping raised, so a consumer that is
        # waiting on queue.get() is not left waiting forever.
        await queue.put(None)
|
||||||
|
|
||||||
|
async def consume_ids_into_ttweets(queue: asyncio.Queue, queue_file: str):
    """Turn queued tweet-ID lists into cross-company TalentTweets.

    Reads lists of tweet IDs from *queue* until ``None`` is received. Every
    ID is resolved through ``tt.TalentTweet.create_from_id`` and kept when
    ``is_cross_company()`` is true. *queue_file* is opened for writing
    (truncating it) and receives one status line per list (``0`` = success,
    ``1`` = error/incomplete), a blank line, and then one serialized tweet
    per line in ascending key order.

    Processing stops at the first list that raises; whatever was collected
    up to that point is still written and returned.

    Args:
        queue: Queue fed by the producer coroutine.
        queue_file: Path of the file to write.

    Returns:
        dict mapping tweet ID to TalentTweet, sorted by tweet ID.
    """
    def debug(message):
        print(f'[con] {message}')

    # NOTE(review): the status lines written here hold a single token, while
    # the reader in get_cross_talent_tweets expects "# {user_id} {status_num}"
    # -- confirm which format is intended.
    ttweets_dict = dict()
    with open(queue_file, 'w') as f:
        while True:
            tweet_ids = await queue.get()
            if tweet_ids is None:
                break

            # Pre-bind so the error message below cannot itself fail when the
            # exception is raised before the first iteration.
            tweet_id = None
            try:
                for tweet_id in tweet_ids:
                    ttweet = await tt.TalentTweet.create_from_id(id=tweet_id)
                    if ttweet.is_cross_company():
                        # Key by the tweet's own ID (the original used the
                        # literal string 'tweet_id', keeping only one tweet).
                        ttweets_dict[ttweet.tweet_id] = ttweet
            except Exception:
                # Exception, not a bare except: task cancellation and
                # KeyboardInterrupt must still propagate.
                debug(traceback.format_exc())
                debug(f'Error retrieving Tweet #{tweet_id} from api!')
                f.write('1\n')  # 1 = error/incomplete
                break
            else:
                f.write('0\n')  # 0 = success

        f.write('\n')
        ttweets_dict = dict(sorted(ttweets_dict.items()))
        for ttweet in ttweets_dict.values():
            f.write(f'{ttweet.serialize()}\n')
    return ttweets_dict
|
||||||
|
|
||||||
|
# If queue.txt doesn't exist, creates and populates it.
# Returns a dict of sorted and filtered TalentTweets keyed by tweet ID.
async def get_cross_talent_tweets(queue_file):
    """Collect cross-company TalentTweets, resuming from *queue_file* if present.

    The existing file, when found, is scanned first: leading lines of the
    form ``# {user_id} {status_num}`` mark a user as finished when the status
    is ``0``; the remaining non-blank, non-``#`` lines are deserialized as
    TalentTweets. A producer/consumer pair then fetches tweets for every
    talent not marked finished.

    Args:
        queue_file: Path to queue.txt.

    Returns:
        The dict produced by ``consume_ids_into_ttweets``.
    """
    finished_users = set()
    ttweets_dict = dict()

    # Populate structures with existing data from queue.txt
    try:
        print('Processing existing data in queue.txt...')
        with open(queue_file, 'r') as f:
            in_accounts = True
            for line in f:
                tokens = line.split()

                # Account section. LINE FORMAT: "# {user_id} {status_num}"
                if in_accounts:
                    if len(tokens) == 3 and tokens[0][0] == '#':
                        # Tokens are strings, so compare against '0' (not 0)
                        # and store the ID as int to match the int keys of
                        # `talents` that the producer checks against.
                        if tokens[2] == '0' and tokens[1].isdigit():
                            finished_users.add(int(tokens[1]))
                        continue
                    # Reached end of accounts list. Fall through so the line
                    # that ended it is still considered as a tweet line.
                    in_accounts = False

                # Serialized TalentTweet section
                if len(tokens) == 0 or tokens[0][0] == '#':
                    continue
                ttweet = tt.TalentTweet.deserialize(line)
                ttweets_dict[ttweet.tweet_id] = ttweet
    except FileNotFoundError:
        print('Couldn\'t find queue.txt.')

    async_queue = asyncio.Queue()
    consumer = asyncio.create_task(consume_ids_into_ttweets(queue=async_queue, queue_file=queue_file))
    await produce_ids_from_talents(queue=async_queue, finished_users=finished_users)

    # NOTE(review): the tweets loaded from the file above are replaced here
    # rather than merged, and the consumer reopens queue_file in 'w' mode --
    # confirm whether previously saved tweets are meant to be kept.
    ttweets_dict = await consumer
    return ttweets_dict
|
||||||
|
|
||||||
|
def process_queue(file):
|
||||||
|
print('TODO: implement process_queue')
|
||||||
# while Queue.txt has lines present
|
# while Queue.txt has lines present
|
||||||
# attempt to deserialize first line of Queue.txt
|
# attempt to deserialize first line of Queue.txt
|
||||||
# exit program if failed, stating error
|
# exit program if failed, stating error
|
||||||
@@ -52,41 +138,17 @@ def work_on_queue(file):
|
|||||||
# we're done! post tweet announcing done with archives
|
# we're done! post tweet announcing done with archives
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# If queue.txt doesn't exist, creates and populates it.
|
|
||||||
# Returns a list of sorted and filtered TalentTweets (should
|
|
||||||
# be equivalent to queue.txt)
|
|
||||||
def create_ttweets_queue(path) -> list:
|
|
||||||
print('Creating ttweets queue')
|
|
||||||
if not os.path.exists(path):
|
|
||||||
ttweets = list()
|
|
||||||
with open(path, 'x') as f:
|
|
||||||
for talent_id in talents.keys():
|
|
||||||
tweet_ids = get_user_tweet_ids(talent_id)
|
|
||||||
print(f'retrieved {len(tweet_ids)} tweets')
|
|
||||||
for tweet_id in tweet_ids:
|
|
||||||
ttweet = tt.TalentAPITweet(tweet_id)
|
|
||||||
if ttweet.is_cross_company():
|
|
||||||
ttweets.append(ttweet)
|
|
||||||
|
|
||||||
ttweets.sort(key=lambda ttweet: ttweet.tweet_id)
|
|
||||||
for ttweet in ttweets:
|
|
||||||
f.write(f'{ttweet.serialize()}\n')
|
|
||||||
return ttweets
|
|
||||||
else:
|
|
||||||
return list()
|
|
||||||
|
|
||||||
|
|
||||||
async def run():
|
async def run():
|
||||||
# if Queue.txt exists
|
# if Queue.txt exists
|
||||||
# work through the tweets in Queue.txt
|
# work through the tweets in Queue.txt
|
||||||
# else
|
# else
|
||||||
# look through every talent's tweets, saving only cross-company tweets into a list
|
# look through every talent's tweets, saving only cross-company tweets into a list
|
||||||
# sort the list by tweet_id
|
# sort the list by tweet_id
|
||||||
# create Queue.txt and save all tweets through there
|
# create Queue.txt and save all tweets (serialized) there
|
||||||
# post a tweet announcing archival intent
|
# post a tweet announcing archival intent
|
||||||
# work through the tweets in Queue.txt
|
# work through the tweets in Queue.txt
|
||||||
|
|
||||||
queue_path = get_queue_file()
|
queue_path = get_queue_file()
|
||||||
if os.path.exists(queue_path):
|
ttweet_dict = await get_cross_talent_tweets(queue_path)
|
||||||
work_on_queue(queue_path)
|
for ttweet in ttweet_dict.values():
|
||||||
else:
|
print(ttweet)
|
||||||
ttweets = create_ttweets_queue(queue_path)
|
|
||||||
+1
-1
@@ -15,7 +15,7 @@ def __create_dict(file, _dict):
|
|||||||
if len(words) == 2 and line[0] != '#':
|
if len(words) == 2 and line[0] != '#':
|
||||||
name, id = line.split()
|
name, id = line.split()
|
||||||
talents[int(id)] = name
|
talents[int(id)] = name
|
||||||
name = util.get_username_online(id) # attempt to get updated name
|
# name = util.get_username_online(id) # attempt to get updated name
|
||||||
talents[int(id)] = name
|
talents[int(id)] = name
|
||||||
_dict[int(id)] = name
|
_dict[int(id)] = name
|
||||||
def init():
|
def init():
|
||||||
|
|||||||
+14
-38
@@ -40,7 +40,19 @@ class TalentTweet:
|
|||||||
tweet_id=tweet_id, author_id=author_id,
|
tweet_id=tweet_id, author_id=author_id,
|
||||||
date_time=date_time, mrq=(mentions, reply_to, quote_retweeted)
|
date_time=date_time, mrq=(mentions, reply_to, quote_retweeted)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@classmethod
async def create_from_id(cls, id):
    """Build an instance from a tweet ID by querying the Twitter API.

    Declared as a classmethod so that subclasses get instances of their own
    type; calling it on the class (``TalentTweet.create_from_id(id=...)``)
    works exactly as before.

    Args:
        id: ID of the tweet to fetch. The name shadows the builtin but is
            kept because callers pass it by keyword.

    Returns:
        A new instance populated from the API response.

    Raises:
        ValueError: if the API response carries no tweet data.
    """
    resp = await TwAPI.instance.get_tweet_response(id)
    tweet = resp.data
    if tweet is None:
        # Fail with a clear message instead of an attribute error on None
        # (presumably a deleted or inaccessible tweet -- TODO confirm).
        raise ValueError(f'no tweet data returned for tweet id {id}')
    mrq = TwAPI.get_mrq(tweet, resp)

    return cls(
        tweet_id=tweet.id,
        author_id=tweet.author_id,
        date_time=tweet.created_at,
        mrq=mrq,
    )
|
||||||
|
|
||||||
def __init__(self, tweet_id: int, author_id: int,date_time: datetime, mrq: tuple):
|
def __init__(self, tweet_id: int, author_id: int,date_time: datetime, mrq: tuple):
|
||||||
self.tweet_id, self.author_id = tweet_id, author_id
|
self.tweet_id, self.author_id = tweet_id, author_id
|
||||||
@@ -113,40 +125,4 @@ class TalentTweet:
|
|||||||
|
|
||||||
def get_datetime_str(self):
|
def get_datetime_str(self):
|
||||||
unpad = '#' if platform.system() == 'Windows' else '-'
|
unpad = '#' if platform.system() == 'Windows' else '-'
|
||||||
return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
|
return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
|
||||||
|
|
||||||
|
|
||||||
class TalentAPITweet(TalentTweet):
|
|
||||||
def __init__(self, tweet_id=None, tweet=None, mrq: tuple=None):
|
|
||||||
if tweet and mrq:
|
|
||||||
self.tweet = tweet
|
|
||||||
elif tweet_id:
|
|
||||||
tweet_id = int(tweet_id)
|
|
||||||
resp = TwAPI.instance.get_tweet_response(tweet_id)
|
|
||||||
self.tweet = resp.data
|
|
||||||
mrq = TwAPI.get_mrq(self.tweet, resp)
|
|
||||||
else:
|
|
||||||
raise ValueError('did not supply sufficient tweet information')
|
|
||||||
|
|
||||||
TalentTweet.__init__(
|
|
||||||
self,
|
|
||||||
tweet_id=self.tweet.id,
|
|
||||||
author_id=self.tweet.author_id,
|
|
||||||
date_time=self.tweet.created_at,
|
|
||||||
mrq=mrq
|
|
||||||
)
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return (
|
|
||||||
f'{self.tweet_id} from {util.get_username(self.author_id)}:\n'
|
|
||||||
f'{self.tweet.text}\n'
|
|
||||||
f'------------------------------------------------------\n'
|
|
||||||
f'{self.get_datetime_str()}\n'
|
|
||||||
f'{self.get_all_parties_usernames()}\n'
|
|
||||||
f'mentions: {self.mentions}\n'
|
|
||||||
f'reply_to: {self.reply_to}\n'
|
|
||||||
f'quote_retweeted: {self.quote_retweeted}\n'
|
|
||||||
f'{self.serialize()}\n'
|
|
||||||
f'Cross-company: {self.is_cross_company()}\n'
|
|
||||||
f'======================================================'
|
|
||||||
)
|
|
||||||
+13
-7
@@ -1,5 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from math import inf
|
from math import inf
|
||||||
|
from time import time
|
||||||
|
|
||||||
import tweepy
|
import tweepy
|
||||||
from tweetcapture import TweetCapture
|
from tweetcapture import TweetCapture
|
||||||
@@ -61,13 +62,18 @@ class TwAPI:
|
|||||||
access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
|
access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_tweet_response(self, id):
|
async def get_tweet_response(self, id, attempt = 0):
|
||||||
return TwAPI.instance.client.get_tweet(
|
try:
|
||||||
id,
|
return TwAPI.instance.client.get_tweet(
|
||||||
media_fields=TwAPI.TWEET_MEDIA_FIELDS,
|
id,
|
||||||
tweet_fields=TwAPI.TWEET_FIELDS,
|
media_fields=TwAPI.TWEET_MEDIA_FIELDS,
|
||||||
expansions=TwAPI.TWEET_EXPANSIONS
|
tweet_fields=TwAPI.TWEET_FIELDS,
|
||||||
)
|
expansions=TwAPI.TWEET_EXPANSIONS
|
||||||
|
)
|
||||||
|
except tweepy.TooManyRequests:
|
||||||
|
print(f'[{attempt}]get_tweet_response({id}):\n\ttoo many API requests -- trying again in 1 minute...')
|
||||||
|
await asyncio.sleep(60)
|
||||||
|
return await self.get_tweet_response(id, attempt=attempt+1)
|
||||||
|
|
||||||
# Create a post that showcases given tweet and its mentions set.
|
# Create a post that showcases given tweet and its mentions set.
|
||||||
# Try to do this without retrieving Tweet data.
|
# Try to do this without retrieving Tweet data.
|
||||||
|
|||||||
+1
-1
@@ -51,9 +51,9 @@ def get_username_online(user_id):
|
|||||||
c.Store_object = True
|
c.Store_object = True
|
||||||
c.Hide_output = True
|
c.Hide_output = True
|
||||||
try:
|
try:
|
||||||
|
twint.output.users_list.clear()
|
||||||
twint.run.Lookup(c)
|
twint.run.Lookup(c)
|
||||||
user = twint.output.users_list[0]
|
user = twint.output.users_list[0]
|
||||||
twint.output.users_list.clear()
|
|
||||||
return user.username
|
return user.username
|
||||||
except:
|
except:
|
||||||
return f'#{user_id}'
|
return f'#{user_id}'
|
||||||
Reference in New Issue
Block a user