added twint (scraper), restructuring

This commit is contained in:
msk
2022-09-24 17:56:58 -07:00
parent 5b458f2e1f
commit 579929559f
8 changed files with 349 additions and 296 deletions
+3 -2
View File
@@ -1,2 +1,3 @@
tweepy tweepy
tweet-capture tweet-capture
git+https://github.com/muskit/twint_2022_fix.git
+96 -108
View File
@@ -1,109 +1,97 @@
from lib2to3.pgen2 import token from lib2to3.pgen2 import token
from math import inf from math import inf
from urllib import response from urllib import response
import tweepy import tweepy
import secrets import api_secrets
import util import talenttweet as tt
import util
class TwAPI:
instance = None class TwAPI:
TWEET_MEDIA_FIELDS = ['url'] instance = None
TWEET_FIELDS = ['created_at', 'in_reply_to_user_id'] TWEET_MEDIA_FIELDS = ['url']
TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id'] TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
def __init__(self):
TwAPI.instance = self # Returns a set of involved parties for a single tweet.
self.client = tweepy.Client( #
bearer_token=secrets.bearer_token(), # Tweet must have been queried with these parameters:
consumer_key=secrets.api_key(), consumer_secret=secrets.api_secret(), # media_fields=['url'],
access_token=secrets.access_token(), access_token_secret=secrets.access_secret() # tweet_fields=['created_at', 'in_reply_to_user_id'],
) # expansions=['entities.mentions.username', 'referenced_tweets.id.author_id']
@staticmethod
# Returns a set of involved parties for a single tweet. def get_involved_parties(tweet, response):
# involved_parties = set()
# Tweet must have been queried with these parameters: # mentions
# media_fields=['url'], try:
# tweet_fields=['created_at', 'in_reply_to_user_id'], mention_list = tweet.entities['mentions']
# expansions=['entities.mentions.username', 'referenced_tweets.id.author_id'] for mention in mention_list:
@staticmethod involved_parties.add(int(mention['id']))
def get_involved_parties(tweet, response): except: pass
involved_parties = set() # reply-to
# mentions if tweet.in_reply_to_user_id != None:
try: involved_parties.add(tweet.in_reply_to_user_id)
mention_list = tweet.entities['mentions'] # qrt
for mention in mention_list: if tweet.attachments:
involved_parties.add(int(mention['id'])) for ref_tweet in tweet.attachments:
except: pass if ref_tweet.type == 'quoted':
# reply-to for incl_tweet in response.includes['tweets']:
if tweet.in_reply_to_user_id != None: if incl_tweet.id == ref_tweet.id:
involved_parties.add(tweet.in_reply_to_user_id) involved_parties.add(incl_tweet.author_id)
# qrt
if tweet.attachments: return involved_parties
for ref_tweet in tweet.attachments:
if ref_tweet.type == 'quoted': def __init__(self):
for incl_tweet in response.includes['tweets']: TwAPI.instance = self
if incl_tweet.id == ref_tweet.id: self.client = tweepy.Client(
involved_parties.add(incl_tweet.author_id) bearer_token=api_secrets.bearer_token(),
consumer_key=api_secrets.api_key(), consumer_secret=api_secrets.api_secret(),
return involved_parties access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
)
# Returns a tweet and mention-set pair, given a tweet ID.
def get_tweet_mentions(self, id): # Returns a list of TalentTweets from a user.
resp = self.client.get_tweet(id, def get_users_all_tweets_mentions(self, id: int, count=inf):
media_fields=TwAPI.TWEET_MEDIA_FIELDS, ttweets = list()
tweet_fields=TwAPI.TWEET_FIELDS,
expansions=TwAPI.TWEET_EXPANSIONS) retrieve_size = util.clamp(count, 5, 100)
next_page_token = None
tweet = resp.data tokens_retrieved = 0
mentions = TwAPI.get_involved_parties(tweet, resp) tweets_retrieved = 0
return (tweet, mentions)
while tweets_retrieved < count:
# Returns a list (tweet, {mentions}) from a user. print(f'Retrieved {tokens_retrieved} tokens so far...')
# mentions- a set comprised of any other parties involved resp = self.client.get_users_tweets(id, max_results=retrieve_size, pagination_token=next_page_token,
# in this tweet (reply, mention, qrt) media_fields=TwAPI.TWEET_MEDIA_FIELDS,
def get_users_all_tweets_mentions(self, id: int, count=inf): tweet_fields=TwAPI.TWEET_FIELDS,
pairs = list() expansions=TwAPI.TWEET_EXPANSIONS)
retrieve_size = util.clamp(count, 5, 100) for tweet in resp.data:
next_page_token = None mentions = TwAPI.get_involved_parties(tweet, resp)
tokens_retrieved = 0 ttweets.append(tt.TalentTweet(tweet=tweet, other_parties=mentions))
tweets_retrieved = 0
# update counters and pagination token
while tweets_retrieved < count: tweets_retrieved += resp.meta['result_count']
print(f'Retrieved {tokens_retrieved} tokens so far...') if tweets_retrieved < count:
resp = self.client.get_users_tweets(id, max_results=retrieve_size, pagination_token=next_page_token, try:
media_fields=TwAPI.TWEET_MEDIA_FIELDS, next_page_token = resp.meta['next_token']
tweet_fields=TwAPI.TWEET_FIELDS, tokens_retrieved += 1
expansions=TwAPI.TWEET_EXPANSIONS) except KeyError:
print("next_token wasn't provided; we've reached the end!")
for tweet in resp.data: break # reached end of user's tweets
mentions = TwAPI.get_involved_parties(tweet, resp)
pairs.append((tweet, mentions)) print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
return ttweets
# update counters and pagination token
tweets_retrieved += resp.meta['result_count'] # Returns a list of cross-company TalentTweets from a user.
if tweets_retrieved < count: def get_users_cross_tweets_mentions(self, id):
try: ret = list()
next_page_token = resp.meta['next_token'] ttweets = self.get_users_all_tweets_mentions(id)
tokens_retrieved += 1 for ttweet in ttweets:
except KeyError: if ttweet.is_cross_company():
print("next_token wasn't provided; we've reached the end!") ret.append(ttweet)
break # reached end of user's tweets
return ret
print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
return pairs # Create a post that showcases given tweet and its mentions set.
def create_post(self, tweet, mentions):
# returns a filtered list (tweet, [mentions]) from a user
def get_users_cross_tweets_mentions(self, id):
    """Return only the cross-company (tweet, mentions) pairs of a user.

    Fetches every pair for the user via get_users_all_tweets_mentions and
    keeps those for which util.is_cross_company reports True.
    """
    return [
        entry
        for entry in self.get_users_all_tweets_mentions(id)
        if util.is_cross_company(entry)
    ]
# Create a post that showcases given tweet and its mentions set.
def create_post(self, tweet, mentions):
pass pass
+41 -41
View File
@@ -1,42 +1,42 @@
## Twitter developer credentials management. ## Twitter developer credentials management.
import os import os
import configparser import configparser
from util import * import util
# returns dictionary of the Credentials section. # returns dictionary of the Credentials section.
# [NOT TO BE USED OUTSIDE OF THIS FILE.] # [NOT TO BE USED OUTSIDE OF THIS FILE.]
def __get_ini_credentials(): def __get_ini_credentials():
c = configparser.RawConfigParser() c = configparser.RawConfigParser()
if len(c.read(os.path.join(get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'): if len(c.read(os.path.join(util.get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
return c['Credentials'] return c['Credentials']
return None return None
# returns the consumer api_key stored in secrets.ini # returns the consumer api_key stored in secrets.ini
def api_key(): def api_key():
c = __get_ini_credentials() c = __get_ini_credentials()
return c.get(option='api_key', fallback='xxx') if c is not None else 'xxx' return c.get(option='api_key', fallback='xxx') if c is not None else 'xxx'
# returns the consumer api_secret stored in secrets.ini # returns the consumer api_secret stored in secrets.ini
def api_secret(): def api_secret():
c = __get_ini_credentials() c = __get_ini_credentials()
return c.get(option='api_secret', fallback='yyy') if c is not None else 'yyy' return c.get(option='api_secret', fallback='yyy') if c is not None else 'yyy'
# returns the bearer_token stored in secrets.ini # returns the bearer_token stored in secrets.ini
def bearer_token(): def bearer_token():
c = __get_ini_credentials() c = __get_ini_credentials()
return c.get(option='bearer_token', fallback='zzz') if c is not None else 'zzz' return c.get(option='bearer_token', fallback='zzz') if c is not None else 'zzz'
# returns the access_token stored in secrets.ini # returns the access_token stored in secrets.ini
def access_token(): def access_token():
c = __get_ini_credentials() c = __get_ini_credentials()
return c.get(option='oauth1_access_token', fallback='zzz') if c is not None else 'aaa' return c.get(option='oauth1_access_token', fallback='zzz') if c is not None else 'aaa'
# returns the access_secret stored in secrets.ini # returns the access_secret stored in secrets.ini
def access_secret(): def access_secret():
c = __get_ini_credentials() c = __get_ini_credentials()
return c.get(option='oauth1_access_secret', fallback='zzz') if c is not None else 'bbb' return c.get(option='oauth1_access_secret', fallback='zzz') if c is not None else 'bbb'
def get_all_secrets(): def get_all_secrets():
return f'api_key:{api_key()}\napi_secret:{api_secret()}\nbearer_token:{bearer_token()}\naccess_token:{access_token()}\naccess_secret:{access_secret()}' return f'api_key:{api_key()}\napi_secret:{api_secret()}\nbearer_token:{bearer_token()}\naccess_token:{access_token()}\naccess_secret:{access_secret()}'
+39 -21
View File
@@ -1,21 +1,39 @@
## The bot's catch-up mode ## The bot's catch-up mode
# Scan all accounts for cross-company interactions. # Scan all accounts for cross-company interactions.
# Terminates when finished scanning and posting. # Terminates when finished scanning and posting.
# #
# We should post, at the fastest, one tweet per minute. # We should post, at the fastest, one tweet per minute.
import os import os
from util import * import twint
from api import TwAPI
from util import *
## Returns list of tweets present in queue.txt from talent_lists import *
def get_local_queue(): from api import TwAPI
# f = open(os.path.join(get_project_dir(), 'queue.txt')) import talenttweet as tt
pass
## Returns list of tweets present in queue.txt
def run(): def get_local_queue():
queue = get_local_queue() # f = open(os.path.join(get_project_dir(), 'queue.txt'))
pairs = TwAPI.instance.get_users_all_tweets_mentions(1390620618001838086, count=5) pass
for (tweet, mentions) in pairs:
print_tweet(tweet, mentions) ## Returns the ID of all tweets (up to limit) from a user ID.
def get_user_tweet_ids(id, limit=None):
    """Collect tweet IDs for a user ID by running a twint search.

    Args:
        id: user ID assigned to the twint config's ``User_id``.
        limit: value assigned to the twint config's ``Limit``
            (``None`` by default).

    Returns:
        A list with the ``id`` attribute of every tweet object twint stored.
    """
    collected = []

    config = twint.Config()
    config.User_id = id
    config.Limit = limit
    # Have twint append its results to our own list instead of printing only.
    config.Store_object = True
    config.Store_object_tweets_list = collected

    twint.run.Search(config)
    return [found.id for found in collected]
def run():
    """Entry point of catch-up mode.

    Loads the local queue (result currently unused), scrapes up to 20 tweet
    IDs of one hard-coded account, then builds and prints a TalentTweet for
    each ID.
    """
    queue = get_local_queue()
    for tweet_id in get_user_tweet_ids(1390620618001838086, limit=20):
        print(tt.TalentTweet(tweet_id))
+63 -64
View File
@@ -1,64 +1,63 @@
import sys import sys
import argparse import argparse
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
import talent_lists import talent_lists
import secrets import api_secrets
import catchup import catchup
import listen import listen
from api import TwAPI
from api import TwAPI
from util import is_cross_company, print_tweet MODES_HELP_STR = '''mode to run the bot at:
l,listen: listen for new tweets from all accounts; will not terminate unless error occurs
MODES_HELP_STR = '''mode to run the bot at: c,catchup: scan all tweets from all accounts; will terminate when done'''
l,listen: listen for new tweets from all accounts; will not terminate unless error occurs
c,catchup: scan all tweets from all accounts; will terminate when done''' def init_argparse():
p = argparse.ArgumentParser(description='Twitter bot that follows interactions between Nijisanji EN/ID and hololive EN/ID members.', formatter_class=RawTextHelpFormatter)
def init_argparse(): p.add_argument('mode', nargs='?', \
p = argparse.ArgumentParser(description='Twitter bot that follows interactions between Nijisanji EN/ID and hololive EN/ID members.', formatter_class=RawTextHelpFormatter) help=MODES_HELP_STR)
p.add_argument('mode', nargs='?', \ p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
help=MODES_HELP_STR) return p
p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
return p # TODO: implement command line mode for manually controlling the bot
def command_line():
def main(): pass
parser = init_argparse()
if len(sys.argv) < 2: def main():
parser.print_help() parser = init_argparse()
return if len(sys.argv) < 2:
parser.print_help()
args = parser.parse_args() return
if args.show_tokens: args = parser.parse_args()
print(secrets.get_all_secrets())
if args.show_tokens:
if args.mode is None: return print(api_secrets.get_all_secrets())
## We expect to run in some mode now. if args.mode is None: return
# Initialize shared API instance ## We expect to run in some mode now.
twApi = TwAPI.instance = TwAPI()
# Initialize shared API instance
# Initialize talent account lists twApi = TwAPI.instance = TwAPI()
talent_lists.init()
# Initialize talent account lists
## TEST CODE ## talent_lists.init()
cross_pairs = twApi.get_users_cross_tweets_mentions(1390620618001838086)
for pair in cross_pairs: ## Determine running mode
print_tweet(pair) match args.mode.lower():
case 'l' | 'listen':
## Determine running mode print('RUNNING IN LISTEN MODE\n')
match args.mode.lower(): listen.run()
case 'l' | 'listen': case 'c' | 'catchup':
print('RUNNING IN LISTEN MODE\n') print('RUNNING IN CATCH-UP MODE\n')
listen.run() catchup.run()
case 'c' | 'catchup': case _:
print('RUNNING IN CATCH-UP MODE\n') command_line()
catchup.run() #TODO: remove message
case _: print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.') return
return
if __name__ == "__main__":
if __name__ == "__main__": main()
main()
+23 -21
View File
@@ -1,21 +1,23 @@
import util import util
niji_en = dict() niji_en = dict()
holo_en = dict() holo_en = dict()
talents = dict()
def __create_dict(file, _dict):
with open(file, 'r') as f: def __create_dict(file, _dict):
for line in f: global talents
words = line.split() with open(file, 'r') as f:
if len(words) == 2 and line[0] != '#': for line in f:
name, id = line.split() words = line.split()
_dict[int(id)] = name if len(words) == 2 and line[0] != '#':
name, id = line.split()
def init(): _dict[int(id)] = name
global niji_en talents[int(id)] = name
global holo_en def init():
global niji_en
# holoEN global holo_en
__create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en)
# nijiEN # holoEN
__create_dict(f'{util.get_project_dir()}/lists/nijien.txt', niji_en) __create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en)
# nijiEN
__create_dict(f'{util.get_project_dir()}/lists/nijien.txt', niji_en)
+69
View File
@@ -0,0 +1,69 @@
import platform
import tweepy
from api import *
import talent_lists
class TalentTweet:
    """A tweet together with the IDs of the other accounts it involves.

    Instances can be built from an already-fetched tweet object plus its set
    of involved parties, or from a bare tweet ID, in which case the tweet is
    fetched through the shared ``TwAPI`` instance.
    """

    def __init__(self, tweet: "tweepy.Tweet | int | str | None" = None,
                 other_parties: "set | None" = None, *, tweet_id=None):
        """Wrap a fetched tweet, or fetch one by ID.

        Python has no constructor overloading: when ``__init__`` is defined
        twice, the second definition replaces the first. This single
        constructor therefore supports every call form used in the project:

        * ``TalentTweet(tweet=tweet, other_parties=mentions)``
        * ``TalentTweet(some_id)`` or ``TalentTweet(tweet_id=some_id)``

        Args:
            tweet: a tweet object, or (for convenience) a tweet ID given as
                ``int``/``str``.
            other_parties: set of user IDs involved in the tweet. Ignored
                when the tweet is fetched by ID, because it is then
                recomputed from the API response.
            tweet_id: ID of the tweet to fetch (keyword only).

        Raises:
            TypeError: if neither a tweet nor an ID is given, or both are.
        """
        # A plain int/str in the first position is an ID, not a tweet object.
        if tweet_id is None and isinstance(tweet, (int, str)):
            tweet_id, tweet = tweet, None

        if tweet_id is not None:
            if tweet is not None:
                raise TypeError('pass either a tweet object or a tweet ID, not both')
            resp = TwAPI.instance.client.get_tweet(
                tweet_id,
                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
                tweet_fields=TwAPI.TWEET_FIELDS,
                expansions=TwAPI.TWEET_EXPANSIONS,
            )
            tweet = resp.data
            other_parties = TwAPI.get_involved_parties(tweet, resp)
        elif tweet is None:
            raise TypeError('TalentTweet requires a tweet object or a tweet ID')

        # The wrapped tweet object.
        self.tweet = tweet
        # User IDs of everyone else involved in the tweet.
        self.other_parties = set() if other_parties is None else other_parties

    def __repr__(self) -> str:
        """Return a multi-line, human-readable summary of the tweet."""
        return (
            f'{self.tweet.id} from {talent_lists.talents.get(self.tweet.author_id, "???")}:\n'
            f'{self.tweet.text}\n'
            f'------------------------------------------------------\n'
            f'{self.get_datetime_str()}\n'
            f'{self.get_mentions_usernames()}\n'
            f'Cross-company: {self.is_cross_company()}\n'
            f'======================================================'
        )

    def is_cross_company(self):
        """Return True if the author and any involved party are in opposite companies.

        The author must be listed in ``talent_lists.niji_en`` or
        ``talent_lists.holo_en``; the tweet counts as cross-company when at
        least one involved party is listed in the other one.
        """
        # NOTE(review): author_id may be unset unless the tweet was requested
        # with the author_id field -- confirm against TwAPI.TWEET_FIELDS.
        author_id = self.tweet.author_id
        # TODO: update for EN/ID
        if author_id in talent_lists.niji_en:
            other_company = talent_lists.holo_en
        elif author_id in talent_lists.holo_en:
            other_company = talent_lists.niji_en
        else:
            return False
        return any(party_id in other_company for party_id in self.other_parties)

    def get_mentions_usernames(self):
        """Return the involved parties' names, comma-separated, or 'none'.

        IDs missing from ``talent_lists.talents`` are shown as ``???``.
        """
        if not self.other_parties:
            return 'none'
        return ', '.join(
            f'{talent_lists.talents.get(party_id, "???")}'
            for party_id in self.other_parties
        )

    def get_datetime_str(self):
        """Format the tweet's creation time, e.g. ``Sep 4 2022, 5:56PM (UTC)``."""
        # The strftime flag that strips zero padding differs per platform:
        # '#' on Windows, '-' elsewhere.
        unpad = '#' if platform.system() == 'Windows' else '-'
        return self.tweet.created_at.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
class TalentTweets:
    """Holder for a list stored in ``ttweets``.

    Presumably meant to collect TalentTweet objects -- TODO confirm; the
    accessor methods are still unimplemented stubs.
    """

    def __init__(self):
        # Starts out empty; nothing in this class fills it yet.
        self.ttweets = []

    def get_ttweets(self):
        """Stub: not implemented yet, returns None."""
        return None

    def get_ttweet_ids(self):
        """Stub: not implemented yet, returns None."""
        return None
+15 -39
View File
@@ -1,40 +1,16 @@
## Shared utility functions. ## Shared utility functions.
import os import os
import talent_lists import talent_lists
import talenttweet as tt
# returns system path to this project, which is
# up one level from this file's directory (src). # returns system path to this project, which is
def get_project_dir(): # up one level from this file's directory (src).
return os.path.join(os.path.dirname(__file__), os.pardir) def get_project_dir():
return os.path.join(os.path.dirname(__file__), os.pardir)
# determine if tweet involves cross-company interaction
def is_cross_company(pair: tuple): def tweet_id_to_url(id):
author_id, mentions = pair[0].author_id, pair[1] return f'https://twitter.com/twitter/status/{id}'
for mention_id in mentions: def clamp(n, smallest, largest):
if author_id in talent_lists.niji_en:
if mention_id in talent_lists.holo_en:
return True
elif author_id in talent_lists.holo_en:
if mention_id in talent_lists.niji_en:
return True
return False
def tweet_id_to_url(id):
    """Build the twitter.com status URL for the given tweet ID."""
    url = f'https://twitter.com/twitter/status/{id}'
    return url
def print_tweet(pair: tuple):
    """Print a debug summary of a (tweet, mentions) pair to stdout.

    The output shows the tweet's id, creation time and mentions, then its
    text, its entities and its referenced tweets, closed by a separator line.
    """
    tweet, mentions = pair
    sections = [
        f'{tweet.id}: {tweet.created_at}: involves {mentions}\n',
        f'{tweet.text}\n',
        f'-----\n',
        f'{tweet.entities}\n',
        f'{tweet.referenced_tweets}\n',
        f'=================================================',
    ]
    print(''.join(sections))
def clamp(n, smallest, largest):
return max(smallest, min(n, largest)) return max(smallest, min(n, largest))