added twint (scraper), restructuring

This commit is contained in:
msk
2022-09-24 17:56:58 -07:00
parent 5b458f2e1f
commit 579929559f
8 changed files with 349 additions and 296 deletions
+1
View File
@@ -1,2 +1,3 @@
tweepy tweepy
tweet-capture tweet-capture
git+https://github.com/muskit/twint_2022_fix.git
+18 -30
View File
@@ -3,7 +3,8 @@ from math import inf
from urllib import response from urllib import response
import tweepy import tweepy
import secrets import api_secrets
import talenttweet as tt
import util import util
class TwAPI: class TwAPI:
@@ -12,14 +13,6 @@ class TwAPI:
TWEET_FIELDS = ['created_at', 'in_reply_to_user_id'] TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id'] TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
def __init__(self):
TwAPI.instance = self
self.client = tweepy.Client(
bearer_token=secrets.bearer_token(),
consumer_key=secrets.api_key(), consumer_secret=secrets.api_secret(),
access_token=secrets.access_token(), access_token_secret=secrets.access_secret()
)
# Returns a set of involved parties for a single tweet. # Returns a set of involved parties for a single tweet.
# #
# Tweet must have been queried with these parameters: # Tweet must have been queried with these parameters:
@@ -48,22 +41,17 @@ class TwAPI:
return involved_parties return involved_parties
# Returns a tweet and mention-set pair, given a tweet ID. def __init__(self):
def get_tweet_mentions(self, id): TwAPI.instance = self
resp = self.client.get_tweet(id, self.client = tweepy.Client(
media_fields=TwAPI.TWEET_MEDIA_FIELDS, bearer_token=api_secrets.bearer_token(),
tweet_fields=TwAPI.TWEET_FIELDS, consumer_key=api_secrets.api_key(), consumer_secret=api_secrets.api_secret(),
expansions=TwAPI.TWEET_EXPANSIONS) access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
)
tweet = resp.data # Returns a list of TalentTweets from a user.
mentions = TwAPI.get_involved_parties(tweet, resp)
return (tweet, mentions)
# Returns a list (tweet, {mentions}) from a user.
# mentions- a set comprised of any other parties involved
# in this tweet (reply, mention, qrt)
def get_users_all_tweets_mentions(self, id: int, count=inf): def get_users_all_tweets_mentions(self, id: int, count=inf):
pairs = list() ttweets = list()
retrieve_size = util.clamp(count, 5, 100) retrieve_size = util.clamp(count, 5, 100)
next_page_token = None next_page_token = None
@@ -79,7 +67,7 @@ class TwAPI:
for tweet in resp.data: for tweet in resp.data:
mentions = TwAPI.get_involved_parties(tweet, resp) mentions = TwAPI.get_involved_parties(tweet, resp)
pairs.append((tweet, mentions)) ttweets.append(tt.TalentTweet(tweet=tweet, other_parties=mentions))
# update counters and pagination token # update counters and pagination token
tweets_retrieved += resp.meta['result_count'] tweets_retrieved += resp.meta['result_count']
@@ -92,15 +80,15 @@ class TwAPI:
break # reached end of user's tweets break # reached end of user's tweets
print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.') print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
return pairs return ttweets
# returns a filtered list (tweet, [mentions]) from a user # Returns a list of cross-company TalentTweets from a user.
def get_users_cross_tweets_mentions(self, id): def get_users_cross_tweets_mentions(self, id):
ret = list() ret = list()
pairs = self.get_users_all_tweets_mentions(id) ttweets = self.get_users_all_tweets_mentions(id)
for pair in pairs: for ttweet in ttweets:
if util.is_cross_company(pair): if ttweet.is_cross_company():
ret.append(pair) ret.append(ttweet)
return ret return ret
+2 -2
View File
@@ -3,13 +3,13 @@
import os import os
import configparser import configparser
from util import * import util
# returns dictionary of the Credentials section. # returns dictionary of the Credentials section.
# [NOT TO BE USED OUTSIDE OF THIS FILE.] # [NOT TO BE USED OUTSIDE OF THIS FILE.]
def __get_ini_credentials(): def __get_ini_credentials():
c = configparser.RawConfigParser() c = configparser.RawConfigParser()
if len(c.read(os.path.join(get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'): if len(c.read(os.path.join(util.get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
return c['Credentials'] return c['Credentials']
return None return None
+21 -3
View File
@@ -6,16 +6,34 @@
import os import os
import twint
from util import * from util import *
from talent_lists import *
from api import TwAPI from api import TwAPI
import talenttweet as tt
## Returns list of tweets present in queue.txt ## Returns list of tweets present in queue.txt
def get_local_queue(): def get_local_queue():
# f = open(os.path.join(get_project_dir(), 'queue.txt')) # f = open(os.path.join(get_project_dir(), 'queue.txt'))
pass pass
## Collects the IDs of a user's tweets (at most `limit`, when given) by
## running a twint search for the given user ID.
##
## The list handed to twint via Store_object_tweets_list is expected to be
## filled with the scraped tweet objects; only their `id` attributes are returned.
def get_user_tweet_ids(id, limit=None):
    scraped = []

    config = twint.Config()
    config.User_id = id
    config.Limit = limit
    config.Store_object = True
    config.Store_object_tweets_list = scraped

    twint.run.Search(config)

    tweet_ids = []
    for entry in scraped:
        tweet_ids.append(entry.id)
    return tweet_ids
def run(): def run():
queue = get_local_queue() queue = get_local_queue()
pairs = TwAPI.instance.get_users_all_tweets_mentions(1390620618001838086, count=5)
for (tweet, mentions) in pairs: tweets_ids = get_user_tweet_ids(1390620618001838086, limit=20)
print_tweet(tweet, mentions) for id in tweets_ids:
ttweet = tt.TalentTweet(id)
print(ttweet)
+8 -9
View File
@@ -3,12 +3,10 @@ import argparse
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
import talent_lists import talent_lists
import secrets import api_secrets
import catchup import catchup
import listen import listen
from api import TwAPI from api import TwAPI
from util import is_cross_company, print_tweet
MODES_HELP_STR = '''mode to run the bot at: MODES_HELP_STR = '''mode to run the bot at:
l,listen: listen for new tweets from all accounts; will not terminate unless error occurs l,listen: listen for new tweets from all accounts; will not terminate unless error occurs
@@ -21,6 +19,10 @@ def init_argparse():
p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini') p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
return p return p
# TODO: implement command line mode for manually controlling the bot
def command_line():
    """Placeholder for the manual-control command line mode; does nothing yet."""
    return None
def main(): def main():
parser = init_argparse() parser = init_argparse()
if len(sys.argv) < 2: if len(sys.argv) < 2:
@@ -30,7 +32,7 @@ def main():
args = parser.parse_args() args = parser.parse_args()
if args.show_tokens: if args.show_tokens:
print(secrets.get_all_secrets()) print(api_secrets.get_all_secrets())
if args.mode is None: return if args.mode is None: return
@@ -42,11 +44,6 @@ def main():
# Initialize talent account lists # Initialize talent account lists
talent_lists.init() talent_lists.init()
## TEST CODE ##
cross_pairs = twApi.get_users_cross_tweets_mentions(1390620618001838086)
for pair in cross_pairs:
print_tweet(pair)
## Determine running mode ## Determine running mode
match args.mode.lower(): match args.mode.lower():
case 'l' | 'listen': case 'l' | 'listen':
@@ -56,6 +53,8 @@ def main():
print('RUNNING IN CATCH-UP MODE\n') print('RUNNING IN CATCH-UP MODE\n')
catchup.run() catchup.run()
case _: case _:
command_line()
#TODO: remove message
print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.') print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
return return
+3 -1
View File
@@ -2,15 +2,17 @@ import util
niji_en = dict() niji_en = dict()
holo_en = dict() holo_en = dict()
talents = dict()
def __create_dict(file, _dict): def __create_dict(file, _dict):
global talents
with open(file, 'r') as f: with open(file, 'r') as f:
for line in f: for line in f:
words = line.split() words = line.split()
if len(words) == 2 and line[0] != '#': if len(words) == 2 and line[0] != '#':
name, id = line.split() name, id = line.split()
_dict[int(id)] = name _dict[int(id)] = name
talents[int(id)] = name
def init(): def init():
global niji_en global niji_en
global holo_en global holo_en
+69
View File
@@ -0,0 +1,69 @@
import platform
import tweepy
from api import *
import talent_lists
class TalentTweet:
    """A tweet together with the IDs of the other accounts it involves.

    Attributes:
        tweet: the tweet object (as returned by tweepy).
        other_parties: set of account IDs involved in the tweet besides
            the author.
    """

    def __init__(self, tweet_id=None, *, tweet: 'tweepy.Tweet | None' = None,
                 other_parties: 'set | None' = None):
        """Build a TalentTweet from a tweet ID or from an already-fetched tweet.

        Python keeps only the last ``__init__`` defined in a class body, so
        both construction styles are handled by this single constructor.

        Args:
            tweet_id: ID of the tweet to look up through ``TwAPI``. Only used
                when ``tweet`` is not supplied.
            tweet: an already-retrieved tweet; when given, no API request
                is made.
            other_parties: involved account IDs accompanying ``tweet``;
                defaults to an empty set.

        Raises:
            TypeError: if neither ``tweet_id`` nor ``tweet`` is provided.
        """
        if tweet is None:
            if tweet_id is None:
                raise TypeError('TalentTweet requires either tweet_id or tweet')
            resp = TwAPI.instance.client.get_tweet(
                tweet_id,
                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
                tweet_fields=TwAPI.TWEET_FIELDS,
                expansions=TwAPI.TWEET_EXPANSIONS)
            tweet = resp.data
            # the involved parties are derived from the same response
            other_parties = TwAPI.get_involved_parties(tweet, resp)

        self.tweet = tweet
        self.other_parties = set() if other_parties is None else other_parties

    def __repr__(self) -> str:
        """Return a multi-line, human-readable summary of the tweet."""
        return (
            f'{self.tweet.id} from {talent_lists.talents.get(self.tweet.author_id, "???")}:\n'
            f'{self.tweet.text}\n'
            f'------------------------------------------------------\n'
            f'{self.get_datetime_str()}\n'
            f'{self.get_mentions_usernames()}\n'
            f'Cross-company: {self.is_cross_company()}\n'
            f'======================================================'
        )

    def is_cross_company(self):
        """Return True if the author and any involved party belong to
        different companies (niji_en vs. holo_en)."""
        author_id = self.tweet.author_id

        # TODO: update for EN/ID
        # The author's company does not change per mention, so resolve the
        # opposing roster once instead of on every iteration.
        if author_id in talent_lists.niji_en:
            other_company = talent_lists.holo_en
        elif author_id in talent_lists.holo_en:
            other_company = talent_lists.niji_en
        else:
            return False
        return any(party_id in other_company for party_id in self.other_parties)

    def get_mentions_usernames(self):
        """Return the involved parties' names as a comma-separated string.

        IDs missing from ``talent_lists.talents`` are shown as ``???``;
        returns ``'none'`` when there are no involved parties.
        """
        if not self.other_parties:
            return 'none'
        return ', '.join(
            f'{talent_lists.talents.get(party_id, "???")}'
            for party_id in self.other_parties
        )

    def get_datetime_str(self):
        """Return the tweet's creation time formatted for display."""
        # strftime's "no zero padding" flag is platform-specific:
        # '#' on Windows, '-' elsewhere.
        unpad = '#' if platform.system() == 'Windows' else '-'
        return self.tweet.created_at.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
class TalentTweets:
    """Holder for a list of tweets (presumably TalentTweet objects);
    the accessor methods are still unimplemented stubs."""

    def __init__(self):
        # begins with nothing collected
        self.ttweets = []

    def get_ttweets(self):
        """Not implemented yet; currently returns None."""
        return None

    def get_ttweet_ids(self):
        """Not implemented yet; currently returns None."""
        return None
+1 -25
View File
@@ -2,39 +2,15 @@
import os import os
import talent_lists import talent_lists
import talenttweet as tt
# returns system path to this project, which is # returns system path to this project, which is
# up one level from this file's directory (src). # up one level from this file's directory (src).
def get_project_dir(): def get_project_dir():
return os.path.join(os.path.dirname(__file__), os.pardir) return os.path.join(os.path.dirname(__file__), os.pardir)
# determine if tweet involves cross-company interaction
def is_cross_company(pair: tuple):
author_id, mentions = pair[0].author_id, pair[1]
for mention_id in mentions:
if author_id in talent_lists.niji_en:
if mention_id in talent_lists.holo_en:
return True
elif author_id in talent_lists.holo_en:
if mention_id in talent_lists.niji_en:
return True
return False
def tweet_id_to_url(id): def tweet_id_to_url(id):
return f'https://twitter.com/twitter/status/{id}' return f'https://twitter.com/twitter/status/{id}'
def print_tweet(pair: tuple):
tweet, mentions = pair
s = (
f'{tweet.id}: {tweet.created_at}: involves {mentions}\n'
f'{tweet.text}\n'
f'-----\n'
f'{tweet.entities}\n'
f'{tweet.referenced_tweets}\n'
f'================================================='
)
print(s)
def clamp(n, smallest, largest): def clamp(n, smallest, largest):
return max(smallest, min(n, largest)) return max(smallest, min(n, largest))