added twint (scraper), restructuring

This commit is contained in:
msk
2022-09-24 17:56:58 -07:00
parent 5b458f2e1f
commit 579929559f
8 changed files with 349 additions and 296 deletions
+1
View File
@@ -1,2 +1,3 @@
tweepy
tweet-capture
git+https://github.com/muskit/twint_2022_fix.git
+18 -30
View File
@@ -3,7 +3,8 @@ from math import inf
from urllib import response
import tweepy
import secrets
import api_secrets
import talenttweet as tt
import util
class TwAPI:
@@ -12,14 +13,6 @@ class TwAPI:
TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
def __init__(self):
TwAPI.instance = self
self.client = tweepy.Client(
bearer_token=secrets.bearer_token(),
consumer_key=secrets.api_key(), consumer_secret=secrets.api_secret(),
access_token=secrets.access_token(), access_token_secret=secrets.access_secret()
)
# Returns a set of involved parties for a single tweet.
#
# Tweet must have been queried with these parameters:
@@ -48,22 +41,17 @@ class TwAPI:
return involved_parties
# Returns a tweet and mention-set pair, given a tweet ID.
def get_tweet_mentions(self, id):
resp = self.client.get_tweet(id,
media_fields=TwAPI.TWEET_MEDIA_FIELDS,
tweet_fields=TwAPI.TWEET_FIELDS,
expansions=TwAPI.TWEET_EXPANSIONS)
def __init__(self):
TwAPI.instance = self
self.client = tweepy.Client(
bearer_token=api_secrets.bearer_token(),
consumer_key=api_secrets.api_key(), consumer_secret=api_secrets.api_secret(),
access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
)
tweet = resp.data
mentions = TwAPI.get_involved_parties(tweet, resp)
return (tweet, mentions)
# Returns a list (tweet, {mentions}) from a user.
# mentions- a set comprised of any other parties involved
# in this tweet (reply, mention, qrt)
# Returns a list of TalentTweets from a user.
def get_users_all_tweets_mentions(self, id: int, count=inf):
pairs = list()
ttweets = list()
retrieve_size = util.clamp(count, 5, 100)
next_page_token = None
@@ -79,7 +67,7 @@ class TwAPI:
for tweet in resp.data:
mentions = TwAPI.get_involved_parties(tweet, resp)
pairs.append((tweet, mentions))
ttweets.append(tt.TalentTweet(tweet=tweet, other_parties=mentions))
# update counters and pagination token
tweets_retrieved += resp.meta['result_count']
@@ -92,15 +80,15 @@ class TwAPI:
break # reached end of user's tweets
print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
return pairs
return ttweets
# returns a filtered list (tweet, [mentions]) from a user
# Returns a list of cross-company TalentTweets from a user.
def get_users_cross_tweets_mentions(self, id):
ret = list()
pairs = self.get_users_all_tweets_mentions(id)
for pair in pairs:
if util.is_cross_company(pair):
ret.append(pair)
ttweets = self.get_users_all_tweets_mentions(id)
for ttweet in ttweets:
if ttweet.is_cross_company():
ret.append(ttweet)
return ret
+2 -2
View File
@@ -3,13 +3,13 @@
import os
import configparser
from util import *
import util
# returns dictionary of the Credentials section.
# [NOT TO BE USED OUTSIDE OF THIS FILE.]
def __get_ini_credentials():
c = configparser.RawConfigParser()
if len(c.read(os.path.join(get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
if len(c.read(os.path.join(util.get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
return c['Credentials']
return None
+21 -3
View File
@@ -6,16 +6,34 @@
import os
import twint
from util import *
from talent_lists import *
from api import TwAPI
import talenttweet as tt
## Returns list of tweets present in queue.txt
def get_local_queue():
    """Placeholder for reading the tweets listed in ``queue.txt``.

    Not implemented yet: no file is opened and ``None`` is returned.
    """
    # Intended source file: os.path.join(get_project_dir(), 'queue.txt')
    return None
## Returns the ID of all tweets (up to limit) from a user ID.
def get_user_tweet_ids(id, limit=None):
    """Run a twint search for one user and return the IDs of the tweets found.

    id: user ID assigned to the twint config's ``User_id``.
    limit: value assigned to the config's ``Limit`` (``None`` by default).

    Returns a list holding the ``id`` attribute of every stored tweet object.
    """
    collected = []

    config = twint.Config()
    config.Store_object = True
    # Results are read back from this list once the search has run.
    config.Store_object_tweets_list = collected
    config.User_id = id
    config.Limit = limit

    twint.run.Search(config)
    return [tweet.id for tweet in collected]
def run():
queue = get_local_queue()
pairs = TwAPI.instance.get_users_all_tweets_mentions(1390620618001838086, count=5)
for (tweet, mentions) in pairs:
print_tweet(tweet, mentions)
tweets_ids = get_user_tweet_ids(1390620618001838086, limit=20)
for id in tweets_ids:
ttweet = tt.TalentTweet(id)
print(ttweet)
+8 -9
View File
@@ -3,12 +3,10 @@ import argparse
from argparse import RawTextHelpFormatter
import talent_lists
import secrets
import api_secrets
import catchup
import listen
from api import TwAPI
from util import is_cross_company, print_tweet
MODES_HELP_STR = '''mode to run the bot at:
l,listen: listen for new tweets from all accounts; will not terminate unless error occurs
@@ -21,6 +19,10 @@ def init_argparse():
p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
return p
# TODO: implement command line mode for manually controlling the bot
def command_line():
    """Placeholder for the manual command-line control mode (not implemented)."""
    return None
def main():
parser = init_argparse()
if len(sys.argv) < 2:
@@ -30,7 +32,7 @@ def main():
args = parser.parse_args()
if args.show_tokens:
print(secrets.get_all_secrets())
print(api_secrets.get_all_secrets())
if args.mode is None: return
@@ -42,11 +44,6 @@ def main():
# Initialize talent account lists
talent_lists.init()
## TEST CODE ##
cross_pairs = twApi.get_users_cross_tweets_mentions(1390620618001838086)
for pair in cross_pairs:
print_tweet(pair)
## Determine running mode
match args.mode.lower():
case 'l' | 'listen':
@@ -56,6 +53,8 @@ def main():
print('RUNNING IN CATCH-UP MODE\n')
catchup.run()
case _:
command_line()
#TODO: remove message
print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
return
+3 -1
View File
@@ -2,15 +2,17 @@ import util
niji_en = dict()
holo_en = dict()
talents = dict()
def __create_dict(file, _dict):
    """Read ``name id`` pairs from a text file into ``_dict`` and ``talents``.

    file: path of the text file to read.
    _dict: mapping that receives ``int(id) -> name`` for every accepted line.

    A line is accepted only when it splits into exactly two whitespace-separated
    words and its first character is not ``#``. Every accepted entry is also
    stored in the module-level ``talents`` mapping.
    """
    global talents
    with open(file, 'r') as handle:
        for raw in handle:
            # Lines whose first character is '#' are treated as comments.
            if raw.startswith('#'):
                continue
            fields = raw.split()
            if len(fields) != 2:
                continue
            name, id_text = fields
            account_id = int(id_text)
            _dict[account_id] = name
            talents[account_id] = name
def init():
global niji_en
global holo_en
+69
View File
@@ -0,0 +1,69 @@
import platform
import tweepy
from api import *
import talent_lists
class TalentTweet:
    """A tweet together with the IDs of the other accounts involved in it.

    Attributes set by ``__init__``:
        tweet: the tweet object, either supplied by the caller or taken from
            ``resp.data`` of a ``get_tweet`` call.
        other_parties: collection of account IDs involved in the tweet.
    """

    def __init__(self, tweet_id=None, *, tweet: 'tweepy.Tweet' = None,
                 other_parties: set = None):
        """Build a TalentTweet from a tweet ID or from an already-fetched tweet.

        Python keeps only the last ``__init__`` defined in a class body, so the
        two construction styles used in this project are merged into a single
        constructor:

        * ``TalentTweet(tweet_id)`` looks the tweet up through ``TwAPI``.
        * ``TalentTweet(tweet=..., other_parties=...)`` wraps existing data
          without any API call.

        tweet_id: ID of the tweet to look up; ignored when ``tweet`` is given.
        tweet: an already-retrieved tweet object (keyword-only).
        other_parties: IDs of the other accounts involved (keyword-only);
            defaults to an empty set when ``tweet`` is given without it.

        Raises TypeError when neither ``tweet_id`` nor ``tweet`` is supplied.
        """
        if tweet is not None:
            self.tweet = tweet
            self.other_parties = set() if other_parties is None else other_parties
            return

        if tweet_id is None:
            raise TypeError('TalentTweet requires either tweet_id or tweet')

        # Imported here rather than relying on the module-level star import:
        # api appears to import this module as well, so TwAPI may not be
        # defined yet when this module is first loaded.
        from api import TwAPI

        resp = TwAPI.instance.client.get_tweet(
            tweet_id,
            media_fields=TwAPI.TWEET_MEDIA_FIELDS,
            tweet_fields=TwAPI.TWEET_FIELDS,
            expansions=TwAPI.TWEET_EXPANSIONS,
        )
        self.tweet = resp.data
        self.other_parties = TwAPI.get_involved_parties(self.tweet, resp)

    def __repr__(self) -> str:
        """Return a multi-line, human-readable summary of the tweet."""
        return (
            f'{self.tweet.id} from {talent_lists.talents.get(self.tweet.author_id, "???")}:\n'
            f'{self.tweet.text}\n'
            f'------------------------------------------------------\n'
            f'{self.get_datetime_str()}\n'
            f'{self.get_mentions_usernames()}\n'
            f'Cross-company: {self.is_cross_company()}\n'
            f'======================================================'
        )

    def is_cross_company(self):
        """Return True when the author and any other party belong to opposite lists.

        The author is matched against ``talent_lists.niji_en`` first, then
        ``talent_lists.holo_en``; an author found in neither never counts.
        """
        author_id = self.tweet.author_id

        # TODO: update for EN/ID
        # The author's list does not change per mention, so resolve it once.
        if author_id in talent_lists.niji_en:
            other_company = talent_lists.holo_en
        elif author_id in talent_lists.holo_en:
            other_company = talent_lists.niji_en
        else:
            return False
        return any(party in other_company for party in self.other_parties)

    def get_mentions_usernames(self):
        """Return the other parties' names joined by ``', '``, or ``'none'``.

        IDs missing from ``talent_lists.talents`` are shown as ``???``.
        """
        if not self.other_parties:
            return 'none'
        return ', '.join(
            str(talent_lists.talents.get(party, '???'))
            for party in self.other_parties
        )

    def get_datetime_str(self):
        """Format ``tweet.created_at`` with an unpadded day and hour."""
        # The strftime flag that strips zero padding is '#' on Windows and
        # '-' on the other platforms.
        unpad = '#' if platform.system() == 'Windows' else '-'
        return self.tweet.created_at.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
class TalentTweets:
    """Holder for a list of tweets (presumably TalentTweet objects).

    The accessor methods are placeholders and are not implemented yet.
    """

    def __init__(self):
        # Starts empty; nothing in this class adds to the list yet.
        self.ttweets = []

    def get_ttweets(self):
        """Placeholder; currently returns ``None``."""
        return None

    def get_ttweet_ids(self):
        """Placeholder; currently returns ``None``."""
        return None
+1 -25
View File
@@ -2,39 +2,15 @@
import os
import talent_lists
import talenttweet as tt
# returns system path to this project, which is
# up one level from this file's directory (src).
def get_project_dir():
    """Return this file's directory joined with the parent-directory marker.

    The result is not normalised; it still ends in ``os.pardir``.
    """
    source_dir = os.path.dirname(__file__)
    return os.path.join(source_dir, os.pardir)
# determine if tweet involves cross-company interaction
def is_cross_company(pair: tuple):
    """Tell whether a (tweet, mentions) pair links the two talent lists.

    pair: ``pair[0]`` is a tweet exposing ``author_id``; ``pair[1]`` is an
        iterable of mentioned account IDs.

    Returns True when the author is in ``talent_lists.niji_en`` and a mention
    is in ``talent_lists.holo_en``, or the other way round; False otherwise.
    """
    tweet, mentions = pair[0], pair[1]
    author_id = tweet.author_id

    # niji_en is checked first, matching the if/elif order of the lookup.
    if author_id in talent_lists.niji_en:
        opposite = talent_lists.holo_en
    elif author_id in talent_lists.holo_en:
        opposite = talent_lists.niji_en
    else:
        return False
    return any(mention_id in opposite for mention_id in mentions)
def tweet_id_to_url(id):
    """Return the twitter.com status URL built from a tweet ID."""
    return 'https://twitter.com/twitter/status/{}'.format(id)
def print_tweet(pair: tuple):
    """Print a debugging summary of a (tweet, mentions) pair to stdout.

    pair: a two-item tuple; the first item exposes ``id``, ``created_at``,
        ``text``, ``entities`` and ``referenced_tweets``, the second is the
        collection of involved parties.
    """
    tweet, mentions = pair
    report_lines = [
        f'{tweet.id}: {tweet.created_at}: involves {mentions}',
        f'{tweet.text}',
        '-----',
        f'{tweet.entities}',
        f'{tweet.referenced_tweets}',
        '=================================================',
    ]
    print('\n'.join(report_lines))
def clamp(n, smallest, largest):
    """Limit ``n`` to the range from ``smallest`` to ``largest``.

    The upper bound is applied first, then the lower bound.
    """
    upper_bounded = min(n, largest)
    return max(smallest, upper_bounded)