added twint (scraper), restructuring

2022-09-24 17:56:58 -07:00
parent 5b458f2e1f
commit 579929559f
8 changed files with 349 additions and 296 deletions
@@ -1,2 +1,3 @@
 tweepy
 tweet-capture
 git+https://github.com/muskit/twint_2022_fix.git
@@ -3,7 +3,8 @@ from math import inf
 from urllib import response
 import tweepy
-import secrets
+import api_secrets
 import talenttweet as tt
 import util
 class TwAPI:
@@ -12,14 +13,6 @@ class TwAPI:
    TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
    TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
    def __init__(self):
        TwAPI.instance = self
        self.client = tweepy.Client(
            bearer_token=secrets.bearer_token(),
            consumer_key=secrets.api_key(), consumer_secret=secrets.api_secret(),
            access_token=secrets.access_token(), access_token_secret=secrets.access_secret()
        )
    # Returns a set of involved parties for a single tweet.
    #
    # Tweet must have been queried with these parameters:
@@ -48,22 +41,17 @@ class TwAPI:
        return involved_parties
-    # Returns a tweet and mention-set pair, given a tweet ID.
+    def __init__(self):
-    def get_tweet_mentions(self, id):
+        TwAPI.instance = self
-        resp = self.client.get_tweet(id,
+        self.client = tweepy.Client(
-            media_fields=TwAPI.TWEET_MEDIA_FIELDS,
+            bearer_token=api_secrets.bearer_token(),
-            tweet_fields=TwAPI.TWEET_FIELDS,
+            consumer_key=api_secrets.api_key(), consumer_secret=api_secrets.api_secret(),
-            expansions=TwAPI.TWEET_EXPANSIONS)
+            access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
        )
-        tweet = resp.data
+    # Returns a list of TalentTweets from a user.
        mentions = TwAPI.get_involved_parties(tweet, resp)
        return (tweet, mentions)
    # Returns a list (tweet, {mentions}) from a user.
    # mentions- a set comprised of any other parties involved
    # in this tweet (reply, mention, qrt)
    def get_users_all_tweets_mentions(self, id: int, count=inf):
-        pairs = list()
+        ttweets = list()
        retrieve_size = util.clamp(count, 5, 100)
        next_page_token = None
@@ -79,7 +67,7 @@ class TwAPI:
            for tweet in resp.data:
                mentions = TwAPI.get_involved_parties(tweet, resp)
-                pairs.append((tweet, mentions))
+                ttweets.append(tt.TalentTweet(tweet=tweet, other_parties=mentions))
            # update counters and pagination token
            tweets_retrieved += resp.meta['result_count']
@@ -92,15 +80,15 @@ class TwAPI:
                    break  # reached end of user's tweets
        print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
-        return pairs
+        return ttweets
-    # returns a filtered list (tweet, [mentions]) from a user
+    # Returns a list of cross-company TalentTweets from a user.
    def get_users_cross_tweets_mentions(self, id):
        ret = list()
-        pairs = self.get_users_all_tweets_mentions(id)
+        ttweets = self.get_users_all_tweets_mentions(id)
-        for pair in pairs:
+        for ttweet in ttweets:
-            if util.is_cross_company(pair):
+            if ttweet.is_cross_company():
-                ret.append(pair)
+                ret.append(ttweet)
        return ret
@@ -3,13 +3,13 @@
 import os
 import configparser
-from util import *
+import util
 # returns dictionary of the Credentials section.
 # [NOT TO BE USED OUTSIDE OF THIS FILE.]
 def __get_ini_credentials():
    c = configparser.RawConfigParser()
-    if len(c.read(os.path.join(get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
+    if len(c.read(os.path.join(util.get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
        return c['Credentials']
    return None
@@ -6,16 +6,34 @@
 import os
 import twint
 from util import *
 from talent_lists import *
 from api import TwAPI
 import talenttweet as tt
 ## Returns list of tweets present in queue.txt
 def get_local_queue():
    """Stub: reading queue.txt is not implemented yet, so None is returned."""
    # Intended source once implemented:
    # f = open(os.path.join(get_project_dir(), 'queue.txt'))
    return None
 ## Returns the ID of all tweets (up to limit) from a user ID.
 def get_user_tweet_ids(id, limit=None):
    """Run a twint search for one user and return the IDs of the results.

    id    -- value assigned to the twint config's User_id
    limit -- value assigned to the twint config's Limit (None is passed as-is)
    """
    scraped = []
    config = twint.Config()
    config.User_id = id
    config.Limit = limit
    # twint is expected to append its result objects to `scraped`.
    config.Store_object = True
    config.Store_object_tweets_list = scraped
    twint.run.Search(config)
    return [tweet.id for tweet in scraped]
 def run():
    queue = get_local_queue()
-    pairs = TwAPI.instance.get_users_all_tweets_mentions(1390620618001838086, count=5)
+
-    for (tweet, mentions) in pairs:
+    tweets_ids = get_user_tweet_ids(1390620618001838086, limit=20)
-        print_tweet(tweet, mentions)
+    for id in tweets_ids:
        ttweet = tt.TalentTweet(id)
        print(ttweet)
@@ -3,12 +3,10 @@ import argparse
 from argparse import RawTextHelpFormatter
 import talent_lists
-import secrets
+import api_secrets
 import catchup
 import listen
 from api import TwAPI
 from util import is_cross_company, print_tweet
 MODES_HELP_STR = '''mode to run the bot at:
 l,listen:       listen for new tweets from all accounts; will not terminate unless error occurs
@@ -21,6 +19,10 @@ def init_argparse():
    p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
    return p
 # TODO: implement command line mode for manually controlling the bot
 def command_line():
    """Placeholder for the manual command-line mode; currently does nothing."""
    return None
 def main():
    parser = init_argparse()
    if len(sys.argv) < 2:
@@ -30,7 +32,7 @@ def main():
    args = parser.parse_args()
    if args.show_tokens:
-        print(secrets.get_all_secrets())
+        print(api_secrets.get_all_secrets())
    if args.mode is None: return
@@ -42,11 +44,6 @@ def main():
    # Initialize talent account lists
    talent_lists.init()
    ## TEST CODE ##
    cross_pairs = twApi.get_users_cross_tweets_mentions(1390620618001838086)
    for pair in cross_pairs:
        print_tweet(pair)
    ## Determine running mode
    match args.mode.lower():
        case 'l' | 'listen':
@@ -56,6 +53,8 @@ def main():
            print('RUNNING IN CATCH-UP MODE\n')
            catchup.run()
        case _: 
            command_line()
            #TODO: remove message
            print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
            return
@@ -2,15 +2,17 @@ import util
 niji_en = dict()
 holo_en = dict()
 talents = dict()
 def __create_dict(file, _dict):
    global talents
    with open(file, 'r') as f:
        for line in f:
            words = line.split()
            if len(words) == 2 and line[0] != '#':
                name, id = line.split()
                _dict[int(id)] = name
-
+                talents[int(id)] = name
 def init():
    global niji_en
    global holo_en
@@ -0,0 +1,69 @@
 import platform
 import tweepy
 from api import *
 import talent_lists
 class TalentTweet:
    """A tweet paired with the set of other account IDs it involves.

    Attributes:
        tweet         -- the tweet object (its id, author_id, text and
                         created_at are read by the methods below)
        other_parties -- collection of account IDs involved in the tweet
    """
    def __init__(self, tweet_id=None, *, tweet: 'tweepy.Tweet | None' = None,
            other_parties: 'set | None' = None):
        """Build from an already-fetched tweet, or fetch one by ID.

        Python keeps only the last `__init__` defined in a class body, so the
        two construction styles are merged into this single constructor:

            TalentTweet(tweet=tweet, other_parties=mentions)  # no API call
            TalentTweet(tweet_id)                             # fetched via TwAPI

        tweet_id      -- ID to look up through TwAPI.instance.client.get_tweet;
                         ignored when `tweet` is supplied
        tweet         -- pre-fetched tweet object (keyword-only)
        other_parties -- involved account IDs for `tweet` (keyword-only);
                         defaults to an empty set

        Raises TypeError when neither `tweet_id` nor `tweet` is given.
        """
        if tweet is None:
            if tweet_id is None:
                raise TypeError('TalentTweet requires either tweet_id or tweet')
            resp = TwAPI.instance.client.get_tweet(tweet_id,
                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
                tweet_fields=TwAPI.TWEET_FIELDS,
                expansions=TwAPI.TWEET_EXPANSIONS)
            tweet = resp.data
            other_parties = TwAPI.get_involved_parties(tweet, resp)
        self.tweet = tweet
        self.other_parties = set() if other_parties is None else other_parties
    def __repr__(self) -> str:
        """Multi-line summary: ID, author, text, time, mentions, cross-company flag."""
        return (
            f'{self.tweet.id} from {talent_lists.talents.get(self.tweet.author_id, "???")}:\n'
            f'{self.tweet.text}\n'
            f'------------------------------------------------------\n'
            f'{self.get_datetime_str()}\n'
            f'{self.get_mentions_usernames()}\n'
            f'Cross-company: {self.is_cross_company()}\n'
            f'======================================================'
        )
    def is_cross_company(self):
        """Return True if the author is in one roster and any other party is in the opposite one."""
        author_id = self.tweet.author_id
        # TODO: update for EN/ID
        # The author's roster does not change per mention, so pick the
        # opposing roster once instead of re-checking inside the loop.
        if author_id in talent_lists.niji_en:
            other_company = talent_lists.holo_en
        elif author_id in talent_lists.holo_en:
            other_company = talent_lists.niji_en
        else:
            return False
        return any(mention_id in other_company for mention_id in self.other_parties)
    def get_mentions_usernames(self):
        """Return the other parties' names comma-separated, or 'none' if there are none.

        IDs missing from talent_lists.talents are shown as "???".
        """
        if not self.other_parties:
            return 'none'
        return ', '.join(
            str(talent_lists.talents.get(party_id, "???"))
            for party_id in self.other_parties
        )
    def get_datetime_str(self):
        """Format tweet.created_at, e.g. 'Sep 24 2022, 5:56PM (UTC)'."""
        # The flag that strips zero-padding is '#' on Windows and '-' elsewhere.
        unpad = '#' if platform.system() == 'Windows' else '-'
        return self.tweet.created_at.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
 class TalentTweets:
    """Holder for a list of TalentTweet objects; the accessors are still stubs."""
    def __init__(self):
        # Starts empty; nothing in this class fills it yet.
        self.ttweets = []
    def get_ttweets(self):
        """Stub: not implemented, returns None."""
        return None
    def get_ttweet_ids(self):
        """Stub: not implemented, returns None."""
        return None
@@ -2,39 +2,15 @@
 import os
 import talent_lists
 import talenttweet as tt
 # returns system path to this project, which is
 # up one level from this file's directory (src).
 def get_project_dir():
    """Return this file's directory joined with os.pardir (path is not normalized)."""
    here = os.path.dirname(__file__)
    return os.path.join(here, os.pardir)
 # determine if tweet involves cross-company interaction
 def is_cross_company(pair: tuple):
    """Return True if the tweet's author and any mention belong to opposite rosters.

    pair -- (tweet, mentions); tweet.author_id is read and mentions is iterated
    """
    author_id, mentions = pair[0].author_id, pair[1]
    # Evaluated lazily per mention: a niji_en author needs a holo_en mention,
    # a holo_en author needs a niji_en mention, anyone else never matches.
    return any(
        mention_id in talent_lists.holo_en
        if author_id in talent_lists.niji_en
        else author_id in talent_lists.holo_en and mention_id in talent_lists.niji_en
        for mention_id in mentions
    )
 def tweet_id_to_url(id):
    """Build the twitter.com status URL for the given tweet ID."""
    url = f'https://twitter.com/twitter/status/{id}'
    return url
 def print_tweet(pair: tuple):
    """Print a debug dump of a (tweet, mentions) pair to stdout.

    The dump shows the tweet's id, created_at, the mentions, text,
    entities and referenced_tweets, followed by a separator line.
    """
    tweet, mentions = pair
    segments = (
        f'{tweet.id}: {tweet.created_at}: involves {mentions}\n',
        f'{tweet.text}\n',
        f'-----\n',
        f'{tweet.entities}\n',
        f'{tweet.referenced_tweets}\n',
        f'=================================================',
    )
    print(''.join(segments))
 def clamp(n, smallest, largest):
    """Limit n to the range [smallest, largest]; the lower bound wins if the bounds cross."""
    # Cap from above first, then raise to the floor (same order as max(min(...))).
    capped = largest if largest < n else n
    return capped if capped > smallest else smallest