added twint (scraper), restructuring

2022-09-24 17:56:58 -07:00
parent 5b458f2e1f
commit 579929559f
8 changed files with 349 additions and 296 deletions
@@ -1,2 +1,3 @@
-tweepy
+tweepy
-tweet-capture
+tweet-capture
 git+https://github.com/muskit/twint_2022_fix.git
@@ -1,109 +1,97 @@
-from lib2to3.pgen2 import token
+from lib2to3.pgen2 import token
-from math import inf
+from math import inf
-from urllib import response
+from urllib import response
-import tweepy
+import tweepy
-
+
-import secrets
+import api_secrets
-import util
+import talenttweet as tt
-
+import util
-class TwAPI:
+
-    instance = None
+class TwAPI:
-    TWEET_MEDIA_FIELDS = ['url']
+    instance = None
-    TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
+    TWEET_MEDIA_FIELDS = ['url']
-    TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
+    TWEET_FIELDS = ['created_at', 'in_reply_to_user_id']
-
+    TWEET_EXPANSIONS = ['entities.mentions.username', 'referenced_tweets.id.author_id']
-    def __init__(self):
+
-        TwAPI.instance = self
+    # Returns a set of involved parties for a single tweet.
-        self.client = tweepy.Client(
+    #
-            bearer_token=secrets.bearer_token(),
+    # Tweet must have been queried with these parameters:
-            consumer_key=secrets.api_key(), consumer_secret=secrets.api_secret(),
+    # media_fields=['url'],
-            access_token=secrets.access_token(), access_token_secret=secrets.access_secret()
+    # tweet_fields=['created_at', 'in_reply_to_user_id'],
-        )
+    # expansions=['entities.mentions.username', 'referenced_tweets.id.author_id']
-    
+    @staticmethod
-    # Returns a set of involved parties for a single tweet.
+    def get_involved_parties(tweet, response):
-    #
+        involved_parties = set()
-    # Tweet must have been queried with these parameters:
+        # mentions
-    # media_fields=['url'],
+        try:
-    # tweet_fields=['created_at', 'in_reply_to_user_id'],
+            mention_list = tweet.entities['mentions']
-    # expansions=['entities.mentions.username', 'referenced_tweets.id.author_id']
+            for mention in mention_list:
-    @staticmethod
+                involved_parties.add(int(mention['id']))
-    def get_involved_parties(tweet, response):
+        except: pass
-        involved_parties = set()
+        # reply-to
-        # mentions
+        if tweet.in_reply_to_user_id != None:
-        try:
+            involved_parties.add(tweet.in_reply_to_user_id)
-            mention_list = tweet.entities['mentions']
+        # qrt
-            for mention in mention_list:
+        if tweet.attachments:
-                involved_parties.add(int(mention['id']))
+            for ref_tweet in tweet.attachments:
-        except: pass
+                if ref_tweet.type == 'quoted':
-        # reply-to
+                    for incl_tweet in response.includes['tweets']:
-        if tweet.in_reply_to_user_id != None:
+                        if incl_tweet.id == ref_tweet.id:
-            involved_parties.add(tweet.in_reply_to_user_id)
+                            involved_parties.add(incl_tweet.author_id)
-        # qrt
+
-        if tweet.attachments:
+        return involved_parties
-            for ref_tweet in tweet.attachments:
+
-                if ref_tweet.type == 'quoted':
+    def __init__(self):
-                    for incl_tweet in response.includes['tweets']:
+        TwAPI.instance = self
-                        if incl_tweet.id == ref_tweet.id:
+        self.client = tweepy.Client(
-                            involved_parties.add(incl_tweet.author_id)
+            bearer_token=api_secrets.bearer_token(),
-
+            consumer_key=api_secrets.api_key(), consumer_secret=api_secrets.api_secret(),
-        return involved_parties
+            access_token=api_secrets.access_token(), access_token_secret=api_secrets.access_secret()
-    
+        )
-    # Returns a tweet and mention-set pair, given a tweet ID.
+    
-    def get_tweet_mentions(self, id):
+    # Returns a list of TalentTweets from a user.
-        resp = self.client.get_tweet(id,
+    def get_users_all_tweets_mentions(self, id: int, count=inf):
-            media_fields=TwAPI.TWEET_MEDIA_FIELDS,
+        ttweets = list()
-            tweet_fields=TwAPI.TWEET_FIELDS,
+
-            expansions=TwAPI.TWEET_EXPANSIONS)
+        retrieve_size = util.clamp(count, 5, 100)
-        
+        next_page_token = None
-        tweet = resp.data
+        tokens_retrieved = 0
-        mentions = TwAPI.get_involved_parties(tweet, resp)
+        tweets_retrieved = 0
-        return (tweet, mentions)
+
-    
+        while tweets_retrieved < count:
-    # Returns a list (tweet, {mentions}) from a user.
+            print(f'Retrieved {tokens_retrieved} tokens so far...')
-    # mentions- a set comprised of any other parties involved
+            resp = self.client.get_users_tweets(id, max_results=retrieve_size, pagination_token=next_page_token,
-    # in this tweet (reply, mention, qrt)
+                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
-    def get_users_all_tweets_mentions(self, id: int, count=inf):
+                tweet_fields=TwAPI.TWEET_FIELDS,
-        pairs = list()
+                expansions=TwAPI.TWEET_EXPANSIONS)
-
+
-        retrieve_size = util.clamp(count, 5, 100)
+            for tweet in resp.data:
-        next_page_token = None
+                mentions = TwAPI.get_involved_parties(tweet, resp)
-        tokens_retrieved = 0
+                ttweets.append(tt.TalentTweet(tweet=tweet, other_parties=mentions))
-        tweets_retrieved = 0
+
-
+            # update counters and pagination token
-        while tweets_retrieved < count:
+            tweets_retrieved += resp.meta['result_count']
-            print(f'Retrieved {tokens_retrieved} tokens so far...')
+            if tweets_retrieved < count:
-            resp = self.client.get_users_tweets(id, max_results=retrieve_size, pagination_token=next_page_token,
+                try:
-                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
+                    next_page_token = resp.meta['next_token']
-                tweet_fields=TwAPI.TWEET_FIELDS,
+                    tokens_retrieved += 1
-                expansions=TwAPI.TWEET_EXPANSIONS)
+                except KeyError:
-
+                    print("next_token wasn't provided; we've reached the end!")
-            for tweet in resp.data:
+                    break  # reached end of user's tweets
-                mentions = TwAPI.get_involved_parties(tweet, resp)
+
-                pairs.append((tweet, mentions))
+        print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
-
+        return ttweets
-            # update counters and pagination token
+    
-            tweets_retrieved += resp.meta['result_count']
+    # Returns a list of cross-company TalentTweets from a user.
-            if tweets_retrieved < count:
+    def get_users_cross_tweets_mentions(self, id):
-                try:
+        ret = list()
-                    next_page_token = resp.meta['next_token']
+        ttweets = self.get_users_all_tweets_mentions(id)
-                    tokens_retrieved += 1
+        for ttweet in ttweets:
-                except KeyError:
+            if ttweet.is_cross_company():
-                    print("next_token wasn't provided; we've reached the end!")
+                ret.append(ttweet)
-                    break  # reached end of user's tweets
+        
-
+        return ret
-        print(f'Retrieved {tweets_retrieved} tweets using {tokens_retrieved} tokens.')
+
-        return pairs
+    # Create a post that showcases given tweet and its mentions set.
-    
+    def create_post(self, tweet, mentions):
    # returns a filtered list (tweet, [mentions]) from a user
    def get_users_cross_tweets_mentions(self, id):
        """Return the user's (tweet, mentions) pairs that pass util.is_cross_company.

        Every pair produced by get_users_all_tweets_mentions(id) is tested,
        and the accepted ones are returned in their original order.
        """
        all_pairs = self.get_users_all_tweets_mentions(id)
        return [entry for entry in all_pairs if util.is_cross_company(entry)]
    # Create a post that showcases given tweet and its mentions set.
    def create_post(self, tweet, mentions):
        """Placeholder for posting a showcase of a tweet and its mentions set.

        Not implemented yet: both arguments are ignored and None is returned.
        """
        return None
@@ -1,42 +1,42 @@
-## Twitter developer credentials management.
+## Twitter developer credentials management.
-
+
-import os
+import os
-import configparser
+import configparser
-
+
-from util import *
+import util
-
+
-# returns dictionary of the Credentials section.
+# returns dictionary of the Credentials section.
-# [NOT TO BE USED OUTSIDE OF THIS FILE.]
+# [NOT TO BE USED OUTSIDE OF THIS FILE.]
-def __get_ini_credentials():
+def __get_ini_credentials():
-    c = configparser.RawConfigParser()
+    c = configparser.RawConfigParser()
-    if len(c.read(os.path.join(get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
+    if len(c.read(os.path.join(util.get_project_dir(), 'secrets.ini'))) > 0 and c.has_section('Credentials'):
-        return c['Credentials']
+        return c['Credentials']
-    return None
+    return None
-
+
-# returns the consumer api_key stored in secrets.ini
+# returns the consumer api_key stored in secrets.ini
-def api_key():
+def api_key():
-    c = __get_ini_credentials()
+    c = __get_ini_credentials()
-    return c.get(option='api_key', fallback='xxx') if c is not None else 'xxx'
+    return c.get(option='api_key', fallback='xxx') if c is not None else 'xxx'
-
+
-# returns the consumer api_secret stored in secrets.ini
+# returns the consumer api_secret stored in secrets.ini
-def api_secret():
+def api_secret():
-    c = __get_ini_credentials()
+    c = __get_ini_credentials()
-    return c.get(option='api_secret', fallback='yyy') if c is not None else 'yyy'
+    return c.get(option='api_secret', fallback='yyy') if c is not None else 'yyy'
-
+
-# returns the bearer_token stored in secrets.ini
+# returns the bearer_token stored in secrets.ini
-def bearer_token():
+def bearer_token():
-    c = __get_ini_credentials()
+    c = __get_ini_credentials()
-    return c.get(option='bearer_token', fallback='zzz') if c is not None else 'zzz'
+    return c.get(option='bearer_token', fallback='zzz') if c is not None else 'zzz'
-
+
-# returns the access_token stroed in secrets.ini
+# returns the access_token stored in secrets.ini
-def access_token():
+def access_token():
-    c = __get_ini_credentials()
+    c = __get_ini_credentials()
-    return c.get(option='oauth1_access_token', fallback='zzz') if c is not None else 'aaa'
+    return c.get(option='oauth1_access_token', fallback='zzz') if c is not None else 'aaa'
-
+
-# returns the access_secret stroed in secrets.ini
+# returns the access_secret stored in secrets.ini
-def access_secret():
+def access_secret():
-    c = __get_ini_credentials()
+    c = __get_ini_credentials()
-    return c.get(option='oauth1_access_secret', fallback='zzz') if c is not None else 'bbb'
+    return c.get(option='oauth1_access_secret', fallback='zzz') if c is not None else 'bbb'
-
+
-def get_all_secrets():
+def get_all_secrets():
    return f'api_key:{api_key()}\napi_secret:{api_secret()}\nbearer_token:{bearer_token()}\naccess_token:{access_token()}\naccess_secret:{access_secret()}'
@@ -1,21 +1,39 @@
-## The bot's catch-up mode
+## The bot's catch-up mode
-# Scan all accounts for cross-company interactions.
+# Scan all accounts for cross-company interactions.
-# Terminates when finished scanning and posting.
+# Terminates when finished scanning and posting.
-#
+#
-# We should post, at the fastest, one tweet per minute.
+# We should post, at the fastest, one tweet per minute.
-
+
-import os
+import os
-
+
-from util import *
+import twint
-from api import TwAPI
+
-
+from util import *
-## Returns list of tweets present in queue.txt
+from talent_lists import *
-def get_local_queue():
+from api import TwAPI
-    # f = open(os.path.join(get_project_dir(), 'queue.txt'))
+import talenttweet as tt
-    pass
+
-
+## Returns list of tweets present in queue.txt
-def run():
+def get_local_queue():
-    queue = get_local_queue()
+    # f = open(os.path.join(get_project_dir(), 'queue.txt'))
-    pairs = TwAPI.instance.get_users_all_tweets_mentions(1390620618001838086, count=5)
+    pass
-    for (tweet, mentions) in pairs:
+
-        print_tweet(tweet, mentions)
+## Returns the ID of all tweets (up to limit) from a user ID.
 def get_user_tweet_ids(id, limit=None):
    """Return the IDs of the tweets twint collects for a user.

    id is assigned to the twint config's User_id and limit to its Limit;
    twint stores the tweet objects it finds in a local list, and the `id`
    attribute of each stored object is returned.
    """
    config = twint.Config()
    collected = []
    config.User_id = id
    config.Limit = limit
    # Have twint append result objects to our list so they can be read back.
    config.Store_object = True
    config.Store_object_tweets_list = collected
    twint.run.Search(config)
    return [scraped.id for scraped in collected]
 def run():
    """Catch-up entry point: print a TalentTweet for up to 20 tweets of one account."""
    queue = get_local_queue()  # fetched but not used yet
    for tweet_id in get_user_tweet_ids(1390620618001838086, limit=20):
        # Each TalentTweet is built from a tweet ID and printed via its repr.
        print(tt.TalentTweet(tweet_id))
@@ -1,64 +1,63 @@
-import sys
+import sys
-import argparse
+import argparse
-from argparse import RawTextHelpFormatter
+from argparse import RawTextHelpFormatter
-
+
-import talent_lists
+import talent_lists
-import secrets
+import api_secrets
-import catchup
+import catchup
-import listen
+import listen
-
+from api import TwAPI
-from api import TwAPI
+
-from util import is_cross_company, print_tweet
+MODES_HELP_STR = '''mode to run the bot at:
-
+l,listen:       listen for new tweets from all accounts; will not terminate unless error occurs
-MODES_HELP_STR = '''mode to run the bot at:
+c,catchup:      scan all tweets from all accounts; will terminate when done'''
-l,listen:       listen for new tweets from all accounts; will not terminate unless error occurs
+
-c,catchup:      scan all tweets from all accounts; will terminate when done'''
+def init_argparse():
-
+    p = argparse.ArgumentParser(description='Twitter bot that follows interactions between Nijisanji EN/ID and hololive EN/ID members.', formatter_class=RawTextHelpFormatter)
-def init_argparse():
+    p.add_argument('mode', nargs='?', \
-    p = argparse.ArgumentParser(description='Twitter bot that follows interactions between Nijisanji EN/ID and hololive EN/ID members.', formatter_class=RawTextHelpFormatter)
+        help=MODES_HELP_STR)
-    p.add_argument('mode', nargs='?', \
+    p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
-        help=MODES_HELP_STR)
+    return p
-    p.add_argument('--show-tokens', action='store_true', help='[DO NOT USE IN PUBLIC SETTING] print stored tokens from secrets.ini')
+
-    return p
+# TODO: implement command line mode for manually controlling the bot
-
+def command_line():
-def main():
+    pass
-    parser = init_argparse()
+
-    if len(sys.argv) < 2:
+def main():
-        parser.print_help()
+    parser = init_argparse()
-        return
+    if len(sys.argv) < 2:
-
+        parser.print_help()
-    args = parser.parse_args()
+        return
-
+
-    if args.show_tokens:
+    args = parser.parse_args()
-        print(secrets.get_all_secrets())
+
-
+    if args.show_tokens:
-    if args.mode is None: return
+        print(api_secrets.get_all_secrets())
-
+
-    ## We expect to run in some mode now.
+    if args.mode is None: return
-
+
-    # Initialize shared API instance
+    ## We expect to run in some mode now.
-    twApi = TwAPI.instance = TwAPI()
+
-
+    # Initialize shared API instance
-    # Initialize talent account lists
+    twApi = TwAPI.instance = TwAPI()
-    talent_lists.init()
+
-
+    # Initialize talent account lists
-    ## TEST CODE ##
+    talent_lists.init()
-    cross_pairs = twApi.get_users_cross_tweets_mentions(1390620618001838086)
+
-    for pair in cross_pairs:
+    ## Determine running mode
-        print_tweet(pair)
+    match args.mode.lower():
-
+        case 'l' | 'listen':
-    ## Determine running mode
+            print('RUNNING IN LISTEN MODE\n')
-    match args.mode.lower():
+            listen.run()
-        case 'l' | 'listen':
+        case 'c' | 'catchup':
-            print('RUNNING IN LISTEN MODE\n')
+            print('RUNNING IN CATCH-UP MODE\n')
-            listen.run()
+            catchup.run()
-        case 'c' | 'catchup':
+        case _: 
-            print('RUNNING IN CATCH-UP MODE\n')
+            command_line()
-            catchup.run()
+            #TODO: remove message
-        case _:
+            print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
-            print('\ninvalid mode. run with no arguments or "-h" for help page, including mode list.')
+            return
-            return
+    
-    
+
-
+if __name__ == "__main__":
-if __name__ == "__main__":
+    main()
    main()
@@ -1,21 +1,23 @@
-import util
+import util
-
+
-niji_en = dict()
+niji_en = dict()
-holo_en = dict()
+holo_en = dict()
-
+talents = dict()
-def __create_dict(file, _dict):
+
-    with open(file, 'r') as f:
+def __create_dict(file, _dict):
-        for line in f:
+    global talents
-            words = line.split()
+    with open(file, 'r') as f:
-            if len(words) == 2 and line[0] != '#':
+        for line in f:
-                name, id = line.split()
+            words = line.split()
-                _dict[int(id)] = name
+            if len(words) == 2 and line[0] != '#':
-
+                name, id = line.split()
-def init():
+                _dict[int(id)] = name
-    global niji_en
+                talents[int(id)] = name
-    global holo_en
+def init():
-
+    global niji_en
-    # holoEN
+    global holo_en
-    __create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en)
+
-    # nijiEN
+    # holoEN
-    __create_dict(f'{util.get_project_dir()}/lists/nijien.txt', niji_en)
+    __create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en)
    # nijiEN
    __create_dict(f'{util.get_project_dir()}/lists/nijien.txt', niji_en)
@@ -0,0 +1,69 @@
 import platform
 import tweepy
 from api import *
 import talent_lists
 class TalentTweet:
    """A tweet together with the set of other accounts it involves.

    Attributes:
        tweet: the tweet object (as returned by the tweepy client).
        other_parties: set of user IDs computed by
            TwAPI.get_involved_parties, or supplied by the caller.
    """

    def __init__(self, tweet=None, other_parties=None, *, tweet_id=None):
        """Wrap an already-fetched tweet, or fetch one by its ID.

        Python keeps only the last ``__init__`` defined in a class body, so
        the two former constructors are merged here and both call shapes
        keep working:

        * ``TalentTweet(tweet=tweet, other_parties=mentions)`` wraps the
          given tweet object without any API request.
        * ``TalentTweet(some_id)`` or ``TalentTweet(tweet_id=some_id)``
          fetches the tweet through ``TwAPI.instance``.

        Args:
            tweet: a tweet object, or (when ``other_parties`` is omitted) an
                int/str tweet ID.
            other_parties: set of involved user IDs; defaults to an empty
                set when a tweet object is given without it.
            tweet_id: explicit tweet ID to fetch.

        Raises:
            TypeError: if neither a tweet nor an ID is given, or both are.
        """
        # A bare int/str in the first position is a tweet ID, not a tweet.
        if tweet_id is None and other_parties is None and isinstance(tweet, (int, str)):
            tweet_id, tweet = tweet, None

        if tweet_id is not None:
            if tweet is not None:
                raise TypeError('pass either a tweet object or a tweet ID, not both')
            resp = TwAPI.instance.client.get_tweet(tweet_id,
                media_fields=TwAPI.TWEET_MEDIA_FIELDS,
                tweet_fields=TwAPI.TWEET_FIELDS,
                expansions=TwAPI.TWEET_EXPANSIONS)
            self.tweet = resp.data
            self.other_parties = TwAPI.get_involved_parties(self.tweet, resp)
            return

        if tweet is None:
            raise TypeError('TalentTweet requires a tweet object or a tweet ID')
        self.tweet = tweet
        self.other_parties = set() if other_parties is None else other_parties

    def __repr__(self) -> str:
        """Return a multi-line, human-readable summary of the tweet."""
        return (
            f'{self.tweet.id} from {talent_lists.talents.get(self.tweet.author_id, "???")}:\n'
            f'{self.tweet.text}\n'
            f'------------------------------------------------------\n'
            f'{self.get_datetime_str()}\n'
            f'{self.get_mentions_usernames()}\n'
            f'Cross-company: {self.is_cross_company()}\n'
            f'======================================================'
        )

    def is_cross_company(self):
        """Return True if the author and any other party are in opposite lists.

        An author in talent_lists.niji_en is matched against
        talent_lists.holo_en and vice versa; any other author yields False.
        """
        author_id = self.tweet.author_id
        # TODO: update for EN/ID
        # The author's list does not change per mention, so pick the
        # opposing list once instead of re-checking inside the loop.
        if author_id in talent_lists.niji_en:
            opposing = talent_lists.holo_en
        elif author_id in talent_lists.holo_en:
            opposing = talent_lists.niji_en
        else:
            return False
        return any(party_id in opposing for party_id in self.other_parties)

    def get_mentions_usernames(self):
        """Return the other parties' names joined by ', ', or 'none' if empty.

        IDs missing from talent_lists.talents are shown as '???'.
        """
        if not self.other_parties:
            return 'none'
        return ', '.join(
            str(talent_lists.talents.get(party_id, "???"))
            for party_id in self.other_parties
        )

    def get_datetime_str(self):
        """Format the tweet's created_at timestamp for display."""
        # The flag that drops zero padding in strftime is '#' on Windows
        # and '-' on the other platforms.
        unpad = '#' if platform.system() == 'Windows' else '-'
        return self.tweet.created_at.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)')
 class TalentTweets:
    """Holder for a list named ttweets (presumably TalentTweet objects; confirm with callers)."""

    def __init__(self):
        # Starts out empty; nothing in this class fills it yet.
        self.ttweets = []

    def get_ttweets(self):
        """Placeholder: not implemented yet, returns None."""
        return None

    def get_ttweet_ids(self):
        """Placeholder: not implemented yet, returns None."""
        return None
@@ -1,40 +1,16 @@
-## Shared utility functions.
+## Shared utility functions.
-
+
-import os
+import os
-import talent_lists
+import talent_lists
-
+import talenttweet as tt
-# returns system path to this project, which is
+
-# up one level from this file's directory (src).
+# returns system path to this project, which is
-def get_project_dir():
+# up one level from this file's directory (src).
-    return os.path.join(os.path.dirname(__file__), os.pardir)
+def get_project_dir():
-
+    return os.path.join(os.path.dirname(__file__), os.pardir)
-# determine if tweet involves cross-company interaction
+
-def is_cross_company(pair: tuple):
+def tweet_id_to_url(id):
-    author_id, mentions = pair[0].author_id, pair[1]
+    return f'https://twitter.com/twitter/status/{id}'
-
+
-    for mention_id in mentions:
+def clamp(n, smallest, largest):
        if author_id in talent_lists.niji_en:
            if mention_id in talent_lists.holo_en:
                return True
        elif author_id in talent_lists.holo_en:
            if mention_id in talent_lists.niji_en:
                return True
    return False
 def tweet_id_to_url(id):
    """Build a twitter.com status URL for the given tweet ID."""
    status_url = f'https://twitter.com/twitter/status/{id}'
    return status_url
 def print_tweet(pair: tuple):
    """Print a debug dump of a (tweet, mentions) pair.

    The dump shows the tweet's id, created_at, text, entities and
    referenced_tweets, plus the mentions value, followed by a separator.
    """
    tweet, mentions = pair
    print(
        f'{tweet.id}: {tweet.created_at}: involves {mentions}\n'
        f'{tweet.text}\n'
        f'-----\n'
        f'{tweet.entities}\n'
        f'{tweet.referenced_tweets}\n'
        f'================================================='
    )
 def clamp(n, smallest, largest):
    """Limit n to the range [smallest, largest]."""
    # Cap at the upper bound first, then raise to the lower bound.
    capped = largest if largest < n else n
    return capped if capped > smallest else smallest