diff --git a/.gitignore b/.gitignore index a0e4b05..a6213d7 100644 --- a/.gitignore +++ b/.gitignore @@ -142,6 +142,10 @@ cython_debug/ # VS Code files .vscode -# project-specific (secret.ini: can't ignore existing file?) +# project-specific *.png -*.json \ No newline at end of file +*.json +queue.txt +_queue_backup.txt +finished_ttweets.txt +_current_ttweet.txt \ No newline at end of file diff --git a/src/catchup.py b/src/catchup.py index 79f5d5a..c7d6e0a 100644 --- a/src/catchup.py +++ b/src/catchup.py @@ -5,10 +5,7 @@ # We should post, at the fastest, one tweet per minute. import traceback -import datetime import asyncio -import shutil -from datetime import datetime from scraper import Scraper from util import * @@ -20,19 +17,20 @@ import ttweetqueue as ttq safe_to_post_tweets = True errored = False +scraper: Scraper + # Returns a list of sorted and filtered TalentTweets (should # be equivalent to queue.txt) async def get_cross_tweets_online(): global safe_to_post_tweets - - scraper = Scraper() - queue = ttq.TalentTweetQueue.instance + global queue + global scraper # Begin getting tweets from online print('Pulling tweets from online!') try: - for i, (talent_id, talent_username) in enumerate(talent_lists.talents.items()): - print(f'[{i+1}/{len(talent_lists.talents)}] {talent_username}-----------------------------------') + for i, (talent_id, talent_username) in enumerate(talents.items()): + print(f'[{i+1}/{len(talents)}] {talent_username}-----------------------------------') try: since_date = queue.finished_user_dates.get(talent_id, None) ttweets = scraper.get_cross_ttweets_from_user(talent_username, since_date=since_date) @@ -48,7 +46,7 @@ async def get_cross_tweets_online(): safe_to_post_tweets = False traceback.print_exc() else: - queue.finished_user_dates[talent_id] = util.get_current_date() + queue.finished_user_dates[talent_id] = get_current_date() queue.save_file() except KeyboardInterrupt: print('Interrupting tweet pulling... 
NOTE: remaining dates in queue file will not be updated!') @@ -65,13 +63,14 @@ async def get_cross_tweets_online(): # return True = we didn't post a single ttweet async def process_queue() -> bool: global errored + global scraper + global queue + + errored = False + queued_ttweets_count = queue.get_count() WAIT_TIME = 60*15 ttweets_posted = 0 - errored = False - - queue = ttq.TalentTweetQueue.instance - queued_ttweets_count = queue.get_count() if queued_ttweets_count == 0: print('Posting queue is empty!') @@ -106,9 +105,20 @@ async def process_queue() -> bool: async def run(PROGRAM_ARGS): global errored global safe_to_post_tweets + global scraper + global queue + scraper = Scraper() queue = ttq.TalentTweetQueue.instance + if PROGRAM_ARGS.refresh_queue: + PROGRAM_ARGS.refresh_queue = False + print('Refreshing queue tweets...') + for id in queue.ttweets_dict: + t = scraper.get_tweet(id, queue.ttweets_dict[id].author_id in privated_accounts) + queue.ttweets_dict[id] = tt.TalentTweet.create_from_tweety(t) + queue.save_file() + async def queue_loop(): while True: print(f'{queue.get_count()} cross-company tweets to attempt sharing.') @@ -134,6 +144,7 @@ async def run(PROGRAM_ARGS): await get_cross_tweets_online() if PROGRAM_ARGS.straight_to_queue: + PROGRAM_ARGS.straight_to_queue = False print('Processing queue first before pulling tweets...') return await queue_loop() else: diff --git a/src/main.py b/src/main.py index 09f2e56..74c219d 100644 --- a/src/main.py +++ b/src/main.py @@ -22,6 +22,7 @@ def init_argparse(): p = argparse.ArgumentParser(description='Twitter bot that follows interactions between Nijisanji EN/ID and hololive EN/ID members.', formatter_class=RawTextHelpFormatter) p.add_argument('mode', nargs='?', help=MODES_HELP_STR) p.add_argument('--no-listen', action='store_true', help='Run one scraping-posting cycle without waiting to run again.') + p.add_argument('--refresh-queue', action='store_true', help='Refresh the details on each tweet currently in 
queue.') p.add_argument('--straight-to-queue', action='store_true', help='Go through queue first before attempting to pull tweets.') return p @@ -46,6 +47,16 @@ async def async_main(): else: print('\nunknown mode. run with no arguments or -h for help and modes') +def init_data(): + # Initialize shared API instance + TwAPI() + + # Initialize talent account lists + talent_lists.init() + + # Initialize queue files system + ttq.TalentTweetQueue() + def main(): global PROGRAM_ARGS @@ -56,14 +67,7 @@ def main(): PROGRAM_ARGS = parser.parse_args() - # Initialize shared API instance - TwAPI() - - # Initialize talent account lists - talent_lists.init() - - # Initialize queue files system - ttq.TalentTweetQueue() + init_data() ## Asynchronous execution nest_asyncio.apply() diff --git a/src/scraper.py b/src/scraper.py index 7a1f8f8..29cd534 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -40,6 +40,65 @@ class Scraper: return True print('exhausted all accounts!') return False + + def login_wait(self, private=False): + if private: + print(f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 2 minutes...") + sleep(120) + print() + l = self.try_login(0) + else: + l = self.try_login() + if not l: + print("sleeping for 2 minutes...") + sleep(120) + print() + self.try_login() + + # recover lost info + def fix_tweet(self, tweet: Tweet): + if tweet.is_retweet: + if tweet.retweeted_tweet is None: + print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...') + # tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet + tweet.is_retweet = False + elif tweet.retweeted_tweet.author is None: + print(f'WARNING: {tweet.author.username}/{tweet.id} is missing the RT author! Recovering details...') + tweet.retweeted_tweet = self.get_tweet(tweet.retweeted_tweet.id) + + if tweet.is_quoted: + if tweet.quoted_tweet is None: # quoted tweet is deleted + # print(f'{tweet.author.username}/{tweet.id} is missing the QRT! 
Recovering...') + # tweet.quoted_tweet = self.app.tweet_detail(str(tweet.id)).quoted_tweet + tweet.is_quoted = False + elif tweet.quoted_tweet.author is None: + print(f'WARNING: {tweet.author.username}/{tweet.id} is missing the QRT author! Recovering details...') + tweet.quoted_tweet = self.get_tweet(tweet.quoted_tweet.id) + + if tweet.is_reply and tweet.replied_to is None: + print('missing reply-to tweet. recovering...') + tweet.replied_to = self.get_tweet(tweet.original_tweet['in_reply_to_status_id_str']) + return tweet + + def get_tweet(self, id: int, private_user=False): + print(f'{id}{" on private" if private_user else ""}') + if private_user: + self.try_login(0) + while True: + try: + t = self.app.tweet_detail(str(id)) + return self.fix_tweet(t) if t is not None else None + except UnknownError: + print("UnknownError occurred, probably rate-limited") + self.login_wait(private_user) + except Exception as e: + if private_user: + print("Unknown exception occurred, tweet is probably unavailable") + print(e.with_traceback()) + return None + else: + print("Unknown exception occurred, trying again as private...") + self.get_tweet(id, True) # since MUST BE TIMEZONE AWARE # usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc) @@ -66,28 +125,8 @@ class Scraper: print(f"skipping malformed tweet: {tweet}") return - # recover lost info - if tweet.is_retweet: - if tweet.retweeted_tweet is None: - print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...') - # tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet - tweet.is_retweet = False - elif tweet.retweeted_tweet.author is None: - print(f'WARNING: {tweet.author.username}/{tweet.id} is missing the RT author! Recovering details...') - tweet.retweeted_tweet = self.app.tweet_detail(tweet.retweeted_tweet.id) - - if tweet.is_quoted: - if tweet.quoted_tweet is None: # quoted tweet is deleted - # print(f'{tweet.author.username}/{tweet.id} is missing the QRT! 
Recovering...') - # tweet.quoted_tweet = self.app.tweet_detail(str(tweet.id)).quoted_tweet - tweet.is_quoted = False - elif tweet.quoted_tweet.author is None: - print(f'WARNING: {tweet.author.username}/{tweet.id} is missing the QRT author! Recovering details...') - tweet.quoted_tweet = self.app.tweet_detail(tweet.quoted_tweet.id) - - # fix reply if it exists - # if tweet.is_reply and tweet.replied_to is None: - # tweet.replied_to = self.app.tweet_detail(tweet.original_tweet['in_reply_to_status_id_str']) + tweet = self.fix_tweet(self, tweet) + tweets.append(tweet) if not reached_backdate and int(tweet.author.id) == uid and tweet.date <= since: @@ -118,18 +157,7 @@ class Scraper: cur = search.cursor except UnknownError: print("UnknownError occurred, probably rate-limited") - if uid in talent_lists.privated_accounts: - print("sticking pvt-accessible account. sleeping for 2 minutes...") - sleep(120) - print() - l = self.try_login(0) - else: - l = self.try_login() - if not l: - print("sleeping for 2 minutes...") - sleep(120) - print() - self.try_login() + self.login_wait(uid in talent_lists.privated_accounts) tweets.sort(key=lambda t: t.id) return tweets @@ -153,4 +181,4 @@ if __name__== '__main__': talent_lists.init() s = Scraper() ttweets = s.get_cross_ttweets_from_user("pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)) - print("\n".join([x.__repr__() for x in ttweets])) \ No newline at end of file + print("\n".join([x.__repr__() for x in ttweets])) diff --git a/src/talent_lists.py b/src/talent_lists.py index dbe9f35..97a73cb 100644 --- a/src/talent_lists.py +++ b/src/talent_lists.py @@ -1,4 +1,4 @@ -import util +from util import get_project_dir holo_en: dict[int, str] = dict() holo_id: dict[int, str] = dict() @@ -34,13 +34,13 @@ def init(): global test_talents # holoEN - __create_dict(f'{util.get_project_dir()}/lists/holoen.txt', holo_en, 'holoEN') + __create_dict(f'{get_project_dir()}/lists/holoen.txt', holo_en, 'holoEN') # holoID - 
__create_dict(f'{util.get_project_dir()}/lists/holoid.txt', holo_id, 'holoID') + __create_dict(f'{get_project_dir()}/lists/holoid.txt', holo_id, 'holoID') # nijiEN - __create_dict(f'{util.get_project_dir()}/lists/nijien.txt', niji_en, 'nijiEN') + __create_dict(f'{get_project_dir()}/lists/nijien.txt', niji_en, 'nijiEN') # nijiexID - __create_dict(f'{util.get_project_dir()}/lists/nijiexid.txt', niji_exid, 'nijiex\'ID') + __create_dict(f'{get_project_dir()}/lists/nijiexid.txt', niji_exid, 'nijiex\'ID') # TODO: nijiex-KR test_talents = holo_en diff --git a/src/talenttweet.py b/src/talenttweet.py index 8cf228e..91f1b39 100644 --- a/src/talenttweet.py +++ b/src/talenttweet.py @@ -1,5 +1,4 @@ from datetime import datetime -from zoneinfo import ZoneInfo import platform import pytz @@ -85,7 +84,7 @@ class TalentTweet: def create_from_tweety(tweety: Tweet): if tweety.is_retweet: rtm = [int(x.id) for x in tweety.retweeted_tweet.user_mentions] - elif tweety.is_quoted: + elif tweety.quoted_tweet: rtm = [int(x.id) for x in tweety.quoted_tweet.user_mentions] else: rtm = list() @@ -132,6 +131,9 @@ class TalentTweet: except: pass try: self.all_parties.remove(self.author_id) except: pass + + if not self.is_cross_company(): + print(f'WARNING: {self.tweet_id} is not cross-company!') def __repr__(self) -> str: @@ -169,7 +171,7 @@ class TalentTweet: def get_datetime_str(self): unpad = '#' if platform.system() == 'Windows' else '-' - return self.date_time.strftime(f'%b %{unpad}d %Y, %{unpad}I:%M%p (%Z)') + return self.date_time.strftime(f'%{unpad}I:%M%p (%Z) · %b %{unpad}d, %Y') def announce_text(self): # templates diff --git a/src/ttweetqueue.py b/src/ttweetqueue.py index 7e51391..7880e5d 100644 --- a/src/ttweetqueue.py +++ b/src/ttweetqueue.py @@ -20,9 +20,9 @@ class TalentTweetQueue: self.finished_ttweets_path = f'{util.get_project_dir()}/finished_ttweets.txt' self.is_good = True self.__sorted = False - self.finished_user_dates = dict() - self.ttweets_dict = dict() -
self.finished_ttweets = list() + self.finished_user_dates: dict[int, str] = dict() + self.ttweets_dict: dict[int, tt.TalentTweet] = dict() + self.finished_ttweets: list[int] = list() ## file check, backup copy if os.path.exists(self.queue_backup_path): @@ -62,6 +62,14 @@ class TalentTweetQueue: except: traceback.print_exc() pass + # unfinished ttweet + if os.path.exists(self.current_ttweet_path): + with open(self.current_ttweet_path, 'r') as f: + for line in f: + if len(line) > 0: + ttweet = tt.TalentTweet.deserialize(line) + if ttweet.tweet_id in self.ttweets_dict: + self.ttweets_dict[ttweet.tweet_id] = ttweet # finished ttweets try: with open(self.finished_ttweets_path, 'r') as f: @@ -82,13 +90,6 @@ class TalentTweetQueue: def get_next_ttweet(self): self.is_good = False - if os.path.exists(self.current_ttweet_path): - with open(self.current_ttweet_path, 'r') as f: - ttweet = tt.TalentTweet.deserialize(f.readline()) - if ttweet.tweet_id in self.ttweets_dict: - self.ttweets_dict.pop(ttweet.tweet_id) - return ttweet - self.__sort_ttweets_dict() key = list(self.ttweets_dict.keys())[0] ttweet = self.ttweets_dict.pop(key) @@ -112,7 +113,7 @@ class TalentTweetQueue: # overwrite queue.txt def save_file(self): - print('saving file...', end='') + print('saving queue files...', end='') shutil.copyfile(self.queue_path, self.queue_backup_path) self.__sort_ttweets_dict() with open(self.queue_path, 'w') as f: diff --git a/src/twapi.py b/src/twapi.py index 712a787..ee4f82b 100644 --- a/src/twapi.py +++ b/src/twapi.py @@ -7,7 +7,6 @@ import tweepy import talenttweet as tt import talent_lists as tl -import ttweetqueue as ttq import util class TwAPI: diff --git a/src/util.py b/src/util.py index a419371..02e381f 100644 --- a/src/util.py +++ b/src/util.py @@ -6,11 +6,9 @@ import traceback from datetime import datetime from dotenv import dotenv_values -import tweepy import pytz -import twint -import twapi from tweetcapture import TweetCapture +import tweepy from recrop import 
fix_aspect_ratio import talent_lists @@ -52,6 +50,7 @@ def get_key_from_value(d: dict, val): return keys[0] return None +# FIXME: web_auth_token under rate-limitation will fail to screenshot async def create_ttweet_image(ttweet): tc = TweetCapture() tc.cookies = [{'name': 'auth_token', 'value': dotenv_values()['web_auth_token']}] @@ -100,6 +99,7 @@ def get_username_local(id: int): # Retrieve username via API v2 (tweepy) def get_username_online(id, default=None): + import twapi try: resp = twapi.TwAPI.instance.client.get_user(id=id) return resp.data.username