From ab3a8fcc896c28fc9ab466a7a994c170ac1b0f3f Mon Sep 17 00:00:00 2001 From: muskit <15199219+muskit@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:29:01 -0800 Subject: [PATCH] move working files into its own directory --- .gitignore | 8 +--- README.md | 2 +- src/catchup.py | 103 +++++++++++++++++++++++++------------------- src/talent_lists.py | 31 +++++++------ src/ttweetqueue.py | 93 +++++++++++++++++++++------------------ src/util.py | 68 ++++++++++++++++++----------- 6 files changed, 176 insertions(+), 129 deletions(-) diff --git a/.gitignore b/.gitignore index a6213d7..bf619a7 100644 --- a/.gitignore +++ b/.gitignore @@ -143,9 +143,5 @@ cython_debug/ .vscode # project-specific -*.png -*.json -queue.txt -_queue_backup.txt -finished_ttweets.txt -_current_ttweet.txt \ No newline at end of file +run/ +*.json \ No newline at end of file diff --git a/README.md b/README.md index ba3dbc3..0b961c2 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ This is the authentication token obtained from a browser when signed in on the T ``` web_auth_token= ``` -### Example contents of `.env` without values +### Example `.env` without values ``` scraper_username0= scraper_password0= diff --git a/src/catchup.py b/src/catchup.py index ed4185c..ce78e1d 100644 --- a/src/catchup.py +++ b/src/catchup.py @@ -19,6 +19,7 @@ PROGRAM_ARGS = None safe_to_post_tweets = True scraper: Scraper + # Updates TTweetQueue async def get_cross_tweets_online(): global safe_to_post_tweets @@ -26,91 +27,101 @@ async def get_cross_tweets_online(): global scraper safe_to_post_tweets = True - dbg_curr_user = '' + dbg_curr_user = "" # Begin getting tweets from online - print('Pulling tweets from online!') + print("Pulling tweets from online!") try: for i, (talent_id, talent_username) in enumerate(talents.items()): - print(f'[{i+1}/{len(talents)}] {talent_username}-----------------------------------') - dbg_curr_user = f'{talent_id}: {talent_username}' + print( + f"[{i+1}/{len(talents)}] {talent_username}-----------------------------------" + ) + dbg_curr_user = f"{talent_id}: {talent_username}" try: since_date = queue.finished_user_dates.get(talent_id, None) - ttweets = scraper.get_cross_ttweets_from_user(talent_username, since_date=since_date) - print(f'got {len(ttweets)} TalentTweets') + ttweets = scraper.get_cross_ttweets_from_user( + talent_username, since_date=since_date + ) + print(f"got {len(ttweets)} TalentTweets") for ttweet in ttweets: - if ttweet.tweet_id not in queue.finished_ttweets \ - and ttweet.is_cross_company(): + if ( + ttweet.tweet_id not in queue.finished_ttweets + and ttweet.is_cross_company() + ): queue.add_ttweet(ttweet) except KeyboardInterrupt as e: raise e except Exception as e: - print('Unhandled error occurred processing tweet data.') + print("Unhandled error occurred processing tweet data.") safe_to_post_tweets = False raise e else: queue.finished_user_dates[talent_id] = get_current_date() queue.save_file() except KeyboardInterrupt as e: - print('Interrupting tweet pulling... NOTE: remaining dates in queue file will not be updated!') + print( + "Interrupting tweet pulling... NOTE: remaining dates in queue file will not be updated!" + ) queue.save_file() raise e except: - print('Unhandled error occurred while pulling tweets.') + print("Unhandled error occurred while pulling tweets.") traceback.print_exc() - with open("error_catchup.txt", "a") as f: - f.write(f'Error getting tweets from user {dbg_curr_user}\n') + with open(os.path.join(working_path(), "error_catchup.txt"), "a") as f: + f.write(f"Error getting tweets from user {dbg_curr_user}\n") traceback.print_exc(file=f) safe_to_post_tweets = False else: - print('Successfully saved all tweets from online!') + print("Successfully saved all tweets from online!") queue.save_file() + # return False = we posted at least one ttweet # return True = we didn't post a single ttweet async def process_queue() -> bool: - ''' + """ Go through the queue and post stored TalentTweets. - ''' + """ global scraper global queue queued_ttweets_count = queue.get_count() - - WAIT_TIME = 60*15 + + WAIT_TIME = 60 * 15 ttweets_posted = 0 if queued_ttweets_count == 0: - print('Posting queue is empty!') + print("Posting queue is empty!") return True - + try: while not queue.is_empty(): ttweet = queue.get_next_ttweet() if ttweet.tweet_id in queue.finished_ttweets: - print('skipping finished tweet...') + print("skipping finished tweet...") queue.good(ttweet.tweet_id) continue tweet_was_successful = await TwAPI.instance.post_ttweet(ttweet) - - print('running queue.good()...') + + print("running queue.good()...") queue.good(ttweet.tweet_id) if tweet_was_successful: ttweets_posted += 1 - print(f'({ttweets_posted}/{queued_ttweets_count}) done') + print(f"({ttweets_posted}/{queued_ttweets_count}) done") if not queue.is_empty(): - print(f'resting for {WAIT_TIME}s...') - await asyncio.sleep(WAIT_TIME-5) - print('5 second warning!') + print(f"resting for {WAIT_TIME}s...") + await asyncio.sleep(WAIT_TIME - 5) + print("5 second warning!") await asyncio.sleep(5) except Exception as e: - print('Unhandled error occurred while posting tweets from queue.') + print("Unhandled error occurred while posting tweets from queue.") traceback.print_exc() if ttweets_posted > 0: return False return True + # return True = no problems # return False = issue occurred where we couldn't post all past tweets properly async def run(PROGRAM_ARGS): @@ -124,63 +135,67 @@ async def run(PROGRAM_ARGS): # post tweets given in command line first if PROGRAM_ARGS.post_id is not None and len(PROGRAM_ARGS.post_id) > 0: PROGRAM_ARGS.post_id.sort() - print('Posting specified tweets first.') + print("Posting specified tweets first.") for id in PROGRAM_ARGS.post_id: try: i = int(id) except ValueError: - print(f'Invalid tweet {id}!') + print(f"Invalid tweet {id}!") continue - + posted = await TwAPI.instance.post_ttweet_by_id(i) if posted: queue.add_finished_tweet(i) - print('Successfully posted tweet. Sleeping for 5 minutes') - await asyncio.sleep(60*5) + print("Successfully posted tweet. Sleeping for 5 minutes") + await asyncio.sleep(60 * 5) else: - print('Did not post tweet') - print('Done processing specified tweets') + print("Did not post tweet") + print("Done processing specified tweets") PROGRAM_ARGS.post_id = None # refresh stored queue first if PROGRAM_ARGS.refresh_queue: PROGRAM_ARGS.refresh_queue = False - print('Refreshing queue tweets...') + print("Refreshing queue tweets...") for id in queue.ttweets_dict: - t = scraper.get_tweet(id, queue.ttweets_dict[id].author_id in privated_accounts) + t = scraper.get_tweet( + id, queue.ttweets_dict[id].author_id in privated_accounts + ) queue.ttweets_dict[id] = tt.TalentTweet.create_from_tweety(t) queue.save_file() async def queue_loop(): while True: - print(f'{queue.get_count()} cross-company tweets to announce.') + print(f"{queue.get_count()} cross-company tweets to announce.") try: if safe_to_post_tweets: if await process_queue(): print("Finished processing queue") return else: - print('Posted no new tweets; we\'re caught up!') + print("Posted no new tweets; we're caught up!") return else: - print('Tweets were not retrieved cleanly. Not processing queue.') + print("Tweets were not retrieved cleanly. Not processing queue.") return except KeyboardInterrupt as e: - print('Interrupting queue processing...') + print("Interrupting queue processing...") raise e except: - print('Unhandled error occurred while running catch up in posting phase.') + print( + "Unhandled error occurred while running catch up in posting phase." + ) traceback.print_exc() await get_cross_tweets_online() try: if PROGRAM_ARGS.straight_to_queue: PROGRAM_ARGS.straight_to_queue = False - print('Processing queue first before fetching tweets...') + print("Processing queue first before fetching tweets...") await queue_loop() else: await get_cross_tweets_online() await queue_loop() except KeyboardInterrupt: - print('Interrupt received. Ending catchup mode...') + print("Interrupt received. Ending catchup mode...") return False diff --git a/src/talent_lists.py b/src/talent_lists.py index 97a73cb..7b84f6f 100644 --- a/src/talent_lists.py +++ b/src/talent_lists.py @@ -1,4 +1,4 @@ -from util import get_project_dir +from util import project_root holo_en: dict[int, str] = dict() holo_id: dict[int, str] = dict() @@ -10,22 +10,25 @@ privated_accounts: dict[int, str] = dict() test_talents = dict() + # TODO: talents(id) -> (name, company) def __create_dict(file, _dict, company): - print(f'Initializing talents\' account list from {file}...') + print(f"Initializing talents' account list from {file}...") global talents - with open(file, 'r') as f: + with open(file, "r") as f: for line in f: words = line.split() - if len(words) >= 2 and line[0] != '#': + if len(words) >= 2 and line[0] != "#": t = line.split() id, name = int(t[0]), t[1] # name = f'{util.get_username_online(id, default=name)}' # attempt to get updated name talents[id] = name _dict[id] = name talents_company[id] = company - if len(words) > 2 and words[2] == 'p': + if len(words) > 2 and words[2] == "p": privated_accounts[id] = name + + def init(): global holo_en global holo_id @@ -34,26 +37,30 @@ def init(): global test_talents # holoEN - __create_dict(f'{get_project_dir()}/lists/holoen.txt', holo_en, 'holoEN') + __create_dict(f"{project_root()}/lists/holoen.txt", holo_en, "holoEN") # holoID - __create_dict(f'{get_project_dir()}/lists/holoid.txt', holo_id, 'holoID') + __create_dict(f"{project_root()}/lists/holoid.txt", holo_id, "holoID") # nijiEN - __create_dict(f'{get_project_dir()}/lists/nijien.txt', niji_en, 'nijiEN') + __create_dict(f"{project_root()}/lists/nijien.txt", niji_en, "nijiEN") # nijiexID - __create_dict(f'{get_project_dir()}/lists/nijiexid.txt', niji_exid, 'nijiex\'ID') + __create_dict(f"{project_root()}/lists/nijiexid.txt", niji_exid, "nijiex'ID") # TODO: nijiex-KR test_talents = holo_en + def is_niji(id: int) -> bool: return id in niji_en or id in niji_exid + def is_holo(id: int) -> bool: return id in holo_en or id in holo_id + def is_cross_company(id1: int, id2: int): return (is_niji(id1) and is_holo(id2)) or (is_holo(id1) and is_niji(id2)) + # For filtered stream # DEPRECATED: thx elon def get_twitter_rules(): @@ -61,12 +68,12 @@ def get_twitter_rules(): rules = list() names = list(talents.values()) - curr_rule = f'from:{names}' + curr_rule = f"from:{names}" for name in list(talents.values())[1:]: - test_rule = curr_rule + f' OR from:{name}' + test_rule = curr_rule + f" OR from:{name}" if len(test_rule) > 512: rules.append(curr_rule) - curr_rule = f'from:{name}' + curr_rule = f"from:{name}" else: curr_rule = test_rule rules.append(curr_rule) diff --git a/src/ttweetqueue.py b/src/ttweetqueue.py index 7987bdb..5151545 100644 --- a/src/ttweetqueue.py +++ b/src/ttweetqueue.py @@ -9,15 +9,20 @@ import talenttweet as tt # User timestamps line format: # {user_id} {status_num} {UNIX_timestamp} + class TalentTweetQueue: instance = None - + def __init__(self): TalentTweetQueue.instance = self - self.queue_path = util.get_queue_path() - self.queue_backup_path = util.get_queue_backup_path() - self.current_ttweet_path = f'{util.get_project_dir()}/_current_ttweet.txt' - self.finished_ttweets_path = f'{util.get_project_dir()}/finished_ttweets.txt' + self.queue_path = os.path.join(util.working_path(), "queue.txt") + self.queue_backup_path = os.path.join(util.working_path(), "_queue_backup.txt") + self.current_ttweet_path = os.path.join( + util.working_path(), "_current_ttweet.txt" + ) + self.finished_ttweets_path = os.path.join( + util.working_path(), "finished_ttweets.txt" + ) self.is_good = True self.__sorted = False self.finished_user_dates: dict[int, str] = dict() @@ -26,58 +31,62 @@ class TalentTweetQueue: ## file check, backup copy if os.path.exists(self.queue_backup_path): - print('Found backup queue! We errored in the previous run.') + print("Found backup queue! We errored in the previous run.") shutil.copyfile(self.queue_backup_path, self.queue_path) elif os.path.exists(self.queue_path): - print('Creating backup queue...') + print("Creating backup queue...") shutil.copyfile(self.queue_path, self.queue_backup_path) ## initialize structures # user timestamps try: - with open(self.queue_path, 'r') as f: + with open(self.queue_path, "r") as f: for line in f: tokens = line.split() - if len(tokens) == 0: continue + if len(tokens) == 0: + continue - if tokens[0][0] != '#': - print(f'Stopped finding user dates at {line}') + if tokens[0][0] != "#": + print(f"Stopped finding user dates at {line}") # reached end of accounts list break - if tokens[2] != '-1': + if tokens[2] != "-1": self.finished_user_dates[int(tokens[1])] = tokens[2] - except: pass + except: + pass # ttweets try: - with open(self.queue_path, 'r') as f: # reset seek head + with open(self.queue_path, "r") as f: # reset seek head # Get existing queued TalentTweets for line in f: tokens = line.split() - if len(tokens) == 0 or tokens[0][0] == '#': + if len(tokens) == 0 or tokens[0][0] == "#": continue ttweet = tt.TalentTweet.deserialize(line) # print(f'{ttweet.tweet_id}:\n{ttweet}') self.ttweets_dict[ttweet.tweet_id] = ttweet - print(f'Found {len(self.finished_user_dates)} scraped accounts and {len(self.ttweets_dict)} tweets in queue.') + print( + f"Found {len(self.finished_user_dates)} scraped accounts and {len(self.ttweets_dict)} tweets in queue." + ) except: traceback.print_exc() pass # unfinished ttweet if os.path.exists(self.current_ttweet_path): - with open(self.current_ttweet_path, 'r') as f: + with open(self.current_ttweet_path, "r") as f: for line in f: if len(line) > 0: ttweet = tt.TalentTweet.deserialize(line) if ttweet.tweet_id in self.ttweets_dict: self.ttweets_dict[ttweet.tweet_id] = ttweet - print(f'adding unfinished tweet {ttweet.tweet_id}') + print(f"adding unfinished tweet {ttweet.tweet_id}") # finished ttweets try: - with open(self.finished_ttweets_path, 'r') as f: + with open(self.finished_ttweets_path, "r") as f: for line in f: self.finished_ttweets.add(int(line)) - except: pass - + except: + pass def is_empty(self): return self.get_count() <= 0 @@ -94,57 +103,59 @@ class TalentTweetQueue: self.__sort_ttweets_dict() key = list(self.ttweets_dict.keys())[0] ttweet = self.ttweets_dict.pop(key) - with open(self.current_ttweet_path, 'w') as f: + with open(self.current_ttweet_path, "w") as f: f.write(ttweet.serialize()) return ttweet - + def get_count(self): return len(self.ttweets_dict) - + ## Call when the TalentTweet retrieved from get_next_ttweet() was # posted successfully. def good(self, tweet_id: int): - try: os.remove(self.current_ttweet_path) - except: pass + try: + os.remove(self.current_ttweet_path) + except: + pass self.add_finished_tweet(tweet_id) self.save_file() self.is_good = True - + # overwrite queue.txt def save_file(self, replace_backup=True): - print('saving queue...', end='') + print("saving queue...", end="") if replace_backup: - print('overwriting backup...', end='') + print("overwriting backup...", end="") shutil.copyfile(self.queue_path, self.queue_backup_path) self.__sort_ttweets_dict() - with open(self.queue_path, 'w') as f: + with open(self.queue_path, "w") as f: # write dates - for (id, date) in self.finished_user_dates.items(): - f.write(f'# {id} {date}\n') + for id, date in self.finished_user_dates.items(): + f.write(f"# {id} {date}\n") - f.write('\n') + f.write("\n") # write sorted ttweets for ttweet in self.ttweets_dict.values(): - f.write(ttweet.serialize() + '\n') - print('done') + f.write(ttweet.serialize() + "\n") + print("done") def add_finished_tweet(self, id): self.finished_ttweets.add(id) - with open(self.finished_ttweets_path, 'a') as f: - f.write(f'{id}\n') - + with open(self.finished_ttweets_path, "a") as f: + f.write(f"{id}\n") + def __sort_ttweets_dict(self): if not self.__sorted: self.ttweets_dict = dict(sorted(self.ttweets_dict.items())) self.__sorted = True - + # destructor def __del__(self): if self.is_good: - print('Ended in good state, deleting backup queue...') + print("Ended in good state, deleting backup queue...") os.remove(self.queue_backup_path) else: - print('Ended in bad state, keeping backup queue.') \ No newline at end of file + print("Ended in bad state, keeping backup queue.") diff --git a/src/util.py b/src/util.py index 5c8a17e..e21cece 100644 --- a/src/util.py +++ b/src/util.py @@ -13,36 +13,45 @@ import tweepy from recrop import fix_aspect_ratio import talent_lists -# returns system path to this project, which is -# up one level from this file's directory (effective path: ..../src/../). -def get_project_dir(): + +def project_root(): + """Returns the project root.""" return os.path.join(os.path.dirname(__file__), os.pardir) -def get_queue_path(): - return f'{get_project_dir()}/queue.txt' -def get_queue_backup_path(): - return f'{get_project_dir()}/_queue_backup.txt' +def working_path(): + """Returns the working ephemeral directory.""" + path = os.path.join(os.path.join(project_root(), "run")) + if not os.path.isdir(path): + os.mkdir(path) + return path + def clamp(n, smallest, largest): return max(smallest, min(n, largest)) + def datetime_to_tdate(date_time: datetime): return date_time.strftime("%Y-%m-%d") + def tdate_to_datetime(tdate: str): return datetime.strptime("%Y-%m-%d") + def timestamp_to_tdate(timestamp=None): - if timestamp==None: + if timestamp == None: timestamp = datetime.now().timestamp() return datetime_to_tdate(datetime.fromtimestamp(timestamp, tz=pytz.utc)) + def get_current_timestamp(): return datetime.now().timestamp() + def get_current_date(): - return datetime.today().strftime('%Y-%m-%d') + return datetime.today().strftime("%Y-%m-%d") + def get_key_from_value(d: dict, val): keys = [k for k, v in d.items() if v == val] @@ -50,38 +59,43 @@ def get_key_from_value(d: dict, val): return keys[0] return None + # FIXME: web_auth_token under rate-limitation will fail to screenshot async def create_ttweet_image(ttweet): tc = TweetCapture() - auth_token = dotenv_values().get('web_auth_token') + auth_token = dotenv_values().get("web_auth_token") if auth_token: - tc.cookies = [{'name': 'auth_token', 'value': auth_token}] - if 'linux' in sys.platform: + tc.cookies = [{"name": "auth_token", "value": auth_token}] + if "linux" in sys.platform: # Linux chromedriver path - tc.driver_path = '/usr/bin/chromedriver' - filename = f'{get_project_dir()}/img.png' + tc.driver_path = "/usr/bin/chromedriver" + filename = f"{working_path()}/img.png" img = None - try: os.remove(filename) - except: pass + try: + os.remove(filename) + except: + pass try: img = await tc.screenshot( url=ttweet.url(), path=filename, mode=4, night_mode=1, - show_parent_tweets=True + show_parent_tweets=True, ) img = fix_aspect_ratio(img) except: - print('unable to create tweet image') + print("unable to create tweet image") traceback.print_exc() return None - - print(f'successfully saved {img}') + + print(f"successfully saved {img}") return img + def get_tweet_url(id, username): - return f'https://www.twitter.com/{username}/status/{id}' + return f"https://www.twitter.com/{username}/status/{id}" + ## Attempt to pull username from local; pull from online if doesn't exist. def get_username(id): @@ -90,22 +104,26 @@ def get_username(id): return get_username_online(id) return ret + def get_username_with_company(id): company = talent_lists.talents_company.get(id, None) return f'{get_username(id)} {f"({company})" if company is not None else ""}' + def get_username_local(id: int): - return talent_lists.talents.get(id, f'{id}') + return talent_lists.talents.get(id, f"{id}") + # Retrieve username via API v2 (tweepy) def get_username_online(id, default=None): import twapi + try: resp = twapi.TwAPI.instance.client.get_user(id=id) return resp.data.username except tweepy.TooManyRequests: - return str(default) if default is not None else f'id:{id}' + return str(default) if default is not None else f"id:{id}" except: - print(f'Unhandled error retrieving username for {id}!') + print(f"Unhandled error retrieving username for {id}!") traceback.print_exc() - return str(default) if default is not None else f'id:{id}' \ No newline at end of file + return str(default) if default is not None else f"id:{id}"