add print to indicate each cross tweet found

2024-01-25 18:45:53 -08:00
parent ecce333a05
commit 81eea91d02
3 changed files with 185 additions and 164 deletions
@@ -24,7 +24,7 @@ The bot may run in these modes:
 * Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter)
 ## `.env`
-These need to be defined in a `.env` file at the project root (outside of `src`):
+These need to be defined in a `.env` file in the `run` ephemeral directory.
 ### Scraper Credentials
 To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format:
@@ -41,13 +41,12 @@ async def get_cross_tweets_online():
                ttweets = scraper.get_cross_ttweets_from_user(
                    talent_username, since_date=since_date
                )
-                print(f"got {len(ttweets)} TalentTweets")
+                added_to_queue = 0
                for ttweet in ttweets:
-                    if (
+                    if ttweet.tweet_id not in queue.finished_ttweets:
-                        ttweet.tweet_id not in queue.finished_ttweets
+                        added_to_queue += 1
                        and ttweet.is_cross_company()
                    ):
                        queue.add_ttweet(ttweet)
                print(f"Enqueued {added_to_queue}/{len(ttweets)} tweets.")
            except KeyboardInterrupt as e:
                raise e
            except Exception as e:
@@ -14,6 +14,7 @@ from tweety_utils import *
 from talenttweet import *
 import talent_lists
 class Scraper:
    def __init__(self):
        Scraper.instance = self
@@ -40,12 +41,14 @@ class Scraper:
            else:
                self.app.sign_in(*acc)
            return True
-		print('exhausted all accounts!')
+        print("exhausted all accounts!")
        return False
    def login_wait(self, private=False):
        if private:
-			print(f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes...")
+            print(
                f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes..."
            )
            sleep(240)
            print()
            l = self.try_login(0)
@@ -77,7 +80,9 @@ class Scraper:
        if tweet.is_reply and tweet.replied_to is None:
            # print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...')
-			tweet.replied_to = self.get_tweet(tweet.original_tweet['in_reply_to_status_id_str'])
+            tweet.replied_to = self.get_tweet(
                tweet.original_tweet["in_reply_to_status_id_str"]
            )
        return tweet
    def get_tweet(self, id: int, private_user=False):
@@ -93,29 +98,33 @@ class Scraper:
                self.login_wait(private_user)
            except UnknownError:
                print("UnknownError occurred, probably rate-limited")
-				#traceback.print_exc()
+                # traceback.print_exc()
                self.login_wait(private_user)
            except Exception as e:
                if not private_user:
                    print("Unhandled exception occurred, trying again as private...")
                    return self.get_tweet(id, True)
                else:
-					print(f"Unhandled exception occurred, tweet {id} is probably unavailable")
+                    print(
                        f"Unhandled exception occurred, tweet {id} is probably unavailable"
                    )
                    print(e)
                    return None
    # since MUST BE TIMEZONE AWARE
    # usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
-	def get_tweets_from_user(self, username: str, since: datetime = None) -> list[Tweet]:
+    def get_tweets_from_user(
        self, username: str, since: datetime = None
    ) -> list[Tweet]:
        reached_backdate = False
        tweets: list[Tweet] = []
        cur = None
        if since == None:
            since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
-			print(f'falling back to grabbing tweets since 7 days ago ({since.date()})')
+            print(f"falling back to grabbing tweets since 7 days ago ({since.date()})")
        else:
-			print(f'grabbing tweets since {since.date()}')
+            print(f"grabbing tweets since {since.date()}")
        uid = self.app._get_user_id(username)
        print(f"{username} = {uid}")
@@ -133,7 +142,11 @@ class Scraper:
            tweets.append(tweet)
-			if not reached_backdate and int(tweet.author.id) == uid and tweet.date <= since:
+            if (
                not reached_backdate
                and int(tweet.author.id) == uid
                and tweet.date <= since
            ):
                print("reached backdate")
                reached_backdate = True
@@ -143,11 +156,14 @@ class Scraper:
        while not reached_backdate:
            try:
                # uts = self.app.get_tweets(uid, replies=True, cursor=cur)
-				search = self.app.search(f'from:{username}', filter_=SearchFilters.Latest(), cursor=cur)
+                search = self.app.search(
                    f"from:{username}", filter_=SearchFilters.Latest(), cursor=cur
                )
                cur_page = search.tweets
-				print(f'obtained {len(cur_page)} tweets')
+                print(f"obtained {len(cur_page)} tweets")
-				if len(cur_page) == 0: break
+                if len(cur_page) == 0:
                    break
                for e in cur_page:
                    if isinstance(e, Tweet):
@@ -166,9 +182,11 @@ class Scraper:
        tweets.sort(key=lambda t: t.id)
        return tweets
-	def get_cross_ttweets_from_user(self, username: str, since_date: str = None) -> list[TalentTweet]:
+    def get_cross_ttweets_from_user(
        self, username: str, since_date: str = None
    ) -> list[TalentTweet]:
        if since_date is not None:
-			d = since_date.split('-')
+            d = since_date.split("-")
            since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc)
        else:
            since = None
@@ -178,12 +196,16 @@ class Scraper:
        for t in tweets:
            tt = TalentTweet.create_from_tweety(t)
            if tt.is_cross_company():
                print(f"cross t_id: {tt.tweet_id}")
                ret.append(tt)
-		print(f'Found {len(ret)}/{len(tweets)} TalentTweets')
+        print(f"Found {len(ret)}/{len(tweets)} cross tweets")
        return ret
-if __name__== '__main__':
+
 if __name__ == "__main__":
    talent_lists.init()
    s = Scraper()
-	ttweets = s.get_cross_ttweets_from_user("pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc))
+    ttweets = s.get_cross_ttweets_from_user(
        "pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)
    )
    print("\n".join([x.__repr__() for x in ttweets]))