add print to indicate each cross tweet found

This commit is contained in:
muskit
2024-01-25 18:45:53 -08:00
parent ecce333a05
commit 81eea91d02
3 changed files with 185 additions and 164 deletions
+1 -1
View File
@@ -24,7 +24,7 @@ The bot may run in these modes:
* Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter) * Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter)
## `.env` ## `.env`
These need to be defined in a `.env` file at the project root (outside of `src`): These need to be defined in a `.env` file in the `run` ephemeral directory.
### Scraper Credentials ### Scraper Credentials
To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format: To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format:
+4 -5
View File
@@ -41,13 +41,12 @@ async def get_cross_tweets_online():
ttweets = scraper.get_cross_ttweets_from_user( ttweets = scraper.get_cross_ttweets_from_user(
talent_username, since_date=since_date talent_username, since_date=since_date
) )
print(f"got {len(ttweets)} TalentTweets") added_to_queue = 0
for ttweet in ttweets: for ttweet in ttweets:
if ( if ttweet.tweet_id not in queue.finished_ttweets:
ttweet.tweet_id not in queue.finished_ttweets added_to_queue += 1
and ttweet.is_cross_company()
):
queue.add_ttweet(ttweet) queue.add_ttweet(ttweet)
print(f"Enqueued {added_to_queue}/{len(ttweets)} tweets.")
except KeyboardInterrupt as e: except KeyboardInterrupt as e:
raise e raise e
except Exception as e: except Exception as e:
+172 -150
View File
@@ -14,176 +14,198 @@ from tweety_utils import *
from talenttweet import * from talenttweet import *
import talent_lists import talent_lists
class Scraper: class Scraper:
def __init__(self): def __init__(self):
Scraper.instance = self Scraper.instance = self
self.__account = AccountPool() self.__account = AccountPool()
self.try_login() self.try_login()
def try_login(self, account_idx: int = None) -> bool: def try_login(self, account_idx: int = None) -> bool:
# decide on which account to use # decide on which account to use
if account_idx is not None: if account_idx is not None:
acc = self.__account.use_index(account_idx) acc = self.__account.use_index(account_idx)
else: else:
acc = self.__account.next() acc = self.__account.next()
# attempt to login with the account # attempt to login with the account
if acc is not None: if acc is not None:
name = acc[0] name = acc[0]
print(f"using {name}") print(f"using {name}")
self.app = Twitter(name) self.app = Twitter(name)
if exists(f"{name}.json"): if exists(f"{name}.json"):
try: try:
self.app.connect() self.app.connect()
except: except:
self.app.sign_in(*acc) self.app.sign_in(*acc)
else: else:
self.app.sign_in(*acc) self.app.sign_in(*acc)
return True return True
print('exhausted all accounts!') print("exhausted all accounts!")
return False return False
def login_wait(self, private=False): def login_wait(self, private=False):
if private: if private:
print(f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes...") print(
sleep(240) f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes..."
print() )
l = self.try_login(0) sleep(240)
else: print()
l = self.try_login() l = self.try_login(0)
if not l: else:
print("sleeping for 4 minutes...") l = self.try_login()
sleep(240) if not l:
print() print("sleeping for 4 minutes...")
self.try_login() sleep(240)
print()
self.try_login()
# recover lost info # recover lost info
def fix_tweet(self, tweet: Tweet): def fix_tweet(self, tweet: Tweet):
if tweet.is_retweet: if tweet.is_retweet:
if tweet.retweeted_tweet is None: if tweet.retweeted_tweet is None:
# tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet # tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet
# print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...') # print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...')
tweet.is_retweet = False tweet.is_retweet = False
elif tweet.retweeted_tweet.author is None: elif tweet.retweeted_tweet.author is None:
# print(f'{tweet.author.username}/{tweet.id} is missing the RT author! Fetching RT\'d...') # print(f'{tweet.author.username}/{tweet.id} is missing the RT author! Fetching RT\'d...')
tweet.retweeted_tweet = self.get_tweet(tweet.retweeted_tweet.id) tweet.retweeted_tweet = self.get_tweet(tweet.retweeted_tweet.id)
if tweet.is_quoted: if tweet.is_quoted:
if tweet.quoted_tweet is None: # quoted tweet is deleted if tweet.quoted_tweet is None: # quoted tweet is deleted
tweet.is_quoted = False tweet.is_quoted = False
elif tweet.quoted_tweet.author is None: elif tweet.quoted_tweet.author is None:
# print(f'{tweet.author.username}/{tweet.id} is missing the QRT author! Fetching QRT\'d...') # print(f'{tweet.author.username}/{tweet.id} is missing the QRT author! Fetching QRT\'d...')
tweet.quoted_tweet = self.get_tweet(tweet.quoted_tweet.id) tweet.quoted_tweet = self.get_tweet(tweet.quoted_tweet.id)
if tweet.is_reply and tweet.replied_to is None: if tweet.is_reply and tweet.replied_to is None:
# print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...') # print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...')
tweet.replied_to = self.get_tweet(tweet.original_tweet['in_reply_to_status_id_str']) tweet.replied_to = self.get_tweet(
return tweet tweet.original_tweet["in_reply_to_status_id_str"]
)
return tweet
def get_tweet(self, id: int, private_user=False): def get_tweet(self, id: int, private_user=False):
# print(f'{id}{" on private" if private_user else ""}') # print(f'{id}{" on private" if private_user else ""}')
if private_user: if private_user:
self.try_login(0) self.try_login(0)
while True: while True:
try: try:
t = self.app.tweet_detail(str(id)) t = self.app.tweet_detail(str(id))
return self.fix_tweet(t) if t is not None else None return self.fix_tweet(t) if t is not None else None
except RateLimitReached: except RateLimitReached:
print("RateLimitReached occurred") print("RateLimitReached occurred")
self.login_wait(private_user) self.login_wait(private_user)
except UnknownError: except UnknownError:
print("UnknownError occurred, probably rate-limited") print("UnknownError occurred, probably rate-limited")
#traceback.print_exc() # traceback.print_exc()
self.login_wait(private_user) self.login_wait(private_user)
except Exception as e: except Exception as e:
if not private_user: if not private_user:
print("Unhandled exception occurred, trying again as private...") print("Unhandled exception occurred, trying again as private...")
return self.get_tweet(id, True) return self.get_tweet(id, True)
else: else:
print(f"Unhandled exception occurred, tweet {id} is probably unavailable") print(
print(e) f"Unhandled exception occurred, tweet {id} is probably unavailable"
return None )
print(e)
return None
# since MUST BE TIMEZONE AWARE # since MUST BE TIMEZONE AWARE
# usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc) # usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
def get_tweets_from_user(self, username: str, since: datetime = None) -> list[Tweet]: def get_tweets_from_user(
reached_backdate = False self, username: str, since: datetime = None
tweets: list[Tweet] = [] ) -> list[Tweet]:
cur = None reached_backdate = False
tweets: list[Tweet] = []
cur = None
if since == None: if since == None:
since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7) since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
print(f'falling back to grabbing tweets since 7 days ago ({since.date()})') print(f"falling back to grabbing tweets since 7 days ago ({since.date()})")
else: else:
print(f'grabbing tweets since {since.date()}') print(f"grabbing tweets since {since.date()}")
uid = self.app._get_user_id(username) uid = self.app._get_user_id(username)
print(f"{username} = {uid}") print(f"{username} = {uid}")
def add_tweet(tweet: Tweet): def add_tweet(tweet: Tweet):
# malformed tweet check # malformed tweet check
nonlocal reached_backdate nonlocal reached_backdate
try: try:
tweet.author.id tweet.author.id
except: except:
print(f"skipping malformed tweet: {tweet}") print(f"skipping malformed tweet: {tweet}")
return return
tweet = self.fix_tweet(tweet) tweet = self.fix_tweet(tweet)
tweets.append(tweet) tweets.append(tweet)
if not reached_backdate and int(tweet.author.id) == uid and tweet.date <= since: if (
print("reached backdate") not reached_backdate
reached_backdate = True and int(tweet.author.id) == uid
and tweet.date <= since
):
print("reached backdate")
reached_backdate = True
if uid in talent_lists.privated_accounts: if uid in talent_lists.privated_accounts:
self.try_login(0) self.try_login(0)
while not reached_backdate: while not reached_backdate:
try: try:
# uts = self.app.get_tweets(uid, replies=True, cursor=cur) # uts = self.app.get_tweets(uid, replies=True, cursor=cur)
search = self.app.search(f'from:{username}', filter_=SearchFilters.Latest(), cursor=cur) search = self.app.search(
cur_page = search.tweets f"from:{username}", filter_=SearchFilters.Latest(), cursor=cur
print(f'obtained {len(cur_page)} tweets') )
cur_page = search.tweets
print(f"obtained {len(cur_page)} tweets")
if len(cur_page) == 0: break if len(cur_page) == 0:
break
for e in cur_page: for e in cur_page:
if isinstance(e, Tweet): if isinstance(e, Tweet):
add_tweet(e) add_tweet(e)
elif isinstance(e, TweetThread): elif isinstance(e, TweetThread):
# FIXME: rework when replied_to is fixed (currently populates user_mentions) # FIXME: rework when replied_to is fixed (currently populates user_mentions)
# latest tweet in thread = og author's reply # latest tweet in thread = og author's reply
for t in e: for t in e:
add_tweet(t) add_tweet(t)
cur = search.cursor cur = search.cursor
except (UnknownError, RateLimitReached): except (UnknownError, RateLimitReached):
print("UnknownError occurred, probably rate-limited") print("UnknownError occurred, probably rate-limited")
self.login_wait(uid in talent_lists.privated_accounts) self.login_wait(uid in talent_lists.privated_accounts)
tweets.sort(key=lambda t: t.id) tweets.sort(key=lambda t: t.id)
return tweets return tweets
def get_cross_ttweets_from_user(self, username: str, since_date: str = None) -> list[TalentTweet]: def get_cross_ttweets_from_user(
if since_date is not None: self, username: str, since_date: str = None
d = since_date.split('-') ) -> list[TalentTweet]:
since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc) if since_date is not None:
else: d = since_date.split("-")
since = None since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc)
tweets = self.get_tweets_from_user(username, since) else:
# print_tweets(tweets) since = None
ret: list[TalentTweet] = [] tweets = self.get_tweets_from_user(username, since)
for t in tweets: # print_tweets(tweets)
tt = TalentTweet.create_from_tweety(t) ret: list[TalentTweet] = []
if tt.is_cross_company(): for t in tweets:
ret.append(tt) tt = TalentTweet.create_from_tweety(t)
print(f'Found {len(ret)}/{len(tweets)} TalentTweets') if tt.is_cross_company():
return ret print(f"cross t_id: {tt.tweet_id}")
ret.append(tt)
print(f"Found {len(ret)}/{len(tweets)} cross tweets")
return ret
if __name__== '__main__':
talent_lists.init() if __name__ == "__main__":
s = Scraper() talent_lists.init()
ttweets = s.get_cross_ttweets_from_user("pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)) s = Scraper()
print("\n".join([x.__repr__() for x in ttweets])) ttweets = s.get_cross_ttweets_from_user(
"pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)
)
print("\n".join([x.__repr__() for x in ttweets]))