add print to indicate each cross tweet found

This commit is contained in:
muskit
2024-01-25 18:45:53 -08:00
parent ecce333a05
commit 81eea91d02
3 changed files with 185 additions and 164 deletions
+1 -1
View File
@@ -24,7 +24,7 @@ The bot may run in these modes:
* Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter)
## `.env`
These need to be defined in a `.env` file at the project root (outside of `src`):
These need to be defined in a `.env` file in the `run` ephemeral directory.
### Scraper Credentials
To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format:
+4 -5
View File
@@ -41,13 +41,12 @@ async def get_cross_tweets_online():
ttweets = scraper.get_cross_ttweets_from_user(
talent_username, since_date=since_date
)
print(f"got {len(ttweets)} TalentTweets")
added_to_queue = 0
for ttweet in ttweets:
if (
ttweet.tweet_id not in queue.finished_ttweets
and ttweet.is_cross_company()
):
if ttweet.tweet_id not in queue.finished_ttweets:
added_to_queue += 1
queue.add_ttweet(ttweet)
print(f"Enqueued {added_to_queue}/{len(ttweets)} tweets.")
except KeyboardInterrupt as e:
raise e
except Exception as e:
+38 -16
View File
@@ -14,6 +14,7 @@ from tweety_utils import *
from talenttweet import *
import talent_lists
class Scraper:
def __init__(self):
Scraper.instance = self
@@ -40,12 +41,14 @@ class Scraper:
else:
self.app.sign_in(*acc)
return True
print('exhausted all accounts!')
print("exhausted all accounts!")
return False
def login_wait(self, private=False):
if private:
print(f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes...")
print(
f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes..."
)
sleep(240)
print()
l = self.try_login(0)
@@ -77,7 +80,9 @@ class Scraper:
if tweet.is_reply and tweet.replied_to is None:
# print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...')
tweet.replied_to = self.get_tweet(tweet.original_tweet['in_reply_to_status_id_str'])
tweet.replied_to = self.get_tweet(
tweet.original_tweet["in_reply_to_status_id_str"]
)
return tweet
def get_tweet(self, id: int, private_user=False):
@@ -100,22 +105,26 @@ class Scraper:
print("Unhandled exception occurred, trying again as private...")
return self.get_tweet(id, True)
else:
print(f"Unhandled exception occurred, tweet {id} is probably unavailable")
print(
f"Unhandled exception occurred, tweet {id} is probably unavailable"
)
print(e)
return None
# since MUST BE TIMEZONE AWARE
# usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
def get_tweets_from_user(self, username: str, since: datetime = None) -> list[Tweet]:
def get_tweets_from_user(
self, username: str, since: datetime = None
) -> list[Tweet]:
reached_backdate = False
tweets: list[Tweet] = []
cur = None
if since == None:
since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
print(f'falling back to grabbing tweets since 7 days ago ({since.date()})')
print(f"falling back to grabbing tweets since 7 days ago ({since.date()})")
else:
print(f'grabbing tweets since {since.date()}')
print(f"grabbing tweets since {since.date()}")
uid = self.app._get_user_id(username)
print(f"{username} = {uid}")
@@ -133,7 +142,11 @@ class Scraper:
tweets.append(tweet)
if not reached_backdate and int(tweet.author.id) == uid and tweet.date <= since:
if (
not reached_backdate
and int(tweet.author.id) == uid
and tweet.date <= since
):
print("reached backdate")
reached_backdate = True
@@ -143,11 +156,14 @@ class Scraper:
while not reached_backdate:
try:
# uts = self.app.get_tweets(uid, replies=True, cursor=cur)
search = self.app.search(f'from:{username}', filter_=SearchFilters.Latest(), cursor=cur)
search = self.app.search(
f"from:{username}", filter_=SearchFilters.Latest(), cursor=cur
)
cur_page = search.tweets
print(f'obtained {len(cur_page)} tweets')
print(f"obtained {len(cur_page)} tweets")
if len(cur_page) == 0: break
if len(cur_page) == 0:
break
for e in cur_page:
if isinstance(e, Tweet):
@@ -166,9 +182,11 @@ class Scraper:
tweets.sort(key=lambda t: t.id)
return tweets
def get_cross_ttweets_from_user(self, username: str, since_date: str = None) -> list[TalentTweet]:
def get_cross_ttweets_from_user(
self, username: str, since_date: str = None
) -> list[TalentTweet]:
if since_date is not None:
d = since_date.split('-')
d = since_date.split("-")
since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc)
else:
since = None
@@ -178,12 +196,16 @@ class Scraper:
for t in tweets:
tt = TalentTweet.create_from_tweety(t)
if tt.is_cross_company():
print(f"cross t_id: {tt.tweet_id}")
ret.append(tt)
print(f'Found {len(ret)}/{len(tweets)} TalentTweets')
print(f"Found {len(ret)}/{len(tweets)} cross tweets")
return ret
if __name__== '__main__':
if __name__ == "__main__":
talent_lists.init()
s = Scraper()
ttweets = s.get_cross_ttweets_from_user("pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc))
ttweets = s.get_cross_ttweets_from_user(
"pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)
)
print("\n".join([x.__repr__() for x in ttweets]))