add print to indicate each cross tweet found
This commit is contained in:
@@ -24,7 +24,7 @@ The bot may run in these modes:
|
|||||||
* Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter)
|
* Command-line (`cmd`): an interactive mode for manual control and debugging (drops into a Python interpreter)
|
||||||
|
|
||||||
## `.env`
|
## `.env`
|
||||||
These need to be defined in a `.env` file at the project root (outside of `src`):
|
These need to be defined in a `.env` file in the `run` ephemeral directory.
|
||||||
|
|
||||||
### Scraper Credentials
|
### Scraper Credentials
|
||||||
To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format:
|
To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format:
|
||||||
|
|||||||
+4
-5
@@ -41,13 +41,12 @@ async def get_cross_tweets_online():
|
|||||||
ttweets = scraper.get_cross_ttweets_from_user(
|
ttweets = scraper.get_cross_ttweets_from_user(
|
||||||
talent_username, since_date=since_date
|
talent_username, since_date=since_date
|
||||||
)
|
)
|
||||||
print(f"got {len(ttweets)} TalentTweets")
|
added_to_queue = 0
|
||||||
for ttweet in ttweets:
|
for ttweet in ttweets:
|
||||||
if (
|
if ttweet.tweet_id not in queue.finished_ttweets:
|
||||||
ttweet.tweet_id not in queue.finished_ttweets
|
added_to_queue += 1
|
||||||
and ttweet.is_cross_company()
|
|
||||||
):
|
|
||||||
queue.add_ttweet(ttweet)
|
queue.add_ttweet(ttweet)
|
||||||
|
print(f"Enqueued {added_to_queue}/{len(ttweets)} tweets.")
|
||||||
except KeyboardInterrupt as e:
|
except KeyboardInterrupt as e:
|
||||||
raise e
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
+172
-150
@@ -14,176 +14,198 @@ from tweety_utils import *
|
|||||||
from talenttweet import *
|
from talenttweet import *
|
||||||
import talent_lists
|
import talent_lists
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Scraper.instance = self
|
Scraper.instance = self
|
||||||
self.__account = AccountPool()
|
self.__account = AccountPool()
|
||||||
self.try_login()
|
self.try_login()
|
||||||
|
|
||||||
def try_login(self, account_idx: int = None) -> bool:
|
def try_login(self, account_idx: int = None) -> bool:
|
||||||
# decide on which account to use
|
# decide on which account to use
|
||||||
if account_idx is not None:
|
if account_idx is not None:
|
||||||
acc = self.__account.use_index(account_idx)
|
acc = self.__account.use_index(account_idx)
|
||||||
else:
|
else:
|
||||||
acc = self.__account.next()
|
acc = self.__account.next()
|
||||||
|
|
||||||
# attempt to login with the account
|
# attempt to login with the account
|
||||||
if acc is not None:
|
if acc is not None:
|
||||||
name = acc[0]
|
name = acc[0]
|
||||||
print(f"using {name}")
|
print(f"using {name}")
|
||||||
self.app = Twitter(name)
|
self.app = Twitter(name)
|
||||||
if exists(f"{name}.json"):
|
if exists(f"{name}.json"):
|
||||||
try:
|
try:
|
||||||
self.app.connect()
|
self.app.connect()
|
||||||
except:
|
except:
|
||||||
self.app.sign_in(*acc)
|
self.app.sign_in(*acc)
|
||||||
else:
|
else:
|
||||||
self.app.sign_in(*acc)
|
self.app.sign_in(*acc)
|
||||||
return True
|
return True
|
||||||
print('exhausted all accounts!')
|
print("exhausted all accounts!")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def login_wait(self, private=False):
|
def login_wait(self, private=False):
|
||||||
if private:
|
if private:
|
||||||
print(f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes...")
|
print(
|
||||||
sleep(240)
|
f"keeping pvt-accessible account ({self.__account.use_index(0)[0]}). sleeping for 4 minutes..."
|
||||||
print()
|
)
|
||||||
l = self.try_login(0)
|
sleep(240)
|
||||||
else:
|
print()
|
||||||
l = self.try_login()
|
l = self.try_login(0)
|
||||||
if not l:
|
else:
|
||||||
print("sleeping for 4 minutes...")
|
l = self.try_login()
|
||||||
sleep(240)
|
if not l:
|
||||||
print()
|
print("sleeping for 4 minutes...")
|
||||||
self.try_login()
|
sleep(240)
|
||||||
|
print()
|
||||||
|
self.try_login()
|
||||||
|
|
||||||
# recover lost info
|
# recover lost info
|
||||||
def fix_tweet(self, tweet: Tweet):
|
def fix_tweet(self, tweet: Tweet):
|
||||||
if tweet.is_retweet:
|
if tweet.is_retweet:
|
||||||
if tweet.retweeted_tweet is None:
|
if tweet.retweeted_tweet is None:
|
||||||
# tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet
|
# tweet.retweeted_tweet = self.app.tweet_detail(str(tweet.id)).retweeted_tweet
|
||||||
# print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...')
|
# print(f'{tweet.author.username}/{tweet.id} is missing the RT! It\'s probably nothing...')
|
||||||
tweet.is_retweet = False
|
tweet.is_retweet = False
|
||||||
elif tweet.retweeted_tweet.author is None:
|
elif tweet.retweeted_tweet.author is None:
|
||||||
# print(f'{tweet.author.username}/{tweet.id} is missing the RT author! Fetching RT\'d...')
|
# print(f'{tweet.author.username}/{tweet.id} is missing the RT author! Fetching RT\'d...')
|
||||||
tweet.retweeted_tweet = self.get_tweet(tweet.retweeted_tweet.id)
|
tweet.retweeted_tweet = self.get_tweet(tweet.retweeted_tweet.id)
|
||||||
|
|
||||||
if tweet.is_quoted:
|
if tweet.is_quoted:
|
||||||
if tweet.quoted_tweet is None: # quoted tweet is deleted
|
if tweet.quoted_tweet is None: # quoted tweet is deleted
|
||||||
tweet.is_quoted = False
|
tweet.is_quoted = False
|
||||||
elif tweet.quoted_tweet.author is None:
|
elif tweet.quoted_tweet.author is None:
|
||||||
# print(f'{tweet.author.username}/{tweet.id} is missing the QRT author! Fetching QRT\'d...')
|
# print(f'{tweet.author.username}/{tweet.id} is missing the QRT author! Fetching QRT\'d...')
|
||||||
tweet.quoted_tweet = self.get_tweet(tweet.quoted_tweet.id)
|
tweet.quoted_tweet = self.get_tweet(tweet.quoted_tweet.id)
|
||||||
|
|
||||||
if tweet.is_reply and tweet.replied_to is None:
|
if tweet.is_reply and tweet.replied_to is None:
|
||||||
# print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...')
|
# print(f'{tweet.author.username}/{tweet.id} is missing reply-to tweet! Recovering...')
|
||||||
tweet.replied_to = self.get_tweet(tweet.original_tweet['in_reply_to_status_id_str'])
|
tweet.replied_to = self.get_tweet(
|
||||||
return tweet
|
tweet.original_tweet["in_reply_to_status_id_str"]
|
||||||
|
)
|
||||||
|
return tweet
|
||||||
|
|
||||||
def get_tweet(self, id: int, private_user=False):
|
def get_tweet(self, id: int, private_user=False):
|
||||||
# print(f'{id}{" on private" if private_user else ""}')
|
# print(f'{id}{" on private" if private_user else ""}')
|
||||||
if private_user:
|
if private_user:
|
||||||
self.try_login(0)
|
self.try_login(0)
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
t = self.app.tweet_detail(str(id))
|
t = self.app.tweet_detail(str(id))
|
||||||
return self.fix_tweet(t) if t is not None else None
|
return self.fix_tweet(t) if t is not None else None
|
||||||
except RateLimitReached:
|
except RateLimitReached:
|
||||||
print("RateLimitReached occurred")
|
print("RateLimitReached occurred")
|
||||||
self.login_wait(private_user)
|
self.login_wait(private_user)
|
||||||
except UnknownError:
|
except UnknownError:
|
||||||
print("UnknownError occurred, probably rate-limited")
|
print("UnknownError occurred, probably rate-limited")
|
||||||
#traceback.print_exc()
|
# traceback.print_exc()
|
||||||
self.login_wait(private_user)
|
self.login_wait(private_user)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not private_user:
|
if not private_user:
|
||||||
print("Unhandled exception occurred, trying again as private...")
|
print("Unhandled exception occurred, trying again as private...")
|
||||||
return self.get_tweet(id, True)
|
return self.get_tweet(id, True)
|
||||||
else:
|
else:
|
||||||
print(f"Unhandled exception occurred, tweet {id} is probably unavailable")
|
print(
|
||||||
print(e)
|
f"Unhandled exception occurred, tweet {id} is probably unavailable"
|
||||||
return None
|
)
|
||||||
|
print(e)
|
||||||
|
return None
|
||||||
|
|
||||||
# since MUST BE TIMEZONE AWARE
|
# since MUST BE TIMEZONE AWARE
|
||||||
# usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
|
# usage example: since=datetime(2023, 8, 1).replace(tzinfo=pytz.utc)
|
||||||
def get_tweets_from_user(self, username: str, since: datetime = None) -> list[Tweet]:
|
def get_tweets_from_user(
|
||||||
reached_backdate = False
|
self, username: str, since: datetime = None
|
||||||
tweets: list[Tweet] = []
|
) -> list[Tweet]:
|
||||||
cur = None
|
reached_backdate = False
|
||||||
|
tweets: list[Tweet] = []
|
||||||
|
cur = None
|
||||||
|
|
||||||
if since == None:
|
if since == None:
|
||||||
since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
|
since = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=7)
|
||||||
print(f'falling back to grabbing tweets since 7 days ago ({since.date()})')
|
print(f"falling back to grabbing tweets since 7 days ago ({since.date()})")
|
||||||
else:
|
else:
|
||||||
print(f'grabbing tweets since {since.date()}')
|
print(f"grabbing tweets since {since.date()}")
|
||||||
|
|
||||||
uid = self.app._get_user_id(username)
|
uid = self.app._get_user_id(username)
|
||||||
print(f"{username} = {uid}")
|
print(f"{username} = {uid}")
|
||||||
|
|
||||||
def add_tweet(tweet: Tweet):
|
def add_tweet(tweet: Tweet):
|
||||||
# malformed tweet check
|
# malformed tweet check
|
||||||
nonlocal reached_backdate
|
nonlocal reached_backdate
|
||||||
try:
|
try:
|
||||||
tweet.author.id
|
tweet.author.id
|
||||||
except:
|
except:
|
||||||
print(f"skipping malformed tweet: {tweet}")
|
print(f"skipping malformed tweet: {tweet}")
|
||||||
return
|
return
|
||||||
|
|
||||||
tweet = self.fix_tweet(tweet)
|
tweet = self.fix_tweet(tweet)
|
||||||
|
|
||||||
tweets.append(tweet)
|
tweets.append(tweet)
|
||||||
|
|
||||||
if not reached_backdate and int(tweet.author.id) == uid and tweet.date <= since:
|
if (
|
||||||
print("reached backdate")
|
not reached_backdate
|
||||||
reached_backdate = True
|
and int(tweet.author.id) == uid
|
||||||
|
and tweet.date <= since
|
||||||
|
):
|
||||||
|
print("reached backdate")
|
||||||
|
reached_backdate = True
|
||||||
|
|
||||||
if uid in talent_lists.privated_accounts:
|
if uid in talent_lists.privated_accounts:
|
||||||
self.try_login(0)
|
self.try_login(0)
|
||||||
|
|
||||||
while not reached_backdate:
|
while not reached_backdate:
|
||||||
try:
|
try:
|
||||||
# uts = self.app.get_tweets(uid, replies=True, cursor=cur)
|
# uts = self.app.get_tweets(uid, replies=True, cursor=cur)
|
||||||
search = self.app.search(f'from:{username}', filter_=SearchFilters.Latest(), cursor=cur)
|
search = self.app.search(
|
||||||
cur_page = search.tweets
|
f"from:{username}", filter_=SearchFilters.Latest(), cursor=cur
|
||||||
print(f'obtained {len(cur_page)} tweets')
|
)
|
||||||
|
cur_page = search.tweets
|
||||||
|
print(f"obtained {len(cur_page)} tweets")
|
||||||
|
|
||||||
if len(cur_page) == 0: break
|
if len(cur_page) == 0:
|
||||||
|
break
|
||||||
|
|
||||||
for e in cur_page:
|
for e in cur_page:
|
||||||
if isinstance(e, Tweet):
|
if isinstance(e, Tweet):
|
||||||
add_tweet(e)
|
add_tweet(e)
|
||||||
elif isinstance(e, TweetThread):
|
elif isinstance(e, TweetThread):
|
||||||
# FIXME: rework when replied_to is fixed (currently populates user_mentions)
|
# FIXME: rework when replied_to is fixed (currently populates user_mentions)
|
||||||
# latest tweet in thread = og author's reply
|
# latest tweet in thread = og author's reply
|
||||||
for t in e:
|
for t in e:
|
||||||
add_tweet(t)
|
add_tweet(t)
|
||||||
|
|
||||||
cur = search.cursor
|
cur = search.cursor
|
||||||
except (UnknownError, RateLimitReached):
|
except (UnknownError, RateLimitReached):
|
||||||
print("UnknownError occurred, probably rate-limited")
|
print("UnknownError occurred, probably rate-limited")
|
||||||
self.login_wait(uid in talent_lists.privated_accounts)
|
self.login_wait(uid in talent_lists.privated_accounts)
|
||||||
|
|
||||||
tweets.sort(key=lambda t: t.id)
|
tweets.sort(key=lambda t: t.id)
|
||||||
return tweets
|
return tweets
|
||||||
|
|
||||||
def get_cross_ttweets_from_user(self, username: str, since_date: str = None) -> list[TalentTweet]:
|
def get_cross_ttweets_from_user(
|
||||||
if since_date is not None:
|
self, username: str, since_date: str = None
|
||||||
d = since_date.split('-')
|
) -> list[TalentTweet]:
|
||||||
since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc)
|
if since_date is not None:
|
||||||
else:
|
d = since_date.split("-")
|
||||||
since = None
|
since = datetime(*[int(x) for x in d]).replace(tzinfo=pytz.utc)
|
||||||
tweets = self.get_tweets_from_user(username, since)
|
else:
|
||||||
# print_tweets(tweets)
|
since = None
|
||||||
ret: list[TalentTweet] = []
|
tweets = self.get_tweets_from_user(username, since)
|
||||||
for t in tweets:
|
# print_tweets(tweets)
|
||||||
tt = TalentTweet.create_from_tweety(t)
|
ret: list[TalentTweet] = []
|
||||||
if tt.is_cross_company():
|
for t in tweets:
|
||||||
ret.append(tt)
|
tt = TalentTweet.create_from_tweety(t)
|
||||||
print(f'Found {len(ret)}/{len(tweets)} TalentTweets')
|
if tt.is_cross_company():
|
||||||
return ret
|
print(f"cross t_id: {tt.tweet_id}")
|
||||||
|
ret.append(tt)
|
||||||
|
print(f"Found {len(ret)}/{len(tweets)} cross tweets")
|
||||||
|
return ret
|
||||||
|
|
||||||
if __name__== '__main__':
|
|
||||||
talent_lists.init()
|
if __name__ == "__main__":
|
||||||
s = Scraper()
|
talent_lists.init()
|
||||||
ttweets = s.get_cross_ttweets_from_user("pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc))
|
s = Scraper()
|
||||||
print("\n".join([x.__repr__() for x in ttweets]))
|
ttweets = s.get_cross_ttweets_from_user(
|
||||||
|
"pomurainpuff", since=datetime(2023, 7, 30).replace(tzinfo=pytz.utc)
|
||||||
|
)
|
||||||
|
print("\n".join([x.__repr__() for x in ttweets]))
|
||||||
|
|||||||
Reference in New Issue
Block a user