diff --git a/README.md b/README.md index 6a65cd3..7f5679b 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,17 @@ Setup the `.env` in the project root. Refer to the [`.env`](#env) section for va Build and run the Docker container: ```bash -# to run attached (can CTRL+P,CTRL+Q to detach) -sh run.sh +# to delete container and built image +sh scripts/delete.sh -# ... or to run headless -sh run_detached.sh +# to build image +sh scripts/build.sh + +# to create container and run attached (can CTRL+P,CTRL+Q to detach) +sh scripts/run.sh + +# ... or to run headless/detached +sh scripts/run_detached.sh ``` If attached to a container prepared by Dockerfile, you can run the program from project root (not in `src`). Refer to the following section for options. @@ -36,17 +42,17 @@ These need to be defined in a `.env` file in the `run` ephemeral directory. ### Scraper Credentials To get around rate limitations imposed on users, we scrape with multiple accounts. Each account is defined in the file using the following format: ``` -scraper_username_X=twitter_username -scraper_auth_token_X=twitter_auth_token +scraperX_username=twitter_username +scraperX_password=twitter_password ``` where `X` is a number starting from 0, increasing by 1 for each account added. For instance: ``` -scraper_username_0= -scraper_auth_token_0= -scraper_username_1= -scraper_auth_token_1= +scraper0_username= +scraper0_password= +scraper1_username= +scraper1_password= ``` -The first account (`scraper_username_0` and `scraper_auth_token_0`) **MUST be defined (`scraper_username_` and `scraper_auth_token_` without number will not work!)** and will be used to attempt scraping private accounts. Make sure this account follows any private accounts that you want to scrape! +The first account (`scraper0_username` and `scraper0_password`) **MUST be defined (`scraper_username` and `scraper_password` without number will not work!)** and will be used to attempt scraping private accounts. 
Make sure this account follows any private accounts that you want to scrape! ### Twitter API Stuff The following keys/tokens are used for the official API via `tweepy`. We mainly use these to just post tweets. ``` @@ -56,20 +62,16 @@ user_token= user_secret= ``` ### Screenshot Cookie *(optional)* -This is the authentication token obtained from a browser when signed in on the Twitter website. It's only needed if you want to screenshot tweets from privated accounts. Make sure the token belongs to an account that follows desired private accounts! Maybe have it belong to `scraper_username_0`? +This is the authentication token obtained from a browser when signed in on the Twitter website. It's only needed if you want to screenshot tweets from private accounts. Make sure the token belongs to an account that follows desired private accounts! Maybe have it belong to `scraper0`? ``` web_auth_token= ``` ### Example `.env` without values ``` -scraper_username_0= -scraper_auth_token_0= -scraper_username_1= -scraper_auth_token_1= -scraper_username_2= -scraper_auth_token_2= -scraper_username_3= -scraper_auth_token_3= +scraper0_username= +scraper0_password= +scraper1_username= +scraper1_password= web_auth_token= app_key= app_secret= diff --git a/src/account_pool.py b/src/account_pool.py index 9bba59b..a9dfb72 100644 --- a/src/account_pool.py +++ b/src/account_pool.py @@ -11,13 +11,14 @@ class AccountPool: creds = dotenv_values(working_path(file=".env")) i = 0 while True: - if f"scraper_username_{i}" in creds and f"scraper_auth_token_{i}" in creds: + if f"scraper{i}_username" in creds and f"scraper{i}_password" in creds: self.__accounts.append( - (creds[f"scraper_username_{i}"], creds[f"scraper_auth_token_{i}"]) + (creds[f"scraper{i}_username"], creds[f"scraper{i}_password"]) ) i += 1 else: break + print(f"{len(self.__accounts)} scraper credentials found!") def use_index(self, idx): self.__idx = idx diff --git a/src/scraper.py b/src/scraper.py index 1dee4c3..de3c22a 100644 --- 
a/src/scraper.py +++ b/src/scraper.py @@ -1,6 +1,7 @@ from os.path import exists from time import sleep from datetime import datetime, timedelta +import traceback import pytz @@ -37,9 +38,9 @@ class Scraper: try: self.app.connect() except: - self.app.load_auth_token(acc[1]) + self.app.sign_in(*acc) else: - self.app.load_auth_token(acc[1]) + self.app.sign_in(*acc) return True print("exhausted all accounts!") return False @@ -101,12 +102,14 @@ class Scraper: if "_Missing" in e.message: # tweet is probably unavailable print(f"tweet {id} seems unavailable; skipping...") return None - print("treating like RateLimitReached...") + print("treating like RateLimitReached and using the next scraper...") # traceback.print_exc() self.login_wait(private_user) except Exception as e: if not private_user: - print("Unhandled exception occurred, trying again as private...") + print("Unhandled exception occurred getting tweet!") + traceback.print_exc() + print("trying again as pvt-accessible...\n") return self.get_tweet(id, True) else: print(