Merge pull request #25 from shaikhsajid1111/version_2.0

Updates for version 2.0.0
shaikhsajid1111 · Jul 9, 2022 · 3a4c313 · 3a4c313
2 parents 84ec015 + 9ba4586
commit 3a4c313
Show file tree

Hide file tree

Showing 9 changed files with 693 additions and 589 deletions.
diff --git a/README.md b/README.md
@@ -227,6 +227,11 @@ Output:
             <td>String</td>
             <td>If output_format parameter is set to CSV, then it is valid for directory parameter to be passed. If not passed then CSV file will be saved in current working directory.</td>
         </tr>
+        <tr>
+            <td>headless</td>
+            <td>Boolean</td>
+            <td>Whether to run the crawler in headless mode. Default is <code>True</code>.</td>
+        </tr>
     </tbody>
 </table>
 
@@ -486,6 +491,26 @@ Output:
             <td>String</td>
             <td>If output parameter is set to CSV, then it is valid for directory parameter to be passed. If not passed then CSV file will be saved in current working directory.</td>
         </tr>
+        <tr>
+            <td>since_id</td>
+            <td>Integer</td>
+            <td>After (NOT inclusive) a specified Snowflake ID. Example <a href="https://twitter.com/search?q=since_id%3A1138872932887924737%20max_id%3A1144730280353247233%20%23nasamoontunes&src=typed_query&f=live">here</a></td>
+        </tr>
+        <tr>
+            <td>max_id</td>
+            <td>Integer</td>
+            <td>At or before (inclusive) a specified Snowflake ID. Example <a href="https://twitter.com/search?q=since_id%3A1138872932887924737%20max_id%3A1144730280353247233%20%23nasamoontunes&src=typed_query&f=live">here</a></td>
+        </tr>
+        <tr>
+            <td>within_time</td>
+            <td>String</td>
+            <td>Search within the last number of days, hours, minutes, or seconds. Example <code>2d, 3h, 5m, 30s</code>.</td>
+        </tr>
+        <tr>
+            <td>headless</td>
+            <td>Boolean</td>
+            <td>Whether to run the crawler in headless mode. Default is <code>True</code>.</td>
+        </tr>
     </tbody>
 </table>
 </div>

diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
@@ -3,14 +3,10 @@
 with open("README.md", "r", encoding="utf-8") as file:
     long_description = file.read()
 
-requirements = []
-
-for line in open("requirements.txt", 'r', encoding="utf-8").readlines():
-  requirements.append(line.replace("\n", ""))
 
 setuptools.setup(
     name="twitter_scraper_selenium",
-    version="0.1.7",
+    version="2.0.0",
     author="Sajid Shaikh",
     author_email="[email protected]",
     description="Python package to scrap twitter's front-end easily with selenium",
@@ -41,5 +37,11 @@
 
     ],
     python_requires=">=3.6",
-    install_requires=requirements
+    install_requires=[
+        'python-dateutil==2.8.2',
+        'selenium==4.3.0',
+        'selenium-wire==4.6.4',
+        'webdriver-manager==3.2.2',
+        'fake-headers==1.0.2'
+    ]
 )
diff --git a/twitter_scraper_selenium/driver_initialization.py b/twitter_scraper_selenium/driver_initialization.py
@@ -2,27 +2,32 @@
 try:
     from seleniumwire import webdriver
     # to add capabilities for chrome and firefox, import their Options with different aliases
-    from selenium.webdriver.chrome.options import Options as ChromeOptions
-    from selenium.webdriver.firefox.options import Options as FirefoxOptions
+    from selenium.webdriver.chrome.options import Options as CustomChromeOptions
+    from selenium.webdriver.firefox.options import Options as CustomFireFoxOptions
     # import webdriver for downloading respective driver for the browser
     from webdriver_manager.chrome import ChromeDriverManager
     from webdriver_manager.firefox import GeckoDriverManager
     from fake_headers import Headers
+    from selenium.webdriver.chrome.service import Service as ChromeService
+    from selenium.webdriver.firefox.service import Service as FirefoxService
+
 except Exception as ex:
     print(ex)
 
 
 class Initializer:
 
-    def __init__(self, browser_name, proxy=None):
+    def __init__(self, browser_name, headless, proxy=None):
         self.browser_name = browser_name
         self.proxy = proxy
+        self.headless = headless
 
     def set_properties(self, browser_option):
         """adds capabilities to the driver"""
         header = Headers().generate()['User-Agent']
-        browser_option.add_argument(
-            '--headless')  # runs browser in headless mode
+        if self.headless:
+            browser_option.add_argument(
+                '--headless')  # runs browser in headless mode
         browser_option.add_argument('--no-sandbox')
         browser_option.add_argument("--disable-dev-shm-usage")
         browser_option.add_argument('--ignore-certificate-errors')
@@ -37,7 +42,7 @@ def set_driver_for_browser(self, browser_name):
         """expects browser name and returns a driver instance"""
         # if browser is supposed to be chrome
         if browser_name.lower() == "chrome":
-            browser_option = ChromeOptions()
+            browser_option = CustomChromeOptions()
             # automatically installs chromedriver, initializes it, and returns the instance
             if self.proxy is not None:
                 options = {
@@ -46,24 +51,26 @@ def set_driver_for_browser(self, browser_name):
                     'no_proxy': 'localhost, 127.0.0.1'
                 }
                 print("Using: {}".format(self.proxy))
-                return webdriver.Chrome(executable_path=ChromeDriverManager().install(),
+
+                return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                                         options=self.set_properties(browser_option), seleniumwire_options=options)
 
-            return webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=self.set_properties(browser_option))
+            return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.set_properties(browser_option))
         elif browser_name.lower() == "firefox":
-            browser_option = FirefoxOptions()
+            browser_option = CustomFireFoxOptions()
             if self.proxy is not None:
                 options = {
                     'https': 'https://{}'.format(self.proxy.replace(" ", "")),
                     'http': 'http://{}'.format(self.proxy.replace(" ", "")),
                     'no_proxy': 'localhost, 127.0.0.1'
                 }
                 print("Using: {}".format(self.proxy))
-                return webdriver.Firefox(executable_path=GeckoDriverManager().install(),
+
+                return webdriver.Firefox(service=FirefoxService(executable_path=GeckoDriverManager().install()),
                                          options=self.set_properties(browser_option), seleniumwire_options=options)
 
             # automatically installs geckodriver, initializes it, and returns the instance
-            return webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=self.set_properties(browser_option))
+            return webdriver.Firefox(service=FirefoxService(executable_path=GeckoDriverManager().install()), options=self.set_properties(browser_option))
         else:
             # if browser_name is neither chrome nor firefox, then raise an exception
             raise Exception("Browser not supported!")

diff --git a/twitter_scraper_selenium/driver_utils.py b/twitter_scraper_selenium/driver_utils.py
@@ -13,39 +13,41 @@
     from random import randint
 except Exception as ex:
     frameinfo = currentframe()
-    print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex))
+    print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))
 
 frameinfo = currentframe()
 
+
 class Utilities:
-  """this class contains all the method related to driver behaviour,
-  like scrolling, waiting for element to appear, it contains all static
-  method, which accepts driver instance as a argument"""
+    """this class contains all the method related to driver behaviour,
+    like scrolling, waiting for element to appear, it contains all static
+    method, which accepts driver instance as a argument"""
 
-  @staticmethod
-  def __wait_until_tweets_appear(driver):
-    try:
-      WebDriverWait(driver, 10).until(EC.presence_of_element_located(
-        (By.CSS_SELECTOR, '[data-testid="tweet"]')))
-    except WebDriverException:
-      print("Tweets did not appear!")
+    @staticmethod
+    def __wait_until_tweets_appear(driver):
+        try:
+            WebDriverWait(driver, 10).until(EC.presence_of_element_located(
+                (By.CSS_SELECTOR, '[data-testid="tweet"]')))
+        except WebDriverException:
+            print(
+                "Tweets did not appear!, Try setting headless=False to see what is happening")
 
-  @staticmethod
-  def __scroll_down(driver):
-    try:
-      body = driver.find_element_by_css_selector('body')
-      for _ in range(3):
-        body.send_keys(Keys.PAGE_DOWN)
-    except Exception as ex:
-      print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex))
+    @staticmethod
+    def __scroll_down(driver):
+        try:
+            body = driver.find_element(By.CSS_SELECTOR, 'body')
+            for _ in range(randint(1,3)):
+                body.send_keys(Keys.PAGE_DOWN)
+        except Exception as ex:
+            print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))
 
-  @staticmethod
-  def __wait_until_completion(driver):
-    """waits until the page have completed loading"""
-    try:
-      state = ""
-      while state != "complete":
-        time.sleep(randint(3, 5))
-        state = driver.execute_script("return document.readyState")
-    except Exception as ex:
-      print(ex)
+    @staticmethod
+    def __wait_until_completion(driver):
+        """waits until the page have completed loading"""
+        try:
+            state = ""
+            while state != "complete":
+                time.sleep(randint(3, 5))
+                state = driver.execute_script("return document.readyState")
+        except Exception as ex:
+            print(ex)