Skip to content

Commit

Permalink
Merge pull request #25 from shaikhsajid1111/version_2.0
Browse files Browse the repository at this point in the history
Updates for version 2.0.0
  • Loading branch information
shaikhsajid1111 authored Jul 9, 2022
2 parents 84ec015 + 9ba4586 commit 3a4c313
Show file tree
Hide file tree
Showing 9 changed files with 693 additions and 589 deletions.
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ Output:
<td>String</td>
<td>Directory in which the CSV file is saved. It is only valid to pass this parameter when the output_format parameter is set to CSV. If it is not passed, the CSV file is saved in the current working directory.</td>
</tr>
<tr>
<td>headless</td>
<td>Boolean</td>
<td>Whether to run the crawler in headless mode. Default is <code>True</code>.</td>
</tr>
</tbody>
</table>

Expand Down Expand Up @@ -486,6 +491,26 @@ Output:
<td>String</td>
<td>Directory in which the CSV file is saved. It is only valid to pass this parameter when the output parameter is set to CSV. If it is not passed, the CSV file is saved in the current working directory.</td>
</tr>
<tr>
<td>since_id</td>
<td>Integer</td>
<td>After (NOT inclusive) a specified Snowflake ID. Example <a href="https://twitter.com/search?q=since_id%3A1138872932887924737%20max_id%3A1144730280353247233%20%23nasamoontunes&src=typed_query&f=live">here</a></td>
</tr>
<tr>
<td>max_id</td>
<td>Integer</td>
<td>At or before (inclusive) a specified Snowflake ID. Example <a href="https://twitter.com/search?q=since_id%3A1138872932887924737%20max_id%3A1144730280353247233%20%23nasamoontunes&src=typed_query&f=live">here</a></td>
</tr>
<tr>
<td>within_time</td>
<td>String</td>
<td>Search within the last number of days, hours, minutes, or seconds. Example <code>2d, 3h, 5m, 30s</code>.</td>
</tr>
<tr>
<td>headless</td>
<td>Boolean</td>
<td>Whether to run the crawler in headless mode. Default is <code>True</code>.</td>
</tr>
</tbody>
</table>
</div>
Expand Down
5 changes: 0 additions & 5 deletions requirements.txt

This file was deleted.

14 changes: 8 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@
with open("README.md", "r", encoding="utf-8") as file:
long_description = file.read()

requirements = []

for line in open("requirements.txt", 'r', encoding="utf-8").readlines():
requirements.append(line.replace("\n", ""))

setuptools.setup(
name="twitter_scraper_selenium",
version="0.1.7",
version="2.0.0",
author="Sajid Shaikh",
author_email="[email protected]",
description="Python package to scrap twitter's front-end easily with selenium",
Expand Down Expand Up @@ -41,5 +37,11 @@

],
python_requires=">=3.6",
install_requires=requirements
install_requires=[
'python-dateutil==2.8.2',
'selenium==4.3.0',
'selenium-wire==4.6.4',
'webdriver-manager==3.2.2',
'fake-headers==1.0.2'
]
)
29 changes: 18 additions & 11 deletions twitter_scraper_selenium/driver_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,32 @@
try:
from seleniumwire import webdriver
# to add capabilities for chrome and firefox, import their Options with different aliases
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as CustomChromeOptions
from selenium.webdriver.firefox.options import Options as CustomFireFoxOptions
# import webdriver for downloading respective driver for the browser
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from fake_headers import Headers
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.firefox.service import Service as FirefoxService

except Exception as ex:
print(ex)


class Initializer:

def __init__(self, browser_name, proxy=None):
def __init__(self, browser_name, headless, proxy=None):
self.browser_name = browser_name
self.proxy = proxy
self.headless = headless

def set_properties(self, browser_option):
"""adds capabilities to the driver"""
header = Headers().generate()['User-Agent']
browser_option.add_argument(
'--headless') # runs browser in headless mode
if self.headless:
browser_option.add_argument(
'--headless') # runs browser in headless mode
browser_option.add_argument('--no-sandbox')
browser_option.add_argument("--disable-dev-shm-usage")
browser_option.add_argument('--ignore-certificate-errors')
Expand All @@ -37,7 +42,7 @@ def set_driver_for_browser(self, browser_name):
"""expects browser name and returns a driver instance"""
# if the browser is supposed to be chrome
if browser_name.lower() == "chrome":
browser_option = ChromeOptions()
browser_option = CustomChromeOptions()
# automatically installs chromedriver and initialize it and returns the instance
if self.proxy is not None:
options = {
Expand All @@ -46,24 +51,26 @@ def set_driver_for_browser(self, browser_name):
'no_proxy': 'localhost, 127.0.0.1'
}
print("Using: {}".format(self.proxy))
return webdriver.Chrome(executable_path=ChromeDriverManager().install(),

return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
options=self.set_properties(browser_option), seleniumwire_options=options)

return webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=self.set_properties(browser_option))
return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.set_properties(browser_option))
elif browser_name.lower() == "firefox":
browser_option = FirefoxOptions()
browser_option = CustomFireFoxOptions()
if self.proxy is not None:
options = {
'https': 'https://{}'.format(self.proxy.replace(" ", "")),
'http': 'http://{}'.format(self.proxy.replace(" ", "")),
'no_proxy': 'localhost, 127.0.0.1'
}
print("Using: {}".format(self.proxy))
return webdriver.Firefox(executable_path=GeckoDriverManager().install(),

return webdriver.Firefox(service=FirefoxService(executable_path=GeckoDriverManager().install()),
options=self.set_properties(browser_option), seleniumwire_options=options)

# automatically installs geckodriver and initialize it and returns the instance
return webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=self.set_properties(browser_option))
return webdriver.Firefox(service=FirefoxService(executable_path=GeckoDriverManager().install()), options=self.set_properties(browser_option))
else:
# if browser_name is neither chrome nor firefox, then raise an exception
raise Exception("Browser not supported!")
Expand Down
60 changes: 31 additions & 29 deletions twitter_scraper_selenium/driver_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,39 +13,41 @@
from random import randint
except Exception as ex:
frameinfo = currentframe()
print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex))
print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))

frameinfo = currentframe()


class Utilities:
"""this class contains all the method related to driver behaviour,
like scrolling, waiting for element to appear, it contains all static
method, which accepts driver instance as a argument"""
"""This class contains all the methods related to driver behaviour,
like scrolling and waiting for elements to appear. All of its methods are
static and accept a driver instance as an argument."""

@staticmethod
def __wait_until_tweets_appear(driver):
try:
WebDriverWait(driver, 10).until(EC.presence_of_element_located(
(By.CSS_SELECTOR, '[data-testid="tweet"]')))
except WebDriverException:
print("Tweets did not appear!")
@staticmethod
def __wait_until_tweets_appear(driver):
try:
WebDriverWait(driver, 10).until(EC.presence_of_element_located(
(By.CSS_SELECTOR, '[data-testid="tweet"]')))
except WebDriverException:
print(
"Tweets did not appear!, Try setting headless=False to see what is happening")

@staticmethod
def __scroll_down(driver):
try:
body = driver.find_element_by_css_selector('body')
for _ in range(3):
body.send_keys(Keys.PAGE_DOWN)
except Exception as ex:
print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex))
@staticmethod
def __scroll_down(driver):
try:
body = driver.find_element(By.CSS_SELECTOR, 'body')
for _ in range(randint(1,3)):
body.send_keys(Keys.PAGE_DOWN)
except Exception as ex:
print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))

@staticmethod
def __wait_until_completion(driver):
"""waits until the page have completed loading"""
try:
state = ""
while state != "complete":
time.sleep(randint(3, 5))
state = driver.execute_script("return document.readyState")
except Exception as ex:
print(ex)
@staticmethod
def __wait_until_completion(driver):
"""waits until the page have completed loading"""
try:
state = ""
while state != "complete":
time.sleep(randint(3, 5))
state = driver.execute_script("return document.readyState")
except Exception as ex:
print(ex)
Loading

0 comments on commit 3a4c313

Please sign in to comment.