From da428859750e98fb2b307c790d151fd4e4abef20 Mon Sep 17 00:00:00 2001 From: Johannes Habel Date: Tue, 6 Feb 2024 00:56:21 +0100 Subject: [PATCH] - implemented searching - unit tests for searching - implemented searching filters --- xvideos_api/modules/consts.py | 4 +- xvideos_api/modules/sorting.py | 1 + xvideos_api/tests/test_search.py | 86 ++++++++++++++++++++++++++++++++ xvideos_api/xvideos_api.py | 44 ++++++++++------ 4 files changed, 117 insertions(+), 18 deletions(-) create mode 100644 xvideos_api/tests/test_search.py diff --git a/xvideos_api/modules/consts.py b/xvideos_api/modules/consts.py index a4c269d..d8e11c1 100644 --- a/xvideos_api/modules/consts.py +++ b/xvideos_api/modules/consts.py @@ -1,6 +1,6 @@ import re -REGEX_VIDEO_CHECK_URL = re.compile(r'https://www.xvideos.com/(.*?)') +REGEX_VIDEO_CHECK_URL = re.compile(r'https://www.xvideos.com/video(.*?)') REGEX_VIDEO_M3U8 = re.compile(r"html5player\.setVideoHLS\('([^']+)'\);") REGEX_VIDEO_TAGS = re.compile(r'href="/tags/(.*?)" class="is-keyword', re.DOTALL) REGEX_VIDEO_VIEWS = re.compile(r'(.*?)') @@ -12,4 +12,4 @@ REGEX_VIDEO_LENGTH = re.compile(r'(.*?)') REGEX_VIDEO_PORNSTARS = re.compile(r'a href="/models/(.*?)" class=') -REGEX_SEARCH_SCRAPE_VIDEOS = re.compile(r'
', re.DOTALL) diff --git a/xvideos_api/modules/sorting.py b/xvideos_api/modules/sorting.py index 088393e..1aae4e3 100644 --- a/xvideos_api/modules/sorting.py +++ b/xvideos_api/modules/sorting.py @@ -17,6 +17,7 @@ class SortDate: class SortVideoTime: + Sort_all = "allduration" Sort_short = "1-3min" Sort_middle = "3-10min" Sort_long = "10min_more" diff --git a/xvideos_api/tests/test_search.py b/xvideos_api/tests/test_search.py new file mode 100644 index 0000000..72bee21 --- /dev/null +++ b/xvideos_api/tests/test_search.py @@ -0,0 +1,86 @@ +from ..xvideos_api import Client, Sort, SortVideoTime, SortQuality, SortDate + +# This is a deep test for the searching functionalities... + +client = Client() +query = "Mia Khalifa" + + +def video_object_test(object): + for idx, video in enumerate(object): + assert isinstance(video.title, str) and len(video.title) > 0 + + if idx == 3: + break + + +def test_base_search(): + videos = client.search(query, pages=1) + for video in videos: + assert isinstance(video.title, str) and len(video.title) > 0 + + +def test_Sort_search(): + videos = client.search(query, sorting_Sort=Sort.Sort_rating) + videos_2 = client.search(query, sorting_Sort=Sort.Sort_relevance) + videos_3 = client.search(query, sorting_Sort=Sort.Sort_views) + videos_4 = client.search(query, sorting_Sort=Sort.Sort_length) + videos_5 = client.search(query, sorting_Sort=Sort.Sort_random) + videos_6 = client.search(query, sorting_Sort=Sort.Sort_upload_date) + + video_object_test(videos) + video_object_test(videos_2) + video_object_test(videos_3) + video_object_test(videos_4) + video_object_test(videos_5) + video_object_test(videos_6) + + +def test_SortVideoTime_search(): + videos = client.search(query, pages=1, sorting_Time=SortVideoTime.Sort_long) + videos_2 = client.search(query, pages=1, sorting_Time=SortVideoTime.Sort_all) + videos_3 = client.search(query, pages=1, sorting_Time=SortVideoTime.Sort_short) + videos_4 = client.search(query, pages=1, 
sorting_Time=SortVideoTime.Sort_middle) + videos_5 = client.search(query, pages=1, sorting_Time=SortVideoTime.Sort_really_long) + videos_6 = client.search(query, pages=1, sorting_Time=SortVideoTime.Sort_long_10_20min) + + + + video_object_test(videos) + video_object_test(videos_2) + video_object_test(videos_3) + video_object_test(videos_4) + video_object_test(videos_5) + video_object_test(videos_6) + + +def test_SortQuality_search(): + videos = client.search(query, pages=1, sort_Quality=SortQuality.Sort_720p) + videos_2 = client.search(query, pages=1, sort_Quality=SortQuality.Sort_all) + videos_3 = client.search(query, pages=1, sort_Quality=SortQuality.Sort_1080_plus) + + video_object_test(videos) + video_object_test(videos_2) + video_object_test(videos_3) + + +def test_SortDate_search(): + videos = client.search(query, pages=1, sorting_Date=SortDate.Sort_all) + videos_2 = client.search(query, pages=1, sorting_Date=SortDate.Sort_week) + videos_3 = client.search(query, pages=1, sorting_Date=SortDate.Sort_month) + videos_4 = client.search(query, pages=1, sorting_Date=SortDate.Sort_last_3_days) + videos_5 = client.search(query, pages=1, sorting_Date=SortDate.Sort_last_3_months) + videos_6 = client.search(query, pages=1, sorting_Date=SortDate.Sort_last_6_months) + + video_object_test(videos) + video_object_test(videos_2) + video_object_test(videos_3) + video_object_test(videos_4) + video_object_test(videos_5) + video_object_test(videos_6) + + + + + + diff --git a/xvideos_api/xvideos_api.py b/xvideos_api/xvideos_api.py index 2d24d09..128535b 100644 --- a/xvideos_api/xvideos_api.py +++ b/xvideos_api/xvideos_api.py @@ -51,7 +51,6 @@ def __init__(self, url): self.json_data = self.flatten_json(nested_json=self.extract_json_from_html()) self.script_content = self.get_script_content() - @classmethod def check_url(cls, url): match = REGEX_VIDEO_CHECK_URL.match(url) @@ -272,27 +271,40 @@ def get_video(cls, url): return Video(url) @classmethod - def search(cls, query, 
@classmethod
def extract_video_urls(cls, html_content):
    """
    Collect the link target of every result thumbnail in a search page.

    :param html_content: HTML of a search result page, as text.
    :return: List with the ``href`` of the first ``<a href=...>`` found inside
             each ``<div class="thumb">``, in document order. The values are
             returned exactly as they appear in the markup.
    """
    soup = BeautifulSoup(html_content, 'lxml')
    video_urls = []

    for thumb in soup.find_all('div', class_='thumb'):
        link = thumb.find('a', href=True)
        # find() gives None when the thumb contains no link; empty hrefs are skipped too.
        if link and link['href']:
            video_urls.append(link['href'])

    return video_urls


@classmethod
def search(cls, query, sorting_Sort: Sort = Sort.Sort_relevance, sorting_Date: SortDate = SortDate.Sort_all,
           sorting_Time: SortVideoTime = SortVideoTime.Sort_all, sort_Quality: SortQuality = SortQuality.Sort_all,
           pages=2):
    """
    Search xvideos.com and lazily yield a ``Video`` for every result link found.

    :param query: Search term. Spaces become ``+``; other reserved characters are percent-encoded.
    :param sorting_Sort: Value sent as the ``sort`` URL parameter.
    :param sorting_Date: Value sent as the ``datef`` URL parameter.
    :param sorting_Time: Value sent as the ``durf`` URL parameter.
    :param sort_Quality: Value sent as the ``quality`` URL parameter.
    :param pages: Number of result pages to request (``p=0`` up to ``p=pages-1``).
    :return: Generator of ``Video`` objects in page order. A page is only
             downloaded once the results of the previous one have been consumed.
    """
    # Imported locally so the method does not depend on the module's import block.
    from urllib.parse import quote_plus

    # safe="+" keeps '+' separators a caller may already have put into the query,
    # while characters such as '&', '#' or '=' can no longer break the parameter list.
    encoded_query = quote_plus(query, safe="+")
    search_url = (f"https://www.xvideos.com/?k={encoded_query}&sort={sorting_Sort}&datef={sorting_Date}"
                  f"&durf={sorting_Time}&quality={sort_Quality}")

    for page in range(pages):
        html_content = requests.get(f"{search_url}&p={page}").content.decode("utf-8")

        # The per-result names must stay distinct from search_url: reusing one variable
        # for both would send every later page request to a video URL.
        for href in cls.extract_video_urls(html_content):
            video_url = f"https://www.xvideos.com{href}"
            # Thumbs can link to things other than videos; keep only /video... links.
            if REGEX_VIDEO_CHECK_URL.match(video_url):
                yield Video(video_url)