-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* first stage added to submodule * refactors log analysis in add-data * adds happy path test and start of acceptance test * adds/fixes tests * adds pipeline and collection validation * adds acceptance tests * tidies logs and comments * adds consecutive run handling (temporary) * parametrizes tests * raise exception for duplicate endpoint added * fixes log endpoint message * modifies collection.load() to handle logs without entry in source.csv * adds log message to collection.load() * adds extra url validity check * adds tests for collection and is_url_valid * removes unnecessary raise * removes unnecessary pipeline_dir fixtures * uses specification and organisation classes, moves validation to utils * removes log_path from Collector.fetch return * raises HTTPError when failing to collect from URL * adds logging to try except in collection.py * changes default logging in collection.py to True * renames logging boolean * renames error logging boolean * removes collection.py edits, now deletes log after exiting --------- Co-authored-by: averheecke-tpx <[email protected]>
- Loading branch information
1 parent
290ea3e
commit 0019785
Showing
9 changed files
with
1,074 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import os | ||
from datetime import datetime | ||
from urllib.parse import urlparse | ||
|
||
from digital_land.collect import Collector | ||
|
||
|
||
def is_url_valid(url, url_type):
    """Check that ``url`` looks like a usable http(s) URL.

    Args:
        url: The URL string to validate. ``None`` and blank strings are
            reported as unpopulated.
        url_type: Label for the URL, interpolated into the returned message.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``""`` when the URL
        passes every check, otherwise it describes the first failed check.
    """
    if not url or url.strip() == "":
        return False, f"The {url_type} must be populated"

    # urlparse raises ValueError for some malformed inputs (for example an
    # unclosed IPv6 bracket such as "http://[bad"); report that as a
    # validation failure instead of letting the exception escape.
    try:
        parsed_url = urlparse(url)
    except ValueError:
        return False, f"The {url_type} is not a valid URL"

    # is url scheme valid i.e start with http:// or https://
    # (an empty scheme is not in the tuple, so it is rejected here as well)
    if parsed_url.scheme not in ("http", "https"):
        return False, f"The {url_type} must start with 'http://' or 'https://'"

    # does url have domain
    if not parsed_url.netloc:
        return False, f"The {url_type} must have a domain"

    # ensure domain has correct format
    if "." not in parsed_url.netloc:
        return (
            False,
            f"The {url_type} must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')",
        )

    return True, ""
|
||
|
||
def is_date_valid(date, date_type):
    """Check that ``date`` is a ``YYYY-MM-DD`` string that is not in the future.

    Args:
        date: The date string to validate. ``None`` and ``""`` are reported
            as blank.
        date_type: Label for the date, interpolated into the format and
            future-date messages (the blank message does not include it).

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``""`` when the date
        is valid, otherwise it describes the first failed check.
    """
    # Truthiness rather than len() so that None is reported as blank
    # instead of raising TypeError.
    if not date:
        return False, "Date is blank"
    try:
        date = datetime.strptime(date, "%Y-%m-%d").date()
    # need to catch ValueError here otherwise datetime will raise its own error, not the clear format we want
    except ValueError:
        return False, f"{date_type} {date} must be format YYYY-MM-DD"

    # Compared against today's date as reported by datetime.today().
    if date > datetime.today().date():
        return False, f"The {date_type} {date} cannot be in the future"

    return True, ""
|
||
|
||
def clear_log(collection_dir, endpoint):
    """Delete the collector log file for ``endpoint`` if one exists.

    The path is obtained from ``Collector.log_path`` using the current UTC
    time, so this targets whichever log that method resolves for "now"
    (presumably today's log for the endpoint; confirm against Collector).

    Args:
        collection_dir: Directory passed to ``Collector`` as ``collection_dir``.
        endpoint: Endpoint identifier passed to ``Collector.log_path``.
    """
    now = datetime.utcnow()
    target = Collector(collection_dir=collection_dir).log_path(now, endpoint)

    # Nothing to do when there is no regular file at the resolved path.
    if not os.path.isfile(target):
        return
    os.remove(target)
Oops, something went wrong.