From 6ffff4694cfdbf8a7737bf995c60ea2c16e23d9b Mon Sep 17 00:00:00 2001 From: azazelm3dj3d <56496067+azazelm3dj3d@users.noreply.github.com> Date: Fri, 13 Oct 2023 11:48:10 -0500 Subject: [PATCH] Add exclusion to sitemap source --- threatingestor/sources/sitemap.py | 38 +++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/threatingestor/sources/sitemap.py b/threatingestor/sources/sitemap.py index 677b918..53a1674 100644 --- a/threatingestor/sources/sitemap.py +++ b/threatingestor/sources/sitemap.py @@ -8,10 +8,11 @@ class Plugin(Source): - def __init__(self, name, url, filter=None, path=None): + def __init__(self, name, url, include=None, exclude=None, path=None): self.name = name self.url = url - self.filter = filter + self.include = include + self.exclude = exclude self.path = path def run(self, saved_state): @@ -47,13 +48,13 @@ def run(self, saved_state): [x.unwrap() for x in soup.find_all('i')] soup = BeautifulSoup(soup.decode(), 'html.parser') - if self.filter is not None: + if self.exclude is not None: # Regex input via config.yml # Example: security|threat|malware - xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|'))) + xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|'))) # Iterates over the regex output to locate all provided keywords - for x in xml_query: + for xe in xml_exclude: # Uses a path instead of a keyword if self.path is not None: if self.path in loc: @@ -62,19 +63,38 @@ def run(self, saved_state): # Only filters using a keyword if self.path is None: - if x in loc: + if xe not in loc: text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True) - elif self.filter is None and self.path is not None: - # Filters only by path in XML loc, no set filter + if self.include is not None: + # Regex input via config.yml + # Example: security|threat|malware + xml_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|'))) + + # Iterates over the regex output to locate all provided keywords + for xi in xml_include: + # Uses a path instead of a keyword + if self.path is not None: + if self.path in loc: + text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] + artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True) + + # Only filters using a keyword + if self.path is None: + if xi in loc: + text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] + artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True) + + if self.include is None or self.exclude is None and self.path is not None: + # Filters only by path in XML loc, no set include # Default: /path/name/* if self.path in loc: text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True) - else: + if self.include is None and self.path is None and self.exclude is None: # Locates all blog links within the sitemap if "blog" in loc: text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]