Skip to content

Commit

Permalink
Add exclusion to sitemap source
Browse files (browse the repository at this point in the history)
  • Loading branch information
battleoverflow committed Oct 13, 2023
1 parent b387488 commit 6ffff46
Showing 1 changed file with 29 additions and 9 deletions.
38 changes: 29 additions & 9 deletions threatingestor/sources/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

class Plugin(Source):

def __init__(self, name, url, filter=None, path=None):
def __init__(self, name, url, include=None, exclude=None, path=None):
self.name = name
self.url = url
self.filter = filter
self.include = include
self.exclude = exclude
self.path = path

def run(self, saved_state):
Expand Down Expand Up @@ -47,13 +48,13 @@ def run(self, saved_state):
[x.unwrap() for x in soup.find_all('i')]
soup = BeautifulSoup(soup.decode(), 'html.parser')

if self.filter is not None:
if self.exclude is not None:
# Regex input via config.yml
# Example: security|threat|malware
xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))

# Iterates over the regex output to locate all provided keywords
for x in xml_query:
for xe in xml_exclude:
# Uses a path instead of a keyword
if self.path is not None:
if self.path in loc:
Expand All @@ -62,19 +63,38 @@ def run(self, saved_state):

# Only filters using a keyword
if self.path is None:
if x in loc:
if xe not in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

elif self.filter is None and self.path is not None:
# Filters only by path in XML loc, no set filter
if self.include is not None:
# Regex input via config.yml
# Example: security|threat|malware
xml_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))

# Iterates over the regex output to locate all provided keywords
for xi in xml_include:
# Uses a path instead of a keyword
if self.path is not None:
if self.path in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

# Only filters using a keyword
if self.path is None:
if xi in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

if self.include is None or self.exclude is None and self.path is not None:
# Filters only by path in XML loc, no set include
# Default: /path/name/*

if self.path in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

else:
if self.include is None and self.path is None and self.exclude is None:
# Locates all blog links within the sitemap
if "blog" in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
Expand Down

0 comments on commit 6ffff46

Please sign in to comment.