diff --git a/setup.py b/setup.py
index 4d26227..0af81bb 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
 setup(
     name='threatingestor',
-    version='1.1.0',
+    version='1.1.1',
     include_package_data=True,
     install_requires=requires,
     extras_require={
diff --git a/threatingestor/sources/rss.py b/threatingestor/sources/rss.py
index 8c440cd..923b1a6 100644
--- a/threatingestor/sources/rss.py
+++ b/threatingestor/sources/rss.py
@@ -1,6 +1,4 @@
 import feedparser
-import requests
-import iocextract
 import regex as re
 
 try:
@@ -69,12 +67,8 @@ def run(self, saved_state):
                         artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
                     else:
                         # Default: self.feed_type == 'messy'.
-                        data = requests.get(item.get('link')).text
-
-                        if r in data:
-                            # Extract IOCs from HTML page content
-                            payload = str(list(iocextract.extract_iocs(str(data))))
-                            artifacts += self.process_element(payload, item.get('link') or self.url)
+                        text = soup.get_text(separator=' ')
+                        artifacts += self.process_element(text, item.get('link') or self.url)
             else:
@@ -86,10 +80,8 @@ def run(self, saved_state):
                     artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
                 else:
                     # Default: self.feed_type == 'messy'.
-                    data = requests.get(item.get('link')).text
-                    # Extract IOCs from HTML page content
-                    payload = str(list(iocextract.extract_iocs(str(data))))
-                    artifacts += self.process_element(payload, item.get('link') or self.url)
+                    text = soup.get_text(separator=' ')
+                    artifacts += self.process_element(text, item.get('link') or self.url)
 
             saved_state = item.get('published') or item.get('updated')
diff --git a/threatingestor/sources/sitemap.py b/threatingestor/sources/sitemap.py
index 1ed859e..30f9a67 100644
--- a/threatingestor/sources/sitemap.py
+++ b/threatingestor/sources/sitemap.py
@@ -1,6 +1,5 @@
 import requests
 import datetime
-import iocextract
 import regex as re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
@@ -64,35 +63,23 @@ def run(self, saved_state):
                     if self.path is not None:
                         if self.path in row["loc"]:
-                            data = requests.get(row["loc"]).text
-                            # Extract IOCs from HTML page content
-                            payload = str(list(iocextract.extract_iocs(str(data))))
-                            artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
+                            artifacts += self.process_element(row["loc"], self.url)
 
                     # Only filters using a keyword
                     if self.path is None:
                         if x in row["loc"]:
-                            data = requests.get(row["loc"]).text
-                            # Extract IOCs from HTML page content
-                            payload = str(list(iocextract.extract_iocs(str(data))))
-                            artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
+                            artifacts += self.process_element(row["loc"], self.url)
 
             elif self.filter is None and self.path is not None:
                 # Filters only by path in XML loc, no set filter
                 # Default: /path/name/*
                 if self.path in row["loc"]:
-                    data = requests.get(row["loc"]).text
-                    # Extract IOCs from HTML page content
-                    payload = str(list(iocextract.extract_iocs(str(data))))
-                    artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
+                    artifacts += self.process_element(row["loc"], self.url)
 
             else:
                 # Locates all blog links within the sitemap
                 if "blog" in row["loc"]:
-                    data = requests.get(row["loc"]).text
-                    # Extract IOCs from HTML page content
-                    payload = str(list(iocextract.extract_iocs(str(data))))
-                    artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
+                    artifacts += self.process_element(row["loc"], self.url)
 
         return saved_state, artifacts
\ No newline at end of file