Skip to content

Commit

Permalink
RSS and sitemap sources ingesting too much (hard to differentiate)
Browse files — browse the repository at this point in the history
  • Loading branch information
battleoverflow committed May 19, 2023
1 parent bc686aa commit 2b64461
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 30 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

setup(
name='threatingestor',
version='1.1.0',
version='1.1.1',
include_package_data=True,
install_requires=requires,
extras_require={
Expand Down
16 changes: 4 additions & 12 deletions threatingestor/sources/rss.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import feedparser
import requests
import iocextract
import regex as re

try:
Expand Down Expand Up @@ -69,12 +67,8 @@ def run(self, saved_state):
artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
else:
# Default: self.feed_type == 'messy'.
data = requests.get(item.get('link')).text

if r in data:
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, item.get('link') or self.url)
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link') or self.url)

else:

Expand All @@ -86,10 +80,8 @@ def run(self, saved_state):
artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
else:
# Default: self.feed_type == 'messy'.
data = requests.get(item.get('link')).text
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, item.get('link') or self.url)
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link') or self.url)

saved_state = item.get('published') or item.get('updated')

Expand Down
21 changes: 4 additions & 17 deletions threatingestor/sources/sitemap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import requests
import datetime
import iocextract
import regex as re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
Expand Down Expand Up @@ -64,35 +63,23 @@ def run(self, saved_state):
if self.path is not None:

if self.path in row["loc"]:
data = requests.get(row["loc"]).text
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
artifacts += self.process_element(row["loc"], self.url)

# Only filters using a keyword
if self.path is None:
if x in row["loc"]:
data = requests.get(row["loc"]).text
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
artifacts += self.process_element(row["loc"], self.url)

elif self.filter is None and self.path is not None:
# Filters only by path in XML loc, no set filter
# Default: /path/name/*

if self.path in row["loc"]:
data = requests.get(row["loc"]).text
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
artifacts += self.process_element(row["loc"], self.url)

else:
# Locates all blog links within the sitemap
if "blog" in row["loc"]:
data = requests.get(row["loc"]).text
# Extract IOCs from HTML page content
payload = str(list(iocextract.extract_iocs(str(data))))
artifacts += self.process_element(payload, reference_link=row["loc"] or self.url)
artifacts += self.process_element(row["loc"], self.url)

return saved_state, artifacts

0 comments on commit 2b64461

Please sign in to comment.