Skip to content
This repository has been archived by the owner on Dec 16, 2017. It is now read-only.

Updated to use combine plugins vs. hardcoded urls/parsing #171

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@

## Maltrieve

Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mwcrawler). It retrieves malware directly from the sources as listed at a number of sites. Currently we crawl the following:
Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mwcrawler). It retrieves malware directly from the sources as listed at a number of sites. Currently we crawl the following (via included plugins):

* [Malc0de](http://malc0de.com/rss)
* [Malware Domain List](http://www.malwaredomainlist.com/hostslist/mdl.xml)
* [Malware URLs](http://malwareurls.joxeankoret.com/normal.txt)
* [VX Vault](http://vxvault.siri-urz.net/URL_List.php)
* [URLquery](http://urlquery.net/)
* [CleanMX](http://support.clean-mx.de/clean-mx/xmlviruses.php?)
* [ZeusTracker](https://zeustracker.abuse.ch/monitor.php?urlfeed=binaries)
* Additional plugins available: [Combine plugins](https://github.com/mlsecproject/combine/tree/dev/combine/plugins)

Other improvements include:

Expand All @@ -42,6 +40,7 @@ Maltrieve requires the following dependencies:
* [feedparser](https://pypi.python.org/pypi/feedparser)
* [python-magic](https://pypi.python.org/pypi/python-magic/)
* [Requests](http://www.python-requests.org)
* [Yapsy](https://pypi.python.org/pypi/Yapsy)

With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just `pip install -e .`. You may need to prepend that with ```sudo``` if not running in a virtual environment, but using such an environment is highly encouraged.

Expand Down
1 change: 1 addition & 0 deletions maltrieve.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ dumpdir = archive
logfile = maltrieve.log
logheaders = true
User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
plugin_dir = ./plugins/

#viper = http://127.0.0.1:8080
#cuckoo = http://127.0.0.1:8090
Expand Down
95 changes: 44 additions & 51 deletions maltrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import requests
from bs4 import BeautifulSoup

from yapsy.PluginManager import PluginManager


class config(object):

Expand All @@ -45,6 +47,13 @@ class config(object):
def __init__(self, args, filename='maltrieve.cfg'):
self.configp = ConfigParser.ConfigParser()
self.configp.read(filename)
self.plugin_dir = './plugins'

try:
if self.configp.get('Maltrieve', 'plugin_dir'):
self.plugin_dir = self.configp.get('Maltrieve', 'plugin_dir')
except Exception as e:
pass

if args.logfile or self.configp.get('Maltrieve', 'logfile'):
if args.logfile:
Expand Down Expand Up @@ -343,45 +352,6 @@ def save_malware(response, cfg):
return True


def process_xml_list_desc(response):
    """Extract malware URLs from an RSS/XML feed whose entry descriptions
    embed the URL as a whitespace-separated field (e.g. Malware Domain List).

    Args:
        response: Raw XML/RSS feed body as text.

    Returns:
        A set of URL strings, each guaranteed to start with 'http'.
    """
    feed = feedparser.parse(response)
    urls = set()

    for entry in feed.entries:
        desc = entry.description
        # The URL is normally the second whitespace-separated token; a '-'
        # placeholder means it appears in the fifth token instead.
        url = desc.split(' ')[1].rstrip(',')
        if url == '':
            continue
        if url == '-':
            url = desc.split(' ')[4].rstrip(',')
        # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
        url = re.sub('&amp;', '&', url)
        if not re.match('http', url):
            url = 'http://' + url
        urls.add(url)

    return urls


def process_xml_list_title(response):
    """Return the set of entry titles from an RSS/XML feed (each title is a
    malware URL), with HTML-escaped ampersands decoded.

    Args:
        response: Raw XML/RSS feed body as text.

    Returns:
        A set of URL strings.
    """
    feed = feedparser.parse(response)
    # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
    urls = set([re.sub('&amp;', '&', entry.title) for entry in feed.entries])
    return urls


def process_simple_list(response):
    """Parse a plain-text feed with one URL per line.

    Only lines starting with 'http' are kept; HTML-escaped ampersands are
    decoded so URLs are usable as-is.

    Args:
        response: Raw feed body as text.

    Returns:
        A set of URL strings.
    """
    # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
    urls = set([re.sub('&amp;', '&', line.strip())
                for line in response.split('\n')
                if line.startswith('http')])
    return urls


def process_urlquery(response):
    """Scrape malware URLs from a urlquery.net results page.

    Args:
        response: Raw HTML of the results page.

    Returns:
        A set of 'http://'-prefixed URLs taken from anchor text inside
        tables with CSS class "test", HTML-escaped ampersands decoded.
    """
    soup = BeautifulSoup(response)
    urls = set()
    for table in soup.find_all("table", class_="test"):
        for anchor in table.find_all("a"):
            # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
            urls.add('http://' + re.sub('&amp;', '&', anchor.text))
    return urls


def chunker(seq, size):
    """Yield successive slices of *seq*, each holding at most *size* items.

    Args:
        seq: Any sliceable sequence (list, string, tuple, ...).
        size: Maximum length of each chunk.

    Returns:
        A generator of slices; the final slice may be shorter than *size*.
    """
    start = 0
    total = len(seq)
    while start < total:
        yield seq[start:start + size]
        start += size

Expand Down Expand Up @@ -463,28 +433,51 @@ def main():
hashes = load_hashes('hashes.json')
past_urls = load_urls('urls.json')

print "Processing source URLs"
print 'Loading Plugins'
# Load the plugins from the plugin directory.
manager = PluginManager()
manager.setPluginPlaces([cfg.plugin_dir])
manager.collectPlugins()

# TODO: Replace with plugins
source_urls = {'https://zeustracker.abuse.ch/monitor.php?urlfeed=binaries': process_xml_list_desc,
'http://www.malwaredomainlist.com/hostslist/mdl.xml': process_xml_list_desc,
'http://malc0de.com/rss/': process_xml_list_desc,
'http://vxvault.net/URL_List.php': process_simple_list,
'http://urlquery.net/': process_urlquery,
'http://support.clean-mx.de/clean-mx/rss?scope=viruses&limit=0%2C64': process_xml_list_title,
'http://malwareurls.joxeankoret.com/normal.txt': process_simple_list}
headers = {'User-Agent': 'Maltrieve'}
source_urls = []
for plugin in manager.getAllPlugins():
print 'Processing: ' + plugin.plugin_object.get_name()
o_headers = None
try:
o_headers = plugin.plugin_object.get_headers()
except Exception as e:
pass # because we don't care if this isn't implemented in plugins
for url in plugin.plugin_object.get_URLs():
if url.startswith('file://'):
files.append(url.partition('://')[2])
else:
source_urls.append(url)

headers = {'User-Agent': 'Maltrieve'}
reqs = [grequests.get(url, timeout=60, headers=headers, proxies=cfg.proxy) for url in source_urls]
source_lists = grequests.map(reqs)

print "Completed source processing"
print "Completed source retrieval"

headers['User-Agent'] = cfg.useragent
malware_urls = set()
for response in source_lists:
if hasattr(response, 'status_code') and response.status_code == 200:
malware_urls.update(source_urls[response.url](response.text))
print "Processing feed from %s" % response.url
# Loop through all the plugins and see which ones have matching names
for plugin in manager.getAllPlugins():
urls = set(plugin.plugin_object.URLS)
# For those plugins that build dynamic URLs, we should get those for the comparison for parsing
urls = set(plugin.plugin_object.get_URLs())
if response.url in urls:
print 'Parsing feed from %s' % response.url
result = plugin.plugin_object.process_data(response.url, response.text)
for r in result:
if r['indicator_type'] == 'IPv4' or r['indicator_type'] == 'FQDN':
indicator = 'http://' + r['indicator']
malware_urls.add(indicator)
elif r['indicator_type'] == 'URL':
malware_urls.add(r['indicator'])

if cfg.inputfile:
with open(cfg.inputfile, 'rb') as f:
Expand Down
27 changes: 27 additions & 0 deletions plugins/joxeankoret.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import datetime

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Maltrieve source plugin for the Joxean Koret malware URL feed."""

    NAME = "joxeankoret"
    DIRECTION = "outbound"
    URLS = ['http://malwareurls.joxeankoret.com/normal.txt']

    def get_URLs(self):
        """Return the list of feed URLs to fetch."""
        return self.URLS

    def get_direction(self):
        """Return the traffic-direction label applied to indicators."""
        return self.DIRECTION

    def get_name(self):
        """Return this plugin's display name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the plain-text feed into indicator records.

        Every line beginning with 'http' becomes one URL indicator dict;
        all other lines are ignored.
        """
        today = str(datetime.date.today())
        records = []
        for entry in response.splitlines():
            if not entry.startswith('http'):
                continue
            records.append({'indicator': entry, 'indicator_type': "URL",
                            'indicator_direction': self.DIRECTION,
                            'source_name': self.NAME, 'source': source,
                            'date': today})
        return records
9 changes: 9 additions & 0 deletions plugins/joxeankoret.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = joxeankoret
Module = joxeankoret

[Documentation]
Author = sooshie@gmail.com
Version = 0.1
Website = http://secrepo.com
Description = Joxean Koret Malware URLs
39 changes: 39 additions & 0 deletions plugins/malwaredomainlist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import datetime
import feedparser
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Maltrieve source plugin for the Malware Domain List RSS feed."""

    NAME = "malwaredomainlist"
    DIRECTION = "outbound"
    URLS = ['http://www.malwaredomainlist.com/hostslist/mdl.xml']

    def get_URLs(self):
        """Return the list of feed URLs to fetch."""
        return self.URLS

    def get_direction(self):
        """Return the traffic-direction label applied to indicators."""
        return self.DIRECTION

    def get_name(self):
        """Return this plugin's display name."""
        return self.NAME

    def process_data(self, source, response):
        """Parse the MDL RSS feed into URL indicator records.

        Each entry description embeds the URL as its second
        whitespace-separated token; a '-' placeholder means the URL is
        in the fifth token instead. Entries with an empty URL are skipped.
        """
        current_date = str(datetime.date.today())
        data = []
        feed = feedparser.parse(response)

        for entry in feed.entries:
            desc = entry.description
            url = desc.split(' ')[1].rstrip(',')
            if url == '':
                continue
            if url == '-':
                url = desc.split(' ')[4].rstrip(',')
            # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
            url = re.sub('&amp;', '&', url)
            if not re.match('http', url):
                url = 'http://' + url
            data.append({'indicator': url, 'indicator_type': "URL", 'indicator_direction': self.DIRECTION,
                         'source_name': self.NAME, 'source': source, 'note': desc, 'date': current_date})
        return data
9 changes: 9 additions & 0 deletions plugins/malwaredomainlist.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = malwaredomainlist
Module = malwaredomainlist

[Documentation]
Author = sooshie@gmail.com
Version = 0.1
Website = http://secrepo.com
Description = Malware Domain List
31 changes: 31 additions & 0 deletions plugins/urlquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import datetime
import bs4
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Maltrieve source plugin that scrapes urlquery.net result pages."""

    NAME = "urlquery"
    DIRECTION = "outbound"
    URLS = ['https://urlquery.net/']

    def get_URLs(self):
        """Return the list of feed URLs to fetch."""
        return self.URLS

    def get_direction(self):
        """Return the traffic-direction label applied to indicators."""
        return self.DIRECTION

    def get_name(self):
        """Return this plugin's display name."""
        return self.NAME

    def process_data(self, source, response):
        """Scrape indicator records from a urlquery.net HTML page.

        Pulls anchor text from tables with CSS class "test", prefixes
        each with 'http://', and emits one URL indicator dict per anchor.
        """
        current_date = str(datetime.date.today())
        data = []
        soup = bs4.BeautifulSoup(response)
        for t in soup.find_all("table", class_="test"):
            for a in t.find_all("a"):
                # Decode HTML-escaped ampersands; re.sub('&', '&', ...) was a no-op.
                indicator = 'http://' + re.sub('&amp;', '&', a.text)
                data.append({'indicator': indicator, 'indicator_type': "URL", 'indicator_direction': self.DIRECTION,
                             'source_name': self.NAME, 'source': source, 'date': current_date})
        return data
9 changes: 9 additions & 0 deletions plugins/urlquery.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = urlquery
Module = urlquery

[Documentation]
Author = sooshie@gmail.com
Version = 0.1
Website = http://secrepo.com
Description = URLQuery
28 changes: 28 additions & 0 deletions plugins/vxvault.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import datetime
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Maltrieve source plugin for the VX Vault URL list."""

    NAME = "vxvault"
    DIRECTION = "outbound"
    URLS = ['http://vxvault.net/URL_List.php']

    def get_URLs(self):
        """Return the list of feed URLs to fetch."""
        return self.URLS

    def get_direction(self):
        """Return the traffic-direction label applied to indicators."""
        return self.DIRECTION

    def get_name(self):
        """Return this plugin's display name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the plain-text feed into indicator records.

        Every line beginning with 'http' becomes one URL indicator dict;
        all other lines are ignored.
        """
        today = str(datetime.date.today())
        records = []
        for entry in response.splitlines():
            if not entry.startswith('http'):
                continue
            records.append({'indicator': entry, 'indicator_type': "URL",
                            'indicator_direction': self.DIRECTION,
                            'source_name': self.NAME, 'source': source,
                            'date': today})
        return records
9 changes: 9 additions & 0 deletions plugins/vxvault.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = vxvault
Module = vxvault

[Documentation]
Author = sooshie@gmail.com
Version = 0.1
Website = http://secrepo.com
Description = VX Vault
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
'pytest-cov',
'coveralls',
'LinkChecker',
'yapsy',
'markdown'
],
package_dir={'maltrieve': 'src'},
Expand Down
18 changes: 0 additions & 18 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,24 +71,6 @@ def test_create_default_dumpdir_when_specified_doesnt_exist():
assert cfg.dumpdir == '/tmp/malware'


def test_parse_simple_list():
source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text
assert maltrieve.process_simple_list(source) == \
set(['http://example.org/mylist', 'http://example.com/yourlist'])


def test_parse_xml_list():
source = requests.get('http://xwell.org/assets/maltrieve-test-list.xml').text
assert maltrieve.process_xml_list_title(source) == \
set(['http://example.org/mylist', 'http://example.com/yourlist'])


def test_parse_xml_desc():
source = requests.get('http://xwell.org/assets/maltrieve-test-desc.xml').text
assert maltrieve.process_xml_list_desc(source) == \
set(['http://example.org/mylist', 'http://example.com/yourlist'])


def test_load_hashes(hashfile='test-load-hashes.json'):
assert maltrieve.load_hashes(hashfile) == \
set(['d41d8cd98f00b204e9800998ecf8427e'])
Expand Down