From 79ce043219b2ab35c635f2de6a28031051d5ca32 Mon Sep 17 00:00:00 2001
From: Jason Bau
Date: Mon, 21 Oct 2013 17:06:32 -0700
Subject: [PATCH 1/2] use bleach instead of lxml.html.clean for sanitize_html
OEE
---
.../openendedchild.py | 57 +++++++----------
.../xmodule/tests/test_combined_open_ended.py | 62 +++++++++++++++++++
requirements/edx/base.txt | 2 +
3 files changed, 86 insertions(+), 35 deletions(-)
diff --git a/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py b/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py
index 67a058f478af..4804ffbf8151 100644
--- a/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py
+++ b/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py
@@ -1,7 +1,7 @@
import json
import logging
-from lxml.html.clean import Cleaner, autolink_html
import re
+import bleach
from xmodule.progress import Progress
import capa.xqueue_interface as xqueue_interface
from capa.util import *
@@ -50,24 +50,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface):
return public_url
-class WhiteListCleaner(Cleaner):
- """
- By default, lxml cleaner strips out all links that are not in a defined whitelist.
- We want to allow all links, and rely on the peer grading flagging mechanic to catch
- the "bad" ones. So, don't define a whitelist at all.
- """
- def allow_embedded_url(self, el, url):
- """
- Override the Cleaner allow_embedded_url method to remove the whitelist url requirement.
- Ensure that any tags not in the whitelist are stripped beforehand.
- """
-
- # Tell cleaner to strip any element with a tag that isn't whitelisted.
- if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
- return False
-
- # Tell cleaner to allow all urls.
- return True
+# Used by sanitize_html
+ALLOWED_HTML_ATTRS = {
+ '*': ['id', 'class', 'height', 'width', 'alt'],
+ 'a': ['href', 'title', 'rel'],
+ 'embed': ['src'],
+ 'iframe': ['src'],
+ 'img': ['src'],
+}
class OpenEndedChild(object):
@@ -228,22 +218,19 @@ def sanitize_html(answer):
answer - any string
return - a cleaned version of the string
"""
- try:
- answer = autolink_html(answer)
- cleaner = WhiteListCleaner(
- style=True,
- links=True,
- add_nofollow=False,
- page_structure=True,
- safe_attrs_only=True,
- whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',)
- )
- clean_html = cleaner.clean_html(answer)
- clean_html = re.sub(r'
$', '', re.sub(r'^', '', clean_html))
- clean_html = re.sub("\n","
", clean_html)
- except Exception:
- clean_html = answer
- return clean_html
+ clean_html = bleach.clean(answer,
+ tags=['embed', 'iframe', 'a', 'img', 'br'],
+ attributes=ALLOWED_HTML_ATTRS,
+ strip=True)
+ return OpenEndedChild.replace_newlines(clean_html)
+
+ @staticmethod
+ def replace_newlines(html):
+ """
+ Replaces "\n" newlines with
+ """
+ retv = re.sub(r'
$', '', re.sub(r'^', '', html))
+ return re.sub("\n","
", retv)
def new_history_entry(self, answer):
"""
diff --git a/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py b/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py
index 9aa5aeb5ce2c..aaebd0382144 100644
--- a/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py
+++ b/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py
@@ -1001,3 +1001,65 @@ def test_link_submission_success(self):
self.assertTrue(response['success'])
self.assertIn(self.answer_link, response['student_response'])
self.assertIn(self.autolink_tag, response['student_response'])
+
+
+class OpenEndedModuleUtilTest(unittest.TestCase):
+ """
+ Tests for the util functions of OpenEndedModule. Currently just for the html_sanitizer and
inserter
+ """
+ script_dirty = u''
+ script_clean = u'alert("xss!")'
+ img_dirty = u''
+ img_clean = u''
+ embed_dirty = u'