diff --git a/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py b/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py index 67a058f478af..49c66ed0af78 100644 --- a/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py +++ b/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py @@ -1,7 +1,8 @@ import json import logging -from lxml.html.clean import Cleaner, autolink_html import re +import bleach +from html5lib.tokenizer import HTMLTokenizer from xmodule.progress import Progress import capa.xqueue_interface as xqueue_interface from capa.util import * @@ -50,24 +51,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface): return public_url -class WhiteListCleaner(Cleaner): - """ - By default, lxml cleaner strips out all links that are not in a defined whitelist. - We want to allow all links, and rely on the peer grading flagging mechanic to catch - the "bad" ones. So, don't define a whitelist at all. - """ - def allow_embedded_url(self, el, url): - """ - Override the Cleaner allow_embedded_url method to remove the whitelist url requirement. - Ensure that any tags not in the whitelist are stripped beforehand. - """ - - # Tell cleaner to strip any element with a tag that isn't whitelisted. - if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: - return False - - # Tell cleaner to allow all urls. - return True +# Used by sanitize_html +ALLOWED_HTML_ATTRS = { + '*': ['id', 'class', 'height', 'width', 'alt'], + 'a': ['href', 'title', 'rel', 'target'], + 'embed': ['src'], + 'iframe': ['src'], + 'img': ['src'], +} class OpenEndedChild(object): @@ -228,22 +219,23 @@ def sanitize_html(answer): answer - any string return - a cleaned version of the string """ - try: - answer = autolink_html(answer) - cleaner = WhiteListCleaner( - style=True, - links=True, - add_nofollow=False, - page_structure=True, - safe_attrs_only=True, - whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',) - ) - clean_html = cleaner.clean_html(answer) - clean_html = re.sub(r'

$', '', re.sub(r'^

', '', clean_html)) - clean_html = re.sub("\n","
", clean_html) - except Exception: - clean_html = answer - return clean_html + clean_html = bleach.clean(answer, + tags=['embed', 'iframe', 'a', 'img', 'br'], + attributes=ALLOWED_HTML_ATTRS, + strip=True) + autolinked = bleach.linkify(clean_html, + callbacks=[bleach.callbacks.target_blank], + skip_pre=True, + tokenizer=HTMLTokenizer) + return OpenEndedChild.replace_newlines(autolinked) + + @staticmethod + def replace_newlines(html): + """ + Replaces "\n" newlines with
+ """ + retv = re.sub(r'

$', '', re.sub(r'^

', '', html)) + return re.sub("\n","
", retv) def new_history_entry(self, answer): """ diff --git a/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py b/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py index 9aa5aeb5ce2c..47e6d9bdb728 100644 --- a/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py +++ b/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py @@ -1001,3 +1001,92 @@ def test_link_submission_success(self): self.assertTrue(response['success']) self.assertIn(self.answer_link, response['student_response']) self.assertIn(self.autolink_tag, response['student_response']) + + +class OpenEndedModuleUtilTest(unittest.TestCase): + """ + Tests for the util functions of OpenEndedModule. Currently just for the html_sanitizer and
inserter + """ + script_dirty = u'' + script_clean = u'alert("xss!")' + img_dirty = u'cats' + img_clean = u'cats' + embed_dirty = u'' + embed_clean = u'' + iframe_dirty = u'' + + text = u'I am a \u201c\xfcber student\u201d' + text_lessthan_noencd = u'This used to be broken < by the other parser. 3>5' + text_lessthan_encode = u'This used to be broken < by the other parser. 3>5' + text_linebreaks = u"St\xfcdent submission:\nI like lamp." + text_brs = u"St\xfcdent submission:
I like lamp." + + link_text = u'I love going to www.lolcatz.com' + link_atag = u'I love going to www.lolcatz.com' + + def test_script(self): + """ + Basic test for stripping