From f0bb8a45268a61e240188376547454579d25f600 Mon Sep 17 00:00:00 2001 From: "Kyle D. McCormick" Date: Mon, 24 Jun 2024 16:52:31 -0400 Subject: [PATCH] feat: generate OLX with single-quoted attributes (WIP) --- mypy.ini | 3 +- xmodule/modulestore/xml_exporter.py | 17 ++--- xmodule/util/olx.py | 98 +++++++++++++++++++++++++++++ xmodule/xml_block.py | 5 +- 4 files changed, 112 insertions(+), 11 deletions(-) create mode 100644 xmodule/util/olx.py diff --git a/mypy.ini b/mypy.ini index 5027c3bb559..29e5414d4c4 100644 --- a/mypy.ini +++ b/mypy.ini @@ -12,7 +12,8 @@ files = openedx/core/djangoapps/xblock, openedx/core/types, openedx/core/djangoapps/content_tagging, - xmodule/util/keys.py + xmodule/util/keys.py, + xmodule/util/olx.py [mypy.plugins.django-stubs] # content_staging only works with CMS; others work with either, so we run mypy with CMS settings. diff --git a/xmodule/modulestore/xml_exporter.py b/xmodule/modulestore/xml_exporter.py index 84986e2c153..133feb5b74b 100644 --- a/xmodule/modulestore/xml_exporter.py +++ b/xmodule/modulestore/xml_exporter.py @@ -1,10 +1,11 @@ """ Methods for exporting course data to XML """ - +from __future__ import annotations import logging import os +import typing as t from abc import abstractmethod from json import dumps @@ -24,6 +25,7 @@ from xmodule.modulestore.draft_and_published import DIRECT_ONLY_CATEGORIES from xmodule.modulestore.inheritance import own_metadata from xmodule.modulestore.store_utilities import draft_node_constructor, get_draft_subtree_roots +from xmodule.util.olx import write_xml_to_file DRAFT_DIR = "drafts" PUBLISHED_DIR = "published" @@ -202,8 +204,8 @@ def get_courselike(self): return self.modulestore.get_course(self.courselike_key, depth=None, lazy=False) def process_root(self, root, export_fs): - with export_fs.open('course.xml', 'wb') as course_xml: - lxml.etree.ElementTree(root).write(course_xml, encoding='utf-8') + with export_fs.open('course.xml', 'w') as course_xml: + write_xml_to_file(root, course_xml) def process_extra(self, root, courselike, root_courselike_dir, xml_centric_courselike_key, export_fs): # Export the modulestore's asset metadata. @@ -216,8 +218,8 @@ def process_extra(self, root, courselike, root_courselike_dir, xml_centric_cours # All asset types are exported using the "asset" tag - but their asset type is specified in each asset key. asset = lxml.etree.SubElement(asset_root, AssetMetadata.ASSET_XML_TAG) asset_md.to_xml(asset) - with OSFS(asset_dir).open(AssetMetadata.EXPORTED_ASSET_FILENAME, 'wb') as asset_xml_file: - lxml.etree.ElementTree(asset_root).write(asset_xml_file, encoding='utf-8') + with OSFS(asset_dir).open(AssetMetadata.EXPORTED_ASSET_FILENAME, 'w') as asset_xml_file: + write_xml_to_file(asset_root, asset_xml_file) # export the static assets policies_dir = export_fs.makedir('policies', recreate=True) @@ -344,9 +346,8 @@ def post_process(self, root, export_fs): called library.xml. """ # Create the Library.xml file, which acts as the index of all library contents. - xml_file = export_fs.open(LIBRARY_ROOT, 'wb') - xml_file.write(lxml.etree.tostring(root, pretty_print=True, encoding='utf-8')) - xml_file.close() + with export_fs.open(LIBRARY_ROOT, 'w') as xml_file: + write_xml_to_file(root, xml_file) def export_course_to_xml(modulestore, contentstore, course_key, root_dir, course_dir): diff --git a/xmodule/util/olx.py b/xmodule/util/olx.py new file mode 100644 index 00000000000..c65201481be --- /dev/null +++ b/xmodule/util/olx.py @@ -0,0 +1,98 @@ +""" +Utility methods for parsing/generating OLX. +""" +from __future__ import annotations + +import typing as t +from abc import abstractmethod + +from lxml import etree + + +def write_xml_to_file(root_node: etree._Element, target_file: t.IO[str]) -> None: + """ + @@TODO docstring + @@TODO unit tests + """ + xml_str = etree.tostring(root_node, pretty_print=True, encoding='utf-8').decode('utf-8') + convert_quotes = lambda: True # @@TODO: Make this a waffle flag? + if convert_quotes(): + xml_chars = iter(xml_str) # @@TODO: try to optimize this by streaming xml rather than rendering xml_str? + target_file.write("".join(_convert_attrs_to_single_quotes(xml_str))) + else: + target_file.write(xml_str) + + +def _convert_attrs_to_single_quotes(xml_chars: t.Iterator[str]) -> t.Iterable[str]: + """ + Given a stream of chars in an XML string, convert the quoutes around attrs from double to single. + + This means exchanging double-quote escape sequences in attributes values (") for literal double quotes ("). + It also means exchanging literal single quotes (') for single-quote escape sequences ('). + Since OLX heavily uses JSON it our XML attribute values, this conversion generally nets clearner XML. + + Example: + Before: + ... + After: + ... + + ASSUMPTIONS: + * String is valid XML based on: <@@TODO link to spec> + * String is rendered by Python's LXML library. + * @@TODO state other syntax assumptions + """ + try: + while c := next(xml_chars): + + # We're outside of any tag until we see < + + if c == "<": + yield "<" + + # We're inside a tag: < ... > + + while c := next(xml_chars): + if c == ">": + yield ">" + + break # Done with the tag, back to the outside + + elif c == "\"": + + yield "'" # Convert double-quote close to single-quote close + + # We're inside an attribute value: + + while c := next(xml_chars): + if c == "\"": + yield "'" # Convert double-quote close to single-quote close + + break # Done with the attribute value, back to the tag + + elif c == "'": + + yield "'" # Single quote in attr --> convert to escape sequence + + elif c == "&": + + # We're inside an escape sequence: + escape_code = "" + while c := next(xml_chars): + if c == ";": + break # Done with escape sequence + else: + escape_code += c + if escape_code == "quot": + yield "\"" # Escape sequence is " --> convert to literal double-qoute + else: + yield "&{escape_code};" # Any other escape sequence --> leave it as-is + + else: + yield c # yield char from attribute value + else: + yield c # yield char from inside a tag + else: + yield c # yield char from outside tags + except StopIteration: + pass diff --git a/xmodule/xml_block.py b/xmodule/xml_block.py index 2753d455adc..0cf41a39c0d 100644 --- a/xmodule/xml_block.py +++ b/xmodule/xml_block.py @@ -12,6 +12,7 @@ from xblock.runtime import KvsFieldData from xmodule.modulestore import EdxJSONEncoder from xmodule.modulestore.inheritance import InheritanceKeyValueStore, own_metadata +from xmodule.util.olx import write_xml_to_file log = logging.getLogger(__name__) @@ -472,8 +473,8 @@ def add_xml_to_node(self, node): self.location.run if self.category == 'course' else url_path, ) self.runtime.export_fs.makedirs(os.path.dirname(filepath), recreate=True) - with self.runtime.export_fs.open(filepath, 'wb') as fileobj: - ElementTree(xml_object).write(fileobj, pretty_print=True, encoding='utf-8') + with self.runtime.export_fs.open(filepath, 'w') as fileobj: + write_xml_to_file(xml_object, fileobj) else: # Write all attributes from xml_object onto node node.clear()