SDAAP-75 Add function to remove embedded media content

superdesk · Jul 10, 2023 · 147b08c · 147b08c
1 parent 674df75
commit 147b08c
Show file tree

Hide file tree

Showing 7 changed files with 1,140 additions and 54 deletions.
diff --git a/superdesk/default_settings.py b/superdesk/default_settings.py
@@ -1062,3 +1062,9 @@ def local_to_utc_hour(hour):
 #: .. versionadded:: 2.4.1
 #:
 APM_SERVICE_NAME = env("APM_SERVICE_NAME")
+
+#: Apply product filtering to embedded media items
+#:
+#: When enabled, the FTP NINJS formatter removes the embedded media items (``editor_*`` associations)
+#: of a text item when none of the products matching the embedded item is one of the subscriber's products.
+#: Read from the ``EMBED_PRODUCT_FILTERING`` environment variable, disabled (``false``) by default.
+#:
+#: .. versionadded::
+#:
+EMBED_PRODUCT_FILTERING = strtobool(env("EMBED_PRODUCT_FILTERING", "false"))
diff --git a/superdesk/editor_utils.py b/superdesk/editor_utils.py
@@ -167,6 +167,10 @@ def insert(self, index, value):
         self._ranges.insert(index, value.ranges)
         self._mapping[value.key] = value.data
 
+    def clear(self):
+        """Remove every entity attached to this block.
+
+        The ranges are iterated through a snapshot: deleting an entity is expected to shrink
+        ``self._ranges`` (NOTE(review): ``__delitem__`` is not visible in this hunk - confirm), and
+        removing items from a list while iterating over it skips elements.
+        """
+        for entity_range in list(self._ranges):
+            del self[entity_range.get("key")]
+
 
 class Block:
     """Abstraction of DraftJS block"""
@@ -855,3 +859,34 @@ def copy_fields(source: Dict, dest: Dict, ignore_empty: bool = False):
         for field in source["fields_meta"]:
             if ignore_empty is False or not is_empty_content_state(source, field):
                 dest.setdefault("fields_meta", {})[field] = source["fields_meta"][field].copy()
+
+
+def remove_all_embeds(article):
+    """
+    Removes any embeds from the draftjs state and regenerates the html, can be used by text only
+    formatters to remove embeds from the article.
+
+    The article is modified in place: every content state field is filtered so that atomic blocks are
+    rejected, and the ``editor_<entity key>`` entries of the rejected blocks are removed from
+    ``associations`` and ``refs``.
+
+    :param article: the item to remove the embeds from
+    :return: None
+    """
+
+    # Association/ref keys of the removed entities
+    removed_keys = set()
+
+    def not_embed(block):
+        if block.type.lower() != "atomic":
+            return True
+        removed_keys.update("editor_{}".format(e.key) for e in block.entities)
+        block.entities.clear()
+        return False
+
+    for field in get_content_state_fields(article):
+        filter_blocks(article, field, not_embed)
+
+    if not removed_keys:
+        return
+
+    # ``associations`` may be missing or set to None
+    associations = article.get("associations") or {}
+    for key in removed_keys:
+        associations.pop(key, None)
+
+    # Filter the refs in a single pass instead of rebuilding the list once per removed key
+    if "refs" in article:
+        article["refs"] = [r for r in article.get("refs") or [] if r["key"] not in removed_keys]
diff --git a/superdesk/publish/formatters/__init__.py b/superdesk/publish/formatters/__init__.py
@@ -15,6 +15,7 @@
 from superdesk.metadata.item import ITEM_TYPE, CONTENT_TYPE, FORMATS, FORMAT
 from superdesk.etree import parse_html
 from superdesk.text_utils import get_text
+from superdesk.editor_utils import get_content_state_fields, filter_blocks
 
 formatters = []  # type: List[Type[Formatter]]
 

diff --git a/superdesk/publish/formatters/email_formatter.py b/superdesk/publish/formatters/email_formatter.py
@@ -17,6 +17,7 @@
 from copy import deepcopy
 from superdesk.errors import FormatterError
 from superdesk import etree as sd_etree
+from superdesk.editor_utils import remove_all_embeds
 
 
 class EmailFormatter(Formatter):
@@ -55,6 +56,7 @@ def _inject_dateline(self, formatted_article):
 
     def format(self, article, subscriber, codes=None):
         formatted_article = deepcopy(article)
+        remove_all_embeds(formatted_article)
         pub_seq_num = superdesk.get_resource_service("subscribers").generate_sequence_number(subscriber)
         doc = {}
         try:

diff --git a/superdesk/publish/formatters/ninjs_ftp_formatter.py b/superdesk/publish/formatters/ninjs_ftp_formatter.py
@@ -10,11 +10,15 @@
 
 
 from .ninjs_formatter import NINJSFormatter
+from flask import current_app as app
 from superdesk.media.renditions import get_rendition_file_name
-from lxml import html as lxml_html
-from superdesk.etree import to_string
+from superdesk import get_resource_service
+from superdesk.editor_utils import get_content_state_fields, Editor3Content, DraftJSHTMLExporter
+from superdesk.media.renditions import get_renditions_spec
+from draftjs_exporter.dom import DOM
+from copy import deepcopy
+from textwrap import dedent
 import logging
-import re
 
 logger = logging.getLogger(__name__)
 
@@ -28,15 +32,7 @@ def __init__(self):
         super().__init__()
         self.format_type = "ftp ninjs"
         self.internal_renditions = []
-
-    def _get_source_ref(self, marker, ninjs):
-        try:
-            return ninjs.get("associations").get(marker).get("renditions").get("original").get("href")
-        except Exception:
-            logger.warning(
-                "href not found for the original in FTP NINJS formatter, ensure the formatter has it enabled"
-            )
-            return None
+        self.path = None
 
     def _transform_to_ninjs(self, article, subscriber, recursive=True):
         """
@@ -47,48 +43,215 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
         :param recursive:
         :return:
         """
-
+        # Get the path that the renditions will be pushed to
+        self.path = subscriber.get("destinations")[0].get("config").get("associated_path")
         include_original = subscriber.get("destinations")[0].get("config").get("include_original", False)
         if include_original:
             self.internal_renditions = ["original"]
 
-        ninjs = super()._transform_to_ninjs(article, subscriber, recursive)
+        formatted_article = deepcopy(article)
 
-        # Get the path that the renditions will be pushed to
-        path = subscriber.get("destinations")[0].get("config").get("associated_path")
+        if article.get("type") == "text" and recursive:
+            self.apply_product_filtering_to_associations(formatted_article, subscriber)
+
+        ninjs = super()._transform_to_ninjs(formatted_article, subscriber, recursive)
+
+        renditions = ninjs.get("renditions")
+        if renditions:
+            for name, rendition in renditions.items():
+                rendition["href"] = (
+                    self.path.lstrip("/")
+                    + ("/" if not self.path.endswith("/") and self.path else "")
+                    + get_rendition_file_name(rendition)
+                )
+
+        return ninjs
+
+    def apply_product_filtering_to_associations(self, article, subscriber):
+        """
+        Remove the embedded items from the article that the subscriber has no matching product for.
+
+        Does nothing unless the ``EMBED_PRODUCT_FILTERING`` setting is enabled. Only associations whose
+        key starts with ``editor_`` are tested against the products.
+
+        :param article: the item being formatted, modified in place
+        :param subscriber: the subscriber the item is formatted for, its ``products`` are the permitted ones
+        :return: None
+        """
+        if not app.config["EMBED_PRODUCT_FILTERING"]:
+            return
+
+        permitted_products = set(subscriber["products"])
+        product_tests = get_resource_service("product_tests")
+        remove_keys = []
+
+        # ``associations`` may be missing or set to None
+        for key, item in (article.get("associations") or {}).items():
+            # An empty association slot has nothing that could be tested against the products
+            if not item or not key.startswith("editor_"):
+                continue
+            result = product_tests.test_products(item, lookup=None)
+            matching_products = {p["product_id"] for p in result if p.get("matched", False)}
+            if matching_products.isdisjoint(permitted_products):
+                remove_keys.append(key)
+
+        # Called even when nothing has to be removed: remove_embeds() runs every content state field
+        # through filter_blocks(), which renders the field with the NinjsFTPExporter
+        self.remove_embeds(article, remove_keys)
+
+    def remove_embeds(self, article, remove_keys):
+        """
+        Removes the nominated embeds from the draftjs state and regenerates the HTML.
+
+        Every content state field is run through ``filter_blocks``, also when ``remove_keys`` is empty.
+
+        :param article: the item to remove the embeds from, modified in place
+        :param remove_keys: association keys (``editor_<entity key>``) of the embeds to remove
+        :return: None
+        """
+        prefix = "editor_"
+        # Cut the prefix itself: str.lstrip() takes a set of characters, so it would also eat the leading
+        # characters of any entity key made of the letters of "editor_"
+        to_remove = {k[len(prefix) :] if k.startswith(prefix) else k for k in remove_keys}
+
+        def not_embed(block):
+            if block.type.lower() != "atomic":
+                return True
+            # Compare as strings so the match does not depend on the entity keys being str or int
+            return not any(str(e.key) in to_remove for e in block.entities)
+
+        for field in get_content_state_fields(article):
+            self.filter_blocks(article, field, not_embed)
+
+        keys = set(remove_keys)
+        # ``associations`` may be missing or set to None
+        associations = article.get("associations") or {}
+        for key in keys:
+            associations.pop(key, None)
+
+        # Filter the refs in a single pass instead of rebuilding the list once per removed key
+        if keys and "refs" in article:
+            article["refs"] = [r for r in article.get("refs") or [] if r["key"] not in keys]
+
+    def filter_blocks(self, item, field, filter, is_html=True):
+        """Keep only the blocks of ``field`` accepted by ``filter`` and update the item.
+
+        The editor content is given a :class:`NinjsFTPExporter` bound to this formatter as its HTML
+        exporter before the item is updated.
+
+        :param item: the item holding the field
+        :param field: name of the field to process
+        :param filter: callable taking a block, the block is kept when the result is truthy
+        :param is_html: passed through to ``Editor3Content``
+        """
+        content = Editor3Content(item, field, is_html)
+        # assign special Ninjs FTP exporter
+        ftp_exporter = NinjsFTPExporter(content)
+        ftp_exporter.set_formatter(self)
+        content.html_exporter = ftp_exporter
+        content.set_blocks([blk for blk in content.blocks if filter(blk)])
+        content.update_item()
+
+
+class NinjsFTPExporter(DraftJSHTMLExporter):
+    """DraftJS HTML exporter that renders embedded media for the FTP NINJS formatter."""
+
+    # formatter whose ``path`` and ``internal_renditions`` are read by ``render_media``
+    formatter = None
+
+    def set_formatter(self, formatter):
+        """Attach the formatter whose ``path`` and ``internal_renditions`` are read when rendering media."""
+        self.formatter = formatter
+
+    def render_media(self, props):
+        """Render a media entity as a ``<figure>`` wrapped in EMBED START/END comments.
+
+        Pictures are rendered as ``<img>`` with ``src``/``srcset`` taken from the renditions to publish,
+        videos and audios as ``<video>``/``<audio>`` pointing to the rendition file below the
+        formatter path.
+
+        :param props: entity props, ``props["media"]`` holds the media item and ``props["children"]``
+            the children passed to the created element
+        :return: the parsed embed markup, None for a media type other than picture, video or audio
+        """
+        # we need to retrieve the key, there is no straightforward way to do it
+        # so we find the key in entityMap with a corresponding value
+        embed_key = next(
+            k for k, v in self.content_state["entityMap"].items() if v["data"].get("media") == props["media"]
+        )
+        media_props = props["media"]
+        media_type = media_props.get("type", "picture")
+
+        # rendition used for the video/audio src, "viewImage" is the fallback when there is no "original"
+        rendition = media_props["renditions"].get("original") or media_props["renditions"]["viewImage"]
+        alt_text = media_props.get("alt_text") or ""
+        desc = media_props.get("description_text")
+        if media_type == "picture":
+            # NOTE(review): the path is assigned by the formatter from the destination config
+            # ("associated_path"); the .lstrip()/.endswith() calls below fail if it is None - confirm
+            path = self.formatter.path
+
+            renditions_to_publish = self.formatter.internal_renditions + list(
+                get_renditions_spec(without_internal_renditions=True).keys()
+            )
+
+            renditions = media_props.get("renditions")
+            # filter the renditions for those we wish to publish
+            renditions = {name: rendition for name, rendition in renditions.items() if name in renditions_to_publish}
+            # NOTE: only the mapping is new, the loop below rewrites "href" on the rendition dicts that
+            # are still held by media_props; its loop variable also rebinds ``rendition`` assigned above,
+            # which is not read again in this branch
 
-        if path:
-            renditions = ninjs.get("renditions")
             if renditions:
                 for name, rendition in renditions.items():
                     rendition["href"] = (
-                        "/"
-                        + path.lstrip("/")
-                        + ("/" if not path.endswith("/") else "")
+                        path.lstrip("/")
+                        + ("/" if not path.endswith("/") and path else "")
                         + get_rendition_file_name(rendition)
                     )
 
-        if article.get("type", "") == "text":
-            # Find any embeded image references in the body_html and re-wire the img src reference and insert an id
-            html_updated = False
-            root_elem = lxml_html.fromstring(ninjs.get("body_html"))
-            # Scan any comments for embed markers
-            comments = root_elem.xpath("//comment()")
-            for comment in comments:
-                if "EMBED START Image" in comment.text:
-                    regex = r"<!-- EMBED START Image {id: \"editor_([0:9]+)"
-                    m = re.search(regex, ninjs.get("body_html", ""))
-                    # Assumes the sibling of the Embed Image comment is the figure tag containing the image
-                    figureElem = comment.getnext()
-                    if figureElem is not None and figureElem.tag == "figure":
-                        imgElem = figureElem.find("./img")
-                        if imgElem is not None and m and m.group(1):
-                            embed_id = "editor_" + m.group(1)
-                            imgElem.attrib["id"] = embed_id
-                            src = self._get_source_ref(embed_id, ninjs)
-                            if src:
-                                imgElem.attrib["src"] = src
-                            html_updated = True
-            if html_updated:
-                ninjs["body_html"] = to_string(root_elem, method="html")
-        return ninjs
+            # src/srcset are built from the hrefs rewritten above
+            src = self.get_source_ref(renditions)
+            srcset = self.get_source_set_refs(renditions)
+
+            embed_type = "Image"
+            elt = DOM.create_element(
+                "img",
+                {"src": src, "srcset": srcset, "sizes": "80vw", "alt": alt_text, "id": f"editor_{embed_key}"},
+                props["children"],
+            )
+        elif media_type == "video":
+            embed_type = "Video"
+            # formatter path (without leading slash) followed by the file name of the rendition
+            src = (
+                self.formatter.path.lstrip("/")
+                + ("/" if not self.formatter.path.endswith("/") and self.formatter.path else "")
+                + get_rendition_file_name(rendition)
+            )
+            # It seems impossible to add an attribute that has no value for "controls" the W3C validator accepts an
+            # empty string
+            elt = DOM.create_element(
+                "video",
+                {"controls": "", "src": src, "title": alt_text, "id": f"editor_{embed_key}"},
+                props["children"],
+            )
+        elif media_type == "audio":
+            embed_type = "Audio"
+            # same src construction as for video
+            src = (
+                self.formatter.path.lstrip("/")
+                + ("/" if not self.formatter.path.endswith("/") and self.formatter.path else "")
+                + get_rendition_file_name(rendition)
+            )
+            elt = DOM.create_element(
+                "audio",
+                {"controls": "", "src": src, "title": alt_text, "id": f"editor_{embed_key}"},
+                props["children"],
+            )
+        else:
+            logger.error("Invalid or not implemented media type: {media_type}".format(media_type=media_type))
+            return None
+
+        content = DOM.render(elt)
+
+        # NOTE(review): the description is interpolated into the markup as it is, without escaping
+        if desc:
+            content += "<figcaption>{}</figcaption>".format(desc)
+
+        # <dummy_tag> is needed for the comments, because a root node is necessary
+        # it will be removed during rendering.
+        embed = DOM.parse_html(
+            dedent(
+                """\
+            <dummy_tag><!-- EMBED START {embed_type} {{id: "editor_{key}"}} -->
+            <figure>{content}</figure>
+            <!-- EMBED END {embed_type} {{id: "editor_{key}"}} --></dummy_tag>"""
+            ).format(embed_type=embed_type, key=embed_key, content=content)
+        )
+
+        return embed
+
+    def get_source_ref(self, renditions):
+        """Return the href to use as the ``src`` of an image.
+
+        The href of the ``original`` rendition is returned as it is when there is such a rendition.
+        Otherwise the href of the widest rendition is returned with the leading slashes stripped;
+        renditions without a width or an href are ignored.
+
+        :param renditions: dict of rendition name to rendition dict
+        :return: the href, or None (with a warning logged) when no rendition can be used
+        """
+        renditions = renditions or {}
+
+        original = renditions.get("original")
+        if original is not None:
+            return original.get("href")
+
+        widest_width = 0
+        widest_href = None
+        for rendition in renditions.values():
+            if not rendition:
+                continue
+            width = rendition.get("width") or 0
+            href = rendition.get("href")
+            # strictly greater: the first of several equally wide renditions wins
+            if href and width > widest_width:
+                widest_width, widest_href = width, href
+
+        if widest_href is not None:
+            return widest_href.lstrip("/")
+
+        logger.warning("href not found in FTP NINJS formatter, ensure the formatter has it enabled")
+        return None
+
+    def get_source_set_refs(self, renditions):
+        """Build the value of the ``srcset`` attribute from the renditions.
+
+        Every rendition having both an href and a width contributes a ``<href> <width>w`` candidate, the
+        href being stripped of its leading slashes. Renditions lacking either value are left out, as they
+        would otherwise produce an invalid candidate such as ``"... Nonew"``.
+
+        :param renditions: dict of rendition name to rendition dict
+        :return: the comma separated candidates (an empty string if there are none), None if
+            ``renditions`` is None
+        """
+        if renditions is None:
+            return None
+
+        candidates = []
+        for rendition in renditions.values():
+            if not rendition:
+                continue
+            href = rendition.get("href")
+            width = rendition.get("width")
+            if not href or not width:
+                continue
+            candidates.append("{} {}w".format(href.lstrip("/"), width))
+        return ",".join(candidates)