Skip to content

Commit

Permalink
SDAAP-75 Add function to remove embedded media content
Browse files Browse the repository at this point in the history
  • Loading branch information
marwoodandrew committed Jul 10, 2023
1 parent 674df75 commit 147b08c
Show file tree
Hide file tree
Showing 7 changed files with 1,140 additions and 54 deletions.
6 changes: 6 additions & 0 deletions superdesk/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,3 +1062,9 @@ def local_to_utc_hour(hour):
#: .. versionadded:: 2.4.1
#:
APM_SERVICE_NAME = env("APM_SERVICE_NAME")

#: Apply product filtering to embedded media items
#:
#: When enabled, the NINJS FTP formatter removes embedded items (associations
#: whose key starts with ``editor_``) for which the subscriber has no matching
#: product. Read from the ``EMBED_PRODUCT_FILTERING`` environment variable,
#: defaulting to ``"false"``.
#:
#: .. versionadded::
#:
# NOTE(review): the ``versionadded`` directive above is missing its version argument.
EMBED_PRODUCT_FILTERING = strtobool(env("EMBED_PRODUCT_FILTERING", "false"))
35 changes: 35 additions & 0 deletions superdesk/editor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ def insert(self, index, value):
self._ranges.insert(index, value.ranges)
self._mapping[value.key] = value.data

def clear(self):
    """Delete every entry referenced by ``self._ranges``.

    The loop walks a snapshot of ``self._ranges`` so that deletions made
    through ``del self[...]`` cannot shift the list underneath the iteration
    and cause entries to be skipped.
    """
    # NOTE(review): ``del self[...]`` receives the range's "key" value; confirm
    # that ``__delitem__`` (not visible here) expects that value rather than a
    # positional index.
    for entity_range in list(self._ranges):
        del self[entity_range.get("key")]


class Block:
"""Abstraction of DraftJS block"""
Expand Down Expand Up @@ -855,3 +859,34 @@ def copy_fields(source: Dict, dest: Dict, ignore_empty: bool = False):
for field in source["fields_meta"]:
if ignore_empty is False or not is_empty_content_state(source, field):
dest.setdefault("fields_meta", {})[field] = source["fields_meta"][field].copy()


def remove_all_embeds(article):
    """
    Removes every embed (``atomic`` block) from the draftjs state of the article, together with the
    matching ``editor_<key>`` entries in ``associations`` and ``refs``. Can be used by text only
    formatters to strip embeds from the article.

    The blocks are dropped through ``filter_blocks``, which is presumably where the field HTML gets
    regenerated (not visible from here).

    :param article: article dict, modified in place
    :return: None
    """

    # Keys of the entities that belonged to the removed blocks
    removed_entity_keys = []

    def not_embed(block):
        if block.type.lower() != "atomic":
            return True
        removed_entity_keys.extend(entity.key for entity in block.entities)
        block.entities.clear()
        return False

    for field in get_content_state_fields(article):
        filter_blocks(article, field, not_embed)

    if not removed_entity_keys:
        return

    association_keys = {"editor_{}".format(suffix) for suffix in removed_entity_keys}

    # ``associations`` may be present but set to None, so do not rely on the .get() default
    associations = article.get("associations") or {}
    for key in association_keys:
        # only truthy entries are popped, as before; falsy placeholders are left alone
        if associations.get(key):
            associations.pop(key)

    # Filter the refs once against the whole key set rather than once per removed key
    if article.get("refs"):
        article["refs"] = [ref for ref in article["refs"] if ref.get("key") not in association_keys]
1 change: 1 addition & 0 deletions superdesk/publish/formatters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from superdesk.metadata.item import ITEM_TYPE, CONTENT_TYPE, FORMATS, FORMAT
from superdesk.etree import parse_html
from superdesk.text_utils import get_text
from superdesk.editor_utils import get_content_state_fields, filter_blocks

formatters = [] # type: List[Type[Formatter]]

Expand Down
2 changes: 2 additions & 0 deletions superdesk/publish/formatters/email_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from copy import deepcopy
from superdesk.errors import FormatterError
from superdesk import etree as sd_etree
from superdesk.editor_utils import remove_all_embeds


class EmailFormatter(Formatter):
Expand Down Expand Up @@ -55,6 +56,7 @@ def _inject_dateline(self, formatted_article):

def format(self, article, subscriber, codes=None):
formatted_article = deepcopy(article)
remove_all_embeds(formatted_article)
pub_seq_num = superdesk.get_resource_service("subscribers").generate_sequence_number(subscriber)
doc = {}
try:
Expand Down
253 changes: 208 additions & 45 deletions superdesk/publish/formatters/ninjs_ftp_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@


from .ninjs_formatter import NINJSFormatter
from flask import current_app as app
from superdesk.media.renditions import get_rendition_file_name
from lxml import html as lxml_html
from superdesk.etree import to_string
from superdesk import get_resource_service
from superdesk.editor_utils import get_content_state_fields, Editor3Content, DraftJSHTMLExporter
from superdesk.media.renditions import get_renditions_spec
from draftjs_exporter.dom import DOM
from copy import deepcopy
from textwrap import dedent
import logging
import re

logger = logging.getLogger(__name__)

Expand All @@ -28,15 +32,7 @@ def __init__(self):
super().__init__()
self.format_type = "ftp ninjs"
self.internal_renditions = []

def _get_source_ref(self, marker, ninjs):
try:
return ninjs.get("associations").get(marker).get("renditions").get("original").get("href")
except Exception:
logger.warning(
"href not found for the original in FTP NINJS formatter, ensure the formatter has it enabled"
)
return None
self.path = None

def _transform_to_ninjs(self, article, subscriber, recursive=True):
"""
Expand All @@ -47,48 +43,215 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
:param recursive:
:return:
"""

# Get the path that the renditions will be pushed to
self.path = subscriber.get("destinations")[0].get("config").get("associated_path")
include_original = subscriber.get("destinations")[0].get("config").get("include_original", False)
if include_original:
self.internal_renditions = ["original"]

ninjs = super()._transform_to_ninjs(article, subscriber, recursive)
formatted_article = deepcopy(article)

# Get the path that the renditions will be pushed to
path = subscriber.get("destinations")[0].get("config").get("associated_path")
if article.get("type") == "text" and recursive:
self.apply_product_filtering_to_associations(formatted_article, subscriber)

ninjs = super()._transform_to_ninjs(formatted_article, subscriber, recursive)

renditions = ninjs.get("renditions")
if renditions:
for name, rendition in renditions.items():
rendition["href"] = (
self.path.lstrip("/")
+ ("/" if not self.path.endswith("/") and self.path else "")
+ get_rendition_file_name(rendition)
)

return ninjs

def apply_product_filtering_to_associations(self, article, subscriber):
    """
    Drop the embedded items of the article for which the subscriber has no matching product.

    Does nothing unless the ``EMBED_PRODUCT_FILTERING`` setting is truthy. Only associations whose
    key starts with ``editor_`` are tested; the rejected keys are handed to ``remove_embeds``.

    :param article: article dict, modified in place
    :param subscriber: subscriber dict, its ``products`` entry lists the permitted products
    :return: None
    """
    if not app.config["EMBED_PRODUCT_FILTERING"]:
        return

    rejected_keys = []
    allowed_products = set(subscriber["products"])

    for assoc_key, assoc_item in article.get("associations", {}).items():
        if not assoc_key.startswith("editor_"):
            continue
        test_results = get_resource_service("product_tests").test_products(assoc_item, lookup=None)
        matched = {outcome["product_id"] for outcome in test_results if outcome.get("matched", False)}
        # no overlap between the matched and the permitted products: the embed has to go
        if matched.isdisjoint(allowed_products):
            rejected_keys.append(assoc_key)

    self.remove_embeds(article, rejected_keys)

def remove_embeds(self, article, remove_keys):
    """
    Removes the nominated embeds from the draftjs state, then drops the matching entries from
    ``associations`` and ``refs``. The blocks are filtered through ``self.filter_blocks``, which
    updates the item with the NINJS FTP exporter.

    :param article: article dict, modified in place
    :param remove_keys: association keys (``editor_<entity key>``) of the embeds to remove
    :return: None
    """
    remove_keys = list(remove_keys)
    prefix = "editor_"

    # str.lstrip() strips a *set of characters*, not a prefix, so the prefix is sliced off explicitly.
    entity_keys = {key[len(prefix):] if key.startswith(prefix) else key for key in remove_keys}

    def not_embed(block):
        if block.type.lower() != "atomic":
            return True
        # compare as strings so the match does not depend on the entity key being an int or a str
        return not any(str(entity.key) in entity_keys for entity in block.entities)

    for field in get_content_state_fields(article):
        self.filter_blocks(article, field, not_embed)

    doomed = set(remove_keys)

    # ``associations`` may be present but set to None, so do not rely on the .get() default
    associations = article.get("associations") or {}
    for key in doomed:
        associations.pop(key, None)

    # Filter the refs once against the whole key set rather than once per removed key
    if doomed and article.get("refs"):
        article["refs"] = [ref for ref in article["refs"] if ref.get("key") not in doomed]

def filter_blocks(self, item, field, filter, is_html=True):
    """
    Keep only the blocks of ``field`` accepted by ``filter`` and update the item.

    :param item: item dict holding the editor state
    :param field: name of the field to process
    :param filter: callable taking a block, returns a truthy value for the blocks to keep
    :param is_html: passed through to ``Editor3Content``
    :return: None
    """
    editor = Editor3Content(item, field, is_html)

    # swap in the Ninjs FTP specific exporter, bound to this formatter
    ftp_exporter = NinjsFTPExporter(editor)
    ftp_exporter.set_formatter(self)
    editor.html_exporter = ftp_exporter

    editor.set_blocks([candidate for candidate in editor.blocks if filter(candidate)])
    editor.update_item()


class NinjsFTPExporter(DraftJSHTMLExporter):

formatter = None

def set_formatter(self, formatter):
    """
    Attach the formatter to this exporter; ``render_media`` reads its ``path`` and
    ``internal_renditions`` attributes.

    :param formatter: formatter instance to store on ``self.formatter``
    """
    self.formatter = formatter

def render_media(self, props):
# we need to retrieve the key; there is no straightforward way to do it,
# so we find the key in entityMap with a corresponding value
embed_key = next(
k for k, v in self.content_state["entityMap"].items() if v["data"].get("media") == props["media"]
)
media_props = props["media"]
media_type = media_props.get("type", "picture")

rendition = media_props["renditions"].get("original") or media_props["renditions"]["viewImage"]
alt_text = media_props.get("alt_text") or ""
desc = media_props.get("description_text")
if media_type == "picture":
path = self.formatter.path

renditions_to_publish = self.formatter.internal_renditions + list(
get_renditions_spec(without_internal_renditions=True).keys()
)

renditions = media_props.get("renditions")
# filter the renditions for those we wish to publish
renditions = {name: rendition for name, rendition in renditions.items() if name in renditions_to_publish}

if path:
renditions = ninjs.get("renditions")
if renditions:
for name, rendition in renditions.items():
rendition["href"] = (
"/"
+ path.lstrip("/")
+ ("/" if not path.endswith("/") else "")
path.lstrip("/")
+ ("/" if not path.endswith("/") and path else "")
+ get_rendition_file_name(rendition)
)

if article.get("type", "") == "text":
# Find any embedded image references in the body_html, re-wire the img src reference and insert an id
html_updated = False
root_elem = lxml_html.fromstring(ninjs.get("body_html"))
# Scan any comments for embed markers
comments = root_elem.xpath("//comment()")
for comment in comments:
if "EMBED START Image" in comment.text:
regex = r"<!-- EMBED START Image {id: \"editor_([0:9]+)"
m = re.search(regex, ninjs.get("body_html", ""))
# Assumes the sibling of the Embed Image comment is the figure tag containing the image
figureElem = comment.getnext()
if figureElem is not None and figureElem.tag == "figure":
imgElem = figureElem.find("./img")
if imgElem is not None and m and m.group(1):
embed_id = "editor_" + m.group(1)
imgElem.attrib["id"] = embed_id
src = self._get_source_ref(embed_id, ninjs)
if src:
imgElem.attrib["src"] = src
html_updated = True
if html_updated:
ninjs["body_html"] = to_string(root_elem, method="html")
return ninjs
src = self.get_source_ref(renditions)
srcset = self.get_source_set_refs(renditions)

embed_type = "Image"
elt = DOM.create_element(
"img",
{"src": src, "srcset": srcset, "sizes": "80vw", "alt": alt_text, "id": f"editor_{embed_key}"},
props["children"],
)
elif media_type == "video":
embed_type = "Video"
src = (
self.formatter.path.lstrip("/")
+ ("/" if not self.formatter.path.endswith("/") and self.formatter.path else "")
+ get_rendition_file_name(rendition)
)
# It seems impossible to add an attribute that has no value for "controls"; the W3C validator accepts an
# empty string instead
elt = DOM.create_element(
"video",
{"controls": "", "src": src, "title": alt_text, "id": f"editor_{embed_key}"},
props["children"],
)
elif media_type == "audio":
embed_type = "Audio"
src = (
self.formatter.path.lstrip("/")
+ ("/" if not self.formatter.path.endswith("/") and self.formatter.path else "")
+ get_rendition_file_name(rendition)
)
elt = DOM.create_element(
"audio",
{"controls": "", "src": src, "title": alt_text, "id": f"editor_{embed_key}"},
props["children"],
)
else:
logger.error("Invalid or not implemented media type: {media_type}".format(media_type=media_type))
return None

content = DOM.render(elt)

if desc:
content += "<figcaption>{}</figcaption>".format(desc)

# <dummy_tag> is needed for the comments, because a root node is necessary
# it will be removed during rendering.
embed = DOM.parse_html(
dedent(
"""\
<dummy_tag><!-- EMBED START {embed_type} {{id: "editor_{key}"}} -->
<figure>{content}</figure>
<!-- EMBED END {embed_type} {{id: "editor_{key}"}} --></dummy_tag>"""
).format(embed_type=embed_type, key=embed_key, content=content)
)

return embed

def get_source_ref(self, renditions):
    """
    Pick the href to use as the ``src`` of an embedded picture.

    The href of the ``original`` rendition is returned untouched when there is one. Otherwise the
    widest rendition that has both a usable width and an href is used, with any leading ``/``
    stripped. Renditions lacking a width or an href are skipped instead of raising.

    :param renditions: dict of rendition name -> rendition dict (may be empty or None)
    :return: the href, or None (after logging a warning) when no rendition qualifies
    """
    renditions = renditions or {}

    original = renditions.get("original") or {}
    if original.get("href"):
        return original["href"]

    widest = 0
    widest_href = None
    for rendition in renditions.values():
        if not rendition:
            continue
        width = rendition.get("width")
        href = rendition.get("href")
        # strict ">" keeps the first rendition encountered when widths tie
        if href and isinstance(width, (int, float)) and width > widest:
            widest, widest_href = width, href

    if widest_href:
        return widest_href.lstrip("/")

    logger.warning("href not found in FTP NINJS formatter, ensure the formatter has it enabled")
    return None

def get_source_set_refs(self, renditions):
    """
    Build the value of an ``srcset`` attribute from the renditions.

    Each rendition contributes ``<href without leading slashes> <width>w``; the entries are joined
    with commas.

    :param renditions: dict of rendition name -> rendition dict
    :return: the srcset string, or None when building any of the entries raises
    """
    try:
        candidates = [
            renditions.get(name).get("href").lstrip("/") + " " + str(renditions.get(name).get("width")) + "w"
            for name in renditions
        ]
    except Exception:
        return None
    return ",".join(candidates)
Loading

0 comments on commit 147b08c

Please sign in to comment.