diff --git a/superdesk/io/feeding_services/email.py b/superdesk/io/feeding_services/email.py
index ef11536c05..1469f490bc 100644
--- a/superdesk/io/feeding_services/email.py
+++ b/superdesk/io/feeding_services/email.py
@@ -8,6 +8,7 @@
 # AUTHORS and LICENSE files distributed with this source code, or
 # at https://www.sourcefabric.org/superdesk/license
 
+from typing import List
 import socket
 import imaplib
 
@@ -90,6 +91,13 @@ def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
 
         return imap
 
+    def parse_extra(self, imap: imaplib.IMAP4_SSL, num: str, parsed_items: List[dict]) -> None:
+        """Parse extra metadata
+
+        This method is called after main parsing, and can be used by subclasses
+        """
+        pass
+
     def _update(self, provider, update, test=False):
         config = provider.get("config", {})
         new_items = []
@@ -110,7 +118,9 @@ def _update(self, provider, update, test=False):
                         if rv == "OK" and not test:
                             try:
                                 parser = self.get_feed_parser(provider, data)
-                                new_items.append(parser.parse(data, provider))
+                                parsed_items = parser.parse(data, provider)
+                                self.parse_extra(imap, num, parsed_items)
+                                new_items.append(parsed_items)
                                 rv, data = imap.store(num, "+FLAGS", "\\Seen")
                             except IngestEmailError:
                                 continue
diff --git a/superdesk/io/feeding_services/gmail.py b/superdesk/io/feeding_services/gmail.py
index 1b5dfaa47a..81b2b6db09 100644
--- a/superdesk/io/feeding_services/gmail.py
+++ b/superdesk/io/feeding_services/gmail.py
@@ -6,7 +6,9 @@
 # AUTHORS and LICENSE files distributed with this source code, or
 # at https://www.sourcefabric.org/superdesk/license
 
+import re
 import imaplib
+from typing import List
 from bson import ObjectId
 from os.path import join
 import time
@@ -20,6 +22,8 @@
 
 
 logger = logging.getLogger(__name__)
+RE_LABELS_STR = re.compile(r"\(X-GM-LABELS \((?P<labels_str>.*)\)\)")
+RE_LABEL = re.compile(r'"(?P<quoted>(?:[^"\\]|\\.)*)"|(?P<unquoted>\w+)')
 
 
 class GMailFeedingService(EmailFeedingService):
@@ -99,6 +103,39 @@ def authenticate(self, provider: dict, config: dict) -> imaplib.IMAP4_SSL:
         imap.authenticate("XOAUTH2", lambda __: auth_string.encode())
         return imap
 
+    def parse_extra(self, imap: imaplib.IMAP4_SSL, num: str, parsed_items: List[dict]) -> None:
+        """Add GMail labels to parsed_items
+
+        Labels are fetched with the X-GM-LABELS IMAP extension and appended to
+        the "subject" list of each parsed item, using the "gmail_label" scheme.
+        Any failure is logged and ignored, so it never propagates to the caller.
+        """
+        try:
+            # we use GMail IMAP Extensions
+            # https://developers.google.com/gmail/imap/imap-extensions#access_to_gmail_labels_x-gm-labels
+            _, data = imap.fetch(num, "(X-GM-LABELS)")
+            # it seems that there is nothing to help parsing in standard lib
+            # thus we use some regex to get our labels
+            data_bytes = data[0]
+            if not isinstance(data_bytes, bytes):
+                raise ValueError(f"Unexpected data type: {type(data_bytes)}")
+            # NOTE(review): labels are presumably sent as IMAP *modified* UTF-7, while
+            # Python's "utf-7" codec is the standard variant - TODO confirm non-ASCII labels
+            data_str = data_bytes.decode("utf-7")
+            match_labels_str = RE_LABELS_STR.search(data_str)
+            if match_labels_str is None:
+                raise ValueError(f"Can't find the expected label string in data: {data_str!r}")
+            labels_str = match_labels_str.group("labels_str")
+            labels = [
+                (m.group("quoted") or m.group("unquoted")).replace('\\"', '"') for m in RE_LABEL.finditer(labels_str)
+            ]
+            for parsed_item in parsed_items:
+                subjects = parsed_item.setdefault("subject", [])
+                for label in labels:
+                    subjects.append({"name": label, "qcode": label, "scheme": "gmail_label"})
+        except Exception:
+            logger.exception("Can't retrieve GMail labels")
+
 
 register_feeding_service(GMailFeedingService)
 register_feeding_service_parser(GMailFeedingService.NAME, "email_rfc822")