From e556c2dcba8e28b544042a2134cd623711cd90d6 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 15 Oct 2024 18:06:45 +0200 Subject: [PATCH 01/48] docs: remove references to old search fields in `api/queue-query` --- webtool/views/api_tool.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index 5b47c030..b37339f9 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -282,13 +282,6 @@ def queue_dataset(): Request parameters vary by data source. The ones mandated constitute the minimum but more may be required. - :request-param str board: Board ID to query - :request-param str datasource: Data source ID to query - :request-param str body_match: String to match in the post body - :request-param str subject_match: String to match in the post subject - :request-param int min_date: Timestamp marking the beginning of the match - period - :request-param int max_date: Timestamp marking the end of the match period :request-param str ?access_token: Access token; only required if not logged in currently. @@ -296,6 +289,7 @@ def queue_dataset(): status and results. :return-error 404: If the datasource does not exist. """ + datasource_id = request.form.get("datasource", "") if datasource_id not in fourcat_modules.datasources: return error(404, message="Datasource '%s' does not exist" % datasource_id) From d5c873ae4841c15b35ed9354137a0237e0d60c6f Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 16 Oct 2024 13:05:22 +0200 Subject: [PATCH 02/48] Bye bye FAQ page --- common/lib/config_definition.py | 3 +-- webtool/pages/faq.md | 15 --------------- webtool/templates/frontpage.html | 3 ++- webtool/templates/layout.html | 2 +- 4 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 webtool/pages/faq.md diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index a4fca2dc..ee38ce70 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -499,11 +499,10 @@ "type": UserInput.OPTION_MULTI_SELECT, "help": "Pages in navigation", "options": { - "faq": "FAQ", "data-policy": "Data Policy", "citing": "How to cite", }, - "default": ["faq"], + "default": [], "tooltip": "These pages will be included in the navigation bar at the top of the interface." }, "ui.prefer_mapped_preview": { diff --git a/webtool/pages/faq.md b/webtool/pages/faq.md deleted file mode 100644 index 866a9675..00000000 --- a/webtool/pages/faq.md +++ /dev/null @@ -1,15 +0,0 @@ -## Frequently Asked Questions - -### How do I cite this tool in my research paper? - -Please refer to the [How to cite](/page/citing/) page. - -### Where can I find more information about this tool? - -Take a look at 4CAT's [website](https://4cat.nl) and its -[GitHub repository](https://github.com/digitalmethodsinitiative/4cat)! - -### What query syntax can I use? - -Most standard search engine query syntax is supported. An -[overview of syntax you can use](/page/query-syntax/) is available. \ No newline at end of file diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index d8b41b1e..fa5cbc73 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -11,7 +11,8 @@

What is {{ __user_config("4cat.name") }}?

from a variety of online sources, and analyze the data through analytical processors.

4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam. - For more information, take a look at the 4CAT website.

+ For more information, take a look at the 4CAT website or the tool's + GitHub repository.

{% if __user_config("4cat.about_this_server") %}

About this server

{{ __user_config("4cat.about_this_server") }}

diff --git a/webtool/templates/layout.html b/webtool/templates/layout.html index 1815c233..e1ecda59 100644 --- a/webtool/templates/layout.html +++ b/webtool/templates/layout.html @@ -54,7 +54,7 @@

About {% for page in __user_config("ui.nav_pages") %} - {% if page == "faq" %}FAQ{% else %}{{ page|title }}{% endif %} + {{ page|title }} {% endfor %} From bb4a581ca7d3efd7dad290d54c1fb7652c05e3b5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 16 Oct 2024 16:43:45 +0200 Subject: [PATCH 03/48] annotate will use index if no id present --- processors/machine_learning/annotate_text.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 022e96de..26234a18 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -184,8 +184,8 @@ def process(self): # prepare data for annotation data_path = staging_area.joinpath("data.temp.ndjson") with data_path.open("w", newline="") as outfile: - for item in self.source_dataset.iterate_items(): - outfile.write(json.dumps({item.get("id"): item.get(textfield)}) + "\n") + for i, item in enumerate(self.source_dataset.iterate_items()): + outfile.write(json.dumps({item.get("id", str(i)): item.get(textfield)}) + "\n") path_to_files, path_to_results = dmi_service_manager.process_files(staging_area, [data_path.name, labels_path.name], @@ -238,15 +238,14 @@ def make_filename(id, prompt): self.dataset.update_status("Loading annotated data") with output_dir.joinpath("results.json").open() as infile: annotations = json.load(infile) - self.dataset.update_status("Writing results") with self.dataset.get_results_path().open("w") as outfile: writer = None - for item in self.source_dataset.iterate_items(): + for i, item in enumerate(self.source_dataset.iterate_items()): row = { - "id": item.get("id"), + "id": item.get("id", i), textfield: item.get(textfield), - "category": annotations[item.get("id")] + "category": annotations.get(item.get("id", str(i))) # str(i) because it is not recorded as an int in the annotations } if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) From 6aa7177edba61357638d858ad171468ffd51c945 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 18 Oct 2024 18:34:40 +0200 Subject: [PATCH 04/48] Don't crash on broken import NDJSON but ignore item instead --- backend/lib/search.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/lib/search.py b/backend/lib/search.py index 15b3982d..3258561e 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -170,10 +170,22 @@ def import_from_file(self, path): if self.interrupted: raise WorkerInterruptedException() - # remove NUL bytes here because they trip up a lot of other - # things - # also include import metadata in item - item = json.loads(line.replace("\0", "")) + try: + # remove NUL bytes here because they trip up a lot of other + # things + # also include import metadata in item + item = json.loads(line.replace("\0", "")) + except json.JSONDecodeError: + warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may " + f"indicate that the file you uploaded was incomplete and you need to try uploading it " + f"again. 
The item will be ignored.") + + if warning not in import_warnings: + import_warnings[warning] = 0 + import_warnings[warning] += 1 + continue + + new_item = { **item["data"], "__import_meta": {k: v for k, v in item.items() if k != "data"} From e09e87502e2fcee8f2ad79b26e40c27f54842acf Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 22 Oct 2024 13:38:35 +0200 Subject: [PATCH 05/48] only show importables in frontpage if enabled --- webtool/views/views_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_misc.py b/webtool/views/views_misc.py index 4690b622..e179085c 100644 --- a/webtool/views/views_misc.py +++ b/webtool/views/views_misc.py @@ -73,7 +73,7 @@ def show_about(): datasources = {k: v for k, v in fourcat_modules.datasources.items() if k in config.get("datasources.enabled") and not v["importable"]} - importables = {k: v for k, v in fourcat_modules.datasources.items() if v["importable"]} + importables = {k: v for k, v in fourcat_modules.datasources.items() if (v["importable"] and k in config.get("datasources.enabled"))} return render_template("frontpage.html", stats=stats, news=news, datasources=datasources, importables=importables) From 8261b25d62718f981163d85238df4b83642e8f27 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Tue, 22 Oct 2024 15:52:52 +0200 Subject: [PATCH 06/48] Do not hide empty datasets by default --- webtool/views/views_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 28aaa76a..1720b3a8 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -70,7 +70,7 @@ def show_results(page): filters["sort_by"] = "timestamp" if not request.args: - filters["hide_empty"] = True + filters["hide_empty"] = False # handle 'depth'; all, own datasets, or favourites? # 'all' is limited to admins From aad7d57e1b44f7c59a3eb89ba6282d37f10339fe Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 15:39:27 +0200 Subject: [PATCH 07/48] New processor: deduplicate images --- processors/filtering/unique_images.py | 143 ++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 processors/filtering/unique_images.py diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py new file mode 100644 index 00000000..819e4b9d --- /dev/null +++ b/processors/filtering/unique_images.py @@ -0,0 +1,143 @@ +""" +Filter by unique images +""" +import imagehash +import hashlib +import shutil +import json + +from PIL import Image +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.helpers import UserInput + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + + +class UniqueImageFilter(BasicProcessor): + """ + Retain only unique images, by a user-defined metric + """ + type = "image-downloader-unique" # job type ID + category = "Visualisation" # category + title = "Filter for unique images" # title displayed in UI + description = "Only keeps one instance per image, using a choice of detection method." 
# description displayed in UI + extension = "zip" + + references = [ + "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", + "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)", + + ] + + options = { + "hash-type": { + "type": UserInput.OPTION_CHOICE, + "help": "Comparison method", + "default": "file-hash", + "options": { + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + } + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor on image archives + + :param module: Module to determine compatibility with + """ + return module.get_media_type() == "image" or module.type.startswith( + "image-downloader") or module.type == "video-frames" + + def hash_file(self, image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + + def process(self): + """ + Loop through images and only retain ones that have not been seen yet + + :return: + """ + seen_hashes = set() + hash_map = {} + metadata = None + dupes = 0 + processed = 0 + staging_area = self.dataset.get_staging_area() + + for image_file in self.iterate_archive_contents(self.source_file): + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while filtering for unique images") + + self.dataset.update_progress(processed / self.source_dataset.num_rows) + processed += 1 + + if image_file.name == ".metadata.json": + with image_file.open() as infile: + metadata = json.load(infile) + continue + + image_hash = self.hash_file(image_file, self.parameters.get("hash-type")) + + if image_hash not in seen_hashes: + seen_hashes.add(image_hash) + shutil.copy2(image_file, staging_area) + hash_map[image_hash] = image_file.name + else: + self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping") + dupes += 1 + + new_metadata = {} + inverse_hashmap = {v: k for k, v in hash_map.items()} + for url, item in metadata.items(): + if item["filename"] in inverse_hashmap: + new_metadata[inverse_hashmap[item["filename"]]] = { + **item, + "hash": inverse_hashmap[item["filename"]], + "hash_type": self.parameters.get("hash-type") + } + + with 
staging_area.joinpath(".metadata.json").open("w") as outfile: + json.dump(new_metadata, outfile) + + self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True) + self.write_archive_and_finish(staging_area, len(hash_map), finish=True) From 25f9ffd464e07fcc19014173eec39af01adee76b Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 15:42:38 +0200 Subject: [PATCH 08/48] Add some progress logs --- processors/filtering/unique_images.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 819e4b9d..0970d1f8 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -104,11 +104,15 @@ def process(self): processed = 0 staging_area = self.dataset.get_staging_area() + self.dataset.update_progress("Processing images and looking for duplicates") for image_file in self.iterate_archive_contents(self.source_file): if self.interrupted: raise ProcessorInterruptedException("Interrupted while filtering for unique images") self.dataset.update_progress(processed / self.source_dataset.num_rows) + if processed % 100 == 0: + self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + f"found {dupes:,} duplicate(s)") processed += 1 if image_file.name == ".metadata.json": From 53821d10ffff6c9834f2cc73b0e3fc2e7f8269db Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 16:57:27 +0200 Subject: [PATCH 09/48] Filename filter in iterate_archive_contents --- backend/lib/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index e9e4d85a..5dbb09a1 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -480,7 +480,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset, self.dataset.update_status("Parent dataset updated.") - def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True): + def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=[]): """ A generator that iterates through files in an archive From f98addc7a3c5173b0f531d07a0379e73204c1c61 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 17:39:17 +0200 Subject: [PATCH 10/48] New 'Bipartite image-item network' processor --- processors/networks/image-network.py | 146 +++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 processors/networks/image-network.py diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py new file mode 100644 index 00000000..123a1189 --- /dev/null +++ b/processors/networks/image-network.py @@ -0,0 +1,146 @@ +""" +Make a bipartite Image-Item network +""" +import json + +from backend.lib.processor import BasicProcessor + +import networkx as nx + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.user_input import UserInput + + +class ImageGrapher(BasicProcessor): + """ + Image network + + Creates a bipartite network of images and some attribute of the dataset the + images were sourced from + """ + type = "image-bipartite-network" # job type ID + category = "Networks" + title = "Bipartite image-item network" # title displayed in UI + description = ("Create a GEXF network file with a 
bipartite network of " + "images and some data field (e.g. author) of the dataset " + "the images were sourced from. Suitable for use with Gephi's " + "'Image Preview' plugin.") + extension = "gexf" # extension of result file, used internally and in UI + + options = {} + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + root_dataset = None + columns = None + if parent_dataset: + for parent in reversed(parent_dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + columns = root_dataset.get_columns() + + return { + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_TEXT, + "default": "id" + }, + "image-value": { + "help": "Image node label", + "type": UserInput.OPTION_CHOICE, + "options": { + "filename": "Image file name", + "url": "Image URL" + }, + "tooltip": "The image node label will have this value. Depending on the network visualisation software " + "you use, one or the other is required to display the images as nodes." + }, + **({ + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_CHOICE, + "options": { + column: column + for column in columns} + } + } if columns else {}) + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on images downloaded from a dataset + + :param module: Module to determine compatibility with + """ + return module.type.startswith("image-downloader") + + def process(self): + column = self.parameters.get("column") + metadata = None + for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]): + with file.open() as infile: + try: + metadata = json.load(infile) + except json.JSONDecodeError: + pass + + if not metadata: + return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + id_file_map = {} + for url, details in metadata.items(): + for item_id in details.get("post_ids", []): + id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"] + + root_dataset = None + for parent in reversed(self.dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + + if not root_dataset: + return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + network = nx.DiGraph() + processed = 0 + for item in root_dataset.iterate_items(): + self.dataset.update_progress(processed / root_dataset.num_rows) + processed += 1 + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") + + if self.interrupted: + raise ProcessorInterruptedException() + + if item.get("id") not in id_file_map: + continue + + from_node_label = item.get(column) + from_node = f"{column}-{from_node_label}" + to_node_label = id_file_map[item.get("id")] + to_node = f"image-{to_node_label}" + + if from_node not in network.nodes: + network.add_node(from_node, label=from_node_label, category=column) + + if to_node not in network.nodes: + network.add_node(to_node, label=to_node_label, category="image", image=to_node_label) + + edge = (from_node, to_node) + if edge not in network.edges(): + network.add_edge(*edge, frequency=0) + + network.edges[edge]["frequency"] += 1 + + self.dataset.update_status("Writing network file") + nx.write_gexf(network, self.dataset.get_results_path()) + 
self.dataset.finish(len(network.nodes)) From c479a85b71710b65ca44cb5330112c6cc1a84c00 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 17:39:26 +0200 Subject: [PATCH 11/48] Filename filter lost code --- backend/lib/processor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 5dbb09a1..29efde8c 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -497,6 +497,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T :param bool immediately_delete: Temporary files are removed after yielded; False keeps files until the staging_area is removed (usually during processor cleanup) + :param list filename_filter: Whitelist of filenames to iterate. + Other files will be ignored. If empty, do not ignore anything. :return: An iterator with a Path item for each file """ @@ -513,6 +515,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T archive_contents = sorted(archive_file.namelist()) for archived_file in archive_contents: + if filename_filter and archived_file not in filename_filter: + continue + info = archive_file.getinfo(archived_file) if info.is_dir(): continue From 5d5a0e30bb111a4096c22dc7929e79ca1a9d1f9c Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 19:04:34 +0200 Subject: [PATCH 12/48] Catch rate limits in Telegram media downloads --- .../visualisation/download-telegram-images.py | 35 ++++++++++++++----- .../visualisation/download-telegram-videos.py | 12 ++++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199..9f0d38ee 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -7,13 +7,14 @@ from pathlib import Path +import telethon.errors from telethon import TelegramClient from telethon.errors import TimedOutError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet from processors.visualisation.download_images import ImageDownloader @@ -194,6 +195,13 @@ async def get_images(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while downloading images") + if not message: + # message no longer exists + self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " + f"may have been deleted)") + self.flawless = False + continue + success = False try: # it's actually unclear if images are always jpegs, but this @@ -215,14 +223,23 @@ async def get_images(self): msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False - - media_done += 1 - self.metadata[filename] = { - "filename": filename, - "success": success, - "from_dataset": self.source_dataset.key, - "post_ids": [msg_id] - } + finally: + media_done += 1 + self.metadata[filename] = { + "filename": filename, + "success": success, + "from_dataset": self.source_dataset.key, + "post_ids": [msg_id] + } + + except telethon.errors.FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + 
self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d4423..b441ff9d 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,12 +8,13 @@ from pathlib import Path from telethon import TelegramClient +from telethon.errors import FloodError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from processors.visualisation.download_videos import VideoDownloaderPlus -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet __author__ = "Stijn Peeters" @@ -210,6 +211,15 @@ async def get_videos(self): "from_dataset": self.source_dataset.key, "post_ids": [msg_id] } + + except FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})") From 3df74c9be01118c354eb1457895052cace37cb9a Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:11:09 +0200 Subject: [PATCH 13/48] Catch bad request error in Telegram media download --- processors/visualisation/download-telegram-images.py | 4 ++-- processors/visualisation/download-telegram-videos.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 9f0d38ee..dda0ad82 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -9,7 +9,7 @@ import telethon.errors from telethon import TelegramClient -from telethon.errors import TimedOutError +from telethon.errors import TimedOutError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor @@ -218,7 +218,7 @@ async def get_images(self): await client.download_media(message, str(path), thumb=-1) msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index b441ff9d..aa05173c 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,7 +8,7 @@ from pathlib import Path from telethon import TelegramClient -from telethon.errors import FloodError +from telethon.errors import FloodError, BadRequestError from 
common.config_manager import config from backend.lib.processor import BasicProcessor @@ -198,7 +198,7 @@ async def get_videos(self): msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download video for message {msg_id} ({e})") From a8bec31bfa17b8ac6042a4ff8c4262a229c73f35 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:46:26 +0200 Subject: [PATCH 14/48] Catch error in the right place... --- processors/visualisation/download-telegram-images.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index dda0ad82..3cb3e924 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -218,7 +218,7 @@ async def get_images(self): await client.download_media(message, str(path), thumb=-1) msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError, BadRequestError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") @@ -232,6 +232,10 @@ async def get_images(self): "post_ids": [msg_id] } + except BadRequestError: + self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") + self.flawless = False + except telethon.errors.FloodError as e: later = "later" if hasattr(e, "seconds"): From ec3422e2f85ac3f8576a7448c098aed9c95a3d83 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:51:48 +0200 Subject: [PATCH 15/48] =?UTF-8?q?a=C4=81a=C4=81a=C4=81a=C4=81a=C4=81a?= =?UTF-8?q?=C4=81a=C4=81a=C4=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/visualisation/download-telegram-images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 3cb3e924..a3309e96 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -232,7 +232,7 @@ async def get_images(self): "post_ids": [msg_id] } - except BadRequestError: + except BadRequestError as e: self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") self.flawless = False From ac543cc8bf73d4ecb61ca4d93f162fa198f645a5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 19:05:17 +0200 Subject: [PATCH 16/48] Break instead of continue when trying to download deleted Telegram image --- processors/visualisation/download-telegram-images.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index a3309e96..6394862e 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -200,7 +200,7 @@ async def get_images(self): 
self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " f"may have been deleted)") self.flawless = False - continue + break success = False try: @@ -223,6 +223,7 @@ async def get_images(self): msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False + finally: media_done += 1 self.metadata[filename] = { From bb0909cc4d8908f2df2ca95c9a9896631c055a12 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:21:33 +0200 Subject: [PATCH 17/48] dumb image things --- processors/filtering/unique_images.py | 40 +------------- processors/networks/image-network.py | 79 +++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 47 deletions(-) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 0970d1f8..b0a9a2cb 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -1,15 +1,12 @@ """ Filter by unique images """ -import imagehash -import hashlib import shutil import json -from PIL import Image from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, hash_file __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -60,37 +57,6 @@ def is_compatible_with(cls, module=None, user=None): return module.get_media_type() == "image" or module.type.startswith( "image-downloader") or module.type == "video-frames" - def hash_file(self, image_file, hash_type="file-hash"): - """ - Generate an image hash - - :param Path image_file: Image file to hash - :param str hash_type: Hash type, one of `file-hash`, `colorhash`, - `phash`, `average_hash`, `dhash` - :return str: Hexadecimal hash value - """ - if not image_file.exists(): - raise FileNotFoundError() - - if hash_type == "file-hash": - hasher = hashlib.sha1() - - # Open the file in binary mode - with image_file.open("rb") as infile: - # Read and update hash in chunks to handle large files - while chunk := infile.read(1024): - hasher.update(chunk) - - return hasher.hexdigest() - - elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): - image = Image.open(image_file) - - return str(getattr(imagehash, hash_type)(image)) - - else: - raise NotImplementedError(f"Unknown hash type '{hash_type}'") - def process(self): """ Loop through images and only retain ones that have not been seen yet @@ -111,7 +77,7 @@ def process(self): self.dataset.update_progress(processed / self.source_dataset.num_rows) if processed % 100 == 0: - self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " f"found {dupes:,} duplicate(s)") processed += 1 @@ -120,7 +86,7 @@ def process(self): metadata = json.load(infile) continue - image_hash = self.hash_file(image_file, self.parameters.get("hash-type")) + image_hash = hash_file(image_file, self.parameters.get("hash-type")) if image_hash not in seen_hashes: seen_hashes.add(image_hash) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 123a1189..3d153b9b 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.helpers 
import hash_file import networkx as nx @@ -61,6 +62,20 @@ def get_options(cls, parent_dataset=None, user=None): "tooltip": "The image node label will have this value. Depending on the network visualisation software " "you use, one or the other is required to display the images as nodes." }, + "deduplicate": { + "type": UserInput.OPTION_CHOICE, + "help": "Merge images", + "tooltip": "Similar images can be merged into a single node, represented by the first image of the set " + "that was encountered.", + "options": { + "none": "Do not merge", + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + }, **({ "column": { "help": "Dataset field", @@ -83,22 +98,48 @@ def is_compatible_with(cls, module=None, user=None): def process(self): column = self.parameters.get("column") + hash_type = self.parameters.get("deduplicate") + filename_filter = [".metadata.json"] if hash_type == "none" else [] metadata = None - for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]): - with file.open() as infile: + hashed = 0 + + # some maps to make sure we use the right value in the right place + # url or filename, original image or duplicate, etc + file_hash_map = {} + hash_file_map = {} + seen_hashes = set() + id_file_map = {} + + for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter): + if file.name == ".metadata.json": + with file.open() as infile: + try: + metadata = json.load(infile) + file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()} + except json.JSONDecodeError: + pass + else: try: - metadata = json.load(infile) - except json.JSONDecodeError: - pass + hashed += 1 + if hashed % 100 == 0: + self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)") + self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5) + file_hash = hash_file(file, hash_type) + file_hash_map[file.name] = file_hash + if file_hash not in hash_file_map: + hash_file_map[file_hash] = file.name + + except (FileNotFoundError, ValueError) as e: + continue if not metadata: return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " "be run on sets of images sourced from another 4CAT dataset.") - id_file_map = {} + file_url_map = {v["filename"]: u for u, v in metadata.items()} for url, details in metadata.items(): for item_id in details.get("post_ids", []): - id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"] + id_file_map[item_id] = details["filename"] root_dataset = None for parent in reversed(self.dataset.get_genealogy()): @@ -113,7 +154,12 @@ def process(self): network = nx.DiGraph() processed = 0 for item in root_dataset.iterate_items(): - self.dataset.update_progress(processed / root_dataset.num_rows) + progress = processed / root_dataset.num_rows + if hashed: + # if hashing was necessary, we approximate that as 50% of the work + progress = (progress * 0.5) + 0.5 + + self.dataset.update_progress(progress) processed += 1 if processed % 100 == 0: 
self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") @@ -126,9 +172,22 @@ def process(self): from_node_label = item.get(column) from_node = f"{column}-{from_node_label}" - to_node_label = id_file_map[item.get("id")] - to_node = f"image-{to_node_label}" + image_file = id_file_map[item.get("id")] + image_hash = file_hash_map[image_file] + if image_hash in seen_hashes: + to_node_label = hash_file_map[image_hash] + if image_file != to_node_label: + self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.") + + else: + seen_hashes.add(image_hash) + to_node_label = id_file_map[item.get("id")] + + if self.parameters.get("image-value") == "url": + to_node_label = file_url_map[to_node_label] + + to_node = f"image-{to_node_label}" if from_node not in network.nodes: network.add_node(from_node, label=from_node_label, category=column) From 1aec9e6837da3e322a4d128fc5254edf00ff4308 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:23:44 +0200 Subject: [PATCH 18/48] help!! --- common/lib/helpers.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 2911044f..5fe5df48 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,8 +1,9 @@ """ Miscellaneous helper functions for the 4CAT backend """ -import hashlib import subprocess +import imagehash +import hashlib import requests import datetime import smtplib @@ -23,6 +24,7 @@ from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version +from PIL import Image from common.lib.user_input import UserInput from common.config_manager import config @@ -404,6 +406,37 @@ def andify(items): return ", ".join([str(item) for item in items]) + result +def hash_file(image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + def get_yt_compatible_ids(yt_ids): """ :param yt_ids list, a list of strings From ea9e3f5bddbb3ec7e4abb3b8cfd17d79675dcb74 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:24:54 +0200 Subject: [PATCH 19/48] =?UTF-8?q?=F0=9F=98=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/filtering/unique_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index b0a9a2cb..a8dd8763 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -70,7 +70,7 @@ def process(self): processed = 0 staging_area = self.dataset.get_staging_area() - self.dataset.update_progress("Processing images and looking for duplicates") + 
self.dataset.update_status("Processing images and looking for duplicates") for image_file in self.iterate_archive_contents(self.source_file): if self.interrupted: raise ProcessorInterruptedException("Interrupted while filtering for unique images") From 96528d15daba77bd4e9ef05e2eaa65e3336b0bee Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 12:59:18 +0200 Subject: [PATCH 20/48] =?UTF-8?q?Clean=20that=20code=20=F0=9F=A7=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/networks/image-network.py | 35 ++++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 3d153b9b..3b93a3df 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -170,31 +170,36 @@ def process(self): if item.get("id") not in id_file_map: continue - from_node_label = item.get(column) - from_node = f"{column}-{from_node_label}" + # from nodes are the dataset fields (e.g. 'body' or 'chat') + # to node names are filenames (optionally mapped to URLs later) + from_node = item.get(column) + from_node_id = f"{column}-{from_node}" image_file = id_file_map[item.get("id")] - image_hash = file_hash_map[image_file] - if image_hash in seen_hashes: - to_node_label = hash_file_map[image_hash] - if image_file != to_node_label: - self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.") + image_hash = file_hash_map.get(image_file) + if hash_type != "none" and image_hash in seen_hashes: + # if we're deduplicating and the image is already in the graph, + # merge the nodes (use the original node as the 'to node') + to_node = hash_file_map[image_hash] + if image_file != to_node: + self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " + f"merging.") else: seen_hashes.add(image_hash) - to_node_label = id_file_map[item.get("id")] + to_node = image_file if self.parameters.get("image-value") == "url": - to_node_label = file_url_map[to_node_label] + to_node = file_url_map[to_node] - to_node = f"image-{to_node_label}" - if from_node not in network.nodes: - network.add_node(from_node, label=from_node_label, category=column) + to_node_id = f"image-{to_node}" + if from_node_id not in network.nodes: + network.add_node(from_node_id, label=from_node, category=column) - if to_node not in network.nodes: - network.add_node(to_node, label=to_node_label, category="image", image=to_node_label) + if to_node_id not in network.nodes: + network.add_node(to_node_id, label=to_node, category="image", image=to_node) - edge = (from_node, to_node) + edge = (from_node_id, to_node_id) if edge not in network.edges(): network.add_edge(*edge, frequency=0) From caf451ad446dd4ba36b4da0939cb1a258a1e37e6 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 14:57:54 +0200 Subject: [PATCH 21/48] Fix image network for Telegram images --- processors/networks/image-network.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 3b93a3df..8b1549ce 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -139,6 +139,9 @@ def process(self): file_url_map = {v["filename"]: u for u, v in metadata.items()} for url, details in metadata.items(): for item_id in details.get("post_ids", []): + if self.source_dataset.type.endswith("-telegram"): + # telegram has 
weird IDs + item_id = "-".join(details["filename"].split("-")[:-1]) + "-" + str(item_id) id_file_map[item_id] = details["filename"] root_dataset = None From 80dfceddf377ddcff5e9f9c0d63ef07ea0138aa7 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 15:12:14 +0200 Subject: [PATCH 22/48] asdfghjkl; --- processors/networks/image-network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 8b1549ce..4267c965 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -183,8 +183,8 @@ def process(self): if hash_type != "none" and image_hash in seen_hashes: # if we're deduplicating and the image is already in the graph, # merge the nodes (use the original node as the 'to node') - to_node = hash_file_map[image_hash] - if image_file != to_node: + to_node = hash_file_map.get(image_hash) + if to_node and image_file != to_node: self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " f"merging.") @@ -192,6 +192,10 @@ def process(self): seen_hashes.add(image_hash) to_node = image_file + if not to_node: + # image could not be hashed, probably invalid file + continue + if self.parameters.get("image-value") == "url": to_node = file_url_map[to_node] From dcc0a21d3ea0a7955cca26d6e34d032598128b80 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 31 Oct 2024 14:29:39 +0100 Subject: [PATCH 23/48] fix: weird config.get() bug w/ default values --- common/config_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/config_manager.py b/common/config_manager.py index 1b8d4052..7760aae9 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None) if not is_json and value is not None: value = json.loads(value) - # TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale - elif default is not None: - value = default + # TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]: value = self.config_definition[setting_name]["default"] + elif value is None and default is not None: + value = default final_settings[setting_name] = value From 187101926bb7c650ced3d14a9cac17560ddb27b6 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 4 Nov 2024 10:25:37 +0000 Subject: [PATCH 24/48] Rebrand Twitter v2 datasource as compatible with the X Research API --- datasources/twitterv2/DESCRIPTION.md | 81 ++++++++--------- datasources/twitterv2/__init__.py | 2 +- datasources/twitterv2/search_twitter.py | 115 ++++++++++++------------ 3 files changed, 95 insertions(+), 103 deletions(-) diff --git a/datasources/twitterv2/DESCRIPTION.md b/datasources/twitterv2/DESCRIPTION.md index 57f1f7a5..d138e675 100644 --- a/datasources/twitterv2/DESCRIPTION.md +++ b/datasources/twitterv2/DESCRIPTION.md @@ -1,93 +1,88 @@ -Twitter data is gathered through the official [Twitter v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT -allows access to both the Standard and the Academic track. The Standard track is free for anyone to use, but only -allows to retrieve tweets up to seven days old. 
The Academic track allows a full-archive search of up to ten million -tweets per month (as of March 2022). For the Academic track, you need a valid Bearer token. You can request one -[here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you). +X/Twitter data is gathered through the official [X v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT can interface with X's Research API (sometimes +branded as the 'DSA API', referencing the EU's Digital Services Act). To retrieve posts via this API with 4CAT, you need +a valid Bearer token. Read more about this mode of access [here](https://developer.x.com/en/use-cases/do-research/academic-research). -Tweets are captured in batches at a speed of approximately 100,000 tweets per hour. 4CAT will warn you if your dataset +Posts are captured in batches at a speed of approximately 100,000 posts per hour. 4CAT will warn you if your dataset is expected to take more than 30 minutes to collect. It is often a good idea to start small (with very specific queries or narrow date ranges) and then only create a larger dataset if you are confident that it will be manageable and useful for your analysis. -If you hit your Twitter API quota while creating a dataset, the dataset will be finished with the tweets that have been +If you hit your X API quota while creating a dataset, the dataset will be finished with the posts that have been collected so far and a warning will be logged. ### Query syntax -Check the [API documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) +Check the [API documentation](https://developer.x.com/en/docs/x-api/tweets/search/integrate/build-a-query) for available query syntax and operators. This information is crucial to what data you collect. Important operators for -instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted tweets and retweets. Query syntax -is roughly the same as for Twitter's search interface, so you can try out most queries by entering them in the Twitter -app or website's search field and looking at the results. You can also test queries with -Twitter's [Query Builder](https://developer.twitter.com/apitools/query?query=). +instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted posts and reposts. Query syntax +is roughly the same as for X's search interface, so you can try out most queries by entering them in the X app or +website's search field and looking at the results. You can also test queries with +X's [Query Builder](https://developer.twitter.com/apitools/query?query=). ### Date ranges -By default, Twitter returns tweets posted within the past 30 days. If you want to go back further, you need to -explicitly set a date range. Note that Twitter does not like date ranges that end in the future, or start before -Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. +By default, X returns posts posted within the past 30 days. If you want to go back further, you need to +explicitly set a date range. Note that X does not like date ranges that end in the future, or start before +Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. Also note +that API access may come with certain limitations on how far a query may extend into history. 
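As an illustration (not part of 4CAT itself), here is a minimal sketch of a date-bounded request against the full-archive search endpoint this datasource uses; the bearer token, query, and start date are placeholders, and `start_time`/`end_time` must be RFC 3339 timestamps:

```python
import datetime
import requests

# Placeholder credentials and query - substitute your own Research API bearer token.
BEARER_TOKEN = "YOUR_BEARER_TOKEN"

# End the range at yesterday, since X rejects ranges that end in the future.
yesterday = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)

params = {
    "query": "(4cat OR digitalmethods) -is:retweet -is:nullcast",  # example query
    "start_time": "2023-01-01T00:00:00Z",                          # example start date
    "end_time": yesterday.strftime("%Y-%m-%dT%H:%M:%SZ"),
    "max_results": 100,
}

response = requests.get(
    "https://api.x.com/2/tweets/search/all",
    headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
    params=params,
)
response.raise_for_status()

# Each returned post has at least an 'id' and 'text' field.
for post in response.json().get("data", []):
    print(post["id"], post["text"])
```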
### Geo parameters -Twitter offers a number of ways -to [query by location/geo data](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location) -such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. This feature is only available for the Academic level; -you will receive a 400 error if using queries filtering by geographic information. +X offers a number of ways +to [query by location/geo data](https://developer.x.com/en/docs/tutorials/filtering-tweets-by-location) +such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. ### Retweets -A retweet from Twitter API v2 contains at maximum 140 characters from the original tweet. 4CAT therefore -gathers both the retweet and the original tweet and reformats the retweet text so it resembles a user's experience. +A repost from X API v2 contains at maximum 140 characters from the original post. 4CAT therefore +gathers both the repost and the original post and reformats the repost text so it resembles a user's experience. This also affects mentions, hashtags, and other data as only those contained in the first 140 characters are provided -by Twitter API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added -to the retweet for 4CAT analysis methods. *4CAT stores the data from Twitter API v2 as similar as possible to the format +by X API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added +to the repost for 4CAT analysis methods. *4CAT stores the data from X API v2 as similar as possible to the format in which it was received which you can obtain by downloading the ndjson file.* *Example 1* -[This retweet](https://twitter.com/tonino1630/status/1554618034299568128) returns the following data: +[This repost](https://x.com/tonino1630/status/1554618034299568128) returns the following data: - *author:* `tonino1630` -- * - text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` +- *text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` - *mentions:* `ChuckyFrao` - *hashags:*
-While the original tweet will return (as a reference tweet) this data: +While the original post will return (as a reference post) this data: - *author:* `ChuckyFrao` -- * - text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` +- *text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` - *mentions:* `POTUS, usembassyve, StateSPEHA, StateDept, SecBlinken` - *hashtags:* `FreeAlexSaab, BringAlexHome, IntegridadTerritorial`
-As you can see, only the author of the original tweet is listed as a mention in the retweet. +As you can see, only the author of the original post is listed as a mention in the repost. *Example 2* -[This retweet](https://twitter.com/Macsmart31/status/1554618041459445760) returns the following: +[This repost](https://x.com/Macsmart31/status/1554618041459445760) returns the following: - *author:* `Macsmart31` -- * - text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` +- *text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` - *mentions:* `mickyd123us, tribelaw, HonorDecency`
-Compared with the original tweet referenced below: +Compared with the original post referenced below: - *author:* `mickyd123us` -- * - text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` +- *text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` - *mentions:* `tribelaw, HonorDecency`
-Because the mentioned users are in the first 140 characters of the original tweet, they are also listed as mentions in the retweet. - -The key difference here is that example one the retweet contains none of the hashtags or mentions from the original -tweet (they are beyond the first 140 characters) while the second retweet example does return mentions from the original -tweet. *Due to this discrepancy, for retweets all mentions and hashtags of the original tweet are considered as mentions -and hashtags of the retweet.* A user on Twitter will see all mentions and hashtags when viewing a retweet and the -retweet would be a part of any network around those mentions and hashtags. +Because the mentioned users are in the first 140 characters of the original post, they are also listed as mentions in +the repost. + +The key difference here is that in example one the repost contains none of the hashtags or mentions from the original +post (they are beyond the first 140 characters) while the second repost example does return mentions from the original +post. *Due to this discrepancy, for reposts all mentions and hashtags of the original post are considered as mentions +and hashtags of the repost.* A user on X will see all mentions and hashtags when viewing a repost and the +repost would be a part of any network around those mentions and hashtags. diff --git a/datasources/twitterv2/__init__.py b/datasources/twitterv2/__init__.py index 3335bc7c..6aa80c7b 100644 --- a/datasources/twitterv2/__init__.py +++ b/datasources/twitterv2/__init__.py @@ -9,4 +9,4 @@ # Internal identifier for this data source DATASOURCE = "twitterv2" -NAME = "Twitter API (v2) Search" \ No newline at end of file +NAME = "X/Twitter API (v2) Search" \ No newline at end of file diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 999680b6..8b91d1eb 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -1,5 +1,5 @@ """ -Twitter keyword search via the Twitter API v2 +X/Twitter keyword search via the X API v2 """ import requests import datetime @@ -17,13 +17,10 @@ class SearchWithTwitterAPIv2(Search): """ - Get Tweets via the Twitter API - - This only allows for historical search - use f.ex. TCAT for more advanced - queries. + Get Tweets via the X API """ type = "twitterv2-search" # job ID - title = "Twitter API (v2)" + title = "X/Twitter API (v2)" extension = "ndjson" is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -32,15 +29,15 @@ class SearchWithTwitterAPIv2(Search): import_issues = True references = [ - "[Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api)" + "[X/Twitter API documentation](https://developer.x.com/en/docs/x-api)" ] config = { "twitterv2-search.academic_api_key": { "type": UserInput.OPTION_TEXT, "default": "", - "help": "Academic API Key", - "tooltip": "An API key for the Twitter v2 Academic API. If " + "help": "Research API Key", + "tooltip": "An API key for the X/Twitter v2 Research API. If " "provided, the user will not need to enter their own " "key to retrieve tweets. Note that this API key should " "have access to the Full Archive Search endpoint." @@ -50,15 +47,15 @@ class SearchWithTwitterAPIv2(Search): "default": 0, "min": 0, "max": 10_000_000, - "help": "Max tweets per dataset", + "help": "Max posts per dataset", "tooltip": "4CAT will never retrieve more than this amount of " - "tweets per dataset. 
Enter '0' for unlimited tweets." + "posts per dataset. Enter '0' for unlimited posts." }, "twitterv2-search.id_lookup": { "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Allow lookup by ID", - "tooltip": "If enabled, allow users to enter a list of tweet IDs " + "tooltip": "If enabled, allow users to enter a list of post IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." } @@ -110,7 +107,7 @@ def get_items(self, query): } if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"): - endpoint = "https://api.twitter.com/2/tweets" + endpoint = "https://api.x.com/2/tweets" tweet_ids = self.parameters.get("query", []).split(',') @@ -126,7 +123,7 @@ def get_items(self, query): else: # Query to all or search - endpoint = "https://api.twitter.com/2/tweets/search/" + api_type + endpoint = "https://api.x.com/2/tweets/search/" + api_type queries = [self.parameters.get("query", "")] @@ -158,7 +155,7 @@ def get_items(self, query): while True: if self.interrupted: - raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API") + raise ProcessorInterruptedException("Interrupted while getting posts from the Twitter API") # there is a limit of one request per second, so stay on the safe side of this while self.previous_request == int(time.time()): @@ -188,18 +185,18 @@ def get_items(self, query): try: structured_response = api_response.json() if structured_response.get("title") == "UsageCapExceeded": - self.dataset.update_status("Hit the monthly tweet cap. You cannot capture more tweets " - "until your API quota resets. Dataset completed with tweets " + self.dataset.update_status("Hit the monthly post cap. You cannot capture more posts " + "until your API quota resets. Dataset completed with posts " "collected so far.", is_final=True) return except (json.JSONDecodeError, ValueError): - self.dataset.update_status("Hit Twitter rate limit, but could not figure out why. Halting " - "tweet collection.", is_final=True) + self.dataset.update_status("Hit X's rate limit, but could not figure out why. Halting " + "post collection.", is_final=True) return resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str) + self.dataset.update_status("Hit X's rate limit - waiting until %s to continue." % resume_at_str) while time.time() <= resume_at: if self.interrupted: raise ProcessorInterruptedException("Interrupted while waiting for rate limit to reset") @@ -211,10 +208,10 @@ def get_items(self, query): elif api_response.status_code == 403: try: structured_response = api_response.json() - self.dataset.update_status("'Forbidden' error from the Twitter API. Could not connect to Twitter API " + self.dataset.update_status("'Forbidden' error from the X API. Could not connect to X API " "with this API key. %s" % structured_response.get("detail", ""), is_final=True) except (json.JSONDecodeError, ValueError): - self.dataset.update_status("'Forbidden' error from the Twitter API. Your key may not have access to " + self.dataset.update_status("'Forbidden' error from the X API. 
Your key may not have access to " "the full-archive search endpoint.", is_final=True) finally: return @@ -224,7 +221,7 @@ def get_items(self, query): elif api_response.status_code in (502, 503, 504): resume_at = time.time() + 60 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % ( + self.dataset.update_status("X unavailable (status %i) - waiting until %s to continue." % ( api_response.status_code, resume_at_str)) while time.time() <= resume_at: time.sleep(0.5) @@ -233,7 +230,7 @@ def get_items(self, query): # this usually means the query is too long or otherwise contains # a syntax error elif api_response.status_code == 400: - msg = "Response %i from the Twitter API; " % api_response.status_code + msg = "Response %i from the X API; " % api_response.status_code try: api_response = api_response.json() msg += api_response.get("title", "") @@ -247,19 +244,19 @@ def get_items(self, query): # invalid API key elif api_response.status_code == 401: - self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True) + self.dataset.update_status("Invalid API key - could not connect to X API", is_final=True) return # haven't seen one yet, but they probably exist elif api_response.status_code != 200: self.dataset.update_status( "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True) - self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % ( + self.log.warning("X API v2 responded with status code %i. Response body: %s" % ( api_response.status_code, api_response.text)) return elif not api_response: - self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True) + self.dataset.update_status("Could not connect to X. Cancelling.", is_final=True) return api_response = api_response.json() @@ -291,13 +288,13 @@ def get_items(self, query): if num_missing_objects > 50: # Large amount of missing objects; possible error with Twitter API self.import_issues = False - error_report.append('%i missing objects received following tweet number %i. Possible issue with Twitter API.' % (num_missing_objects, tweets)) + error_report.append('%i missing objects received following post number %i. Possible issue with X API.' 
% (num_missing_objects, tweets)) error_report.append('Missing objects collected: ' + ', '.join(['%s: %s' % (k, len(v)) for k, v in missing_objects.items()])) # Warn if new missing object is recorded (for developers to handle) expected_error_types = ['user', 'media', 'poll', 'tweet', 'place'] if any(key not in expected_error_types for key in missing_objects.keys()): - self.log.warning("Twitter API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) + self.log.warning("X API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) # Loop through and collect tweets for tweet in api_response.get("data", []): @@ -312,7 +309,7 @@ def get_items(self, query): tweets += 1 if tweets % 500 == 0: - self.dataset.update_status("Received %s of ~%s tweets from the Twitter API" % ("{:,}".format(tweets), expected_tweets)) + self.dataset.update_status("Received %s of ~%s tweets from the X API" % ("{:,}".format(tweets), expected_tweets)) if num_expected_tweets is not None: self.dataset.update_progress(tweets / num_expected_tweets) @@ -474,21 +471,19 @@ def get_options(cls, parent_dataset=None, user=None): max_tweets = config.get("twitterv2-search.max_tweets", user=user) if have_api_key: - intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve " + intro_text = ("This data source uses the full-archive search endpoint of the X API (v2) to retrieve " "historic tweets that match a given query.") else: - intro_text = ("This data source uses either the Standard 7-day historical Search endpoint or the " - "full-archive search endpoint of the Twitter API, v2. To use the latter, you must have " - "access to the Academic Research track of the Twitter API. In either case, you will need to " - "provide a valid [bearer " - "token](https://developer.twitter.com/en/docs/authentication/oauth-2-0). The bearer token " - "**will be sent to the 4CAT server**, where it will be deleted after data collection has " - "started. Note that any tweets retrieved with 4CAT will count towards your monthly Tweet " - "retrieval cap.") - - intro_text += ("\n\nPlease refer to the [Twitter API documentation](" - "https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " + intro_text = ("This data source uses the full-archive search endpoint of the X/Twitter API, v2. To use the " + "it, you must have access to the Research track of the X API. You will need to provide a " + "valid [bearer token](https://developer.x.com/en/docs/authentication/oauth-2-0). The " + "bearer token **will be sent to the 4CAT server**, where it will be deleted after data " + "collection has started. Note that any posts retrieved with 4CAT will count towards your " + "monthly post retrieval cap.") + + intro_text += ("\n\nPlease refer to the [X API documentation](" + "https://developer.x.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " "documentation for more information about this API endpoint and the syntax you can use in your " "search query. 
Retweets are included by default; add `-is:retweet` to exclude them.") @@ -500,16 +495,18 @@ def get_options(cls, parent_dataset=None, user=None): } if not have_api_key: + # options.update({ + # "api_type": { + # "type": UserInput.OPTION_CHOICE, + # "help": "API track", + # "options": { + # "all": "Research API: Full-archive search", + # "recent": "Standard: Recent search (Tweets published in last 7 days)", + # }, + # "default": "all" + # } + # }) options.update({ - "api_type": { - "type": UserInput.OPTION_CHOICE, - "help": "API track", - "options": { - "all": "Academic: Full-archive search", - "recent": "Standard: Recent search (Tweets published in last 7 days)", - }, - "default": "all" - }, "api_bearer_token": { "type": UserInput.OPTION_TEXT, "sensitive": True, @@ -523,10 +520,10 @@ def get_options(cls, parent_dataset=None, user=None): "query_type": { "type": UserInput.OPTION_CHOICE, "help": "Query type", - "tooltip": "Note: Num of Tweets and Date fields ignored with 'Tweets by ID' lookup", + "tooltip": "Note: Num of posts and date fields are ignored with 'Posts by ID' lookup", "options": { "query": "Search query", - "id_lookup": "Tweets by ID (list IDs seperated by commas or one per line)", + "id_lookup": "Posts by ID (list IDs seperated by commas or one per line)", }, "default": "query" } @@ -539,7 +536,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "amount": { "type": UserInput.OPTION_TEXT, - "help": "Tweets to retrieve", + "help": "Posts to retrieve", "tooltip": "0 = unlimited (be careful!)" if not max_tweets else ("0 = maximum (%s)" % str(max_tweets)), "min": 0, "max": max_tweets if max_tweets else 10_000_000, @@ -550,7 +547,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "daterange-info": { "type": UserInput.OPTION_INFO, - "help": "By default, Twitter returns tweets up til 30 days ago. If you want to go back further, you " + "help": "By default, X returns posts up til 30 days ago. If you want to go back further, you " "need to explicitly set a date range." }, "daterange": { @@ -591,7 +588,7 @@ def validate_query(query, request, user): raise QueryParametersException("Please provide a valid bearer token.") if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup": - raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.") + raise QueryParametersException("X API queries cannot be longer than 1024 characters.") if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user): # reformat queries to be a comma-separated list with no wrapping @@ -630,7 +627,7 @@ def validate_query(query, request, user): # to dissuade users from running huge queries that will take forever # to process if params["query_type"] == "query" and (params.get("api_type") == "all" or have_api_key): - count_url = "https://api.twitter.com/2/tweets/counts/all" + count_url = "https://api.x.com/2/tweets/counts/all" count_params = { "granularity": "day", "query": params["query"], @@ -668,7 +665,7 @@ def validate_query(query, request, user): elif response.status_code == 401: raise QueryParametersException("Your bearer token seems to be invalid. Please make sure it is valid " - "for the Academic Track of the Twitter API.") + "for the Research track of the X API.") elif response.status_code == 400: raise QueryParametersException("Your query is invalid. 
Please make sure the date range does not " @@ -791,7 +788,7 @@ def map_item(item): "thread_id": item.get("conversation_id", item["id"]), "timestamp": tweet_time.strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(tweet_time.timestamp()), - 'link': "https://twitter.com/%s/status/%s" % (author_username, item.get('id')), + 'link': "https://x.com/%s/status/%s" % (author_username, item.get('id')), "subject": "", "body": item["text"], "author": author_username, From ded8d3df49e7b3cf1e142bfc80019edb43f1adfe Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 11 Nov 2024 16:29:30 +0100 Subject: [PATCH 25/48] Do not instantiate logging handlers twice --- common/lib/logger.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/common/lib/logger.py b/common/lib/logger.py index bbd30c44..ddffa2d7 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log' self.logger.setLevel(log_level) # this handler manages the text log files - handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) - handler.setLevel(log_level) - handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", - "%d-%m-%Y %H:%M:%S")) - self.logger.addHandler(handler) - - # the slack webhook has its own handler, and is only active if the - # webhook URL is set - try: - if config.get("logging.slack.webhook"): - slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) - slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) - self.logger.addHandler(slack_handler) - except Exception: - # we *may* need the logger before the database is in working order - if config.db is not None: - config.db.rollback() + if not self.logger.handlers: + handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) + handler.setLevel(log_level) + handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", + "%d-%m-%Y %H:%M:%S")) + self.logger.addHandler(handler) + + # the slack webhook has its own handler, and is only active if the + # webhook URL is set + try: + if config.get("logging.slack.webhook"): + slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) + slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) + self.logger.addHandler(slack_handler) + except Exception: + # we *may* need the logger before the database is in working order + if config.db is not None: + config.db.rollback() def log(self, message, level=logging.INFO, frame=None): """ From dc173249e8e82f794f4b857249108fe191adf370 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 13 Nov 2024 11:19:40 +0100 Subject: [PATCH 26/48] Remove .readthedocs.yaml We're not doing docs at the moment --- .readthedocs.yaml | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index faaf6921..00000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-20.04 - tools: - python: "3.8" - -# Build documentation in the docs/ directory with Sphinx 
-sphinx: - configuration: docs/conf.py - -# Optionally build your docs in additional formats such as PDF -# formats: -# - pdf - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: docs/requirements.txt \ No newline at end of file From 4e5ef88be340b1880281f07587cbdbc2bbd74313 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 13 Nov 2024 14:11:37 +0100 Subject: [PATCH 27/48] Pass modules to dataset in video timelines processor --- processors/visualisation/video_timelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index f668e6f5..9270fb3f 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -207,7 +207,7 @@ def get_video_labels(self, metadata): labels[filename] = filename for dataset, urls in mapping_dataset.items(): - dataset = DataSet(key=dataset, db=self.db).nearest("*-search") + dataset = DataSet(key=dataset, db=self.db, modules=self.modules).nearest("*-search") # determine appropriate label # is this the right place? should it be in the datasource? From 9197c99af5b2bc27a2e98c0bb72e761712ff91f3 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 18 Nov 2024 12:16:48 +0100 Subject: [PATCH 28/48] Avoid use of chdir() --- common/lib/helpers.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 5fe5df48..cd26d575 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -9,6 +9,7 @@ import smtplib import fnmatch import socket +import shlex import copy import time import json @@ -112,10 +113,8 @@ def get_git_branch(): repository or git is not installed an empty string is returned. """ try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE) - os.chdir(cwd) + root_dir = str(config.get('PATH_ROOT').resolve()) + branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE) if branch.returncode != 0: raise ValueError() return branch.stdout.decode("utf-8").strip() @@ -145,7 +144,6 @@ def get_software_commit(worker=None): # try git command line within the 4CAT root folder # if it is a checked-out git repository, it will tell us the hash of # the currently checked-out commit - cwd = os.getcwd() # path has no Path.relative()... 
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent @@ -155,24 +153,24 @@ def get_software_commit(worker=None): # useful version info (since the extension is by definition not in the # main 4CAT repository) and will return an empty value if worker and worker.is_extension: - extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) - os.chdir(extension_dir) + working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve()) # check if we are in the extensions' own repo or 4CAT's - repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel" + repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): # not its own repository return ("", "") else: - os.chdir(config.get("PATH_ROOT")) + working_dir = str(config.get("PATH_ROOT").resolve()) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if show.returncode != 0: raise ValueError() commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] # now get the repository the commit belongs to, if we can - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() repository = origin.stdout.decode("utf-8").strip() @@ -182,9 +180,6 @@ def get_software_commit(worker=None): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: return ("", "") - finally: - os.chdir(cwd) - return (commit, repository) def get_software_version(): @@ -280,7 +275,6 @@ def find_extensions(): # collect metadata for extensions allowed_metadata_keys = ("name", "version", "url") - cwd = os.getcwd() for extension in extensions: extension_folder = extension_path.joinpath(extension) metadata_file = extension_folder.joinpath("metadata.json") @@ -297,8 +291,8 @@ def find_extensions(): if extensions[extension]["is_git"]: # try to get remote URL try: - os.chdir(extension_folder) - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + extension_root = str(extension_folder.resolve()) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() @@ -310,8 +304,6 @@ def find_extensions(): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: print(e) pass - finally: - os.chdir(cwd) return extensions, errors From 9453b76099ec03b3cea859f812890adb36e13df9 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 18 Nov 2024 13:57:27 +0100 Subject: [PATCH 29/48] Don't crash on skipped videos in scene detector --- processors/visualisation/video_scene_identifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index 634e8c49..5140baa0 100644 --- 
a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -252,8 +252,9 @@ def process(self): if video_data.get('success'): files = video_data.get('files') if 'files' in video_data else [{"filename": video_data.get("filename"), "success":True}] for file in files: - if not file.get("success"): + if not file.get("success") or file.get("filename") not in collected_scenes: continue + # List types are not super fun for CSV if 'post_ids' in video_data: video_data['post_ids'] = ','.join([str(i) for i in video_data['post_ids']]) From 2d4d60f9873fd1a3fd7b8ba2b7ddbc50d17c44f2 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 19 Nov 2024 11:50:32 +0100 Subject: [PATCH 30/48] fix video_hasher to properly cleanup videos with errors --- processors/visualisation/video_hasher.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ff1222bc..f23b3d0c 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -183,8 +183,9 @@ def process(self): self.dataset.log('Frames per seconds: %f' % frame_interval) # Prepare staging area for videos and video tracking + # VideoHash creates various files that may not be cleaned up on error so we use an output directory staging_area = self.dataset.get_staging_area() - self.dataset.log('Staging directory location: %s' % staging_area) + output_dir = self.dataset.get_staging_area() video_hashes = {} video_metadata = None @@ -224,7 +225,7 @@ def process(self): video_hashes[path.name] = {'videohash': videohash} - shutil.copy(videohash.collage_path, staging_area.joinpath(path.stem + '.jpg')) + shutil.copy(videohash.collage_path, output_dir.joinpath(path.stem + '.jpg')) video_hashes[path.name]['video_collage_filename'] = path.stem + '.jpg' processed_videos += 1 @@ -240,7 +241,7 @@ def process(self): if video_metadata is None: # Grab the metadata directly, if it exists but was skipped (e.g., not found prior to max_videos) try: - metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area) + metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, output_dir) except FileNotFoundError: metadata_path = None if metadata_path: @@ -293,7 +294,7 @@ def process(self): num_posts += 1 writer = None - with staging_area.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: + with output_dir.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: for row in rows: if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) @@ -303,7 +304,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(staging_area) + self.write_archive_and_finish(output_dir) class VideoHashNetwork(BasicProcessor): """ From 176905a6307e9c1afeb7b9f36083a26893ce9ec0 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 19 Nov 2024 12:40:49 +0100 Subject: [PATCH 31/48] fixes to video frames when all videos are corrupt --- processors/visualisation/video_frames.py | 18 +++++++++++------- processors/visualisation/video_hasher.py | 13 +++++++++---- processors/visualisation/video_timelines.py | 7 +++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 64b0c4f3..ec95f84f 100644 --- 
a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -94,7 +94,7 @@ def process(self): processed_videos = 0 self.dataset.update_status("Extracting video frames") - for path in self.iterate_archive_contents(self.source_file, staging_area): + for i, path in enumerate(self.iterate_archive_contents(self.source_file, staging_area)): if self.interrupted: raise ProcessorInterruptedException("Interrupted while determining image wall order") @@ -138,17 +138,21 @@ def process(self): outfile.write(ffmpeg_error) if result.returncode != 0: - error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode)) - self.dataset.log(error) + self.dataset.update_status(f"Unable to extract frames from video {vid_name} (see logs for details)") + self.dataset.log('Error Return Code (%s) with video %s: %s' % (str(result.returncode), vid_name, "\n".join(ffmpeg_error.split('\n')[-2:]) if ffmpeg_error else '')) + else: + processed_videos += 1 + self.dataset.update_status("Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - processed_videos += 1 - self.dataset.update_status( - "Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - self.dataset.update_progress(processed_videos / total_possible_videos) + self.dataset.update_progress(i / total_possible_videos) # Finish up # We've created a directory and folder structure here as opposed to a single folder with single files as # expected by self.write_archive_and_finish() so we use make_archive instead + if not processed_videos: + self.dataset.finish_with_error("Unable to extract frames from any videos") + return + from shutil import make_archive make_archive(self.dataset.get_results_path().with_suffix(''), "zip", output_directory) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index f23b3d0c..aad1baf6 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -216,11 +216,12 @@ def process(self): self.dataset.update_status("FFmpeg software not found. 
Please contact 4CAT maintainers.", is_final=True) self.dataset.finish(0) return - except FileNotFoundError as e: - self.dataset.update_status(f"Unable to find file {str(path)}") + except FileNotFoundError: + self.dataset.update_status(f"Unable to find file {path.name}") continue except FFmpegFailedToExtractFrames as e: - self.dataset.update_status(f"Unable to extract frame for {str(path)}: {e}") + self.dataset.update_status(f"Unable to extract frame for {path.name} (see log for details)") + self.dataset.log(f"Unable to extract frame for {str(path)}: {e}") continue video_hashes[path.name] = {'videohash': videohash} @@ -234,6 +235,10 @@ def process(self): self.dataset.update_progress(processed_videos / total_possible_videos) videohash.delete_storage_path() + if processed_videos == 0: + self.dataset.finish_with_error("Unable to create video hashes for any videos") + return + # Write hash file # This file is held here and then copied as its own dataset via VideoHasherTwo num_posts = 0 @@ -304,7 +309,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(output_dir) + self.write_archive_and_finish(output_dir, num_items=processed_videos) class VideoHashNetwork(BasicProcessor): """ diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index 9270fb3f..3c73e57f 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -117,6 +117,9 @@ def process(self): if previous_video is not None or not looping: # draw the video filename/label on top of the rendered # frame thumbnails + if not previous_video: + # This likely means no frames were found for the video and this processor should not have run + continue video_label = labels.get(previous_video, previous_video) footersize = (fontsize * (len(video_label) + 2) * 0.5925, fontsize * 2) footer_shape = SVG(insert=(0, base_height - footersize[1]), size=footersize) @@ -165,6 +168,10 @@ def process(self): timeline.add(frame_element) timeline_widths[video] += frame_width + if not timeline_widths: + self.dataset.finish_with_error("No video frames found") + return + # now we know all dimensions we can instantiate the canvas too canvas_width = max(timeline_widths.values()) fontsize = 12 From da8328edbbcf64395e69d06b5abf3708e5d60cc8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 22 Nov 2024 16:59:45 +0100 Subject: [PATCH 32/48] Don't show link thumbnails in Bsky widget --- webtool/templates/frontpage.html | 1 + 1 file changed, 1 insertion(+) diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index fa5cbc73..fee8c095 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -20,6 +20,7 @@

About this server

4CAT updates

Date: Mon, 25 Nov 2024 11:51:11 +0100 Subject: [PATCH 33/48] Don't ignore TikTok comments with missing author_pin --- datasources/tiktok_comments/search_tiktok_comments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index efaffc21..31471fcd 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -58,7 +58,7 @@ def map_item(item): "post_url": item["share_info"]["url"].split(".html")[0], "post_body": item["share_info"]["title"], "comment_url": item["share_info"]["url"], - "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no", + "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no", "is_sticky": "yes" if bool(item["stick_position"]) else "no", "is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes", "language_guess": item["comment_language"] From 0792ef4dae41ddf4f282a82801fced557738e807 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 27 Nov 2024 18:07:14 +0100 Subject: [PATCH 34/48] Don't show version in footer unless logged in --- webtool/templates/layout.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webtool/templates/layout.html b/webtool/templates/layout.html index e1ecda59..33f4ad61 100644 --- a/webtool/templates/layout.html +++ b/webtool/templates/layout.html @@ -85,7 +85,9 @@

{% endif %}
  • How to cite
  • Help & Bug Reports
  • + {% if current_user.is_authenticated %}
  • v{{ __version }}
  • + {% endif %}
  • OILab, 2018 – {{ __datenow.year }}
  • From 1396bb5f79fe8d1f77f47ef4f258b91f123bb2f3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 29 Nov 2024 11:15:54 +0100 Subject: [PATCH 35/48] pass through modules in merge_datasets --- processors/conversion/merge_datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 860c0ddb..461cdd54 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None): return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) @staticmethod - def get_dataset_from_url(url, db): + def get_dataset_from_url(url, db, modules=None): """ Get dataset object based on dataset URL @@ -68,6 +68,7 @@ def get_dataset_from_url(url, db): :param str url: Dataset URL :param db: Database handler (to retrieve metadata) + :param modules: Modules handler (pass through to DataSet) :return DataSet: The dataset """ if not url: @@ -75,7 +76,7 @@ def get_dataset_from_url(url, db): source_url = ural.normalize_url(url) source_key = source_url.split("/")[-1] - return DataSet(key=source_key, db=db) + return DataSet(key=source_key, db=db, modules=modules) def process(self): """ @@ -96,7 +97,7 @@ def process(self): continue try: - source_dataset = self.get_dataset_from_url(source_dataset_url, self.db) + source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules) except DataSetException: return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform " f"merge.") From cb2ef691153ba1c3b1c78e567ae070480df14e72 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:42:42 +0100 Subject: [PATCH 36/48] Shorten URLs in CSV preview (links still work) --- webtool/lib/template_filters.py | 34 ++++++++++++++++++++++++++++-- webtool/templates/preview/csv.html | 2 +- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 6ac9272b..c1ec867a 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -1,5 +1,7 @@ import urllib.parse import datetime +from math import floor + import markdown import json import ural @@ -120,7 +122,7 @@ def _jinja2_filter_httpquery(data): return "" @app.template_filter("add_ahref") -def _jinja2_filter_add_ahref(content): +def _jinja2_filter_add_ahref(content, ellipsiate=0): """ Add HTML links to text @@ -135,7 +137,11 @@ def _jinja2_filter_add_ahref(content): return content for link in set(ural.urls_from_text(str(content))): - content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link}') + if ellipsiate > 0: + link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]") + else: + link_text = link + content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link_text}') return content @@ -200,6 +206,30 @@ def _jinja2_filter_extension_to_noun(ext): else: return "item" +@app.template_filter("ellipsiate") +def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"): + if len(text) <= length: + return text + + elif not inside: + return text[:length] + ellipsis_str + + else: + # two cases: URLs and normal text + # for URLs, try to only ellipsiate after the domain name + # this makes the URLs easier to read when shortened + if ural.is_url(text): + pre_part = "/".join(text.split("/")[:3]) + if len(pre_part) < 
length - 6: # kind of arbitrary + before = len(pre_part) + 1 + else: + before = floor(length / 2) + else: + before = floor(length / 2) + + after = len(text) - before + return text[:before] + ellipsis_str + text[after:] + @app.template_filter('4chan_image') def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): diff --git a/webtool/templates/preview/csv.html b/webtool/templates/preview/csv.html index fc36bb9d..d2473735 100644 --- a/webtool/templates/preview/csv.html +++ b/webtool/templates/preview/csv.html @@ -20,7 +20,7 @@ {% endif %} {% endif %} - {{ cell|e|add_ahref|safe }} + {{ cell|e|add_ahref(ellipsiate=50)|safe }} {% endfor %} From 8e660a4674b5e570a51730a342c3336437ab9817 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:43:50 +0100 Subject: [PATCH 37/48] Fix author thumbnail in TikTok mapping --- datasources/tiktok/search_tiktok.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index f7cb7590..2c5a51c5 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -50,16 +50,16 @@ def map_item(post): # from intercepted API response user_nickname = post["author"]["uniqueId"] user_fullname = post["author"]["nickname"] - user_id = post["author"]["id"] + user_thumbnail = post["author"].get("avatarThumb", "") elif post.get("author"): # from embedded JSON object user_nickname = post["author"] user_fullname = post["nickname"] - user_id = "" + user_thumbnail = "" else: user_nickname = "" user_fullname = "" - user_id = "" + user_thumbnail = "" # there are various thumbnail URLs, some of them expire later than # others. Try to get the highest-resolution one that hasn't expired From 2f4211354c1b15d41f850ca9bace3fb9a69070e2 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:44:02 +0100 Subject: [PATCH 38/48] Add is_sensitive and is_photosensitive columns to TikTok mapping --- datasources/tiktok/search_tiktok.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 2c5a51c5..6bee534d 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -84,13 +84,15 @@ def map_item(post): "author_followers": post.get("authorStats", {}).get("followerCount", ""), "author_likes": post.get("authorStats", {}).get("diggCount", ""), "author_videos": post.get("authorStats", {}).get("videoCount", ""), - "author_avatar": post.get("avatarThumb", ""), + "author_avatar": user_thumbnail, "body": post["desc"], "timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'), "unix_timestamp": int(post["createTime"]), "is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no", "is_ad": "yes" if post.get("isAd", False) else "no", "is_paid_partnership": "yes" if post.get("adAuthorization") else "no", + "is_sensitive": "yes" if post.get("maskType") == 3 else "no", + "is_photosensitive": "yes" if post.get("maskType") == 4 else "no", "music_name": post["music"]["title"], "music_id": post["music"]["id"], "music_url": post["music"].get("playUrl", ""), From 8da18b397c28888160ae5e434390cb2b1f59547b Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:47:19 +0100 Subject: [PATCH 39/48] Zebra striping in csv preview table to help readability --- webtool/static/css/dataset-page.css | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 8e99832f..9eae3229 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -621,6 +621,10 @@ body.csv-preview table td, body.csv-preview table th { border: 1px solid var(--gray-light); } +body.csv-preview table tr:nth-child(2n+1) { + background: var(--contrast-bright); +} + .child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe { display: none; } From 0abe88569b175a6c956cb2441883bd8d5509a284 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 18:37:19 +0100 Subject: [PATCH 40/48] Delete unused webtool helper functions --- webtool/lib/helpers.py | 43 ------------------------------------------ 1 file changed, 43 deletions(-) diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index 6cc91eba..d0e74a37 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -96,30 +96,6 @@ def error(code=200, **kwargs): return response -def string_to_timestamp(string): - """ - Convert dd-mm-yyyy date to unix time - - :param string: Date string to parse - :return: The unix time, or 0 if value could not be parsed - """ - bits = string.split("-") - if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string): - bits = list(reversed(bits)) - - if len(bits) != 3: - return 0 - - try: - day = int(bits[0]) - month = int(bits[1]) - year = int(bits[2]) - date = datetime.datetime(year, month, day) - except ValueError: - return 0 - - return int(date.timestamp()) - def pad_interval(intervals, first_interval=None, last_interval=None): """ Pad an interval so all intermediate intervals are filled @@ -299,25 +275,6 @@ def generate_css_colours(force=False): ) -def get_preview(query): - """ - Generate a data preview of 25 rows of a results csv - - :param query - :return list: - """ - preview = [] - with query.get_results_path().open(encoding="utf-8") as resultfile: - posts = csv.DictReader(resultfile) - i = 0 - for post in posts: - i += 1 - preview.append(post) - if i > 25: - break - return preview - - def format_chan_post(post): """ Format a plain-text imageboard post post for HTML display From 6881cbadf36f1ff28c39543ce25a6e8b8796e31e Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 2 Dec 2024 22:42:40 +0100 Subject: [PATCH 41/48] Add option to TikTok image downloader for user avatars --- processors/visualisation/download_tiktok.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index c02b53bf..3854e965 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor): "options": { "thumbnail": "Video Thumbnail", "music": "Music Thumbnail", + "author_avatar": "User avatar" }, "default": "thumbnail" } @@ -217,6 +218,8 @@ def process(self): url_column = "thumbnail_url" elif self.parameters.get("thumb_type") == "music": url_column = "music_thumbnail" + elif self.parameters.get("thumb_type") == "author_avatar": + url_column = "author_avatar" else: self.dataset.update_status("No image column selected.", is_final=True) self.dataset.finish(0) From e53b73f75a5acfa0373072d5786d07fa5d44a9bc Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Tue, 3 Dec 2024 17:00:48 +0100 Subject: [PATCH 42/48] Option for co-tag networks to ignore certain tags --- processors/networks/cotag_network.py | 8 ++++++++ processors/networks/two-column-network.py | 15 
+++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 236e9577..139b2ac9 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset): "default": True, "help": "Convert tags to lowercase", "tooltip": "Merges tags with varying cases" + }, + "ignore-tags": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Tags to ignore", + "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' " + "character." } } @@ -72,6 +79,7 @@ def get_processor_pipeline(self): "split-comma": True, "categorise": True, "allow-loops": False, + "ignore-nodes": self.parameters.get("ignore-tags", ""), "to-lowercase": self.parameters.get("to-lowercase", True) } } diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 0f604570..43ceffdf 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor): "default": False, "help": "Convert values to lowercase", "tooltip": "Merges values with varying cases" + }, + "ignore-nodes": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Nodes to ignore", + "tooltip": "Separate with commas if you want to ignore multiple nodes" } } @@ -145,6 +151,7 @@ def process(self): allow_loops = self.parameters.get("allow-loops") interval_type = self.parameters.get("interval") to_lower = self.parameters.get("to-lowercase", False) + ignoreable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()] processed = 0 @@ -193,6 +200,14 @@ def process(self): values_a = [value.strip() for value_groups in values_a for value in value_groups.split(",")] values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")] + if ignoreable: + values_a = [v for v in values_a if v not in ignoreable] + values_b = [v for v in values_b if v not in ignoreable] + + # only proceed if we actually have any edges left + if not values_a or not values_b: + continue + try: interval = get_interval_descriptor(item, interval_type) except ValueError as e: From 5021e85302fe8cf16b783496052929cb30287820 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 4 Dec 2024 11:11:28 +0100 Subject: [PATCH 43/48] Cast to str before word tree-izing --- processors/visualisation/word-trees.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 0dfe2d40..0a1f235e 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -212,6 +212,12 @@ def process(self): if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) body = post.get(column) + + try: + body = str(body) + except TypeError: + continue + if not body: continue From 3f06845a0e2dc63e772a071e77748e116eef896d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Dec 2024 13:32:23 +0100 Subject: [PATCH 44/48] tokenizer group by sentence fix: nltk renamed lang packs --- processors/text-analysis/tokenise.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 17c350c8..1ee3b199 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -226,6 +226,7 @@ 
def process(self): The result is valid JSON, written in chunks. """ + sentence_error = False columns = self.parameters.get("columns") if not columns: self.dataset.update_status("No columns selected, aborting.", is_final=True) @@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if - 'pickle' in lang]: + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") sentence_method = dummy_function + sentence_error = True else: sentence_method = sent_tokenize else: @@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs): with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile: json.dump(metadata, outfile) + if sentence_error: + self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True) + # create zip of archive and delete temporary files and folder self.write_archive_and_finish(staging_area) From 1765e8066e74624cd4b89cf96737f90901655336 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 10 Dec 2024 12:00:47 +0100 Subject: [PATCH 45/48] download video: handle broken connection in video download; also stop and remove files that exceed max file size --- processors/visualisation/download_videos.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 2b385ffe..08a632b8 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -3,6 +3,7 @@ First attempt to download via request, but if that fails use yt-dlp """ +import os import json import re import time @@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT") # Size unknown elif not self.config.get("video-downloader.allow-unknown-size", False): - FilesizeException("Video size unknown; not allowed to download per 4CAT settings") + raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings") # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) - with open(results_path.joinpath(save_location), "wb") as f: - for chunk in response.iter_content(chunk_size=1024 * 1024): - if chunk: - f.write(chunk) + try: + with open(results_path.joinpath(save_location), "wb") as f: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): + # File size too large; stop download and remove file + os.remove(f.name) + raise FilesizeException("Video size larger than maximum allowed per 4CAT") + if chunk: + f.write(chunk) + except ChunkedEncodingError as e: + raise FailedDownload(f"Failed to download video: {e}") # Return filename to add to metadata return save_location.name From 8450304ab156fe27302412dcf4a112da0689074a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 10 Dec 2024 12:13:45 +0100 Subject: [PATCH 46/48] video_download: forgot import exception 
type --- processors/visualisation/download_videos.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 08a632b8..d1d7bd67 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -607,8 +607,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) - try: - with open(results_path.joinpath(save_location), "wb") as f: + with open(results_path.joinpath(save_location), "wb") as f: + try: for chunk in response.iter_content(chunk_size=1024 * 1024): if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): # File size too large; stop download and remove file @@ -616,8 +616,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie raise FilesizeException("Video size larger than maximum allowed per 4CAT") if chunk: f.write(chunk) - except ChunkedEncodingError as e: - raise FailedDownload(f"Failed to download video: {e}") + except requests.exceptions.ChunkedEncodingError as e: + raise FailedDownload(f"Failed to complete download: {e}") # Return filename to add to metadata return save_location.name From a296ff03c983103b902a830c585efd426e349ece Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 11 Dec 2024 12:09:24 +0100 Subject: [PATCH 47/48] export_datasets fix: only finish export dataset if primary dataset is not finished; children should be skipped instead --- processors/conversion/export_datasets.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index bd7b8128..fbda0e85 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -40,6 +40,11 @@ def process(self): This takes a CSV file as input and writes the same data as a JSON file """ self.dataset.update_status("Collecting dataset and all analyses") + primary_dataset = self.dataset.top_parent() + if not primary_dataset.is_finished(): + # This ought not happen as processors (i.e., this processor) should only be available for finished datasets + self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until dataset is finished to export.") + return results_path = self.dataset.get_staging_area() @@ -52,25 +57,26 @@ def process(self): try: dataset = DataSet(key=dataset_key, db=self.db) - # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
except DataSetException: - self.dataset.finish_with_error("Dataset not found.") - return + self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.") + failed_exports.append(dataset_key) + continue if not dataset.is_finished(): - self.dataset.finish_with_error("You cannot export unfinished datasets.") - return + self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.") + failed_exports.append(dataset_key) + continue # get metadata metadata = dataset.get_metadata() if metadata["num_rows"] == 0: - self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.") failed_exports.append(dataset_key) continue # get data data_file = dataset.get_results_path() if not data_file.exists(): - self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.") failed_exports.append(dataset_key) continue From a60ac61cae5d3b4dc5d9f8e97ba7f30a0d2b2af4 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 11 Dec 2024 12:30:34 +0100 Subject: [PATCH 48/48] export_dataset: note that filters must be exported separately in description --- processors/conversion/export_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index fbda0e85..f6c8bcc1 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod