diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index faaf6921a..000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-20.04 - tools: - python: "3.8" - -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/conf.py - -# Optionally build your docs in additional formats such as PDF -# formats: -# - pdf - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: docs/requirements.txt \ No newline at end of file diff --git a/backend/lib/processor.py b/backend/lib/processor.py index cada86171..52ad88cec 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -481,7 +481,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset, self.dataset.update_status("Parent dataset updated.") - def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True): + def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=[]): """ A generator that iterates through files in an archive @@ -498,6 +498,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T :param bool immediately_delete: Temporary files are removed after yielded; False keeps files until the staging_area is removed (usually during processor cleanup) + :param list filename_filter: Whitelist of filenames to iterate. + Other files will be ignored. If empty, do not ignore anything. :return: An iterator with a Path item for each file """ @@ -514,6 +516,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T archive_contents = sorted(archive_file.namelist()) for archived_file in archive_contents: + if filename_filter and archived_file not in filename_filter: + continue + info = archive_file.getinfo(archived_file) if info.is_dir(): continue diff --git a/backend/lib/search.py b/backend/lib/search.py index 15b3982d6..3258561e1 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -170,10 +170,22 @@ def import_from_file(self, path): if self.interrupted: raise WorkerInterruptedException() - # remove NUL bytes here because they trip up a lot of other - # things - # also include import metadata in item - item = json.loads(line.replace("\0", "")) + try: + # remove NUL bytes here because they trip up a lot of other + # things + # also include import metadata in item + item = json.loads(line.replace("\0", "")) + except json.JSONDecodeError: + warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may " + f"indicate that the file you uploaded was incomplete and you need to try uploading it " + f"again. 
The item will be ignored.") + + if warning not in import_warnings: + import_warnings[warning] = 0 + import_warnings[warning] += 1 + continue + + new_item = { **item["data"], "__import_meta": {k: v for k, v in item.items() if k != "data"} diff --git a/common/config_manager.py b/common/config_manager.py index 1b8d4052f..7760aae99 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None) if not is_json and value is not None: value = json.loads(value) - # TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale - elif default is not None: - value = default + # TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]: value = self.config_definition[setting_name]["default"] + elif value is None and default is not None: + value = default final_settings[setting_name] = value diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1ae6c06e5..4138ef4d0 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -529,11 +529,10 @@ "type": UserInput.OPTION_MULTI_SELECT, "help": "Pages in navigation", "options": { - "faq": "FAQ", "data-policy": "Data Policy", "citing": "How to cite", }, - "default": ["faq"], + "default": [], "tooltip": "These pages will be included in the navigation bar at the top of the interface." }, "ui.prefer_mapped_preview": { diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 148a2cd8d..1dcd3b27b 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,14 +1,16 @@ """ Miscellaneous helper functions for the 4CAT backend """ -import hashlib import subprocess +import imagehash +import hashlib import requests import hashlib import datetime import smtplib import fnmatch import socket +import shlex import copy import time import json @@ -24,6 +26,7 @@ from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version +from PIL import Image from common.lib.user_input import UserInput from common.config_manager import config @@ -111,10 +114,8 @@ def get_git_branch(): repository or git is not installed an empty string is returned. """ try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE) - os.chdir(cwd) + root_dir = str(config.get('PATH_ROOT').resolve()) + branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE) if branch.returncode != 0: raise ValueError() return branch.stdout.decode("utf-8").strip() @@ -144,7 +145,6 @@ def get_software_commit(worker=None): # try git command line within the 4CAT root folder # if it is a checked-out git repository, it will tell us the hash of # the currently checked-out commit - cwd = os.getcwd() # path has no Path.relative()... 
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent @@ -154,24 +154,24 @@ def get_software_commit(worker=None): # useful version info (since the extension is by definition not in the # main 4CAT repository) and will return an empty value if worker and worker.is_extension: - extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) - os.chdir(extension_dir) + working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve()) # check if we are in the extensions' own repo or 4CAT's - repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel" + repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): # not its own repository return ("", "") else: - os.chdir(config.get("PATH_ROOT")) + working_dir = str(config.get("PATH_ROOT").resolve()) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if show.returncode != 0: raise ValueError() commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] # now get the repository the commit belongs to, if we can - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() repository = origin.stdout.decode("utf-8").strip() @@ -181,9 +181,6 @@ def get_software_commit(worker=None): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: return ("", "") - finally: - os.chdir(cwd) - return (commit, repository) def get_software_version(): @@ -279,7 +276,6 @@ def find_extensions(): # collect metadata for extensions allowed_metadata_keys = ("name", "version", "url") - cwd = os.getcwd() for extension in extensions: extension_folder = extension_path.joinpath(extension) metadata_file = extension_folder.joinpath("metadata.json") @@ -296,8 +292,8 @@ def find_extensions(): if extensions[extension]["is_git"]: # try to get remote URL try: - os.chdir(extension_folder) - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + extension_root = str(extension_folder.resolve()) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() @@ -309,8 +305,6 @@ def find_extensions(): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: print(e) pass - finally: - os.chdir(cwd) return extensions, errors @@ -421,6 +415,37 @@ def andify(items): return ", ".join([str(item) for item in items]) + result +def hash_file(image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = 
hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + def get_yt_compatible_ids(yt_ids): """ :param yt_ids list, a list of strings diff --git a/common/lib/logger.py b/common/lib/logger.py index bbd30c444..ddffa2d72 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log' self.logger.setLevel(log_level) # this handler manages the text log files - handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) - handler.setLevel(log_level) - handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", - "%d-%m-%Y %H:%M:%S")) - self.logger.addHandler(handler) - - # the slack webhook has its own handler, and is only active if the - # webhook URL is set - try: - if config.get("logging.slack.webhook"): - slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) - slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) - self.logger.addHandler(slack_handler) - except Exception: - # we *may* need the logger before the database is in working order - if config.db is not None: - config.db.rollback() + if not self.logger.handlers: + handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) + handler.setLevel(log_level) + handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", + "%d-%m-%Y %H:%M:%S")) + self.logger.addHandler(handler) + + # the slack webhook has its own handler, and is only active if the + # webhook URL is set + try: + if config.get("logging.slack.webhook"): + slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) + slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) + self.logger.addHandler(slack_handler) + except Exception: + # we *may* need the logger before the database is in working order + if config.db is not None: + config.db.rollback() def log(self, message, level=logging.INFO, frame=None): """ diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index f7cb7590e..6bee534dd 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -50,16 +50,16 @@ def map_item(post): # from intercepted API response user_nickname = post["author"]["uniqueId"] user_fullname = post["author"]["nickname"] - user_id = post["author"]["id"] + user_thumbnail = post["author"].get("avatarThumb", "") elif post.get("author"): # from embedded JSON object user_nickname = post["author"] user_fullname = post["nickname"] - user_id = "" + user_thumbnail = "" else: user_nickname = "" user_fullname = "" - user_id = "" + user_thumbnail = "" # there are various thumbnail URLs, some of them expire later than # others. 
Try to get the highest-resolution one that hasn't expired @@ -84,13 +84,15 @@ def map_item(post): "author_followers": post.get("authorStats", {}).get("followerCount", ""), "author_likes": post.get("authorStats", {}).get("diggCount", ""), "author_videos": post.get("authorStats", {}).get("videoCount", ""), - "author_avatar": post.get("avatarThumb", ""), + "author_avatar": user_thumbnail, "body": post["desc"], "timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'), "unix_timestamp": int(post["createTime"]), "is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no", "is_ad": "yes" if post.get("isAd", False) else "no", "is_paid_partnership": "yes" if post.get("adAuthorization") else "no", + "is_sensitive": "yes" if post.get("maskType") == 3 else "no", + "is_photosensitive": "yes" if post.get("maskType") == 4 else "no", "music_name": post["music"]["title"], "music_id": post["music"]["id"], "music_url": post["music"].get("playUrl", ""), diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index efaffc21d..31471fcdc 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -58,7 +58,7 @@ def map_item(item): "post_url": item["share_info"]["url"].split(".html")[0], "post_body": item["share_info"]["title"], "comment_url": item["share_info"]["url"], - "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no", + "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no", "is_sticky": "yes" if bool(item["stick_position"]) else "no", "is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes", "language_guess": item["comment_language"] diff --git a/datasources/twitterv2/DESCRIPTION.md b/datasources/twitterv2/DESCRIPTION.md index 57f1f7a59..d138e6754 100644 --- a/datasources/twitterv2/DESCRIPTION.md +++ b/datasources/twitterv2/DESCRIPTION.md @@ -1,93 +1,88 @@ -Twitter data is gathered through the official [Twitter v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT -allows access to both the Standard and the Academic track. The Standard track is free for anyone to use, but only -allows to retrieve tweets up to seven days old. The Academic track allows a full-archive search of up to ten million -tweets per month (as of March 2022). For the Academic track, you need a valid Bearer token. You can request one -[here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you). +X/Twitter data is gathered through the official [X v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT can interface with X's Research API (sometimes +branded as the 'DSA API', referencing the EU's Digital Services Act). To retrieve posts via this API with 4CAT, you need +a valid Bearer token. Read more about this mode of access [here](https://developer.x.com/en/use-cases/do-research/academic-research). -Tweets are captured in batches at a speed of approximately 100,000 tweets per hour. 4CAT will warn you if your dataset +Posts are captured in batches at a speed of approximately 100,000 posts per hour. 4CAT will warn you if your dataset is expected to take more than 30 minutes to collect. It is often a good idea to start small (with very specific queries or narrow date ranges) and then only create a larger dataset if you are confident that it will be manageable and useful for your analysis. 
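To give a sense of what each of those batches involves, here is a minimal sketch of a single request to the full-archive search endpoint. The endpoint is the one used by `search_twitter.py` in this changeset; the query string, page size and pagination handling are illustrative assumptions rather than 4CAT's exact implementation.

```python
import requests

bearer_token = "YOUR_BEARER_TOKEN"  # hypothetical placeholder; 4CAT asks for this in the data source options

response = requests.get(
    "https://api.x.com/2/tweets/search/all",  # full-archive search endpoint
    params={
        "query": "4cat -is:retweet -is:nullcast",  # example query using operators described below
        "max_results": 100,  # posts per page; the API accepts up to 500
    },
    headers={"Authorization": f"Bearer {bearer_token}"},
)
page = response.json()
posts = page.get("data", [])
# the 'next_token' in the 'meta' object is passed back as the 'next_token'
# parameter to fetch the next batch, until the query or the post cap is exhausted
next_token = page.get("meta", {}).get("next_token")
```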
-If you hit your Twitter API quota while creating a dataset, the dataset will be finished with the tweets that have been +If you hit your X API quota while creating a dataset, the dataset will be finished with the posts that have been collected so far and a warning will be logged. ### Query syntax -Check the [API documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) +Check the [API documentation](https://developer.x.com/en/docs/x-api/tweets/search/integrate/build-a-query) for available query syntax and operators. This information is crucial to what data you collect. Important operators for -instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted tweets and retweets. Query syntax -is roughly the same as for Twitter's search interface, so you can try out most queries by entering them in the Twitter -app or website's search field and looking at the results. You can also test queries with -Twitter's [Query Builder](https://developer.twitter.com/apitools/query?query=). +instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted posts and reposts. Query syntax +is roughly the same as for X's search interface, so you can try out most queries by entering them in the X app or +website's search field and looking at the results. You can also test queries with +X's [Query Builder](https://developer.twitter.com/apitools/query?query=). ### Date ranges -By default, Twitter returns tweets posted within the past 30 days. If you want to go back further, you need to -explicitly set a date range. Note that Twitter does not like date ranges that end in the future, or start before -Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. +By default, X returns posts posted within the past 30 days. If you want to go back further, you need to +explicitly set a date range. Note that X does not like date ranges that end in the future, or start before +Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. Also note +that API access may come with certain limitations on how far a query may extend into history. ### Geo parameters -Twitter offers a number of ways -to [query by location/geo data](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location) -such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. This feature is only available for the Academic level; -you will receive a 400 error if using queries filtering by geographic information. +X offers a number of ways +to [query by location/geo data](https://developer.x.com/en/docs/tutorials/filtering-tweets-by-location) +such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. ### Retweets -A retweet from Twitter API v2 contains at maximum 140 characters from the original tweet. 4CAT therefore -gathers both the retweet and the original tweet and reformats the retweet text so it resembles a user's experience. +A repost from X API v2 contains at maximum 140 characters from the original post. 4CAT therefore +gathers both the repost and the original post and reformats the repost text so it resembles a user's experience. This also affects mentions, hashtags, and other data as only those contained in the first 140 characters are provided -by Twitter API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added -to the retweet for 4CAT analysis methods. 
*4CAT stores the data from Twitter API v2 as similar as possible to the format +by X API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added +to the repost for 4CAT analysis methods. *4CAT stores the data from X API v2 as similar as possible to the format in which it was received which you can obtain by downloading the ndjson file.* *Example 1* -[This retweet](https://twitter.com/tonino1630/status/1554618034299568128) returns the following data: +[This repost](https://x.com/tonino1630/status/1554618034299568128) returns the following data: - *author:* `tonino1630` -- * - text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` +- *text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` - *mentions:* `ChuckyFrao` - *hashtags:*
-While the original tweet will return (as a reference tweet) this data: +While the original post will return (as a reference post) this data: - *author:* `ChuckyFrao` -- * - text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` +- *text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` - *mentions:* `POTUS, usembassyve, StateSPEHA, StateDept, SecBlinken` - *hashtags:* `FreeAlexSaab, BringAlexHome, IntegridadTerritorial`
-As you can see, only the author of the original tweet is listed as a mention in the retweet. +As you can see, only the author of the original post is listed as a mention in the repost. *Example 2* -[This retweet](https://twitter.com/Macsmart31/status/1554618041459445760) returns the following: +[This repost](https://x.com/Macsmart31/status/1554618041459445760) returns the following: - *author:* `Macsmart31` -- * - text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` +- *text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` - *mentions:* `mickyd123us, tribelaw, HonorDecency`
-Compared with the original tweet referenced below: +Compared with the original post referenced below: - *author:* `mickyd123us` -- * - text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` +- *text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` - *mentions:* `tribelaw, HonorDecency`
-Because the mentioned users are in the first 140 characters of the original tweet, they are also listed as mentions in the retweet. - -The key difference here is that example one the retweet contains none of the hashtags or mentions from the original -tweet (they are beyond the first 140 characters) while the second retweet example does return mentions from the original -tweet. *Due to this discrepancy, for retweets all mentions and hashtags of the original tweet are considered as mentions -and hashtags of the retweet.* A user on Twitter will see all mentions and hashtags when viewing a retweet and the -retweet would be a part of any network around those mentions and hashtags. +Because the mentioned users are in the first 140 characters of the original post, they are also listed as mentions in +the repost. + +The key difference here is that in example one the repost contains none of the hashtags or mentions from the original +post (they are beyond the first 140 characters) while the second repost example does return mentions from the original +post. *Due to this discrepancy, for reposts all mentions and hashtags of the original post are considered as mentions +and hashtags of the repost.* A user on X will see all mentions and hashtags when viewing a repost and the +repost would be a part of any network around those mentions and hashtags. diff --git a/datasources/twitterv2/__init__.py b/datasources/twitterv2/__init__.py index 3335bc7c0..6aa80c7b3 100644 --- a/datasources/twitterv2/__init__.py +++ b/datasources/twitterv2/__init__.py @@ -9,4 +9,4 @@ # Internal identifier for this data source DATASOURCE = "twitterv2" -NAME = "Twitter API (v2) Search" \ No newline at end of file +NAME = "X/Twitter API (v2) Search" \ No newline at end of file diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 999680b6e..8b91d1eb2 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -1,5 +1,5 @@ """ -Twitter keyword search via the Twitter API v2 +X/Twitter keyword search via the X API v2 """ import requests import datetime @@ -17,13 +17,10 @@ class SearchWithTwitterAPIv2(Search): """ - Get Tweets via the Twitter API - - This only allows for historical search - use f.ex. TCAT for more advanced - queries. + Get Tweets via the X API """ type = "twitterv2-search" # job ID - title = "Twitter API (v2)" + title = "X/Twitter API (v2)" extension = "ndjson" is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -32,15 +29,15 @@ class SearchWithTwitterAPIv2(Search): import_issues = True references = [ - "[Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api)" + "[X/Twitter API documentation](https://developer.x.com/en/docs/x-api)" ] config = { "twitterv2-search.academic_api_key": { "type": UserInput.OPTION_TEXT, "default": "", - "help": "Academic API Key", - "tooltip": "An API key for the Twitter v2 Academic API. If " + "help": "Research API Key", + "tooltip": "An API key for the X/Twitter v2 Research API. If " "provided, the user will not need to enter their own " "key to retrieve tweets. Note that this API key should " "have access to the Full Archive Search endpoint." @@ -50,15 +47,15 @@ class SearchWithTwitterAPIv2(Search): "default": 0, "min": 0, "max": 10_000_000, - "help": "Max tweets per dataset", + "help": "Max posts per dataset", "tooltip": "4CAT will never retrieve more than this amount of " - "tweets per dataset. 
Enter '0' for unlimited tweets." + "posts per dataset. Enter '0' for unlimited posts." }, "twitterv2-search.id_lookup": { "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Allow lookup by ID", - "tooltip": "If enabled, allow users to enter a list of tweet IDs " + "tooltip": "If enabled, allow users to enter a list of post IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." } @@ -110,7 +107,7 @@ def get_items(self, query): } if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"): - endpoint = "https://api.twitter.com/2/tweets" + endpoint = "https://api.x.com/2/tweets" tweet_ids = self.parameters.get("query", []).split(',') @@ -126,7 +123,7 @@ def get_items(self, query): else: # Query to all or search - endpoint = "https://api.twitter.com/2/tweets/search/" + api_type + endpoint = "https://api.x.com/2/tweets/search/" + api_type queries = [self.parameters.get("query", "")] @@ -158,7 +155,7 @@ def get_items(self, query): while True: if self.interrupted: - raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API") + raise ProcessorInterruptedException("Interrupted while getting posts from the Twitter API") # there is a limit of one request per second, so stay on the safe side of this while self.previous_request == int(time.time()): @@ -188,18 +185,18 @@ def get_items(self, query): try: structured_response = api_response.json() if structured_response.get("title") == "UsageCapExceeded": - self.dataset.update_status("Hit the monthly tweet cap. You cannot capture more tweets " - "until your API quota resets. Dataset completed with tweets " + self.dataset.update_status("Hit the monthly post cap. You cannot capture more posts " + "until your API quota resets. Dataset completed with posts " "collected so far.", is_final=True) return except (json.JSONDecodeError, ValueError): - self.dataset.update_status("Hit Twitter rate limit, but could not figure out why. Halting " - "tweet collection.", is_final=True) + self.dataset.update_status("Hit X's rate limit, but could not figure out why. Halting " + "post collection.", is_final=True) return resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str) + self.dataset.update_status("Hit X's rate limit - waiting until %s to continue." % resume_at_str) while time.time() <= resume_at: if self.interrupted: raise ProcessorInterruptedException("Interrupted while waiting for rate limit to reset") @@ -211,10 +208,10 @@ def get_items(self, query): elif api_response.status_code == 403: try: structured_response = api_response.json() - self.dataset.update_status("'Forbidden' error from the Twitter API. Could not connect to Twitter API " + self.dataset.update_status("'Forbidden' error from the X API. Could not connect to X API " "with this API key. %s" % structured_response.get("detail", ""), is_final=True) except (json.JSONDecodeError, ValueError): - self.dataset.update_status("'Forbidden' error from the Twitter API. Your key may not have access to " + self.dataset.update_status("'Forbidden' error from the X API. 
Your key may not have access to " "the full-archive search endpoint.", is_final=True) finally: return @@ -224,7 +221,7 @@ def get_items(self, query): elif api_response.status_code in (502, 503, 504): resume_at = time.time() + 60 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % ( + self.dataset.update_status("X unavailable (status %i) - waiting until %s to continue." % ( api_response.status_code, resume_at_str)) while time.time() <= resume_at: time.sleep(0.5) @@ -233,7 +230,7 @@ def get_items(self, query): # this usually means the query is too long or otherwise contains # a syntax error elif api_response.status_code == 400: - msg = "Response %i from the Twitter API; " % api_response.status_code + msg = "Response %i from the X API; " % api_response.status_code try: api_response = api_response.json() msg += api_response.get("title", "") @@ -247,19 +244,19 @@ def get_items(self, query): # invalid API key elif api_response.status_code == 401: - self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True) + self.dataset.update_status("Invalid API key - could not connect to X API", is_final=True) return # haven't seen one yet, but they probably exist elif api_response.status_code != 200: self.dataset.update_status( "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True) - self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % ( + self.log.warning("X API v2 responded with status code %i. Response body: %s" % ( api_response.status_code, api_response.text)) return elif not api_response: - self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True) + self.dataset.update_status("Could not connect to X. Cancelling.", is_final=True) return api_response = api_response.json() @@ -291,13 +288,13 @@ def get_items(self, query): if num_missing_objects > 50: # Large amount of missing objects; possible error with Twitter API self.import_issues = False - error_report.append('%i missing objects received following tweet number %i. Possible issue with Twitter API.' % (num_missing_objects, tweets)) + error_report.append('%i missing objects received following post number %i. Possible issue with X API.' 
% (num_missing_objects, tweets)) error_report.append('Missing objects collected: ' + ', '.join(['%s: %s' % (k, len(v)) for k, v in missing_objects.items()])) # Warn if new missing object is recorded (for developers to handle) expected_error_types = ['user', 'media', 'poll', 'tweet', 'place'] if any(key not in expected_error_types for key in missing_objects.keys()): - self.log.warning("Twitter API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) + self.log.warning("X API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) # Loop through and collect tweets for tweet in api_response.get("data", []): @@ -312,7 +309,7 @@ def get_items(self, query): tweets += 1 if tweets % 500 == 0: - self.dataset.update_status("Received %s of ~%s tweets from the Twitter API" % ("{:,}".format(tweets), expected_tweets)) + self.dataset.update_status("Received %s of ~%s tweets from the X API" % ("{:,}".format(tweets), expected_tweets)) if num_expected_tweets is not None: self.dataset.update_progress(tweets / num_expected_tweets) @@ -474,21 +471,19 @@ def get_options(cls, parent_dataset=None, user=None): max_tweets = config.get("twitterv2-search.max_tweets", user=user) if have_api_key: - intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve " + intro_text = ("This data source uses the full-archive search endpoint of the X API (v2) to retrieve " "historic tweets that match a given query.") else: - intro_text = ("This data source uses either the Standard 7-day historical Search endpoint or the " - "full-archive search endpoint of the Twitter API, v2. To use the latter, you must have " - "access to the Academic Research track of the Twitter API. In either case, you will need to " - "provide a valid [bearer " - "token](https://developer.twitter.com/en/docs/authentication/oauth-2-0). The bearer token " - "**will be sent to the 4CAT server**, where it will be deleted after data collection has " - "started. Note that any tweets retrieved with 4CAT will count towards your monthly Tweet " - "retrieval cap.") - - intro_text += ("\n\nPlease refer to the [Twitter API documentation](" - "https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " + intro_text = ("This data source uses the full-archive search endpoint of the X/Twitter API, v2. To use the " + "it, you must have access to the Research track of the X API. You will need to provide a " + "valid [bearer token](https://developer.x.com/en/docs/authentication/oauth-2-0). The " + "bearer token **will be sent to the 4CAT server**, where it will be deleted after data " + "collection has started. Note that any posts retrieved with 4CAT will count towards your " + "monthly post retrieval cap.") + + intro_text += ("\n\nPlease refer to the [X API documentation](" + "https://developer.x.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " "documentation for more information about this API endpoint and the syntax you can use in your " "search query. 
Retweets are included by default; add `-is:retweet` to exclude them.") @@ -500,16 +495,18 @@ def get_options(cls, parent_dataset=None, user=None): } if not have_api_key: + # options.update({ + # "api_type": { + # "type": UserInput.OPTION_CHOICE, + # "help": "API track", + # "options": { + # "all": "Research API: Full-archive search", + # "recent": "Standard: Recent search (Tweets published in last 7 days)", + # }, + # "default": "all" + # } + # }) options.update({ - "api_type": { - "type": UserInput.OPTION_CHOICE, - "help": "API track", - "options": { - "all": "Academic: Full-archive search", - "recent": "Standard: Recent search (Tweets published in last 7 days)", - }, - "default": "all" - }, "api_bearer_token": { "type": UserInput.OPTION_TEXT, "sensitive": True, @@ -523,10 +520,10 @@ def get_options(cls, parent_dataset=None, user=None): "query_type": { "type": UserInput.OPTION_CHOICE, "help": "Query type", - "tooltip": "Note: Num of Tweets and Date fields ignored with 'Tweets by ID' lookup", + "tooltip": "Note: Num of posts and date fields are ignored with 'Posts by ID' lookup", "options": { "query": "Search query", - "id_lookup": "Tweets by ID (list IDs seperated by commas or one per line)", + "id_lookup": "Posts by ID (list IDs seperated by commas or one per line)", }, "default": "query" } @@ -539,7 +536,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "amount": { "type": UserInput.OPTION_TEXT, - "help": "Tweets to retrieve", + "help": "Posts to retrieve", "tooltip": "0 = unlimited (be careful!)" if not max_tweets else ("0 = maximum (%s)" % str(max_tweets)), "min": 0, "max": max_tweets if max_tweets else 10_000_000, @@ -550,7 +547,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "daterange-info": { "type": UserInput.OPTION_INFO, - "help": "By default, Twitter returns tweets up til 30 days ago. If you want to go back further, you " + "help": "By default, X returns posts up til 30 days ago. If you want to go back further, you " "need to explicitly set a date range." }, "daterange": { @@ -591,7 +588,7 @@ def validate_query(query, request, user): raise QueryParametersException("Please provide a valid bearer token.") if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup": - raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.") + raise QueryParametersException("X API queries cannot be longer than 1024 characters.") if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user): # reformat queries to be a comma-separated list with no wrapping @@ -630,7 +627,7 @@ def validate_query(query, request, user): # to dissuade users from running huge queries that will take forever # to process if params["query_type"] == "query" and (params.get("api_type") == "all" or have_api_key): - count_url = "https://api.twitter.com/2/tweets/counts/all" + count_url = "https://api.x.com/2/tweets/counts/all" count_params = { "granularity": "day", "query": params["query"], @@ -668,7 +665,7 @@ def validate_query(query, request, user): elif response.status_code == 401: raise QueryParametersException("Your bearer token seems to be invalid. Please make sure it is valid " - "for the Academic Track of the Twitter API.") + "for the Research track of the X API.") elif response.status_code == 400: raise QueryParametersException("Your query is invalid. 
Please make sure the date range does not " @@ -791,7 +788,7 @@ def map_item(item): "thread_id": item.get("conversation_id", item["id"]), "timestamp": tweet_time.strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(tweet_time.timestamp()), - 'link': "https://twitter.com/%s/status/%s" % (author_username, item.get('id')), + 'link': "https://x.com/%s/status/%s" % (author_username, item.get('id')), "subject": "", "body": item["text"], "author": author_username, diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index bd7b81289..f6c8bcc11 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod @@ -40,6 +40,11 @@ def process(self): This takes a CSV file as input and writes the same data as a JSON file """ self.dataset.update_status("Collecting dataset and all analyses") + primary_dataset = self.dataset.top_parent() + if not primary_dataset.is_finished(): + # This ought not happen as processors (i.e., this processor) should only be available for finished datasets + self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until dataset is finished to export.") + return results_path = self.dataset.get_staging_area() @@ -52,25 +57,26 @@ def process(self): try: dataset = DataSet(key=dataset_key, db=self.db) - # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
except DataSetException: - self.dataset.finish_with_error("Dataset not found.") - return + self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.") + failed_exports.append(dataset_key) + continue if not dataset.is_finished(): - self.dataset.finish_with_error("You cannot export unfinished datasets.") - return + self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.") + failed_exports.append(dataset_key) + continue # get metadata metadata = dataset.get_metadata() if metadata["num_rows"] == 0: - self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.") failed_exports.append(dataset_key) continue # get data data_file = dataset.get_results_path() if not data_file.exists(): - self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.") failed_exports.append(dataset_key) continue diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 860c0ddbe..461cdd54a 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None): return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) @staticmethod - def get_dataset_from_url(url, db): + def get_dataset_from_url(url, db, modules=None): """ Get dataset object based on dataset URL @@ -68,6 +68,7 @@ def get_dataset_from_url(url, db): :param str url: Dataset URL :param db: Database handler (to retrieve metadata) + :param modules: Modules handler (pass through to DataSet) :return DataSet: The dataset """ if not url: @@ -75,7 +76,7 @@ def get_dataset_from_url(url, db): source_url = ural.normalize_url(url) source_key = source_url.split("/")[-1] - return DataSet(key=source_key, db=db) + return DataSet(key=source_key, db=db, modules=modules) def process(self): """ @@ -96,7 +97,7 @@ def process(self): continue try: - source_dataset = self.get_dataset_from_url(source_dataset_url, self.db) + source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules) except DataSetException: return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform " f"merge.") diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py new file mode 100644 index 000000000..a8dd8763e --- /dev/null +++ b/processors/filtering/unique_images.py @@ -0,0 +1,113 @@ +""" +Filter by unique images +""" +import shutil +import json + +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.helpers import UserInput, hash_file + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + + +class UniqueImageFilter(BasicProcessor): + """ + Retain only unique images, by a user-defined metric + """ + type = "image-downloader-unique" # job type ID + category = "Visualisation" # category + title = "Filter for unique images" # title displayed in UI + description = "Only keeps one instance per image, using a choice of detection method." 
# description displayed in UI + extension = "zip" + + references = [ + "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", + "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)", + + ] + + options = { + "hash-type": { + "type": UserInput.OPTION_CHOICE, + "help": "Comparison method", + "default": "file-hash", + "options": { + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + } + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor on image archives + + :param module: Module to determine compatibility with + """ + return module.get_media_type() == "image" or module.type.startswith( + "image-downloader") or module.type == "video-frames" + + def process(self): + """ + Loop through images and only retain ones that have not been seen yet + + :return: + """ + seen_hashes = set() + hash_map = {} + metadata = None + dupes = 0 + processed = 0 + staging_area = self.dataset.get_staging_area() + + self.dataset.update_status("Processing images and looking for duplicates") + for image_file in self.iterate_archive_contents(self.source_file): + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while filtering for unique images") + + self.dataset.update_progress(processed / self.source_dataset.num_rows) + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + f"found {dupes:,} duplicate(s)") + processed += 1 + + if image_file.name == ".metadata.json": + with image_file.open() as infile: + metadata = json.load(infile) + continue + + image_hash = hash_file(image_file, self.parameters.get("hash-type")) + + if image_hash not in seen_hashes: + seen_hashes.add(image_hash) + shutil.copy2(image_file, staging_area) + hash_map[image_hash] = image_file.name + else: + self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping") + dupes += 1 + + new_metadata = {} + inverse_hashmap = {v: k for k, v in hash_map.items()} + for url, item in metadata.items(): + if item["filename"] in inverse_hashmap: + new_metadata[inverse_hashmap[item["filename"]]] = { + **item, + "hash": inverse_hashmap[item["filename"]], + "hash_type": self.parameters.get("hash-type") + } + + with staging_area.joinpath(".metadata.json").open("w") as outfile: + json.dump(new_metadata, outfile) + + self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True) + self.write_archive_and_finish(staging_area, len(hash_map), finish=True) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 022e96de5..26234a186 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -184,8 +184,8 @@ def process(self): # prepare data for annotation data_path = staging_area.joinpath("data.temp.ndjson") with 
data_path.open("w", newline="") as outfile: - for item in self.source_dataset.iterate_items(): - outfile.write(json.dumps({item.get("id"): item.get(textfield)}) + "\n") + for i, item in enumerate(self.source_dataset.iterate_items()): + outfile.write(json.dumps({item.get("id", str(i)): item.get(textfield)}) + "\n") path_to_files, path_to_results = dmi_service_manager.process_files(staging_area, [data_path.name, labels_path.name], @@ -238,15 +238,14 @@ def make_filename(id, prompt): self.dataset.update_status("Loading annotated data") with output_dir.joinpath("results.json").open() as infile: annotations = json.load(infile) - self.dataset.update_status("Writing results") with self.dataset.get_results_path().open("w") as outfile: writer = None - for item in self.source_dataset.iterate_items(): + for i, item in enumerate(self.source_dataset.iterate_items()): row = { - "id": item.get("id"), + "id": item.get("id", i), textfield: item.get(textfield), - "category": annotations[item.get("id")] + "category": annotations.get(item.get("id", str(i))) # str(i) because it is not recorded as an int in the annotations } if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 236e9577f..139b2ac93 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset): "default": True, "help": "Convert tags to lowercase", "tooltip": "Merges tags with varying cases" + }, + "ignore-tags": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Tags to ignore", + "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' " + "character." } } @@ -72,6 +79,7 @@ def get_processor_pipeline(self): "split-comma": True, "categorise": True, "allow-loops": False, + "ignore-nodes": self.parameters.get("ignore-tags", ""), "to-lowercase": self.parameters.get("to-lowercase", True) } } diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py new file mode 100644 index 000000000..4267c9650 --- /dev/null +++ b/processors/networks/image-network.py @@ -0,0 +1,217 @@ +""" +Make a bipartite Image-Item network +""" +import json + +from backend.lib.processor import BasicProcessor +from common.lib.helpers import hash_file + +import networkx as nx + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.user_input import UserInput + + +class ImageGrapher(BasicProcessor): + """ + Image network + + Creates a bipartite network of images and some attribute of the dataset the + images were sourced from + """ + type = "image-bipartite-network" # job type ID + category = "Networks" + title = "Bipartite image-item network" # title displayed in UI + description = ("Create a GEXF network file with a bipartite network of " + "images and some data field (e.g. author) of the dataset " + "the images were sourced from. 
Suitable for use with Gephi's " + "'Image Preview' plugin.") + extension = "gexf" # extension of result file, used internally and in UI + + options = {} + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + root_dataset = None + columns = None + if parent_dataset: + for parent in reversed(parent_dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + columns = root_dataset.get_columns() + + return { + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_TEXT, + "default": "id" + }, + "image-value": { + "help": "Image node label", + "type": UserInput.OPTION_CHOICE, + "options": { + "filename": "Image file name", + "url": "Image URL" + }, + "tooltip": "The image node label will have this value. Depending on the network visualisation software " + "you use, one or the other is required to display the images as nodes." + }, + "deduplicate": { + "type": UserInput.OPTION_CHOICE, + "help": "Merge images", + "tooltip": "Similar images can be merged into a single node, represented by the first image of the set " + "that was encountered.", + "options": { + "none": "Do not merge", + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + }, + **({ + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_CHOICE, + "options": { + column: column + for column in columns} + } + } if columns else {}) + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on images downloaded from a dataset + + :param module: Module to determine compatibility with + """ + return module.type.startswith("image-downloader") + + def process(self): + column = self.parameters.get("column") + hash_type = self.parameters.get("deduplicate") + filename_filter = [".metadata.json"] if hash_type == "none" else [] + metadata = None + hashed = 0 + + # some maps to make sure we use the right value in the right place + # url or filename, original image or duplicate, etc + file_hash_map = {} + hash_file_map = {} + seen_hashes = set() + id_file_map = {} + + for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter): + if file.name == ".metadata.json": + with file.open() as infile: + try: + metadata = json.load(infile) + file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()} + except json.JSONDecodeError: + pass + else: + try: + hashed += 1 + if hashed % 100 == 0: + self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)") + self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5) + file_hash = hash_file(file, hash_type) + file_hash_map[file.name] = file_hash + if file_hash not in hash_file_map: + hash_file_map[file_hash] = file.name + + except (FileNotFoundError, ValueError) as e: + continue + + if not metadata: + return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + file_url_map = {v["filename"]: u for u, v in metadata.items()} + for url, 
details in metadata.items(): + for item_id in details.get("post_ids", []): + if self.source_dataset.type.endswith("-telegram"): + # telegram has weird IDs + item_id = "-".join(details["filename"].split("-")[:-1]) + "-" + str(item_id) + id_file_map[item_id] = details["filename"] + + root_dataset = None + for parent in reversed(self.dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + + if not root_dataset: + return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + network = nx.DiGraph() + processed = 0 + for item in root_dataset.iterate_items(): + progress = processed / root_dataset.num_rows + if hashed: + # if hashing was necessary, we approximate that as 50% of the work + progress = (progress * 0.5) + 0.5 + + self.dataset.update_progress(progress) + processed += 1 + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") + + if self.interrupted: + raise ProcessorInterruptedException() + + if item.get("id") not in id_file_map: + continue + + # from nodes are the dataset fields (e.g. 'body' or 'chat') + # to node names are filenames (optionally mapped to URLs later) + from_node = item.get(column) + from_node_id = f"{column}-{from_node}" + + image_file = id_file_map[item.get("id")] + image_hash = file_hash_map.get(image_file) + if hash_type != "none" and image_hash in seen_hashes: + # if we're deduplicating and the image is already in the graph, + # merge the nodes (use the original node as the 'to node') + to_node = hash_file_map.get(image_hash) + if to_node and image_file != to_node: + self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " + f"merging.") + + else: + seen_hashes.add(image_hash) + to_node = image_file + + if not to_node: + # image could not be hashed, probably invalid file + continue + + if self.parameters.get("image-value") == "url": + to_node = file_url_map[to_node] + + to_node_id = f"image-{to_node}" + if from_node_id not in network.nodes: + network.add_node(from_node_id, label=from_node, category=column) + + if to_node_id not in network.nodes: + network.add_node(to_node_id, label=to_node, category="image", image=to_node) + + edge = (from_node_id, to_node_id) + if edge not in network.edges(): + network.add_edge(*edge, frequency=0) + + network.edges[edge]["frequency"] += 1 + + self.dataset.update_status("Writing network file") + nx.write_gexf(network, self.dataset.get_results_path()) + self.dataset.finish(len(network.nodes)) diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 0f6045702..43ceffdf4 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor): "default": False, "help": "Convert values to lowercase", "tooltip": "Merges values with varying cases" + }, + "ignore-nodes": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Nodes to ignore", + "tooltip": "Separate with commas if you want to ignore multiple nodes" } } @@ -145,6 +151,7 @@ def process(self): allow_loops = self.parameters.get("allow-loops") interval_type = self.parameters.get("interval") to_lower = self.parameters.get("to-lowercase", False) + ignoreable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()] processed = 0 @@ -193,6 +200,14 @@ def process(self): values_a = 
[value.strip() for value_groups in values_a for value in value_groups.split(",")] values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")] + if ignoreable: + values_a = [v for v in values_a if v not in ignoreable] + values_b = [v for v in values_b if v not in ignoreable] + + # only proceed if we actually have any edges left + if not values_a or not values_b: + continue + try: interval = get_interval_descriptor(item, interval_type) except ValueError as e: diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 17c350c86..1ee3b1990 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -226,6 +226,7 @@ def process(self): The result is valid JSON, written in chunks. """ + sentence_error = False columns = self.parameters.get("columns") if not columns: self.dataset.update_status("No columns selected, aborting.", is_final=True) @@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if - 'pickle' in lang]: + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") sentence_method = dummy_function + sentence_error = True else: sentence_method = sent_tokenize else: @@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs): with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile: json.dump(metadata, outfile) + if sentence_error: + self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True) + # create zip of archive and delete temporary files and folder self.write_archive_and_finish(staging_area) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199b..6394862e8 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -7,13 +7,14 @@ from pathlib import Path +import telethon.errors from telethon import TelegramClient -from telethon.errors import TimedOutError +from telethon.errors import TimedOutError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet from processors.visualisation.download_images import ImageDownloader @@ -194,6 +195,13 @@ async def get_images(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while downloading images") + if not message: + # message no longer exists + self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " + f"may have been deleted)") + self.flawless = False + break + success = False try: # it's actually unclear if images are always jpegs, but this @@ -216,13 +224,27 @@ async def get_images(self): self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False - media_done += 1 - self.metadata[filename] = { - "filename": filename, - "success": success, - 
"from_dataset": self.source_dataset.key, - "post_ids": [msg_id] - } + finally: + media_done += 1 + self.metadata[filename] = { + "filename": filename, + "success": success, + "from_dataset": self.source_dataset.key, + "post_ids": [msg_id] + } + + except BadRequestError as e: + self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") + self.flawless = False + + except telethon.errors.FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d44231..aa05173ce 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,12 +8,13 @@ from pathlib import Path from telethon import TelegramClient +from telethon.errors import FloodError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from processors.visualisation.download_videos import VideoDownloaderPlus -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet __author__ = "Stijn Peeters" @@ -197,7 +198,7 @@ async def get_videos(self): msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download video for message {msg_id} ({e})") @@ -210,6 +211,15 @@ async def get_videos(self): "from_dataset": self.source_dataset.key, "post_ids": [msg_id] } + + except FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. 
Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index c02b53bf7..3854e9653 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor): "options": { "thumbnail": "Video Thumbnail", "music": "Music Thumbnail", + "author_avatar": "User avatar" }, "default": "thumbnail" } @@ -217,6 +218,8 @@ def process(self): url_column = "thumbnail_url" elif self.parameters.get("thumb_type") == "music": url_column = "music_thumbnail" + elif self.parameters.get("thumb_type") == "author_avatar": + url_column = "author_avatar" else: self.dataset.update_status("No image column selected.", is_final=True) self.dataset.finish(0) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 2b385ffe7..d1d7bd67c 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -3,6 +3,7 @@ First attempt to download via request, but if that fails use yt-dlp """ +import os import json import re import time @@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT") # Size unknown elif not self.config.get("video-downloader.allow-unknown-size", False): - FilesizeException("Video size unknown; not allowed to download per 4CAT settings") + raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings") # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) with open(results_path.joinpath(save_location), "wb") as f: - for chunk in response.iter_content(chunk_size=1024 * 1024): - if chunk: - f.write(chunk) + try: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): + # File size too large; stop download and remove file + os.remove(f.name) + raise FilesizeException("Video size larger than maximum allowed per 4CAT") + if chunk: + f.write(chunk) + except requests.exceptions.ChunkedEncodingError as e: + raise FailedDownload(f"Failed to complete download: {e}") # Return filename to add to metadata return save_location.name diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 64b0c4f34..ec95f84f9 100644 --- a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -94,7 +94,7 @@ def process(self): processed_videos = 0 self.dataset.update_status("Extracting video frames") - for path in self.iterate_archive_contents(self.source_file, staging_area): + for i, path in enumerate(self.iterate_archive_contents(self.source_file, staging_area)): if self.interrupted: raise ProcessorInterruptedException("Interrupted while determining image wall order") @@ -138,17 +138,21 @@ def process(self): outfile.write(ffmpeg_error) if result.returncode != 0: - error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode)) - self.dataset.log(error) + self.dataset.update_status(f"Unable to extract frames from video {vid_name} (see logs for details)") + self.dataset.log('Error Return 
Code (%s) with video %s: %s' % (str(result.returncode), vid_name, "\n".join(ffmpeg_error.split('\n')[-2:]) if ffmpeg_error else '')) + else: + processed_videos += 1 + self.dataset.update_status("Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - processed_videos += 1 - self.dataset.update_status( - "Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - self.dataset.update_progress(processed_videos / total_possible_videos) + self.dataset.update_progress(i / total_possible_videos) # Finish up # We've created a directory and folder structure here as opposed to a single folder with single files as # expected by self.write_archive_and_finish() so we use make_archive instead + if not processed_videos: + self.dataset.finish_with_error("Unable to extract frames from any videos") + return + from shutil import make_archive make_archive(self.dataset.get_results_path().with_suffix(''), "zip", output_directory) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ff1222bc1..aad1baf69 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -183,8 +183,9 @@ def process(self): self.dataset.log('Frames per seconds: %f' % frame_interval) # Prepare staging area for videos and video tracking + # VideoHash creates various files that may not be cleaned up on error so we use an output directory staging_area = self.dataset.get_staging_area() - self.dataset.log('Staging directory location: %s' % staging_area) + output_dir = self.dataset.get_staging_area() video_hashes = {} video_metadata = None @@ -215,16 +216,17 @@ def process(self): self.dataset.update_status("FFmpeg software not found. Please contact 4CAT maintainers.", is_final=True) self.dataset.finish(0) return - except FileNotFoundError as e: - self.dataset.update_status(f"Unable to find file {str(path)}") + except FileNotFoundError: + self.dataset.update_status(f"Unable to find file {path.name}") continue except FFmpegFailedToExtractFrames as e: - self.dataset.update_status(f"Unable to extract frame for {str(path)}: {e}") + self.dataset.update_status(f"Unable to extract frame for {path.name} (see log for details)") + self.dataset.log(f"Unable to extract frame for {str(path)}: {e}") continue video_hashes[path.name] = {'videohash': videohash} - shutil.copy(videohash.collage_path, staging_area.joinpath(path.stem + '.jpg')) + shutil.copy(videohash.collage_path, output_dir.joinpath(path.stem + '.jpg')) video_hashes[path.name]['video_collage_filename'] = path.stem + '.jpg' processed_videos += 1 @@ -233,6 +235,10 @@ def process(self): self.dataset.update_progress(processed_videos / total_possible_videos) videohash.delete_storage_path() + if processed_videos == 0: + self.dataset.finish_with_error("Unable to create video hashes for any videos") + return + # Write hash file # This file is held here and then copied as its own dataset via VideoHasherTwo num_posts = 0 @@ -240,7 +246,7 @@ def process(self): if video_metadata is None: # Grab the metadata directly, if it exists but was skipped (e.g., not found prior to max_videos) try: - metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area) + metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, output_dir) except FileNotFoundError: metadata_path = None if metadata_path: @@ -293,7 +299,7 @@ def process(self): num_posts += 1 writer = None - with 
staging_area.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: + with output_dir.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: for row in rows: if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) @@ -303,7 +309,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(staging_area) + self.write_archive_and_finish(output_dir, num_items=processed_videos) class VideoHashNetwork(BasicProcessor): """ diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index 634e8c49d..5140baa01 100644 --- a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -252,8 +252,9 @@ def process(self): if video_data.get('success'): files = video_data.get('files') if 'files' in video_data else [{"filename": video_data.get("filename"), "success":True}] for file in files: - if not file.get("success"): + if not file.get("success") or file.get("filename") not in collected_scenes: continue + # List types are not super fun for CSV if 'post_ids' in video_data: video_data['post_ids'] = ','.join([str(i) for i in video_data['post_ids']]) diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index f668e6f5e..3c73e57f8 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -117,6 +117,9 @@ def process(self): if previous_video is not None or not looping: # draw the video filename/label on top of the rendered # frame thumbnails + if not previous_video: + # This likely means no frames were found for the video and this processor should not have run + continue video_label = labels.get(previous_video, previous_video) footersize = (fontsize * (len(video_label) + 2) * 0.5925, fontsize * 2) footer_shape = SVG(insert=(0, base_height - footersize[1]), size=footersize) @@ -165,6 +168,10 @@ def process(self): timeline.add(frame_element) timeline_widths[video] += frame_width + if not timeline_widths: + self.dataset.finish_with_error("No video frames found") + return + # now we know all dimensions we can instantiate the canvas too canvas_width = max(timeline_widths.values()) fontsize = 12 @@ -207,7 +214,7 @@ def get_video_labels(self, metadata): labels[filename] = filename for dataset, urls in mapping_dataset.items(): - dataset = DataSet(key=dataset, db=self.db).nearest("*-search") + dataset = DataSet(key=dataset, db=self.db, modules=self.modules).nearest("*-search") # determine appropriate label # is this the right place? should it be in the datasource? 
diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 0dfe2d408..0a1f235e0 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -212,6 +212,12 @@ def process(self): if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) body = post.get(column) + + try: + body = str(body) + except TypeError: + continue + if not body: continue diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index 6cc91eba1..d0e74a377 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -96,30 +96,6 @@ def error(code=200, **kwargs): return response -def string_to_timestamp(string): - """ - Convert dd-mm-yyyy date to unix time - - :param string: Date string to parse - :return: The unix time, or 0 if value could not be parsed - """ - bits = string.split("-") - if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string): - bits = list(reversed(bits)) - - if len(bits) != 3: - return 0 - - try: - day = int(bits[0]) - month = int(bits[1]) - year = int(bits[2]) - date = datetime.datetime(year, month, day) - except ValueError: - return 0 - - return int(date.timestamp()) - def pad_interval(intervals, first_interval=None, last_interval=None): """ Pad an interval so all intermediate intervals are filled @@ -299,25 +275,6 @@ def generate_css_colours(force=False): ) -def get_preview(query): - """ - Generate a data preview of 25 rows of a results csv - - :param query - :return list: - """ - preview = [] - with query.get_results_path().open(encoding="utf-8") as resultfile: - posts = csv.DictReader(resultfile) - i = 0 - for post in posts: - i += 1 - preview.append(post) - if i > 25: - break - return preview - - def format_chan_post(post): """ Format a plain-text imageboard post post for HTML display diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 8d1db0e2c..d3ba68314 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -1,5 +1,7 @@ import urllib.parse import datetime +from math import floor + import markdown import json import ural @@ -123,7 +125,7 @@ def _jinja2_filter_httpquery(data): return "" @app.template_filter("add_ahref") -def _jinja2_filter_add_ahref(content): +def _jinja2_filter_add_ahref(content, ellipsiate=0): """ Add HTML links to text @@ -138,7 +140,11 @@ def _jinja2_filter_add_ahref(content): return content for link in set(ural.urls_from_text(str(content))): - content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link}') + if ellipsiate > 0: + link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]") + else: + link_text = link + content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link_text}') return content @@ -203,6 +209,7 @@ def _jinja2_filter_extension_to_noun(ext): else: return "item" + @app.template_filter('social_mediafy') def _jinja2_filter_social_mediafy(body, datasource=""): # Adds links to a text body with hashtags, @-mentions, and URLs @@ -239,6 +246,176 @@ def _jinja2_filter_social_mediafy(body, datasource=""): } } + +@app.template_filter("ellipsiate") +def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"): + if len(text) <= length: + return text + + elif not inside: + return text[:length] + ellipsis_str + + else: + # two cases: URLs and normal text + # for URLs, try to only ellipsiate after the domain name + # this makes the URLs easier to read when shortened + if ural.is_url(text): + pre_part 
= "/".join(text.split("/")[:3]) + if len(pre_part) < length - 6: # kind of arbitrary + before = len(pre_part) + 1 + else: + before = floor(length / 2) + else: + before = floor(length / 2) + + after = len(text) - before + return text[:before] + ellipsis_str + text[after:] + +@app.template_filter('4chan_image') +def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): + + plebs_boards = ["adv","f","hr","mlpol","mo","o","pol","s4s","sp","tg","trv","tv","x"] + archivedmoe_boards = ["3","a","aco","adv","an","asp","b","bant","biz","c","can","cgl","ck","cm","co","cock","con","d","diy","e","f","fa","fap","fit","fitlit","g","gd","gif","h","hc","his","hm","hr","i","ic","int","jp","k","lgbt","lit","m","mlp","mlpol","mo","mtv","mu","n","news","o","out","outsoc","p","po","pol","pw","q","qa","qb","qst","r","r9k","s","s4s","sci","soc","sp","spa","t","tg","toy","trash","trv","tv","u","v","vg","vint","vip","vm","vmg","vp","vr","vrpg","vst","vt","w","wg","wsg","wsr","x","xs","y"] + + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} + + img_link = None + thumb_link = image_4chan.split(".") + thumb_link = thumb_link[0][:4] + "/" + thumb_link[0][4:6] + "/" + thumb_link[0] + "s." + thumb_link[1] + + # If the board is archived by 4plebs, check that site first + if board in plebs_boards: + + # First we're going to try to get the image link through the 4plebs API. + api_url = "https://archive.4plebs.org/_/api/chan/post/?board=%s&num=%s" % (board, post_id) + try: + api_json = requests.get(api_url, headers=headers) + except requests.RequestException as e: + pass + if api_json.status_code != 200: + pass + try: + api_json = json.loads(api_json.content) + img_link = api_json.get("media", {}).get("thumb_link", "") + except json.JSONDecodeError: + pass + if img_link: + return img_link + + # If that doesn't work, we can check whether we can retrieve the image directly. + # 4plebs has a back-referral system so that some filenames are translated. + # This means direct linking won't work for every image without API retrieval. + # So only show if we get a 200 status code. + img_page = requests.get("https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link), headers=headers) + if img_page.status_code == 200: + return "https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link) + + # If the board is archived by archivedmoe, we can also check this resource + if board in archivedmoe_boards: + img_page = requests.get("https://archived.moe/files/%s/thumb/%s" % (board, thumb_link), headers=headers) + if img_page.status_code == 200: + return img_page + + # If we couldn't retrieve the thumbnail yet, then we'll just give a search link + # and display it as a hidden image. + image_md5 = image_md5.replace("/", "_") + if board in plebs_boards: + return "retrieve:http://archive.4plebs.org/_/search/image/" + image_md5 + # Archivedmoe as a last resort - has a lot of boards + return "retrieve:https://archived.moe/_/search/image/" + image_md5 + + + +@app.template_filter('post_field') +def _jinja2_filter_post_field(field, post): + # Extracts string values between {{ two curly brackets }} and uses that + # as a dictionary key for the given dict. It then returns the corresponding value. + # Mainly used in the Explorer. 
+ + matches = False + formatted_field = field + + field = str(field) + + for key in re.findall(r"\{\{(.*?)\}\}", field): + + original_key = key + + # Remove possible slice strings so we get the original key + string_slice = None + if "[" in original_key and "]" in original_key: + string_slice = re.search(r"\[(.*?)\]", original_key) + if string_slice: + string_slice = string_slice.group(1) + key = key.replace("[" + string_slice + "]", "") + + # We're also gonna extract any other filters present + extra_filters = [] + if "|" in key: + extra_filters = key.split("|")[1:] + key = key.split("|")[0] + + # They keys can also be subfields (e.g. "author.username") + # So we're splitting and looping until we get the value. + keys = key.split(".") + val = post + + for k in keys: + if isinstance(val, list): + val = val[0] + if isinstance(val, dict): + val = val.get(k.strip(), "") + + # Return nothing if one of the fields is not found. + # We see 0 as a valid value - e.g. '0 retweets'. + if not val and val != 0: + return "" + + # Support some basic string slicing + if string_slice: + field = field.replace("[" + string_slice + "]", "") + if ":" not in string_slice: + string_slice = slice(int(string_slice), int(string_slice) + 1) + else: + sl = string_slice.split(":") + if not sl[0] and sl[0] != "0": + sl1 = 0 + sl2 = sl[1] + elif not sl[-1]: + sl1 = sl[0] + sl2 = len(st) + else: + sl1 = sl[0] + sl2 = sl[1] + string_slice = slice(int(sl1), int(sl2)) + + # Apply further filters, if present (e.g. lower) + for extra_filter in extra_filters: + + extra_filter = extra_filter.strip() + + # We're going to parse possible parameters to pass to the filter + # These are passed as unnamed variables to the function. + params = () + if "(" in extra_filter: + params = extra_filter.split("(")[-1][:-1].strip() + extra_filter = extra_filter.split("(")[0] + params = [p.strip() for p in params.split(",")] + params = [post[param] for param in params] + + val = app.jinja_env.filters[extra_filter](val, *params) + + if string_slice: + val = val[string_slice] + + # Extract single list item + if isinstance(val, list) and len(val) == 1: + val = val[0] + + formatted_field = formatted_field.replace("{{" + original_key + "}}", str(val)) + + return formatted_field + # Supported data sources known_datasources = list(base_urls.keys()) if datasource not in known_datasources: diff --git a/webtool/pages/faq.md b/webtool/pages/faq.md deleted file mode 100644 index 866a9675f..000000000 --- a/webtool/pages/faq.md +++ /dev/null @@ -1,15 +0,0 @@ -## Frequently Asked Questions - -### How do I cite this tool in my research paper? - -Please refer to the [How to cite](/page/citing/) page. - -### Where can I find more information about this tool? - -Take a look at 4CAT's [website](https://4cat.nl) and its -[GitHub repository](https://github.com/digitalmethodsinitiative/4cat)! - -### What query syntax can I use? - -Most standard search engine query syntax is supported. An -[overview of syntax you can use](/page/query-syntax/) is available. 
\ No newline at end of file diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index afa10f2c6..21d03c206 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -630,6 +630,10 @@ body.csv-preview table td, body.csv-preview table th { border: 1px solid var(--gray-light); } +body.csv-preview table tr:nth-child(2n+1) { + background: var(--contrast-bright); +} + .child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe { display: none; } diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index d8b41b1ee..fee8c0956 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -11,7 +11,8 @@

     What is {{ __user_config("4cat.name") }}?
     from a variety of online sources, and analyze the data through analytical processors.
     4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam.
-    For more information, take a look at the 4CAT website.
+    For more information, take a look at the 4CAT website or the tool's
+    GitHub repository.
     {% if __user_config("4cat.about_this_server") %}
     About this server
     {{ __user_config("4cat.about_this_server") }}
@@ -19,6 +20,7 @@
     About this server
     4CAT updates
     About
     {% for page in __user_config("ui.nav_pages") %}
-        {% if page == "faq" %}FAQ{% else %}{{ page|title }}{% endif %}
+        {{ page|title }}
     {% endfor %}
@@ -85,7 +85,9 @@
     {% endif %}
     How to cite
     Help & Bug Reports
+    {% if current_user.is_authenticated %}
     v{{ __version }}
+    {% endif %}
     OILab, 2018 – {{ __datenow.year }}
  • diff --git a/webtool/templates/preview/csv.html b/webtool/templates/preview/csv.html index fc36bb9d1..d2473735a 100644 --- a/webtool/templates/preview/csv.html +++ b/webtool/templates/preview/csv.html @@ -20,7 +20,7 @@ {% endif %} {% endif %} - {{ cell|e|add_ahref|safe }} + {{ cell|e|add_ahref(ellipsiate=50)|safe }} {% endfor %} diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index f7f66ad6e..7097a92ee 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -282,13 +282,6 @@ def queue_dataset(): Request parameters vary by data source. The ones mandated constitute the minimum but more may be required. - :request-param str board: Board ID to query - :request-param str datasource: Data source ID to query - :request-param str body_match: String to match in the post body - :request-param str subject_match: String to match in the post subject - :request-param int min_date: Timestamp marking the beginning of the match - period - :request-param int max_date: Timestamp marking the end of the match period :request-param str ?access_token: Access token; only required if not logged in currently. @@ -296,6 +289,7 @@ def queue_dataset(): status and results. :return-error 404: If the datasource does not exist. """ + datasource_id = request.form.get("datasource", "") if datasource_id not in fourcat_modules.datasources: return error(404, message="Datasource '%s' does not exist" % datasource_id) diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 500c5a821..bdd86a3f0 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -70,7 +70,7 @@ def show_results(page): filters["sort_by"] = "timestamp" if not request.args: - filters["hide_empty"] = True + filters["hide_empty"] = False # handle 'depth'; all, own datasets, or favourites? # 'all' is limited to admins diff --git a/webtool/views/views_misc.py b/webtool/views/views_misc.py index 4690b6228..e179085c2 100644 --- a/webtool/views/views_misc.py +++ b/webtool/views/views_misc.py @@ -73,7 +73,7 @@ def show_about(): datasources = {k: v for k, v in fourcat_modules.datasources.items() if k in config.get("datasources.enabled") and not v["importable"]} - importables = {k: v for k, v in fourcat_modules.datasources.items() if v["importable"]} + importables = {k: v for k, v in fourcat_modules.datasources.items() if (v["importable"] and k in config.get("datasources.enabled"))} return render_template("frontpage.html", stats=stats, news=news, datasources=datasources, importables=importables)
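The new `ellipsiate` template filter shortens long values, and the CSV preview template now passes `ellipsiate=50` to `add_ahref`, so linked URLs are displayed truncated while the `href` keeps the full address; for URLs the filter tries to cut after the scheme and host so the domain stays readable. A standalone approximation of that behaviour, mirroring the filter's own logic and using `ural.is_url` as the filter does:

```python
from math import floor

import ural


def ellipsiate(text, length, inside=False, ellipsis_str="…"):
    """Shorten `text` to roughly `length` characters, marking the cut."""
    if len(text) <= length:
        return text
    if not inside:
        return text[:length] + ellipsis_str

    # for URLs, keep everything up to and including the host before cutting;
    # this keeps shortened links recognisable in the preview table
    if ural.is_url(text):
        pre_part = "/".join(text.split("/")[:3])
        before = len(pre_part) + 1 if len(pre_part) < length - 6 else floor(length / 2)
    else:
        before = floor(length / 2)

    after = len(text) - before
    return text[:before] + ellipsis_str + text[after:]


# keeps the scheme and host, ellipsiates the middle of the long query string
print(ellipsiate("https://example.com/some/long/path?with=many&query=parameters", 40, inside=True))
```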