diff --git a/.readthedocs.yaml b/.readthedocs.yaml
deleted file mode 100644
index faaf6921a..000000000
--- a/.readthedocs.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# .readthedocs.yaml
-# Read the Docs configuration file
-# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-
-# Required
-version: 2
-
-# Set the version of Python and other tools you might need
-build:
- os: ubuntu-20.04
- tools:
- python: "3.8"
-
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
- configuration: docs/conf.py
-
-# Optionally build your docs in additional formats such as PDF
-# formats:
-# - pdf
-
-# Optionally declare the Python requirements required to build your docs
-python:
- install:
- - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/backend/lib/processor.py b/backend/lib/processor.py
index cada86171..52ad88cec 100644
--- a/backend/lib/processor.py
+++ b/backend/lib/processor.py
@@ -481,7 +481,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset,
self.dataset.update_status("Parent dataset updated.")
- def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True):
+    def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=None):
"""
A generator that iterates through files in an archive
@@ -498,6 +498,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T
:param bool immediately_delete: Temporary files are removed after yielded;
False keeps files until the staging_area is removed (usually during processor
cleanup)
+        :param list filename_filter: Whitelist of filenames to iterate.
+            Other files are skipped. If empty or None, no files are skipped.
:return: An iterator with a Path item for each file
"""
@@ -514,6 +516,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T
archive_contents = sorted(archive_file.namelist())
for archived_file in archive_contents:
+ if filename_filter and archived_file not in filename_filter:
+ continue
+
info = archive_file.getinfo(archived_file)
if info.is_dir():
continue
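
For reference, a minimal usage sketch of the new filename_filter argument (hypothetical call site, assuming `json` is imported; the image network processor later in this changeset uses the same pattern to pull only the metadata file out of an archive):

    # skip everything in the archive except the metadata file
    for file in self.iterate_archive_contents(archive_path, filename_filter=[".metadata.json"]):
        with file.open() as infile:
            metadata = json.load(infile)
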
diff --git a/backend/lib/search.py b/backend/lib/search.py
index 15b3982d6..3258561e1 100644
--- a/backend/lib/search.py
+++ b/backend/lib/search.py
@@ -170,10 +170,22 @@ def import_from_file(self, path):
if self.interrupted:
raise WorkerInterruptedException()
- # remove NUL bytes here because they trip up a lot of other
- # things
- # also include import metadata in item
- item = json.loads(line.replace("\0", ""))
+ try:
+ # remove NUL bytes here because they trip up a lot of other
+ # things
+ # also include import metadata in item
+ item = json.loads(line.replace("\0", ""))
+ except json.JSONDecodeError:
+ warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may "
+ f"indicate that the file you uploaded was incomplete and you need to try uploading it "
+ f"again. The item will be ignored.")
+
+ if warning not in import_warnings:
+ import_warnings[warning] = 0
+ import_warnings[warning] += 1
+ continue
+
+
new_item = {
**item["data"],
"__import_meta": {k: v for k, v in item.items() if k != "data"}
diff --git a/common/config_manager.py b/common/config_manager.py
index 1b8d4052f..7760aae99 100644
--- a/common/config_manager.py
+++ b/common/config_manager.py
@@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None)
if not is_json and value is not None:
value = json.loads(value)
- # TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale
- elif default is not None:
- value = default
+ # TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale
elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]:
value = self.config_definition[setting_name]["default"]
+ elif value is None and default is not None:
+ value = default
final_settings[setting_name] = value
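
The effective precedence after this change: a value stored for the setting wins; failing that, the default declared in config_definition applies; only if neither exists is the caller-supplied default used. A minimal illustration (the setting name is hypothetical):

    # no stored value and no config_definition entry for this name,
    # so the call-site default is what comes back
    value = config.get("extension.some-setting", default="fallback")
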
diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py
index 1ae6c06e5..4138ef4d0 100644
--- a/common/lib/config_definition.py
+++ b/common/lib/config_definition.py
@@ -529,11 +529,10 @@
"type": UserInput.OPTION_MULTI_SELECT,
"help": "Pages in navigation",
"options": {
- "faq": "FAQ",
"data-policy": "Data Policy",
"citing": "How to cite",
},
- "default": ["faq"],
+ "default": [],
"tooltip": "These pages will be included in the navigation bar at the top of the interface."
},
"ui.prefer_mapped_preview": {
diff --git a/common/lib/helpers.py b/common/lib/helpers.py
index 148a2cd8d..1dcd3b27b 100644
--- a/common/lib/helpers.py
+++ b/common/lib/helpers.py
@@ -1,14 +1,16 @@
"""
Miscellaneous helper functions for the 4CAT backend
"""
-import hashlib
import subprocess
+import imagehash
+import hashlib
import requests
import hashlib
import datetime
import smtplib
import fnmatch
import socket
+import shlex
import copy
import time
import json
@@ -24,6 +26,7 @@
from urllib.parse import urlparse, urlunparse
from calendar import monthrange
from packaging import version
+from PIL import Image
from common.lib.user_input import UserInput
from common.config_manager import config
@@ -111,10 +114,8 @@ def get_git_branch():
repository or git is not installed an empty string is returned.
"""
try:
- cwd = os.getcwd()
- os.chdir(config.get('PATH_ROOT'))
- branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE)
- os.chdir(cwd)
+ root_dir = str(config.get('PATH_ROOT').resolve())
+ branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE)
if branch.returncode != 0:
raise ValueError()
return branch.stdout.decode("utf-8").strip()
@@ -144,7 +145,6 @@ def get_software_commit(worker=None):
# try git command line within the 4CAT root folder
# if it is a checked-out git repository, it will tell us the hash of
# the currently checked-out commit
- cwd = os.getcwd()
# path has no Path.relative()...
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent
@@ -154,24 +154,24 @@ def get_software_commit(worker=None):
# useful version info (since the extension is by definition not in the
# main 4CAT repository) and will return an empty value
if worker and worker.is_extension:
- extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath)
- os.chdir(extension_dir)
+ working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve())
# check if we are in the extensions' own repo or 4CAT's
- repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+ git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel"
+ repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"):
# not its own repository
return ("", "")
else:
- os.chdir(config.get("PATH_ROOT"))
+ working_dir = str(config.get("PATH_ROOT").resolve())
- show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+ show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if show.returncode != 0:
raise ValueError()
commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1]
# now get the repository the commit belongs to, if we can
- origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+ origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if origin.returncode != 0 or not origin.stdout:
raise ValueError()
repository = origin.stdout.decode("utf-8").strip()
@@ -181,9 +181,6 @@ def get_software_commit(worker=None):
except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e:
return ("", "")
- finally:
- os.chdir(cwd)
-
return (commit, repository)
def get_software_version():
@@ -279,7 +276,6 @@ def find_extensions():
# collect metadata for extensions
allowed_metadata_keys = ("name", "version", "url")
- cwd = os.getcwd()
for extension in extensions:
extension_folder = extension_path.joinpath(extension)
metadata_file = extension_folder.joinpath("metadata.json")
@@ -296,8 +292,8 @@ def find_extensions():
if extensions[extension]["is_git"]:
# try to get remote URL
try:
- os.chdir(extension_folder)
- origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE,
+ extension_root = str(extension_folder.resolve())
+ origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
if origin.returncode != 0 or not origin.stdout:
raise ValueError()
@@ -309,8 +305,6 @@ def find_extensions():
except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e:
print(e)
pass
- finally:
- os.chdir(cwd)
return extensions, errors
@@ -421,6 +415,37 @@ def andify(items):
return ", ".join([str(item) for item in items]) + result
+def hash_file(image_file, hash_type="file-hash"):
+ """
+    Generate a file hash or perceptual image hash
+
+ :param Path image_file: Image file to hash
+ :param str hash_type: Hash type, one of `file-hash`, `colorhash`,
+ `phash`, `average_hash`, `dhash`
+ :return str: Hexadecimal hash value
+ """
+ if not image_file.exists():
+ raise FileNotFoundError()
+
+ if hash_type == "file-hash":
+ hasher = hashlib.sha1()
+
+ # Open the file in binary mode
+ with image_file.open("rb") as infile:
+ # Read and update hash in chunks to handle large files
+ while chunk := infile.read(1024):
+ hasher.update(chunk)
+
+ return hasher.hexdigest()
+
+ elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
+ image = Image.open(image_file)
+
+ return str(getattr(imagehash, hash_type)(image))
+
+ else:
+ raise NotImplementedError(f"Unknown hash type '{hash_type}'")
+
def get_yt_compatible_ids(yt_ids):
"""
:param yt_ids list, a list of strings
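
A short usage sketch for the new hash_file() helper (the file path is hypothetical); the perceptual variants rely on the Pillow and imagehash imports added above:

    from pathlib import Path
    from common.lib.helpers import hash_file

    image = Path("example.jpg")          # hypothetical image file
    print(hash_file(image))              # "file-hash": SHA-1 of the raw bytes
    print(hash_file(image, "phash"))     # perceptual hash via imagehash
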
diff --git a/common/lib/logger.py b/common/lib/logger.py
index bbd30c444..ddffa2d72 100644
--- a/common/lib/logger.py
+++ b/common/lib/logger.py
@@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log'
self.logger.setLevel(log_level)
# this handler manages the text log files
- handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1)
- handler.setLevel(log_level)
- handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s",
- "%d-%m-%Y %H:%M:%S"))
- self.logger.addHandler(handler)
-
- # the slack webhook has its own handler, and is only active if the
- # webhook URL is set
- try:
- if config.get("logging.slack.webhook"):
- slack_handler = SlackLogHandler(config.get("logging.slack.webhook"))
- slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level))
- self.logger.addHandler(slack_handler)
- except Exception:
- # we *may* need the logger before the database is in working order
- if config.db is not None:
- config.db.rollback()
+ if not self.logger.handlers:
+ handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1)
+ handler.setLevel(log_level)
+ handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s",
+ "%d-%m-%Y %H:%M:%S"))
+ self.logger.addHandler(handler)
+
+ # the slack webhook has its own handler, and is only active if the
+ # webhook URL is set
+ try:
+ if config.get("logging.slack.webhook"):
+ slack_handler = SlackLogHandler(config.get("logging.slack.webhook"))
+ slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level))
+ self.logger.addHandler(slack_handler)
+ except Exception:
+ # we *may* need the logger before the database is in working order
+ if config.db is not None:
+ config.db.rollback()
def log(self, message, level=logging.INFO, frame=None):
"""
diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py
index f7cb7590e..6bee534dd 100644
--- a/datasources/tiktok/search_tiktok.py
+++ b/datasources/tiktok/search_tiktok.py
@@ -50,16 +50,16 @@ def map_item(post):
# from intercepted API response
user_nickname = post["author"]["uniqueId"]
user_fullname = post["author"]["nickname"]
- user_id = post["author"]["id"]
+ user_thumbnail = post["author"].get("avatarThumb", "")
elif post.get("author"):
# from embedded JSON object
user_nickname = post["author"]
user_fullname = post["nickname"]
- user_id = ""
+ user_thumbnail = ""
else:
user_nickname = ""
user_fullname = ""
- user_id = ""
+ user_thumbnail = ""
# there are various thumbnail URLs, some of them expire later than
# others. Try to get the highest-resolution one that hasn't expired
@@ -84,13 +84,15 @@ def map_item(post):
"author_followers": post.get("authorStats", {}).get("followerCount", ""),
"author_likes": post.get("authorStats", {}).get("diggCount", ""),
"author_videos": post.get("authorStats", {}).get("videoCount", ""),
- "author_avatar": post.get("avatarThumb", ""),
+ "author_avatar": user_thumbnail,
"body": post["desc"],
"timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'),
"unix_timestamp": int(post["createTime"]),
"is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no",
"is_ad": "yes" if post.get("isAd", False) else "no",
"is_paid_partnership": "yes" if post.get("adAuthorization") else "no",
+ "is_sensitive": "yes" if post.get("maskType") == 3 else "no",
+ "is_photosensitive": "yes" if post.get("maskType") == 4 else "no",
"music_name": post["music"]["title"],
"music_id": post["music"]["id"],
"music_url": post["music"].get("playUrl", ""),
diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py
index efaffc21d..31471fcdc 100644
--- a/datasources/tiktok_comments/search_tiktok_comments.py
+++ b/datasources/tiktok_comments/search_tiktok_comments.py
@@ -58,7 +58,7 @@ def map_item(item):
"post_url": item["share_info"]["url"].split(".html")[0],
"post_body": item["share_info"]["title"],
"comment_url": item["share_info"]["url"],
- "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no",
+ "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no",
"is_sticky": "yes" if bool(item["stick_position"]) else "no",
"is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes",
"language_guess": item["comment_language"]
diff --git a/datasources/twitterv2/DESCRIPTION.md b/datasources/twitterv2/DESCRIPTION.md
index 57f1f7a59..d138e6754 100644
--- a/datasources/twitterv2/DESCRIPTION.md
+++ b/datasources/twitterv2/DESCRIPTION.md
@@ -1,93 +1,88 @@
-Twitter data is gathered through the official [Twitter v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT
-allows access to both the Standard and the Academic track. The Standard track is free for anyone to use, but only
-allows to retrieve tweets up to seven days old. The Academic track allows a full-archive search of up to ten million
-tweets per month (as of March 2022). For the Academic track, you need a valid Bearer token. You can request one
-[here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you).
+X/Twitter data is gathered through the official [X v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT can interface with X's Research API (sometimes
+branded as the 'DSA API', referencing the EU's Digital Services Act). To retrieve posts via this API with 4CAT, you need
+a valid Bearer token. Read more about this mode of access [here](https://developer.x.com/en/use-cases/do-research/academic-research).
-Tweets are captured in batches at a speed of approximately 100,000 tweets per hour. 4CAT will warn you if your dataset
+Posts are captured in batches at a speed of approximately 100,000 posts per hour. 4CAT will warn you if your dataset
is expected to take more than 30 minutes to collect. It is often a good idea to start small (with very specific
queries or narrow date ranges) and then only create a larger dataset if you are confident that it will be manageable and
useful for your analysis.
-If you hit your Twitter API quota while creating a dataset, the dataset will be finished with the tweets that have been
+If you hit your X API quota while creating a dataset, the dataset will be finished with the posts that have been
collected so far and a warning will be logged.
### Query syntax
-Check the [API documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)
+Check the [API documentation](https://developer.x.com/en/docs/x-api/tweets/search/integrate/build-a-query)
for available query syntax and operators. This information is crucial to what data you collect. Important operators for
-instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted tweets and retweets. Query syntax
-is roughly the same as for Twitter's search interface, so you can try out most queries by entering them in the Twitter
-app or website's search field and looking at the results. You can also test queries with
-Twitter's [Query Builder](https://developer.twitter.com/apitools/query?query=).
+instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted posts and reposts. Query syntax
+is roughly the same as for X's search interface, so you can try out most queries by entering them in the X app or
+website's search field and looking at the results. You can also test queries with
+X's [Query Builder](https://developer.twitter.com/apitools/query?query=).
### Date ranges
-By default, Twitter returns tweets posted within the past 30 days. If you want to go back further, you need to
-explicitly set a date range. Note that Twitter does not like date ranges that end in the future, or start before
-Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date.
+By default, X returns posts published within the past 30 days. If you want to go back further, you need to
+explicitly set a date range. Note that X does not like date ranges that end in the future, or start before
+Twitter existed. If you want to capture posts "until now", it is often best to use yesterday as an end date. Also note
+that API access may come with certain limitations on how far a query may extend into history.
### Geo parameters
-Twitter offers a number of ways
-to [query by location/geo data](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location)
-such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. This feature is only available for the Academic level;
-you will receive a 400 error if using queries filtering by geographic information.
+X offers a number of ways
+to [query by location/geo data](https://developer.x.com/en/docs/tutorials/filtering-tweets-by-location)
+such as `has:geo` or `place:Amsterdam`.
### Retweets
-A retweet from Twitter API v2 contains at maximum 140 characters from the original tweet. 4CAT therefore
-gathers both the retweet and the original tweet and reformats the retweet text so it resembles a user's experience.
+A repost from X API v2 contains at maximum 140 characters from the original post. 4CAT therefore
+gathers both the repost and the original post and reformats the repost text so it resembles a user's experience.
This also affects mentions, hashtags, and other data as only those contained in the first 140 characters are provided
-by Twitter API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added
-to the retweet for 4CAT analysis methods. *4CAT stores the data from Twitter API v2 as similar as possible to the format
+by X API v2 with the repost. Additional hashtags, mentions, etc. are taken from the original post and added
+to the repost for 4CAT analysis methods. *4CAT keeps the data from X API v2 as close as possible to the format
in which it was received which you can obtain by downloading the ndjson file.*
*Example 1*
-[This retweet](https://twitter.com/tonino1630/status/1554618034299568128) returns the following data:
+[This repost](https://x.com/tonino1630/status/1554618034299568128) returns the following data:
- *author:* `tonino1630`
-- *
- text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…`
+- *text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…`
- *mentions:* `ChuckyFrao`
- *hashags:*
-While the original tweet will return (as a reference tweet) this data:
+While the original post will return (as a reference post) this data:
- *author:* `ChuckyFrao`
-- *
- text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0`
+- *text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0`
- *mentions:* `POTUS, usembassyve, StateSPEHA, StateDept, SecBlinken`
- *hashtags:* `FreeAlexSaab, BringAlexHome, IntegridadTerritorial`
-As you can see, only the author of the original tweet is listed as a mention in the retweet.
+As you can see, only the author of the original post is listed as a mention in the repost.
*Example 2*
-[This retweet](https://twitter.com/Macsmart31/status/1554618041459445760) returns the following:
+[This repost](https://x.com/Macsmart31/status/1554618041459445760) returns the following:
- *author:* `Macsmart31`
-- *
- text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…`
+- *text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…`
- *mentions:* `mickyd123us, tribelaw, HonorDecency`
-Compared with the original tweet referenced below:
+Compared with the original post referenced below:
- *author:* `mickyd123us`
-- *
- text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr`
+- *text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr`
- *mentions:* `tribelaw, HonorDecency`
-Because the mentioned users are in the first 140 characters of the original tweet, they are also listed as mentions in the retweet.
-
-The key difference here is that example one the retweet contains none of the hashtags or mentions from the original
-tweet (they are beyond the first 140 characters) while the second retweet example does return mentions from the original
-tweet. *Due to this discrepancy, for retweets all mentions and hashtags of the original tweet are considered as mentions
-and hashtags of the retweet.* A user on Twitter will see all mentions and hashtags when viewing a retweet and the
-retweet would be a part of any network around those mentions and hashtags.
+Because the mentioned users are in the first 140 characters of the original post, they are also listed as mentions in
+the repost.
+
+The key difference here is that in example one the repost contains none of the hashtags or mentions from the original
+post (they are beyond the first 140 characters) while the second repost example does return mentions from the original
+post. *Due to this discrepancy, for reposts all mentions and hashtags of the original post are considered as mentions
+and hashtags of the repost.* A user on X will see all mentions and hashtags when viewing a repost and the
+repost would be a part of any network around those mentions and hashtags.
diff --git a/datasources/twitterv2/__init__.py b/datasources/twitterv2/__init__.py
index 3335bc7c0..6aa80c7b3 100644
--- a/datasources/twitterv2/__init__.py
+++ b/datasources/twitterv2/__init__.py
@@ -9,4 +9,4 @@
# Internal identifier for this data source
DATASOURCE = "twitterv2"
-NAME = "Twitter API (v2) Search"
\ No newline at end of file
+NAME = "X/Twitter API (v2) Search"
\ No newline at end of file
diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py
index 999680b6e..8b91d1eb2 100644
--- a/datasources/twitterv2/search_twitter.py
+++ b/datasources/twitterv2/search_twitter.py
@@ -1,5 +1,5 @@
"""
-Twitter keyword search via the Twitter API v2
+X/Twitter keyword search via the X API v2
"""
import requests
import datetime
@@ -17,13 +17,10 @@
class SearchWithTwitterAPIv2(Search):
"""
- Get Tweets via the Twitter API
-
- This only allows for historical search - use f.ex. TCAT for more advanced
- queries.
+ Get Tweets via the X API
"""
type = "twitterv2-search" # job ID
- title = "Twitter API (v2)"
+ title = "X/Twitter API (v2)"
extension = "ndjson"
is_local = False # Whether this datasource is locally scraped
is_static = False # Whether this datasource is still updated
@@ -32,15 +29,15 @@ class SearchWithTwitterAPIv2(Search):
import_issues = True
references = [
- "[Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api)"
+ "[X/Twitter API documentation](https://developer.x.com/en/docs/x-api)"
]
config = {
"twitterv2-search.academic_api_key": {
"type": UserInput.OPTION_TEXT,
"default": "",
- "help": "Academic API Key",
- "tooltip": "An API key for the Twitter v2 Academic API. If "
+ "help": "Research API Key",
+ "tooltip": "An API key for the X/Twitter v2 Research API. If "
"provided, the user will not need to enter their own "
"key to retrieve tweets. Note that this API key should "
"have access to the Full Archive Search endpoint."
@@ -50,15 +47,15 @@ class SearchWithTwitterAPIv2(Search):
"default": 0,
"min": 0,
"max": 10_000_000,
- "help": "Max tweets per dataset",
+ "help": "Max posts per dataset",
"tooltip": "4CAT will never retrieve more than this amount of "
- "tweets per dataset. Enter '0' for unlimited tweets."
+ "posts per dataset. Enter '0' for unlimited posts."
},
"twitterv2-search.id_lookup": {
"type": UserInput.OPTION_TOGGLE,
"default": False,
"help": "Allow lookup by ID",
- "tooltip": "If enabled, allow users to enter a list of tweet IDs "
+ "tooltip": "If enabled, allow users to enter a list of post IDs "
"to retrieve. This is disabled by default because it "
"can be confusing to novice users."
}
@@ -110,7 +107,7 @@ def get_items(self, query):
}
if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"):
- endpoint = "https://api.twitter.com/2/tweets"
+ endpoint = "https://api.x.com/2/tweets"
tweet_ids = self.parameters.get("query", []).split(',')
@@ -126,7 +123,7 @@ def get_items(self, query):
else:
# Query to all or search
- endpoint = "https://api.twitter.com/2/tweets/search/" + api_type
+ endpoint = "https://api.x.com/2/tweets/search/" + api_type
queries = [self.parameters.get("query", "")]
@@ -158,7 +155,7 @@ def get_items(self, query):
while True:
if self.interrupted:
- raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API")
+                raise ProcessorInterruptedException("Interrupted while getting posts from the X API")
# there is a limit of one request per second, so stay on the safe side of this
while self.previous_request == int(time.time()):
@@ -188,18 +185,18 @@ def get_items(self, query):
try:
structured_response = api_response.json()
if structured_response.get("title") == "UsageCapExceeded":
- self.dataset.update_status("Hit the monthly tweet cap. You cannot capture more tweets "
- "until your API quota resets. Dataset completed with tweets "
+ self.dataset.update_status("Hit the monthly post cap. You cannot capture more posts "
+ "until your API quota resets. Dataset completed with posts "
"collected so far.", is_final=True)
return
except (json.JSONDecodeError, ValueError):
- self.dataset.update_status("Hit Twitter rate limit, but could not figure out why. Halting "
- "tweet collection.", is_final=True)
+ self.dataset.update_status("Hit X's rate limit, but could not figure out why. Halting "
+ "post collection.", is_final=True)
return
resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1
resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
- self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str)
+ self.dataset.update_status("Hit X's rate limit - waiting until %s to continue." % resume_at_str)
while time.time() <= resume_at:
if self.interrupted:
raise ProcessorInterruptedException("Interrupted while waiting for rate limit to reset")
@@ -211,10 +208,10 @@ def get_items(self, query):
elif api_response.status_code == 403:
try:
structured_response = api_response.json()
- self.dataset.update_status("'Forbidden' error from the Twitter API. Could not connect to Twitter API "
+                    self.dataset.update_status("'Forbidden' error from the X API. Could not connect to the X API "
"with this API key. %s" % structured_response.get("detail", ""), is_final=True)
except (json.JSONDecodeError, ValueError):
- self.dataset.update_status("'Forbidden' error from the Twitter API. Your key may not have access to "
+ self.dataset.update_status("'Forbidden' error from the X API. Your key may not have access to "
"the full-archive search endpoint.", is_final=True)
finally:
return
@@ -224,7 +221,7 @@ def get_items(self, query):
elif api_response.status_code in (502, 503, 504):
resume_at = time.time() + 60
resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
- self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % (
+ self.dataset.update_status("X unavailable (status %i) - waiting until %s to continue." % (
api_response.status_code, resume_at_str))
while time.time() <= resume_at:
time.sleep(0.5)
@@ -233,7 +230,7 @@ def get_items(self, query):
# this usually means the query is too long or otherwise contains
# a syntax error
elif api_response.status_code == 400:
- msg = "Response %i from the Twitter API; " % api_response.status_code
+ msg = "Response %i from the X API; " % api_response.status_code
try:
api_response = api_response.json()
msg += api_response.get("title", "")
@@ -247,19 +244,19 @@ def get_items(self, query):
# invalid API key
elif api_response.status_code == 401:
- self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True)
+                self.dataset.update_status("Invalid API key - could not connect to the X API", is_final=True)
return
# haven't seen one yet, but they probably exist
elif api_response.status_code != 200:
self.dataset.update_status(
"Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True)
- self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % (
+ self.log.warning("X API v2 responded with status code %i. Response body: %s" % (
api_response.status_code, api_response.text))
return
elif not api_response:
- self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True)
+ self.dataset.update_status("Could not connect to X. Cancelling.", is_final=True)
return
api_response = api_response.json()
@@ -291,13 +288,13 @@ def get_items(self, query):
if num_missing_objects > 50:
# Large amount of missing objects; possible error with Twitter API
self.import_issues = False
- error_report.append('%i missing objects received following tweet number %i. Possible issue with Twitter API.' % (num_missing_objects, tweets))
+ error_report.append('%i missing objects received following post number %i. Possible issue with X API.' % (num_missing_objects, tweets))
error_report.append('Missing objects collected: ' + ', '.join(['%s: %s' % (k, len(v)) for k, v in missing_objects.items()]))
# Warn if new missing object is recorded (for developers to handle)
expected_error_types = ['user', 'media', 'poll', 'tweet', 'place']
if any(key not in expected_error_types for key in missing_objects.keys()):
- self.log.warning("Twitter API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types]))
+ self.log.warning("X API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types]))
# Loop through and collect tweets
for tweet in api_response.get("data", []):
@@ -312,7 +309,7 @@ def get_items(self, query):
tweets += 1
if tweets % 500 == 0:
- self.dataset.update_status("Received %s of ~%s tweets from the Twitter API" % ("{:,}".format(tweets), expected_tweets))
+                        self.dataset.update_status("Received %s of ~%s posts from the X API" % ("{:,}".format(tweets), expected_tweets))
if num_expected_tweets is not None:
self.dataset.update_progress(tweets / num_expected_tweets)
@@ -474,21 +471,19 @@ def get_options(cls, parent_dataset=None, user=None):
max_tweets = config.get("twitterv2-search.max_tweets", user=user)
if have_api_key:
- intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve "
+ intro_text = ("This data source uses the full-archive search endpoint of the X API (v2) to retrieve "
"historic tweets that match a given query.")
else:
- intro_text = ("This data source uses either the Standard 7-day historical Search endpoint or the "
- "full-archive search endpoint of the Twitter API, v2. To use the latter, you must have "
- "access to the Academic Research track of the Twitter API. In either case, you will need to "
- "provide a valid [bearer "
- "token](https://developer.twitter.com/en/docs/authentication/oauth-2-0). The bearer token "
- "**will be sent to the 4CAT server**, where it will be deleted after data collection has "
- "started. Note that any tweets retrieved with 4CAT will count towards your monthly Tweet "
- "retrieval cap.")
-
- intro_text += ("\n\nPlease refer to the [Twitter API documentation]("
- "https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) "
+            intro_text = ("This data source uses the full-archive search endpoint of the X/Twitter API, v2. To use "
+ "it, you must have access to the Research track of the X API. You will need to provide a "
+ "valid [bearer token](https://developer.x.com/en/docs/authentication/oauth-2-0). The "
+ "bearer token **will be sent to the 4CAT server**, where it will be deleted after data "
+ "collection has started. Note that any posts retrieved with 4CAT will count towards your "
+ "monthly post retrieval cap.")
+
+            intro_text += ("\n\nPlease refer to the [X API]("
+ "https://developer.x.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) "
"documentation for more information about this API endpoint and the syntax you can use in your "
"search query. Retweets are included by default; add `-is:retweet` to exclude them.")
@@ -500,16 +495,18 @@ def get_options(cls, parent_dataset=None, user=None):
}
if not have_api_key:
+ # options.update({
+ # "api_type": {
+ # "type": UserInput.OPTION_CHOICE,
+ # "help": "API track",
+ # "options": {
+ # "all": "Research API: Full-archive search",
+ # "recent": "Standard: Recent search (Tweets published in last 7 days)",
+ # },
+ # "default": "all"
+ # }
+ # })
options.update({
- "api_type": {
- "type": UserInput.OPTION_CHOICE,
- "help": "API track",
- "options": {
- "all": "Academic: Full-archive search",
- "recent": "Standard: Recent search (Tweets published in last 7 days)",
- },
- "default": "all"
- },
"api_bearer_token": {
"type": UserInput.OPTION_TEXT,
"sensitive": True,
@@ -523,10 +520,10 @@ def get_options(cls, parent_dataset=None, user=None):
"query_type": {
"type": UserInput.OPTION_CHOICE,
"help": "Query type",
- "tooltip": "Note: Num of Tweets and Date fields ignored with 'Tweets by ID' lookup",
+ "tooltip": "Note: Num of posts and date fields are ignored with 'Posts by ID' lookup",
"options": {
"query": "Search query",
- "id_lookup": "Tweets by ID (list IDs seperated by commas or one per line)",
+                        "id_lookup": "Posts by ID (list IDs separated by commas or one per line)",
},
"default": "query"
}
@@ -539,7 +536,7 @@ def get_options(cls, parent_dataset=None, user=None):
},
"amount": {
"type": UserInput.OPTION_TEXT,
- "help": "Tweets to retrieve",
+ "help": "Posts to retrieve",
"tooltip": "0 = unlimited (be careful!)" if not max_tweets else ("0 = maximum (%s)" % str(max_tweets)),
"min": 0,
"max": max_tweets if max_tweets else 10_000_000,
@@ -550,7 +547,7 @@ def get_options(cls, parent_dataset=None, user=None):
},
"daterange-info": {
"type": UserInput.OPTION_INFO,
- "help": "By default, Twitter returns tweets up til 30 days ago. If you want to go back further, you "
+                "help": "By default, X returns posts up to 30 days old. If you want to go back further, you "
"need to explicitly set a date range."
},
"daterange": {
@@ -591,7 +588,7 @@ def validate_query(query, request, user):
raise QueryParametersException("Please provide a valid bearer token.")
if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup":
- raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.")
+ raise QueryParametersException("X API queries cannot be longer than 1024 characters.")
if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user):
# reformat queries to be a comma-separated list with no wrapping
@@ -630,7 +627,7 @@ def validate_query(query, request, user):
# to dissuade users from running huge queries that will take forever
# to process
if params["query_type"] == "query" and (params.get("api_type") == "all" or have_api_key):
- count_url = "https://api.twitter.com/2/tweets/counts/all"
+ count_url = "https://api.x.com/2/tweets/counts/all"
count_params = {
"granularity": "day",
"query": params["query"],
@@ -668,7 +665,7 @@ def validate_query(query, request, user):
elif response.status_code == 401:
raise QueryParametersException("Your bearer token seems to be invalid. Please make sure it is valid "
- "for the Academic Track of the Twitter API.")
+ "for the Research track of the X API.")
elif response.status_code == 400:
raise QueryParametersException("Your query is invalid. Please make sure the date range does not "
@@ -791,7 +788,7 @@ def map_item(item):
"thread_id": item.get("conversation_id", item["id"]),
"timestamp": tweet_time.strftime("%Y-%m-%d %H:%M:%S"),
"unix_timestamp": int(tweet_time.timestamp()),
- 'link': "https://twitter.com/%s/status/%s" % (author_username, item.get('id')),
+ 'link': "https://x.com/%s/status/%s" % (author_username, item.get('id')),
"subject": "",
"body": item["text"],
"author": author_username,
diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py
index bd7b81289..f6c8bcc11 100644
--- a/processors/conversion/export_datasets.py
+++ b/processors/conversion/export_datasets.py
@@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor):
type = "export-datasets" # job type ID
category = "Conversion" # category
title = "Export Dataset and All Analyses" # title displayed in UI
- description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI
+    description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expire after 1 day, after which you must run this processor again." # description displayed in UI
extension = "zip" # extension of result file, used internally and in UI
@classmethod
@@ -40,6 +40,11 @@ def process(self):
This takes a CSV file as input and writes the same data as a JSON file
"""
self.dataset.update_status("Collecting dataset and all analyses")
+ primary_dataset = self.dataset.top_parent()
+ if not primary_dataset.is_finished():
+            # This should not happen, as processors (including this one) are only available for finished datasets
+            self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until the dataset has finished before exporting.")
+ return
results_path = self.dataset.get_staging_area()
@@ -52,25 +57,26 @@ def process(self):
try:
dataset = DataSet(key=dataset_key, db=self.db)
- # TODO: these two should fail for the primary dataset, but should they fail for the children too?
except DataSetException:
- self.dataset.finish_with_error("Dataset not found.")
- return
+ self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.")
+ failed_exports.append(dataset_key)
+ continue
if not dataset.is_finished():
- self.dataset.finish_with_error("You cannot export unfinished datasets.")
- return
+ self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.")
+ failed_exports.append(dataset_key)
+ continue
# get metadata
metadata = dataset.get_metadata()
if metadata["num_rows"] == 0:
- self.dataset.update_status(f"Skipping empty dataset {dataset_key}")
+ self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.")
failed_exports.append(dataset_key)
continue
# get data
data_file = dataset.get_results_path()
if not data_file.exists():
- self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.")
+ self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.")
failed_exports.append(dataset_key)
continue
diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py
index 860c0ddbe..461cdd54a 100644
--- a/processors/conversion/merge_datasets.py
+++ b/processors/conversion/merge_datasets.py
@@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None):
return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector())
@staticmethod
- def get_dataset_from_url(url, db):
+ def get_dataset_from_url(url, db, modules=None):
"""
Get dataset object based on dataset URL
@@ -68,6 +68,7 @@ def get_dataset_from_url(url, db):
:param str url: Dataset URL
:param db: Database handler (to retrieve metadata)
+        :param modules: Modules handler (passed through to DataSet)
:return DataSet: The dataset
"""
if not url:
@@ -75,7 +76,7 @@ def get_dataset_from_url(url, db):
source_url = ural.normalize_url(url)
source_key = source_url.split("/")[-1]
- return DataSet(key=source_key, db=db)
+ return DataSet(key=source_key, db=db, modules=modules)
def process(self):
"""
@@ -96,7 +97,7 @@ def process(self):
continue
try:
- source_dataset = self.get_dataset_from_url(source_dataset_url, self.db)
+ source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules)
except DataSetException:
return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform "
f"merge.")
diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py
new file mode 100644
index 000000000..a8dd8763e
--- /dev/null
+++ b/processors/filtering/unique_images.py
@@ -0,0 +1,113 @@
+"""
+Filter by unique images
+"""
+import shutil
+import json
+
+from backend.lib.processor import BasicProcessor
+from common.lib.exceptions import ProcessorInterruptedException
+from common.lib.helpers import UserInput, hash_file
+
+__author__ = "Stijn Peeters"
+__credits__ = ["Stijn Peeters"]
+__maintainer__ = "Stijn Peeters"
+__email__ = "4cat@oilab.eu"
+
+
+class UniqueImageFilter(BasicProcessor):
+ """
+ Retain only unique images, by a user-defined metric
+ """
+ type = "image-downloader-unique" # job type ID
+ category = "Visualisation" # category
+ title = "Filter for unique images" # title displayed in UI
+ description = "Only keeps one instance per image, using a choice of detection method." # description displayed in UI
+ extension = "zip"
+
+ references = [
+ "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)",
+ "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)",
+ "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)",
+ "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)",
+
+ ]
+
+ options = {
+ "hash-type": {
+ "type": UserInput.OPTION_CHOICE,
+ "help": "Comparison method",
+ "default": "file-hash",
+ "options": {
+ "file-hash": "File hash (files need to be byte-by-byte duplicates)",
+ "colorhash": "Colour hash (good at colours, worse at shapes)",
+ "phash": "Perceptual hash (decent at colours and shapes)",
+ "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)",
+ "dhash": "Difference hash (similar to average hash, better at photos and art)"
+ }
+ }
+ }
+
+ @classmethod
+ def is_compatible_with(cls, module=None, user=None):
+ """
+ Allow processor on image archives
+
+ :param module: Module to determine compatibility with
+ """
+ return module.get_media_type() == "image" or module.type.startswith(
+ "image-downloader") or module.type == "video-frames"
+
+ def process(self):
+ """
+ Loop through images and only retain ones that have not been seen yet
+
+ :return:
+ """
+ seen_hashes = set()
+ hash_map = {}
+ metadata = None
+ dupes = 0
+ processed = 0
+ staging_area = self.dataset.get_staging_area()
+
+ self.dataset.update_status("Processing images and looking for duplicates")
+ for image_file in self.iterate_archive_contents(self.source_file):
+ if self.interrupted:
+ raise ProcessorInterruptedException("Interrupted while filtering for unique images")
+
+ self.dataset.update_progress(processed / self.source_dataset.num_rows)
+ if processed % 100 == 0:
+ self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, "
+ f"found {dupes:,} duplicate(s)")
+ processed += 1
+
+ if image_file.name == ".metadata.json":
+ with image_file.open() as infile:
+ metadata = json.load(infile)
+ continue
+
+ image_hash = hash_file(image_file, self.parameters.get("hash-type"))
+
+ if image_hash not in seen_hashes:
+ seen_hashes.add(image_hash)
+ shutil.copy2(image_file, staging_area)
+ hash_map[image_hash] = image_file.name
+ else:
+ self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping")
+ dupes += 1
+
+ new_metadata = {}
+ inverse_hashmap = {v: k for k, v in hash_map.items()}
+ for url, item in metadata.items():
+ if item["filename"] in inverse_hashmap:
+ new_metadata[inverse_hashmap[item["filename"]]] = {
+ **item,
+ "hash": inverse_hashmap[item["filename"]],
+ "hash_type": self.parameters.get("hash-type")
+ }
+
+ with staging_area.joinpath(".metadata.json").open("w") as outfile:
+ json.dump(new_metadata, outfile)
+
+ self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True)
+ self.write_archive_and_finish(staging_area, len(hash_map), finish=True)
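
Note that the processor only treats images as duplicates when their hashes are exactly equal (set membership); the imagehash library can also express near-duplicates as a Hamming distance between hashes, which is not used here. A hedged sketch of that comparison (filenames hypothetical):

    from PIL import Image
    import imagehash

    hash_a = imagehash.phash(Image.open("a.jpg"))
    hash_b = imagehash.phash(Image.open("b.jpg"))
    print(hash_a - hash_b)   # Hamming distance; 0 means the hashes are identical
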
diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py
index 022e96de5..26234a186 100644
--- a/processors/machine_learning/annotate_text.py
+++ b/processors/machine_learning/annotate_text.py
@@ -184,8 +184,8 @@ def process(self):
# prepare data for annotation
data_path = staging_area.joinpath("data.temp.ndjson")
with data_path.open("w", newline="") as outfile:
- for item in self.source_dataset.iterate_items():
- outfile.write(json.dumps({item.get("id"): item.get(textfield)}) + "\n")
+ for i, item in enumerate(self.source_dataset.iterate_items()):
+ outfile.write(json.dumps({item.get("id", str(i)): item.get(textfield)}) + "\n")
path_to_files, path_to_results = dmi_service_manager.process_files(staging_area,
[data_path.name, labels_path.name],
@@ -238,15 +238,14 @@ def make_filename(id, prompt):
self.dataset.update_status("Loading annotated data")
with output_dir.joinpath("results.json").open() as infile:
annotations = json.load(infile)
-
self.dataset.update_status("Writing results")
with self.dataset.get_results_path().open("w") as outfile:
writer = None
- for item in self.source_dataset.iterate_items():
+ for i, item in enumerate(self.source_dataset.iterate_items()):
row = {
- "id": item.get("id"),
+ "id": item.get("id", i),
textfield: item.get(textfield),
- "category": annotations[item.get("id")]
+                    "category": annotations.get(item.get("id", str(i)))  # annotation keys are strings, so fall back to str(i)
}
if not writer:
writer = csv.DictWriter(outfile, fieldnames=row.keys())
diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py
index 236e9577f..139b2ac93 100644
--- a/processors/networks/cotag_network.py
+++ b/processors/networks/cotag_network.py
@@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset):
"default": True,
"help": "Convert tags to lowercase",
"tooltip": "Merges tags with varying cases"
+ },
+ "ignore-tags": {
+ "type": UserInput.OPTION_TEXT,
+ "default": "",
+ "help": "Tags to ignore",
+ "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' "
+ "character."
}
}
@@ -72,6 +79,7 @@ def get_processor_pipeline(self):
"split-comma": True,
"categorise": True,
"allow-loops": False,
+ "ignore-nodes": self.parameters.get("ignore-tags", ""),
"to-lowercase": self.parameters.get("to-lowercase", True)
}
}
diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py
new file mode 100644
index 000000000..4267c9650
--- /dev/null
+++ b/processors/networks/image-network.py
@@ -0,0 +1,217 @@
+"""
+Make a bipartite Image-Item network
+"""
+import json
+
+from backend.lib.processor import BasicProcessor
+from common.lib.helpers import hash_file
+
+import networkx as nx
+
+__author__ = "Stijn Peeters"
+__credits__ = ["Stijn Peeters"]
+__maintainer__ = "Stijn Peeters"
+__email__ = "4cat@oilab.eu"
+
+from common.lib.exceptions import ProcessorInterruptedException
+from common.lib.user_input import UserInput
+
+
+class ImageGrapher(BasicProcessor):
+ """
+ Image network
+
+ Creates a bipartite network of images and some attribute of the dataset the
+ images were sourced from
+ """
+ type = "image-bipartite-network" # job type ID
+ category = "Networks"
+ title = "Bipartite image-item network" # title displayed in UI
+ description = ("Create a GEXF network file with a bipartite network of "
+ "images and some data field (e.g. author) of the dataset "
+ "the images were sourced from. Suitable for use with Gephi's "
+ "'Image Preview' plugin.")
+ extension = "gexf" # extension of result file, used internally and in UI
+
+ options = {}
+
+ @classmethod
+ def get_options(cls, parent_dataset=None, user=None):
+ root_dataset = None
+ columns = None
+ if parent_dataset:
+ for parent in reversed(parent_dataset.get_genealogy()):
+ if parent.get_columns():
+ root_dataset = parent
+ break
+            columns = root_dataset.get_columns() if root_dataset else None
+
+ return {
+ "column": {
+ "help": "Dataset field",
+ "type": UserInput.OPTION_TEXT,
+ "default": "id"
+ },
+ "image-value": {
+ "help": "Image node label",
+ "type": UserInput.OPTION_CHOICE,
+ "options": {
+ "filename": "Image file name",
+ "url": "Image URL"
+ },
+ "tooltip": "The image node label will have this value. Depending on the network visualisation software "
+ "you use, one or the other is required to display the images as nodes."
+ },
+ "deduplicate": {
+ "type": UserInput.OPTION_CHOICE,
+ "help": "Merge images",
+ "tooltip": "Similar images can be merged into a single node, represented by the first image of the set "
+ "that was encountered.",
+ "options": {
+ "none": "Do not merge",
+ "file-hash": "File hash (files need to be byte-by-byte duplicates)",
+ "colorhash": "Colour hash (good at colours, worse at shapes)",
+ "phash": "Perceptual hash (decent at colours and shapes)",
+ "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)",
+ "dhash": "Difference hash (similar to average hash, better at photos and art)"
+ }
+ },
+ **({
+ "column": {
+ "help": "Dataset field",
+ "type": UserInput.OPTION_CHOICE,
+ "options": {
+ column: column
+ for column in columns}
+ }
+ } if columns else {})
+ }
+
+ @classmethod
+ def is_compatible_with(cls, module=None, user=None):
+ """
+ Allow processor to run on images downloaded from a dataset
+
+ :param module: Module to determine compatibility with
+ """
+ return module.type.startswith("image-downloader")
+
+ def process(self):
+ column = self.parameters.get("column")
+ hash_type = self.parameters.get("deduplicate")
+ filename_filter = [".metadata.json"] if hash_type == "none" else []
+ metadata = None
+ hashed = 0
+
+ # some maps to make sure we use the right value in the right place
+ # url or filename, original image or duplicate, etc
+ file_hash_map = {}
+ hash_file_map = {}
+ seen_hashes = set()
+ id_file_map = {}
+
+ for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter):
+ if file.name == ".metadata.json":
+ with file.open() as infile:
+ try:
+ metadata = json.load(infile)
+ file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()}
+ except json.JSONDecodeError:
+ pass
+ else:
+ try:
+ hashed += 1
+ if hashed % 100 == 0:
+ self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)")
+ self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5)
+ file_hash = hash_file(file, hash_type)
+ file_hash_map[file.name] = file_hash
+ if file_hash not in hash_file_map:
+ hash_file_map[file_hash] = file.name
+
+ except (FileNotFoundError, ValueError) as e:
+ continue
+
+ if not metadata:
+ return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only "
+ "be run on sets of images sourced from another 4CAT dataset.")
+
+ file_url_map = {v["filename"]: u for u, v in metadata.items()}
+ for url, details in metadata.items():
+ for item_id in details.get("post_ids", []):
+ if self.source_dataset.type.endswith("-telegram"):
+ # telegram has weird IDs
+ item_id = "-".join(details["filename"].split("-")[:-1]) + "-" + str(item_id)
+ id_file_map[item_id] = details["filename"]
+
+ root_dataset = None
+ for parent in reversed(self.dataset.get_genealogy()):
+ if parent.get_columns():
+ root_dataset = parent
+ break
+
+ if not root_dataset:
+ return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only "
+ "be run on sets of images sourced from another 4CAT dataset.")
+
+ network = nx.DiGraph()
+ processed = 0
+ for item in root_dataset.iterate_items():
+ progress = processed / root_dataset.num_rows
+ if hashed:
+ # if hashing was necessary, we approximate that as 50% of the work
+ progress = (progress * 0.5) + 0.5
+
+ self.dataset.update_progress(progress)
+ processed += 1
+ if processed % 100 == 0:
+ self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)")
+
+ if self.interrupted:
+ raise ProcessorInterruptedException()
+
+ if item.get("id") not in id_file_map:
+ continue
+
+ # from nodes are the dataset fields (e.g. 'body' or 'chat')
+ # to node names are filenames (optionally mapped to URLs later)
+ from_node = item.get(column)
+ from_node_id = f"{column}-{from_node}"
+
+ image_file = id_file_map[item.get("id")]
+ image_hash = file_hash_map.get(image_file)
+ if hash_type != "none" and image_hash in seen_hashes:
+ # if we're deduplicating and the image is already in the graph,
+ # merge the nodes (use the original node as the 'to node')
+ to_node = hash_file_map.get(image_hash)
+ if to_node and image_file != to_node:
+ self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - "
+ f"merging.")
+
+ else:
+ seen_hashes.add(image_hash)
+ to_node = image_file
+
+ if not to_node:
+ # image could not be hashed, probably invalid file
+ continue
+
+ if self.parameters.get("image-value") == "url":
+ to_node = file_url_map[to_node]
+
+ to_node_id = f"image-{to_node}"
+ if from_node_id not in network.nodes:
+ network.add_node(from_node_id, label=from_node, category=column)
+
+ if to_node_id not in network.nodes:
+ network.add_node(to_node_id, label=to_node, category="image", image=to_node)
+
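+ # weight edges by how often the value and the image co-occur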
+ edge = (from_node_id, to_node_id)
+ if edge not in network.edges():
+ network.add_edge(*edge, frequency=0)
+
+ network.edges[edge]["frequency"] += 1
+
+ self.dataset.update_status("Writing network file")
+ nx.write_gexf(network, self.dataset.get_results_path())
+ self.dataset.finish(len(network.nodes))
diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py
index 0f6045702..43ceffdf4 100644
--- a/processors/networks/two-column-network.py
+++ b/processors/networks/two-column-network.py
@@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor):
"default": False,
"help": "Convert values to lowercase",
"tooltip": "Merges values with varying cases"
+ },
+ "ignore-nodes": {
+ "type": UserInput.OPTION_TEXT,
+ "default": "",
+ "help": "Nodes to ignore",
+ "tooltip": "Separate with commas if you want to ignore multiple nodes"
}
}
@@ -145,6 +151,7 @@ def process(self):
allow_loops = self.parameters.get("allow-loops")
interval_type = self.parameters.get("interval")
to_lower = self.parameters.get("to-lowercase", False)
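+ # values listed in the ignore-nodes option are dropped from the network entirely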
+ ignorable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()]
processed = 0
@@ -193,6 +200,14 @@ def process(self):
values_a = [value.strip() for value_groups in values_a for value in value_groups.split(",")]
values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")]
+ if ignorable:
+ values_a = [v for v in values_a if v not in ignorable]
+ values_b = [v for v in values_b if v not in ignorable]
+
+ # only proceed if we actually have any edges left
+ if not values_a or not values_b:
+ continue
+
try:
interval = get_interval_descriptor(item, interval_type)
except ValueError as e:
diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index 17c350c86..1ee3b1990 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -226,6 +226,7 @@ def process(self):
The result is valid JSON, written in chunks.
"""
+ sentence_error = False
columns = self.parameters.get("columns")
if not columns:
self.dataset.update_status("No columns selected, aborting.", is_final=True)
@@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs):
# for russian we use a special purpose splitter with better
# performance
sentence_method = razdel.sentenize
- elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
- 'pickle' in lang]:
+ elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]:
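+ # punkt_tab has one subfolder per supported language, so if the language is not among the folder names we cannot split by sentence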
self.dataset.update_status(
f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")
sentence_method = dummy_function
+ sentence_error = True
else:
sentence_method = sent_tokenize
else:
@@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs):
with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile:
json.dump(metadata, outfile)
+ if sentence_error:
+ self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True)
+
# create zip of archive and delete temporary files and folder
self.write_archive_and_finish(staging_area)
diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py
index 99ff5199b..6394862e8 100644
--- a/processors/visualisation/download-telegram-images.py
+++ b/processors/visualisation/download-telegram-images.py
@@ -7,13 +7,14 @@
from pathlib import Path
+import telethon.errors
from telethon import TelegramClient
-from telethon.errors import TimedOutError
+from telethon.errors import TimedOutError, BadRequestError
from common.config_manager import config
from backend.lib.processor import BasicProcessor
from common.lib.exceptions import ProcessorInterruptedException
-from common.lib.helpers import UserInput
+from common.lib.helpers import UserInput, timify_long
from common.lib.dataset import DataSet
from processors.visualisation.download_images import ImageDownloader
@@ -194,6 +195,13 @@ async def get_images(self):
if self.interrupted:
raise ProcessorInterruptedException("Interrupted while downloading images")
+ if not message:
+ # message no longer exists
+ self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it "
+ f"may have been deleted)")
+ self.flawless = False
+ break
+
success = False
try:
# it's actually unclear if images are always jpegs, but this
@@ -216,13 +224,27 @@ async def get_images(self):
self.dataset.log(f"Could not download image for message {msg_id} ({e})")
self.flawless = False
- media_done += 1
- self.metadata[filename] = {
- "filename": filename,
- "success": success,
- "from_dataset": self.source_dataset.key,
- "post_ids": [msg_id]
- }
+ finally:
+ media_done += 1
+ self.metadata[filename] = {
+ "filename": filename,
+ "success": success,
+ "from_dataset": self.source_dataset.key,
+ "post_ids": [msg_id]
+ }
+
+ except BadRequestError as e:
+ self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})")
+ self.flawless = False
+
+ except telethon.errors.FloodError as e:
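+ # Telegram flood errors usually indicate how many seconds to wait before retrying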
+ later = "later"
+ if hasattr(e, "seconds"):
+ later = f"in {timify_long(e.seconds)}"
+ self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); "
+ f"halting download process. Try again {later}.", is_final=True)
+ self.flawless = False
+ break
except ValueError as e:
self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})")
diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py
index ef6d44231..aa05173ce 100644
--- a/processors/visualisation/download-telegram-videos.py
+++ b/processors/visualisation/download-telegram-videos.py
@@ -8,12 +8,13 @@
from pathlib import Path
from telethon import TelegramClient
+from telethon.errors import FloodError, BadRequestError
from common.config_manager import config
from backend.lib.processor import BasicProcessor
from common.lib.exceptions import ProcessorInterruptedException
from processors.visualisation.download_videos import VideoDownloaderPlus
-from common.lib.helpers import UserInput
+from common.lib.helpers import UserInput, timify_long
from common.lib.dataset import DataSet
__author__ = "Stijn Peeters"
@@ -197,7 +198,7 @@ async def get_videos(self):
msg_id = message.id
success = True
- except (AttributeError, RuntimeError, ValueError, TypeError) as e:
+ except (AttributeError, RuntimeError, ValueError, TypeError, BadRequestError) as e:
filename = f"{entity}-index-{media_done}"
msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}"
self.dataset.log(f"Could not download video for message {msg_id} ({e})")
@@ -210,6 +211,15 @@ async def get_videos(self):
"from_dataset": self.source_dataset.key,
"post_ids": [msg_id]
}
+
+ except FloodError as e:
+ later = "later"
+ if hasattr(e, "seconds"):
+ later = f"in {timify_long(e.seconds)}"
+ self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); "
+ f"halting download process. Try again {later}.", is_final=True)
+ self.flawless = False
+ break
except ValueError as e:
self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})")
diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py
index c02b53bf7..3854e9653 100644
--- a/processors/visualisation/download_tiktok.py
+++ b/processors/visualisation/download_tiktok.py
@@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor):
"options": {
"thumbnail": "Video Thumbnail",
"music": "Music Thumbnail",
+ "author_avatar": "User avatar"
},
"default": "thumbnail"
}
@@ -217,6 +218,8 @@ def process(self):
url_column = "thumbnail_url"
elif self.parameters.get("thumb_type") == "music":
url_column = "music_thumbnail"
+ elif self.parameters.get("thumb_type") == "author_avatar":
+ url_column = "author_avatar"
else:
self.dataset.update_status("No image column selected.", is_final=True)
self.dataset.finish(0)
diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py
index 2b385ffe7..d1d7bd67c 100644
--- a/processors/visualisation/download_videos.py
+++ b/processors/visualisation/download_videos.py
@@ -3,6 +3,7 @@
First attempt to download via request, but if that fails use yt-dlp
"""
+import os
import json
import re
import time
@@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie
f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT")
# Size unknown
elif not self.config.get("video-downloader.allow-unknown-size", False):
- FilesizeException("Video size unknown; not allowed to download per 4CAT settings")
+ raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings")
# Download video
self.dataset.update_status(
"Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url))
with open(results_path.joinpath(save_location), "wb") as f:
- for chunk in response.iter_content(chunk_size=1024 * 1024):
- if chunk:
- f.write(chunk)
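+ # stream the response in 1 MB chunks so the download can be aborted (and the partial file removed) as soon as it exceeds the configured maximum size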
+ try:
+ for chunk in response.iter_content(chunk_size=1024 * 1024):
+ if max_video_size != 0 and f.tell() > (max_video_size * 1000000):
+ # File size too large; stop download and remove file
+ os.remove(f.name)
+ raise FilesizeException("Video size larger than maximum allowed per 4CAT")
+ if chunk:
+ f.write(chunk)
+ except requests.exceptions.ChunkedEncodingError as e:
+ raise FailedDownload(f"Failed to complete download: {e}")
# Return filename to add to metadata
return save_location.name
diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py
index 64b0c4f34..ec95f84f9 100644
--- a/processors/visualisation/video_frames.py
+++ b/processors/visualisation/video_frames.py
@@ -94,7 +94,7 @@ def process(self):
processed_videos = 0
self.dataset.update_status("Extracting video frames")
- for path in self.iterate_archive_contents(self.source_file, staging_area):
+ for i, path in enumerate(self.iterate_archive_contents(self.source_file, staging_area)):
if self.interrupted:
raise ProcessorInterruptedException("Interrupted while determining image wall order")
@@ -138,17 +138,21 @@ def process(self):
outfile.write(ffmpeg_error)
if result.returncode != 0:
- error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode))
- self.dataset.log(error)
+ self.dataset.update_status(f"Unable to extract frames from video {vid_name} (see logs for details)")
+ self.dataset.log('ffmpeg exited with return code %s for video %s: %s' % (str(result.returncode), vid_name, "\n".join(ffmpeg_error.split('\n')[-2:]) if ffmpeg_error else ''))
+ else:
+ processed_videos += 1
+ self.dataset.update_status("Created frames for %i of %i videos" % (processed_videos, total_possible_videos))
- processed_videos += 1
- self.dataset.update_status(
- "Created frames for %i of %i videos" % (processed_videos, total_possible_videos))
- self.dataset.update_progress(processed_videos / total_possible_videos)
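+ # base progress on the number of videos seen so far, not only those that yielded frames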
+ self.dataset.update_progress(i / total_possible_videos)
# Finish up
# We've created a directory and folder structure here as opposed to a single folder with single files as
# expected by self.write_archive_and_finish() so we use make_archive instead
+ if not processed_videos:
+ self.dataset.finish_with_error("Unable to extract frames from any videos")
+ return
+
from shutil import make_archive
make_archive(self.dataset.get_results_path().with_suffix(''), "zip", output_directory)
diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py
index ff1222bc1..aad1baf69 100644
--- a/processors/visualisation/video_hasher.py
+++ b/processors/visualisation/video_hasher.py
@@ -183,8 +183,9 @@ def process(self):
self.dataset.log('Frames per seconds: %f' % frame_interval)
# Prepare staging area for videos and video tracking
+ # VideoHash creates various temporary files that are not always cleaned up on
+ # error, so keep a separate output directory for the results we want to keep
staging_area = self.dataset.get_staging_area()
- self.dataset.log('Staging directory location: %s' % staging_area)
+ output_dir = self.dataset.get_staging_area()
video_hashes = {}
video_metadata = None
@@ -215,16 +216,17 @@ def process(self):
self.dataset.update_status("FFmpeg software not found. Please contact 4CAT maintainers.", is_final=True)
self.dataset.finish(0)
return
- except FileNotFoundError as e:
- self.dataset.update_status(f"Unable to find file {str(path)}")
+ except FileNotFoundError:
+ self.dataset.update_status(f"Unable to find file {path.name}")
continue
except FFmpegFailedToExtractFrames as e:
- self.dataset.update_status(f"Unable to extract frame for {str(path)}: {e}")
+ self.dataset.update_status(f"Unable to extract frame for {path.name} (see log for details)")
+ self.dataset.log(f"Unable to extract frame for {str(path)}: {e}")
continue
video_hashes[path.name] = {'videohash': videohash}
- shutil.copy(videohash.collage_path, staging_area.joinpath(path.stem + '.jpg'))
+ shutil.copy(videohash.collage_path, output_dir.joinpath(path.stem + '.jpg'))
video_hashes[path.name]['video_collage_filename'] = path.stem + '.jpg'
processed_videos += 1
@@ -233,6 +235,10 @@ def process(self):
self.dataset.update_progress(processed_videos / total_possible_videos)
videohash.delete_storage_path()
+ if processed_videos == 0:
+ self.dataset.finish_with_error("Unable to create video hashes for any videos")
+ return
+
# Write hash file
# This file is held here and then copied as its own dataset via VideoHasherTwo
num_posts = 0
@@ -240,7 +246,7 @@ def process(self):
if video_metadata is None:
# Grab the metadata directly, if it exists but was skipped (e.g., not found prior to max_videos)
try:
- metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area)
+ metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, output_dir)
except FileNotFoundError:
metadata_path = None
if metadata_path:
@@ -293,7 +299,7 @@ def process(self):
num_posts += 1
writer = None
- with staging_area.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile:
+ with output_dir.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile:
for row in rows:
if not writer:
writer = csv.DictWriter(outfile, fieldnames=row.keys())
@@ -303,7 +309,7 @@ def process(self):
# Finish up
self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages')
- self.write_archive_and_finish(staging_area)
+ self.write_archive_and_finish(output_dir, num_items=processed_videos)
class VideoHashNetwork(BasicProcessor):
"""
diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py
index 634e8c49d..5140baa01 100644
--- a/processors/visualisation/video_scene_identifier.py
+++ b/processors/visualisation/video_scene_identifier.py
@@ -252,8 +252,9 @@ def process(self):
if video_data.get('success'):
files = video_data.get('files') if 'files' in video_data else [{"filename": video_data.get("filename"), "success":True}]
for file in files:
- if not file.get("success"):
+ if not file.get("success") or file.get("filename") not in collected_scenes:
continue
+
# List types are not super fun for CSV
if 'post_ids' in video_data:
video_data['post_ids'] = ','.join([str(i) for i in video_data['post_ids']])
diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py
index f668e6f5e..3c73e57f8 100644
--- a/processors/visualisation/video_timelines.py
+++ b/processors/visualisation/video_timelines.py
@@ -117,6 +117,9 @@ def process(self):
if previous_video is not None or not looping:
# draw the video filename/label on top of the rendered
# frame thumbnails
+ if not previous_video:
+ # This likely means no frames were found for the video and this processor should not have run
+ continue
video_label = labels.get(previous_video, previous_video)
footersize = (fontsize * (len(video_label) + 2) * 0.5925, fontsize * 2)
footer_shape = SVG(insert=(0, base_height - footersize[1]), size=footersize)
@@ -165,6 +168,10 @@ def process(self):
timeline.add(frame_element)
timeline_widths[video] += frame_width
+ if not timeline_widths:
+ self.dataset.finish_with_error("No video frames found")
+ return
+
# now we know all dimensions we can instantiate the canvas too
canvas_width = max(timeline_widths.values())
fontsize = 12
@@ -207,7 +214,7 @@ def get_video_labels(self, metadata):
labels[filename] = filename
for dataset, urls in mapping_dataset.items():
- dataset = DataSet(key=dataset, db=self.db).nearest("*-search")
+ dataset = DataSet(key=dataset, db=self.db, modules=self.modules).nearest("*-search")
# determine appropriate label
# is this the right place? should it be in the datasource?
diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py
index 0dfe2d408..0a1f235e0 100644
--- a/processors/visualisation/word-trees.py
+++ b/processors/visualisation/word-trees.py
@@ -212,6 +212,12 @@ def process(self):
if processed % 500 == 0:
self.dataset.update_status("Processing and tokenising post %i" % processed)
body = post.get(column)
+
+ if body is None:
+ # skip missing values; converting them would yield the non-empty string "None"
+ continue
+
+ try:
+ body = str(body)
+ except TypeError:
+ continue
+
if not body:
continue
diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py
index 6cc91eba1..d0e74a377 100644
--- a/webtool/lib/helpers.py
+++ b/webtool/lib/helpers.py
@@ -96,30 +96,6 @@ def error(code=200, **kwargs):
return response
-def string_to_timestamp(string):
- """
- Convert dd-mm-yyyy date to unix time
-
- :param string: Date string to parse
- :return: The unix time, or 0 if value could not be parsed
- """
- bits = string.split("-")
- if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string):
- bits = list(reversed(bits))
-
- if len(bits) != 3:
- return 0
-
- try:
- day = int(bits[0])
- month = int(bits[1])
- year = int(bits[2])
- date = datetime.datetime(year, month, day)
- except ValueError:
- return 0
-
- return int(date.timestamp())
-
def pad_interval(intervals, first_interval=None, last_interval=None):
"""
Pad an interval so all intermediate intervals are filled
@@ -299,25 +275,6 @@ def generate_css_colours(force=False):
)
-def get_preview(query):
- """
- Generate a data preview of 25 rows of a results csv
-
- :param query
- :return list:
- """
- preview = []
- with query.get_results_path().open(encoding="utf-8") as resultfile:
- posts = csv.DictReader(resultfile)
- i = 0
- for post in posts:
- i += 1
- preview.append(post)
- if i > 25:
- break
- return preview
-
-
def format_chan_post(post):
"""
Format a plain-text imageboard post post for HTML display
diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index 8d1db0e2c..d3ba68314 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -1,5 +1,7 @@
import urllib.parse
import datetime
+from math import floor
+
import markdown
import json
import ural
@@ -123,7 +125,7 @@ def _jinja2_filter_httpquery(data):
return ""
@app.template_filter("add_ahref")
-def _jinja2_filter_add_ahref(content):
+def _jinja2_filter_add_ahref(content, ellipsiate=0):
"""
Add HTML links to text
@@ -138,7 +140,11 @@ def _jinja2_filter_add_ahref(content):
return content
for link in set(ural.urls_from_text(str(content))):
- content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link}')
+ if ellipsiate > 0:
+ link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]")
+ else:
+ link_text = link
+ content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link_text}')
return content
@@ -203,6 +209,7 @@ def _jinja2_filter_extension_to_noun(ext):
else:
return "item"
+
@app.template_filter('social_mediafy')
def _jinja2_filter_social_mediafy(body, datasource=""):
# Adds links to a text body with hashtags, @-mentions, and URLs
@@ -239,6 +246,176 @@ def _jinja2_filter_social_mediafy(body, datasource=""):
}
}
+
+@app.template_filter("ellipsiate")
+def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"):
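+ # Shorten text to roughly `length` characters, appending `ellipsis_str` at the end
+ # or, when inside=True, replacing the middle; for URLs the domain is kept intact so
+ # shortened links remain recognisable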
+ if len(text) <= length:
+ return text
+
+ elif not inside:
+ return text[:length] + ellipsis_str
+
+ else:
+ # two cases: URLs and normal text
+ # for URLs, try to only ellipsiate after the domain name
+ # this makes the URLs easier to read when shortened
+ if ural.is_url(text):
+ pre_part = "/".join(text.split("/")[:3])
+ if len(pre_part) < length - 6: # kind of arbitrary
+ before = len(pre_part) + 1
+ else:
+ before = floor(length / 2)
+ else:
+ before = floor(length / 2)
+
+ after = len(text) - before
+ return text[:before] + ellipsis_str + text[after:]
+
+@app.template_filter('4chan_image')
+def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5):
+
+ plebs_boards = ["adv","f","hr","mlpol","mo","o","pol","s4s","sp","tg","trv","tv","x"]
+ archivedmoe_boards = ["3","a","aco","adv","an","asp","b","bant","biz","c","can","cgl","ck","cm","co","cock","con","d","diy","e","f","fa","fap","fit","fitlit","g","gd","gif","h","hc","his","hm","hr","i","ic","int","jp","k","lgbt","lit","m","mlp","mlpol","mo","mtv","mu","n","news","o","out","outsoc","p","po","pol","pw","q","qa","qb","qst","r","r9k","s","s4s","sci","soc","sp","spa","t","tg","toy","trash","trv","tv","u","v","vg","vint","vip","vm","vmg","vp","vr","vrpg","vst","vt","w","wg","wsg","wsr","x","xs","y"]
+
+ headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
+
+ img_link = None
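+ # 4chan filenames are millisecond timestamps; build the archive thumbnail path as
+ # <first four digits>/<next two digits>/<timestamp>s.<extension>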
+ thumb_link = image_4chan.split(".")
+ thumb_link = thumb_link[0][:4] + "/" + thumb_link[0][4:6] + "/" + thumb_link[0] + "s." + thumb_link[1]
+
+ # If the board is archived by 4plebs, check that site first
+ if board in plebs_boards:
+
+ # First we're going to try to get the image link through the 4plebs API.
+ api_url = "https://archive.4plebs.org/_/api/chan/post/?board=%s&num=%s" % (board, post_id)
+ try:
+ api_response = requests.get(api_url, headers=headers)
+ except requests.RequestException:
+ api_response = None
+
+ if api_response is not None and api_response.status_code == 200:
+ try:
+ api_json = json.loads(api_response.content)
+ img_link = api_json.get("media", {}).get("thumb_link", "")
+ except json.JSONDecodeError:
+ pass
+
+ if img_link:
+ return img_link
+
+ # If that doesn't work, we can check whether we can retrieve the image directly.
+ # 4plebs has a back-referral system so that some filenames are translated.
+ # This means direct linking won't work for every image without API retrieval.
+ # So only show if we get a 200 status code.
+ img_page = requests.get("https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link), headers=headers)
+ if img_page.status_code == 200:
+ return "https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link)
+
+ # If the board is archived by archivedmoe, we can also check this resource
+ if board in archivedmoe_boards:
+ archivedmoe_url = "https://archived.moe/files/%s/thumb/%s" % (board, thumb_link)
+ img_page = requests.get(archivedmoe_url, headers=headers)
+ if img_page.status_code == 200:
+ return archivedmoe_url
+
+ # If we couldn't retrieve the thumbnail yet, then we'll just give a search link
+ # and display it as a hidden image.
+ image_md5 = image_md5.replace("/", "_")
+ if board in plebs_boards:
+ return "retrieve:http://archive.4plebs.org/_/search/image/" + image_md5
+ # Archivedmoe as a last resort - has a lot of boards
+ return "retrieve:https://archived.moe/_/search/image/" + image_md5
+
+
+
+@app.template_filter('post_field')
+def _jinja2_filter_post_field(field, post):
+ # Extracts string values between {{ two curly brackets }} and uses that
+ # as a dictionary key for the given dict. It then returns the corresponding value.
+ # Mainly used in the Explorer.
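+ # For example, "{{ author.username|lower }}: {{ body[:100] }}" is filled in with the
+ # (filtered, optionally sliced) values of those fields in the given post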
+
+ matches = False
+ formatted_field = field
+
+ field = str(field)
+
+ for key in re.findall(r"\{\{(.*?)\}\}", field):
+
+ original_key = key
+
+ # Remove possible slice strings so we get the original key
+ string_slice = None
+ if "[" in original_key and "]" in original_key:
+ string_slice = re.search(r"\[(.*?)\]", original_key)
+ if string_slice:
+ string_slice = string_slice.group(1)
+ key = key.replace("[" + string_slice + "]", "")
+
+ # Also extract any other filters present (e.g. "|lower")
+ extra_filters = []
+ if "|" in key:
+ extra_filters = key.split("|")[1:]
+ key = key.split("|")[0]
+
+ # The keys can also be subfields (e.g. "author.username"),
+ # so we split and loop until we get the value.
+ keys = key.split(".")
+ val = post
+
+ for k in keys:
+ if isinstance(val, list):
+ val = val[0]
+ if isinstance(val, dict):
+ val = val.get(k.strip(), "")
+
+ # Return nothing if one of the fields is not found.
+ # We see 0 as a valid value - e.g. '0 retweets'.
+ if not val and val != 0:
+ return ""
+
+ # Support some basic string slicing
+ if string_slice:
+ field = field.replace("[" + string_slice + "]", "")
+ if ":" not in string_slice:
+ string_slice = slice(int(string_slice), int(string_slice) + 1)
+ else:
+ sl = string_slice.split(":")
+ if not sl[0] and sl[0] != "0":
+ sl1 = 0
+ sl2 = sl[1]
+ elif not sl[-1]:
+ sl1 = sl[0]
+ sl2 = len(val)
+ else:
+ sl1 = sl[0]
+ sl2 = sl[1]
+ string_slice = slice(int(sl1), int(sl2))
+
+ # Apply further filters, if present (e.g. lower)
+ for extra_filter in extra_filters:
+
+ extra_filter = extra_filter.strip()
+
+ # Parse possible parameters to pass to the filter; these are passed as
+ # positional arguments, with their values taken from the post.
+ params = ()
+ if "(" in extra_filter:
+ params = extra_filter.split("(")[-1][:-1].strip()
+ extra_filter = extra_filter.split("(")[0]
+ params = [p.strip() for p in params.split(",")]
+ params = [post[param] for param in params]
+
+ val = app.jinja_env.filters[extra_filter](val, *params)
+
+ if string_slice:
+ val = val[string_slice]
+
+ # Extract single list item
+ if isinstance(val, list) and len(val) == 1:
+ val = val[0]
+
+ formatted_field = formatted_field.replace("{{" + original_key + "}}", str(val))
+
+ return formatted_field
+
# Supported data sources
known_datasources = list(base_urls.keys())
if datasource not in known_datasources:
diff --git a/webtool/pages/faq.md b/webtool/pages/faq.md
deleted file mode 100644
index 866a9675f..000000000
--- a/webtool/pages/faq.md
+++ /dev/null
@@ -1,15 +0,0 @@
-## Frequently Asked Questions
-
-### How do I cite this tool in my research paper?
-
-Please refer to the [How to cite](/page/citing/) page.
-
-### Where can I find more information about this tool?
-
-Take a look at 4CAT's [website](https://4cat.nl) and its
-[GitHub repository](https://github.com/digitalmethodsinitiative/4cat)!
-
-### What query syntax can I use?
-
-Most standard search engine query syntax is supported. An
-[overview of syntax you can use](/page/query-syntax/) is available.
\ No newline at end of file
diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css
index afa10f2c6..21d03c206 100644
--- a/webtool/static/css/dataset-page.css
+++ b/webtool/static/css/dataset-page.css
@@ -630,6 +630,10 @@ body.csv-preview table td, body.csv-preview table th {
border: 1px solid var(--gray-light);
}
+body.csv-preview table tr:nth-child(2n+1) {
+ background: var(--contrast-bright);
+}
+
.child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe {
display: none;
}
diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html
index d8b41b1ee..fee8c0956 100644
--- a/webtool/templates/frontpage.html
+++ b/webtool/templates/frontpage.html
@@ -11,7 +11,8 @@
4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam.
- For more information, take a look at the 4CAT website.
+ For more information, take a look at the 4CAT website or the tool's
+ GitHub repository.
{% if __user_config("4cat.about_this_server") %}
{{ __user_config("4cat.about_this_server") }}
@@ -19,6 +20,7 @@
{% endif %}