diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index faaf6921a..000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-20.04 - tools: - python: "3.8" - -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/conf.py - -# Optionally build your docs in additional formats such as PDF -# formats: -# - pdf - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: docs/requirements.txt \ No newline at end of file diff --git a/backend/lib/processor.py b/backend/lib/processor.py index cada86171..52ad88cec 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -481,7 +481,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset, self.dataset.update_status("Parent dataset updated.") - def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True): + def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=[]): """ A generator that iterates through files in an archive @@ -498,6 +498,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T :param bool immediately_delete: Temporary files are removed after yielded; False keeps files until the staging_area is removed (usually during processor cleanup) + :param list filename_filter: Whitelist of filenames to iterate. + Other files will be ignored. If empty, do not ignore anything. :return: An iterator with a Path item for each file """ @@ -514,6 +516,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T archive_contents = sorted(archive_file.namelist()) for archived_file in archive_contents: + if filename_filter and archived_file not in filename_filter: + continue + info = archive_file.getinfo(archived_file) if info.is_dir(): continue diff --git a/backend/lib/search.py b/backend/lib/search.py index 15b3982d6..3258561e1 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -170,10 +170,22 @@ def import_from_file(self, path): if self.interrupted: raise WorkerInterruptedException() - # remove NUL bytes here because they trip up a lot of other - # things - # also include import metadata in item - item = json.loads(line.replace("\0", "")) + try: + # remove NUL bytes here because they trip up a lot of other + # things + # also include import metadata in item + item = json.loads(line.replace("\0", "")) + except json.JSONDecodeError: + warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may " + f"indicate that the file you uploaded was incomplete and you need to try uploading it " + f"again. 
The item will be ignored.") + + if warning not in import_warnings: + import_warnings[warning] = 0 + import_warnings[warning] += 1 + continue + + new_item = { **item["data"], "__import_meta": {k: v for k, v in item.items() if k != "data"} diff --git a/common/config_manager.py b/common/config_manager.py index 1b8d4052f..7760aae99 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None) if not is_json and value is not None: value = json.loads(value) - # TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale - elif default is not None: - value = default + # TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]: value = self.config_definition[setting_name]["default"] + elif value is None and default is not None: + value = default final_settings[setting_name] = value diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1ae6c06e5..4138ef4d0 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -529,11 +529,10 @@ "type": UserInput.OPTION_MULTI_SELECT, "help": "Pages in navigation", "options": { - "faq": "FAQ", "data-policy": "Data Policy", "citing": "How to cite", }, - "default": ["faq"], + "default": [], "tooltip": "These pages will be included in the navigation bar at the top of the interface." }, "ui.prefer_mapped_preview": { diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 148a2cd8d..1dcd3b27b 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,14 +1,16 @@ """ Miscellaneous helper functions for the 4CAT backend """ -import hashlib import subprocess +import imagehash +import hashlib import requests import hashlib import datetime import smtplib import fnmatch import socket +import shlex import copy import time import json @@ -24,6 +26,7 @@ from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version +from PIL import Image from common.lib.user_input import UserInput from common.config_manager import config @@ -111,10 +114,8 @@ def get_git_branch(): repository or git is not installed an empty string is returned. """ try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE) - os.chdir(cwd) + root_dir = str(config.get('PATH_ROOT').resolve()) + branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE) if branch.returncode != 0: raise ValueError() return branch.stdout.decode("utf-8").strip() @@ -144,7 +145,6 @@ def get_software_commit(worker=None): # try git command line within the 4CAT root folder # if it is a checked-out git repository, it will tell us the hash of # the currently checked-out commit - cwd = os.getcwd() # path has no Path.relative()... 
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent @@ -154,24 +154,24 @@ def get_software_commit(worker=None): # useful version info (since the extension is by definition not in the # main 4CAT repository) and will return an empty value if worker and worker.is_extension: - extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) - os.chdir(extension_dir) + working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve()) # check if we are in the extensions' own repo or 4CAT's - repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel" + repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): # not its own repository return ("", "") else: - os.chdir(config.get("PATH_ROOT")) + working_dir = str(config.get("PATH_ROOT").resolve()) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if show.returncode != 0: raise ValueError() commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] # now get the repository the commit belongs to, if we can - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() repository = origin.stdout.decode("utf-8").strip() @@ -181,9 +181,6 @@ def get_software_commit(worker=None): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: return ("", "") - finally: - os.chdir(cwd) - return (commit, repository) def get_software_version(): @@ -279,7 +276,6 @@ def find_extensions(): # collect metadata for extensions allowed_metadata_keys = ("name", "version", "url") - cwd = os.getcwd() for extension in extensions: extension_folder = extension_path.joinpath(extension) metadata_file = extension_folder.joinpath("metadata.json") @@ -296,8 +292,8 @@ def find_extensions(): if extensions[extension]["is_git"]: # try to get remote URL try: - os.chdir(extension_folder) - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + extension_root = str(extension_folder.resolve()) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() @@ -309,8 +305,6 @@ def find_extensions(): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: print(e) pass - finally: - os.chdir(cwd) return extensions, errors @@ -421,6 +415,37 @@ def andify(items): return ", ".join([str(item) for item in items]) + result +def hash_file(image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = 
hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + def get_yt_compatible_ids(yt_ids): """ :param yt_ids list, a list of strings diff --git a/common/lib/logger.py b/common/lib/logger.py index bbd30c444..ddffa2d72 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log' self.logger.setLevel(log_level) # this handler manages the text log files - handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) - handler.setLevel(log_level) - handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", - "%d-%m-%Y %H:%M:%S")) - self.logger.addHandler(handler) - - # the slack webhook has its own handler, and is only active if the - # webhook URL is set - try: - if config.get("logging.slack.webhook"): - slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) - slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) - self.logger.addHandler(slack_handler) - except Exception: - # we *may* need the logger before the database is in working order - if config.db is not None: - config.db.rollback() + if not self.logger.handlers: + handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) + handler.setLevel(log_level) + handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", + "%d-%m-%Y %H:%M:%S")) + self.logger.addHandler(handler) + + # the slack webhook has its own handler, and is only active if the + # webhook URL is set + try: + if config.get("logging.slack.webhook"): + slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) + slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) + self.logger.addHandler(slack_handler) + except Exception: + # we *may* need the logger before the database is in working order + if config.db is not None: + config.db.rollback() def log(self, message, level=logging.INFO, frame=None): """ diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index f7cb7590e..6bee534dd 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -50,16 +50,16 @@ def map_item(post): # from intercepted API response user_nickname = post["author"]["uniqueId"] user_fullname = post["author"]["nickname"] - user_id = post["author"]["id"] + user_thumbnail = post["author"].get("avatarThumb", "") elif post.get("author"): # from embedded JSON object user_nickname = post["author"] user_fullname = post["nickname"] - user_id = "" + user_thumbnail = "" else: user_nickname = "" user_fullname = "" - user_id = "" + user_thumbnail = "" # there are various thumbnail URLs, some of them expire later than # others. 
Try to get the highest-resolution one that hasn't expired @@ -84,13 +84,15 @@ def map_item(post): "author_followers": post.get("authorStats", {}).get("followerCount", ""), "author_likes": post.get("authorStats", {}).get("diggCount", ""), "author_videos": post.get("authorStats", {}).get("videoCount", ""), - "author_avatar": post.get("avatarThumb", ""), + "author_avatar": user_thumbnail, "body": post["desc"], "timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'), "unix_timestamp": int(post["createTime"]), "is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no", "is_ad": "yes" if post.get("isAd", False) else "no", "is_paid_partnership": "yes" if post.get("adAuthorization") else "no", + "is_sensitive": "yes" if post.get("maskType") == 3 else "no", + "is_photosensitive": "yes" if post.get("maskType") == 4 else "no", "music_name": post["music"]["title"], "music_id": post["music"]["id"], "music_url": post["music"].get("playUrl", ""), diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index efaffc21d..31471fcdc 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -58,7 +58,7 @@ def map_item(item): "post_url": item["share_info"]["url"].split(".html")[0], "post_body": item["share_info"]["title"], "comment_url": item["share_info"]["url"], - "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no", + "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no", "is_sticky": "yes" if bool(item["stick_position"]) else "no", "is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes", "language_guess": item["comment_language"] diff --git a/datasources/twitterv2/DESCRIPTION.md b/datasources/twitterv2/DESCRIPTION.md index 57f1f7a59..d138e6754 100644 --- a/datasources/twitterv2/DESCRIPTION.md +++ b/datasources/twitterv2/DESCRIPTION.md @@ -1,93 +1,88 @@ -Twitter data is gathered through the official [Twitter v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT -allows access to both the Standard and the Academic track. The Standard track is free for anyone to use, but only -allows to retrieve tweets up to seven days old. The Academic track allows a full-archive search of up to ten million -tweets per month (as of March 2022). For the Academic track, you need a valid Bearer token. You can request one -[here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you). +X/Twitter data is gathered through the official [X v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT can interface with X's Research API (sometimes +branded as the 'DSA API', referencing the EU's Digital Services Act). To retrieve posts via this API with 4CAT, you need +a valid Bearer token. Read more about this mode of access [here](https://developer.x.com/en/use-cases/do-research/academic-research). -Tweets are captured in batches at a speed of approximately 100,000 tweets per hour. 4CAT will warn you if your dataset +Posts are captured in batches at a speed of approximately 100,000 posts per hour. 4CAT will warn you if your dataset is expected to take more than 30 minutes to collect. It is often a good idea to start small (with very specific queries or narrow date ranges) and then only create a larger dataset if you are confident that it will be manageable and useful for your analysis. 
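To give a sense of what each of those batches involves, here is a minimal sketch of a single request to the full-archive search endpoint. The endpoint is the one used by `search_twitter.py` in this changeset; the query string, page size and pagination handling are illustrative assumptions rather than 4CAT's exact implementation.

```python
import requests

bearer_token = "YOUR_BEARER_TOKEN"  # hypothetical placeholder; 4CAT asks for this in the data source options

response = requests.get(
    "https://api.x.com/2/tweets/search/all",  # full-archive search endpoint
    params={
        "query": "4cat -is:retweet -is:nullcast",  # example query using operators described below
        "max_results": 100,  # posts per page; the API accepts up to 500
    },
    headers={"Authorization": f"Bearer {bearer_token}"},
)
page = response.json()
posts = page.get("data", [])
# the 'next_token' in the 'meta' object is passed back as the 'next_token'
# parameter to fetch the next batch, until the query or the post cap is exhausted
next_token = page.get("meta", {}).get("next_token")
```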
-If you hit your Twitter API quota while creating a dataset, the dataset will be finished with the tweets that have been +If you hit your X API quota while creating a dataset, the dataset will be finished with the posts that have been collected so far and a warning will be logged. ### Query syntax -Check the [API documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) +Check the [API documentation](https://developer.x.com/en/docs/x-api/tweets/search/integrate/build-a-query) for available query syntax and operators. This information is crucial to what data you collect. Important operators for -instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted tweets and retweets. Query syntax -is roughly the same as for Twitter's search interface, so you can try out most queries by entering them in the Twitter -app or website's search field and looking at the results. You can also test queries with -Twitter's [Query Builder](https://developer.twitter.com/apitools/query?query=). +instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted posts and reposts. Query syntax +is roughly the same as for X's search interface, so you can try out most queries by entering them in the X app or +website's search field and looking at the results. You can also test queries with +X's [Query Builder](https://developer.twitter.com/apitools/query?query=). ### Date ranges -By default, Twitter returns tweets posted within the past 30 days. If you want to go back further, you need to -explicitly set a date range. Note that Twitter does not like date ranges that end in the future, or start before -Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. +By default, X returns posts posted within the past 30 days. If you want to go back further, you need to +explicitly set a date range. Note that X does not like date ranges that end in the future, or start before +Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. Also note +that API access may come with certain limitations on how far a query may extend into history. ### Geo parameters -Twitter offers a number of ways -to [query by location/geo data](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location) -such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. This feature is only available for the Academic level; -you will receive a 400 error if using queries filtering by geographic information. +X offers a number of ways +to [query by location/geo data](https://developer.x.com/en/docs/tutorials/filtering-tweets-by-location) +such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. ### Retweets -A retweet from Twitter API v2 contains at maximum 140 characters from the original tweet. 4CAT therefore -gathers both the retweet and the original tweet and reformats the retweet text so it resembles a user's experience. +A repost from X API v2 contains at maximum 140 characters from the original post. 4CAT therefore +gathers both the repost and the original post and reformats the repost text so it resembles a user's experience. This also affects mentions, hashtags, and other data as only those contained in the first 140 characters are provided -by Twitter API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added -to the retweet for 4CAT analysis methods. 
*4CAT stores the data from Twitter API v2 as similar as possible to the format +by X API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added +to the repost for 4CAT analysis methods. *4CAT stores the data from X API v2 as similar as possible to the format in which it was received which you can obtain by downloading the ndjson file.* *Example 1* -[This retweet](https://twitter.com/tonino1630/status/1554618034299568128) returns the following data: +[This repost](https://x.com/tonino1630/status/1554618034299568128) returns the following data: - *author:* `tonino1630` -- * - text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` +- *text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` - *mentions:* `ChuckyFrao` - *hashtags:*
-While the original tweet will return (as a reference tweet) this data: +While the original post will return (as a reference post) this data: - *author:* `ChuckyFrao` -- * - text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` +- *text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` - *mentions:* `POTUS, usembassyve, StateSPEHA, StateDept, SecBlinken` - *hashtags:* `FreeAlexSaab, BringAlexHome, IntegridadTerritorial`
-As you can see, only the author of the original tweet is listed as a mention in the retweet. +As you can see, only the author of the original post is listed as a mention in the repost. *Example 2* -[This retweet](https://twitter.com/Macsmart31/status/1554618041459445760) returns the following: +[This repost](https://x.com/Macsmart31/status/1554618041459445760) returns the following: - *author:* `Macsmart31` -- * - text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` +- *text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` - *mentions:* `mickyd123us, tribelaw, HonorDecency`
-Compared with the original tweet referenced below: +Compared with the original post referenced below: - *author:* `mickyd123us` -- * - text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` +- *text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` - *mentions:* `tribelaw, HonorDecency`
-Because the mentioned users are in the first 140 characters of the original tweet, they are also listed as mentions in the retweet. - -The key difference here is that example one the retweet contains none of the hashtags or mentions from the original -tweet (they are beyond the first 140 characters) while the second retweet example does return mentions from the original -tweet. *Due to this discrepancy, for retweets all mentions and hashtags of the original tweet are considered as mentions -and hashtags of the retweet.* A user on Twitter will see all mentions and hashtags when viewing a retweet and the -retweet would be a part of any network around those mentions and hashtags. +Because the mentioned users are in the first 140 characters of the original post, they are also listed as mentions in +the repost. + +The key difference here is that in example one the repost contains none of the hashtags or mentions from the original +post (they are beyond the first 140 characters) while the second repost example does return mentions from the original +post. *Due to this discrepancy, for reposts all mentions and hashtags of the original post are considered as mentions +and hashtags of the repost.* A user on X will see all mentions and hashtags when viewing a repost and the +repost would be a part of any network around those mentions and hashtags. diff --git a/datasources/twitterv2/__init__.py b/datasources/twitterv2/__init__.py index 3335bc7c0..6aa80c7b3 100644 --- a/datasources/twitterv2/__init__.py +++ b/datasources/twitterv2/__init__.py @@ -9,4 +9,4 @@ # Internal identifier for this data source DATASOURCE = "twitterv2" -NAME = "Twitter API (v2) Search" \ No newline at end of file +NAME = "X/Twitter API (v2) Search" \ No newline at end of file diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 999680b6e..8b91d1eb2 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -1,5 +1,5 @@ """ -Twitter keyword search via the Twitter API v2 +X/Twitter keyword search via the X API v2 """ import requests import datetime @@ -17,13 +17,10 @@ class SearchWithTwitterAPIv2(Search): """ - Get Tweets via the Twitter API - - This only allows for historical search - use f.ex. TCAT for more advanced - queries. + Get Tweets via the X API """ type = "twitterv2-search" # job ID - title = "Twitter API (v2)" + title = "X/Twitter API (v2)" extension = "ndjson" is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -32,15 +29,15 @@ class SearchWithTwitterAPIv2(Search): import_issues = True references = [ - "[Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api)" + "[X/Twitter API documentation](https://developer.x.com/en/docs/x-api)" ] config = { "twitterv2-search.academic_api_key": { "type": UserInput.OPTION_TEXT, "default": "", - "help": "Academic API Key", - "tooltip": "An API key for the Twitter v2 Academic API. If " + "help": "Research API Key", + "tooltip": "An API key for the X/Twitter v2 Research API. If " "provided, the user will not need to enter their own " "key to retrieve tweets. Note that this API key should " "have access to the Full Archive Search endpoint." @@ -50,15 +47,15 @@ class SearchWithTwitterAPIv2(Search): "default": 0, "min": 0, "max": 10_000_000, - "help": "Max tweets per dataset", + "help": "Max posts per dataset", "tooltip": "4CAT will never retrieve more than this amount of " - "tweets per dataset. 
Enter '0' for unlimited tweets." + "posts per dataset. Enter '0' for unlimited posts." }, "twitterv2-search.id_lookup": { "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Allow lookup by ID", - "tooltip": "If enabled, allow users to enter a list of tweet IDs " + "tooltip": "If enabled, allow users to enter a list of post IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." } @@ -110,7 +107,7 @@ def get_items(self, query): } if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"): - endpoint = "https://api.twitter.com/2/tweets" + endpoint = "https://api.x.com/2/tweets" tweet_ids = self.parameters.get("query", []).split(',') @@ -126,7 +123,7 @@ def get_items(self, query): else: # Query to all or search - endpoint = "https://api.twitter.com/2/tweets/search/" + api_type + endpoint = "https://api.x.com/2/tweets/search/" + api_type queries = [self.parameters.get("query", "")] @@ -158,7 +155,7 @@ def get_items(self, query): while True: if self.interrupted: - raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API") + raise ProcessorInterruptedException("Interrupted while getting posts from the Twitter API") # there is a limit of one request per second, so stay on the safe side of this while self.previous_request == int(time.time()): @@ -188,18 +185,18 @@ def get_items(self, query): try: structured_response = api_response.json() if structured_response.get("title") == "UsageCapExceeded": - self.dataset.update_status("Hit the monthly tweet cap. You cannot capture more tweets " - "until your API quota resets. Dataset completed with tweets " + self.dataset.update_status("Hit the monthly post cap. You cannot capture more posts " + "until your API quota resets. Dataset completed with posts " "collected so far.", is_final=True) return except (json.JSONDecodeError, ValueError): - self.dataset.update_status("Hit Twitter rate limit, but could not figure out why. Halting " - "tweet collection.", is_final=True) + self.dataset.update_status("Hit X's rate limit, but could not figure out why. Halting " + "post collection.", is_final=True) return resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str) + self.dataset.update_status("Hit X's rate limit - waiting until %s to continue." % resume_at_str) while time.time() <= resume_at: if self.interrupted: raise ProcessorInterruptedException("Interrupted while waiting for rate limit to reset") @@ -211,10 +208,10 @@ def get_items(self, query): elif api_response.status_code == 403: try: structured_response = api_response.json() - self.dataset.update_status("'Forbidden' error from the Twitter API. Could not connect to Twitter API " + self.dataset.update_status("'Forbidden' error from the X API. Could not connect to X API " "with this API key. %s" % structured_response.get("detail", ""), is_final=True) except (json.JSONDecodeError, ValueError): - self.dataset.update_status("'Forbidden' error from the Twitter API. Your key may not have access to " + self.dataset.update_status("'Forbidden' error from the X API. 
Your key may not have access to " "the full-archive search endpoint.", is_final=True) finally: return @@ -224,7 +221,7 @@ def get_items(self, query): elif api_response.status_code in (502, 503, 504): resume_at = time.time() + 60 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % ( + self.dataset.update_status("X unavailable (status %i) - waiting until %s to continue." % ( api_response.status_code, resume_at_str)) while time.time() <= resume_at: time.sleep(0.5) @@ -233,7 +230,7 @@ def get_items(self, query): # this usually means the query is too long or otherwise contains # a syntax error elif api_response.status_code == 400: - msg = "Response %i from the Twitter API; " % api_response.status_code + msg = "Response %i from the X API; " % api_response.status_code try: api_response = api_response.json() msg += api_response.get("title", "") @@ -247,19 +244,19 @@ def get_items(self, query): # invalid API key elif api_response.status_code == 401: - self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True) + self.dataset.update_status("Invalid API key - could not connect to X API", is_final=True) return # haven't seen one yet, but they probably exist elif api_response.status_code != 200: self.dataset.update_status( "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True) - self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % ( + self.log.warning("X API v2 responded with status code %i. Response body: %s" % ( api_response.status_code, api_response.text)) return elif not api_response: - self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True) + self.dataset.update_status("Could not connect to X. Cancelling.", is_final=True) return api_response = api_response.json() @@ -291,13 +288,13 @@ def get_items(self, query): if num_missing_objects > 50: # Large amount of missing objects; possible error with Twitter API self.import_issues = False - error_report.append('%i missing objects received following tweet number %i. Possible issue with Twitter API.' % (num_missing_objects, tweets)) + error_report.append('%i missing objects received following post number %i. Possible issue with X API.' 
% (num_missing_objects, tweets)) error_report.append('Missing objects collected: ' + ', '.join(['%s: %s' % (k, len(v)) for k, v in missing_objects.items()])) # Warn if new missing object is recorded (for developers to handle) expected_error_types = ['user', 'media', 'poll', 'tweet', 'place'] if any(key not in expected_error_types for key in missing_objects.keys()): - self.log.warning("Twitter API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) + self.log.warning("X API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) # Loop through and collect tweets for tweet in api_response.get("data", []): @@ -312,7 +309,7 @@ def get_items(self, query): tweets += 1 if tweets % 500 == 0: - self.dataset.update_status("Received %s of ~%s tweets from the Twitter API" % ("{:,}".format(tweets), expected_tweets)) + self.dataset.update_status("Received %s of ~%s tweets from the X API" % ("{:,}".format(tweets), expected_tweets)) if num_expected_tweets is not None: self.dataset.update_progress(tweets / num_expected_tweets) @@ -474,21 +471,19 @@ def get_options(cls, parent_dataset=None, user=None): max_tweets = config.get("twitterv2-search.max_tweets", user=user) if have_api_key: - intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve " + intro_text = ("This data source uses the full-archive search endpoint of the X API (v2) to retrieve " "historic tweets that match a given query.") else: - intro_text = ("This data source uses either the Standard 7-day historical Search endpoint or the " - "full-archive search endpoint of the Twitter API, v2. To use the latter, you must have " - "access to the Academic Research track of the Twitter API. In either case, you will need to " - "provide a valid [bearer " - "token](https://developer.twitter.com/en/docs/authentication/oauth-2-0). The bearer token " - "**will be sent to the 4CAT server**, where it will be deleted after data collection has " - "started. Note that any tweets retrieved with 4CAT will count towards your monthly Tweet " - "retrieval cap.") - - intro_text += ("\n\nPlease refer to the [Twitter API documentation](" - "https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " + intro_text = ("This data source uses the full-archive search endpoint of the X/Twitter API, v2. To use the " + "it, you must have access to the Research track of the X API. You will need to provide a " + "valid [bearer token](https://developer.x.com/en/docs/authentication/oauth-2-0). The " + "bearer token **will be sent to the 4CAT server**, where it will be deleted after data " + "collection has started. Note that any posts retrieved with 4CAT will count towards your " + "monthly post retrieval cap.") + + intro_text += ("\n\nPlease refer to the [X API documentation](" + "https://developer.x.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " "documentation for more information about this API endpoint and the syntax you can use in your " "search query. 
Retweets are included by default; add `-is:retweet` to exclude them.") @@ -500,16 +495,18 @@ def get_options(cls, parent_dataset=None, user=None): } if not have_api_key: + # options.update({ + # "api_type": { + # "type": UserInput.OPTION_CHOICE, + # "help": "API track", + # "options": { + # "all": "Research API: Full-archive search", + # "recent": "Standard: Recent search (Tweets published in last 7 days)", + # }, + # "default": "all" + # } + # }) options.update({ - "api_type": { - "type": UserInput.OPTION_CHOICE, - "help": "API track", - "options": { - "all": "Academic: Full-archive search", - "recent": "Standard: Recent search (Tweets published in last 7 days)", - }, - "default": "all" - }, "api_bearer_token": { "type": UserInput.OPTION_TEXT, "sensitive": True, @@ -523,10 +520,10 @@ def get_options(cls, parent_dataset=None, user=None): "query_type": { "type": UserInput.OPTION_CHOICE, "help": "Query type", - "tooltip": "Note: Num of Tweets and Date fields ignored with 'Tweets by ID' lookup", + "tooltip": "Note: Num of posts and date fields are ignored with 'Posts by ID' lookup", "options": { "query": "Search query", - "id_lookup": "Tweets by ID (list IDs seperated by commas or one per line)", + "id_lookup": "Posts by ID (list IDs seperated by commas or one per line)", }, "default": "query" } @@ -539,7 +536,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "amount": { "type": UserInput.OPTION_TEXT, - "help": "Tweets to retrieve", + "help": "Posts to retrieve", "tooltip": "0 = unlimited (be careful!)" if not max_tweets else ("0 = maximum (%s)" % str(max_tweets)), "min": 0, "max": max_tweets if max_tweets else 10_000_000, @@ -550,7 +547,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "daterange-info": { "type": UserInput.OPTION_INFO, - "help": "By default, Twitter returns tweets up til 30 days ago. If you want to go back further, you " + "help": "By default, X returns posts up til 30 days ago. If you want to go back further, you " "need to explicitly set a date range." }, "daterange": { @@ -591,7 +588,7 @@ def validate_query(query, request, user): raise QueryParametersException("Please provide a valid bearer token.") if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup": - raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.") + raise QueryParametersException("X API queries cannot be longer than 1024 characters.") if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user): # reformat queries to be a comma-separated list with no wrapping @@ -630,7 +627,7 @@ def validate_query(query, request, user): # to dissuade users from running huge queries that will take forever # to process if params["query_type"] == "query" and (params.get("api_type") == "all" or have_api_key): - count_url = "https://api.twitter.com/2/tweets/counts/all" + count_url = "https://api.x.com/2/tweets/counts/all" count_params = { "granularity": "day", "query": params["query"], @@ -668,7 +665,7 @@ def validate_query(query, request, user): elif response.status_code == 401: raise QueryParametersException("Your bearer token seems to be invalid. Please make sure it is valid " - "for the Academic Track of the Twitter API.") + "for the Research track of the X API.") elif response.status_code == 400: raise QueryParametersException("Your query is invalid. 
Please make sure the date range does not " @@ -791,7 +788,7 @@ def map_item(item): "thread_id": item.get("conversation_id", item["id"]), "timestamp": tweet_time.strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(tweet_time.timestamp()), - 'link': "https://twitter.com/%s/status/%s" % (author_username, item.get('id')), + 'link': "https://x.com/%s/status/%s" % (author_username, item.get('id')), "subject": "", "body": item["text"], "author": author_username, diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index bd7b81289..f6c8bcc11 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod @@ -40,6 +40,11 @@ def process(self): This takes a CSV file as input and writes the same data as a JSON file """ self.dataset.update_status("Collecting dataset and all analyses") + primary_dataset = self.dataset.top_parent() + if not primary_dataset.is_finished(): + # This ought not happen as processors (i.e., this processor) should only be available for finished datasets + self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until dataset is finished to export.") + return results_path = self.dataset.get_staging_area() @@ -52,25 +57,26 @@ def process(self): try: dataset = DataSet(key=dataset_key, db=self.db) - # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
except DataSetException: - self.dataset.finish_with_error("Dataset not found.") - return + self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.") + failed_exports.append(dataset_key) + continue if not dataset.is_finished(): - self.dataset.finish_with_error("You cannot export unfinished datasets.") - return + self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.") + failed_exports.append(dataset_key) + continue # get metadata metadata = dataset.get_metadata() if metadata["num_rows"] == 0: - self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.") failed_exports.append(dataset_key) continue # get data data_file = dataset.get_results_path() if not data_file.exists(): - self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.") failed_exports.append(dataset_key) continue diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 860c0ddbe..461cdd54a 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None): return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) @staticmethod - def get_dataset_from_url(url, db): + def get_dataset_from_url(url, db, modules=None): """ Get dataset object based on dataset URL @@ -68,6 +68,7 @@ def get_dataset_from_url(url, db): :param str url: Dataset URL :param db: Database handler (to retrieve metadata) + :param modules: Modules handler (pass through to DataSet) :return DataSet: The dataset """ if not url: @@ -75,7 +76,7 @@ def get_dataset_from_url(url, db): source_url = ural.normalize_url(url) source_key = source_url.split("/")[-1] - return DataSet(key=source_key, db=db) + return DataSet(key=source_key, db=db, modules=modules) def process(self): """ @@ -96,7 +97,7 @@ def process(self): continue try: - source_dataset = self.get_dataset_from_url(source_dataset_url, self.db) + source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules) except DataSetException: return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform " f"merge.") diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py new file mode 100644 index 000000000..a8dd8763e --- /dev/null +++ b/processors/filtering/unique_images.py @@ -0,0 +1,113 @@ +""" +Filter by unique images +""" +import shutil +import json + +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.helpers import UserInput, hash_file + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + + +class UniqueImageFilter(BasicProcessor): + """ + Retain only unique images, by a user-defined metric + """ + type = "image-downloader-unique" # job type ID + category = "Visualisation" # category + title = "Filter for unique images" # title displayed in UI + description = "Only keeps one instance per image, using a choice of detection method." 
# description displayed in UI + extension = "zip" + + references = [ + "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", + "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)", + + ] + + options = { + "hash-type": { + "type": UserInput.OPTION_CHOICE, + "help": "Comparison method", + "default": "file-hash", + "options": { + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + } + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor on image archives + + :param module: Module to determine compatibility with + """ + return module.get_media_type() == "image" or module.type.startswith( + "image-downloader") or module.type == "video-frames" + + def process(self): + """ + Loop through images and only retain ones that have not been seen yet + + :return: + """ + seen_hashes = set() + hash_map = {} + metadata = None + dupes = 0 + processed = 0 + staging_area = self.dataset.get_staging_area() + + self.dataset.update_status("Processing images and looking for duplicates") + for image_file in self.iterate_archive_contents(self.source_file): + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while filtering for unique images") + + self.dataset.update_progress(processed / self.source_dataset.num_rows) + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + f"found {dupes:,} duplicate(s)") + processed += 1 + + if image_file.name == ".metadata.json": + with image_file.open() as infile: + metadata = json.load(infile) + continue + + image_hash = hash_file(image_file, self.parameters.get("hash-type")) + + if image_hash not in seen_hashes: + seen_hashes.add(image_hash) + shutil.copy2(image_file, staging_area) + hash_map[image_hash] = image_file.name + else: + self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping") + dupes += 1 + + new_metadata = {} + inverse_hashmap = {v: k for k, v in hash_map.items()} + for url, item in metadata.items(): + if item["filename"] in inverse_hashmap: + new_metadata[inverse_hashmap[item["filename"]]] = { + **item, + "hash": inverse_hashmap[item["filename"]], + "hash_type": self.parameters.get("hash-type") + } + + with staging_area.joinpath(".metadata.json").open("w") as outfile: + json.dump(new_metadata, outfile) + + self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True) + self.write_archive_and_finish(staging_area, len(hash_map), finish=True) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 022e96de5..26234a186 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -184,8 +184,8 @@ def process(self): # prepare data for annotation data_path = staging_area.joinpath("data.temp.ndjson") with 
data_path.open("w", newline="") as outfile: - for item in self.source_dataset.iterate_items(): - outfile.write(json.dumps({item.get("id"): item.get(textfield)}) + "\n") + for i, item in enumerate(self.source_dataset.iterate_items()): + outfile.write(json.dumps({item.get("id", str(i)): item.get(textfield)}) + "\n") path_to_files, path_to_results = dmi_service_manager.process_files(staging_area, [data_path.name, labels_path.name], @@ -238,15 +238,14 @@ def make_filename(id, prompt): self.dataset.update_status("Loading annotated data") with output_dir.joinpath("results.json").open() as infile: annotations = json.load(infile) - self.dataset.update_status("Writing results") with self.dataset.get_results_path().open("w") as outfile: writer = None - for item in self.source_dataset.iterate_items(): + for i, item in enumerate(self.source_dataset.iterate_items()): row = { - "id": item.get("id"), + "id": item.get("id", i), textfield: item.get(textfield), - "category": annotations[item.get("id")] + "category": annotations.get(item.get("id", str(i))) # str(i) because it is not recorded as an int in the annotations } if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 236e9577f..139b2ac93 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset): "default": True, "help": "Convert tags to lowercase", "tooltip": "Merges tags with varying cases" + }, + "ignore-tags": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Tags to ignore", + "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' " + "character." } } @@ -72,6 +79,7 @@ def get_processor_pipeline(self): "split-comma": True, "categorise": True, "allow-loops": False, + "ignore-nodes": self.parameters.get("ignore-tags", ""), "to-lowercase": self.parameters.get("to-lowercase", True) } } diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py new file mode 100644 index 000000000..4267c9650 --- /dev/null +++ b/processors/networks/image-network.py @@ -0,0 +1,217 @@ +""" +Make a bipartite Image-Item network +""" +import json + +from backend.lib.processor import BasicProcessor +from common.lib.helpers import hash_file + +import networkx as nx + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.user_input import UserInput + + +class ImageGrapher(BasicProcessor): + """ + Image network + + Creates a bipartite network of images and some attribute of the dataset the + images were sourced from + """ + type = "image-bipartite-network" # job type ID + category = "Networks" + title = "Bipartite image-item network" # title displayed in UI + description = ("Create a GEXF network file with a bipartite network of " + "images and some data field (e.g. author) of the dataset " + "the images were sourced from. 
Suitable for use with Gephi's " + "'Image Preview' plugin.") + extension = "gexf" # extension of result file, used internally and in UI + + options = {} + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + root_dataset = None + columns = None + if parent_dataset: + for parent in reversed(parent_dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + columns = root_dataset.get_columns() + + return { + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_TEXT, + "default": "id" + }, + "image-value": { + "help": "Image node label", + "type": UserInput.OPTION_CHOICE, + "options": { + "filename": "Image file name", + "url": "Image URL" + }, + "tooltip": "The image node label will have this value. Depending on the network visualisation software " + "you use, one or the other is required to display the images as nodes." + }, + "deduplicate": { + "type": UserInput.OPTION_CHOICE, + "help": "Merge images", + "tooltip": "Similar images can be merged into a single node, represented by the first image of the set " + "that was encountered.", + "options": { + "none": "Do not merge", + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + }, + **({ + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_CHOICE, + "options": { + column: column + for column in columns} + } + } if columns else {}) + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on images downloaded from a dataset + + :param module: Module to determine compatibility with + """ + return module.type.startswith("image-downloader") + + def process(self): + column = self.parameters.get("column") + hash_type = self.parameters.get("deduplicate") + filename_filter = [".metadata.json"] if hash_type == "none" else [] + metadata = None + hashed = 0 + + # some maps to make sure we use the right value in the right place + # url or filename, original image or duplicate, etc + file_hash_map = {} + hash_file_map = {} + seen_hashes = set() + id_file_map = {} + + for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter): + if file.name == ".metadata.json": + with file.open() as infile: + try: + metadata = json.load(infile) + file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()} + except json.JSONDecodeError: + pass + else: + try: + hashed += 1 + if hashed % 100 == 0: + self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)") + self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5) + file_hash = hash_file(file, hash_type) + file_hash_map[file.name] = file_hash + if file_hash not in hash_file_map: + hash_file_map[file_hash] = file.name + + except (FileNotFoundError, ValueError) as e: + continue + + if not metadata: + return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + file_url_map = {v["filename"]: u for u, v in metadata.items()} + for url, 
details in metadata.items(): + for item_id in details.get("post_ids", []): + if self.source_dataset.type.endswith("-telegram"): + # telegram has weird IDs + item_id = "-".join(details["filename"].split("-")[:-1]) + "-" + str(item_id) + id_file_map[item_id] = details["filename"] + + root_dataset = None + for parent in reversed(self.dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + + if not root_dataset: + return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + network = nx.DiGraph() + processed = 0 + for item in root_dataset.iterate_items(): + progress = processed / root_dataset.num_rows + if hashed: + # if hashing was necessary, we approximate that as 50% of the work + progress = (progress * 0.5) + 0.5 + + self.dataset.update_progress(progress) + processed += 1 + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") + + if self.interrupted: + raise ProcessorInterruptedException() + + if item.get("id") not in id_file_map: + continue + + # from nodes are the dataset fields (e.g. 'body' or 'chat') + # to node names are filenames (optionally mapped to URLs later) + from_node = item.get(column) + from_node_id = f"{column}-{from_node}" + + image_file = id_file_map[item.get("id")] + image_hash = file_hash_map.get(image_file) + if hash_type != "none" and image_hash in seen_hashes: + # if we're deduplicating and the image is already in the graph, + # merge the nodes (use the original node as the 'to node') + to_node = hash_file_map.get(image_hash) + if to_node and image_file != to_node: + self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " + f"merging.") + + else: + seen_hashes.add(image_hash) + to_node = image_file + + if not to_node: + # image could not be hashed, probably invalid file + continue + + if self.parameters.get("image-value") == "url": + to_node = file_url_map[to_node] + + to_node_id = f"image-{to_node}" + if from_node_id not in network.nodes: + network.add_node(from_node_id, label=from_node, category=column) + + if to_node_id not in network.nodes: + network.add_node(to_node_id, label=to_node, category="image", image=to_node) + + edge = (from_node_id, to_node_id) + if edge not in network.edges(): + network.add_edge(*edge, frequency=0) + + network.edges[edge]["frequency"] += 1 + + self.dataset.update_status("Writing network file") + nx.write_gexf(network, self.dataset.get_results_path()) + self.dataset.finish(len(network.nodes)) diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 0f6045702..43ceffdf4 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor): "default": False, "help": "Convert values to lowercase", "tooltip": "Merges values with varying cases" + }, + "ignore-nodes": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Nodes to ignore", + "tooltip": "Separate with commas if you want to ignore multiple nodes" } } @@ -145,6 +151,7 @@ def process(self): allow_loops = self.parameters.get("allow-loops") interval_type = self.parameters.get("interval") to_lower = self.parameters.get("to-lowercase", False) + ignoreable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()] processed = 0 @@ -193,6 +200,14 @@ def process(self): values_a = 
[value.strip() for value_groups in values_a for value in value_groups.split(",")] values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")] + if ignoreable: + values_a = [v for v in values_a if v not in ignoreable] + values_b = [v for v in values_b if v not in ignoreable] + + # only proceed if we actually have any edges left + if not values_a or not values_b: + continue + try: interval = get_interval_descriptor(item, interval_type) except ValueError as e: diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 17c350c86..1ee3b1990 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -226,6 +226,7 @@ def process(self): The result is valid JSON, written in chunks. """ + sentence_error = False columns = self.parameters.get("columns") if not columns: self.dataset.update_status("No columns selected, aborting.", is_final=True) @@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if - 'pickle' in lang]: + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") sentence_method = dummy_function + sentence_error = True else: sentence_method = sent_tokenize else: @@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs): with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile: json.dump(metadata, outfile) + if sentence_error: + self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True) + # create zip of archive and delete temporary files and folder self.write_archive_and_finish(staging_area) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199b..6394862e8 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -7,13 +7,14 @@ from pathlib import Path +import telethon.errors from telethon import TelegramClient -from telethon.errors import TimedOutError +from telethon.errors import TimedOutError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet from processors.visualisation.download_images import ImageDownloader @@ -194,6 +195,13 @@ async def get_images(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while downloading images") + if not message: + # message no longer exists + self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " + f"may have been deleted)") + self.flawless = False + break + success = False try: # it's actually unclear if images are always jpegs, but this @@ -216,13 +224,27 @@ async def get_images(self): self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False - media_done += 1 - self.metadata[filename] = { - "filename": filename, - "success": success, - 
"from_dataset": self.source_dataset.key, - "post_ids": [msg_id] - } + finally: + media_done += 1 + self.metadata[filename] = { + "filename": filename, + "success": success, + "from_dataset": self.source_dataset.key, + "post_ids": [msg_id] + } + + except BadRequestError as e: + self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") + self.flawless = False + + except telethon.errors.FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d44231..aa05173ce 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,12 +8,13 @@ from pathlib import Path from telethon import TelegramClient +from telethon.errors import FloodError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from processors.visualisation.download_videos import VideoDownloaderPlus -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet __author__ = "Stijn Peeters" @@ -197,7 +198,7 @@ async def get_videos(self): msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download video for message {msg_id} ({e})") @@ -210,6 +211,15 @@ async def get_videos(self): "from_dataset": self.source_dataset.key, "post_ids": [msg_id] } + + except FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. 
Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index c02b53bf7..3854e9653 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor): "options": { "thumbnail": "Video Thumbnail", "music": "Music Thumbnail", + "author_avatar": "User avatar" }, "default": "thumbnail" } @@ -217,6 +218,8 @@ def process(self): url_column = "thumbnail_url" elif self.parameters.get("thumb_type") == "music": url_column = "music_thumbnail" + elif self.parameters.get("thumb_type") == "author_avatar": + url_column = "author_avatar" else: self.dataset.update_status("No image column selected.", is_final=True) self.dataset.finish(0) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 2b385ffe7..d1d7bd67c 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -3,6 +3,7 @@ First attempt to download via request, but if that fails use yt-dlp """ +import os import json import re import time @@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT") # Size unknown elif not self.config.get("video-downloader.allow-unknown-size", False): - FilesizeException("Video size unknown; not allowed to download per 4CAT settings") + raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings") # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) with open(results_path.joinpath(save_location), "wb") as f: - for chunk in response.iter_content(chunk_size=1024 * 1024): - if chunk: - f.write(chunk) + try: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): + # File size too large; stop download and remove file + os.remove(f.name) + raise FilesizeException("Video size larger than maximum allowed per 4CAT") + if chunk: + f.write(chunk) + except requests.exceptions.ChunkedEncodingError as e: + raise FailedDownload(f"Failed to complete download: {e}") # Return filename to add to metadata return save_location.name diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 64b0c4f34..ec95f84f9 100644 --- a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -94,7 +94,7 @@ def process(self): processed_videos = 0 self.dataset.update_status("Extracting video frames") - for path in self.iterate_archive_contents(self.source_file, staging_area): + for i, path in enumerate(self.iterate_archive_contents(self.source_file, staging_area)): if self.interrupted: raise ProcessorInterruptedException("Interrupted while determining image wall order") @@ -138,17 +138,21 @@ def process(self): outfile.write(ffmpeg_error) if result.returncode != 0: - error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode)) - self.dataset.log(error) + self.dataset.update_status(f"Unable to extract frames from video {vid_name} (see logs for details)") + self.dataset.log('Error Return 
Code (%s) with video %s: %s' % (str(result.returncode), vid_name, "\n".join(ffmpeg_error.split('\n')[-2:]) if ffmpeg_error else '')) + else: + processed_videos += 1 + self.dataset.update_status("Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - processed_videos += 1 - self.dataset.update_status( - "Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - self.dataset.update_progress(processed_videos / total_possible_videos) + self.dataset.update_progress(i / total_possible_videos) # Finish up # We've created a directory and folder structure here as opposed to a single folder with single files as # expected by self.write_archive_and_finish() so we use make_archive instead + if not processed_videos: + self.dataset.finish_with_error("Unable to extract frames from any videos") + return + from shutil import make_archive make_archive(self.dataset.get_results_path().with_suffix(''), "zip", output_directory) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ff1222bc1..aad1baf69 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -183,8 +183,9 @@ def process(self): self.dataset.log('Frames per seconds: %f' % frame_interval) # Prepare staging area for videos and video tracking + # VideoHash creates various files that may not be cleaned up on error so we use an output directory staging_area = self.dataset.get_staging_area() - self.dataset.log('Staging directory location: %s' % staging_area) + output_dir = self.dataset.get_staging_area() video_hashes = {} video_metadata = None @@ -215,16 +216,17 @@ def process(self): self.dataset.update_status("FFmpeg software not found. Please contact 4CAT maintainers.", is_final=True) self.dataset.finish(0) return - except FileNotFoundError as e: - self.dataset.update_status(f"Unable to find file {str(path)}") + except FileNotFoundError: + self.dataset.update_status(f"Unable to find file {path.name}") continue except FFmpegFailedToExtractFrames as e: - self.dataset.update_status(f"Unable to extract frame for {str(path)}: {e}") + self.dataset.update_status(f"Unable to extract frame for {path.name} (see log for details)") + self.dataset.log(f"Unable to extract frame for {str(path)}: {e}") continue video_hashes[path.name] = {'videohash': videohash} - shutil.copy(videohash.collage_path, staging_area.joinpath(path.stem + '.jpg')) + shutil.copy(videohash.collage_path, output_dir.joinpath(path.stem + '.jpg')) video_hashes[path.name]['video_collage_filename'] = path.stem + '.jpg' processed_videos += 1 @@ -233,6 +235,10 @@ def process(self): self.dataset.update_progress(processed_videos / total_possible_videos) videohash.delete_storage_path() + if processed_videos == 0: + self.dataset.finish_with_error("Unable to create video hashes for any videos") + return + # Write hash file # This file is held here and then copied as its own dataset via VideoHasherTwo num_posts = 0 @@ -240,7 +246,7 @@ def process(self): if video_metadata is None: # Grab the metadata directly, if it exists but was skipped (e.g., not found prior to max_videos) try: - metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area) + metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, output_dir) except FileNotFoundError: metadata_path = None if metadata_path: @@ -293,7 +299,7 @@ def process(self): num_posts += 1 writer = None - with 
staging_area.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: + with output_dir.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: for row in rows: if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) @@ -303,7 +309,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(staging_area) + self.write_archive_and_finish(output_dir, num_items=processed_videos) class VideoHashNetwork(BasicProcessor): """ diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index 634e8c49d..5140baa01 100644 --- a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -252,8 +252,9 @@ def process(self): if video_data.get('success'): files = video_data.get('files') if 'files' in video_data else [{"filename": video_data.get("filename"), "success":True}] for file in files: - if not file.get("success"): + if not file.get("success") or file.get("filename") not in collected_scenes: continue + # List types are not super fun for CSV if 'post_ids' in video_data: video_data['post_ids'] = ','.join([str(i) for i in video_data['post_ids']]) diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index f668e6f5e..3c73e57f8 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -117,6 +117,9 @@ def process(self): if previous_video is not None or not looping: # draw the video filename/label on top of the rendered # frame thumbnails + if not previous_video: + # This likely means no frames were found for the video and this processor should not have run + continue video_label = labels.get(previous_video, previous_video) footersize = (fontsize * (len(video_label) + 2) * 0.5925, fontsize * 2) footer_shape = SVG(insert=(0, base_height - footersize[1]), size=footersize) @@ -165,6 +168,10 @@ def process(self): timeline.add(frame_element) timeline_widths[video] += frame_width + if not timeline_widths: + self.dataset.finish_with_error("No video frames found") + return + # now we know all dimensions we can instantiate the canvas too canvas_width = max(timeline_widths.values()) fontsize = 12 @@ -207,7 +214,7 @@ def get_video_labels(self, metadata): labels[filename] = filename for dataset, urls in mapping_dataset.items(): - dataset = DataSet(key=dataset, db=self.db).nearest("*-search") + dataset = DataSet(key=dataset, db=self.db, modules=self.modules).nearest("*-search") # determine appropriate label # is this the right place? should it be in the datasource? 
diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 0dfe2d408..0a1f235e0 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -212,6 +212,12 @@ def process(self): if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) body = post.get(column) + + try: + body = str(body) + except TypeError: + continue + if not body: continue diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index 6cc91eba1..d0e74a377 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -96,30 +96,6 @@ def error(code=200, **kwargs): return response -def string_to_timestamp(string): - """ - Convert dd-mm-yyyy date to unix time - - :param string: Date string to parse - :return: The unix time, or 0 if value could not be parsed - """ - bits = string.split("-") - if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string): - bits = list(reversed(bits)) - - if len(bits) != 3: - return 0 - - try: - day = int(bits[0]) - month = int(bits[1]) - year = int(bits[2]) - date = datetime.datetime(year, month, day) - except ValueError: - return 0 - - return int(date.timestamp()) - def pad_interval(intervals, first_interval=None, last_interval=None): """ Pad an interval so all intermediate intervals are filled @@ -299,25 +275,6 @@ def generate_css_colours(force=False): ) -def get_preview(query): - """ - Generate a data preview of 25 rows of a results csv - - :param query - :return list: - """ - preview = [] - with query.get_results_path().open(encoding="utf-8") as resultfile: - posts = csv.DictReader(resultfile) - i = 0 - for post in posts: - i += 1 - preview.append(post) - if i > 25: - break - return preview - - def format_chan_post(post): """ Format a plain-text imageboard post post for HTML display diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 8d1db0e2c..d3ba68314 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -1,5 +1,7 @@ import urllib.parse import datetime +from math import floor + import markdown import json import ural @@ -123,7 +125,7 @@ def _jinja2_filter_httpquery(data): return "" @app.template_filter("add_ahref") -def _jinja2_filter_add_ahref(content): +def _jinja2_filter_add_ahref(content, ellipsiate=0): """ Add HTML links to text @@ -138,7 +140,11 @@ def _jinja2_filter_add_ahref(content): return content for link in set(ural.urls_from_text(str(content))): - content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link}') + if ellipsiate > 0: + link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]") + else: + link_text = link + content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link_text}') return content @@ -203,6 +209,7 @@ def _jinja2_filter_extension_to_noun(ext): else: return "item" + @app.template_filter('social_mediafy') def _jinja2_filter_social_mediafy(body, datasource=""): # Adds links to a text body with hashtags, @-mentions, and URLs @@ -239,6 +246,176 @@ def _jinja2_filter_social_mediafy(body, datasource=""): } } + +@app.template_filter("ellipsiate") +def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"): + if len(text) <= length: + return text + + elif not inside: + return text[:length] + ellipsis_str + + else: + # two cases: URLs and normal text + # for URLs, try to only ellipsiate after the domain name + # this makes the URLs easier to read when shortened + if ural.is_url(text): + pre_part 
= "/".join(text.split("/")[:3]) + if len(pre_part) < length - 6: # kind of arbitrary + before = len(pre_part) + 1 + else: + before = floor(length / 2) + else: + before = floor(length / 2) + + after = len(text) - before + return text[:before] + ellipsis_str + text[after:] + +@app.template_filter('4chan_image') +def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): + + plebs_boards = ["adv","f","hr","mlpol","mo","o","pol","s4s","sp","tg","trv","tv","x"] + archivedmoe_boards = ["3","a","aco","adv","an","asp","b","bant","biz","c","can","cgl","ck","cm","co","cock","con","d","diy","e","f","fa","fap","fit","fitlit","g","gd","gif","h","hc","his","hm","hr","i","ic","int","jp","k","lgbt","lit","m","mlp","mlpol","mo","mtv","mu","n","news","o","out","outsoc","p","po","pol","pw","q","qa","qb","qst","r","r9k","s","s4s","sci","soc","sp","spa","t","tg","toy","trash","trv","tv","u","v","vg","vint","vip","vm","vmg","vp","vr","vrpg","vst","vt","w","wg","wsg","wsr","x","xs","y"] + + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} + + img_link = None + thumb_link = image_4chan.split(".") + thumb_link = thumb_link[0][:4] + "/" + thumb_link[0][4:6] + "/" + thumb_link[0] + "s." + thumb_link[1] + + # If the board is archived by 4plebs, check that site first + if board in plebs_boards: + + # First we're going to try to get the image link through the 4plebs API. + api_url = "https://archive.4plebs.org/_/api/chan/post/?board=%s&num=%s" % (board, post_id) + try: + api_json = requests.get(api_url, headers=headers) + except requests.RequestException as e: + pass + if api_json.status_code != 200: + pass + try: + api_json = json.loads(api_json.content) + img_link = api_json.get("media", {}).get("thumb_link", "") + except json.JSONDecodeError: + pass + if img_link: + return img_link + + # If that doesn't work, we can check whether we can retrieve the image directly. + # 4plebs has a back-referral system so that some filenames are translated. + # This means direct linking won't work for every image without API retrieval. + # So only show if we get a 200 status code. + img_page = requests.get("https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link), headers=headers) + if img_page.status_code == 200: + return "https://img.4plebs.org/boards/%s/thumb/%s" % (board, thumb_link) + + # If the board is archived by archivedmoe, we can also check this resource + if board in archivedmoe_boards: + img_page = requests.get("https://archived.moe/files/%s/thumb/%s" % (board, thumb_link), headers=headers) + if img_page.status_code == 200: + return img_page + + # If we couldn't retrieve the thumbnail yet, then we'll just give a search link + # and display it as a hidden image. + image_md5 = image_md5.replace("/", "_") + if board in plebs_boards: + return "retrieve:http://archive.4plebs.org/_/search/image/" + image_md5 + # Archivedmoe as a last resort - has a lot of boards + return "retrieve:https://archived.moe/_/search/image/" + image_md5 + + + +@app.template_filter('post_field') +def _jinja2_filter_post_field(field, post): + # Extracts string values between {{ two curly brackets }} and uses that + # as a dictionary key for the given dict. It then returns the corresponding value. + # Mainly used in the Explorer. 
+ + matches = False + formatted_field = field + + field = str(field) + + for key in re.findall(r"\{\{(.*?)\}\}", field): + + original_key = key + + # Remove possible slice strings so we get the original key + string_slice = None + if "[" in original_key and "]" in original_key: + string_slice = re.search(r"\[(.*?)\]", original_key) + if string_slice: + string_slice = string_slice.group(1) + key = key.replace("[" + string_slice + "]", "") + + # We're also gonna extract any other filters present + extra_filters = [] + if "|" in key: + extra_filters = key.split("|")[1:] + key = key.split("|")[0] + + # They keys can also be subfields (e.g. "author.username") + # So we're splitting and looping until we get the value. + keys = key.split(".") + val = post + + for k in keys: + if isinstance(val, list): + val = val[0] + if isinstance(val, dict): + val = val.get(k.strip(), "") + + # Return nothing if one of the fields is not found. + # We see 0 as a valid value - e.g. '0 retweets'. + if not val and val != 0: + return "" + + # Support some basic string slicing + if string_slice: + field = field.replace("[" + string_slice + "]", "") + if ":" not in string_slice: + string_slice = slice(int(string_slice), int(string_slice) + 1) + else: + sl = string_slice.split(":") + if not sl[0] and sl[0] != "0": + sl1 = 0 + sl2 = sl[1] + elif not sl[-1]: + sl1 = sl[0] + sl2 = len(st) + else: + sl1 = sl[0] + sl2 = sl[1] + string_slice = slice(int(sl1), int(sl2)) + + # Apply further filters, if present (e.g. lower) + for extra_filter in extra_filters: + + extra_filter = extra_filter.strip() + + # We're going to parse possible parameters to pass to the filter + # These are passed as unnamed variables to the function. + params = () + if "(" in extra_filter: + params = extra_filter.split("(")[-1][:-1].strip() + extra_filter = extra_filter.split("(")[0] + params = [p.strip() for p in params.split(",")] + params = [post[param] for param in params] + + val = app.jinja_env.filters[extra_filter](val, *params) + + if string_slice: + val = val[string_slice] + + # Extract single list item + if isinstance(val, list) and len(val) == 1: + val = val[0] + + formatted_field = formatted_field.replace("{{" + original_key + "}}", str(val)) + + return formatted_field + # Supported data sources known_datasources = list(base_urls.keys()) if datasource not in known_datasources: diff --git a/webtool/pages/faq.md b/webtool/pages/faq.md deleted file mode 100644 index 866a9675f..000000000 --- a/webtool/pages/faq.md +++ /dev/null @@ -1,15 +0,0 @@ -## Frequently Asked Questions - -### How do I cite this tool in my research paper? - -Please refer to the [How to cite](/page/citing/) page. - -### Where can I find more information about this tool? - -Take a look at 4CAT's [website](https://4cat.nl) and its -[GitHub repository](https://github.com/digitalmethodsinitiative/4cat)! - -### What query syntax can I use? - -Most standard search engine query syntax is supported. An -[overview of syntax you can use](/page/query-syntax/) is available. 
\ No newline at end of file diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index afa10f2c6..21d03c206 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -630,6 +630,10 @@ body.csv-preview table td, body.csv-preview table th { border: 1px solid var(--gray-light); } +body.csv-preview table tr:nth-child(2n+1) { + background: var(--contrast-bright); +} + .child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe { display: none; } diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index d8b41b1ee..fee8c0956 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -11,7 +11,8 @@

     What is {{ __user_config("4cat.name") }}?
     from a variety of online sources, and analyze the data through analytical processors.
     4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam.
-    For more information, take a look at the 4CAT website.
+    For more information, take a look at the 4CAT website or the tool's
+    GitHub repository.
     {% if __user_config("4cat.about_this_server") %}
     About this server
     {{ __user_config("4cat.about_this_server") }}
@@ -19,6 +20,7 @@
     About this server
     4CAT updates
     About
     {% for page in __user_config("ui.nav_pages") %}
-        {% if page == "faq" %}FAQ{% else %}{{ page|title }}{% endif %}
+        {{ page|title }}
     {% endfor %}
@@ -85,7 +85,9 @@
     {% endif %}
     How to cite
     Help & Bug Reports
+    {% if current_user.is_authenticated %}
     v{{ __version }}
+    {% endif %}
     OILab, 2018 – {{ __datenow.year }}
  • diff --git a/webtool/templates/preview/csv.html b/webtool/templates/preview/csv.html index fc36bb9d1..d2473735a 100644 --- a/webtool/templates/preview/csv.html +++ b/webtool/templates/preview/csv.html @@ -20,7 +20,7 @@ {% endif %} {% endif %} - {{ cell|e|add_ahref|safe }} + {{ cell|e|add_ahref(ellipsiate=50)|safe }} {% endfor %} diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index f7f66ad6e..7097a92ee 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -282,13 +282,6 @@ def queue_dataset(): Request parameters vary by data source. The ones mandated constitute the minimum but more may be required. - :request-param str board: Board ID to query - :request-param str datasource: Data source ID to query - :request-param str body_match: String to match in the post body - :request-param str subject_match: String to match in the post subject - :request-param int min_date: Timestamp marking the beginning of the match - period - :request-param int max_date: Timestamp marking the end of the match period :request-param str ?access_token: Access token; only required if not logged in currently. @@ -296,6 +289,7 @@ def queue_dataset(): status and results. :return-error 404: If the datasource does not exist. """ + datasource_id = request.form.get("datasource", "") if datasource_id not in fourcat_modules.datasources: return error(404, message="Datasource '%s' does not exist" % datasource_id) diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 500c5a821..bdd86a3f0 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -70,7 +70,7 @@ def show_results(page): filters["sort_by"] = "timestamp" if not request.args: - filters["hide_empty"] = True + filters["hide_empty"] = False # handle 'depth'; all, own datasets, or favourites? # 'all' is limited to admins diff --git a/webtool/views/views_misc.py b/webtool/views/views_misc.py index 4690b6228..e179085c2 100644 --- a/webtool/views/views_misc.py +++ b/webtool/views/views_misc.py @@ -73,7 +73,7 @@ def show_about(): datasources = {k: v for k, v in fourcat_modules.datasources.items() if k in config.get("datasources.enabled") and not v["importable"]} - importables = {k: v for k, v in fourcat_modules.datasources.items() if v["importable"]} + importables = {k: v for k, v in fourcat_modules.datasources.items() if (v["importable"] and k in config.get("datasources.enabled"))} return render_template("frontpage.html", stats=stats, news=news, datasources=datasources, importables=importables)
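The new `ellipsiate` template filter shortens long values, and the CSV preview template now passes `ellipsiate=50` to `add_ahref`, so linked URLs are displayed truncated while the `href` keeps the full address; for URLs the filter tries to cut after the scheme and host so the domain stays readable. A standalone approximation of that behaviour, mirroring the filter's own logic and using `ural.is_url` as the filter does:

```python
from math import floor

import ural


def ellipsiate(text, length, inside=False, ellipsis_str="…"):
    """Shorten `text` to roughly `length` characters, marking the cut."""
    if len(text) <= length:
        return text
    if not inside:
        return text[:length] + ellipsis_str

    # for URLs, keep everything up to and including the host before cutting;
    # this keeps shortened links recognisable in the preview table
    if ural.is_url(text):
        pre_part = "/".join(text.split("/")[:3])
        before = len(pre_part) + 1 if len(pre_part) < length - 6 else floor(length / 2)
    else:
        before = floor(length / 2)

    after = len(text) - before
    return text[:before] + ellipsis_str + text[after:]


# keeps the scheme and host, ellipsiates the middle of the long query string
print(ellipsiate("https://example.com/some/long/path?with=many&query=parameters", 40, inside=True))
```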