From e556c2dcba8e28b544042a2134cd623711cd90d6 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 15 Oct 2024 18:06:45 +0200 Subject: [PATCH 01/48] docs: remove references to old search fields in `api/queue-query` --- webtool/views/api_tool.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index 5b47c030..b37339f9 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -282,13 +282,6 @@ def queue_dataset(): Request parameters vary by data source. The ones mandated constitute the minimum but more may be required. - :request-param str board: Board ID to query - :request-param str datasource: Data source ID to query - :request-param str body_match: String to match in the post body - :request-param str subject_match: String to match in the post subject - :request-param int min_date: Timestamp marking the beginning of the match - period - :request-param int max_date: Timestamp marking the end of the match period :request-param str ?access_token: Access token; only required if not logged in currently. @@ -296,6 +289,7 @@ def queue_dataset(): status and results. :return-error 404: If the datasource does not exist. """ + datasource_id = request.form.get("datasource", "") if datasource_id not in fourcat_modules.datasources: return error(404, message="Datasource '%s' does not exist" % datasource_id) From d5c873ae4841c15b35ed9354137a0237e0d60c6f Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 16 Oct 2024 13:05:22 +0200 Subject: [PATCH 02/48] Bye bye FAQ page --- common/lib/config_definition.py | 3 +-- webtool/pages/faq.md | 15 --------------- webtool/templates/frontpage.html | 3 ++- webtool/templates/layout.html | 2 +- 4 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 webtool/pages/faq.md diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index a4fca2dc..ee38ce70 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -499,11 +499,10 @@ "type": UserInput.OPTION_MULTI_SELECT, "help": "Pages in navigation", "options": { - "faq": "FAQ", "data-policy": "Data Policy", "citing": "How to cite", }, - "default": ["faq"], + "default": [], "tooltip": "These pages will be included in the navigation bar at the top of the interface." }, "ui.prefer_mapped_preview": { diff --git a/webtool/pages/faq.md b/webtool/pages/faq.md deleted file mode 100644 index 866a9675..00000000 --- a/webtool/pages/faq.md +++ /dev/null @@ -1,15 +0,0 @@ -## Frequently Asked Questions - -### How do I cite this tool in my research paper? - -Please refer to the [How to cite](/page/citing/) page. - -### Where can I find more information about this tool? - -Take a look at 4CAT's [website](https://4cat.nl) and its -[GitHub repository](https://github.com/digitalmethodsinitiative/4cat)! - -### What query syntax can I use? - -Most standard search engine query syntax is supported. An -[overview of syntax you can use](/page/query-syntax/) is available. \ No newline at end of file diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index d8b41b1e..fa5cbc73 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -11,7 +11,8 @@

What is {{ __user_config("4cat.name") }}?

from a variety of online sources, and analyze the data through analytical processors.

4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam. - For more information, take a look at the 4CAT website.

+ For more information, take a look at the 4CAT website or the tool's + GitHub repository.

{% if __user_config("4cat.about_this_server") %}

About this server

{{ __user_config("4cat.about_this_server") }}

diff --git a/webtool/templates/layout.html b/webtool/templates/layout.html index 1815c233..e1ecda59 100644 --- a/webtool/templates/layout.html +++ b/webtool/templates/layout.html @@ -54,7 +54,7 @@

About {% for page in __user_config("ui.nav_pages") %} - {% if page == "faq" %}FAQ{% else %}{{ page|title }}{% endif %} + {{ page|title }} {% endfor %} From bb4a581ca7d3efd7dad290d54c1fb7652c05e3b5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 16 Oct 2024 16:43:45 +0200 Subject: [PATCH 03/48] annotate will use index if no id present --- processors/machine_learning/annotate_text.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 022e96de..26234a18 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -184,8 +184,8 @@ def process(self): # prepare data for annotation data_path = staging_area.joinpath("data.temp.ndjson") with data_path.open("w", newline="") as outfile: - for item in self.source_dataset.iterate_items(): - outfile.write(json.dumps({item.get("id"): item.get(textfield)}) + "\n") + for i, item in enumerate(self.source_dataset.iterate_items()): + outfile.write(json.dumps({item.get("id", str(i)): item.get(textfield)}) + "\n") path_to_files, path_to_results = dmi_service_manager.process_files(staging_area, [data_path.name, labels_path.name], @@ -238,15 +238,14 @@ def make_filename(id, prompt): self.dataset.update_status("Loading annotated data") with output_dir.joinpath("results.json").open() as infile: annotations = json.load(infile) - self.dataset.update_status("Writing results") with self.dataset.get_results_path().open("w") as outfile: writer = None - for item in self.source_dataset.iterate_items(): + for i, item in enumerate(self.source_dataset.iterate_items()): row = { - "id": item.get("id"), + "id": item.get("id", i), textfield: item.get(textfield), - "category": annotations[item.get("id")] + "category": annotations.get(item.get("id", str(i))) # str(i) because it is not recorded as an int in the annotations } if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) From 6aa7177edba61357638d858ad171468ffd51c945 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 18 Oct 2024 18:34:40 +0200 Subject: [PATCH 04/48] Don't crash on broken import NDJSON but ignore item instead --- backend/lib/search.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/lib/search.py b/backend/lib/search.py index 15b3982d..3258561e 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -170,10 +170,22 @@ def import_from_file(self, path): if self.interrupted: raise WorkerInterruptedException() - # remove NUL bytes here because they trip up a lot of other - # things - # also include import metadata in item - item = json.loads(line.replace("\0", "")) + try: + # remove NUL bytes here because they trip up a lot of other + # things + # also include import metadata in item + item = json.loads(line.replace("\0", "")) + except json.JSONDecodeError: + warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may " + f"indicate that the file you uploaded was incomplete and you need to try uploading it " + f"again. 
The item will be ignored.") + + if warning not in import_warnings: + import_warnings[warning] = 0 + import_warnings[warning] += 1 + continue + + new_item = { **item["data"], "__import_meta": {k: v for k, v in item.items() if k != "data"} From e09e87502e2fcee8f2ad79b26e40c27f54842acf Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 22 Oct 2024 13:38:35 +0200 Subject: [PATCH 05/48] only show importables in frontpage if enabled --- webtool/views/views_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_misc.py b/webtool/views/views_misc.py index 4690b622..e179085c 100644 --- a/webtool/views/views_misc.py +++ b/webtool/views/views_misc.py @@ -73,7 +73,7 @@ def show_about(): datasources = {k: v for k, v in fourcat_modules.datasources.items() if k in config.get("datasources.enabled") and not v["importable"]} - importables = {k: v for k, v in fourcat_modules.datasources.items() if v["importable"]} + importables = {k: v for k, v in fourcat_modules.datasources.items() if (v["importable"] and k in config.get("datasources.enabled"))} return render_template("frontpage.html", stats=stats, news=news, datasources=datasources, importables=importables) From 8261b25d62718f981163d85238df4b83642e8f27 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Tue, 22 Oct 2024 15:52:52 +0200 Subject: [PATCH 06/48] Do not hide empty datasets by default --- webtool/views/views_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 28aaa76a..1720b3a8 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -70,7 +70,7 @@ def show_results(page): filters["sort_by"] = "timestamp" if not request.args: - filters["hide_empty"] = True + filters["hide_empty"] = False # handle 'depth'; all, own datasets, or favourites? # 'all' is limited to admins From aad7d57e1b44f7c59a3eb89ba6282d37f10339fe Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 15:39:27 +0200 Subject: [PATCH 07/48] New processor: deduplicate images --- processors/filtering/unique_images.py | 143 ++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 processors/filtering/unique_images.py diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py new file mode 100644 index 00000000..819e4b9d --- /dev/null +++ b/processors/filtering/unique_images.py @@ -0,0 +1,143 @@ +""" +Filter by unique images +""" +import imagehash +import hashlib +import shutil +import json + +from PIL import Image +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.helpers import UserInput + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + + +class UniqueImageFilter(BasicProcessor): + """ + Retain only unique images, by a user-defined metric + """ + type = "image-downloader-unique" # job type ID + category = "Visualisation" # category + title = "Filter for unique images" # title displayed in UI + description = "Only keeps one instance per image, using a choice of detection method." 
# description displayed in UI + extension = "zip" + + references = [ + "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", + "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", + "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)", + + ] + + options = { + "hash-type": { + "type": UserInput.OPTION_CHOICE, + "help": "Comparison method", + "default": "file-hash", + "options": { + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + } + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor on image archives + + :param module: Module to determine compatibility with + """ + return module.get_media_type() == "image" or module.type.startswith( + "image-downloader") or module.type == "video-frames" + + def hash_file(self, image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + + def process(self): + """ + Loop through images and only retain ones that have not been seen yet + + :return: + """ + seen_hashes = set() + hash_map = {} + metadata = None + dupes = 0 + processed = 0 + staging_area = self.dataset.get_staging_area() + + for image_file in self.iterate_archive_contents(self.source_file): + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while filtering for unique images") + + self.dataset.update_progress(processed / self.source_dataset.num_rows) + processed += 1 + + if image_file.name == ".metadata.json": + with image_file.open() as infile: + metadata = json.load(infile) + continue + + image_hash = self.hash_file(image_file, self.parameters.get("hash-type")) + + if image_hash not in seen_hashes: + seen_hashes.add(image_hash) + shutil.copy2(image_file, staging_area) + hash_map[image_hash] = image_file.name + else: + self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping") + dupes += 1 + + new_metadata = {} + inverse_hashmap = {v: k for k, v in hash_map.items()} + for url, item in metadata.items(): + if item["filename"] in inverse_hashmap: + new_metadata[inverse_hashmap[item["filename"]]] = { + **item, + "hash": inverse_hashmap[item["filename"]], + "hash_type": self.parameters.get("hash-type") + } + + with 
staging_area.joinpath(".metadata.json").open("w") as outfile: + json.dump(new_metadata, outfile) + + self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True) + self.write_archive_and_finish(staging_area, len(hash_map), finish=True) From 25f9ffd464e07fcc19014173eec39af01adee76b Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 15:42:38 +0200 Subject: [PATCH 08/48] Add some progress logs --- processors/filtering/unique_images.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 819e4b9d..0970d1f8 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -104,11 +104,15 @@ def process(self): processed = 0 staging_area = self.dataset.get_staging_area() + self.dataset.update_progress("Processing images and looking for duplicates") for image_file in self.iterate_archive_contents(self.source_file): if self.interrupted: raise ProcessorInterruptedException("Interrupted while filtering for unique images") self.dataset.update_progress(processed / self.source_dataset.num_rows) + if processed % 100 == 0: + self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + f"found {dupes:,} duplicate(s)") processed += 1 if image_file.name == ".metadata.json": From 53821d10ffff6c9834f2cc73b0e3fc2e7f8269db Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 16:57:27 +0200 Subject: [PATCH 09/48] Filename filter in iterate_archive_contents --- backend/lib/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index e9e4d85a..5dbb09a1 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -480,7 +480,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset, self.dataset.update_status("Parent dataset updated.") - def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True): + def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=[]): """ A generator that iterates through files in an archive From f98addc7a3c5173b0f531d07a0379e73204c1c61 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 17:39:17 +0200 Subject: [PATCH 10/48] New 'Bipartite image-item network' processor --- processors/networks/image-network.py | 146 +++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 processors/networks/image-network.py diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py new file mode 100644 index 00000000..123a1189 --- /dev/null +++ b/processors/networks/image-network.py @@ -0,0 +1,146 @@ +""" +Make a bipartite Image-Item network +""" +import json + +from backend.lib.processor import BasicProcessor + +import networkx as nx + +__author__ = "Stijn Peeters" +__credits__ = ["Stijn Peeters"] +__maintainer__ = "Stijn Peeters" +__email__ = "4cat@oilab.eu" + +from common.lib.exceptions import ProcessorInterruptedException +from common.lib.user_input import UserInput + + +class ImageGrapher(BasicProcessor): + """ + Image network + + Creates a bipartite network of images and some attribute of the dataset the + images were sourced from + """ + type = "image-bipartite-network" # job type ID + category = "Networks" + title = "Bipartite image-item network" # title displayed in UI + description = ("Create a GEXF network file with a 
bipartite network of " + "images and some data field (e.g. author) of the dataset " + "the images were sourced from. Suitable for use with Gephi's " + "'Image Preview' plugin.") + extension = "gexf" # extension of result file, used internally and in UI + + options = {} + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + root_dataset = None + columns = None + if parent_dataset: + for parent in reversed(parent_dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + columns = root_dataset.get_columns() + + return { + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_TEXT, + "default": "id" + }, + "image-value": { + "help": "Image node label", + "type": UserInput.OPTION_CHOICE, + "options": { + "filename": "Image file name", + "url": "Image URL" + }, + "tooltip": "The image node label will have this value. Depending on the network visualisation software " + "you use, one or the other is required to display the images as nodes." + }, + **({ + "column": { + "help": "Dataset field", + "type": UserInput.OPTION_CHOICE, + "options": { + column: column + for column in columns} + } + } if columns else {}) + } + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on images downloaded from a dataset + + :param module: Module to determine compatibility with + """ + return module.type.startswith("image-downloader") + + def process(self): + column = self.parameters.get("column") + metadata = None + for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]): + with file.open() as infile: + try: + metadata = json.load(infile) + except json.JSONDecodeError: + pass + + if not metadata: + return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + id_file_map = {} + for url, details in metadata.items(): + for item_id in details.get("post_ids", []): + id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"] + + root_dataset = None + for parent in reversed(self.dataset.get_genealogy()): + if parent.get_columns(): + root_dataset = parent + break + + if not root_dataset: + return self.dataset.finish_with_error("No suitable parent dataset found - this processor can only " + "be run on sets of images sourced from another 4CAT dataset.") + + network = nx.DiGraph() + processed = 0 + for item in root_dataset.iterate_items(): + self.dataset.update_progress(processed / root_dataset.num_rows) + processed += 1 + if processed % 100 == 0: + self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") + + if self.interrupted: + raise ProcessorInterruptedException() + + if item.get("id") not in id_file_map: + continue + + from_node_label = item.get(column) + from_node = f"{column}-{from_node_label}" + to_node_label = id_file_map[item.get("id")] + to_node = f"image-{to_node_label}" + + if from_node not in network.nodes: + network.add_node(from_node, label=from_node_label, category=column) + + if to_node not in network.nodes: + network.add_node(to_node, label=to_node_label, category="image", image=to_node_label) + + edge = (from_node, to_node) + if edge not in network.edges(): + network.add_edge(*edge, frequency=0) + + network.edges[edge]["frequency"] += 1 + + self.dataset.update_status("Writing network file") + nx.write_gexf(network, self.dataset.get_results_path()) + 
self.dataset.finish(len(network.nodes)) From c479a85b71710b65ca44cb5330112c6cc1a84c00 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 17:39:26 +0200 Subject: [PATCH 11/48] Filename filter lost code --- backend/lib/processor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 5dbb09a1..29efde8c 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -497,6 +497,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T :param bool immediately_delete: Temporary files are removed after yielded; False keeps files until the staging_area is removed (usually during processor cleanup) + :param list filename_filter: Whitelist of filenames to iterate. + Other files will be ignored. If empty, do not ignore anything. :return: An iterator with a Path item for each file """ @@ -513,6 +515,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T archive_contents = sorted(archive_file.namelist()) for archived_file in archive_contents: + if filename_filter and archived_file not in filename_filter: + continue + info = archive_file.getinfo(archived_file) if info.is_dir(): continue From 5d5a0e30bb111a4096c22dc7929e79ca1a9d1f9c Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 23 Oct 2024 19:04:34 +0200 Subject: [PATCH 12/48] Catch rate limits in Telegram media downloads --- .../visualisation/download-telegram-images.py | 35 ++++++++++++++----- .../visualisation/download-telegram-videos.py | 12 ++++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 99ff5199..9f0d38ee 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -7,13 +7,14 @@ from pathlib import Path +import telethon.errors from telethon import TelegramClient from telethon.errors import TimedOutError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet from processors.visualisation.download_images import ImageDownloader @@ -194,6 +195,13 @@ async def get_images(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while downloading images") + if not message: + # message no longer exists + self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " + f"may have been deleted)") + self.flawless = False + continue + success = False try: # it's actually unclear if images are always jpegs, but this @@ -215,14 +223,23 @@ async def get_images(self): msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False - - media_done += 1 - self.metadata[filename] = { - "filename": filename, - "success": success, - "from_dataset": self.source_dataset.key, - "post_ids": [msg_id] - } + finally: + media_done += 1 + self.metadata[filename] = { + "filename": filename, + "success": success, + "from_dataset": self.source_dataset.key, + "post_ids": [msg_id] + } + + except telethon.errors.FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + 
self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve images for {entity}, it probably does not exist anymore ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index ef6d4423..b441ff9d 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,12 +8,13 @@ from pathlib import Path from telethon import TelegramClient +from telethon.errors import FloodError from common.config_manager import config from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from processors.visualisation.download_videos import VideoDownloaderPlus -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, timify_long from common.lib.dataset import DataSet __author__ = "Stijn Peeters" @@ -210,6 +211,15 @@ async def get_videos(self): "from_dataset": self.source_dataset.key, "post_ids": [msg_id] } + + except FloodError as e: + later = "later" + if hasattr(e, "seconds"): + later = f"in {timify_long(e.seconds)}" + self.dataset.update_status(f"Rate-limited by Telegram after downloading {media_done-1:,} image(s); " + f"halting download process. Try again {later}.", is_final=True) + self.flawless = False + break except ValueError as e: self.dataset.log(f"Couldn't retrieve video for {entity}, it probably does not exist anymore ({e})") From 3df74c9be01118c354eb1457895052cace37cb9a Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:11:09 +0200 Subject: [PATCH 13/48] Catch bad request error in Telegram media download --- processors/visualisation/download-telegram-images.py | 4 ++-- processors/visualisation/download-telegram-videos.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 9f0d38ee..dda0ad82 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -9,7 +9,7 @@ import telethon.errors from telethon import TelegramClient -from telethon.errors import TimedOutError +from telethon.errors import TimedOutError, BadRequestError from common.config_manager import config from backend.lib.processor import BasicProcessor @@ -218,7 +218,7 @@ async def get_images(self): await client.download_media(message, str(path), thumb=-1) msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") diff --git a/processors/visualisation/download-telegram-videos.py b/processors/visualisation/download-telegram-videos.py index b441ff9d..aa05173c 100644 --- a/processors/visualisation/download-telegram-videos.py +++ b/processors/visualisation/download-telegram-videos.py @@ -8,7 +8,7 @@ from pathlib import Path from telethon import TelegramClient -from telethon.errors import FloodError +from telethon.errors import FloodError, BadRequestError from 
common.config_manager import config from backend.lib.processor import BasicProcessor @@ -198,7 +198,7 @@ async def get_videos(self): msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, BadRequestError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download video for message {msg_id} ({e})") From a8bec31bfa17b8ac6042a4ff8c4262a229c73f35 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:46:26 +0200 Subject: [PATCH 14/48] Catch error in the right place... --- processors/visualisation/download-telegram-images.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index dda0ad82..3cb3e924 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -218,7 +218,7 @@ async def get_images(self): await client.download_media(message, str(path), thumb=-1) msg_id = message.id success = True - except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError, BadRequestError) as e: + except (AttributeError, RuntimeError, ValueError, TypeError, TimedOutError) as e: filename = f"{entity}-index-{media_done}" msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") @@ -232,6 +232,10 @@ async def get_images(self): "post_ids": [msg_id] } + except BadRequestError: + self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") + self.flawless = False + except telethon.errors.FloodError as e: later = "later" if hasattr(e, "seconds"): From ec3422e2f85ac3f8576a7448c098aed9c95a3d83 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 17:51:48 +0200 Subject: [PATCH 15/48] =?UTF-8?q?a=C4=81a=C4=81a=C4=81a=C4=81a=C4=81a?= =?UTF-8?q?=C4=81a=C4=81a=C4=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/visualisation/download-telegram-images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 3cb3e924..a3309e96 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -232,7 +232,7 @@ async def get_images(self): "post_ids": [msg_id] } - except BadRequestError: + except BadRequestError as e: self.dataset.log(f"Couldn't retrieve images for {entity} - the channel is no longer accessible ({e})") self.flawless = False From ac543cc8bf73d4ecb61ca4d93f162fa198f645a5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 19:05:17 +0200 Subject: [PATCH 16/48] Break instead of continue when trying to download deleted Telegram image --- processors/visualisation/download-telegram-images.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index a3309e96..6394862e 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -200,7 +200,7 @@ async def get_images(self): 
self.dataset.log(f"Could not download image for message {msg_id} - message is unavailable (it " f"may have been deleted)") self.flawless = False - continue + break success = False try: @@ -223,6 +223,7 @@ async def get_images(self): msg_id = str(message.id) if hasattr(message, "id") else f"with index {media_done:,}" self.dataset.log(f"Could not download image for message {msg_id} ({e})") self.flawless = False + finally: media_done += 1 self.metadata[filename] = { From bb0909cc4d8908f2df2ca95c9a9896631c055a12 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:21:33 +0200 Subject: [PATCH 17/48] dumb image things --- processors/filtering/unique_images.py | 40 +------------- processors/networks/image-network.py | 79 +++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 47 deletions(-) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 0970d1f8..b0a9a2cb 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -1,15 +1,12 @@ """ Filter by unique images """ -import imagehash -import hashlib import shutil import json -from PIL import Image from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, hash_file __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -60,37 +57,6 @@ def is_compatible_with(cls, module=None, user=None): return module.get_media_type() == "image" or module.type.startswith( "image-downloader") or module.type == "video-frames" - def hash_file(self, image_file, hash_type="file-hash"): - """ - Generate an image hash - - :param Path image_file: Image file to hash - :param str hash_type: Hash type, one of `file-hash`, `colorhash`, - `phash`, `average_hash`, `dhash` - :return str: Hexadecimal hash value - """ - if not image_file.exists(): - raise FileNotFoundError() - - if hash_type == "file-hash": - hasher = hashlib.sha1() - - # Open the file in binary mode - with image_file.open("rb") as infile: - # Read and update hash in chunks to handle large files - while chunk := infile.read(1024): - hasher.update(chunk) - - return hasher.hexdigest() - - elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): - image = Image.open(image_file) - - return str(getattr(imagehash, hash_type)(image)) - - else: - raise NotImplementedError(f"Unknown hash type '{hash_type}'") - def process(self): """ Loop through images and only retain ones that have not been seen yet @@ -111,7 +77,7 @@ def process(self): self.dataset.update_progress(processed / self.source_dataset.num_rows) if processed % 100 == 0: - self.dataset.update_progress(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " + self.dataset.update_status(f"Processed {processed:,} of {self.source_dataset.num_rows:,} images, " f"found {dupes:,} duplicate(s)") processed += 1 @@ -120,7 +86,7 @@ def process(self): metadata = json.load(infile) continue - image_hash = self.hash_file(image_file, self.parameters.get("hash-type")) + image_hash = hash_file(image_file, self.parameters.get("hash-type")) if image_hash not in seen_hashes: seen_hashes.add(image_hash) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 123a1189..3d153b9b 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.helpers 
import hash_file import networkx as nx @@ -61,6 +62,20 @@ def get_options(cls, parent_dataset=None, user=None): "tooltip": "The image node label will have this value. Depending on the network visualisation software " "you use, one or the other is required to display the images as nodes." }, + "deduplicate": { + "type": UserInput.OPTION_CHOICE, + "help": "Merge images", + "tooltip": "Similar images can be merged into a single node, represented by the first image of the set " + "that was encountered.", + "options": { + "none": "Do not merge", + "file-hash": "File hash (files need to be byte-by-byte duplicates)", + "colorhash": "Colour hash (good at colours, worse at shapes)", + "phash": "Perceptual hash (decent at colours and shapes)", + "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)", + "dhash": "Difference hash (similar to average hash, better at photos and art)" + } + }, **({ "column": { "help": "Dataset field", @@ -83,22 +98,48 @@ def is_compatible_with(cls, module=None, user=None): def process(self): column = self.parameters.get("column") + hash_type = self.parameters.get("deduplicate") + filename_filter = [".metadata.json"] if hash_type == "none" else [] metadata = None - for file in self.iterate_archive_contents(self.source_file, filename_filter=[".metadata.json"]): - with file.open() as infile: + hashed = 0 + + # some maps to make sure we use the right value in the right place + # url or filename, original image or duplicate, etc + file_hash_map = {} + hash_file_map = {} + seen_hashes = set() + id_file_map = {} + + for file in self.iterate_archive_contents(self.source_file, filename_filter=filename_filter): + if file.name == ".metadata.json": + with file.open() as infile: + try: + metadata = json.load(infile) + file_hash_map = {i: v["filename"] for i, v in metadata.items()} if self.parameters.get("image-value") == "url" else {i["filename"]: i["filename"] for i in metadata.values()} + except json.JSONDecodeError: + pass + else: try: - metadata = json.load(infile) - except json.JSONDecodeError: - pass + hashed += 1 + if hashed % 100 == 0: + self.dataset.update_status(f"Generated identity hashes for {hashed:,} of {self.source_dataset.num_rows-1:,} item(s)") + self.dataset.update_progress(hashed / (self.source_dataset.num_rows-1) * 0.5) + file_hash = hash_file(file, hash_type) + file_hash_map[file.name] = file_hash + if file_hash not in hash_file_map: + hash_file_map[file_hash] = file.name + + except (FileNotFoundError, ValueError) as e: + continue if not metadata: return self.dataset.finish_with_error("No valid metadata found in image archive - this processor can only " "be run on sets of images sourced from another 4CAT dataset.") - id_file_map = {} + file_url_map = {v["filename"]: u for u, v in metadata.items()} for url, details in metadata.items(): for item_id in details.get("post_ids", []): - id_file_map[item_id] = url if self.parameters.get("image-value") == "url" else details["filename"] + id_file_map[item_id] = details["filename"] root_dataset = None for parent in reversed(self.dataset.get_genealogy()): @@ -113,7 +154,12 @@ def process(self): network = nx.DiGraph() processed = 0 for item in root_dataset.iterate_items(): - self.dataset.update_progress(processed / root_dataset.num_rows) + progress = processed / root_dataset.num_rows + if hashed: + # if hashing was necessary, we approximate that as 50% of the work + progress = (progress * 0.5) + 0.5 + + self.dataset.update_progress(progress) processed += 1 if processed % 100 == 0: 
self.dataset.update_status(f"Processed {processed:,} of {root_dataset.num_rows:,} item(s)") @@ -126,9 +172,22 @@ def process(self): from_node_label = item.get(column) from_node = f"{column}-{from_node_label}" - to_node_label = id_file_map[item.get("id")] - to_node = f"image-{to_node_label}" + image_file = id_file_map[item.get("id")] + image_hash = file_hash_map[image_file] + if image_hash in seen_hashes: + to_node_label = hash_file_map[image_hash] + if image_file != to_node_label: + self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.") + + else: + seen_hashes.add(image_hash) + to_node_label = id_file_map[item.get("id")] + + if self.parameters.get("image-value") == "url": + to_node_label = file_url_map[to_node_label] + + to_node = f"image-{to_node_label}" if from_node not in network.nodes: network.add_node(from_node, label=from_node_label, category=column) From 1aec9e6837da3e322a4d128fc5254edf00ff4308 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:23:44 +0200 Subject: [PATCH 18/48] help!! --- common/lib/helpers.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 2911044f..5fe5df48 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,8 +1,9 @@ """ Miscellaneous helper functions for the 4CAT backend """ -import hashlib import subprocess +import imagehash +import hashlib import requests import datetime import smtplib @@ -23,6 +24,7 @@ from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version +from PIL import Image from common.lib.user_input import UserInput from common.config_manager import config @@ -404,6 +406,37 @@ def andify(items): return ", ".join([str(item) for item in items]) + result +def hash_file(image_file, hash_type="file-hash"): + """ + Generate an image hash + + :param Path image_file: Image file to hash + :param str hash_type: Hash type, one of `file-hash`, `colorhash`, + `phash`, `average_hash`, `dhash` + :return str: Hexadecimal hash value + """ + if not image_file.exists(): + raise FileNotFoundError() + + if hash_type == "file-hash": + hasher = hashlib.sha1() + + # Open the file in binary mode + with image_file.open("rb") as infile: + # Read and update hash in chunks to handle large files + while chunk := infile.read(1024): + hasher.update(chunk) + + return hasher.hexdigest() + + elif hash_type in ("colorhash", "phash", "average_hash", "dhash"): + image = Image.open(image_file) + + return str(getattr(imagehash, hash_type)(image)) + + else: + raise NotImplementedError(f"Unknown hash type '{hash_type}'") + def get_yt_compatible_ids(yt_ids): """ :param yt_ids list, a list of strings From ea9e3f5bddbb3ec7e4abb3b8cfd17d79675dcb74 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 24 Oct 2024 23:24:54 +0200 Subject: [PATCH 19/48] =?UTF-8?q?=F0=9F=98=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/filtering/unique_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index b0a9a2cb..a8dd8763 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -70,7 +70,7 @@ def process(self): processed = 0 staging_area = self.dataset.get_staging_area() - self.dataset.update_progress("Processing images and looking for duplicates") + 
self.dataset.update_status("Processing images and looking for duplicates") for image_file in self.iterate_archive_contents(self.source_file): if self.interrupted: raise ProcessorInterruptedException("Interrupted while filtering for unique images") From 96528d15daba77bd4e9ef05e2eaa65e3336b0bee Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 12:59:18 +0200 Subject: [PATCH 20/48] =?UTF-8?q?Clean=20that=20code=20=F0=9F=A7=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/networks/image-network.py | 35 ++++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 3d153b9b..3b93a3df 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -170,31 +170,36 @@ def process(self): if item.get("id") not in id_file_map: continue - from_node_label = item.get(column) - from_node = f"{column}-{from_node_label}" + # from nodes are the dataset fields (e.g. 'body' or 'chat') + # to node names are filenames (optionally mapped to URLs later) + from_node = item.get(column) + from_node_id = f"{column}-{from_node}" image_file = id_file_map[item.get("id")] - image_hash = file_hash_map[image_file] - if image_hash in seen_hashes: - to_node_label = hash_file_map[image_hash] - if image_file != to_node_label: - self.dataset.update_status(f"Image {image_file} is a duplicate of {to_node_label} - merging.") + image_hash = file_hash_map.get(image_file) + if hash_type != "none" and image_hash in seen_hashes: + # if we're deduplicating and the image is already in the graph, + # merge the nodes (use the original node as the 'to node') + to_node = hash_file_map[image_hash] + if image_file != to_node: + self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " + f"merging.") else: seen_hashes.add(image_hash) - to_node_label = id_file_map[item.get("id")] + to_node = image_file if self.parameters.get("image-value") == "url": - to_node_label = file_url_map[to_node_label] + to_node = file_url_map[to_node] - to_node = f"image-{to_node_label}" - if from_node not in network.nodes: - network.add_node(from_node, label=from_node_label, category=column) + to_node_id = f"image-{to_node}" + if from_node_id not in network.nodes: + network.add_node(from_node_id, label=from_node, category=column) - if to_node not in network.nodes: - network.add_node(to_node, label=to_node_label, category="image", image=to_node_label) + if to_node_id not in network.nodes: + network.add_node(to_node_id, label=to_node, category="image", image=to_node) - edge = (from_node, to_node) + edge = (from_node_id, to_node_id) if edge not in network.edges(): network.add_edge(*edge, frequency=0) From caf451ad446dd4ba36b4da0939cb1a258a1e37e6 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 14:57:54 +0200 Subject: [PATCH 21/48] Fix image network for Telegram images --- processors/networks/image-network.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 3b93a3df..8b1549ce 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -139,6 +139,9 @@ def process(self): file_url_map = {v["filename"]: u for u, v in metadata.items()} for url, details in metadata.items(): for item_id in details.get("post_ids", []): + if self.source_dataset.type.endswith("-telegram"): + # telegram has 
weird IDs + item_id = "-".join(details["filename"].split("-")[:-1]) + "-" + str(item_id) id_file_map[item_id] = details["filename"] root_dataset = None From 80dfceddf377ddcff5e9f9c0d63ef07ea0138aa7 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 25 Oct 2024 15:12:14 +0200 Subject: [PATCH 22/48] asdfghjkl; --- processors/networks/image-network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index 8b1549ce..4267c965 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -183,8 +183,8 @@ def process(self): if hash_type != "none" and image_hash in seen_hashes: # if we're deduplicating and the image is already in the graph, # merge the nodes (use the original node as the 'to node') - to_node = hash_file_map[image_hash] - if image_file != to_node: + to_node = hash_file_map.get(image_hash) + if to_node and image_file != to_node: self.dataset.update_status(f"Image {image_file} identified as a duplicate of {to_node} - " f"merging.") @@ -192,6 +192,10 @@ def process(self): seen_hashes.add(image_hash) to_node = image_file + if not to_node: + # image could not be hashed, probably invalid file + continue + if self.parameters.get("image-value") == "url": to_node = file_url_map[to_node] From dcc0a21d3ea0a7955cca26d6e34d032598128b80 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 31 Oct 2024 14:29:39 +0100 Subject: [PATCH 23/48] fix: weird config.get() bug w/ default values --- common/config_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/config_manager.py b/common/config_manager.py index 1b8d4052..7760aae9 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None) if not is_json and value is not None: value = json.loads(value) - # TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale - elif default is not None: - value = default + # TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]: value = self.config_definition[setting_name]["default"] + elif value is None and default is not None: + value = default final_settings[setting_name] = value From 187101926bb7c650ced3d14a9cac17560ddb27b6 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 4 Nov 2024 10:25:37 +0000 Subject: [PATCH 24/48] Rebrand Twitter v2 datasource as compatible with the X Research API --- datasources/twitterv2/DESCRIPTION.md | 81 ++++++++--------- datasources/twitterv2/__init__.py | 2 +- datasources/twitterv2/search_twitter.py | 115 ++++++++++++------------ 3 files changed, 95 insertions(+), 103 deletions(-) diff --git a/datasources/twitterv2/DESCRIPTION.md b/datasources/twitterv2/DESCRIPTION.md index 57f1f7a5..d138e675 100644 --- a/datasources/twitterv2/DESCRIPTION.md +++ b/datasources/twitterv2/DESCRIPTION.md @@ -1,93 +1,88 @@ -Twitter data is gathered through the official [Twitter v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT -allows access to both the Standard and the Academic track. The Standard track is free for anyone to use, but only -allows to retrieve tweets up to seven days old. 
The Academic track allows a full-archive search of up to ten million -tweets per month (as of March 2022). For the Academic track, you need a valid Bearer token. You can request one -[here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you). +X/Twitter data is gathered through the official [X v2 API](https://developer.twitter.com/en/docs/twitter-api). 4CAT can interface with X's Research API (sometimes +branded as the 'DSA API', referencing the EU's Digital Services Act). To retrieve posts via this API with 4CAT, you need +a valid Bearer token. Read more about this mode of access [here](https://developer.x.com/en/use-cases/do-research/academic-research). -Tweets are captured in batches at a speed of approximately 100,000 tweets per hour. 4CAT will warn you if your dataset +Posts are captured in batches at a speed of approximately 100,000 posts per hour. 4CAT will warn you if your dataset is expected to take more than 30 minutes to collect. It is often a good idea to start small (with very specific queries or narrow date ranges) and then only create a larger dataset if you are confident that it will be manageable and useful for your analysis. -If you hit your Twitter API quota while creating a dataset, the dataset will be finished with the tweets that have been +If you hit your X API quota while creating a dataset, the dataset will be finished with the posts that have been collected so far and a warning will be logged. ### Query syntax -Check the [API documentation](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) +Check the [API documentation](https://developer.x.com/en/docs/x-api/tweets/search/integrate/build-a-query) for available query syntax and operators. This information is crucial to what data you collect. Important operators for -instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted tweets and retweets. Query syntax -is roughly the same as for Twitter's search interface, so you can try out most queries by entering them in the Twitter -app or website's search field and looking at the results. You can also test queries with -Twitter's [Query Builder](https://developer.twitter.com/apitools/query?query=). +instance include `-is:nullcast` and `-is:retweet`, with which you can ignore promoted posts and reposts. Query syntax +is roughly the same as for X's search interface, so you can try out most queries by entering them in the X app or +website's search field and looking at the results. You can also test queries with +X's [Query Builder](https://developer.twitter.com/apitools/query?query=). ### Date ranges -By default, Twitter returns tweets posted within the past 30 days. If you want to go back further, you need to -explicitly set a date range. Note that Twitter does not like date ranges that end in the future, or start before -Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. +By default, X returns posts posted within the past 30 days. If you want to go back further, you need to +explicitly set a date range. Note that X does not like date ranges that end in the future, or start before +Twitter existed. If you want to capture tweets "until now", it is often best to use yesterday as an end date. Also note +that API access may come with certain limitations on how far a query may extend into history. 
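As an illustration (not part of 4CAT itself), here is a minimal sketch of a date-bounded request against the full-archive search endpoint this datasource uses; the bearer token, query, and start date are placeholders, and `start_time`/`end_time` must be RFC 3339 timestamps:

```python
import datetime
import requests

# Placeholder credentials and query - substitute your own Research API bearer token.
BEARER_TOKEN = "YOUR_BEARER_TOKEN"

# End the range at yesterday, since X rejects ranges that end in the future.
yesterday = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)

params = {
    "query": "(4cat OR digitalmethods) -is:retweet -is:nullcast",  # example query
    "start_time": "2023-01-01T00:00:00Z",                          # example start date
    "end_time": yesterday.strftime("%Y-%m-%dT%H:%M:%SZ"),
    "max_results": 100,
}

response = requests.get(
    "https://api.x.com/2/tweets/search/all",
    headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
    params=params,
)
response.raise_for_status()

# Each returned post has at least an 'id' and 'text' field.
for post in response.json().get("data", []):
    print(post["id"], post["text"])
```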
### Geo parameters -Twitter offers a number of ways -to [query by location/geo data](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location) -such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. This feature is only available for the Academic level; -you will receive a 400 error if using queries filtering by geographic information. +X offers a number of ways +to [query by location/geo data](https://developer.x.com/en/docs/tutorials/filtering-tweets-by-location) +such as `has:geo`, `place:Amsterdam`, or `place:Amsterdam`. ### Retweets -A retweet from Twitter API v2 contains at maximum 140 characters from the original tweet. 4CAT therefore -gathers both the retweet and the original tweet and reformats the retweet text so it resembles a user's experience. +A repost from X API v2 contains at maximum 140 characters from the original post. 4CAT therefore +gathers both the repost and the original post and reformats the repost text so it resembles a user's experience. This also affects mentions, hashtags, and other data as only those contained in the first 140 characters are provided -by Twitter API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added -to the retweet for 4CAT analysis methods. *4CAT stores the data from Twitter API v2 as similar as possible to the format +by X API v2 with the retweet. Additional hashtags, mentions, etc. are taken from the original tweet and added +to the repost for 4CAT analysis methods. *4CAT stores the data from X API v2 as similar as possible to the format in which it was received which you can obtain by downloading the ndjson file.* *Example 1* -[This retweet](https://twitter.com/tonino1630/status/1554618034299568128) returns the following data: +[This repost](https://x.com/tonino1630/status/1554618034299568128) returns the following data: - *author:* `tonino1630` -- * - text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` +- *text:* `RT @ChuckyFrao: ¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar p…` - *mentions:* `ChuckyFrao` - *hashags:*
-While the original tweet will return (as a reference tweet) this data: +While the original post will return (as a reference post) this data: - *author:* `ChuckyFrao` -- * - text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` +- *text:* `¡HUELE A LIBERTAD! La Casa Blanca publicó una orden ejecutiva sobre las acciones del Gobierno de Joe Biden para negociar presos estadounidenses en otros países. #FreeAlexSaab @POTUS @usembassyve @StateSPEHA @StateDept @SecBlinken #BringAlexHome #IntegridadTerritorial https://t.co/ClSQ3Rfax0` - *mentions:* `POTUS, usembassyve, StateSPEHA, StateDept, SecBlinken` - *hashtags:* `FreeAlexSaab, BringAlexHome, IntegridadTerritorial`
-As you can see, only the author of the original tweet is listed as a mention in the retweet. +As you can see, only the author of the original post is listed as a mention in the repost. *Example 2* -[This retweet](https://twitter.com/Macsmart31/status/1554618041459445760) returns the following: +[This repost](https://x.com/Macsmart31/status/1554618041459445760) returns the following: - *author:* `Macsmart31` -- * - text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` +- *text:* `RT @mickyd123us: @tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the de…` - *mentions:* `mickyd123us, tribelaw, HonorDecency`
-Compared with the original tweet referenced below: +Compared with the original post referenced below: - *author:* `mickyd123us` -- * - text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` +- *text:* `@tribelaw @HonorDecency Thank goodness Biden replaced his detail - we know that Pence refused to "Take A Ride" with the detail he had in the basement. Who knows where they would have taken him. https://t.co/s47Kb5RrCr` - *mentions:* `tribelaw, HonorDecency`
-Because the mentioned users are in the first 140 characters of the original tweet, they are also listed as mentions in the retweet. - -The key difference here is that example one the retweet contains none of the hashtags or mentions from the original -tweet (they are beyond the first 140 characters) while the second retweet example does return mentions from the original -tweet. *Due to this discrepancy, for retweets all mentions and hashtags of the original tweet are considered as mentions -and hashtags of the retweet.* A user on Twitter will see all mentions and hashtags when viewing a retweet and the -retweet would be a part of any network around those mentions and hashtags. +Because the mentioned users are in the first 140 characters of the original post, they are also listed as mentions in +the repost. + +The key difference here is that in example one the repost contains none of the hashtags or mentions from the original +post (they are beyond the first 140 characters) while the second repost example does return mentions from the original +post. *Due to this discrepancy, for reposts all mentions and hashtags of the original post are considered as mentions +and hashtags of the repost.* A user on X will see all mentions and hashtags when viewing a repost and the +repost would be a part of any network around those mentions and hashtags. diff --git a/datasources/twitterv2/__init__.py b/datasources/twitterv2/__init__.py index 3335bc7c..6aa80c7b 100644 --- a/datasources/twitterv2/__init__.py +++ b/datasources/twitterv2/__init__.py @@ -9,4 +9,4 @@ # Internal identifier for this data source DATASOURCE = "twitterv2" -NAME = "Twitter API (v2) Search" \ No newline at end of file +NAME = "X/Twitter API (v2) Search" \ No newline at end of file diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 999680b6..8b91d1eb 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -1,5 +1,5 @@ """ -Twitter keyword search via the Twitter API v2 +X/Twitter keyword search via the X API v2 """ import requests import datetime @@ -17,13 +17,10 @@ class SearchWithTwitterAPIv2(Search): """ - Get Tweets via the Twitter API - - This only allows for historical search - use f.ex. TCAT for more advanced - queries. + Get Tweets via the X API """ type = "twitterv2-search" # job ID - title = "Twitter API (v2)" + title = "X/Twitter API (v2)" extension = "ndjson" is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -32,15 +29,15 @@ class SearchWithTwitterAPIv2(Search): import_issues = True references = [ - "[Twitter API documentation](https://developer.twitter.com/en/docs/twitter-api)" + "[X/Twitter API documentation](https://developer.x.com/en/docs/x-api)" ] config = { "twitterv2-search.academic_api_key": { "type": UserInput.OPTION_TEXT, "default": "", - "help": "Academic API Key", - "tooltip": "An API key for the Twitter v2 Academic API. If " + "help": "Research API Key", + "tooltip": "An API key for the X/Twitter v2 Research API. If " "provided, the user will not need to enter their own " "key to retrieve tweets. Note that this API key should " "have access to the Full Archive Search endpoint." @@ -50,15 +47,15 @@ class SearchWithTwitterAPIv2(Search): "default": 0, "min": 0, "max": 10_000_000, - "help": "Max tweets per dataset", + "help": "Max posts per dataset", "tooltip": "4CAT will never retrieve more than this amount of " - "tweets per dataset. 
Enter '0' for unlimited tweets." + "posts per dataset. Enter '0' for unlimited posts." }, "twitterv2-search.id_lookup": { "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Allow lookup by ID", - "tooltip": "If enabled, allow users to enter a list of tweet IDs " + "tooltip": "If enabled, allow users to enter a list of post IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." } @@ -110,7 +107,7 @@ def get_items(self, query): } if self.parameters.get("query_type", "query") == "id_lookup" and self.config.get("twitterv2-search.id_lookup"): - endpoint = "https://api.twitter.com/2/tweets" + endpoint = "https://api.x.com/2/tweets" tweet_ids = self.parameters.get("query", []).split(',') @@ -126,7 +123,7 @@ def get_items(self, query): else: # Query to all or search - endpoint = "https://api.twitter.com/2/tweets/search/" + api_type + endpoint = "https://api.x.com/2/tweets/search/" + api_type queries = [self.parameters.get("query", "")] @@ -158,7 +155,7 @@ def get_items(self, query): while True: if self.interrupted: - raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API") + raise ProcessorInterruptedException("Interrupted while getting posts from the Twitter API") # there is a limit of one request per second, so stay on the safe side of this while self.previous_request == int(time.time()): @@ -188,18 +185,18 @@ def get_items(self, query): try: structured_response = api_response.json() if structured_response.get("title") == "UsageCapExceeded": - self.dataset.update_status("Hit the monthly tweet cap. You cannot capture more tweets " - "until your API quota resets. Dataset completed with tweets " + self.dataset.update_status("Hit the monthly post cap. You cannot capture more posts " + "until your API quota resets. Dataset completed with posts " "collected so far.", is_final=True) return except (json.JSONDecodeError, ValueError): - self.dataset.update_status("Hit Twitter rate limit, but could not figure out why. Halting " - "tweet collection.", is_final=True) + self.dataset.update_status("Hit X's rate limit, but could not figure out why. Halting " + "post collection.", is_final=True) return resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str) + self.dataset.update_status("Hit X's rate limit - waiting until %s to continue." % resume_at_str) while time.time() <= resume_at: if self.interrupted: raise ProcessorInterruptedException("Interrupted while waiting for rate limit to reset") @@ -211,10 +208,10 @@ def get_items(self, query): elif api_response.status_code == 403: try: structured_response = api_response.json() - self.dataset.update_status("'Forbidden' error from the Twitter API. Could not connect to Twitter API " + self.dataset.update_status("'Forbidden' error from the X API. Could not connect to X API " "with this API key. %s" % structured_response.get("detail", ""), is_final=True) except (json.JSONDecodeError, ValueError): - self.dataset.update_status("'Forbidden' error from the Twitter API. Your key may not have access to " + self.dataset.update_status("'Forbidden' error from the X API. 
Your key may not have access to " "the full-archive search endpoint.", is_final=True) finally: return @@ -224,7 +221,7 @@ def get_items(self, query): elif api_response.status_code in (502, 503, 504): resume_at = time.time() + 60 resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c") - self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % ( + self.dataset.update_status("X unavailable (status %i) - waiting until %s to continue." % ( api_response.status_code, resume_at_str)) while time.time() <= resume_at: time.sleep(0.5) @@ -233,7 +230,7 @@ def get_items(self, query): # this usually means the query is too long or otherwise contains # a syntax error elif api_response.status_code == 400: - msg = "Response %i from the Twitter API; " % api_response.status_code + msg = "Response %i from the X API; " % api_response.status_code try: api_response = api_response.json() msg += api_response.get("title", "") @@ -247,19 +244,19 @@ def get_items(self, query): # invalid API key elif api_response.status_code == 401: - self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True) + self.dataset.update_status("Invalid API key - could not connect to X API", is_final=True) return # haven't seen one yet, but they probably exist elif api_response.status_code != 200: self.dataset.update_status( "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True) - self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % ( + self.log.warning("X API v2 responded with status code %i. Response body: %s" % ( api_response.status_code, api_response.text)) return elif not api_response: - self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True) + self.dataset.update_status("Could not connect to X. Cancelling.", is_final=True) return api_response = api_response.json() @@ -291,13 +288,13 @@ def get_items(self, query): if num_missing_objects > 50: # Large amount of missing objects; possible error with Twitter API self.import_issues = False - error_report.append('%i missing objects received following tweet number %i. Possible issue with Twitter API.' % (num_missing_objects, tweets)) + error_report.append('%i missing objects received following post number %i. Possible issue with X API.' 
% (num_missing_objects, tweets)) error_report.append('Missing objects collected: ' + ', '.join(['%s: %s' % (k, len(v)) for k, v in missing_objects.items()])) # Warn if new missing object is recorded (for developers to handle) expected_error_types = ['user', 'media', 'poll', 'tweet', 'place'] if any(key not in expected_error_types for key in missing_objects.keys()): - self.log.warning("Twitter API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) + self.log.warning("X API v2 returned unknown error types: %s" % str([key for key in missing_objects.keys() if key not in expected_error_types])) # Loop through and collect tweets for tweet in api_response.get("data", []): @@ -312,7 +309,7 @@ def get_items(self, query): tweets += 1 if tweets % 500 == 0: - self.dataset.update_status("Received %s of ~%s tweets from the Twitter API" % ("{:,}".format(tweets), expected_tweets)) + self.dataset.update_status("Received %s of ~%s tweets from the X API" % ("{:,}".format(tweets), expected_tweets)) if num_expected_tweets is not None: self.dataset.update_progress(tweets / num_expected_tweets) @@ -474,21 +471,19 @@ def get_options(cls, parent_dataset=None, user=None): max_tweets = config.get("twitterv2-search.max_tweets", user=user) if have_api_key: - intro_text = ("This data source uses the full-archive search endpoint of the Twitter API (v2) to retrieve " + intro_text = ("This data source uses the full-archive search endpoint of the X API (v2) to retrieve " "historic tweets that match a given query.") else: - intro_text = ("This data source uses either the Standard 7-day historical Search endpoint or the " - "full-archive search endpoint of the Twitter API, v2. To use the latter, you must have " - "access to the Academic Research track of the Twitter API. In either case, you will need to " - "provide a valid [bearer " - "token](https://developer.twitter.com/en/docs/authentication/oauth-2-0). The bearer token " - "**will be sent to the 4CAT server**, where it will be deleted after data collection has " - "started. Note that any tweets retrieved with 4CAT will count towards your monthly Tweet " - "retrieval cap.") - - intro_text += ("\n\nPlease refer to the [Twitter API documentation](" - "https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " + intro_text = ("This data source uses the full-archive search endpoint of the X/Twitter API, v2. To use the " + "it, you must have access to the Research track of the X API. You will need to provide a " + "valid [bearer token](https://developer.x.com/en/docs/authentication/oauth-2-0). The " + "bearer token **will be sent to the 4CAT server**, where it will be deleted after data " + "collection has started. Note that any posts retrieved with 4CAT will count towards your " + "monthly post retrieval cap.") + + intro_text += ("\n\nPlease refer to the [X API documentation](" + "https://developer.x.com/en/docs/twitter-api/tweets/search/integrate/build-a-query) " "documentation for more information about this API endpoint and the syntax you can use in your " "search query. 
Retweets are included by default; add `-is:retweet` to exclude them.") @@ -500,16 +495,18 @@ def get_options(cls, parent_dataset=None, user=None): } if not have_api_key: + # options.update({ + # "api_type": { + # "type": UserInput.OPTION_CHOICE, + # "help": "API track", + # "options": { + # "all": "Research API: Full-archive search", + # "recent": "Standard: Recent search (Tweets published in last 7 days)", + # }, + # "default": "all" + # } + # }) options.update({ - "api_type": { - "type": UserInput.OPTION_CHOICE, - "help": "API track", - "options": { - "all": "Academic: Full-archive search", - "recent": "Standard: Recent search (Tweets published in last 7 days)", - }, - "default": "all" - }, "api_bearer_token": { "type": UserInput.OPTION_TEXT, "sensitive": True, @@ -523,10 +520,10 @@ def get_options(cls, parent_dataset=None, user=None): "query_type": { "type": UserInput.OPTION_CHOICE, "help": "Query type", - "tooltip": "Note: Num of Tweets and Date fields ignored with 'Tweets by ID' lookup", + "tooltip": "Note: Num of posts and date fields are ignored with 'Posts by ID' lookup", "options": { "query": "Search query", - "id_lookup": "Tweets by ID (list IDs seperated by commas or one per line)", + "id_lookup": "Posts by ID (list IDs seperated by commas or one per line)", }, "default": "query" } @@ -539,7 +536,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "amount": { "type": UserInput.OPTION_TEXT, - "help": "Tweets to retrieve", + "help": "Posts to retrieve", "tooltip": "0 = unlimited (be careful!)" if not max_tweets else ("0 = maximum (%s)" % str(max_tweets)), "min": 0, "max": max_tweets if max_tweets else 10_000_000, @@ -550,7 +547,7 @@ def get_options(cls, parent_dataset=None, user=None): }, "daterange-info": { "type": UserInput.OPTION_INFO, - "help": "By default, Twitter returns tweets up til 30 days ago. If you want to go back further, you " + "help": "By default, X returns posts up til 30 days ago. If you want to go back further, you " "need to explicitly set a date range." }, "daterange": { @@ -591,7 +588,7 @@ def validate_query(query, request, user): raise QueryParametersException("Please provide a valid bearer token.") if len(query.get("query")) > 1024 and query.get("query_type", "query") != "id_lookup": - raise QueryParametersException("Twitter API queries cannot be longer than 1024 characters.") + raise QueryParametersException("X API queries cannot be longer than 1024 characters.") if query.get("query_type", "query") == "id_lookup" and config.get("twitterv2-search.id_lookup", user=user): # reformat queries to be a comma-separated list with no wrapping @@ -630,7 +627,7 @@ def validate_query(query, request, user): # to dissuade users from running huge queries that will take forever # to process if params["query_type"] == "query" and (params.get("api_type") == "all" or have_api_key): - count_url = "https://api.twitter.com/2/tweets/counts/all" + count_url = "https://api.x.com/2/tweets/counts/all" count_params = { "granularity": "day", "query": params["query"], @@ -668,7 +665,7 @@ def validate_query(query, request, user): elif response.status_code == 401: raise QueryParametersException("Your bearer token seems to be invalid. Please make sure it is valid " - "for the Academic Track of the Twitter API.") + "for the Research track of the X API.") elif response.status_code == 400: raise QueryParametersException("Your query is invalid. 
Please make sure the date range does not " @@ -791,7 +788,7 @@ def map_item(item): "thread_id": item.get("conversation_id", item["id"]), "timestamp": tweet_time.strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(tweet_time.timestamp()), - 'link': "https://twitter.com/%s/status/%s" % (author_username, item.get('id')), + 'link': "https://x.com/%s/status/%s" % (author_username, item.get('id')), "subject": "", "body": item["text"], "author": author_username, From ded8d3df49e7b3cf1e142bfc80019edb43f1adfe Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 11 Nov 2024 16:29:30 +0100 Subject: [PATCH 25/48] Do not instantiate logging handlers twice --- common/lib/logger.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/common/lib/logger.py b/common/lib/logger.py index bbd30c44..ddffa2d7 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log' self.logger.setLevel(log_level) # this handler manages the text log files - handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) - handler.setLevel(log_level) - handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", - "%d-%m-%Y %H:%M:%S")) - self.logger.addHandler(handler) - - # the slack webhook has its own handler, and is only active if the - # webhook URL is set - try: - if config.get("logging.slack.webhook"): - slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) - slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) - self.logger.addHandler(slack_handler) - except Exception: - # we *may* need the logger before the database is in working order - if config.db is not None: - config.db.rollback() + if not self.logger.handlers: + handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1) + handler.setLevel(log_level) + handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s", + "%d-%m-%Y %H:%M:%S")) + self.logger.addHandler(handler) + + # the slack webhook has its own handler, and is only active if the + # webhook URL is set + try: + if config.get("logging.slack.webhook"): + slack_handler = SlackLogHandler(config.get("logging.slack.webhook")) + slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level)) + self.logger.addHandler(slack_handler) + except Exception: + # we *may* need the logger before the database is in working order + if config.db is not None: + config.db.rollback() def log(self, message, level=logging.INFO, frame=None): """ From dc173249e8e82f794f4b857249108fe191adf370 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 13 Nov 2024 11:19:40 +0100 Subject: [PATCH 26/48] Remove .readthedocs.yaml We're not doing docs at the moment --- .readthedocs.yaml | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index faaf6921..00000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-20.04 - tools: - python: "3.8" - -# Build documentation in the docs/ directory with Sphinx 
-sphinx: - configuration: docs/conf.py - -# Optionally build your docs in additional formats such as PDF -# formats: -# - pdf - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: docs/requirements.txt \ No newline at end of file From 4e5ef88be340b1880281f07587cbdbc2bbd74313 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 13 Nov 2024 14:11:37 +0100 Subject: [PATCH 27/48] Pass modules to dataset in video timelines processor --- processors/visualisation/video_timelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index f668e6f5..9270fb3f 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -207,7 +207,7 @@ def get_video_labels(self, metadata): labels[filename] = filename for dataset, urls in mapping_dataset.items(): - dataset = DataSet(key=dataset, db=self.db).nearest("*-search") + dataset = DataSet(key=dataset, db=self.db, modules=self.modules).nearest("*-search") # determine appropriate label # is this the right place? should it be in the datasource? From 9197c99af5b2bc27a2e98c0bb72e761712ff91f3 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 18 Nov 2024 12:16:48 +0100 Subject: [PATCH 28/48] Avoid use of chdir() --- common/lib/helpers.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 5fe5df48..cd26d575 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -9,6 +9,7 @@ import smtplib import fnmatch import socket +import shlex import copy import time import json @@ -112,10 +113,8 @@ def get_git_branch(): repository or git is not installed an empty string is returned. """ try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE) - os.chdir(cwd) + root_dir = str(config.get('PATH_ROOT').resolve()) + branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE) if branch.returncode != 0: raise ValueError() return branch.stdout.decode("utf-8").strip() @@ -145,7 +144,6 @@ def get_software_commit(worker=None): # try git command line within the 4CAT root folder # if it is a checked-out git repository, it will tell us the hash of # the currently checked-out commit - cwd = os.getcwd() # path has no Path.relative()... 
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent @@ -155,24 +153,24 @@ def get_software_commit(worker=None): # useful version info (since the extension is by definition not in the # main 4CAT repository) and will return an empty value if worker and worker.is_extension: - extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) - os.chdir(extension_dir) + working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve()) # check if we are in the extensions' own repo or 4CAT's - repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel" + repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): # not its own repository return ("", "") else: - os.chdir(config.get("PATH_ROOT")) + working_dir = str(config.get("PATH_ROOT").resolve()) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if show.returncode != 0: raise ValueError() commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] # now get the repository the commit belongs to, if we can - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() repository = origin.stdout.decode("utf-8").strip() @@ -182,9 +180,6 @@ def get_software_commit(worker=None): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: return ("", "") - finally: - os.chdir(cwd) - return (commit, repository) def get_software_version(): @@ -280,7 +275,6 @@ def find_extensions(): # collect metadata for extensions allowed_metadata_keys = ("name", "version", "url") - cwd = os.getcwd() for extension in extensions: extension_folder = extension_path.joinpath(extension) metadata_file = extension_folder.joinpath("metadata.json") @@ -297,8 +291,8 @@ def find_extensions(): if extensions[extension]["is_git"]: # try to get remote URL try: - os.chdir(extension_folder) - origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + extension_root = str(extension_folder.resolve()) + origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE) if origin.returncode != 0 or not origin.stdout: raise ValueError() @@ -310,8 +304,6 @@ def find_extensions(): except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: print(e) pass - finally: - os.chdir(cwd) return extensions, errors From 9453b76099ec03b3cea859f812890adb36e13df9 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 18 Nov 2024 13:57:27 +0100 Subject: [PATCH 29/48] Don't crash on skipped videos in scene detector --- processors/visualisation/video_scene_identifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index 634e8c49..5140baa0 100644 --- 
a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -252,8 +252,9 @@ def process(self): if video_data.get('success'): files = video_data.get('files') if 'files' in video_data else [{"filename": video_data.get("filename"), "success":True}] for file in files: - if not file.get("success"): + if not file.get("success") or file.get("filename") not in collected_scenes: continue + # List types are not super fun for CSV if 'post_ids' in video_data: video_data['post_ids'] = ','.join([str(i) for i in video_data['post_ids']]) From 2d4d60f9873fd1a3fd7b8ba2b7ddbc50d17c44f2 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 19 Nov 2024 11:50:32 +0100 Subject: [PATCH 30/48] fix video_hasher to properly cleanup videos with errors --- processors/visualisation/video_hasher.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ff1222bc..f23b3d0c 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -183,8 +183,9 @@ def process(self): self.dataset.log('Frames per seconds: %f' % frame_interval) # Prepare staging area for videos and video tracking + # VideoHash creates various files that may not be cleaned up on error so we use an output directory staging_area = self.dataset.get_staging_area() - self.dataset.log('Staging directory location: %s' % staging_area) + output_dir = self.dataset.get_staging_area() video_hashes = {} video_metadata = None @@ -224,7 +225,7 @@ def process(self): video_hashes[path.name] = {'videohash': videohash} - shutil.copy(videohash.collage_path, staging_area.joinpath(path.stem + '.jpg')) + shutil.copy(videohash.collage_path, output_dir.joinpath(path.stem + '.jpg')) video_hashes[path.name]['video_collage_filename'] = path.stem + '.jpg' processed_videos += 1 @@ -240,7 +241,7 @@ def process(self): if video_metadata is None: # Grab the metadata directly, if it exists but was skipped (e.g., not found prior to max_videos) try: - metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area) + metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file, output_dir) except FileNotFoundError: metadata_path = None if metadata_path: @@ -293,7 +294,7 @@ def process(self): num_posts += 1 writer = None - with staging_area.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: + with output_dir.joinpath("video_hashes.csv").open("w", encoding="utf-8", newline="") as outfile: for row in rows: if not writer: writer = csv.DictWriter(outfile, fieldnames=row.keys()) @@ -303,7 +304,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(staging_area) + self.write_archive_and_finish(output_dir) class VideoHashNetwork(BasicProcessor): """ From 176905a6307e9c1afeb7b9f36083a26893ce9ec0 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 19 Nov 2024 12:40:49 +0100 Subject: [PATCH 31/48] fixes to video frames when all videos are corrupt --- processors/visualisation/video_frames.py | 18 +++++++++++------- processors/visualisation/video_hasher.py | 13 +++++++++---- processors/visualisation/video_timelines.py | 7 +++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 64b0c4f3..ec95f84f 100644 --- 
a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -94,7 +94,7 @@ def process(self): processed_videos = 0 self.dataset.update_status("Extracting video frames") - for path in self.iterate_archive_contents(self.source_file, staging_area): + for i, path in enumerate(self.iterate_archive_contents(self.source_file, staging_area)): if self.interrupted: raise ProcessorInterruptedException("Interrupted while determining image wall order") @@ -138,17 +138,21 @@ def process(self): outfile.write(ffmpeg_error) if result.returncode != 0: - error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode)) - self.dataset.log(error) + self.dataset.update_status(f"Unable to extract frames from video {vid_name} (see logs for details)") + self.dataset.log('Error Return Code (%s) with video %s: %s' % (str(result.returncode), vid_name, "\n".join(ffmpeg_error.split('\n')[-2:]) if ffmpeg_error else '')) + else: + processed_videos += 1 + self.dataset.update_status("Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - processed_videos += 1 - self.dataset.update_status( - "Created frames for %i of %i videos" % (processed_videos, total_possible_videos)) - self.dataset.update_progress(processed_videos / total_possible_videos) + self.dataset.update_progress(i / total_possible_videos) # Finish up # We've created a directory and folder structure here as opposed to a single folder with single files as # expected by self.write_archive_and_finish() so we use make_archive instead + if not processed_videos: + self.dataset.finish_with_error("Unable to extract frames from any videos") + return + from shutil import make_archive make_archive(self.dataset.get_results_path().with_suffix(''), "zip", output_directory) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index f23b3d0c..aad1baf6 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -216,11 +216,12 @@ def process(self): self.dataset.update_status("FFmpeg software not found. 
Please contact 4CAT maintainers.", is_final=True) self.dataset.finish(0) return - except FileNotFoundError as e: - self.dataset.update_status(f"Unable to find file {str(path)}") + except FileNotFoundError: + self.dataset.update_status(f"Unable to find file {path.name}") continue except FFmpegFailedToExtractFrames as e: - self.dataset.update_status(f"Unable to extract frame for {str(path)}: {e}") + self.dataset.update_status(f"Unable to extract frame for {path.name} (see log for details)") + self.dataset.log(f"Unable to extract frame for {str(path)}: {e}") continue video_hashes[path.name] = {'videohash': videohash} @@ -234,6 +235,10 @@ def process(self): self.dataset.update_progress(processed_videos / total_possible_videos) videohash.delete_storage_path() + if processed_videos == 0: + self.dataset.finish_with_error("Unable to create video hashes for any videos") + return + # Write hash file # This file is held here and then copied as its own dataset via VideoHasherTwo num_posts = 0 @@ -304,7 +309,7 @@ def process(self): # Finish up self.dataset.update_status(f'Created {num_posts} video hashes and stored video collages') - self.write_archive_and_finish(output_dir) + self.write_archive_and_finish(output_dir, num_items=processed_videos) class VideoHashNetwork(BasicProcessor): """ diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index 9270fb3f..3c73e57f 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -117,6 +117,9 @@ def process(self): if previous_video is not None or not looping: # draw the video filename/label on top of the rendered # frame thumbnails + if not previous_video: + # This likely means no frames were found for the video and this processor should not have run + continue video_label = labels.get(previous_video, previous_video) footersize = (fontsize * (len(video_label) + 2) * 0.5925, fontsize * 2) footer_shape = SVG(insert=(0, base_height - footersize[1]), size=footersize) @@ -165,6 +168,10 @@ def process(self): timeline.add(frame_element) timeline_widths[video] += frame_width + if not timeline_widths: + self.dataset.finish_with_error("No video frames found") + return + # now we know all dimensions we can instantiate the canvas too canvas_width = max(timeline_widths.values()) fontsize = 12 From da8328edbbcf64395e69d06b5abf3708e5d60cc8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 22 Nov 2024 16:59:45 +0100 Subject: [PATCH 32/48] Don't show link thumbnails in Bsky widget --- webtool/templates/frontpage.html | 1 + 1 file changed, 1 insertion(+) diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index fa5cbc73..fee8c095 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -20,6 +20,7 @@

About this server

4CAT updates

Date: Mon, 25 Nov 2024 11:51:11 +0100 Subject: [PATCH 33/48] Don't ignore TikTok comments with missing author_pin --- datasources/tiktok_comments/search_tiktok_comments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index efaffc21..31471fcd 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -58,7 +58,7 @@ def map_item(item): "post_url": item["share_info"]["url"].split(".html")[0], "post_body": item["share_info"]["title"], "comment_url": item["share_info"]["url"], - "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no", + "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no", "is_sticky": "yes" if bool(item["stick_position"]) else "no", "is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes", "language_guess": item["comment_language"] From 0792ef4dae41ddf4f282a82801fced557738e807 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 27 Nov 2024 18:07:14 +0100 Subject: [PATCH 34/48] Don't show version in footer unless logged in --- webtool/templates/layout.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webtool/templates/layout.html b/webtool/templates/layout.html index e1ecda59..33f4ad61 100644 --- a/webtool/templates/layout.html +++ b/webtool/templates/layout.html @@ -85,7 +85,9 @@

{% endif %}
  • How to cite
  • Help & Bug Reports
  • + {% if current_user.is_authenticated %}
  • v{{ __version }}
  • + {% endif %}
  • OILab, 2018 – {{ __datenow.year }}
  • From 1396bb5f79fe8d1f77f47ef4f258b91f123bb2f3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 29 Nov 2024 11:15:54 +0100 Subject: [PATCH 35/48] pass through modules in merge_datasets --- processors/conversion/merge_datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 860c0ddb..461cdd54 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None): return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) @staticmethod - def get_dataset_from_url(url, db): + def get_dataset_from_url(url, db, modules=None): """ Get dataset object based on dataset URL @@ -68,6 +68,7 @@ def get_dataset_from_url(url, db): :param str url: Dataset URL :param db: Database handler (to retrieve metadata) + :param modules: Modules handler (pass through to DataSet) :return DataSet: The dataset """ if not url: @@ -75,7 +76,7 @@ def get_dataset_from_url(url, db): source_url = ural.normalize_url(url) source_key = source_url.split("/")[-1] - return DataSet(key=source_key, db=db) + return DataSet(key=source_key, db=db, modules=modules) def process(self): """ @@ -96,7 +97,7 @@ def process(self): continue try: - source_dataset = self.get_dataset_from_url(source_dataset_url, self.db) + source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules) except DataSetException: return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform " f"merge.") From cb2ef691153ba1c3b1c78e567ae070480df14e72 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:42:42 +0100 Subject: [PATCH 36/48] Shorten URLs in CSV preview (links still work) --- webtool/lib/template_filters.py | 34 ++++++++++++++++++++++++++++-- webtool/templates/preview/csv.html | 2 +- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 6ac9272b..c1ec867a 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -1,5 +1,7 @@ import urllib.parse import datetime +from math import floor + import markdown import json import ural @@ -120,7 +122,7 @@ def _jinja2_filter_httpquery(data): return "" @app.template_filter("add_ahref") -def _jinja2_filter_add_ahref(content): +def _jinja2_filter_add_ahref(content, ellipsiate=0): """ Add HTML links to text @@ -135,7 +137,11 @@ def _jinja2_filter_add_ahref(content): return content for link in set(ural.urls_from_text(str(content))): - content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link}') + if ellipsiate > 0: + link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]") + else: + link_text = link + content = content.replace(link, f'", "%3E").replace(chr(34), "%22")}" rel="external">{link_text}') return content @@ -200,6 +206,30 @@ def _jinja2_filter_extension_to_noun(ext): else: return "item" +@app.template_filter("ellipsiate") +def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"): + if len(text) <= length: + return text + + elif not inside: + return text[:length] + ellipsis_str + + else: + # two cases: URLs and normal text + # for URLs, try to only ellipsiate after the domain name + # this makes the URLs easier to read when shortened + if ural.is_url(text): + pre_part = "/".join(text.split("/")[:3]) + if len(pre_part) < 
length - 6: # kind of arbitrary + before = len(pre_part) + 1 + else: + before = floor(length / 2) + else: + before = floor(length / 2) + + after = len(text) - before + return text[:before] + ellipsis_str + text[after:] + @app.template_filter('4chan_image') def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): diff --git a/webtool/templates/preview/csv.html b/webtool/templates/preview/csv.html index fc36bb9d..d2473735 100644 --- a/webtool/templates/preview/csv.html +++ b/webtool/templates/preview/csv.html @@ -20,7 +20,7 @@ {% endif %} {% endif %} - {{ cell|e|add_ahref|safe }} + {{ cell|e|add_ahref(ellipsiate=50)|safe }} {% endfor %} From 8e660a4674b5e570a51730a342c3336437ab9817 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:43:50 +0100 Subject: [PATCH 37/48] Fix author thumbnail in TikTok mapping --- datasources/tiktok/search_tiktok.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index f7cb7590..2c5a51c5 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -50,16 +50,16 @@ def map_item(post): # from intercepted API response user_nickname = post["author"]["uniqueId"] user_fullname = post["author"]["nickname"] - user_id = post["author"]["id"] + user_thumbnail = post["author"].get("avatarThumb", "") elif post.get("author"): # from embedded JSON object user_nickname = post["author"] user_fullname = post["nickname"] - user_id = "" + user_thumbnail = "" else: user_nickname = "" user_fullname = "" - user_id = "" + user_thumbnail = "" # there are various thumbnail URLs, some of them expire later than # others. Try to get the highest-resolution one that hasn't expired From 2f4211354c1b15d41f850ca9bace3fb9a69070e2 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:44:02 +0100 Subject: [PATCH 38/48] Add is_sensitive and is_photosensitive columns to TikTok mapping --- datasources/tiktok/search_tiktok.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 2c5a51c5..6bee534d 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -84,13 +84,15 @@ def map_item(post): "author_followers": post.get("authorStats", {}).get("followerCount", ""), "author_likes": post.get("authorStats", {}).get("diggCount", ""), "author_videos": post.get("authorStats", {}).get("videoCount", ""), - "author_avatar": post.get("avatarThumb", ""), + "author_avatar": user_thumbnail, "body": post["desc"], "timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'), "unix_timestamp": int(post["createTime"]), "is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no", "is_ad": "yes" if post.get("isAd", False) else "no", "is_paid_partnership": "yes" if post.get("adAuthorization") else "no", + "is_sensitive": "yes" if post.get("maskType") == 3 else "no", + "is_photosensitive": "yes" if post.get("maskType") == 4 else "no", "music_name": post["music"]["title"], "music_id": post["music"]["id"], "music_url": post["music"].get("playUrl", ""), From 8da18b397c28888160ae5e434390cb2b1f59547b Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 11:47:19 +0100 Subject: [PATCH 39/48] Zebra striping in csv preview table to help readability --- webtool/static/css/dataset-page.css | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 8e99832f..9eae3229 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -621,6 +621,10 @@ body.csv-preview table td, body.csv-preview table th { border: 1px solid var(--gray-light); } +body.csv-preview table tr:nth-child(2n+1) { + background: var(--contrast-bright); +} + .child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe { display: none; } From 0abe88569b175a6c956cb2441883bd8d5509a284 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 29 Nov 2024 18:37:19 +0100 Subject: [PATCH 40/48] Delete unused webtool helper functions --- webtool/lib/helpers.py | 43 ------------------------------------------ 1 file changed, 43 deletions(-) diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index 6cc91eba..d0e74a37 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -96,30 +96,6 @@ def error(code=200, **kwargs): return response -def string_to_timestamp(string): - """ - Convert dd-mm-yyyy date to unix time - - :param string: Date string to parse - :return: The unix time, or 0 if value could not be parsed - """ - bits = string.split("-") - if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string): - bits = list(reversed(bits)) - - if len(bits) != 3: - return 0 - - try: - day = int(bits[0]) - month = int(bits[1]) - year = int(bits[2]) - date = datetime.datetime(year, month, day) - except ValueError: - return 0 - - return int(date.timestamp()) - def pad_interval(intervals, first_interval=None, last_interval=None): """ Pad an interval so all intermediate intervals are filled @@ -299,25 +275,6 @@ def generate_css_colours(force=False): ) -def get_preview(query): - """ - Generate a data preview of 25 rows of a results csv - - :param query - :return list: - """ - preview = [] - with query.get_results_path().open(encoding="utf-8") as resultfile: - posts = csv.DictReader(resultfile) - i = 0 - for post in posts: - i += 1 - preview.append(post) - if i > 25: - break - return preview - - def format_chan_post(post): """ Format a plain-text imageboard post post for HTML display From 6881cbadf36f1ff28c39543ce25a6e8b8796e31e Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 2 Dec 2024 22:42:40 +0100 Subject: [PATCH 41/48] Add option to TikTok image downloader for user avatars --- processors/visualisation/download_tiktok.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index c02b53bf..3854e965 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor): "options": { "thumbnail": "Video Thumbnail", "music": "Music Thumbnail", + "author_avatar": "User avatar" }, "default": "thumbnail" } @@ -217,6 +218,8 @@ def process(self): url_column = "thumbnail_url" elif self.parameters.get("thumb_type") == "music": url_column = "music_thumbnail" + elif self.parameters.get("thumb_type") == "author_avatar": + url_column = "author_avatar" else: self.dataset.update_status("No image column selected.", is_final=True) self.dataset.finish(0) From e53b73f75a5acfa0373072d5786d07fa5d44a9bc Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Tue, 3 Dec 2024 17:00:48 +0100 Subject: [PATCH 42/48] Option for co-tag networks to ignore certain tags --- processors/networks/cotag_network.py | 8 ++++++++ processors/networks/two-column-network.py | 15 
+++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 236e9577..139b2ac9 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset): "default": True, "help": "Convert tags to lowercase", "tooltip": "Merges tags with varying cases" + }, + "ignore-tags": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Tags to ignore", + "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' " + "character." } } @@ -72,6 +79,7 @@ def get_processor_pipeline(self): "split-comma": True, "categorise": True, "allow-loops": False, + "ignore-nodes": self.parameters.get("ignore-tags", ""), "to-lowercase": self.parameters.get("to-lowercase", True) } } diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 0f604570..43ceffdf 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor): "default": False, "help": "Convert values to lowercase", "tooltip": "Merges values with varying cases" + }, + "ignore-nodes": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Nodes to ignore", + "tooltip": "Separate with commas if you want to ignore multiple nodes" } } @@ -145,6 +151,7 @@ def process(self): allow_loops = self.parameters.get("allow-loops") interval_type = self.parameters.get("interval") to_lower = self.parameters.get("to-lowercase", False) + ignoreable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()] processed = 0 @@ -193,6 +200,14 @@ def process(self): values_a = [value.strip() for value_groups in values_a for value in value_groups.split(",")] values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")] + if ignoreable: + values_a = [v for v in values_a if v not in ignoreable] + values_b = [v for v in values_b if v not in ignoreable] + + # only proceed if we actually have any edges left + if not values_a or not values_b: + continue + try: interval = get_interval_descriptor(item, interval_type) except ValueError as e: From 5021e85302fe8cf16b783496052929cb30287820 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 4 Dec 2024 11:11:28 +0100 Subject: [PATCH 43/48] Cast to str before word tree-izing --- processors/visualisation/word-trees.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 0dfe2d40..0a1f235e 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -212,6 +212,12 @@ def process(self): if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) body = post.get(column) + + try: + body = str(body) + except TypeError: + continue + if not body: continue From 3f06845a0e2dc63e772a071e77748e116eef896d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Dec 2024 13:32:23 +0100 Subject: [PATCH 44/48] tokenizer group by sentence fix: nltk renamed lang packs --- processors/text-analysis/tokenise.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 17c350c8..1ee3b199 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -226,6 +226,7 @@ 
def process(self): The result is valid JSON, written in chunks. """ + sentence_error = False columns = self.parameters.get("columns") if not columns: self.dataset.update_status("No columns selected, aborting.", is_final=True) @@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if - 'pickle' in lang]: + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") sentence_method = dummy_function + sentence_error = True else: sentence_method = sent_tokenize else: @@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs): with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile: json.dump(metadata, outfile) + if sentence_error: + self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True) + # create zip of archive and delete temporary files and folder self.write_archive_and_finish(staging_area) From 1765e8066e74624cd4b89cf96737f90901655336 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 10 Dec 2024 12:00:47 +0100 Subject: [PATCH 45/48] download video: handle broken connection in video download; also stop and remove files that exceed max file size --- processors/visualisation/download_videos.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 2b385ffe..08a632b8 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -3,6 +3,7 @@ First attempt to download via request, but if that fails use yt-dlp """ +import os import json import re import time @@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT") # Size unknown elif not self.config.get("video-downloader.allow-unknown-size", False): - FilesizeException("Video size unknown; not allowed to download per 4CAT settings") + raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings") # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) - with open(results_path.joinpath(save_location), "wb") as f: - for chunk in response.iter_content(chunk_size=1024 * 1024): - if chunk: - f.write(chunk) + try: + with open(results_path.joinpath(save_location), "wb") as f: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): + # File size too large; stop download and remove file + os.remove(f.name) + raise FilesizeException("Video size larger than maximum allowed per 4CAT") + if chunk: + f.write(chunk) + except ChunkedEncodingError as e: + raise FailedDownload(f"Failed to download video: {e}") # Return filename to add to metadata return save_location.name From 8450304ab156fe27302412dcf4a112da0689074a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 10 Dec 2024 12:13:45 +0100 Subject: [PATCH 46/48] video_download: forgot import exception 
type --- processors/visualisation/download_videos.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index 08a632b8..d1d7bd67 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -607,8 +607,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie # Download video self.dataset.update_status( "Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url)) - try: - with open(results_path.joinpath(save_location), "wb") as f: + with open(results_path.joinpath(save_location), "wb") as f: + try: for chunk in response.iter_content(chunk_size=1024 * 1024): if not max_video_size == 0 and f.tell() > (max_video_size * 1000000): # File size too large; stop download and remove file @@ -616,8 +616,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie raise FilesizeException("Video size larger than maximum allowed per 4CAT") if chunk: f.write(chunk) - except ChunkedEncodingError as e: - raise FailedDownload(f"Failed to download video: {e}") + except requests.exceptions.ChunkedEncodingError as e: + raise FailedDownload(f"Failed to complete download: {e}") # Return filename to add to metadata return save_location.name From a296ff03c983103b902a830c585efd426e349ece Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 11 Dec 2024 12:09:24 +0100 Subject: [PATCH 47/48] export_datasets fix: only finish export dataset if primary dataset is not finished; children should be skipped instead --- processors/conversion/export_datasets.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index bd7b8128..fbda0e85 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -40,6 +40,11 @@ def process(self): This takes a CSV file as input and writes the same data as a JSON file """ self.dataset.update_status("Collecting dataset and all analyses") + primary_dataset = self.dataset.top_parent() + if not primary_dataset.is_finished(): + # This ought not happen as processors (i.e., this processor) should only be available for finished datasets + self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until dataset is finished to export.") + return results_path = self.dataset.get_staging_area() @@ -52,25 +57,26 @@ def process(self): try: dataset = DataSet(key=dataset_key, db=self.db) - # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
except DataSetException: - self.dataset.finish_with_error("Dataset not found.") - return + self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.") + failed_exports.append(dataset_key) + continue if not dataset.is_finished(): - self.dataset.finish_with_error("You cannot export unfinished datasets.") - return + self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.") + failed_exports.append(dataset_key) + continue # get metadata metadata = dataset.get_metadata() if metadata["num_rows"] == 0: - self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.") failed_exports.append(dataset_key) continue # get data data_file = dataset.get_results_path() if not data_file.exists(): - self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.") failed_exports.append(dataset_key) continue From a60ac61cae5d3b4dc5d9f8e97ba7f30a0d2b2af4 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 11 Dec 2024 12:30:34 +0100 Subject: [PATCH 48/48] export_dataset: note that filters must be exported separately in description --- processors/conversion/export_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index fbda0e85..f6c8bcc1 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod