Date: Fri, 22 Nov 2024 16:59:45 +0100
Subject: [PATCH 32/48] Don't show link thumbnails in Bsky widget
---
webtool/templates/frontpage.html | 1 +
1 file changed, 1 insertion(+)
diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html
index fa5cbc73..fee8c095 100644
--- a/webtool/templates/frontpage.html
+++ b/webtool/templates/frontpage.html
@@ -20,6 +20,7 @@ About this server
4CAT updates
Date: Mon, 25 Nov 2024 11:51:11 +0100
Subject: [PATCH 33/48] Don't ignore TikTok comments with missing author_pin
---
datasources/tiktok_comments/search_tiktok_comments.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py
index efaffc21..31471fcd 100644
--- a/datasources/tiktok_comments/search_tiktok_comments.py
+++ b/datasources/tiktok_comments/search_tiktok_comments.py
@@ -58,7 +58,7 @@ def map_item(item):
"post_url": item["share_info"]["url"].split(".html")[0],
"post_body": item["share_info"]["title"],
"comment_url": item["share_info"]["url"],
- "is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no",
+ "is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no",
"is_sticky": "yes" if bool(item["stick_position"]) else "no",
"is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes",
"language_guess": item["comment_language"]
From 0792ef4dae41ddf4f282a82801fced557738e807 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Wed, 27 Nov 2024 18:07:14 +0100
Subject: [PATCH 34/48] Don't show version in footer unless logged in
---
webtool/templates/layout.html | 2 ++
1 file changed, 2 insertions(+)
diff --git a/webtool/templates/layout.html b/webtool/templates/layout.html
index e1ecda59..33f4ad61 100644
--- a/webtool/templates/layout.html
+++ b/webtool/templates/layout.html
@@ -85,7 +85,9 @@
{% endif %}
How to cite
Help & Bug Reports
+ {% if current_user.is_authenticated %}
v{{ __version }}
+ {% endif %}
OILab, 2018 – {{ __datenow.year }}
From 1396bb5f79fe8d1f77f47ef4f258b91f123bb2f3 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Fri, 29 Nov 2024 11:15:54 +0100
Subject: [PATCH 35/48] pass through modules in merge_datasets
---
processors/conversion/merge_datasets.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py
index 860c0ddb..461cdd54 100644
--- a/processors/conversion/merge_datasets.py
+++ b/processors/conversion/merge_datasets.py
@@ -60,7 +60,7 @@ def is_compatible_with(cls, module=None, user=None):
return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector())
@staticmethod
- def get_dataset_from_url(url, db):
+ def get_dataset_from_url(url, db, modules=None):
"""
Get dataset object based on dataset URL
@@ -68,6 +68,7 @@ def get_dataset_from_url(url, db):
:param str url: Dataset URL
:param db: Database handler (to retrieve metadata)
+ :param modules: Modules handler (pass through to DataSet)
:return DataSet: The dataset
"""
if not url:
@@ -75,7 +76,7 @@ def get_dataset_from_url(url, db):
source_url = ural.normalize_url(url)
source_key = source_url.split("/")[-1]
- return DataSet(key=source_key, db=db)
+ return DataSet(key=source_key, db=db, modules=modules)
def process(self):
"""
@@ -96,7 +97,7 @@ def process(self):
continue
try:
- source_dataset = self.get_dataset_from_url(source_dataset_url, self.db)
+ source_dataset = self.get_dataset_from_url(source_dataset_url, self.db, modules=self.modules)
except DataSetException:
return self.dataset.finish_with_error(f"Dataset URL '{source_dataset_url} not found - cannot perform "
f"merge.")
From cb2ef691153ba1c3b1c78e567ae070480df14e72 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Fri, 29 Nov 2024 11:42:42 +0100
Subject: [PATCH 36/48] Shorten URLs in CSV preview
(links still work)
---
webtool/lib/template_filters.py | 34 ++++++++++++++++++++++++++++--
webtool/templates/preview/csv.html | 2 +-
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index 6ac9272b..c1ec867a 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -1,5 +1,7 @@
import urllib.parse
import datetime
+from math import floor
+
import markdown
import json
import ural
@@ -120,7 +122,7 @@ def _jinja2_filter_httpquery(data):
return ""
@app.template_filter("add_ahref")
-def _jinja2_filter_add_ahref(content):
+def _jinja2_filter_add_ahref(content, ellipsiate=0):
"""
Add HTML links to text
@@ -135,7 +137,11 @@ def _jinja2_filter_add_ahref(content):
return content
for link in set(ural.urls_from_text(str(content))):
- content = content.replace(link, f'&lt;a href="{link.replace("&lt;", "%3C").replace("&gt;", "%3E").replace(chr(34), "%22")}" rel="external"&gt;{link}&lt;/a&gt;')
+ if ellipsiate > 0:
+ link_text = _jinja2_filter_ellipsiate(link, ellipsiate, True, "[…]")
+ else:
+ link_text = link
+ content = content.replace(link, f'&lt;a href="{link.replace("&lt;", "%3C").replace("&gt;", "%3E").replace(chr(34), "%22")}" rel="external"&gt;{link_text}&lt;/a&gt;')
return content
@@ -200,6 +206,30 @@ def _jinja2_filter_extension_to_noun(ext):
else:
return "item"
+@app.template_filter("ellipsiate")
+def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"):
+ if len(text) <= length:
+ return text
+
+ elif not inside:
+ return text[:length] + ellipsis_str
+
+ else:
+ # two cases: URLs and normal text
+ # for URLs, try to only ellipsiate after the domain name
+ # this makes the URLs easier to read when shortened
+ if ural.is_url(text):
+ pre_part = "/".join(text.split("/")[:3])
+ if len(pre_part) < length - 6: # kind of arbitrary
+ before = len(pre_part) + 1
+ else:
+ before = floor(length / 2)
+ else:
+ before = floor(length / 2)
+
+ after = len(text) - before
+ return text[:before] + ellipsis_str + text[after:]
+
@app.template_filter('4chan_image')
def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5):
diff --git a/webtool/templates/preview/csv.html b/webtool/templates/preview/csv.html
index fc36bb9d..d2473735 100644
--- a/webtool/templates/preview/csv.html
+++ b/webtool/templates/preview/csv.html
@@ -20,7 +20,7 @@
{% endif %}
{% endif %}
- {{ cell|e|add_ahref|safe }}
+ {{ cell|e|add_ahref(ellipsiate=50)|safe }}
{% endfor %}
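Note: a simplified standalone sketch of what the new ellipsiate filter does (the URL-aware branch that keeps the domain intact is omitted, and the sample URL is made up). In the CSV preview the full URL still ends up in the href attribute; only the visible link text is shortened via add_ahref(ellipsiate=50):

    from math import floor

    def ellipsiate(text, length, inside=False, ellipsis_str="…"):
        # keep the head and tail of the string, replace the middle with an ellipsis
        if len(text) <= length:
            return text
        if not inside:
            return text[:length] + ellipsis_str
        before = floor(length / 2)
        after = len(text) - before
        return text[:before] + ellipsis_str + text[after:]

    url = "https://example.com/a/very/long/path?with=many&query=parameters&tracking=identifiers"
    print(ellipsiate(url, 50, inside=True))  # prints the URL with its middle replaced by "…"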
From 8e660a4674b5e570a51730a342c3336437ab9817 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Fri, 29 Nov 2024 11:43:50 +0100
Subject: [PATCH 37/48] Fix author thumbnail in TikTok mapping
---
datasources/tiktok/search_tiktok.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py
index f7cb7590..2c5a51c5 100644
--- a/datasources/tiktok/search_tiktok.py
+++ b/datasources/tiktok/search_tiktok.py
@@ -50,16 +50,16 @@ def map_item(post):
# from intercepted API response
user_nickname = post["author"]["uniqueId"]
user_fullname = post["author"]["nickname"]
- user_id = post["author"]["id"]
+ user_thumbnail = post["author"].get("avatarThumb", "")
elif post.get("author"):
# from embedded JSON object
user_nickname = post["author"]
user_fullname = post["nickname"]
- user_id = ""
+ user_thumbnail = ""
else:
user_nickname = ""
user_fullname = ""
- user_id = ""
+ user_thumbnail = ""
# there are various thumbnail URLs, some of them expire later than
# others. Try to get the highest-resolution one that hasn't expired
From 2f4211354c1b15d41f850ca9bace3fb9a69070e2 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Fri, 29 Nov 2024 11:44:02 +0100
Subject: [PATCH 38/48] Add is_sensitive and is_photosensitive columns to
TikTok mapping
---
datasources/tiktok/search_tiktok.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py
index 2c5a51c5..6bee534d 100644
--- a/datasources/tiktok/search_tiktok.py
+++ b/datasources/tiktok/search_tiktok.py
@@ -84,13 +84,15 @@ def map_item(post):
"author_followers": post.get("authorStats", {}).get("followerCount", ""),
"author_likes": post.get("authorStats", {}).get("diggCount", ""),
"author_videos": post.get("authorStats", {}).get("videoCount", ""),
- "author_avatar": post.get("avatarThumb", ""),
+ "author_avatar": user_thumbnail,
"body": post["desc"],
"timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'),
"unix_timestamp": int(post["createTime"]),
"is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no",
"is_ad": "yes" if post.get("isAd", False) else "no",
"is_paid_partnership": "yes" if post.get("adAuthorization") else "no",
+ "is_sensitive": "yes" if post.get("maskType") == 3 else "no",
+ "is_photosensitive": "yes" if post.get("maskType") == 4 else "no",
"music_name": post["music"]["title"],
"music_id": post["music"]["id"],
"music_url": post["music"].get("playUrl", ""),
From 8da18b397c28888160ae5e434390cb2b1f59547b Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Fri, 29 Nov 2024 11:47:19 +0100
Subject: [PATCH 39/48] Zebra striping in csv preview table to help readability
---
webtool/static/css/dataset-page.css | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css
index 8e99832f..9eae3229 100644
--- a/webtool/static/css/dataset-page.css
+++ b/webtool/static/css/dataset-page.css
@@ -621,6 +621,10 @@ body.csv-preview table td, body.csv-preview table th {
border: 1px solid var(--gray-light);
}
+body.csv-preview table tr:nth-child(2n+1) {
+ background: var(--contrast-bright);
+}
+
.child.focus:not(.card) > .sub-controls > .query-result > .query-result-iframe {
display: none;
}
From 0abe88569b175a6c956cb2441883bd8d5509a284 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Fri, 29 Nov 2024 18:37:19 +0100
Subject: [PATCH 40/48] Delete unused webtool helper functions
---
webtool/lib/helpers.py | 43 ------------------------------------------
1 file changed, 43 deletions(-)
diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py
index 6cc91eba..d0e74a37 100644
--- a/webtool/lib/helpers.py
+++ b/webtool/lib/helpers.py
@@ -96,30 +96,6 @@ def error(code=200, **kwargs):
return response
-def string_to_timestamp(string):
- """
- Convert dd-mm-yyyy date to unix time
-
- :param string: Date string to parse
- :return: The unix time, or 0 if value could not be parsed
- """
- bits = string.split("-")
- if re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", string):
- bits = list(reversed(bits))
-
- if len(bits) != 3:
- return 0
-
- try:
- day = int(bits[0])
- month = int(bits[1])
- year = int(bits[2])
- date = datetime.datetime(year, month, day)
- except ValueError:
- return 0
-
- return int(date.timestamp())
-
def pad_interval(intervals, first_interval=None, last_interval=None):
"""
Pad an interval so all intermediate intervals are filled
@@ -299,25 +275,6 @@ def generate_css_colours(force=False):
)
-def get_preview(query):
- """
- Generate a data preview of 25 rows of a results csv
-
- :param query
- :return list:
- """
- preview = []
- with query.get_results_path().open(encoding="utf-8") as resultfile:
- posts = csv.DictReader(resultfile)
- i = 0
- for post in posts:
- i += 1
- preview.append(post)
- if i > 25:
- break
- return preview
-
-
def format_chan_post(post):
"""
Format a plain-text imageboard post post for HTML display
From 6881cbadf36f1ff28c39543ce25a6e8b8796e31e Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Mon, 2 Dec 2024 22:42:40 +0100
Subject: [PATCH 41/48] Add option to TikTok image downloader for user avatars
---
processors/visualisation/download_tiktok.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py
index c02b53bf..3854e965 100644
--- a/processors/visualisation/download_tiktok.py
+++ b/processors/visualisation/download_tiktok.py
@@ -161,6 +161,7 @@ class TikTokImageDownloader(BasicProcessor):
"options": {
"thumbnail": "Video Thumbnail",
"music": "Music Thumbnail",
+ "author_avatar": "User avatar"
},
"default": "thumbnail"
}
@@ -217,6 +218,8 @@ def process(self):
url_column = "thumbnail_url"
elif self.parameters.get("thumb_type") == "music":
url_column = "music_thumbnail"
+ elif self.parameters.get("thumb_type") == "author_avatar":
+ url_column = "author_avatar"
else:
self.dataset.update_status("No image column selected.", is_final=True)
self.dataset.finish(0)
From e53b73f75a5acfa0373072d5786d07fa5d44a9bc Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Tue, 3 Dec 2024 17:00:48 +0100
Subject: [PATCH 42/48] Option for co-tag networks to ignore certain tags
---
processors/networks/cotag_network.py | 8 ++++++++
processors/networks/two-column-network.py | 15 +++++++++++++++
2 files changed, 23 insertions(+)
diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py
index 236e9577..139b2ac9 100644
--- a/processors/networks/cotag_network.py
+++ b/processors/networks/cotag_network.py
@@ -29,6 +29,13 @@ class CoTaggerPreset(ProcessorPreset):
"default": True,
"help": "Convert tags to lowercase",
"tooltip": "Merges tags with varying cases"
+ },
+ "ignore-tags": {
+ "type": UserInput.OPTION_TEXT,
+ "default": "",
+ "help": "Tags to ignore",
+ "tooltip": "Separate with commas if you want to ignore multiple tags. Do not include the '#' "
+ "character."
}
}
@@ -72,6 +79,7 @@ def get_processor_pipeline(self):
"split-comma": True,
"categorise": True,
"allow-loops": False,
+ "ignore-nodes": self.parameters.get("ignore-tags", ""),
"to-lowercase": self.parameters.get("to-lowercase", True)
}
}
diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py
index 0f604570..43ceffdf 100644
--- a/processors/networks/two-column-network.py
+++ b/processors/networks/two-column-network.py
@@ -84,6 +84,12 @@ class ColumnNetworker(BasicProcessor):
"default": False,
"help": "Convert values to lowercase",
"tooltip": "Merges values with varying cases"
+ },
+ "ignore-nodes": {
+ "type": UserInput.OPTION_TEXT,
+ "default": "",
+ "help": "Nodes to ignore",
+ "tooltip": "Separate with commas if you want to ignore multiple nodes"
}
}
@@ -145,6 +151,7 @@ def process(self):
allow_loops = self.parameters.get("allow-loops")
interval_type = self.parameters.get("interval")
to_lower = self.parameters.get("to-lowercase", False)
+ ignoreable = [n.strip() for n in self.parameters.get("ignore-nodes", "").split(",") if n.strip()]
processed = 0
@@ -193,6 +200,14 @@ def process(self):
values_a = [value.strip() for value_groups in values_a for value in value_groups.split(",")]
values_b = [value.strip() for value_groups in values_b for value in value_groups.split(",")]
+ if ignoreable:
+ values_a = [v for v in values_a if v not in ignoreable]
+ values_b = [v for v in values_b if v not in ignoreable]
+
+ # only proceed if we actually have any edges left
+ if not values_a or not values_b:
+ continue
+
try:
interval = get_interval_descriptor(item, interval_type)
except ValueError as e:
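Note: a short sketch of how the new ignore option is interpreted, for clarity (tag values are made up; the co-tag preset's ignore-tags option is passed through to the same ignore-nodes parameter):

    # comma-separated option value as typed by the user
    ignoreable = [n.strip() for n in "memes, cats".split(",") if n.strip()]  # ["memes", "cats"]

    values_a = ["memes", "politics"]
    values_b = ["cats"]
    values_a = [v for v in values_a if v not in ignoreable]  # ["politics"]
    values_b = [v for v in values_b if v not in ignoreable]  # []

    # with one side empty there are no edges left, so the item is skipped
    if not values_a or not values_b:
        print("skip this item")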
From 5021e85302fe8cf16b783496052929cb30287820 Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Wed, 4 Dec 2024 11:11:28 +0100
Subject: [PATCH 43/48] Cast to str before word tree-izing
---
processors/visualisation/word-trees.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py
index 0dfe2d40..0a1f235e 100644
--- a/processors/visualisation/word-trees.py
+++ b/processors/visualisation/word-trees.py
@@ -212,6 +212,12 @@ def process(self):
if processed % 500 == 0:
self.dataset.update_status("Processing and tokenising post %i" % processed)
body = post.get(column)
+
+ try:
+ body = str(body)
+ except TypeError:
+ continue
+
if not body:
continue
From 3f06845a0e2dc63e772a071e77748e116eef896d Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Fri, 6 Dec 2024 13:32:23 +0100
Subject: [PATCH 44/48] tokenizer group-by-sentence fix: NLTK renamed its language packs
---
processors/text-analysis/tokenise.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py
index 17c350c8..1ee3b199 100644
--- a/processors/text-analysis/tokenise.py
+++ b/processors/text-analysis/tokenise.py
@@ -226,6 +226,7 @@ def process(self):
The result is valid JSON, written in chunks.
"""
+ sentence_error = False
columns = self.parameters.get("columns")
if not columns:
self.dataset.update_status("No columns selected, aborting.", is_final=True)
@@ -357,11 +358,11 @@ def dummy_function(x, *args, **kwargs):
# for russian we use a special purpose splitter with better
# performance
sentence_method = razdel.sentenize
- elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if
- 'pickle' in lang]:
+ elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]:
self.dataset.update_status(
f"Language {language} not available for sentence tokenizer; grouping by item/post instead.")
sentence_method = dummy_function
+ sentence_error = True
else:
sentence_method = sent_tokenize
else:
@@ -490,6 +491,9 @@ def dummy_function(x, *args, **kwargs):
with staging_area.joinpath(".token_metadata.json").open("w", encoding="utf-8") as outfile:
json.dump(metadata, outfile)
+ if sentence_error:
+ self.dataset.update_status(f"Finished tokenizing; Unable to group by sentence ({language} not supported), instead grouped by item.", is_final=True)
+
# create zip of archive and delete temporary files and folder
self.write_archive_and_finish(staging_area)
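Note: the underlying issue is that NLTK's punkt_tab data package ships per-language directories (e.g. english, russian) instead of <language>.pickle files, so filtering the directory listing on 'pickle' always came up empty. A sketch of the check as it now works, assuming the punkt_tab data has been downloaded (e.g. via nltk.download('punkt_tab')):

    import os
    import nltk

    # languages available to the punkt_tab sentence tokenizer on this install
    available = [lang.split('.')[0]
                 for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab'))]

    language = "klingon"  # illustrative value
    if language not in available:
        # fall back to grouping by item/post and remember that it happened,
        # so the final status message can report the fallback to the user
        sentence_error = True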
From 1765e8066e74624cd4b89cf96737f90901655336 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Tue, 10 Dec 2024 12:00:47 +0100
Subject: [PATCH 45/48] download video: handle broken connection in video
download; also stop and remove files that exceed max file size
---
processors/visualisation/download_videos.py | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py
index 2b385ffe..08a632b8 100644
--- a/processors/visualisation/download_videos.py
+++ b/processors/visualisation/download_videos.py
@@ -3,6 +3,7 @@
First attempt to download via request, but if that fails use yt-dlp
"""
+import os
import json
import re
import time
@@ -601,15 +602,22 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie
f"Video size {response.headers.get('Content-Length')} larger than maximum allowed per 4CAT")
# Size unknown
elif not self.config.get("video-downloader.allow-unknown-size", False):
- FilesizeException("Video size unknown; not allowed to download per 4CAT settings")
+ raise FilesizeException("Video size unknown; not allowed to download per 4CAT settings")
# Download video
self.dataset.update_status(
"Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url))
- with open(results_path.joinpath(save_location), "wb") as f:
- for chunk in response.iter_content(chunk_size=1024 * 1024):
- if chunk:
- f.write(chunk)
+ try:
+ with open(results_path.joinpath(save_location), "wb") as f:
+ for chunk in response.iter_content(chunk_size=1024 * 1024):
+ if not max_video_size == 0 and f.tell() > (max_video_size * 1000000):
+ # File size too large; stop download and remove file
+ os.remove(f.name)
+ raise FilesizeException("Video size larger than maximum allowed per 4CAT")
+ if chunk:
+ f.write(chunk)
+ except ChunkedEncodingError as e:
+ raise FailedDownload(f"Failed to download video: {e}")
# Return filename to add to metadata
return save_location.name
From 8450304ab156fe27302412dcf4a112da0689074a Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Tue, 10 Dec 2024 12:13:45 +0100
Subject: [PATCH 46/48] video_download: fix reference to unimported exception type
---
processors/visualisation/download_videos.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py
index 08a632b8..d1d7bd67 100644
--- a/processors/visualisation/download_videos.py
+++ b/processors/visualisation/download_videos.py
@@ -607,8 +607,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie
# Download video
self.dataset.update_status(
"Downloading %i/%i via requests: %s" % (self.downloaded_videos + 1, self.total_possible_videos, url))
- try:
- with open(results_path.joinpath(save_location), "wb") as f:
+ with open(results_path.joinpath(save_location), "wb") as f:
+ try:
for chunk in response.iter_content(chunk_size=1024 * 1024):
if not max_video_size == 0 and f.tell() > (max_video_size * 1000000):
# File size too large; stop download and remove file
@@ -616,8 +616,8 @@ def download_video_with_requests(self, url, results_path, max_video_size, retrie
raise FilesizeException("Video size larger than maximum allowed per 4CAT")
if chunk:
f.write(chunk)
- except ChunkedEncodingError as e:
- raise FailedDownload(f"Failed to download video: {e}")
+ except requests.exceptions.ChunkedEncodingError as e:
+ raise FailedDownload(f"Failed to complete download: {e}")
# Return filename to add to metadata
return save_location.name
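Note: taken together, patches 45 and 46 give the requests-based download path roughly this shape (a simplified standalone sketch; the exceptions raised here stand in for 4CAT's FilesizeException and FailedDownload, and the real processor also retries and falls back to yt-dlp):

    import os
    import requests

    def download_with_cap(url, save_path, max_video_size_mb):
        # stream the response so the size check can run per chunk instead of
        # buffering the whole file in memory first
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        with open(save_path, "wb") as f:
            try:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if max_video_size_mb != 0 and f.tell() > max_video_size_mb * 1000000:
                        # over the limit: drop the partial file and abort
                        os.remove(f.name)
                        raise RuntimeError("Video size larger than maximum allowed")
                    if chunk:
                        f.write(chunk)
            except requests.exceptions.ChunkedEncodingError as e:
                # the connection broke off mid-download
                raise RuntimeError(f"Failed to complete download: {e}")
        return save_path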
From a296ff03c983103b902a830c585efd426e349ece Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 11 Dec 2024 12:09:24 +0100
Subject: [PATCH 47/48] export_datasets fix: only abort the export if the primary dataset is unfinished; unfinished or missing child datasets are skipped instead
---
processors/conversion/export_datasets.py | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py
index bd7b8128..fbda0e85 100644
--- a/processors/conversion/export_datasets.py
+++ b/processors/conversion/export_datasets.py
@@ -40,6 +40,11 @@ def process(self):
This takes a CSV file as input and writes the same data as a JSON file
"""
self.dataset.update_status("Collecting dataset and all analyses")
+ primary_dataset = self.dataset.top_parent()
+ if not primary_dataset.is_finished():
+ # This ought not happen as processors (i.e., this processor) should only be available for finished datasets
+ self.dataset.finish_with_error("You cannot export unfinished datasets; please wait until dataset is finished to export.")
+ return
results_path = self.dataset.get_staging_area()
@@ -52,25 +57,26 @@ def process(self):
try:
dataset = DataSet(key=dataset_key, db=self.db)
- # TODO: these two should fail for the primary dataset, but should they fail for the children too?
except DataSetException:
- self.dataset.finish_with_error("Dataset not found.")
- return
+ self.dataset.update_status(f"Dataset {dataset_key} not found: it may have been deleted prior to export; skipping.")
+ failed_exports.append(dataset_key)
+ continue
if not dataset.is_finished():
- self.dataset.finish_with_error("You cannot export unfinished datasets.")
- return
+ self.dataset.update_status(f"Dataset {dataset_key} not finished: cannot export unfinished datasets; skipping.")
+ failed_exports.append(dataset_key)
+ continue
# get metadata
metadata = dataset.get_metadata()
if metadata["num_rows"] == 0:
- self.dataset.update_status(f"Skipping empty dataset {dataset_key}")
+ self.dataset.update_status(f"Dataset {dataset_key} has no results; skipping.")
failed_exports.append(dataset_key)
continue
# get data
data_file = dataset.get_results_path()
if not data_file.exists():
- self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.")
+ self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.")
failed_exports.append(dataset_key)
continue
From a60ac61cae5d3b4dc5d9f8e97ba7f30a0d2b2af4 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 11 Dec 2024 12:30:34 +0100
Subject: [PATCH 48/48] export_dataset: note in the description that filters must be exported separately
---
processors/conversion/export_datasets.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py
index fbda0e85..f6c8bcc1 100644
--- a/processors/conversion/export_datasets.py
+++ b/processors/conversion/export_datasets.py
@@ -23,7 +23,7 @@ class ExportDatasets(BasicProcessor):
type = "export-datasets" # job type ID
category = "Conversion" # category
title = "Export Dataset and All Analyses" # title displayed in UI
- description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI
+ description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Filters are *not* included and must be exported separately as new datasets. Results automatically expires after 1 day, after which you must run again." # description displayed in UI
extension = "zip" # extension of result file, used internally and in UI
@classmethod