Skip to content

Commit

Permalink
Merge branch 'master' into explorer-improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
stijn-uva authored Dec 11, 2024
2 parents e5dba0e + a60ac61 commit ba0bd6e
Show file tree
Hide file tree
Showing 39 changed files with 865 additions and 303 deletions.
25 changes: 0 additions & 25 deletions .readthedocs.yaml

This file was deleted.

7 changes: 6 additions & 1 deletion backend/lib/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset,

self.dataset.update_status("Parent dataset updated.")

def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True):
def iterate_archive_contents(self, path, staging_area=None, immediately_delete=True, filename_filter=[]):
"""
A generator that iterates through files in an archive
Expand All @@ -498,6 +498,8 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T
:param bool immediately_delete: Temporary files are removed after yielded;
False keeps files until the staging_area is removed (usually during processor
cleanup)
:param list filename_filter: Whitelist of filenames to iterate.
Other files will be ignored. If empty, do not ignore anything.
:return: An iterator with a Path item for each file
"""

Expand All @@ -514,6 +516,9 @@ def iterate_archive_contents(self, path, staging_area=None, immediately_delete=T
archive_contents = sorted(archive_file.namelist())

for archived_file in archive_contents:
if filename_filter and archived_file not in filename_filter:
continue

info = archive_file.getinfo(archived_file)
if info.is_dir():
continue
Expand Down
20 changes: 16 additions & 4 deletions backend/lib/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,22 @@ def import_from_file(self, path):
if self.interrupted:
raise WorkerInterruptedException()

# remove NUL bytes here because they trip up a lot of other
# things
# also include import metadata in item
item = json.loads(line.replace("\0", ""))
try:
# remove NUL bytes here because they trip up a lot of other
# things
# also include import metadata in item
item = json.loads(line.replace("\0", ""))
except json.JSONDecodeError:
warning = (f"An item on line {i:,} of the imported file could not be parsed as JSON - this may "
f"indicate that the file you uploaded was incomplete and you need to try uploading it "
f"again. The item will be ignored.")

if warning not in import_warnings:
import_warnings[warning] = 0
import_warnings[warning] += 1
continue


new_item = {
**item["data"],
"__import_meta": {k: v for k, v in item.items() if k != "data"}
Expand Down
6 changes: 3 additions & 3 deletions common/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,11 +269,11 @@ def get(self, attribute_name, default=None, is_json=False, user=None, tags=None)

if not is_json and value is not None:
value = json.loads(value)
# TODO: check this as it feels like it could cause a default to return even if value is not None. - Dale
elif default is not None:
value = default
# TODO: Which default should have priority? The provided default feels like it should be the highest priority, but I think that is an old implementation and perhaps should be removed. - Dale
elif value is None and setting_name in self.config_definition and "default" in self.config_definition[setting_name]:
value = self.config_definition[setting_name]["default"]
elif value is None and default is not None:
value = default

final_settings[setting_name] = value

Expand Down
3 changes: 1 addition & 2 deletions common/lib/config_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,11 +529,10 @@
"type": UserInput.OPTION_MULTI_SELECT,
"help": "Pages in navigation",
"options": {
"faq": "FAQ",
"data-policy": "Data Policy",
"citing": "How to cite",
},
"default": ["faq"],
"default": [],
"tooltip": "These pages will be included in the navigation bar at the top of the interface."
},
"ui.prefer_mapped_preview": {
Expand Down
65 changes: 45 additions & 20 deletions common/lib/helpers.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""
Miscellaneous helper functions for the 4CAT backend
"""
import hashlib
import subprocess
import imagehash
import hashlib
import requests
import hashlib
import datetime
import smtplib
import fnmatch
import socket
import shlex
import copy
import time
import json
Expand All @@ -24,6 +26,7 @@
from urllib.parse import urlparse, urlunparse
from calendar import monthrange
from packaging import version
from PIL import Image

from common.lib.user_input import UserInput
from common.config_manager import config
Expand Down Expand Up @@ -111,10 +114,8 @@ def get_git_branch():
repository or git is not installed an empty string is returned.
"""
try:
cwd = os.getcwd()
os.chdir(config.get('PATH_ROOT'))
branch = subprocess.run(["git", "branch", "--show-current"], stdout=subprocess.PIPE)
os.chdir(cwd)
root_dir = str(config.get('PATH_ROOT').resolve())
branch = subprocess.run(shlex.split(f"git -C {shlex.quote(root_dir)} branch --show-current"), stdout=subprocess.PIPE)
if branch.returncode != 0:
raise ValueError()
return branch.stdout.decode("utf-8").strip()
Expand Down Expand Up @@ -144,7 +145,6 @@ def get_software_commit(worker=None):
# try git command line within the 4CAT root folder
# if it is a checked-out git repository, it will tell us the hash of
# the currently checked-out commit
cwd = os.getcwd()

# path has no Path.relative()...
relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent
Expand All @@ -154,24 +154,24 @@ def get_software_commit(worker=None):
# useful version info (since the extension is by definition not in the
# main 4CAT repository) and will return an empty value
if worker and worker.is_extension:
extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath)
os.chdir(extension_dir)
working_dir = str(config.get("PATH_ROOT").joinpath(relative_filepath).resolve())
# check if we are in the extensions' own repo or 4CAT's
repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
git_cmd = f"git -C {shlex.quote(working_dir)} rev-parse --show-toplevel"
repo_level = subprocess.run(shlex.split(git_cmd), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"):
# not its own repository
return ("", "")

else:
os.chdir(config.get("PATH_ROOT"))
working_dir = str(config.get("PATH_ROOT").resolve())

show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
show = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} show"), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if show.returncode != 0:
raise ValueError()
commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1]

# now get the repository the commit belongs to, if we can
origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
origin = subprocess.run(shlex.split(f"git -C {shlex.quote(working_dir)} config --get remote.origin.url"), stderr=subprocess.PIPE, stdout=subprocess.PIPE)
if origin.returncode != 0 or not origin.stdout:
raise ValueError()
repository = origin.stdout.decode("utf-8").strip()
Expand All @@ -181,9 +181,6 @@ def get_software_commit(worker=None):
except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e:
return ("", "")

finally:
os.chdir(cwd)

return (commit, repository)

def get_software_version():
Expand Down Expand Up @@ -279,7 +276,6 @@ def find_extensions():

# collect metadata for extensions
allowed_metadata_keys = ("name", "version", "url")
cwd = os.getcwd()
for extension in extensions:
extension_folder = extension_path.joinpath(extension)
metadata_file = extension_folder.joinpath("metadata.json")
Expand All @@ -296,8 +292,8 @@ def find_extensions():
if extensions[extension]["is_git"]:
# try to get remote URL
try:
os.chdir(extension_folder)
origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE,
extension_root = str(extension_folder.resolve())
origin = subprocess.run(shlex.split(f"git -C {shlex.quote(extension_root)} config --get remote.origin.url"), stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
if origin.returncode != 0 or not origin.stdout:
raise ValueError()
Expand All @@ -309,8 +305,6 @@ def find_extensions():
except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e:
print(e)
pass
finally:
os.chdir(cwd)

return extensions, errors

Expand Down Expand Up @@ -421,6 +415,37 @@ def andify(items):
return ", ".join([str(item) for item in items]) + result


def hash_file(image_file, hash_type="file-hash"):
	"""
	Generate a hash for a (usually image) file

	:param Path image_file:  File to hash
	:param str hash_type:  Hash type, one of `file-hash` (SHA-1 of the raw
	bytes), or the perceptual hashes `colorhash`, `phash`, `average_hash`,
	`dhash` (computed with the `imagehash` library, so the file must be an
	image openable by Pillow)
	:return str:  Hexadecimal hash value
	:raises FileNotFoundError:  If the file does not exist
	:raises NotImplementedError:  If an unknown hash type is requested
	"""
	if not image_file.exists():
		raise FileNotFoundError(image_file)

	if hash_type == "file-hash":
		hasher = hashlib.sha1()

		# Open the file in binary mode and read in 64 KiB chunks so that
		# large files are hashed without loading them into memory at once
		with image_file.open("rb") as infile:
			while chunk := infile.read(65536):
				hasher.update(chunk)

		return hasher.hexdigest()

	elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
		# perceptual hash: delegate to the matching imagehash function
		image = Image.open(image_file)

		return str(getattr(imagehash, hash_type)(image))

	else:
		raise NotImplementedError(f"Unknown hash type '{hash_type}'")

def get_yt_compatible_ids(yt_ids):
"""
:param yt_ids list, a list of strings
Expand Down
35 changes: 18 additions & 17 deletions common/lib/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,23 +185,24 @@ def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log'
self.logger.setLevel(log_level)

# this handler manages the text log files
handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1)
handler.setLevel(log_level)
handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s",
"%d-%m-%Y %H:%M:%S"))
self.logger.addHandler(handler)

# the slack webhook has its own handler, and is only active if the
# webhook URL is set
try:
if config.get("logging.slack.webhook"):
slack_handler = SlackLogHandler(config.get("logging.slack.webhook"))
slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level))
self.logger.addHandler(slack_handler)
except Exception:
# we *may* need the logger before the database is in working order
if config.db is not None:
config.db.rollback()
if not self.logger.handlers:
handler = RotatingFileHandler(self.log_path, maxBytes=(50 * 1024 * 1024), backupCount=1)
handler.setLevel(log_level)
handler.setFormatter(logging.Formatter("%(asctime)-15s | %(levelname)s at %(location)s: %(message)s",
"%d-%m-%Y %H:%M:%S"))
self.logger.addHandler(handler)

# the slack webhook has its own handler, and is only active if the
# webhook URL is set
try:
if config.get("logging.slack.webhook"):
slack_handler = SlackLogHandler(config.get("logging.slack.webhook"))
slack_handler.setLevel(self.levels.get(config.get("logging.slack.level"), self.alert_level))
self.logger.addHandler(slack_handler)
except Exception:
# we *may* need the logger before the database is in working order
if config.db is not None:
config.db.rollback()

def log(self, message, level=logging.INFO, frame=None):
"""
Expand Down
10 changes: 6 additions & 4 deletions datasources/tiktok/search_tiktok.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@ def map_item(post):
# from intercepted API response
user_nickname = post["author"]["uniqueId"]
user_fullname = post["author"]["nickname"]
user_id = post["author"]["id"]
user_thumbnail = post["author"].get("avatarThumb", "")
elif post.get("author"):
# from embedded JSON object
user_nickname = post["author"]
user_fullname = post["nickname"]
user_id = ""
user_thumbnail = ""
else:
user_nickname = ""
user_fullname = ""
user_id = ""
user_thumbnail = ""

# there are various thumbnail URLs, some of them expire later than
# others. Try to get the highest-resolution one that hasn't expired
Expand All @@ -84,13 +84,15 @@ def map_item(post):
"author_followers": post.get("authorStats", {}).get("followerCount", ""),
"author_likes": post.get("authorStats", {}).get("diggCount", ""),
"author_videos": post.get("authorStats", {}).get("videoCount", ""),
"author_avatar": post.get("avatarThumb", ""),
"author_avatar": user_thumbnail,
"body": post["desc"],
"timestamp": datetime.utcfromtimestamp(int(post["createTime"])).strftime('%Y-%m-%d %H:%M:%S'),
"unix_timestamp": int(post["createTime"]),
"is_duet": "yes" if (post.get("duetInfo", {}).get("duetFromId") != "0" if post.get("duetInfo", {}) else False) else "no",
"is_ad": "yes" if post.get("isAd", False) else "no",
"is_paid_partnership": "yes" if post.get("adAuthorization") else "no",
"is_sensitive": "yes" if post.get("maskType") == 3 else "no",
"is_photosensitive": "yes" if post.get("maskType") == 4 else "no",
"music_name": post["music"]["title"],
"music_id": post["music"]["id"],
"music_url": post["music"].get("playUrl", ""),
Expand Down
2 changes: 1 addition & 1 deletion datasources/tiktok_comments/search_tiktok_comments.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def map_item(item):
"post_url": item["share_info"]["url"].split(".html")[0],
"post_body": item["share_info"]["title"],
"comment_url": item["share_info"]["url"],
"is_liked_by_post_author": "yes" if bool(item["author_pin"]) else "no",
"is_liked_by_post_author": "yes" if bool(item.get("author_pin")) else "no",
"is_sticky": "yes" if bool(item["stick_position"]) else "no",
"is_comment_on_comment": "no" if bool(item["reply_id"] == "0") else "yes",
"language_guess": item["comment_language"]
Expand Down
Loading

0 comments on commit ba0bd6e

Please sign in to comment.