Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

basic spam detection #719

Merged
merged 9 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions django/core/jinja2/common.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,23 @@ Currently in <em><mark>{{ constants.DEPLOY_ENVIRONMENT }}</mark></em> mode.
<meta property="og:description" content="{{ description }}" />
<meta property="og:image" content="{{ absolute_image_url }}" />
{% endmacro %}

{# alert_if_spam(is_marked_spam): when is_marked_spam is truthy, renders a danger alert saying the submission was flagged as spam and is awaiting moderator approval, with a link to the page resolved by slugurl('contact'). Renders no alert markup otherwise. #}
{% macro alert_if_spam(is_marked_spam) %}
{% if is_marked_spam %}
<div class="alert alert-danger" role="alert">
<i class="fas fa-exclamation-triangle"></i>
<b>This submission has been flagged as spam and is awaiting moderator approval.</b>
<hr>
Believe this is a mistake? Please <a href="{{ slugurl('contact') }}">contact us</a>!
</div>
{% endif %}
{% endmacro %}

{# alert_if_deleted(is_deleted, content_type="content"): when is_deleted is truthy, renders a danger alert stating that the given content_type (e.g. "event", "job") has been deleted and is archived but not publicly visible. Renders no alert markup otherwise. #}
{% macro alert_if_deleted(is_deleted, content_type="content") %}
{% if is_deleted %}
<div class="alert alert-danger" role="alert">
<i class="fas fa-trash"></i>
<b>This {{ content_type }} has been deleted. It will remain archived but will not be publicly visible.</b>
</div>
{% endif %}
{% endmacro %}
4 changes: 3 additions & 1 deletion django/core/jinja2/core/events/retrieve.jinja
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% extends "sidebar_layout.jinja" %}
{% from "common.jinja" import breadcrumb, embed_discourse_comments, share_card, member_profile_href, search_tag_href, delete_confirm_modal, render_ogp_tags %}
{% from "common.jinja" import breadcrumb, embed_discourse_comments, share_card, member_profile_href, search_tag_href, delete_confirm_modal, render_ogp_tags, alert_if_spam, alert_if_deleted %}

{% block title %}{{ title }}{% endblock %}

Expand All @@ -21,6 +21,8 @@
{% endblock ogp_tags %}

{% block content %}
{{ alert_if_spam(is_marked_spam) }}
{{ alert_if_deleted(is_deleted, "event") }}
Copy link
Member

@alee alee May 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When, if ever, will alert_if_deleted be executed? The old behavior was to generate a 404 when trying to retrieve something that's been marked as deleted; I'm guessing that's no longer the case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's now treated exactly the same as things marked spam: hidden from the list view but accessible if you have a direct link to it. There wasn't much reasoning behind it besides simplifying queries.

<div id='discourse-content'>
<h1>{{ title }}</h1>
{% if tags %}
Expand Down
4 changes: 3 additions & 1 deletion django/core/jinja2/core/jobs/retrieve.jinja
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% extends "sidebar_layout.jinja" %}
{% from "common.jinja" import permission_button_group, breadcrumb, embed_discourse_comments, share_card, member_profile_href, search_tag_href, delete_confirm_modal, render_ogp_tags %}
{% from "common.jinja" import permission_button_group, breadcrumb, embed_discourse_comments, share_card, member_profile_href, search_tag_href, delete_confirm_modal, render_ogp_tags, alert_if_spam, alert_if_deleted %}

{% block title %}{{ title }}{% endblock %}

Expand All @@ -21,6 +21,8 @@
{% endblock ogp_tags %}

{% block content %}
{{ alert_if_spam(is_marked_spam) }}
{{ alert_if_deleted(is_deleted, "job") }}
<div id="discourse-content">
<h1>{{ title }}</h1>
{% if tags %}
Expand Down
127 changes: 127 additions & 0 deletions django/core/migrations/0021_add_spam_moderation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Generated by Django 4.2.11 on 2024-05-20 22:15

import core.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
# Schema migration for spam moderation: adds an is_marked_spam flag and a
# nullable spam_moderation foreign key to event and job, creates the
# SpamModeration model, and alters event.submitter.

# Must be applied after the swappable user model, contenttypes 0002 and core 0020.
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
("contenttypes", "0002_remove_content_type_name"),
("core", "0020_alter_eventtag_tag_alter_jobtag_tag_and_more"),
]

operations = [
# is_marked_spam defaults to False, so existing event rows start out as not spam.
migrations.AddField(
model_name="event",
name="is_marked_spam",
field=models.BooleanField(
default=False,
help_text="cached boolean representation of spam_moderation for search indexing",
),
),
# Same flag for job.
migrations.AddField(
model_name="job",
name="is_marked_spam",
field=models.BooleanField(
default=False,
help_text="cached boolean representation of spam_moderation for search indexing",
),
),
# on_delete=SET(callable): when the referenced user is deleted, submitter is
# set to whatever core.models.get_sentinel_user returns.
migrations.AlterField(
model_name="event",
name="submitter",
field=models.ForeignKey(
blank=True,
on_delete=models.SET(core.models.get_sentinel_user),
to=settings.AUTH_USER_MODEL,
),
),
# New SpamModeration table. content_type + object_id identify the moderated
# row (presumably exposed as a generic relation on the model; confirm in
# core.models).
migrations.CreateModel(
name="SpamModeration",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
# review state; new records start as "unreviewed"
(
"status",
models.CharField(
choices=[
("unreviewed", "Unreviewed"),
("spam", "Confirmed spam"),
("not_spam", "Confirmed not spam"),
],
default="unreviewed",
max_length=32,
),
),
("object_id", models.PositiveIntegerField()),
("date_created", models.DateTimeField(auto_now_add=True)),
("last_modified", models.DateTimeField(auto_now=True)),
(
"notes",
models.TextField(
blank=True,
help_text="Additional notes left by the reviewer",
null=True,
),
),
(
"detection_method",
models.CharField(blank=True, max_length=255, null=True),
),
(
"detection_details",
models.JSONField(
default=dict,
help_text="Extra context from the detection method, e.g. NLP results, elapsed time",
),
),
# deleting the ContentType row cascades to its moderation records
(
"content_type",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to="contenttypes.contenttype",
),
),
# optional reviewer; same get_sentinel_user fallback as event.submitter
# when the reviewing user is deleted
(
"reviewer",
models.ForeignKey(
blank=True,
null=True,
on_delete=models.SET(core.models.get_sentinel_user),
to=settings.AUTH_USER_MODEL,
),
),
],
),
# Nullable link from event to its moderation record; SET_NULL keeps the
# event row when the moderation record is deleted.
migrations.AddField(
model_name="event",
name="spam_moderation",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="core.spammoderation",
),
),
# Same nullable link for job.
migrations.AddField(
model_name="job",
name="spam_moderation",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="core.spammoderation",
),
),
]
113 changes: 113 additions & 0 deletions django/core/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

from django.conf import settings
from django.contrib.auth.views import redirect_to_login
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import PermissionDenied
from django.utils import timezone
from rest_framework import serializers
from rest_framework.exceptions import NotFound
from rest_framework.response import Response

from .models import SpamModeration
from .permissions import ViewRestrictedObjectPermissions

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -180,3 +184,112 @@ def list(self, request, *args, **kwargs):

serializer = self.get_serializer(queryset, many=True)
return Response(serializer.data)


class SpamCatcherSerializerMixin(serializers.Serializer):
Copy link
Member

@alee alee May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this use class SpamCatcherSerializerMixin(metaclass=serializers.SerializerMetaclass) instead?

see

https://stackoverflow.com/questions/28747487/mixin-common-fields-between-serializers-in-django-rest-framework

and

encode/django-rest-framework#4482 (comment)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, yeah. The mixin shouldn't be a serializer. I'll check this out.

"""
sets a "spam_context" flag on the serializer context using some simple heuristics:
- if the honeypot field (named "content") is filled in
- if the time between the page being loaded and the form being submitted is less than
SPAM_LIKELY_SECONDS_THRESHOLD

Note: a serializer using this mixin that overrides the validate method must call
super().validate(attrs) to chain the validation logic
"""

def __init__(self, *args, **kwargs):
    """Initialize the serializer, then register the two spam-detection inputs.

    Adds an optional, write-only "content" field (the honeypot) and an
    optional "loaded_time" datetime field to this serializer's fields.
    """
    super().__init__(*args, **kwargs)

    # honeypot: optional and blank-allowed, so regular submissions validate
    honeypot_field = serializers.CharField(
        required=False, allow_blank=True, write_only=True
    )
    self.fields["content"] = honeypot_field

    # page load timestamp, consumed by check_form_submit_time
    loaded_time_field = serializers.DateTimeField(required=False)
    self.fields["loaded_time"] = loaded_time_field

def validate(self, attrs):
    """Run the spam heuristics on the validated data and strip the helper fields.

    If the honeypot field "content" has a truthy value, a "spam_context"
    entry describing it is stored on the serializer context. Otherwise the
    form submit time is checked instead. The "content" and "loaded_time"
    keys are then removed from attrs.

    Returns the attrs produced by the next validate() in the MRO, minus the
    two helper keys.
    """
    # Use what the parent returns instead of discarding it: a cooperating
    # validate() further along the MRO may hand back a new or modified dict.
    attrs = super().validate(attrs)

    honeypot_value = attrs.get("content")
    if honeypot_value:
        self.context["spam_context"] = self.format_spam_context(
            "honeypot",
            {
                "field_name": "content",
                "field_value": honeypot_value,
            },
        )
    else:
        self.check_form_submit_time(attrs)

    # remove so that the serializer doesn't attempt to save these fields
    for field in ("content", "loaded_time"):
        attrs.pop(field, None)

    return attrs

def check_form_submit_time(self, attrs):
    """Flag the submission when the form was submitted too soon after page load.

    Reads "loaded_time" from attrs and compares it with the current time. If
    fewer than settings.SPAM_LIKELY_SECONDS_THRESHOLD seconds elapsed, a
    "spam_context" entry with detection method "form_submit_time" is stored
    on the serializer context. Does nothing when "loaded_time" is missing
    or falsy.
    """
    page_loaded_at = attrs.get("loaded_time")
    submitted_at = timezone.now()
    if not page_loaded_at:
        return

    elapsed_seconds = (submitted_at - page_loaded_at).total_seconds()
    if elapsed_seconds < settings.SPAM_LIKELY_SECONDS_THRESHOLD:
        self.context["spam_context"] = self.format_spam_context(
            "form_submit_time", {"elapsed_seconds": elapsed_seconds}
        )

def format_spam_context(self, method: str, value: dict):
    """Build the spam_context payload stored on the serializer context.

    Returns a dict with the detection method name under "detection_method"
    and the given details dict (not copied) under "detection_details".
    """
    return dict(detection_method=method, detection_details=value)


class SpamCatcherViewSetMixin:
    """
    records a SpamModeration entry on create and update if the "spam_context"
    entry was set on the serializer context (see SpamCatcherSerializerMixin)

    Meant to be mixed into a view set that provides perform_create and
    perform_update; both are chained through super() before spam handling runs.
    """

    def perform_create(self, serializer: serializers.Serializer):
        """Delegate to the next perform_create in the MRO, then record any spam flag."""
        super().perform_create(serializer)
        self.handle_spam_detection(serializer)

    def perform_update(self, serializer):
        """Delegate to the next perform_update in the MRO, then record any spam flag."""
        super().perform_update(serializer)
        self.handle_spam_detection(serializer)

    def _validate_content_object(self, instance):
        """Raise ValueError naming the first required attribute that instance lacks.

        The instance needs a spam_moderation attribute as well as the fields
        used for displaying the spam content in the admin.
        """
        required_fields = [
            "spam_moderation",
            "is_marked_spam",
            "get_absolute_url",
            "title",
        ]
        for field in required_fields:
            if not hasattr(instance, field):
                raise ValueError(
                    f"instance {instance} does not have a {field} attribute"
                )

    def handle_spam_detection(self, serializer: serializers.Serializer):
        """Record a spam moderation entry if the serializer flagged the submission.

        Does nothing when "spam_context" is absent from the serializer context.
        A ValueError raised while validating or recording is logged as a
        warning rather than propagated, so the request itself still succeeds.
        """
        if "spam_context" not in serializer.context:
            return
        try:
            self._validate_content_object(serializer.instance)
            self._record_spam(serializer.instance, serializer.context["spam_context"])
        except ValueError as e:
            logger.warning("Cannot flag %s as spam: %s", serializer.instance, e)

    def _record_spam(self, instance, spam_context: dict):
        """Create or refresh the SpamModeration record for instance.

        A new record is created as UNREVIEWED with the detection method and
        details from spam_context. An existing record is reset to UNREVIEWED
        and its detection method and details are overwritten, so it always
        describes the most recent detection.
        """
        content_type = ContentType.objects.get_for_model(type(instance))
        detection = {
            "detection_method": spam_context["detection_method"],
            "detection_details": spam_context["detection_details"],
        }
        spam_moderation, created = SpamModeration.objects.get_or_create(
            content_type=content_type,
            object_id=instance.pk,
            defaults={"status": SpamModeration.Status.UNREVIEWED, **detection},
        )
        if not created:
            spam_moderation.status = SpamModeration.Status.UNREVIEWED
            # refresh the detection info too; otherwise the record keeps
            # describing an earlier detection
            for attr, value in detection.items():
                setattr(spam_moderation, attr, value)
            # NOTE(review): the original comment here says saving the moderation
            # record also updates the content instance; that logic lives in
            # SpamModeration, outside this block -- confirm there
            spam_moderation.save()
Loading
Loading