-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
basic spam detection #719
basic spam detection #719
Changes from 8 commits
4ba5028
4a273c7
3ce321c
78b171e
0172885
d9a65f0
3428d80
a2ce823
8d4fe55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# Generated by Django 4.2.11 on 2024-05-20 22:15 | ||
|
||
import core.models | ||
from django.conf import settings | ||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add spam-moderation support.

    Introduces the SpamModeration model (a generic-FK review record) and wires
    Event and Job to it via a nullable ``spam_moderation`` FK plus a cached
    ``is_marked_spam`` boolean used by the search index.
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("contenttypes", "0002_remove_content_type_name"),
        ("core", "0020_alter_eventtag_tag_alter_jobtag_tag_and_more"),
    ]

    operations = [
        # Cached mirrors of spam_moderation.status so search indexing does not
        # need a join against SpamModeration.
        migrations.AddField(
            model_name="event",
            name="is_marked_spam",
            field=models.BooleanField(
                default=False,
                help_text="cached boolean representation of spam_moderation for search indexing",
            ),
        ),
        migrations.AddField(
            model_name="job",
            name="is_marked_spam",
            field=models.BooleanField(
                default=False,
                help_text="cached boolean representation of spam_moderation for search indexing",
            ),
        ),
        # Deleting a user no longer cascades to their events; the submitter is
        # replaced by a sentinel user instead.
        migrations.AlterField(
            model_name="event",
            name="submitter",
            field=models.ForeignKey(
                blank=True,
                on_delete=models.SET(core.models.get_sentinel_user),
                to=settings.AUTH_USER_MODEL,
            ),
        ),
        # Review record for content flagged as spam; points at the flagged
        # object through a contenttypes generic relation (content_type + object_id).
        migrations.CreateModel(
            name="SpamModeration",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "status",
                    models.CharField(
                        choices=[
                            ("unreviewed", "Unreviewed"),
                            ("spam", "Confirmed spam"),
                            ("not_spam", "Confirmed not spam"),
                        ],
                        default="unreviewed",
                        max_length=32,
                    ),
                ),
                ("object_id", models.PositiveIntegerField()),
                ("date_created", models.DateTimeField(auto_now_add=True)),
                ("last_modified", models.DateTimeField(auto_now=True)),
                (
                    "notes",
                    models.TextField(
                        blank=True,
                        help_text="Additional notes left by the reviewer",
                        null=True,
                    ),
                ),
                (
                    "detection_method",
                    models.CharField(blank=True, max_length=255, null=True),
                ),
                (
                    "detection_details",
                    models.JSONField(
                        default=dict,
                        help_text="Extra context from the detection method, e.g. NLP results, elapsed time",
                    ),
                ),
                (
                    "content_type",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="contenttypes.contenttype",
                    ),
                ),
                (
                    "reviewer",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=models.SET(core.models.get_sentinel_user),
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
        # Must come after CreateModel("SpamModeration") so the FK target exists.
        # SET_NULL keeps the flagged content when its review record is deleted.
        migrations.AddField(
            model_name="event",
            name="spam_moderation",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to="core.spammoderation",
            ),
        ),
        migrations.AddField(
            model_name="job",
            name="spam_moderation",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                to="core.spammoderation",
            ),
        ),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,10 +2,14 @@ | |
|
||
from django.conf import settings | ||
from django.contrib.auth.views import redirect_to_login | ||
from django.contrib.contenttypes.models import ContentType | ||
from django.core.exceptions import PermissionDenied | ||
from django.utils import timezone | ||
from rest_framework import serializers | ||
from rest_framework.exceptions import NotFound | ||
from rest_framework.response import Response | ||
|
||
from .models import SpamModeration | ||
from .permissions import ViewRestrictedObjectPermissions | ||
|
||
logger = logging.getLogger(__name__) | ||
|
@@ -180,3 +184,112 @@ def list(self, request, *args, **kwargs): | |
|
||
serializer = self.get_serializer(queryset, many=True) | ||
return Response(serializer.data) | ||
|
||
|
||
class SpamCatcherSerializerMixin(serializers.Serializer):
    """
    sets a "spam_context" flag on the serializer context using some simple heuristics:
    - if the honeypot field (named "content") is filled in
    - if the time between the page being loaded and the form being submitted is less than
      SPAM_LIKELY_SECONDS_THRESHOLD

    Note: a serializer using this mixin that overrides the validate method must call
    super().validate(attrs) to chain the validation logic
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # honeypot field: hidden from humans, bots tend to fill it in
        self.fields["content"] = serializers.CharField(
            required=False, allow_blank=True, write_only=True
        )
        # client-reported page load time. write_only (matching the honeypot
        # field) so that serializing a saved instance — which has no
        # loaded_time attribute, since validate() pops it — does not fail
        self.fields["loaded_time"] = serializers.DateTimeField(
            required=False, write_only=True
        )

    def validate(self, attrs):
        # chain parent validation and keep any attrs it may have rewritten
        # (the original discarded super()'s return value)
        attrs = super().validate(attrs)
        if attrs.get("content"):
            self.context["spam_context"] = self.format_spam_context(
                "honeypot",
                {
                    "field_name": "content",
                    "field_value": attrs["content"],
                },
            )
        else:
            self.check_form_submit_time(attrs)
        # remove so that the serializer doesn't attempt to save these fields
        for field in ("content", "loaded_time"):
            attrs.pop(field, None)

        return attrs

    def check_form_submit_time(self, attrs):
        """flag as likely spam when the form was submitted too soon after page load"""
        loaded_time = attrs.get("loaded_time")
        if loaded_time:
            # timezone.now() is always truthy, so no need to re-check it
            elapsed = timezone.now() - loaded_time
            if elapsed.total_seconds() < settings.SPAM_LIKELY_SECONDS_THRESHOLD:
                self.context["spam_context"] = self.format_spam_context(
                    "form_submit_time", {"elapsed_seconds": elapsed.total_seconds()}
                )

    def format_spam_context(self, method: str, value: dict) -> dict:
        """normalized payload stored on SpamModeration detection_* fields"""
        return {
            "detection_method": method,
            "detection_details": value,
        }
|
||
|
||
class SpamCatcherViewSetMixin:
    """
    creates a SpamContent object on create and update if the spam_context
    flag is set by the serializer (see SpamCatcherSerializerMixin)
    """

    def perform_create(self, serializer: serializers.Serializer):
        super().perform_create(serializer)
        self.handle_spam_detection(serializer)

    def perform_update(self, serializer):
        super().perform_update(serializer)
        self.handle_spam_detection(serializer)

    def _validate_content_object(self, instance):
        # the instance must carry a spam_moderation link plus the attributes
        # the admin needs to display the flagged content
        for field in ("spam_moderation", "is_marked_spam", "get_absolute_url", "title"):
            if not hasattr(instance, field):
                raise ValueError(
                    f"instance {instance} does not have a {field} attribute"
                )

    def handle_spam_detection(self, serializer: serializers.Serializer):
        """record the saved instance as suspected spam when the serializer flagged it"""
        if "spam_context" not in serializer.context:
            return
        instance = serializer.instance
        try:
            self._validate_content_object(instance)
            self._record_spam(instance, serializer.context["spam_context"])
        except ValueError as e:
            logger.warning("Cannot flag %s as spam: %s", instance, e)

    def _record_spam(self, instance, spam_context: dict):
        """create (or re-open for review) the SpamModeration record for instance"""
        content_type = ContentType.objects.get_for_model(type(instance))
        # SpamContent updates the content instance on save
        spam_moderation, created = SpamModeration.objects.get_or_create(
            content_type=content_type,
            object_id=instance.pk,
            defaults={
                "status": SpamModeration.Status.UNREVIEWED,
                "detection_method": spam_context["detection_method"],
                "detection_details": spam_context["detection_details"],
            },
        )
        if not created:
            # an existing record (possibly already reviewed) goes back in the queue
            spam_moderation.status = SpamModeration.Status.UNREVIEWED
            spam_moderation.save()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When, if ever, will `alert_if_deleted` be executed? The old behavior was to generate a 404 when trying to retrieve something that's been marked as deleted — guessing that's no longer the case?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's now treated exactly the same as things marked spam: hidden from list view but accessible if you have a direct link to it. There wasn't much reasoning behind it besides simplifying queries.