Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Person identifier cleaning #2433

Merged
merged 5 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions ynr/apps/candidatebot/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,13 @@ def add_theyworkforyou(self, twfy_id):
value = f"https://www.theyworkforyou.com/mp/{twfy_id}/"
internal_id = f"uk.org.publicwhip/person/{twfy_id}"
self.edit_field("theyworkforyou", value, internal_id=internal_id)

def delete_field(self, value_type):
"""
Remove a PersonIdentifier from a person
"""
deleted, _ = self.person.tmp_person_identifiers.filter(
symroe marked this conversation as resolved.
Show resolved Hide resolved
value_type=value_type
).delete()
if deleted:
self.edits_made = True
22 changes: 22 additions & 0 deletions ynr/apps/people/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,25 @@ standing in.
Thing like their political party should be stored on the _candidacy_.
This is currently represented as a `Membership` in the `popolo` app.

# Person Identifiers

`PersonIdentifiers` are used to store contact information and online presence for a candidate. Users can add/edit these identifiers in the person update form. The following fields are available in the `PersonIdentifier` model with the corresponding form labels and accepted formats:
```
email = "Email": Accepts a valid email address; raises an exception if the email is submitted with an invalid format. The email address must be unique.
facebook_page_url = "Facebook Page": Accepts a valid URL but does not currently validate the domain.
facebook_personal_url = "Facebook Personal": Same as above.
homepage_url = "Homepage": Accepts a valid URL
blog_url = "Blog": Accepts a valid URL
linkedin_url = "Linkedin": Accepts a valid URL; does not currently validate the domain or the format of the username.
party_ppc_page_url = "Party Candidate Page": Accepts a valid URL
twitter_username = "Twitter": Accepts a Twitter username and returns the full Twitter URL.No longer validates the username actually exists.
mastodon_username = "Mastodon": Accepts and returns a valid Mastodon URL and validates the domain and username.
wikipedia_url = "Wikipedia": Accepts a valid URL; does not currently validate the domain.
wikidata_id = "Wikidata": Accepts a valid Wikidata ID
youtube_profile = "YouTube Profile": Accepts a valid URL; does not currently validate the domain.
instagram_url = "Instagram Profile": Accepts and returns a valid Instagram URL and validates the domain and format of the username.
blue_sky_url = "Bluesky URL": Accepts and returns a valid URL
threads_url = "Threads URL": Accepts and returns a valid URL
tiktok_url = "TikTok URL": Accepts and returns a valid URL
other_url = "Other URL": Accepts and returns a valid URL
```
25 changes: 25 additions & 0 deletions ynr/apps/people/forms/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
StrippedCharField,
)
from people.helpers import (
clean_instagram_url,
clean_linkedin_url,
clean_mastodon_username,
clean_twitter_username,
clean_wikidata_id,
Expand Down Expand Up @@ -116,6 +118,17 @@ def clean(self):
self.add_error(None, e)
return self.cleaned_data

def clean_instagram_url(self, username):
if self.instance.value != username:
self.instance.internal_identifier = None
if self.instance.internal_identifier:
return username
try:
return clean_instagram_url(username)
except ValueError as e:
raise ValidationError(e)
return username

def clean_twitter_username(self, username):
if self.instance.value != username:
self.instance.internal_identifier = None
Expand All @@ -128,6 +141,18 @@ def clean_twitter_username(self, username):
except ValueError as e:
raise ValidationError(e)

def clean_linkedin_url(self, url):
if self.instance.value != url:
self.instance.internal_identifier = None

if self.instance.internal_identifier:
return url

try:
return clean_linkedin_url(url)
except ValueError as e:
raise ValidationError(e)

def clean_mastodon_username(self, username):
if self.instance.value != username:
self.instance.internal_identifier = None
Expand Down
48 changes: 47 additions & 1 deletion ynr/apps/people/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import contextlib
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urlunparse

from candidates.mastodon_api import (
MastodonAPITokenMissing,
Expand Down Expand Up @@ -120,6 +120,52 @@ def clean_twitter_username(username):
return username


def clean_linkedin_url(url):
parsed_url = urlparse(url)
valid = True
if not re.match(r"([^.]+)?\.linkedin.com$", parsed_url.netloc):
valid = False
path = parsed_url.path
if path.startswith("/pub/"):
parts = path.split("/")
name = parts[2]
id_parts = parts[3:]
user_id = "".join(id_parts[::-1])
path = f"/in/{name}-{user_id}/"

if not path.startswith("/in/"):
valid = False

if not valid:
raise ValueError("Please enter a valid LinkedIn URL.")
return urlunparse(parsed_url._replace(path=path))


def clean_instagram_url(url):
parsed_username = urlparse(url)
if not parsed_username.scheme:
url = f"https://{url}"
parsed_username = urlparse(url)

if parsed_username.netloc and parsed_username.netloc not in [
"instagram.com",
"www.instagram.com",
"instagr.am",
"www.instagr.am",
]:
raise ValueError(
"The Instagram URL must be from a valid Instagram domain."
)
username = parsed_username.path.strip("/")
# RegEx thanks to https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
if not re.match(
r"(?:@?)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)$",
username,
):
raise ValueError("This is not a valid Instagram URL. Please try again.")
return f"https://www.instagram.com/{username}/"


def clean_wikidata_id(identifier):
identifier = identifier.strip().lower()
m = re.search(r"^.*wikidata.org/(wiki|entity)/(\w+)", identifier)
Expand Down
86 changes: 86 additions & 0 deletions ynr/apps/people/management/commands/reformat_person_identifiers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from candidatebot.helpers import CandidateBot
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand
from django.db import transaction
from people.helpers import (
clean_instagram_url,
clean_linkedin_url,
clean_mastodon_username,
clean_twitter_username,
clean_wikidata_id,
)
from people.models import PersonIdentifier

# Add to this list when we have dedicated clean_* functions for an value type
supported_identifiers = {
"instagram_url": clean_instagram_url,
"linkedin_url": clean_linkedin_url,
"twitter_username": clean_twitter_username,
"mastodon_username": clean_mastodon_username,
"wikidata_id": clean_wikidata_id,
}


class Command(BaseCommand):
help = f"""Checks and updates links in the PersonIdentifier field.

Currently the fields supported are {", ".join(supported_identifiers)}.
"""

def add_arguments(self, parser):
parser.add_argument(
"--update",
help="Write changes to IDs that have been cleaned",
action="store_true",
)
parser.add_argument(
"--remove",
help="Remove invalid IDs that can't be cleaned",
action="store_true",
)

@transaction.atomic()
def handle(self, *args, **options):
"""
Iterate over all PersonIdentifier objects and reformat urls as needed.
"""

if not options["update"] or not options["remove"]:
self.stdout.write("Not modifying data, just reporting on changes.")

person_identifiers = PersonIdentifier.objects.filter(
value_type__in=supported_identifiers.keys()
)
for identifier in person_identifiers:
initial_value = identifier.value
try:
cleaned_value = supported_identifiers[identifier.value_type](
initial_value
)
if initial_value != cleaned_value:
self.change_identifier(
identifier, cleaned_value, save=options["update"]
)

except (ValueError, ValidationError) as exception:
self.unclean_identifier(
identifier, exception, remove=options["remove"]
)

def change_identifier(self, identifier, cleaned_value, save=False):
self.stdout.write(
f"Changing person {identifier.person_id}: {identifier.value} to {cleaned_value}"
)
if save:
bot = CandidateBot(identifier.person_id, update=True)
bot.edit_field(identifier.value_type, cleaned_value)
bot.save(source="Auto-cleaning URLs")

def unclean_identifier(self, identifier, exception, remove=False):
self.stdout.write(
f"ERROR: {identifier.person_id}: {identifier.value}: {exception}"
)
if remove:
bot = CandidateBot(identifier.person_id, update=True)
bot.delete_field(identifier.value_type)
bot.save(source="Auto removing invalid IDs")
2 changes: 2 additions & 0 deletions ynr/apps/people/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ def get_value_html(self):

if self.value_type == "twitter_username":
url = format_html("https://twitter.com/{}", self.value)
if self.value_type == "instagram_url":
url = self.value

if self.value.startswith("http"):
url = format_html("{}", self.value)
Expand Down
76 changes: 61 additions & 15 deletions ynr/apps/people/tests/test_person_form_identifier_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,20 +150,6 @@ def _submit_values(self, value, value_type="twitter_username"):
form["source"] = "They changed their username"
form["tmp_person_identifiers-0-value_type"] = value_type
form["tmp_person_identifiers-0-value"] = value
form.submit()
return form.submit()

def _submit_mastodon_values(self, value, value_type="mastodon_username"):
resp = self.app.get(
reverse("person-update", kwargs={"person_id": self.person.pk}),
user=self.user,
)

form = resp.forms[1]
form["source"] = "They changed their username"
form["tmp_person_identifiers-0-value_type"] = value_type
form["tmp_person_identifiers-0-value"] = value
form.submit()
return form.submit()

def test_twitter_bad_url(self):
Expand All @@ -185,9 +171,48 @@ def test_twitter_full_url(self):
PersonIdentifier.objects.get().value, "madeuptwitteraccount"
)

def test_clean_instagram_url(self):
resp = self._submit_values(
"https://www.instagr.am/disco_dude", value_type="instagram_url"
)
self.assertEqual(resp.status_code, 302)
instagram_url_qs = PersonIdentifier.objects.filter(
value_type="instagram_url"
)
self.assertEqual(instagram_url_qs.count(), 1)
self.assertEqual(
instagram_url_qs[0].value,
"https://www.instagram.com/disco_dude/",
)

def test_bad_instagram_domain(self):
resp = self._submit_values(
"www.instbad.am/blah", value_type="instagram_url"
)
form = resp.context["identifiers_formset"]
self.assertFalse(form.is_valid())
self.assertEqual(
form[0].non_field_errors(),
["The Instagram URL must be from a valid Instagram domain."],
)

def test_bad_instagram_username(self):
resp = self._submit_values(
"https://www.instagr.am/___@_____blah", value_type="instagram_url"
)
self.assertEqual(resp.status_code, 200)
form = resp.context["identifiers_formset"]
self.assertFalse(form.is_valid())
self.assertEqual(
form[0].non_field_errors(),
["This is not a valid Instagram URL. Please try again."],
)

def test_mastodon_bad_url(self):
# submit a username missing the `@` symbol
resp = self._submit_mastodon_values("https://mastodon.social/joe")
resp = self._submit_values(
"https://mastodon.social/joe", value_type="mastodon_username"
)
form = resp.context["identifiers_formset"]
self.assertFalse(form.is_valid())
self.assertEqual(
Expand All @@ -209,6 +234,27 @@ def test_mastodon_full_url(self):
qs = PersonIdentifier.objects.all()
self.assertEqual(qs[0].value, "joe")

def test_linkedin_urls(self):
urls_to_valid = (
("http://example.com/@blah", False),
("https://www.linkedin.com/in/first-last-57338a4/", True),
("https://uk.linkedin.com/in/first-last-57338a4/", True),
("https://ie.linkedin.com/in/first-last-57338a4/", True),
("https://ie.linkedin.com/in/first-last-57338a4", True),
)
for url, valid in urls_to_valid:
with self.subTest(url=url, value=valid):
resp = self._submit_values(url, "linkedin_url")
if valid:
self.assertEqual(resp.status_code, 302)
else:
form = resp.context["identifiers_formset"]
self.assertFalse(form.is_valid())
self.assertEqual(
form[0].non_field_errors(),
["Please enter a valid LinkedIn URL."],
)

def test_bad_email_address(self):
resp = self._submit_values("whoops", "email")
form = resp.context["identifiers_formset"]
Expand Down
17 changes: 17 additions & 0 deletions ynr/apps/people/tests/test_person_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,23 @@ def test_get_value_html_twitter(self):
# Test the value type HTML
self.assertEqual(pi.get_value_type_html, "Twitter")

def test_get_value_html_instagram(self):
pi = PersonIdentifier.objects.create(
person=self.person,
value="https://www.instagram.com/democlub",
value_type="instagram_url",
internal_identifier="2324",
)

# Test the value HTML
self.assertEqual(
pi.get_value_html,
"""<a href="https://www.instagram.com/democlub" rel="nofollow">https://www.instagram.com/democlub</a>""",
)

# Test the value type HTML
self.assertEqual(pi.get_value_type_html, "Instagram")

def test_get_value_html_mastodon(self):
pi = PersonIdentifier.objects.create(
person=self.person,
Expand Down
4 changes: 2 additions & 2 deletions ynr/apps/ynr_refactoring/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ class PersonIdentifierFields(Enum):
facebook_personal_url = "Facebook Personal"
homepage_url = "Homepage"
blog_url = "Blog"
linkedin_url = "Linkedin"
linkedin_url = "LinkedIn URL"
party_ppc_page_url = "Party Candidate Page"
twitter_username = "Twitter"
mastodon_username = "Mastodon"
wikipedia_url = "Wikipedia"
wikidata_id = "Wikidata"
youtube_profile = "YouTube Profile"
instagram_url = "Instagram Profile"
instagram_url = "Instagram URL"
blue_sky_url = "Bluesky URL"
threads_url = "Threads URL"
tiktok_url = "TikTok URL"
Expand Down