Skip to content

Commit

Permalink
use counter for parsing moderation
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennecallies committed Nov 1, 2024
1 parent 7d40496 commit df4348d
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Generated by Django 5.0.9 on 2024-11-01 22:06

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add parsing-validation bookkeeping fields to pages and websites.

    Two fields are added to each of ``page`` and ``website`` and to their
    ``historicalpage`` / ``historicalwebsite`` counterparts:

    * ``parsing_validation_counter`` -- small integer, defaults to 0.
    * ``parsing_last_validated_at`` -- nullable, optional datetime.
    """

    # Must run after the previous migration of the ``home`` app.
    dependencies = [
        ('home', '0121_rename_is_exception_rule_historicaloneoffschedule_is_cancellation_and_more'),
    ]

    operations = [
        # --- historicalpage ---
        migrations.AddField(
            model_name='historicalpage',
            name='parsing_last_validated_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='historicalpage',
            name='parsing_validation_counter',
            field=models.SmallIntegerField(default=0),
        ),
        # --- historicalwebsite ---
        migrations.AddField(
            model_name='historicalwebsite',
            name='parsing_last_validated_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='historicalwebsite',
            name='parsing_validation_counter',
            field=models.SmallIntegerField(default=0),
        ),
        # --- page ---
        migrations.AddField(
            model_name='page',
            name='parsing_last_validated_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='page',
            name='parsing_validation_counter',
            field=models.SmallIntegerField(default=0),
        ),
        # --- website ---
        migrations.AddField(
            model_name='website',
            name='parsing_last_validated_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='website',
            name='parsing_validation_counter',
            field=models.SmallIntegerField(default=0),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Generated by Django 5.0.9 on 2024-11-01 23:10

import django.db.models.deletion
import simple_history.models
from django.db import migrations


class Migration(migrations.Migration):
    """Switch the ``parsing`` relation of the schedule models to ``HistoricForeignKey``.

    The ``parsing`` field is altered on ``oneoffschedule`` and
    ``regularschedule`` as well as on their ``historical*`` counterparts.
    """

    # Must run after the migration that added the parsing validation fields.
    dependencies = [
        ('home', '0122_historicalpage_parsing_last_validated_at_and_more'),
    ]

    operations = [
        # Historical models: nullable, no DB-level constraint, no reverse
        # accessor (related_name='+'), and DO_NOTHING on delete.
        migrations.AlterField(
            model_name='historicaloneoffschedule',
            name='parsing',
            field=simple_history.models.HistoricForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='home.parsing'),
        ),
        migrations.AlterField(
            model_name='historicalregularschedule',
            name='parsing',
            field=simple_history.models.HistoricForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='home.parsing'),
        ),
        # Live models: cascade on delete and keep their reverse accessors
        # ('one_off_schedules' / 'regular_schedules') on the parsing.
        migrations.AlterField(
            model_name='oneoffschedule',
            name='parsing',
            field=simple_history.models.HistoricForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='one_off_schedules', to='home.parsing'),
        ),
        migrations.AlterField(
            model_name='regularschedule',
            name='parsing',
            field=simple_history.models.HistoricForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='regular_schedules', to='home.parsing'),
        ),
    ]
14 changes: 9 additions & 5 deletions home/models/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from django.contrib.postgres.fields import ArrayField
from django.db import models
from pgvector.django import VectorField
from simple_history.models import HistoricalRecords
from simple_history.models import HistoricalRecords, HistoricForeignKey

from home.models.custom_fields import ChoiceArrayField
from home.utils.hash_utils import hash_string_to_hex
Expand Down Expand Up @@ -35,6 +35,8 @@ class Website(TimeStampMixin):
is_active = models.BooleanField(default=True)
pruning_validation_counter = models.SmallIntegerField(default=0)
pruning_last_validated_at = models.DateTimeField(null=True, blank=True)
parsing_validation_counter = models.SmallIntegerField(default=0)
parsing_last_validated_at = models.DateTimeField(null=True, blank=True)
history = HistoricalRecords()

_latest_crawling = None
Expand Down Expand Up @@ -108,6 +110,8 @@ class Page(TimeStampMixin):
website = models.ForeignKey('Website', on_delete=models.CASCADE, related_name='pages')
pruning_validation_counter = models.SmallIntegerField(default=0)
pruning_last_validated_at = models.DateTimeField(null=True, blank=True)
parsing_validation_counter = models.SmallIntegerField(default=0)
parsing_last_validated_at = models.DateTimeField(null=True, blank=True)
history = HistoricalRecords()

class Meta:
Expand Down Expand Up @@ -249,8 +253,8 @@ class Meta:


class OneOffSchedule(Schedule):
parsing = models.ForeignKey('Parsing', on_delete=models.CASCADE,
related_name='one_off_schedules')
parsing = HistoricForeignKey('Parsing', on_delete=models.CASCADE,
related_name='one_off_schedules')
year = models.SmallIntegerField(null=True, blank=True)
month = models.SmallIntegerField(null=True, blank=True)
day = models.SmallIntegerField(null=True, blank=True)
Expand All @@ -261,8 +265,8 @@ class OneOffSchedule(Schedule):


class RegularSchedule(Schedule):
parsing = models.ForeignKey('Parsing', on_delete=models.CASCADE,
related_name='regular_schedules')
parsing = HistoricForeignKey('Parsing', on_delete=models.CASCADE,
related_name='regular_schedules')
rrule = models.TextField() # in order to have TextArea in admin
include_periods = ChoiceArrayField(models.CharField(max_length=16,
choices=PeriodEnum.choices()),
Expand Down
1 change: 1 addition & 0 deletions home/services/edit_pruning_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def reset_pages_counter_of_pruning(pruning: Pruning):
page = scraping.page
page.pruning_validation_counter = -1
page.save()
websites_to_reset.add(page.website)

for website in websites_to_reset:
website.pruning_validation_counter = -1
Expand Down
4 changes: 4 additions & 0 deletions scraping/parse/schedules.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ class SchedulesList(BaseModel):
is_related_to_permanence: bool
will_be_seasonal_events: bool

def __eq__(self, other: object):
    """Compare two schedules lists, ignoring the order of ``schedules``.

    All fields except ``schedules`` are compared through their dumped
    values; ``schedules`` are compared as sets, so neither ordering nor
    duplicates affect the result.

    Returns ``NotImplemented`` when ``other`` is not a ``SchedulesList``
    (for instance ``None``), so that ``==`` evaluates to ``False`` and
    ``!=`` to ``True`` instead of raising ``AttributeError``.
    """
    if not isinstance(other, SchedulesList):
        return NotImplemented

    return self.model_dump(exclude={'schedules'}) == other.model_dump(exclude={'schedules'}) \
        and set(self.schedules) == set(other.schedules)


class Event(BaseModel, frozen=True):
church_id: Optional[int]
Expand Down
2 changes: 1 addition & 1 deletion scraping/services/page_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ def remove_pruning_if_orphan(pruning: Optional[Pruning]):

return False


######################
# QUALITY EVALUATION #
######################


def page_first_pruning_was_validated(page: Page) -> Optional[bool]:
for page_version in page.history.all():
if page_version.pruning_validation_counter == -1:
Expand Down
56 changes: 56 additions & 0 deletions scraping/services/parse_pruning_service.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from datetime import timedelta
from typing import Optional

from django.forms import model_to_dict
from django.utils import timezone
from pydantic import ValidationError

from home.models import Pruning, Website, Parsing, Schedule, ParsingModeration, Church, \
Expand Down Expand Up @@ -173,9 +175,63 @@ def add_necessary_parsing_moderation(parsing: Parsing, schedules_list: Optional[
def update_validated_schedules_list(parsing_moderation: ParsingModeration):
    """Record the parsing's current schedules list as the validated one.

    The dumped schedules list is stored on the moderation, the moderation is
    saved, and the validation counters tied to the parsing are then updated.

    Raises:
        AssertionError: if no schedules list can be obtained for the parsing.
    """
    parsing = parsing_moderation.parsing
    current_schedules_list = get_parsing_schedules_list(parsing)
    assert current_schedules_list is not None, 'Can not validate parsing with error'

    # Persist the validated snapshot before touching any counter.
    parsing_moderation.validated_schedules_list = current_schedules_list.model_dump()
    parsing_moderation.save()

    update_counters_of_parsing(parsing)


def has_parsing_been_modified(parsing: Parsing) -> bool:
    """Tell whether the parsing differs from its latest non-human version.

    The most recent history entry without a ``history_user_id`` is taken as
    the reference; its schedules list is compared with the one of the
    current parsing.

    Returns:
        ``True`` when the schedules lists differ, or when no reference can
        be established (no history entry without a user, or only one of
        the two schedules lists is available). ``False`` otherwise.
    """
    latest_automated_version = parsing.history.filter(history_user_id__isnull=True) \
        .order_by('-history_date').first()
    if latest_automated_version is None:
        # NOTE(review): without a user-less history entry there is nothing to
        # compare against; treated as "modified" so counters are reset rather
        # than incremented -- confirm this is the desired fallback.
        return True

    automated_schedules_list = get_parsing_schedules_list(latest_automated_version.instance)
    current_schedules_list = get_parsing_schedules_list(parsing)

    if automated_schedules_list is None or current_schedules_list is None:
        # A missing schedules list cannot be compared field by field:
        # unchanged only if both sides are missing.
        return automated_schedules_list is not current_schedules_list

    return current_schedules_list != automated_schedules_list


def update_counters_of_parsing(parsing: Parsing):
    """Update the parsing validation counters of related pages and websites.

    Every page reached through the parsing's prunings and their scrapings,
    and every website owning one of those pages, is updated:

    * if the parsing has been modified (see ``has_parsing_been_modified``),
      the counter is reset to ``-1``;
    * otherwise the counter is incremented by one.

    ``parsing_last_validated_at`` is stamped with the current time in both
    cases; ``parsing_needs_moderation`` treats a missing timestamp as
    "needs moderation", so leaving it unset would make the counters
    ineffective.
    """
    has_been_modified = has_parsing_been_modified(parsing)
    # One timestamp for the whole validation so pages and websites agree.
    validated_at = timezone.now()

    websites_to_update = set()
    for pruning in parsing.prunings.all():
        for scraping in pruning.scrapings.all():
            page = scraping.page
            if has_been_modified:
                page.parsing_validation_counter = -1
            else:
                page.parsing_validation_counter += 1
            page.parsing_last_validated_at = validated_at
            page.save()
            # Collected in a set so each website is updated once.
            websites_to_update.add(page.website)

    for website in websites_to_update:
        if has_been_modified:
            website.parsing_validation_counter = -1
        else:
            website.parsing_validation_counter += 1
        website.parsing_last_validated_at = validated_at
        website.save()


def parsing_needs_moderation(parsing: Parsing):
    """Return True if any page behind this parsing still requires moderation.

    A page requires moderation when BOTH the page and its website are
    insufficiently validated. An entity is insufficiently validated when its
    ``parsing_validation_counter`` is below its threshold (2 for a page, 6
    for a website), when it has never been validated, or when its last
    validation is more than 365 days old.

    NOTE(review): the previous comment spoke of "three" and "seven"
    validations while the code compares against 2 and 6 -- confirm which
    is intended.
    """

    def lacks_validation(counter, last_validated_at, threshold):
        # Cheap checks first; the clock is only read when they don't decide.
        if counter < threshold or last_validated_at is None:
            return True
        return last_validated_at < (timezone.now() - timedelta(days=365))

    for pruning in parsing.prunings.all():
        for scraping in pruning.scrapings.all():
            page = scraping.page
            if not lacks_validation(page.parsing_validation_counter,
                                    page.parsing_last_validated_at, 2):
                # Page is validated enough: its website does not matter.
                continue

            website = page.website
            if lacks_validation(website.parsing_validation_counter,
                                website.parsing_last_validated_at, 6):
                return True

    return False


########
# MAIN #
Expand Down

0 comments on commit df4348d

Please sign in to comment.