Skip to content

Commit

Permalink
do not use pruning any more
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennecallies committed Nov 17, 2024
1 parent 0273baa commit 934c0f0
Showing 1 changed file with 26 additions and 10 deletions.
36 changes: 26 additions & 10 deletions scraping/services/scrape_page_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,49 @@
create_pruning


def is_extracted_html_list_identical_for_scraping(scraping: Scraping,
                                                  extracted_html_list: list[str]) -> bool:
    """Tell whether a scraping's prunings hold exactly the given extracted HTML.

    The comparison is set-based: ordering and duplicate values are ignored
    on both sides.

    :param scraping: scraping whose ``prunings.all()`` items each expose an
        ``extracted_html`` attribute
    :param extracted_html_list: freshly extracted HTML fragments
    :return: ``True`` when both sides are empty or contain the same distinct
        HTML values, ``False`` otherwise
    """
    prunings = scraping.prunings.all()

    # Nothing stored and nothing extracted: trivially identical.
    if not prunings and not extracted_html_list:
        return True

    # Exactly one side is empty: necessarily different.
    if not prunings or not extracted_html_list:
        return False

    return {p.extracted_html for p in prunings} == set(extracted_html_list)


def upsert_scraping(page: Page, extracted_html: Optional[str]) -> None:
    """Record the outcome of scraping ``page``, reusing the latest scraping if unchanged.

    When the page's latest scraping holds the same extracted HTML, its
    ``nb_iterations`` counter is incremented and each of its prunings is
    passed to ``prune_pruning`` again. Otherwise the latest scraping (if any)
    is deleted and a new one is created with one pruning per extracted HTML
    item.

    :param page: page whose scraping is inserted or updated
    :param extracted_html: HTML extracted from the page; ``None`` or an empty
        string is treated as "nothing extracted"
    """
    extracted_html_list = [extracted_html] if extracted_html else []  # TODO adapt this

    # Compare result to last scraping
    scraping = page.get_latest_scraping()
    if (scraping is not None
            and is_extracted_html_list_identical_for_scraping(scraping,
                                                              extracted_html_list)):
        # If a scraping exists and is identical to last one
        scraping.nb_iterations += 1
        scraping.save()

        for pruning in scraping.prunings.all():
            prune_pruning(pruning)
        return

    if scraping is not None:
        # If a scraping exists and is different from last one, we delete it
        delete_scraping(scraping)

    prunings = [create_pruning(extracted_html_item)
                for extracted_html_item in extracted_html_list]

    legacy_pruning = prunings[0] if prunings else None  # TODO remove this line

    scraping = Scraping(
        nb_iterations=1,
        page=page,
        pruning=legacy_pruning,  # TODO remove this line
    )
    # The scraping is saved before prunings are attached to it.
    scraping.save()

    for pruning in prunings:
        scraping.prunings.add(pruning)
        prune_pruning(pruning)

0 comments on commit 934c0f0

Please sign in to comment.