Skip to content

Commit

Permalink
Merge branch 'fix-2561-bug_walla_scraping' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
EliorGigi committed Mar 2, 2024
2 parents e8db8d5 + 50b3212 commit fcc2cbb
Showing 1 changed file with 26 additions and 1 deletion.
27 changes: 26 additions & 1 deletion anyway/parsers/rss_sites.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,39 @@
import json
import logging

import feedparser
import requests
from bs4 import BeautifulSoup

from anyway.parsers import timezones


def _walla_author_names(author_section):
    """Normalise a JSON-LD ``author`` value into a list of non-empty names."""
    if isinstance(author_section, (dict, str)):
        # A single author, given either as an object or as a bare string.
        author_section = [author_section]
    elif not isinstance(author_section, list):
        # Missing key (None) or an unexpected type: nothing to extract.
        return []

    names = []
    for entry in author_section:
        name = entry.get('name', '') if isinstance(entry, dict) else entry
        if isinstance(name, str) and name.strip():
            names.append(name.strip())
    return names


def get_author_from_walla_html_soup(html_soup):
    """Return the author name(s) found in the page's JSON-LD script tags.

    Every ``<script type="application/ld+json">`` tag is parsed as JSON and
    its ``author`` entry is read. The entry may be a single object, a list
    of objects, or plain strings.

    Script tags that are empty, contain invalid JSON, or carry no author are
    skipped instead of raising, so one bad tag does not abort the scrape.

    :param html_soup: parsed page exposing a BeautifulSoup-style
        ``find_all(name, attrs)`` method.
    :return: the author names joined with ``', '``; an empty string when no
        author could be found.
    """
    script_tags = html_soup.find_all('script', {'type': 'application/ld+json'})
    results = []
    for script_tag in script_tags:
        # ``.string`` is None when the tag is empty or has several children.
        script_text = script_tag.string
        if not script_text:
            continue
        try:
            data = json.loads(script_text)
        except ValueError:
            # Malformed JSON-LD (json.JSONDecodeError subclasses ValueError).
            continue

        # A JSON-LD payload may be one object or a list of objects.
        documents = data if isinstance(data, list) else [data]
        for document in documents:
            if not isinstance(document, dict):
                continue
            author_names = _walla_author_names(document.get('author'))
            if author_names:
                results.append(', '.join(author_names))
    return ', '.join(results).strip()

def parse_html_walla(item_rss, html_soup):
    """Extract the author and description of a Walla news item.

    :param item_rss: RSS entry; its ``"summary"`` field is read and may
        itself contain HTML markup.
    :param html_soup: parsed article page, passed on to
        ``get_author_from_walla_html_soup``.
    :return: an ``(author, description)`` tuple.
    """
    # For some reason there's html here
    description = BeautifulSoup(item_rss["summary"], features="lxml").text

    # The author is taken from the page's JSON-LD metadata. The former
    # ``div.author`` lookup was dropped: its result was overwritten anyway
    # and it raised AttributeError whenever that div was missing.
    author = get_author_from_walla_html_soup(html_soup)
    logging.getLogger(__name__).debug("author: %s", author)
    return author, description


Expand Down

0 comments on commit fcc2cbb

Please sign in to comment.