Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add torideal spider #50

Merged
merged 1 commit into from
May 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ env:
- SPIDER=vauvapage
- SPIDER=suomi24page
- SPIDER=oikotieapartment
- SPIDER=torideal
- SPIDER="not spider"
script: pytest -v -m "$SPIDER"
jobs:
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ The library provides an easy-to-use API for fetching data from various Finnish w
| [Suomi24](https://keskustelu.suomi24.fi) | Discussion thread | `Suomi24Page` |
| [Vauva](https://www.vauva.fi) | Discussion thread | `VauvaPage` |
| [Oikotie Asunnot](https://asunnot.oikotie.fi/myytavat-asunnot) | Apartment ad | `OikotieApartment` |
| [Tori](https://www.tori.fi) | Item deal | `ToriDeal` |

Documentation is available at [https://finscraper.readthedocs.io](https://finscraper.readthedocs.io) and [simple online demo here](https://storage.googleapis.com/jmyrberg/index.html#/demo-projects/finscraper).

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.1dev21
0.0.1dev22
8 changes: 8 additions & 0 deletions docs/source/finscraper.scrapy_spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ finscraper.scrapy\_spiders.suomi24page module
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.torideal module
------------------------------------------

.. automodule:: finscraper.scrapy_spiders.torideal
:members:
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.vauvapage module
-------------------------------------------

Expand Down
137 changes: 137 additions & 0 deletions finscraper/scrapy_spiders/torideal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Module for ToriDeal spider."""


import time

from functools import partial

from scrapy import Item, Field, Selector
from scrapy.crawler import Spider
from scrapy.exceptions import DropItem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, \
Compose, Join

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from finscraper.utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements, replace


class _ToriDealSpider(FollowAndParseItemMixin, Spider):
    """Crawl tori.fi, following listing pages and parsing item-deal pages.

    The crawl/parse loop itself comes from ``FollowAndParseItemMixin``;
    this class only configures the link extractors and implements
    ``_parse_item`` for a single deal page.
    """
    name = 'torideal'
    start_urls = ['https://tori.fi']
    # Links to keep crawling (category / listing pages). Account pages
    # and auxiliary subdomains are excluded.
    follow_link_extractor = LinkExtractor(
        allow_domains=('tori.fi',),
        allow=(),
        deny=(r'.*tili.*',),
        deny_domains=('tuki.tori.fi', 'blog.tori.fi', 'tori-kaupat.tori.fi',
                      'careers.tori.fi', 'media.tori.fi'),
        canonicalize=True
    )
    # Links that point to a single deal page, e.g. "/sohva_12345.htm".
    # NOTE: the class is [a-zA-Z0-9_] on purpose — the earlier [A-z] also
    # matched the ASCII characters between 'Z' and 'a', and the dot before
    # "htm" is escaped so it no longer matches arbitrary characters.
    item_link_extractor = LinkExtractor(
        allow_domains=('tori.fi',),
        allow=(r'/[a-zA-Z0-9_]+\.htm.*',),
        deny=(r'.*tili.*',),
        deny_domains=('tuki.tori.fi', 'blog.tori.fi', 'tori-kaupat.tori.fi',
                      'careers.tori.fi', 'media.tori.fi', 'asunnot.tori.fi'),
        canonicalize=True
    )
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }

    def __init__(self, *args, **kwargs):
        """Fetch deals from tori.fi.

        Args:
        """
        super(_ToriDealSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def _get_image_metadata(text):
        """Extract ``src``, ``alt`` and ``title`` attributes from an HTML
        snippet (an ``<img>`` element); missing attributes yield None."""
        sel = Selector(text=text)
        return {
            'src': sel.xpath('//@src').get(),
            'alt': sel.xpath('//@alt').get(),
            'title': sel.xpath('//@title').get()
        }

    def _parse_item(self, resp):
        """Populate a ``_ToriDealItem`` from a single deal-page response."""
        loader = ItemLoader(item=_ToriDealItem(), response=resp)
        loader.add_value('url', resp.url)
        loader.add_value('time', int(time.time()))
        loader.add_xpath('seller',
            '//div[contains(@id, "seller_info")]//text()')
        loader.add_xpath('name',
            '//div[@class="topic"]//*[contains(@itemprop, "name")]//text()')
        loader.add_xpath('description',
            '//*[contains(@itemprop, "description")]//text()')
        loader.add_xpath('price',
            '//*[contains(@itemprop, "price")]//text()')
        loader.add_xpath('type',
            '//td[contains(text(), "Ilmoitustyyppi")]'
            '/following-sibling::td[1]//text()')
        loader.add_xpath('published',
            '//td[contains(text(), "Ilmoitus jätetty")]'
            '/following-sibling::td[1]//text()')
        loader.add_xpath('images', '//div[@class="media_container"]//img')
        return loader.load_item()


class _ToriDealItem(Item):
    __doc__ = """
    Returned fields:
        * url (str): URL of the scraped web page.
        * time (int): UNIX timestamp of the scraping.
        * seller (str): Seller of the item.
        * name (str): Name of the item.
        * description (list of str): Description of the item.
        * price (str): Price of the item.
        * type (str): Type of the deal.
        * published (str): Publish time of the deal.
        * images (list of dict): Images of the item.
    """
    # url/time are added via add_value and need no text processing.
    url = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    time = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    # Seller info arrives as many text nodes: strip each, drop blanks,
    # flatten embedded newlines to spaces, then join lines with '\n'.
    seller = Field(
        input_processor=Compose(
            strip_elements,
            drop_empty_elements,
            MapCompose(partial(replace, source='\n', target=' ')),
            partial(strip_join, join_with='\n')),
        output_processor=TakeFirst()
    )
    name = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # Same normalization as seller, but blanks are kept.
    description = Field(
        input_processor=Compose(
            strip_elements,
            MapCompose(partial(replace, source='\n', target=' ')),
            partial(strip_join, join_with='\n')),
        output_processor=TakeFirst()
    )
    price = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    type = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    published = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # Each matched <img> element is reduced to a {src, alt, title} dict;
    # the list of dicts is returned as-is (no TakeFirst).
    images = Field(
        input_processor=MapCompose(_ToriDealSpider._get_image_metadata),
        output_processor=Identity()
    )
13 changes: 13 additions & 0 deletions finscraper/spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
_YLEArticleItem
from finscraper.scrapy_spiders.oikotieapartment import \
_OikotieApartmentSpider, _OikotieApartmentItem
from finscraper.scrapy_spiders.torideal import _ToriDealSpider, \
_ToriDealItem


__wrapper_doc__ = '''
Expand Down Expand Up @@ -117,3 +119,14 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)


class ToriDeal(_SpiderWrapper):
    __doc__ = _get_docstring(_ToriDealSpider, _ToriDealItem)

    def __init__(self, jobdir=None, progress_bar=True, log_level=None):
        # The tori.fi spider takes no spider-specific parameters; everything
        # else is delegated to the generic wrapper.
        super().__init__(
            spider_cls=_ToriDealSpider,
            spider_params=dict(),
            jobdir=jobdir,
            progress_bar=progress_bar,
            log_level=log_level)
7 changes: 6 additions & 1 deletion finscraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,18 @@ def strip_join(text_list, join_with=' '):
return joined_text


def replace(text, source, target):
    """Replace every occurrence of `source` with `target` in `text`.

    A None input passes through unchanged, so this is safe to use in
    item-loader pipelines where missing values are represented as None.
    """
    if text is None:
        return None
    return text.replace(source, target)


def strip_elements(text_list):
    """Strip surrounding whitespace from each element, skipping Nones.

    Empty strings produced by stripping are kept; use
    `drop_empty_elements` afterwards to remove them.
    """
    stripped = []
    for element in text_list:
        if element is not None:
            stripped.append(element.strip())
    return stripped


def drop_empty_elements(text_list):
    """Drop None values and empty/whitespace-only strings from a list.

    Non-string elements (other than None) are kept as-is. Note the
    boolean structure: the previous `text is not None or (...)` form kept
    every non-None element — including blank strings — because the first
    clause was already true for them; `and` with `not isinstance` keeps
    non-strings while still filtering blank strings.
    """
    return [text for text in text_list
            if text is not None
            and (not isinstance(text, str) or text.strip() != '')]


def safe_cast_int(text):
try:
Expand Down
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ markers =
demipage: Mark test as a demipage test.
vauvapage: Mark test as a vauvapage test.
suomi24page: Mark test as a suomi24page test.
oikotieapartment: Mark test as a oikotieapartment test.
oikotieapartment: Mark test as a oikotieapartment test.
torideal: Mark test as a torideal test.
8 changes: 7 additions & 1 deletion tests/test_spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
pytestmark = [pytest.mark.spider]

from finscraper.spiders import ILArticle, ISArticle, YLEArticle, VauvaPage, \
OikotieApartment, DemiPage, Suomi24Page
OikotieApartment, DemiPage, Suomi24Page, ToriDeal

from tests.utils import calc_field_emptiness

Expand Down Expand Up @@ -55,6 +55,12 @@
'params': [None],
'n_fields': 80,
'mark': pytest.mark.oikotieapartment
},
{
'class': ToriDeal,
'params': [None],
'n_fields': 9,
'mark': pytest.mark.torideal
}
]

Expand Down