Skip to content

Commit

Permalink
Add torideal spider (#50)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmyrberg authored May 23, 2020
1 parent 9ff9817 commit 45b534f
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 4 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ env:
- SPIDER=vauvapage
- SPIDER=suomi24page
- SPIDER=oikotieapartment
- SPIDER=torideal
- SPIDER="not spider"
script: pytest -v -m "$SPIDER"
jobs:
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ The library provides an easy-to-use API for fetching data from various Finnish w
| [Suomi24](https://keskustelu.suomi24.fi) | Discussion thread | `Suomi24Page` |
| [Vauva](https://www.vauva.fi) | Discussion thread | `VauvaPage` |
| [Oikotie Asunnot](https://asunnot.oikotie.fi/myytavat-asunnot) | Apartment ad | `OikotieApartment` |
| [Tori](https://www.tori.fi) | Item deal | `ToriDeal` |

Documentation is available at [https://finscraper.readthedocs.io](https://finscraper.readthedocs.io), and a [simple online demo is available here](https://storage.googleapis.com/jmyrberg/index.html#/demo-projects/finscraper).

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.1dev21
0.0.1dev22
8 changes: 8 additions & 0 deletions docs/source/finscraper.scrapy_spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ finscraper.scrapy\_spiders.suomi24page module
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.torideal module
------------------------------------------

.. automodule:: finscraper.scrapy_spiders.torideal
:members:
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.vauvapage module
-------------------------------------------

Expand Down
137 changes: 137 additions & 0 deletions finscraper/scrapy_spiders/torideal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Module for ToriDeal spider."""


import time

from functools import partial

from scrapy import Item, Field, Selector
from scrapy.crawler import Spider
from scrapy.exceptions import DropItem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, \
Compose, Join

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from finscraper.utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements, replace


class _ToriDealSpider(FollowAndParseItemMixin, Spider):
    # Spider for tori.fi: `follow_link_extractor` selects the links that are
    # crawled further, `item_link_extractor` selects the links that are
    # treated as deal pages and handed to `_parse_item`.
    # NOTE(review): how the two extractors are consumed is defined in
    # FollowAndParseItemMixin, which is not visible here - confirm.
    name = 'torideal'
    start_urls = ['https://tori.fi']
    follow_link_extractor = LinkExtractor(
        allow_domains=('tori.fi',),
        allow=(),
        # Skip account ("tili") pages.
        deny=(r'.*tili.*',),
        deny_domains=('tuki.tori.fi', 'blog.tori.fi', 'tori-kaupat.tori.fi',
                      'careers.tori.fi', 'media.tori.fi'),
        canonicalize=True
    )
    item_link_extractor = LinkExtractor(
        allow_domains=('tori.fi',),
        # Deal pages end in "<name>.htm" (optionally followed by a query).
        # The character class is spelled out explicitly: "A-z" would also
        # match the punctuation between "Z" and "a" in ASCII, and the dot is
        # escaped so that it only matches a literal ".".
        allow=(r'/[A-Za-z0-9_]+\.htm.*',),
        deny=(r'.*tili.*',),
        deny_domains=('tuki.tori.fi', 'blog.tori.fi', 'tori-kaupat.tori.fi',
                      'careers.tori.fi', 'media.tori.fi', 'asunnot.tori.fi'),
        canonicalize=True
    )
    # NOTE(review): robots.txt is ignored for this spider - confirm that
    # this is intended.
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }

    def __init__(self, *args, **kwargs):
        """Fetch deals from tori.fi.
        Args:
        """
        super(_ToriDealSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def _get_image_metadata(text):
        """Extract image attributes from an HTML snippet.

        Args:
            text (str): HTML markup, e.g. a single ``<img>`` element.

        Returns:
            dict: Keys ``src``, ``alt`` and ``title`` holding the first
            matching attribute value found in ``text``, or None when the
            attribute is not present.
        """
        sel = Selector(text=text)
        return {
            'src': sel.xpath('//@src').get(),
            'alt': sel.xpath('//@alt').get(),
            'title': sel.xpath('//@title').get()
        }

    def _parse_item(self, resp):
        """Build a `_ToriDealItem` from a deal page response.

        Args:
            resp: Response of the deal page.

        Returns:
            The loaded item; field post-processing is defined by the
            processors declared on `_ToriDealItem`.
        """
        loader = ItemLoader(item=_ToriDealItem(), response=resp)
        loader.add_value('url', resp.url)
        loader.add_value('time', int(time.time()))
        loader.add_xpath(
            'seller',
            '//div[contains(@id, "seller_info")]//text()')
        loader.add_xpath(
            'name',
            '//div[@class="topic"]//*[contains(@itemprop, "name")]//text()')
        loader.add_xpath(
            'description',
            '//*[contains(@itemprop, "description")]//text()')
        loader.add_xpath(
            'price',
            '//*[contains(@itemprop, "price")]//text()')
        # "Ilmoitustyyppi" = ad type; the value is in the next table cell.
        loader.add_xpath(
            'type',
            '//td[contains(text(), "Ilmoitustyyppi")]'
            '/following-sibling::td[1]//text()')
        # "Ilmoitus jätetty" = ad submitted; the value is in the next cell.
        loader.add_xpath(
            'published',
            '//td[contains(text(), "Ilmoitus jätetty")]'
            '/following-sibling::td[1]//text()')
        # Whole <img> elements are collected here; they are converted to
        # dicts by the `images` field's input processor.
        loader.add_xpath('images', '//div[@class="media_container"]//img')
        return loader.load_item()


class _ToriDealItem(Item):
    __doc__ = """
    Returned fields:
        * url (str): URL of the scraped web page.
        * time (int): UNIX timestamp of the scraping.
        * seller (str): Seller of the item.
        * name (str): Name of the item.
        * description (str): Description of the item.
        * price (str): Price of the item.
        * type (str): Type of the deal.
        * published (str): Publish time of the deal.
        * images (list of dict): Images of the item.
    """
    # Values passed through unchanged; the first one is kept.
    url = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    time = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    # Text fragments are stripped, empty ones dropped, embedded newlines
    # replaced by spaces, and the fragments joined with newlines.
    seller = Field(
        input_processor=Compose(
            strip_elements,
            drop_empty_elements,
            MapCompose(partial(replace, source='\n', target=' ')),
            partial(strip_join, join_with='\n')),
        output_processor=TakeFirst()
    )
    name = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # Same pipeline as `seller`, but without dropping empty fragments.
    description = Field(
        input_processor=Compose(
            strip_elements,
            MapCompose(partial(replace, source='\n', target=' ')),
            partial(strip_join, join_with='\n')),
        output_processor=TakeFirst()
    )
    price = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    type = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    published = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # Each collected <img> element is converted to a dict of its
    # src/alt/title attributes; all images are kept.
    images = Field(
        input_processor=MapCompose(_ToriDealSpider._get_image_metadata),
        output_processor=Identity()
    )
13 changes: 13 additions & 0 deletions finscraper/spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
_YLEArticleItem
from finscraper.scrapy_spiders.oikotieapartment import \
_OikotieApartmentSpider, _OikotieApartmentItem
from finscraper.scrapy_spiders.torideal import _ToriDealSpider, \
_ToriDealItem


__wrapper_doc__ = '''
Expand Down Expand Up @@ -117,3 +119,14 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)


class ToriDeal(_SpiderWrapper):
    __doc__ = _get_docstring(_ToriDealSpider, _ToriDealItem)

    def __init__(self, jobdir=None, progress_bar=True, log_level=None):
        # The tori.fi spider takes no spider-specific parameters, so only
        # the generic wrapper options are forwarded.
        wrapper_kwargs = {
            'spider_cls': _ToriDealSpider,
            'spider_params': {},
            'jobdir': jobdir,
            'progress_bar': progress_bar,
            'log_level': log_level,
        }
        super().__init__(**wrapper_kwargs)
7 changes: 6 additions & 1 deletion finscraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,18 @@ def strip_join(text_list, join_with=' '):
return joined_text


def replace(text, source, target):
    """Replace every occurrence of ``source`` in ``text`` with ``target``.

    Args:
        text (str or None): Text to process.
        source (str): Substring to search for.
        target (str): Replacement for ``source``.

    Returns:
        str or None: The processed text, or None when ``text`` is None.
    """
    if text is None:
        return None
    return text.replace(source, target)


def strip_elements(text_list):
    """Strip surrounding whitespace from every element, skipping None.

    Args:
        text_list (iterable): Strings, possibly mixed with None values.

    Returns:
        list: Stripped strings in their original order; None values are
        left out, empty strings are kept.
    """
    stripped = []
    for element in text_list:
        if element is None:
            continue
        stripped.append(element.strip())
    return stripped


def drop_empty_elements(text_list):
    """Drop None values and blank strings from a list.

    A string is considered blank when it is empty or consists of
    whitespace only. Kept elements are returned unmodified (they are not
    stripped).

    NOTE(review): non-string, non-None values are kept as they are - this
    is assumed to be the intent of the relaxed condition; confirm.

    Args:
        text_list (iterable): Elements to filter.

    Returns:
        list: The remaining elements in their original order.
    """
    return [
        text for text in text_list
        if text is not None
        and not (isinstance(text, str) and text.strip() == '')
    ]


def safe_cast_int(text):
try:
Expand Down
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ markers =
demipage: Mark test as a demipage test.
vauvapage: Mark test as a vauvapage test.
suomi24page: Mark test as a suomi24page test.
oikotieapartment: Mark test as an oikotieapartment test.
oikotieapartment: Mark test as an oikotieapartment test.
torideal: Mark test as a torideal test.
8 changes: 7 additions & 1 deletion tests/test_spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
pytestmark = [pytest.mark.spider]

from finscraper.spiders import ILArticle, ISArticle, YLEArticle, VauvaPage, \
OikotieApartment, DemiPage, Suomi24Page
OikotieApartment, DemiPage, Suomi24Page, ToriDeal

from tests.utils import calc_field_emptiness

Expand Down Expand Up @@ -55,6 +55,12 @@
'params': [None],
'n_fields': 80,
'mark': pytest.mark.oikotieapartment
},
{
'class': ToriDeal,
'params': [None],
'n_fields': 9,
'mark': pytest.mark.torideal
}
]

Expand Down

0 comments on commit 45b534f

Please sign in to comment.