diff --git a/docker-compose.test.yml b/docker-compose.test.yml
index 1b1cbc9a..512346e3 100644
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@@ -64,7 +64,7 @@ services:
depends_on:
scrapyd:
condition: service_healthy
- http-server.local:
+ pos-http-server.local:
condition: service_healthy
unit:
@@ -118,22 +118,6 @@ services:
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd
- http-server.local:
- image: nginx:stable-alpine
- volumes:
- - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl
- - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/
- ports:
- - 443:443
- healthcheck:
- timeout: 5s
- interval: 5s
- retries: 5
- test:
- - "CMD-SHELL"
- - "curl https://localhost:443/"
-
functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
@@ -173,6 +157,21 @@ services:
- "CMD-SHELL"
- "curl http://localhost:80/"
+ pos-http-server.local:
+ image: nginx:stable-alpine
+ volumes:
+ - ${PWD}/tests/functional/pos/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
+ - ${PWD}/tests/functional/pos/fixtures/http_server/records:/etc/nginx/html/
+ ports:
+ - 80:80
+ healthcheck:
+ timeout: 5s
+ interval: 5s
+ retries: 5
+ test:
+ - "CMD-SHELL"
+ - "curl http://localhost:80/"
+
rabbitmq:
image: rabbitmq
healthcheck:
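For context, the new fixture service's healthcheck amounts to polling the server until it answers. A minimal sketch of the same logic in Python (the hostname comes from the compose service name; the function name is hypothetical):

```python
import time

import requests


def wait_for_fixture_server(url='http://pos-http-server.local/',
                            retries=5, interval=5, timeout=5):
    # Mirror the compose healthcheck: retry a plain GET a few times,
    # sleeping between attempts, and report whether the server came up.
    for _ in range(retries):
        try:
            if requests.get(url, timeout=timeout).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not up yet
        time.sleep(interval)
    return False
```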
diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py
index 6edb8121..1ce9526f 100644
--- a/hepcrawl/settings.py
+++ b/hepcrawl/settings.py
@@ -33,9 +33,6 @@
# user-agent
USER_AGENT = 'hepcrawl (+http://www.inspirehep.net)'
-# Allow duplicate requests
-DUPEFILTER_CLASS = "scrapy.dupefilters.BaseDupeFilter"
-
# URI base prefix for $schema to be used during record generation
SCHEMA_BASE_URI = os.environ.get(
'APP_SCHEMA_BASE_URI',
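Dropping the `DUPEFILTER_CLASS` override restores Scrapy's default `RFPDupeFilter`, so duplicate requests are filtered again. A spider that still needs to re-fetch a URL can opt out per request; a sketch (the URL is illustrative):

```python
from scrapy import Request

# With the default RFPDupeFilter back in effect, a single request can
# still bypass deduplication explicitly.
request = Request(
    url='https://pos.sissa.it/contribution?id=PoS(LATTICE 2013)001',
    dont_filter=True,  # skip the dupefilter for this request only
)
```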
diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py
index 0e4a5700..02f8da4f 100644
--- a/hepcrawl/spiders/pos_spider.py
+++ b/hepcrawl/spiders/pos_spider.py
@@ -7,7 +7,7 @@
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
-"""Spider for POS."""
+"""Spider for POS OAI-PMH interface."""
from __future__ import absolute_import, division, print_function
@@ -18,7 +18,7 @@
from scrapy import Request, Selector
-from . import StatefulSpider
+from .common.oaipmh_spider import OAIPMHSpider
from ..dateutils import create_valid_date
from ..items import HEPRecord
from ..loaders import HEPLoader
@@ -30,12 +30,11 @@
)
-DEFAULT_BASE_URL = 'https://pos.sissa.it'
DEFAULT_BASE_CONFERENCE_PAPER_URL = (
- DEFAULT_BASE_URL + '/contribution?id='
+ 'https://pos.sissa.it/contribution?id='
)
DEFAULT_BASE_PROCEEDINGS_URL = (
- DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid='
+ 'https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid='
)
@@ -43,7 +42,7 @@ class PoSExtractionException(Exception):
pass
-class POSSpider(StatefulSpider):
+class POSSpider(OAIPMHSpider):
"""POS/Sissa crawler.
From PoS we create two types of records, a conference paper record, and a
@@ -73,356 +72,417 @@ class POSSpider(StatefulSpider):
Example:
::
$ scrapy crawl PoS \\
- -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml"
+ -a "sets=conference:IHEP-LHC" -a "from_date=2012-12-13"
"""
- name = 'pos'
+ name = 'PoS'
+ source = 'PoS'
@strict_kwargs
def __init__(
self,
- source_file=None,
- base_conference_paper_url=DEFAULT_BASE_CONFERENCE_PAPER_URL,
- base_proceedings_url=DEFAULT_BASE_PROCEEDINGS_URL,
+ url='https://pos.sissa.it/oai',
+ format='oai_dc',
+ sets=None,
+ from_date=None,
+ until_date=None,
**kwargs
):
- super(POSSpider, self).__init__(**kwargs)
- self.source_file = source_file
- self.base_conference_paper_url = base_conference_paper_url
- self.base_proceedings_url = base_proceedings_url
-
- def start_requests(self):
- yield Request(self.source_file)
-
- def parse(self, response):
- self.logger.info('Got record from: {response.url}'.format(**vars()))
-
- response.selector.remove_namespaces()
- record_xml_selectors = response.selector.xpath('.//record')
- for record_xml_selector in record_xml_selectors:
- yield self.get_conference_paper_page_request(
- xml_selector=record_xml_selector,
- )
-
- def get_conference_paper_page_request(self, xml_selector, meta=None):
- """Gets the conference paper html page, for the pdf link for the
- conference paper, and later the internal conference id.
- """
- meta = meta or {}
-
- identifier = xml_selector.xpath(
- './/metadata/pex-dc/identifier/text()'
- ).extract_first()
- conference_paper_url = "{0}{1}".format(
- self.base_conference_paper_url,
- identifier,
- )
- meta['xml_record'] = xml_selector.extract()
-
- # the meta parameter will be passed over to the callback as a property
- # in the response parameter
- return Request(
- url=conference_paper_url,
- callback=self.parse_conference_paper,
- meta=meta
+ super(POSSpider, self).__init__(
+ url=url,
+ format=format,
+ sets=sets,
+ from_date=from_date,
+ until_date=until_date,
+ **kwargs
)
- def parse_conference_paper(self, response):
- self.logger.info(
- 'Parsing conference paper from: {response.url}'.format(**vars())
- )
- xml_record = response.meta.get('xml_record')
- conference_paper_url = response.url
- conference_paper_pdf_url = self._get_conference_paper_pdf_url(
- conference_paper_page_html=response.body,
- )
+ def get_record_identifier(self, record):
+ """Extracts a unique identifier from a sickle record."""
+ return record.header.identifier
- parsed_conference_paper = self.build_conference_paper_item(
- xml_record=xml_record,
- conference_paper_url=conference_paper_url,
- conference_paper_pdf_url=conference_paper_pdf_url,
- )
- yield parsed_conference_paper
+ def parse_record(self, selector):
+ """Parse a PoS MARCXML record into a HEP record."""
+ selector.remove_namespaces()
+ conference_paper_page_request = get_conference_paper_page_request(xml_selector=selector)
- # prepare next callback step
- response.meta['html_record'] = response.body
- yield self.get_conference_proceedings_page_request(
- meta=response.meta,
- )
+ return conference_paper_page_request
- def get_conference_proceedings_page_request(self, meta):
- """Gets the conference proceedings page, using the indernal conference
- id from the record html page retrieved before.
- """
- if not meta.get('html_record'):
- raise PoSExtractionException(
- 'PoS conference paper page was empty, current meta:\n%s' % meta
- )
-
- proceedings_page_url = self._get_proceedings_page_url(
- page_html=meta.get('html_record'),
- )
- page_selector = Selector(
- text=meta.get('xml_record'),
- type='xml',
- )
- page_selector.remove_namespaces()
- pos_id = page_selector.xpath(
- ".//metadata/pex-dc/identifier/text()"
- ).extract_first()
- meta['pos_id'] = pos_id
-
- return Request(
- url=proceedings_page_url,
- meta=meta,
- callback=self.parse_conference_proceedings,
- )
+class POSSpiderSingle(OAIPMHSpider):
+ """POS/Sissa crawler.
- def parse_conference_proceedings(self, request):
- parsed_conference_proceedings = self.build_conference_proceedings_item(
- proceedings_page_html=request.body,
- pos_id=request.meta['pos_id'],
- )
- yield parsed_conference_proceedings
+ From PoS we create two types of records, a conference paper record, and a
+ conference proceedings record.
- def _get_proceedings_page_url(self, page_html):
- page_selector = Selector(
- text=page_html,
- type="html"
- )
- internal_url = page_selector.xpath(
- "//a[not(contains(text(),'pdf'))]/@href",
- ).extract_first()
- proceedings_internal_id = internal_url.split('/')[1]
- return '{0}{1}'.format(
- self.base_proceedings_url,
- proceedings_internal_id,
- )
+ The records are harvested directly from the PoS OAI-PMH interface; this
+ spider fetches a single record, given its OAI identifier.
- def build_conference_paper_item(
- self,
- xml_record,
- conference_paper_url,
- conference_paper_pdf_url,
- ):
- selector = Selector(
- text=xml_record,
- type="xml"
- )
- selector.remove_namespaces()
- record = HEPLoader(
- item=HEPRecord(),
- selector=selector
- )
+ For the conference paper record we also have to scrape the html page of
+ the record on the PoS website to get the pdf link (see
+ `DEFAULT_BASE_CONFERENCE_PAPER_URL`).
- license_text = selector.xpath(
- './/metadata/pex-dc/rights/text()'
- ).extract_first()
- record.add_value('license', get_licenses(license_text=license_text))
-
- date, year = self._get_date(selector=selector)
- record.add_value('date_published', date)
- record.add_value('journal_year', year)
-
- identifier = selector.xpath(
- ".//metadata/pex-dc/identifier/text()"
- ).extract_first()
- record.add_value(
- 'journal_title',
- self._get_journal_title(pos_ext_identifier=identifier),
- )
- record.add_value(
- 'journal_volume',
- self._get_journal_volume(pos_ext_identifier=identifier),
- )
- record.add_value(
- 'journal_artid',
- self._get_journal_artid(pos_ext_identifier=identifier),
- )
+ Then, from that same page, we get the internal conference id.
- record.add_xpath('title', '//metadata/pex-dc/title/text()')
- record.add_xpath('source', '//metadata/pex-dc/publisher/text()')
- record.add_value(
- 'external_system_numbers',
- self._get_ext_systems_number(selector=selector),
- )
- record.add_value('language', self._get_language(selector=selector))
- record.add_value('authors', self._get_authors(selector=selector))
- record.add_value('collections', ['conferencepaper'])
- record.add_value('urls', [conference_paper_url])
- record.add_value(
- 'documents',
- self.get_documents(
- path=conference_paper_pdf_url,
- ),
- )
+ With that conference id, we then scrape the conference proceedings page
+ and extract the information to create the proceedings record (see
+ `DEFAULT_BASE_PROCEEDINGS_URL`).
- parsed_item = ParsedItem(
- record=record.load_item(),
- record_format='hepcrawl',
- )
+ To do that, and because each step needs the information of the previous
+ one, the spider uses the callback system provided by Scrapy through the
+ :class:`scrapy.Request` callback parameter, chaining the parser
+ functions.
- return parsed_item
+ The deduplication of the conference proceedings papers is left for the
+ `HepcrawlCrawlOnceMiddleware` middleware.
+
+ Example:
+ ::
+ $ scrapy crawl PoS_single -a "identifier=oai:pos.sissa.it:IHEP-LHC/001"
+ """
+ name = 'PoS_single'
+ source = 'PoS'
- def build_conference_proceedings_item(
+ @strict_kwargs
+ def __init__(
self,
- proceedings_page_html,
- pos_id,
+ url='https://pos.sissa.it/oai',
+ format='oai_dc',
+ identifier=None,
+ **kwargs
):
- selector = Selector(
- text=proceedings_page_html,
- type='html',
+ super(POSSpiderSingle, self).__init__(
+ url=url,
+ format=format,
+ identifier=identifier,
+ **kwargs
)
+
+ def get_record_identifier(self, record):
+ """Extracts a unique identifier from a sickle record."""
+ return record.header.identifier
+
+ def parse_record(self, selector):
+ """Parse a PoS MARCXML record into a HEP record."""
selector.remove_namespaces()
- record = HEPLoader(
- item=HEPRecord(),
- selector=selector
- )
+ conference_paper_page_request = get_conference_paper_page_request(xml_selector=selector)
- record.add_value('collections', ['proceedings'])
- record.add_value(
- 'title',
- self._get_proceedings_title(selector=selector),
- )
- record.add_value(
- 'subtitle',
- self._get_proceedings_date_place(selector=selector),
- )
- record.add_value('journal_title', 'PoS')
- record.add_value(
- 'journal_volume',
- self._get_journal_volume(pos_ext_identifier=pos_id),
+ return conference_paper_page_request
+
+
+def get_conference_paper_page_request(xml_selector, meta=None):
+ """Gets the conference paper html page, for the pdf link for the
+ conference paper, and later the internal conference id.
+ """
+ meta = meta or {}
+
+ identifier = xml_selector.xpath(
+ './/metadata/dc/identifier/text()'
+ ).extract_first()
+ conference_paper_url = "{0}{1}".format(
+ DEFAULT_BASE_CONFERENCE_PAPER_URL,
+ identifier,
+ )
+ meta['xml_record'] = xml_selector.extract()
+
+ # the meta parameter will be passed over to the callback as a property
+ # in the response parameter
+ return Request(
+ url=conference_paper_url,
+ callback=parse_conference_paper,
+ meta=meta
+ )
+
+
+def parse_conference_paper(response):
+ xml_record = response.meta.get('xml_record')
+ conference_paper_url = response.url
+ conference_paper_pdf_url = _get_conference_paper_pdf_url(
+ conference_paper_page_html=response.body,
+ )
+
+ parsed_conference_paper = build_conference_paper_item(
+ xml_record=xml_record,
+ conference_paper_url=conference_paper_url,
+ conference_paper_pdf_url=conference_paper_pdf_url,
+ )
+ yield parsed_conference_paper
+
+ # prepare next callback step
+ response.meta['html_record'] = response.body
+ yield get_conference_proceedings_page_request(
+ meta=response.meta,
+ )
+
+
+def get_conference_proceedings_page_request(meta):
+ """Gets the conference proceedings page, using the indernal conference
+ id from the record html page retrieved before.
+ """
+ if not meta.get('html_record'):
+ raise PoSExtractionException(
+ 'PoS conference paper page was empty, current meta:\n%s' % meta
)
- parsed_proceeding = ParsedItem(
- record=record.load_item(),
- record_format='hepcrawl',
+ proceedings_page_url = _get_proceedings_page_url(
+ page_html=meta.get('html_record'),
+ )
+
+ page_selector = Selector(
+ text=meta.get('xml_record'),
+ type='xml',
+ )
+ page_selector.remove_namespaces()
+ pos_id = page_selector.xpath(
+ ".//metadata/dc/identifier/text()"
+ ).extract_first()
+ meta['pos_id'] = pos_id
+
+ return Request(
+ url=proceedings_page_url,
+ meta=meta,
+ callback=parse_conference_proceedings,
+ )
+
+
+def parse_conference_proceedings(response):
+ parsed_conference_proceedings = build_conference_proceedings_item(
+ proceedings_page_html=response.body,
+ pos_id=response.meta['pos_id'],
+ )
+ yield parsed_conference_proceedings
+
+
+def _get_proceedings_page_url(page_html):
+ page_selector = Selector(
+ text=page_html,
+ type="html"
+ )
+ internal_url = page_selector.xpath(
+ "//a[not(contains(text(),'pdf'))]/@href",
+ ).extract_first()
+ proceedings_internal_id = internal_url.split('/')[1]
+ return '{0}{1}'.format(
+ DEFAULT_BASE_PROCEEDINGS_URL,
+ proceedings_internal_id,
+ )
+
+
+def build_conference_paper_item(
+ xml_record,
+ conference_paper_url,
+ conference_paper_pdf_url,
+):
+ selector = Selector(
+ text=xml_record,
+ type="xml"
+ )
+ selector.remove_namespaces()
+ record = HEPLoader(
+ item=HEPRecord(),
+ selector=selector
+ )
+
+ license_text = selector.xpath(
+ './/metadata/dc/rights/text()'
+ ).extract_first()
+ record.add_value('license', get_licenses(license_text=license_text))
+
+ date, year = _get_date(selector=selector)
+ record.add_value('date_published', date)
+ record.add_value('journal_year', year)
+
+ identifier = selector.xpath(
+ ".//metadata/dc/identifier/text()"
+ ).extract_first()
+ record.add_value(
+ 'journal_title',
+ _get_journal_title(pos_ext_identifier=identifier),
+ )
+ record.add_value(
+ 'journal_volume',
+ _get_journal_volume(pos_ext_identifier=identifier),
+ )
+ record.add_value(
+ 'journal_artid',
+ _get_journal_artid(pos_ext_identifier=identifier),
+ )
+
+ record.add_xpath('title', '//metadata/dc/title/text()')
+ record.add_xpath('source', '//metadata/dc/publisher/text()')
+ record.add_value(
+ 'external_system_numbers',
+ _get_ext_systems_number(selector=selector),
+ )
+ record.add_value('language', _get_language(selector=selector))
+ record.add_value('authors', _get_authors(selector=selector))
+ record.add_value('collections', ['conferencepaper'])
+ record.add_value('urls', [conference_paper_url])
+ record.add_value(
+ 'documents',
+ get_documents(
+ path=conference_paper_pdf_url,
+ ),
+ )
+
+ parsed_item = ParsedItem(
+ record=record.load_item(),
+ record_format='hepcrawl',
+ )
+
+ return parsed_item
+
+
+def build_conference_proceedings_item(
+ proceedings_page_html,
+ pos_id,
+):
+ selector = Selector(
+ text=proceedings_page_html,
+ type='html',
+ )
+ selector.remove_namespaces()
+ record = HEPLoader(
+ item=HEPRecord(),
+ selector=selector
+ )
+
+ record.add_value('collections', ['proceedings'])
+ record.add_value(
+ 'title',
+ _get_proceedings_title(selector=selector),
+ )
+ record.add_value(
+ 'subtitle',
+ _get_proceedings_date_place(selector=selector),
+ )
+ record.add_value('journal_title', 'PoS')
+ record.add_value(
+ 'journal_volume',
+ _get_journal_volume(pos_ext_identifier=pos_id),
+ )
+
+ parsed_proceeding = ParsedItem(
+ record=record.load_item(),
+ record_format='hepcrawl',
+ )
+
+ return parsed_proceeding
+
+
+def _get_conference_paper_pdf_url(conference_paper_page_html):
+ selector = Selector(
+ text=conference_paper_page_html,
+ type='html',
+ )
+ conference_paper_pdf_relative_url = selector.xpath(
+ "//a[contains(text(),'pdf')]/@href",
+ ).extract_first()
+
+ if not conference_paper_pdf_relative_url:
+ raise PoSExtractionException(
+ (
+ 'Unable to get the conference paper pdf url from the html:'
+ '\n%s'
+ ) % conference_paper_page_html
)
- return parsed_proceeding
+ return urljoin(
+ DEFAULT_BASE_CONFERENCE_PAPER_URL,
+ conference_paper_pdf_relative_url,
+ )
- def _get_conference_paper_pdf_url(self, conference_paper_page_html):
- selector = Selector(
- text=conference_paper_page_html,
- type='html',
- )
- conference_paper_pdf_relative_url = selector.xpath(
- "//a[contains(text(),'pdf')]/@href",
- ).extract_first()
-
- if not conference_paper_pdf_relative_url:
- raise PoSExtractionException(
- (
- 'Unable to get the conference paper pdf url from the html:'
- '\n%s'
- ) % conference_paper_page_html
- )
-
- return urljoin(
- self.base_conference_paper_url,
- conference_paper_pdf_relative_url,
- )
- def _get_proceedings_url(self, response):
- internal_url = response.selector.xpath(
- "//a[not(contains(text(),'pdf'))]/@href",
- ).extract_first()
- proceedings_identifier = internal_url.split('/')[1]
- return '{0}{1}'.format(
- self.base_proceedings_url,
- proceedings_identifier,
- )
+def _get_proceedings_url(response):
+ internal_url = response.selector.xpath(
+ "//a[not(contains(text(),'pdf'))]/@href",
+ ).extract_first()
+ proceedings_identifier = internal_url.split('/')[1]
+ return '{0}{1}'.format(
+ DEFAULT_BASE_PROCEEDINGS_URL,
+ proceedings_identifier,
+ )
- @staticmethod
- def get_documents(path):
- return [
- {
- 'key': os.path.basename(path),
- 'url': quote(path, safe=':/'),
- 'original_url': quote(path, safe=':/'),
- 'hidden': True,
- 'fulltext': True,
- },
- ]
- @staticmethod
- def _get_language(selector):
- language = selector.xpath(
- ".//metadata/pex-dc/language/text()"
- ).extract_first()
- return language if language != 'en' else None
+def get_documents(path):
+ return [
+ {
+ 'key': os.path.basename(path),
+ 'url': quote(path, safe=':/'),
+ 'original_url': quote(path, safe=':/'),
+ 'hidden': True,
+ 'fulltext': True,
+ },
+ ]
+
+
+def _get_language(selector):
+ language = selector.xpath(
+ ".//metadata/dc/language/text()"
+ ).extract_first()
+ return language if language != 'en' else None
+
+
+def _get_journal_title(pos_ext_identifier):
+ return re.split('[()]', pos_ext_identifier)[0]
+
+
+def _get_journal_volume(pos_ext_identifier):
+ return re.split('[()]', pos_ext_identifier)[1]
+
- @staticmethod
- def _get_journal_title(pos_ext_identifier):
- return re.split('[()]', pos_ext_identifier)[0]
+def _get_journal_artid(pos_ext_identifier):
+ return re.split('[()]', pos_ext_identifier)[2]
- @staticmethod
- def _get_journal_volume(pos_ext_identifier):
- return re.split('[()]', pos_ext_identifier)[1]
- @staticmethod
- def _get_journal_artid(pos_ext_identifier):
- return re.split('[()]', pos_ext_identifier)[2]
+def _get_ext_systems_number(selector):
+ return [
+ {
+ 'institute': 'pos',
+ 'value': selector.xpath(
+ './/identifier/text()'
+ ).extract_first()
+ },
+ ]
- @staticmethod
- def _get_ext_systems_number(selector):
- return [
+
+def _get_date(selector):
+ full_date = selector.xpath(
+ ".//metadata/dc/date/text()"
+ ).extract_first()
+ date = create_valid_date(full_date)
+ year = int(date[0:4])
+
+ return date, year
+
+
+def _get_authors(selector):
+ """Get article authors."""
+ authors = []
+ creators = selector.xpath('.//metadata/dc/creator')
+ for creator in creators:
+ auth_dict = {}
+ author = Selector(text=creator.extract())
+ auth_dict['raw_name'] = get_first(
+ author.xpath('.//text()').extract(),
+ default='',
+ )
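+ # oai_dc creators carry no affiliation information, so an empty
+ # placeholder affiliation is added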
+ auth_dict['affiliations'] = [
{
- 'institute': 'pos',
- 'value': selector.xpath(
- './/identifier/text()'
- ).extract_first()
+ 'value': ''
},
]
+ if auth_dict:
+ authors.append(auth_dict)
+ return authors
+
+
+def _get_proceedings_title(selector):
+ return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first()
+
+
+def _get_proceedings_date_place(selector):
+ date_place = selector.xpath(
+ "//div[@class='conference_date']/text()"
+ ).extract()
+ return ''.join(date_place)
- @staticmethod
- def _get_date(selector):
- full_date = selector.xpath(
- ".//metadata/pex-dc/date/text()"
- ).extract_first()
- date = create_valid_date(full_date)
- year = int(date[0:4])
-
- return date, year
-
- @staticmethod
- def _get_authors(selector):
- """Get article authors."""
- authors = []
- creators = selector.xpath('.//metadata/pex-dc/creator')
- for creator in creators:
- auth_dict = {}
- author = Selector(text=creator.extract())
- auth_dict['raw_name'] = get_first(
- author.xpath('.//name//text()').extract(),
- default='',
- )
- for affiliation in author.xpath(
- './/affiliation//text()'
- ).extract():
- if 'affiliations' in auth_dict:
- auth_dict['affiliations'].append(
- {
- 'value': affiliation
- }
- )
- else:
- auth_dict['affiliations'] = [
- {
- 'value': affiliation
- },
- ]
- if auth_dict:
- authors.append(auth_dict)
- return authors
-
- @staticmethod
- def _get_proceedings_title(selector):
- return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first()
-
- @staticmethod
- def _get_proceedings_date_place(selector):
- date_place = selector.xpath(
- "//div[@class='conference_date']/text()"
- ).extract()
- return ''.join(date_place)
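The harvesting itself now happens in the shared `OAIPMHSpider`; given that `get_record_identifier` documents sickle records, the underlying harvest presumably reduces to something like this sketch (endpoint, set, and date taken from the spider defaults and the docstring example):

```python
from sickle import Sickle

client = Sickle('https://pos.sissa.it/oai')
# 'from' is a reserved word in Python, so sickle receives it via
# keyword expansion.
records = client.ListRecords(
    metadataPrefix='oai_dc',
    set='conference:IHEP-LHC',
    **{'from': '2012-12-13'}
)
for record in records:
    print(record.header.identifier)  # e.g. oai:pos.sissa.it:IHEP-LHC/001
```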
diff --git a/tests/functional/pos/fixtures/http_server/conf/proxy.conf b/tests/functional/pos/fixtures/http_server/conf/proxy.conf
new file mode 100644
index 00000000..afc6162c
--- /dev/null
+++ b/tests/functional/pos/fixtures/http_server/conf/proxy.conf
@@ -0,0 +1,15 @@
+server {
+ listen 80;
+ server_name localhost;
+ charset_types text/xml;
+ charset UTF-8;
+
+ location /oai {
+ if ($args ~ from=2012-02-02&verb=ListRecords&set=conference%3AIHEP-LHC&metadataPrefix=oai_dc) {
+ rewrite ^.*$ /pos-conference-ihep.xml permanent;
+ }
+ if ($args ~ verb=GetRecord&metadataPrefix=oai_dc&identifier=oai%3Apos.sissa.it%3AIHEP-LHC%2F005) {
+ rewrite ^.*$ /pos-single.xml permanent;
+ }
+ }
+}
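The fixture matches exact query strings and answers with a 301 redirect to the canned XML, so a functional test only has to issue the matching request. A sketch (hostname assumed from the compose service name; the asserted title comes from the fixture record):

```python
import requests

# Parameter order matters: the nginx regex above matches the literal
# query string, and requests preserves dict insertion order.
response = requests.get(
    'http://pos-http-server.local/oai',
    params={
        'from': '2012-02-02',
        'verb': 'ListRecords',
        'set': 'conference:IHEP-LHC',
        'metadataPrefix': 'oai_dc',
    },
)
assert 'Heavy Flavour Physics Review' in response.text
```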
diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/http_server/records/187.html
similarity index 100%
rename from tests/functional/pos/fixtures/https_server/records/187.html
rename to tests/functional/pos/fixtures/http_server/records/187.html
diff --git a/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml b/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml
new file mode 100644
index 00000000..46ceaff9
--- /dev/null
+++ b/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml
@@ -0,0 +1,54 @@
+
+
+Heavy Flavour Physics Review
+A. El-Khadra
+in 31st International Symposium on Lattice Field Theory LATTICE 2013
+Contribution: pdf
+