Skip to content

Commit

Permalink
added many new scrapers
Browse files Browse the repository at this point in the history
  • Loading branch information
petermr committed Jun 6, 2016
1 parent f09f571 commit 1a1ab44
Show file tree
Hide file tree
Showing 21 changed files with 31,551 additions and 0 deletions.
48 changes: 48 additions & 0 deletions scrapers/brill.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"url": "brillonline\\.com",
"headless": true,
"elements": {
"publisher": {
"selector": "//meta[@name='citation_publisher']",
"attribute": "content"
},
"title": {
"selector": "//meta[@name='citation_title']",
"attribute": "content"
},
"authors": {
"selector": "//meta[@name='citation_authors']",
"attribute": "content"
},
"date": {
"selector": "//meta[@name='citation_date']",
"attribute": "content"
},
"doi": {
"selector": "//meta[@name='citation_doi']",
"attribute": "content"
},
"issn": {
"selector": "//meta[@name='citation_issn']",
"attribute": "content"
},
"fulltext_pdf": {
"selector": "//meta[@name='citation_pdf_url']",
"attribute": "content",
"download": {
"rename": "fulltext.pdf"
}
},
"fulltext_html": {
"selector": "//meta[@name='citation_fulltext_html_url']",
"attribute": "content",
"download": {
"rename": "fulltext.html"
}
}
}
}
315 changes: 315 additions & 0 deletions scrapers/dois.txt

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions scrapers/emerald.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{
"url": "emeraldinsight\\.com",
"headless": true,
"elements": {
"publisher": {
"selector": "//meta[@name='dc.Publisher']",
"attribute": "content"
},
"title": {
"selector": "//meta[@name='dc.Title']",
"attribute": "content"
},
"authors": {
"selector": "//meta[@name='dc.Creator']",
"attribute": "content"
},
"date": {
"selector": "//div[@id='pubDate']",
"attribute": "text"
},
"volume": {
"selector": "//span[@class='citation_volume']",
"attribute": "text"
},
"doi": {
"selector": "//meta[@scheme='doi']",
"attribute": "content"
},
"description": {
"selector": "//meta[@name='dc.Description']",
"attribute": "content"
},
"journal": {
"selector": "//meta[@name='citation_journal_title']",
"attribute": "content"
},
"abstract": {
"selector": "//p[@class='articleBody_abstractText']",
"attribute": "text"
},
"abstract2": {
"selector": "//a[@title='View the Abstract']",
"attribute": "href",
"download": true
},
"language": {
"selector": "//meta[@name='dc.Language']",
"attribute": "content"
},
"fulltext_html": {
"selector": "//a[@title='View the Full Text HTML']",
"attribute": "href",
"download": {
"rename": "fulltext.html"
}
},
"fulltext_pdf": {
"selector": "//a[@title='Download the PDF Full Text']",
"attribute": "href",
"download": {
"rename": "fulltext.pdf"
}
},
"supplementary_material": {
"selector": "//a[@title='View Supporting Information']",
"attribute": "href",
"download": true
},
"figure": {
"selector": "//img[@alt='Abstract Image']",
"attribute": "src",
"download": true
},
"copyright": {
"selector": "//div[contains(@id, 'artCopyright')]",
"attribute": "text"
}
}
}
48 changes: 48 additions & 0 deletions scrapers/fpsych.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"url": "frontiersin\\.org",
"headless": true,
"elements": {
"publisher": {
"selector": "//meta[@name='citation_publisher']",
"attribute": "content"
},
"title": {
"selector": "//meta[@name='citation_title']",
"attribute": "content"
},
"authors": {
"selector": "//meta[@name='citation_authors']",
"attribute": "content"
},
"date": {
"selector": "//meta[@name='citation_date']",
"attribute": "content"
},
"doi": {
"selector": "//meta[@name='citation_doi']",
"attribute": "content"
},
"issn": {
"selector": "//meta[@name='citation_issn']",
"attribute": "content"
},
"fulltext_pdf": {
"selector": "//meta[@name='citation_pdf_url']",
"attribute": "content",
"download": {
"rename": "fulltext.pdf"
}
},
"fulltext_html": {
"selector": "//meta[@name='citation_fulltext_html_url']",
"attribute": "content",
"download": {
"rename": "fulltext.html"
}
}
}
}
44 changes: 44 additions & 0 deletions scrapers/humkin.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"url": "humankinetics\\.com",
"headless": true,
"elements": {
"publisher": {
"selector": "//meta[@name='DC.Publisher']",
"attribute": "content"
},
"title": {
"selector": "//meta[@name='DC.Title']",
"attribute": "content"
},
"authors": {
"selector": "//meta[@name='citation_author']",
"attribute": "content"
},
"date": {
"selector": "//meta[@name='DC.Date']",
"attribute": "content"
},
"doi": {
"selector": "//meta[@name='DC.Identifier']",
"attribute": "content"
},
"issn": {
"selector": "//meta[@name='citation_issn']",
"attribute": "content"
},
"fulltext_html": {
"selector": "//meta[@name='citation_fulltext_html_url']",
"attribute": "content",
"download": {
"rename": "fulltext.html"
}
},
"fulltext_pdf": {
"selector": "//meta[@name='citation_pdf_url']",
"attribute": "content",
"download": {
"rename": "fulltext.pdf"
}
}
}
}
Loading

0 comments on commit 1a1ab44

Please sign in to comment.