
Commit

Unify XPath NS config over modules
TobiX committed Mar 17, 2024
1 parent 2e912bc commit 23125c7
Showing 22 changed files with 81 additions and 86 deletions.
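This commit routes every plugin's XPath lookup through a single self.match(...) helper on the scraper base classes, so the namespace map behind the d: prefix (used by expressions like d:class(...)) is configured in one place instead of each module importing NS and calling page.xpath(..., namespaces=NS) itself. A minimal sketch of that arrangement follows; the match/NS names mirror the diff, but the function body, the namespace URI and the d:class() registration shown here are assumptions for illustration, not the actual dosagelib implementation.

# Illustrative sketch only -- not the real dosagelib code.
from lxml import etree, html

XPATH_NS_URI = 'urn:example:dosage-xpath'   # placeholder URI
NS = {'d': XPATH_NS_URI}

# Register the d:class() convenience function once, for all modules.
_fns = etree.FunctionNamespace(XPATH_NS_URI)


def _class(context, klass):
    """True if the context node carries the given CSS class token."""
    return klass in context.context_node.get('class', '').split()


_fns['class'] = _class


class ParserScraper:
    """Minimal stand-in for dosagelib.scraper.ParserScraper."""

    def match(self, data, xpath):
        # The only place where the XPath namespace config lives.
        return data.xpath(xpath, namespaces=NS)


doc = html.fromstring('<div class="strip"><img src="a.png"/></div>')
print(ParserScraper().match(doc, '//div[d:class("strip")]/img/@src'))  # ['a.png']

With a helper like that in place, the per-module changes below are mechanical: page.xpath(expr) becomes self.match(page, expr), explicit namespaces=NS arguments and "from ..xml import NS" lines disappear, and several classes now subclass ParserScraper instead of _ParserScraper.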
12 changes: 6 additions & 6 deletions dosagelib/plugins/a.py
@@ -228,7 +228,7 @@ class Amya(WordPressScraper):
url = 'http://www.amyachronicles.com/'


class Angband(_ParserScraper):
class Angband(ParserScraper):
url = 'http://angband.calamarain.net/'
stripUrl = url + '%s'
imageSearch = '//img'
@@ -237,7 +237,7 @@ class Angband(_ParserScraper):

def starter(self):
page = self.getPage(self.url)
self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href')
self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href')
self.firstStripUrl = self.pages[0]
return self.pages[-1]

@@ -267,7 +267,7 @@ def link_modifier(self, fromurl, tourl):
return tourl


class AntiheroForHire(_ParserScraper):
class AntiheroForHire(ParserScraper):
stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s'
firstStripUrl = stripUrl % '2016/6/8/entrance-vigil'
url = firstStripUrl
@@ -278,7 +278,7 @@ class AntiheroForHire(_ParserScraper):
def starter(self):
# Build list of chapters for navigation
page = self.getPage(self.url)
self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href')
self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href')
return self.chapters[0]

def getPrevUrl(self, url, data):
@@ -314,7 +314,7 @@ class ArtificialIncident(WordPressWebcomic):
firstStripUrl = stripUrl % 'issue-one-life-changing'


class AstronomyPOTD(_ParserScraper):
class AstronomyPOTD(ParserScraper):
baseUrl = 'http://apod.nasa.gov/apod/'
url = baseUrl + 'astropix.html'
starter = bounceStarter
@@ -328,7 +328,7 @@ class AstronomyPOTD(_ParserScraper):

def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return data.xpath('//iframe') # videos
return self.match(data, '//iframe') # videos

def namer(self, image_url, page_url):
return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],
10 changes: 5 additions & 5 deletions dosagelib/plugins/c.py
@@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper):
help = 'Index format: yyyy/mm/dd/nnn-stripname'


class CarryOn(_ParserScraper):
class CarryOn(ParserScraper):
url = 'http://www.hirezfox.com/km/co/'
stripUrl = url + 'd/%s.html'
firstStripUrl = stripUrl % '20040701'
imageSearch = '//div[@class="strip"]/img'
imageSearch = '//div[d:class("strip")]/img'
prevSearch = '//a[text()="Previous Day"]'
multipleImagesPerStrip = True

@@ -122,21 +122,21 @@ class CatAndGirl(_ParserScraper):
prevSearch = '//a[d:class("pager--prev")]'


class CatenaManor(_ParserScraper):
class CatenaManor(ParserScraper):
baseUrl = ('https://web.archive.org/web/20141027141116/'
'http://catenamanor.com/')
url = baseUrl + 'archives'
stripUrl = baseUrl + '%s/'
firstStripUrl = stripUrl % '2003/07'
imageSearch = '//img[@class="comicthumbnail"]'
imageSearch = '//img[d:class("comicthumbnail")]'
multipleImagesPerStrip = True
endOfLife = True
strips: List[str] = []

def starter(self):
# Retrieve archive links and select valid range
archivePage = self.getPage(self.url)
archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a')
archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a')
valid = False
for link in archiveStrips:
if self.stripUrl % '2012/01' in link.get('href'):
14 changes: 7 additions & 7 deletions dosagelib/plugins/comicfury.py
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
import os

from ..scraper import ParserScraper
@@ -79,7 +79,7 @@ def namer(self, imageUrl, pageUrl):
num = parts[-1]
if self.multipleImagesPerStrip:
page = self.getPage(pageUrl)
images = page.xpath('//img[@class="comicsegmentimage"]/@src')
images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')
if len(images) > 1:
imageIndex = images.index(imageUrl) + 1
return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext)
@@ -88,8 +88,8 @@ def namer(self, imageUrl, pageUrl):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
# Videos on Underverse
return (data.xpath('//div[@id="comicimagewrap"]//video') and
not data.xpath('//div[@id="comicimagewrap"]//img'))
return (self.match(data, '//div[@id="comicimagewrap"]//video') and
not self.match(data, '//div[@id="comicimagewrap"]//img'))

@classmethod
def getmodules(cls): # noqa: CFQ001
15 changes: 5 additions & 10 deletions dosagelib/plugins/d.py
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape

from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper):
help = 'Index format: n'


class DresdenCodak(_ParserScraper):
class DresdenCodak(ParserScraper):
url = 'http://dresdencodak.com/'
startUrl = url + 'cat/comic/'
firstStripUrl = url + '2007/02/08/pom/'
imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
prevSearch = '//a[img[contains(@src, "prev")]]'
latestSearch = '//a[d:class("tc-grid-bg-link")]'
starter = indirectStarter

# Blog and comic are mixed...
def shouldSkipUrl(self, url, data):
return not data.xpath(self.imageSearch)


class DrFun(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20180726145737/'
6 changes: 3 additions & 3 deletions dosagelib/plugins/derideal.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper
from ..helpers import indirectStarter

@@ -27,7 +27,7 @@ def __init__(self, name, sub, first, last=None):

def starter(self):
indexPage = self.getPage(self.url)
self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href')
self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href')
self.currentChapter = len(self.chapters)
return indirectStarter(self)

4 changes: 2 additions & 2 deletions dosagelib/plugins/e.py
@@ -113,7 +113,7 @@ class Erfworld(ParserScraper):

def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return not data.xpath(self.imageSearch)
return not self.match(data, self.imageSearch)

def namer(self, imageUrl, pageUrl):
# Fix inconsistent filenames
@@ -232,7 +232,7 @@ def namer(self, image_url, page_url):
return '_'.join((pagepart, imagename))

def shouldSkipUrl(self, url, data):
return data.xpath('//div[@id="comic"]//iframe')
return self.match(data, '//div[@id="comic"]//iframe')


class ExtraLife(_BasicScraper):
4 changes: 2 additions & 2 deletions dosagelib/plugins/f.py
@@ -140,7 +140,7 @@ class FoxDad(ParserScraper):

def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
post = page.xpath('//li[@class="timestamp"]/a/@href')[0]
post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0]
post = post.replace('https://foxdad.com/post/', '')
if '-consider-support' in post:
post = post.split('-consider-support')[0]
@@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper):

def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1]


2 changes: 1 addition & 1 deletion dosagelib/plugins/gocomics.py
@@ -31,7 +31,7 @@ def getIndexStripUrl(self, index):

def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return data.xpath('//img[contains(@src, "content-error-missing")]')
return self.match(data, '//img[contains(@src, "content-error-missing")]')

@classmethod
def getmodules(cls): # noqa: CFQ001
6 changes: 3 additions & 3 deletions dosagelib/plugins/kemonocafe.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper


@@ -44,7 +44,7 @@ def namer(self, imageUrl, pageUrl):
# Fix unordered filenames
if 'addictivescience' in pageUrl:
page = self.getPage(pageUrl)
num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
filename = '%04d_%s' % (num, filename)
elif 'CaughtInOrbit' in filename:
filename = filename.replace('CaughtInOrbit', 'CIO')
2 changes: 1 addition & 1 deletion dosagelib/plugins/l.py
@@ -38,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulators):

def getPrevUrl(self, url, data):
# Fix broken navigation links
if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
return self.stripUrl % 'summer21'
return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data)

3 changes: 1 addition & 2 deletions dosagelib/plugins/m.py
@@ -9,7 +9,6 @@
from ..helpers import indirectStarter
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
from ..util import tagre
from ..xml import NS
from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic


@@ -153,7 +152,7 @@ class MonkeyUser(ParserScraper):

def shouldSkipUrl(self, url, data):
# videos
return data.xpath('//div[d:class("video-container")]', namespaces=NS)
return self.match(data, '//div[d:class("video-container")]')


class MonsieurLeChien(ParserScraper):
2 changes: 1 addition & 1 deletion dosagelib/plugins/p.py
@@ -166,7 +166,7 @@ def shouldSkipUrl(self, url, data):
# video
self.stripUrl % '1880',
self.stripUrl % '1669',
) or data.xpath('//img[@id="comic" and contains(@src, "phd083123s")]')
) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]')


class Picklewhistle(ComicControlScraper):
14 changes: 7 additions & 7 deletions dosagelib/plugins/r.py
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile
from urllib.parse import urljoin

@@ -121,7 +121,7 @@ class Requiem(WordPressScraper):
firstStripUrl = stripUrl % '2004-06-07-3'


class Replay(_ParserScraper):
class Replay(ParserScraper):
url = 'http://replaycomic.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'red-desert'
@@ -132,11 +132,11 @@ class Replay(_ParserScraper):
def starter(self):
# Retrieve archive page to identify chapters
archivePage = self.getPage(self.url + 'archive')
archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]')
archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]')
self.chapter = len(archive) - 1
self.startOfChapter = []
for archiveChapter in archive:
self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href'))
self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href'))
return bounceStarter(self)

def namer(self, imageUrl, pageUrl):
8 changes: 4 additions & 4 deletions dosagelib/plugins/s.py
@@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper):
def extract_image_urls(self, url, data):
# Website requires JS, so build the list of image URLs manually
imageurls = []
current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value'))
for page in reversed(range(1, current + 1)):
imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
return imageurls
@@ -636,16 +636,16 @@ def shouldSkipUrl(self, url, data):
)


class StupidFox(_ParserScraper):
class StupidFox(ParserScraper):
url = 'http://stupidfox.net/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % 'hello'
imageSearch = '//div[@class="comicmid"]//img'
imageSearch = '//div[d:class("comicmid")]//img'
prevSearch = '//a[@accesskey="p"]'

def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
return title + '.' + imageUrl.rsplit('.', 1)[-1]


2 changes: 1 addition & 1 deletion dosagelib/plugins/shivaestudios.py
@@ -19,7 +19,7 @@ class AlienDice(WordPressSpliced):

def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return not data.xpath(self.imageSearch)
return not self.match(data, self.imageSearch)

def getPrevUrl(self, url, data):
# Fix broken navigation
5 changes: 2 additions & 3 deletions dosagelib/plugins/tapas.py
@@ -3,7 +3,6 @@
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..output import out
from ..scraper import ParserScraper
from ..xml import NS


class Tapas(ParserScraper):
@@ -21,7 +20,7 @@ def __init__(self, name, url):
def starter(self):
# Retrieve comic metadata from info page
info = self.getPage(self.url)
series = info.xpath('//@data-series-id')[0]
series = self.match(info, '//@data-series-id')[0]
# Retrieve comic metadata from API
data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST')
data.raise_for_status()
@@ -43,7 +42,7 @@ def extract_image_urls(self, url, data):
return self._cached_image_urls

def shouldSkipUrl(self, url, data):
if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
if self.match(data, '//button[d:class("js-have-to-sign")]'):
out.warn(f'Nothing to download on "{url}", because a login is required.')
return True
return False
2 changes: 1 addition & 1 deletion dosagelib/plugins/u.py
@@ -107,7 +107,7 @@ def extract_image_urls(self, url, data):
return urls

def extract_css_bg(self, page) -> str | None:
comicdivs = page.xpath('//div[@id="comic"]')
comicdivs = self.match(page, '//div[@id="comic"]')
if comicdivs:
style = comicdivs[0].attrib.get('style')
if style:
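For a plugin author, the unified surface after this commit is just ParserScraper, d:class(...) in search expressions, and self.match(...) for ad-hoc queries. A hypothetical module written against it could look like the sketch below; the class name, URL and expressions are invented, and only the API shape is taken from the diff.

from ..scraper import ParserScraper


class ExampleComic(ParserScraper):
    # Hypothetical plugin for illustration only.
    url = 'https://example.com/comic/'
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % 'page-1'
    imageSearch = '//div[d:class("comic-image")]//img'
    prevSearch = '//a[d:class("nav-previous")]'

    def shouldSkipUrl(self, url, data):
        """Skip pages that embed a video instead of an image."""
        return self.match(data, '//div[d:class("comic-image")]//video')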