feat: Adds request_kwargs argument to Extractor

jmriebold committed Feb 22, 2022
1 parent 018f95a commit 6074cfe

Showing 3 changed files with 62 additions and 25 deletions.
2 changes: 1 addition & 1 deletion boilerpy3/__init__.py

@@ -7,4 +7,4 @@
 
 __all__ = ('document', 'extractors', 'filters', 'parser')
 
-__version__ = '1.0.5'
+__version__ = '1.0.6'
64 changes: 40 additions & 24 deletions boilerpy3/extractors.py
@@ -8,6 +8,7 @@
 import urllib.error
 import urllib.parse
 import urllib.request
+from copy import deepcopy
 from logging import getLogger
 from typing import Union
 
@@ -28,31 +29,35 @@ class Extractor:
 
     SCRIPT_REGEX = re.compile(r'<(?:script|SCRIPT)[^>]*>.*?</(?:script|SCRIPT)>', re.DOTALL)
 
-    def __init__(self, filtr: BoilerpipeFilter, raise_on_failure: bool = True) -> None:
+    def __init__(self, filtr: BoilerpipeFilter, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param filtr: filter
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
         self.filter = filtr
         self.raise_on_failure = raise_on_failure
+        if request_kwargs is None:
+            request_kwargs = {}
+        self._request_kwargs = request_kwargs
 
     def get_content(self, text: str) -> str:
         return self.get_doc(text).content
 
-    def get_content_from_url(self, url: str) -> str:
-        return self.get_doc_from_url(url).content
+    def get_content_from_url(self, url: str, request_kwargs: dict = None) -> str:
+        return self.get_doc_from_url(url, request_kwargs).content
 
     def get_content_from_file(self, filename: str) -> str:
         return self.get_doc_from_file(filename).content
 
     def get_doc_from_file(self, filename: str) -> TextDocument:
         return self.get_doc(self.read_from_file(filename))
 
-    def get_doc_from_url(self, url: str) -> TextDocument:
-        return self.get_doc(self.read_from_url(url))
+    def get_doc_from_url(self, url: str, request_kwargs: dict = None) -> TextDocument:
+        return self.get_doc(self.read_from_url(url, request_kwargs))
 
     def get_doc(self, text: str) -> TextDocument:
         doc = self.parse_doc(text)
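The request_kwargs: dict = None default, resolved to a fresh dict inside __init__, sidesteps Python's shared-mutable-default pitfall: a {} literal in the signature would be created once at definition time and shared by every Extractor instance. A minimal illustration of the hazard this pattern avoids (function names here are illustrative, not from the commit):

    def bad(config: dict = {}):
        # The same dict object is reused on every call, so mutations leak
        # across calls (and, for a class, across instances).
        config['calls'] = config.get('calls', 0) + 1
        return config

    def good(config: dict = None):
        # A fresh dict per call, mirroring the pattern in Extractor.__init__.
        if config is None:
            config = {}
        config['calls'] = config.get('calls', 0) + 1
        return config

    assert bad() is bad()        # same shared object every call
    assert good() is not good()  # a new object each call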
@@ -63,21 +68,25 @@ def get_marked_html(self, text: str) -> str:
         doc = self.get_doc(text)
         marker = HTMLBoilerpipeMarker(raise_on_failure=self.raise_on_failure)
         return marker.process(doc, text)
 
     def get_marked_html_from_url(self, url: str) -> str:
         text = self.read_from_url(url)
         return self.get_marked_html(text)
 
     def get_marked_html_from_file(self, filename: str) -> str:
         text = self.read_from_file(filename)
         return self.get_marked_html(text)
 
     def read_from_file(self, filename: str) -> str:
         with open(filename) as text_file:
             return text_file.read()
 
-    def read_from_url(self, url: str) -> str:
-        with urllib.request.urlopen(url) as url_obj:
+    def read_from_url(self, url: str, request_kwargs: dict = None) -> str:
+        all_request_kwargs = deepcopy(self._request_kwargs)
+        if request_kwargs is not None:
+            all_request_kwargs.update(request_kwargs)
+
+        with urllib.request.urlopen(url, **all_request_kwargs) as url_obj:
             text = url_obj.read()
             encoding = self.get_url_encoding(url_obj)

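The merge above gives per-call kwargs precedence over constructor-level defaults: the stored dict is deep-copied, the call-site dict is applied with dict.update() over the copy, and the result is splatted into urllib.request.urlopen. Note that get_marked_html_from_url still calls read_from_url without a per-call dict, so it only picks up constructor-level defaults. A minimal sketch of the resulting behavior (URL and values are illustrative):

    from boilerpy3.extractors import ArticleExtractor

    # Constructor-level default: every request from this extractor
    # times out after 30 seconds.
    extractor = ArticleExtractor(request_kwargs={'timeout': 30})

    # Per-call override: update() replaces the stored timeout for this
    # request only; the deepcopy keeps the stored default at 30 seconds.
    content = extractor.get_content_from_url('https://example.com', {'timeout': 5})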
@@ -124,14 +133,15 @@ class DefaultExtractor(Extractor):
         filters.DensityRulesClassifier()
     ])
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter_chain, raise_on_failure)
+        super().__init__(self._filter_chain, raise_on_failure, request_kwargs)
 
 
 class ArticleExtractor(Extractor):
@@ -152,14 +162,15 @@ class ArticleExtractor(Extractor):
         filters.ExpandTitleToContentFilter()
     ])
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter_chain, raise_on_failure)
+        super().__init__(self._filter_chain, raise_on_failure, request_kwargs)
 
 
 class LargestContentExtractor(Extractor):
@@ -175,14 +186,15 @@
         filters.KeepLargestBlockFilter()
     ])
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter_chain, raise_on_failure)
+        super().__init__(self._filter_chain, raise_on_failure, request_kwargs)
 
 
 class CanolaExtractor(Extractor):
@@ -192,14 +204,15 @@
 
     _filter = filters.CanolaFilter()
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter, raise_on_failure)
+        super().__init__(self._filter, raise_on_failure, request_kwargs)
 
 
 class KeepEverythingExtractor(Extractor):
@@ -210,14 +223,15 @@
 
     _filter = filters.MarkEverythingContentFilter()
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter, raise_on_failure)
+        super().__init__(self._filter, raise_on_failure, request_kwargs)
 
 
 class NumWordsRulesExtractor(Extractor):
@@ -228,14 +242,15 @@
 
     _filter = filters.NumWordsRulesClassifier()
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
         """
 
-        super().__init__(self._filter, raise_on_failure)
+        super().__init__(self._filter, raise_on_failure, request_kwargs)
 
 
 class ArticleSentencesExtractor(Extractor):
@@ -249,14 +264,15 @@
         filters.MinClauseWordsFilter()
     ])
 
-    def __init__(self, raise_on_failure: bool = True) -> None:
+    def __init__(self, raise_on_failure: bool = True, request_kwargs: dict = None) -> None:
         """
         Initialize extractor
         :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
+        :param request_kwargs: kwargs to pass to urllib.request
        """
 
-        super().__init__(self._filter_chain, raise_on_failure)
+        super().__init__(self._filter_chain, raise_on_failure, request_kwargs)
 
 
 class KeepEverythingWithMinKWordsFilter(filters.FilterChain):
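Taken together, these changes let any urllib.request.urlopen keyword argument (timeout, context, cafile, data, and so on) flow through every extractor subclass. A sketch of a constructor-level use, assuming a host whose TLS certificate should be checked against a custom CA bundle (the bundle path is illustrative):

    import ssl

    from boilerpy3.extractors import DefaultExtractor

    # 'timeout' and 'context' are standard urllib.request.urlopen keywords;
    # here every fetch verifies TLS against a custom CA bundle.
    ctx = ssl.create_default_context(cafile='/etc/ssl/custom-ca.pem')
    extractor = DefaultExtractor(request_kwargs={'timeout': 10, 'context': ctx})

    doc = extractor.get_doc_from_url('https://intranet.example.com/article')
    print(doc.title, doc.content[:200])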
21 changes: 21 additions & 0 deletions tests/test_extractors.py

@@ -1,4 +1,5 @@
 import os
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -203,3 +204,23 @@ def test_extraction():
 
     article_extractor.raise_on_failure = False
     assert article_extractor.get_content(html)
+
+
+@patch('urllib.request.urlopen')
+def test_request_kwargs(mock_urlopen):
+    extractor = Extractor(None)  # noqa
+    assert extractor._request_kwargs == {}
+
+    request_kwargs = {'timeout': 30}
+    extractor = Extractor(None, request_kwargs=request_kwargs)  # noqa
+    assert extractor._request_kwargs == request_kwargs
+
+    mock_request = MagicMock()
+    mock_request.getcode.return_value = 200
+    mock_request.read.return_value = b'<html><body><h1>Example</h1></body></html>'
+    mock_request.headers = {'content-type': 'charset=utf-8'}
+    mock_request.__enter__.return_value = mock_request
+    mock_urlopen.return_value = mock_request
+    extractor = ArticleExtractor(request_kwargs=request_kwargs)
+    extractor.get_content_from_url('example.com', {'foo': 'bar'})
+    mock_urlopen.assert_called_with('example.com', timeout=30, foo='bar')
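The test pins down both halves of the new contract: the stored default (an empty dict when nothing is passed) and the merged urlopen call (timeout=30 from the constructor plus foo='bar' from the call site). A natural follow-up assertion, not part of this commit, would check that a call-site value wins over a constructor default:

    # Hypothetical extension: the per-call timeout should shadow the
    # constructor-level one after the update() merge.
    extractor.get_content_from_url('example.com', {'timeout': 5})
    mock_urlopen.assert_called_with('example.com', timeout=5)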
