[MRG] Wikicorpus custom filters (#2089)
* Modified the call to the user-provided filter_articles so that all args are passed in as kwargs.

* Small readability improvements.

* Added a catch for pickling errors around self.filter_articles, and changed the default value check to compare against None instead of callable().

* Fixed python 2 backwards compatibility issue with exception chaining.

* Added an import for PicklingError.

* Fixed linting errors.

* Added an example filter function.

* Added tests for wikicorpus filter_articles.

* Passing tests for wikicorpus filter_articles.

* Fixed linting error.

* Fixed linting errors.

* PEP8 hanging indent.
mattilyra authored and piskvorky committed Jul 13, 2018
1 parent 408a714 commit 46ccefb
Showing 2 changed files with 100 additions and 5 deletions.
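
The new hook is used by passing a picklable callable as filter_articles when constructing WikiCorpus. A minimal usage sketch (illustrative only, reusing the filter_example helper added in this commit and the German dump path from its docstring; the .bz2 file must exist locally):

from gensim.corpora.wikicorpus import WikiCorpus, filter_example

# Articles for which the callable returns a truthy value are kept; the rest are
# dropped before tokenisation.
wiki = WikiCorpus(
    './dewiki-20180520-pages-articles-multistream.xml.bz2',
    filter_articles=filter_example,
)
for tokens in wiki.get_texts():
    pass  # token list of one article that passed the filter
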
90 changes: 85 additions & 5 deletions gensim/corpora/wikicorpus.py
@@ -24,6 +24,7 @@
import multiprocessing
import re
import signal
from pickle import PicklingError
from xml.etree.cElementTree import \
iterparse # LXML isn't faster, so let's go with the built-in solution

@@ -32,6 +33,9 @@
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus

from six import raise_from


logger = logging.getLogger(__name__)

ARTICLE_MIN_WORDS = 50
@@ -89,6 +93,63 @@
"""`MediaWiki namespaces <https://www.mediawiki.org/wiki/Manual:Namespace>`_ that ought to be ignored."""


def filter_example(elem, text, *args, **kwargs):
    """Example function for filtering arbitrary documents from a wikipedia dump.

    The custom filter function is called _before_ tokenisation and should work on
    the raw text and/or XML element information.

    The filter function gets the entire context of the XML element passed into it,
    but you can of course choose not to use some or all parts of the context. Please
    refer to :func:`gensim.corpora.wikicorpus.extract_pages` for the exact details
    of the page context.

    Parameters
    ----------
    elem : etree.Element
        XML etree element.
    text : str
        The text of the XML node.
    namespace : str
        XML namespace of the XML element.
    title : str
        Page title.
    page_tag : str
        XPath expression for page.
    text_path : str
        XPath expression for text.
    title_path : str
        XPath expression for title.
    ns_path : str
        XPath expression for namespace.
    pageid_path : str
        XPath expression for page id.

    Examples
    --------
    >>> import gensim.corpora
    >>> filter_func = gensim.corpora.wikicorpus.filter_example
    >>> dewiki = gensim.corpora.WikiCorpus(
    ...     './dewiki-20180520-pages-articles-multistream.xml.bz2', filter_articles=filter_func)

    """
    # Filter the German Wikipedia dump for articles that are marked either as
    # Lesenswert (featured) or Exzellent (excellent) by Wikipedia editors.
    # *********************
    # The regexes are compiled inside the function call so that we do not pollute
    # the wikicorpus namespace; do not do this in production, as this function is
    # called for every element in the wiki dump.
    _regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
    _regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)

    if text is None:
        return False
    if _regex_de_excellent.match(text) or _regex_de_featured.match(text):
        return True
    else:
        return False
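# A quick sanity check of the example filter above (illustrative only, not part
# of the module): elem is unused by filter_example, so None can stand in for it.
assert filter_example(None, '{{Exzellent|seit 2004}} Artikeltext ...') is True
assert filter_example(None, 'Ein Artikel ohne Auszeichnung.') is False
assert filter_example(None, None) is False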


def find_interlinks(raw):
"""Find all interlinks to other articles in the dump.
@@ -324,7 +385,7 @@ def get_namespace(tag):
_get_namespace = get_namespace


def extract_pages(f, filter_namespaces=False):
def extract_pages(f, filter_namespaces=False, filter_articles=None):
"""Extract pages from a MediaWiki database dump.
Parameters
@@ -365,6 +426,14 @@ def extract_pages(f, filter_namespaces=False):
                if ns not in filter_namespaces:
                    text = None

            if filter_articles is not None:
                if not filter_articles(
                        elem, namespace=namespace, title=title,
                        text=text, page_tag=page_tag,
                        text_path=text_path, title_path=title_path,
                        ns_path=ns_path, pageid_path=pageid_path):
                    text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid  # empty page will yield None
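
# Because the page context is handed to filter_articles as keyword arguments,
# a custom filter only needs to declare the pieces it uses and can collect the
# rest in **kwargs. A minimal illustrative sketch (hypothetical helper, not
# defined in gensim) that keeps pages whose title starts with 'C':
def keep_c_articles(elem, title=None, **kwargs):
    return title is not None and title.startswith('C')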

@@ -500,8 +569,11 @@ class WikiCorpus(TextCorpus):
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
"""
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
"""Initialize the corpus.
Unless a dictionary is provided, this scans the corpus once to determine its vocabulary.
Parameters
----------
@@ -528,7 +600,11 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
token_max_len : int, optional
Maximal token length.
lower : bool, optional
Convert all text to lower case?
If True - convert all text to lower case.
filter_articles : callable or None, optional
    If set, the full page context (XML element, text, title, namespace and the XPath expressions)
    is passed to this callable as keyword arguments before the article is processed. Only articles
    for which the callable returns a truthy value are kept; return a falsy value to filter an
    article out based on customised rules.
Warnings
--------
@@ -537,6 +613,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
"""
self.fname = fname
self.filter_namespaces = filter_namespaces
self.filter_articles = filter_articles
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
@@ -589,7 +666,7 @@ def get_texts(self):
texts = \
((text, self.lemmatize, title, pageid, tokenization_params)
for title, text, pageid
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

try:
@@ -616,6 +693,9 @@ def get_texts(self):
"(total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
)
except PicklingError as exc:
raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
'make sure the function can be pickled.'.format(self.filter_articles)), exc)
else:
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions "
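
The PicklingError catch above turns a low-level failure into a clearer message: as its text says, the filtering function has to be sendable to multiprocessing, and pickle serialises plain functions by reference, so module-level functions work while lambdas and locally defined closures generally do not. A minimal, gensim-independent illustration of that constraint (the helper name is only for the example):

import pickle

def my_filter(elem, text, *args, **kwargs):
    # Module-level function: picklable, safe to hand to multiprocessing.
    return text is not None

pickle.dumps(my_filter)  # works, pickled by reference to its module and name

try:
    pickle.dumps(lambda elem, text, *args, **kwargs: True)
except (pickle.PicklingError, AttributeError) as err:
    # Depending on how the callable was defined, pickle raises PicklingError or
    # AttributeError; either way it cannot be sent to worker processes.
    print('not picklable:', err)
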
15 changes: 15 additions & 0 deletions gensim/test/test_corpora.py
@@ -721,6 +721,21 @@ def test_empty_input(self):
# An empty file is not legit XML
pass

    def test_custom_filterfunction(self):
        def reject_all(elem, *args, **kwargs):
            return False
        corpus = self.corpus_class(self.enwiki, filter_articles=reject_all)
        texts = corpus.get_texts()
        self.assertTrue(all([not t for t in texts]))

        def keep_some(elem, title, *args, **kwargs):
            return title[0] == 'C'
        corpus = self.corpus_class(self.enwiki, filter_articles=keep_some)
        corpus.metadata = True
        texts = corpus.get_texts()
        for text, (pageid, title) in texts:
            self.assertEqual(title[0], 'C')


class TestTextDirectoryCorpus(unittest.TestCase):

