
[MRG] Wikicorpus custom filters #2089

Merged: 15 commits, Jul 13, 2018

Changes from 14 commits
89 changes: 84 additions & 5 deletions gensim/corpora/wikicorpus.py
@@ -24,6 +24,7 @@
import multiprocessing
import re
import signal
from pickle import PicklingError
from xml.etree.cElementTree import \
iterparse # LXML isn't faster, so let's go with the built-in solution

@@ -32,6 +33,9 @@
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus

from six import raise_from


logger = logging.getLogger(__name__)

ARTICLE_MIN_WORDS = 50
@@ -89,6 +93,63 @@
"""`MediaWiki namespaces <https://www.mediawiki.org/wiki/Manual:Namespace>`_ that ought to be ignored."""


def filter_example(elem, text, *args, **kwargs):
"""Example function for filtering arbitrary documents from wikipedia dump.


The custom filter function is called _before_ tokenisation and should work on
the raw text and/or XML element information.

The filter function gets the entire context of the XML element passed into it,
but you can of course choose not the use some or all parts of the context. Please
refer to :func:`gensim.corpora.wikicorpus.extract_pages` for the exact details
of the page context.

Parameters
----------
elem : etree.Element
XML etree element
text : str
The text of the XML node
namespace : str
XML namespace of the XML element
title : str
Page title
page_tag : str
XPath expression for page.
text_path : str
XPath expression for text.
title_path : str
XPath expression for title.
ns_path : str
XPath expression for namespace.
pageid_path : str
XPath expression for page id.

Examples
--------
>>> import gensim.corpora
>>> filter_func = gensim.corpora.wikicorpus.filter_example
>>> dewiki = gensim.corpora.WikiCorpus('./dewiki-20180520-pages-articles-multistream.xml.bz2',
...     filter_articles=filter_func)
"""
# Filter the German Wikipedia dump for articles that are marked either as
# Exzellent (excellent) or Lesenswert (worth reading) by Wikipedia editors.
# *********************
# The regexes are compiled inside the function so that we do not pollute the
# wikicorpus namespace; do not do this in production, as this function is
# called for every element in the wiki dump.
_regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
_regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)

if text is None:
    return False
if _regex_de_excellent.match(text) or _regex_de_featured.match(text):
    return True
else:
    return False


def find_interlinks(raw):
"""Find all interlinks to other articles in the dump.

@@ -324,7 +385,7 @@ def get_namespace(tag):
_get_namespace = get_namespace


-def extract_pages(f, filter_namespaces=False):
+def extract_pages(f, filter_namespaces=False, filter_articles=None):
"""Extract pages from a MediaWiki database dump.

Parameters
@@ -365,6 +426,13 @@ def extract_pages(f, filter_namespaces=False):
if ns not in filter_namespaces:
text = None

if filter_articles is not None:
    if not filter_articles(elem, namespace=namespace, title=title,
                           text=text, page_tag=page_tag,
                           text_path=text_path, title_path=title_path,
                           ns_path=ns_path, pageid_path=pageid_path):
        text = None

piskvorky (Owner), Jul 11, 2018:
We use hanging indent in Gensim (not vertical alignment): https://www.python.org/dev/peps/pep-0008/#id3

Why? Makes code easier to maintain (people forget to re-indent after modifying the opening line). Plus some people (me!) also find it more consistent and easier to read :)

pageid = elem.find(pageid_path).text
yield title, text or "", pageid # empty page will yield None
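For reference, here is a sketch of driving the new hook directly; the dump path and the filter rule are hypothetical. extract_pages keeps yielding (title, text, pageid) triples, but pages rejected by the filter come through with text set to the empty string:

import bz2
from gensim.corpora.wikicorpus import extract_pages

def starts_with_c(elem, title=None, *args, **kwargs):
    # Illustrative rule only: keep pages whose title starts with 'C'.
    return title is not None and title.startswith('C')

with bz2.BZ2File('./enwiki-latest-pages-articles.xml.bz2') as f:  # hypothetical dump
    for title, text, pageid in extract_pages(f, filter_namespaces=('0',), filter_articles=starts_with_c):
        if text:  # rejected pages arrive with empty text
            print(pageid, title)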

@@ -500,8 +568,11 @@ class WikiCorpus(TextCorpus):
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
-"""
+token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
+"""Initialize the corpus.
+
+Unless a dictionary is provided, this scans the corpus once,
+to determine its vocabulary.

Parameters
----------
@@ -528,7 +599,11 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
token_max_len : int, optional
Maximal token length.
lower : bool, optional
-Convert all text to lower case?
+If True - convert all text to lower case.
filter_articles: callable or None, optional
If set, each XML article element will be passed to this callable before being processed. Only articles
where the callable returns an XML element are processed; returning None allows filtering out
some articles based on customised rules.

Contributor:
callable, optional

piskvorky (Owner), Jun 17, 2018:
Are there any restrictions on the callable? Can it be an anonymous lambda function? (troublesome with parallelized multiprocessing -- lambdas cannot be pickled)

mattilyra (Contributor, Author), Jun 17, 2018:
True, for the parallel self.process to work filter_articles needs to be serialisable. What's the practice for checking this in gensim? Try to pickle the callable and raise an informative exception if that fails? Is pickle the standard serialiser in gensim?

On a related "fun" note, see https://stackoverflow.com/questions/50328386/python-typing-pickle-and-serialisation

piskvorky (Owner):
I was thinking a simple note in the docs. What does the error look like now, if a user passes something not pickleable? If it looks reasonable, I'd just keep that.

@menshikh-iv what is our policy here in general?

mattilyra (Contributor, Author):
Well, there's no specific catch for this, so it'll just be Python's PicklingError. That plus a note in the docs should be enough, but I don't know how this is handled elsewhere in gensim.

piskvorky (Owner):
I think so too. Someone advanced enough to be setting this parameter better be advanced enough to RTFM :)

Contributor:
Can you create an example of this function and link it in the docstring, like
:func:`gensim.corpora.wikicorpus.filter_example`
This can be useful for a better understanding of how the function works:
  • input parameters
  • return values (probably True/False is better than XML element/None)

mattilyra (Contributor, Author):
you mean add an example filter_example to the wikicorpus module?

Contributor:
yes, only as an example (no need to use it by default)

mattilyra (Contributor, Author):
got it, very busy at work but will try to get it done this week

menshikh-iv (Contributor), Jun 22, 2018:
don't forget to resolve merge conflict too
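To make the serialisation concern from this thread concrete, here is a minimal sketch; the dump path and filter rule are hypothetical. A module-level function is picklable and therefore safe, while, per the discussion above, a lambda risks a PicklingError on the multiprocessing path:

from gensim.corpora import WikiCorpus

def keep_long_articles(elem, text=None, *args, **kwargs):
    # Module-level functions are picklable, so the review thread prefers them over lambdas.
    return text is not None and len(text) > 10000

wiki = WikiCorpus('./enwiki-latest-pages-articles.xml.bz2',
    filter_articles=keep_long_articles)  # hanging indent, per the review above

# A lambda, by contrast, cannot be pickled; per this thread it risks failing
# once the corpus is scanned:
# wiki = WikiCorpus('./enwiki-latest-pages-articles.xml.bz2',
#     filter_articles=lambda elem, text=None, *args, **kwargs: True)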


Warnings
--------
@@ -537,6 +612,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
"""
self.fname = fname
self.filter_namespaces = filter_namespaces
self.filter_articles = filter_articles
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
@@ -589,7 +665,7 @@ def get_texts(self):
texts = \
((text, self.lemmatize, title, pageid, tokenization_params)
for title, text, pageid
-in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
+in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

try:
@@ -616,6 +692,9 @@
"(total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
)
except PicklingError as exc:
raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
'make sure the function can be pickled.'.format(self.filter_articles)), exc)
else:
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions "
15 changes: 15 additions & 0 deletions gensim/test/test_corpora.py
@@ -721,6 +721,21 @@ def test_empty_input(self):
# An empty file is not legit XML
pass

def test_custom_filterfunction(self):
    def reject_all(elem, *args, **kwargs):
        return False

    # Rejected articles come back as empty token lists and are pruned by the
    # ARTICLE_MIN_WORDS threshold, so get_texts() should yield nothing.
    corpus = self.corpus_class(self.enwiki, filter_articles=reject_all)
    texts = corpus.get_texts()
    self.assertTrue(all([not t for t in texts]))

    def keep_some(elem, title, *args, **kwargs):
        # Keep only articles whose title starts with 'C'.
        return title[0] == 'C'

    corpus = self.corpus_class(self.enwiki, filter_articles=keep_some)
    corpus.metadata = True
    texts = corpus.get_texts()
    for text, (pageid, title) in texts:
        self.assertEqual(title[0], 'C')


class TestTextDirectoryCorpus(unittest.TestCase):
