-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] Wikicorpus custom filters #2089
Changes from 14 commits
cb02f55
9b8491e
ef7453e
ff9002f
e0012ca
0c97982
d19a2e0
882e699
b81851a
295f4ce
5610c36
224fb28
bfac0c2
cd03fd0
353de1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import multiprocessing | ||
import re | ||
import signal | ||
from pickle import PicklingError | ||
from xml.etree.cElementTree import \ | ||
iterparse # LXML isn't faster, so let's go with the built-in solution | ||
|
||
|
@@ -32,6 +33,9 @@ | |
from gensim.corpora.dictionary import Dictionary | ||
from gensim.corpora.textcorpus import TextCorpus | ||
|
||
from six import raise_from | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
ARTICLE_MIN_WORDS = 50 | ||
|
@@ -89,6 +93,63 @@ | |
"""`MediaWiki namespaces <https://www.mediawiki.org/wiki/Manual:Namespace>`_ that ought to be ignored.""" | ||
|
||
|
||
def filter_example(elem, text, *args, **kwargs):
    """Example function for filtering arbitrary documents from wikipedia dump.

    The custom filter function is called _before_ tokenisation and should work on
    the raw text and/or XML element information.

    The filter function gets the entire context of the XML element passed into it,
    but you can of course choose not to use some or all parts of the context. Please
    refer to :func:`gensim.corpora.wikicorpus.extract_pages` for the exact details
    of the page context.

    Parameters
    ----------
    elem : etree.Element
        XML etree element.
    text : str
        The text of the XML node.
    namespace : str
        XML namespace of the XML element.
    title : str
        Page title.
    page_tag : str
        XPath expression for page.
    text_path : str
        XPath expression for text.
    title_path : str
        XPath expression for title.
    ns_path : str
        XPath expression for namespace.
    pageid_path : str
        XPath expression for page id.

    Returns
    -------
    bool
        True if the article should be kept, False if it should be filtered out.

    Examples
    --------
    >>> import gensim.corpora
    >>> filter_func = gensim.corpora.wikicorpus.filter_example
    >>> dewiki = gensim.corpora.WikiCorpus(
    ...     './dewiki-20180520-pages-articles-multistream.xml.bz2', filter_articles=filter_func)

    """
    # Filter German wikipedia dump for articles that are marked either as
    # Lesenswert (featured) or Exzellent (excellent) by wikipedia editors.
    # *********************
    # regex is in the function call so that we do not pollute the wikicorpus
    # namespace; do not do this in production, as this function is called for
    # every element in the wiki dump.
    # Raw strings are used so the `\{`/`\}`/`\s` escapes are passed to the regex
    # engine verbatim instead of triggering invalid-escape DeprecationWarnings.
    _regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
    _regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)

    if text is None:
        return False
    # bool() collapses the match-object-or-None result to the same True/False
    # values the caller expects.
    return bool(_regex_de_excellent.match(text) or _regex_de_featured.match(text))
|
||
|
||
def find_interlinks(raw): | ||
"""Find all interlinks to other articles in the dump. | ||
|
||
|
@@ -324,7 +385,7 @@ def get_namespace(tag): | |
_get_namespace = get_namespace | ||
|
||
|
||
def extract_pages(f, filter_namespaces=False): | ||
def extract_pages(f, filter_namespaces=False, filter_articles=None): | ||
"""Extract pages from a MediaWiki database dump. | ||
|
||
Parameters | ||
|
@@ -365,6 +426,13 @@ def extract_pages(f, filter_namespaces=False): | |
if ns not in filter_namespaces: | ||
text = None | ||
|
||
if filter_articles is not None: | ||
if not filter_articles(elem, namespace=namespace, title=title, | ||
text=text, page_tag=page_tag, | ||
text_path=text_path, title_path=title_path, | ||
ns_path=ns_path, pageid_path=pageid_path): | ||
text = None | ||
|
||
pageid = elem.find(pageid_path).text | ||
yield title, text or "", pageid # empty page will yield None | ||
|
||
|
@@ -500,8 +568,11 @@ class WikiCorpus(TextCorpus): | |
""" | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, | ||
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, | ||
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): | ||
""" | ||
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None): | ||
"""Initialize the corpus. | ||
|
||
Unless a dictionary is provided, this scans the corpus once, | ||
to determine its vocabulary. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -528,7 +599,11 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction | |
token_max_len : int, optional | ||
Maximal token length. | ||
lower : bool, optional | ||
Convert all text to lower case? | ||
If True - convert all text to lower case. | ||
filter_articles: callable or None, optional | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
If set, the full page context (raw text, XML element, title, XPath expressions, etc.) is passed to this callable | ||
before tokenisation. Only articles for which the callable returns a truthy value are processed; returning a | ||
falsy value (e.g. ``False`` or ``None``) filters the article out, enabling customised filtering rules. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there any restrictions on the callable? Can it be an anonymous lambda function? (troublesome with parallelized multiprocessing -- lambdas cannot be pickled) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True, for the parallel On a related "fun" note, see https://stackoverflow.com/questions/50328386/python-typing-pickle-and-serialisation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking a simple note in the docs. What does the error look like now, if a user passes something not pickleable? If it looks reasonable, I'd just keep that. @menshikh-iv what is our policy here in general? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well there's no specific catch for this so it'll just be python's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think so too. Someone advanced enough to be setting this parameter better be advanced enough to RTFM :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you create an example of this function and link it in docstring like
this can be useful for better understanding, how function works
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you mean add an example There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, only as example (no need to use it by default) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it, very busy at work but will try to get it done this week There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't forget to resolve merge conflict too |
||
|
||
Warnings | ||
-------- | ||
|
@@ -537,6 +612,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction | |
""" | ||
self.fname = fname | ||
self.filter_namespaces = filter_namespaces | ||
self.filter_articles = filter_articles | ||
self.metadata = False | ||
if processes is None: | ||
processes = max(1, multiprocessing.cpu_count() - 1) | ||
|
@@ -589,7 +665,7 @@ def get_texts(self): | |
texts = \ | ||
((text, self.lemmatize, title, pageid, tokenization_params) | ||
for title, text, pageid | ||
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) | ||
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles)) | ||
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) | ||
|
||
try: | ||
|
@@ -616,6 +692,9 @@ def get_texts(self): | |
"(total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS | ||
) | ||
except PicklingError as exc: | ||
raise_from(PicklingError('Can not send filtering function {} to multiprocessing, ' | ||
'make sure the function can be pickled.'.format(self.filter_articles)), exc) | ||
else: | ||
logger.info( | ||
"finished iterating over Wikipedia corpus of %i documents with %i positions " | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We use hanging indent in Gensim (not vertical alignment): https://www.python.org/dev/peps/pep-0008/#id3
Why? Makes code easier to maintain (people forget to re-indent after modifying the opening line). Plus some people (me!) also find it more consistent and easier to read :)