[MRG] Wikicorpus custom filters #2089
Changes from 7 commits
gensim/corpora/wikicorpus.py

@@ -27,6 +27,7 @@
 import multiprocessing
 import re
 import signal
+from pickle import PicklingError
 from xml.etree.cElementTree import \
     iterparse  # LXML isn't faster, so let's go with the built-in solution
@@ -35,6 +36,9 @@
 from gensim.corpora.dictionary import Dictionary
 from gensim.corpora.textcorpus import TextCorpus
+
+from six import raise_from
+

 logger = logging.getLogger(__name__)

 ARTICLE_MIN_WORDS = 50
@@ -339,7 +343,7 @@ def get_namespace(tag):
 _get_namespace = get_namespace


-def extract_pages(f, filter_namespaces=False):
+def extract_pages(f, filter_namespaces=False, filter_articles=None):
     """Extract pages from a MediaWiki database dump.

     Parameters
@@ -380,6 +384,13 @@ def extract_pages(f, filter_namespaces=False):
             if ns not in filter_namespaces:
                 text = None

+            if filter_articles is not None:
+                if filter_articles(elem, namespace=namespace, title=title,
+                                   text=text, page_tag=page_tag,
+                                   text_path=text_path, title_path=title_path,
+                                   ns_path=ns_path, pageid_path=pageid_path) is None:
+                    text = None
+
             pageid = elem.find(pageid_path).text
             yield title, text or "", pageid  # empty page will yield None
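The contract established by this hunk: `extract_pages` passes the raw page element to the callable positionally, plus keyword context (`namespace`, `title`, `text`, and the XPath strings), and a `None` return blanks the page's text. A minimal sketch of driving `extract_pages` directly with this patch applied; the function name, filtering rule, and dump path are illustrative, not part of the PR:

```python
import bz2

from gensim.corpora.wikicorpus import extract_pages


def skip_redirects(elem, text=None, **kwargs):
    """Hypothetical filter: drop redirect stubs, keep everything else."""
    if text is not None and text.lstrip().lower().startswith('#redirect'):
        return None  # None tells extract_pages to blank this page's text
    return elem      # returning the element keeps the page


# 'enwiki-latest-pages-articles.xml.bz2' is a placeholder dump path.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    for title, text, pageid in extract_pages(f, filter_namespaces=('0',),
                                             filter_articles=skip_redirects):
        pass  # filtered pages arrive with empty text
```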
@@ -512,7 +523,7 @@ class WikiCorpus(TextCorpus):

     def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
+                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
         """Initialize the corpus.

         Unless a dictionary is provided, this scans the corpus once,
@@ -544,10 +555,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
             Maximal token length.
         lower : bool, optional
             If True - convert all text to lower case.
+        filter_articles: callable or None, optional
+            If set, each XML article element will be passed to this callable before being processed. Only articles
+            where the callable returns an XML element are processed, returning None allows filtering out
+            some articles based on customised rules.
Review discussion on the `filter_articles` docstring:

- Are there any restrictions on the callable? Can it be an anonymous lambda function? (Troublesome with parallelized multiprocessing: lambdas cannot be pickled.)
- True, for the parallel [...]. On a related "fun" note, see https://stackoverflow.com/questions/50328386/python-typing-pickle-and-serialisation
- I was thinking a simple note in the docs. What does the error look like now, if a user passes something not pickleable? If it looks reasonable, I'd just keep that. @menshikh-iv what is our policy here in general?
- Well, there's no specific catch for this, so it'll just be Python's PicklingError.
- I think so too. Someone advanced enough to be setting this parameter better be advanced enough to RTFM :)
- Can you create an example of this function and link it in the docstring? This can be useful for better understanding of how the function works.
- You mean add an example?
- Yes, only as an example (no need to use it by default).
- Got it, very busy at work but will try to get it done this week.
- Don't forget to resolve the merge conflict too.
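The example the thread asks for might look something like the following. This is a hedged sketch, not the PR's actual example: `keep_music_articles`, its rule, and the dump path are all invented for illustration. The one real constraint raised above is that the callable must be defined at module level (not a lambda or closure) so the multiprocessing pool in `get_texts()` can pickle it:

```python
from gensim.corpora.wikicorpus import WikiCorpus


def keep_music_articles(elem, title=None, **kwargs):
    """Hypothetical filter: keep only pages whose title mentions 'music'.

    Defined at module level so it can be pickled and shipped to the
    multiprocessing workers.
    """
    if title and 'music' in title.lower():
        return elem  # any non-None return means "process this article"
    return None      # None filters the article out


# Placeholder dump path; the filter goes through the new keyword argument.
corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2',
                    filter_articles=keep_music_articles)
```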

         """
         self.fname = fname
         self.filter_namespaces = filter_namespaces
+        self.filter_articles = filter_articles
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
@@ -587,7 +603,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
+             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

         try:
@@ -614,6 +630,9 @@ def get_texts(self):
                 "(total %i articles, %i positions before pruning articles shorter than %i words)",
                 articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
             )
+        except PicklingError as exc:
+            raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
+                                     'make sure the function can be pickled.'.format(self.filter_articles)), exc)
         else:
             logger.info(
                 "finished iterating over Wikipedia corpus of %i documents with %i positions "
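To make the failure mode concrete (a standalone sketch, not part of the diff): the standard library can pickle a module-level function, because it is serialized by qualified name, but not a lambda. That is exactly the case the new `except PicklingError` clause turns into a friendlier message:

```python
import pickle


def keep_all(elem, **kwargs):
    """Module-level function: pickles fine, serialized by name."""
    return elem


pickle.dumps(keep_all)  # works

try:
    pickle.dumps(lambda elem, **kwargs: elem)
except pickle.PicklingError as err:
    # A lambda has no importable qualified name, so pickling fails;
    # passed as filter_articles, this is what get_texts() now re-raises
    # with a hint to use a pickleable function instead.
    print('cannot pickle a lambda:', err)
```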
A final review comment on the docstring annotation suggests writing the parameter type as `callable, optional`.