[MRG] Wikicorpus custom filters #2089
Changes from 7 commits
gensim/corpora/wikicorpus.py

@@ -27,6 +27,7 @@
 import multiprocessing
 import re
 import signal
+from pickle import PicklingError
 from xml.etree.cElementTree import \
     iterparse  # LXML isn't faster, so let's go with the built-in solution
@@ -35,6 +36,9 @@
 from gensim.corpora.dictionary import Dictionary
 from gensim.corpora.textcorpus import TextCorpus
+
+from six import raise_from
+

 logger = logging.getLogger(__name__)

 ARTICLE_MIN_WORDS = 50
@@ -339,7 +343,7 @@ def get_namespace(tag):
 _get_namespace = get_namespace


-def extract_pages(f, filter_namespaces=False):
+def extract_pages(f, filter_namespaces=False, filter_articles=None):
     """Extract pages from a MediaWiki database dump.

     Parameters
@@ -380,6 +384,13 @@ def extract_pages(f, filter_namespaces=False):
             if ns not in filter_namespaces:
                 text = None

+            if filter_articles is not None:
+                if filter_articles(elem, namespace=namespace, title=title,
+                                   text=text, page_tag=page_tag,
+                                   text_path=text_path, title_path=title_path,
+                                   ns_path=ns_path, pageid_path=pageid_path) is None:
+                    text = None
+
             pageid = elem.find(pageid_path).text
             yield title, text or "", pageid  # empty page will yield None
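The contract established by this hunk: `extract_pages` passes the raw page element to the callable positionally, plus keyword context (`namespace`, `title`, `text`, and the XPath strings), and a `None` return blanks the page's text. A minimal sketch of driving `extract_pages` directly with this patch applied; the function name, filtering rule, and dump path are illustrative, not part of the PR:

```python
import bz2

from gensim.corpora.wikicorpus import extract_pages


def skip_redirects(elem, text=None, **kwargs):
    """Hypothetical filter: drop redirect stubs, keep everything else."""
    if text is not None and text.lstrip().lower().startswith('#redirect'):
        return None  # None tells extract_pages to blank this page's text
    return elem      # returning the element keeps the page


# 'enwiki-latest-pages-articles.xml.bz2' is a placeholder dump path.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    for title, text, pageid in extract_pages(f, filter_namespaces=('0',),
                                             filter_articles=skip_redirects):
        pass  # filtered pages arrive with empty text
```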
@@ -512,7 +523,7 @@ class WikiCorpus(TextCorpus):

     def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
+                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
         """Initialize the corpus.

         Unless a dictionary is provided, this scans the corpus once,
@@ -544,10 +555,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
             Maximal token length.
         lower : bool, optional
             If True - convert all text to lower case.
+        filter_articles: callable or None, optional
+            If set, each XML article element will be passed to this callable before being processed. Only articles
+            where the callable returns an XML element are processed, returning None allows filtering out
+            some articles based on customised rules.
Review discussion on the `filter_articles` docstring:

- Are there any restrictions on the callable? Can it be an anonymous lambda function? (Troublesome with parallelized multiprocessing: lambdas cannot be pickled.)
- True, for the parallel [...]. On a related "fun" note, see https://stackoverflow.com/questions/50328386/python-typing-pickle-and-serialisation
- I was thinking a simple note in the docs. What does the error look like now, if a user passes something not pickleable? If it looks reasonable, I'd just keep that. @menshikh-iv what is our policy here in general?
- Well, there's no specific catch for this, so it'll just be Python's PicklingError.
- I think so too. Someone advanced enough to be setting this parameter better be advanced enough to RTFM :)
- Can you create an example of this function and link it in the docstring? This can be useful for better understanding of how the function works.
- You mean add an example?
- Yes, only as an example (no need to use it by default).
- Got it, very busy at work but will try to get it done this week.
- Don't forget to resolve the merge conflict too.
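The example the thread asks for might look something like the following. This is a hedged sketch, not the PR's actual example: `keep_music_articles`, its rule, and the dump path are all invented for illustration. The one real constraint raised above is that the callable must be defined at module level (not a lambda or closure) so the multiprocessing pool in `get_texts()` can pickle it:

```python
from gensim.corpora.wikicorpus import WikiCorpus


def keep_music_articles(elem, title=None, **kwargs):
    """Hypothetical filter: keep only pages whose title mentions 'music'.

    Defined at module level so it can be pickled and shipped to the
    multiprocessing workers.
    """
    if title and 'music' in title.lower():
        return elem  # any non-None return means "process this article"
    return None      # None filters the article out


# Placeholder dump path; the filter goes through the new keyword argument.
corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2',
                    filter_articles=keep_music_articles)
```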

         """
         self.fname = fname
         self.filter_namespaces = filter_namespaces
+        self.filter_articles = filter_articles
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
@@ -587,7 +603,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
+             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

         try:
@@ -614,6 +630,9 @@ def get_texts(self):
                 "(total %i articles, %i positions before pruning articles shorter than %i words)",
                 articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
             )
+        except PicklingError as exc:
+            raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
+                                     'make sure the function can be pickled.'.format(self.filter_articles)), exc)
         else:
             logger.info(
                 "finished iterating over Wikipedia corpus of %i documents with %i positions "
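To make the failure mode concrete (a standalone sketch, not part of the diff): the standard library can pickle a module-level function, because it is serialized by qualified name, but not a lambda. That is exactly the case the new `except PicklingError` clause turns into a friendlier message:

```python
import pickle


def keep_all(elem, **kwargs):
    """Module-level function: pickles fine, serialized by name."""
    return elem


pickle.dumps(keep_all)  # works

try:
    pickle.dumps(lambda elem, **kwargs: elem)
except pickle.PicklingError as err:
    # A lambda has no importable qualified name, so pickling fails;
    # passed as filter_articles, this is what get_texts() now re-raises
    # with a hint to use a pickleable function instead.
    print('cannot pickle a lambda:', err)
```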
A final review comment on the docstring annotation suggests writing the parameter type as `callable, optional`.