change "multiprocessing" parameter to "workers" parameter
Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
TimSchopf committed Feb 12, 2022
1 parent 2f45652 commit 48d7b68
Showing 8 changed files with 66 additions and 27 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -116,7 +116,7 @@ vectorizer = KeyphraseCountVectorizer()

# Print parameters
print(vectorizer.get_params())
>>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'multiprocessing': False, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english'}
>>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
```
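With the renamed parameter, the number of spaCy part-of-speech tagging processes can be set directly at initialization. A minimal sketch, assuming the package's sklearn-style `fit_transform` interface and an illustrative `docs` list:

```python
from keyphrase_vectorizers import KeyphraseCountVectorizer

docs = [
    "Keyphrase extraction selects descriptive noun phrases from text.",
    "spaCy can tag documents with part-of-speech in parallel processes.",
]

# Use two worker processes for spaCy part-of-speech tagging (workers=-1 would use all available cores)
vectorizer = KeyphraseCountVectorizer(workers=2)
document_keyphrase_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_params()['workers'])
>>> 2
```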

By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
@@ -237,7 +237,7 @@ vectorizer = KeyphraseTfidfVectorizer()

# Print parameters
print(vectorizer.get_params())
{'binary': False, 'dtype': <class 'numpy.float64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'multiprocessing': False, 'norm': 'l2', 'pos_pattern': '<J.*>*<N.*>+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True}
{'binary': False, 'dtype': <class 'numpy.float64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'norm': 'l2', 'pos_pattern': '<J.*>*<N.*>+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True, 'workers': 1}
```

To calculate tf values instead, set `use_idf=False`.
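A brief sketch of that switch, with the output shown per the `get_params` convention above:

```python
from keyphrase_vectorizers import KeyphraseTfidfVectorizer

# With use_idf=False, the vectorizer returns (l2-normalized) term frequencies instead of tf-idf weights
vectorizer = KeyphraseTfidfVectorizer(use_idf=False)
print(vectorizer.get_params()['use_idf'])
>>> False
```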
3 changes: 2 additions & 1 deletion docs/requirements.txt
@@ -15,4 +15,5 @@ numpy>=1.18.5
spacy>=3.0.1
nltk>=3.6.1
scikit-learn>=1.0
scipy>=1.7.3
scipy>=1.7.3
psutil>=5.8.0
2 changes: 1 addition & 1 deletion keyphrase_vectorizers/_version.py
@@ -1 +1 @@
__version__ = '0.0.5'
__version__ = '0.0.6'
25 changes: 19 additions & 6 deletions keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -10,6 +10,7 @@
from typing import List

import numpy as np
import psutil
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import CountVectorizer
@@ -52,9 +53,10 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all cores to tag documents with part-of-speech.
workers : int, default=1
How many workers to use for spaCy part-of-speech tagging.
If set to -1, all available cores of the machine are used.
spaCy uses the specified number of cores to tag documents with part-of-speech.
Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
In particular, the default start method, spawn, used on macOS/OS X (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
@@ -75,7 +77,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
"""

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english', lowercase: bool = True, multiprocessing: bool = False, max_df: int = None,
stop_words: str = 'english', lowercase: bool = True, workers: int = 1, max_df: int = None,
min_df: int = None,
binary: bool = False, dtype: np.dtype = np.int64):

@@ -108,11 +110,22 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<
"'max_df' must be > 'min_df'"
)

# triggers a parameter validation
if not isinstance(workers, int):
raise ValueError(
"'workers' parameter must be of type int"
)

if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
raise ValueError(
"'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
)

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.workers = workers
self.max_df = max_df
self.min_df = min_df
self.binary = binary
@@ -137,7 +150,7 @@ def fit(self, raw_documents: List[str]) -> object:
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase, multiprocessing=self.multiprocessing)
lowercase=self.lowercase, workers=self.workers)

# remove keyphrases that have more than 8 words, as they are probably not real keyphrases
# additionally this prevents memory issues during transformation to a document-keyphrase matrix
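The new `workers` validation can be exercised in isolation. An illustrative sketch (the upper bound depends on the machine's logical core count reported by psutil):

```python
import psutil

from keyphrase_vectorizers import KeyphraseCountVectorizer

max_workers = psutil.cpu_count(logical=True)

KeyphraseCountVectorizer(workers=-1)  # valid: use all available cores
KeyphraseCountVectorizer(workers=1)   # valid: default, single process

# Non-integers or values outside [-1, logical core count] raise a ValueError from the checks above
try:
    KeyphraseCountVectorizer(workers=max_workers + 1)
except ValueError as error:
    print(error)
```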
25 changes: 19 additions & 6 deletions keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -10,6 +10,7 @@
from typing import List

import numpy as np
import psutil
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils.validation import FLOAT_DTYPES
@@ -80,9 +81,10 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all cores to tag documents with part-of-speech.
workers : int, default=1
How many workers to use for spaCy part-of-speech tagging.
If set to -1, all available cores of the machine are used.
spaCy uses the specified number of cores to tag documents with part-of-speech.
Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
In particular, the default start method, spawn, used on macOS/OS X (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
@@ -121,17 +123,28 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english',
lowercase: bool = True, multiprocessing: bool = False, max_df: int = None, min_df: int = None,
lowercase: bool = True, workers: int = 1, max_df: int = None, min_df: int = None,
binary: bool = False,
dtype: np.dtype = np.float64, norm: str = "l2",
use_idf: bool = True, smooth_idf: bool = True,
sublinear_tf: bool = False):

# triggers a parameter validation
if not isinstance(workers, int):
raise ValueError(
"'workers' parameter must be of type int"
)

if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
raise ValueError(
"'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
)

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.workers = workers
self.max_df = max_df
self.min_df = min_df
self.binary = binary
@@ -145,7 +158,7 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<
sublinear_tf=self.sublinear_tf)

super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words,
lowercase=self.lowercase, multiprocessing=self.multiprocessing, max_df=self.max_df,
lowercase=self.lowercase, workers=self.workers, max_df=self.max_df,
min_df=self.min_df, binary=self.binary,
dtype=self.dtype)

28 changes: 19 additions & 9 deletions keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -10,6 +10,7 @@
from typing import List

import numpy as np
import psutil
import scipy.sparse as sp
import spacy
from nltk import RegexpParser
@@ -180,7 +181,7 @@ def _split_long_document(self, text: str, max_text_length: int) -> List[str]:
return splitted_document

def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_pipeline: str, pos_pattern: str,
lowercase: bool = True, multiprocessing: bool = False) -> List[str]:
lowercase: bool = True, workers: int = 1) -> List[str]:
"""
Select keyphrases with part-of-speech tagging from a text document.
Parameters
@@ -202,9 +203,10 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy POS tagging.
If True, spaCy uses all cores to POS tag documents.
workers : int, default=1
How many workers to use for spaCy part-of-speech tagging.
If set to -1, all available cores of the machine are used.
spaCy uses the specified number of cores to tag documents with part-of-speech.
Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
In particular, the default start method, spawn, used on macOS/OS X (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
@@ -244,6 +246,17 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
"'pos_pattern' parameter needs to be a regex string. E.g. '<J.*>*<N.*>+'"
)

# triggers a parameter validation
if not isinstance(workers, int):
raise ValueError(
"'workers' parameter must be of type int"
)

if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
raise ValueError(
"'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
)

stop_words_list = []
if stop_words:
stop_words_list = set(stopwords.words(stop_words))
@@ -274,11 +287,8 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
nlp.add_pipe('sentencizer')

keyphrases_list = []
if multiprocessing:
num_workers = -1
if workers != 1:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
else:
num_workers = 1

# split large documents in smaller chunks, so that spacy can process them without memory issues
docs_list = []
@@ -297,7 +307,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
nlp.max_length = max([len(doc) for doc in document_list]) + 100

cp = RegexpParser('CHUNK: {(' + pos_pattern + ')}')
for tagged_doc in nlp.pipe(document_list, n_process=num_workers):
for tagged_doc in nlp.pipe(document_list, n_process=workers):
tagged_pos_doc = []
for sentence in tagged_doc.sents:
pos_tagged_sentence = []
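For context, a standalone sketch of what `n_process` changes in this tagging loop (simplified pipeline setup, not the repository's exact one):

```python
import spacy
from nltk import RegexpParser

# Only the tagger is needed for POS tags; parser, lemmatizer and ner are excluded to keep the sketch light
nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner"])

docs = ["Keyphrase extraction finds noun phrases.", "Text mining is fun."]
cp = RegexpParser('CHUNK: {(<J.*>*<N.*>+)}')

# n_process=1 tags in the main process; n_process > 1 or -1 spawns worker processes,
# which adds start-up overhead with the spawn start method on macOS/Windows
for tagged_doc in nlp.pipe(docs, n_process=1):
    pos_tagged_doc = [(token.text, token.tag_) for token in tagged_doc]
    print(cp.parse(pos_tagged_doc))
```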
3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.18.5
spacy>=3.0.1
nltk>=3.6.1
scikit-learn>=1.0
scipy>=1.7.3
scipy>=1.7.3
psutil>=5.8.0
3 changes: 2 additions & 1 deletion setup.py
@@ -35,7 +35,8 @@
'spacy >= 3.0.1',
'nltk >= 3.6.1',
'scikit-learn >= 1.0',
'scipy>=1.7.3'
'scipy>=1.7.3',
'psutil>=5.8.0'
],
package_dir={"": "."},
packages=setuptools.find_packages(where="."),