Skip to content

Commit

Permalink
adjust test data to 'stop_words' = None parameter
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
  • Loading branch information
TimSchopf committed May 16, 2022
1 parent 8823364 commit 2069eb1
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 79 deletions.
111 changes: 55 additions & 56 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,16 @@ vectorizer.fit(docs)
keyphrases = vectorizer.get_feature_names_out()

print(keyphrases)
>>> ['output' 'training data' 'task' 'way' 'input object' 'documents'
'unseen instances' 'vector' 'interest' 'learning algorithm'
'unseen situations' 'training examples' 'machine' 'given document'
'document' 'document relevance' 'output pairs' 'document content'
'class labels' 'new examples' 'pair' 'main topics' 'phrases' 'overlap'
'algorithm' 'various applications' 'information retrieval' 'users' 'list'
'example input' 'supervised learning' 'optimal scenario'
'precise summary' 'keywords' 'input' 'supervised learning algorithm'
'example' 'supervisory signal' 'indication' 'set'
'information retrieval environment' 'output value' 'inductive bias'
'groups' 'function']
>>> ['users' 'main topics' 'learning algorithm' 'overlap' 'documents' 'output'
'keywords' 'precise summary' 'new examples' 'training data' 'input'
'document content' 'training examples' 'unseen instances'
'optimal scenario' 'document' 'task' 'supervised learning algorithm'
'example' 'interest' 'function' 'example input' 'various applications'
'unseen situations' 'phrases' 'indication' 'inductive bias'
'supervisory signal' 'document relevance' 'information retrieval' 'set'
'input object' 'groups' 'output value' 'list' 'learning' 'output pairs'
'pair' 'class labels' 'supervised learning' 'machine'
'information retrieval environment' 'algorithm' 'vector' 'way']
```

```python
Expand All @@ -158,10 +157,10 @@ print(keyphrases)
document_keyphrase_matrix = vectorizer.transform(docs).toarray()

print(document_keyphrase_matrix)
>>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1
2 1 0 1 0 1 1 0 3]
[0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0
0 0 1 0 1 0 0 1 0]]
>>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6
1 1 1 3 1 0 3 1 1]
[1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0
0 0 0 0 0 1 0 0 0]]
```

```python
Expand All @@ -170,10 +169,10 @@ print(document_keyphrase_matrix)
document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()

print(document_keyphrase_matrix)
>>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1
2 1 0 1 0 1 1 0 3]
[0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0
0 0 1 0 1 0 0 1 0]]
>>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6
1 1 1 3 1 0 3 1 1]
[1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0
0 0 0 0 0 1 0 0 0]]
```

<a name="#other-languages"/></a>
Expand Down Expand Up @@ -250,40 +249,39 @@ To calculate tf values instead, set `use_idf=False`.
document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()

print(document_keyphrase_matrix)
>>> [[0.11111111 0.22222222 0.11111111 0. 0. 0.
0.11111111 0. 0.11111111 0.11111111 0.33333333 0.
0. 0. 0.11111111 0. 0. 0.11111111
0. 0.33333333 0. 0.22222222 0. 0.11111111
0.11111111 0.11111111 0.11111111 0.11111111 0.33333333 0.11111111
0.11111111 0.33333333 0.11111111 0. 0.33333333 0.
0. 0. 0.11111111 0. 0.11111111 0.11111111
0. 0.33333333 0.11111111]
[0. 0. 0. 0.11785113 0.11785113 0.11785113
0. 0.11785113 0. 0. 0. 0.11785113
0.11785113 0.11785113 0. 0.11785113 0.23570226 0.
0.23570226 0. 0.58925565 0. 0.11785113 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0.58925565 0. 0.11785113
0.11785113 0.11785113 0. 0.11785113 0. 0.
0.11785113 0. 0. ]]
>>> [[0. 0. 0.09245003 0.09245003 0.09245003 0.09245003
0.2773501 0.09245003 0.2773501 0.2773501 0.09245003 0.
0. 0.09245003 0. 0.2773501 0.09245003 0.09245003
0. 0.09245003 0.09245003 0.09245003 0.09245003 0.09245003
0.5547002 0. 0. 0.09245003 0.09245003 0.
0.2773501 0.18490007 0.09245003 0. 0.2773501 0.
0. 0.09245003 0. 0.09245003 0. 0.
0. 0.18490007 0. ]
[0.11867817 0.11867817 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.11867817
0.11867817 0. 0.11867817 0. 0. 0.
0.11867817 0. 0. 0. 0. 0.
0. 0.11867817 0.23735633 0. 0. 0.11867817
0. 0. 0. 0.23735633 0. 0.11867817
0.11867817 0. 0.59339083 0. 0.11867817 0.11867817
0.11867817 0. 0.59339083]]
```

```python
# Return keyphrases
keyphrases = vectorizer.get_feature_names_out()

print(keyphrases)
>>> ['optimal scenario' 'example' 'input object' 'groups' 'list'
'precise summary' 'inductive bias' 'phrases' 'training examples'
'output value' 'function' 'given document' 'documents'
'information retrieval environment' 'new examples' 'interest'
'main topics' 'unseen situations' 'information retrieval' 'input'
'keywords' 'learning algorithm' 'indication' 'set' 'example input'
'vector' 'machine' 'supervised learning algorithm' 'algorithm' 'pair'
'task' 'training data' 'way' 'document' 'supervised learning' 'users'
'document relevance' 'document content' 'supervisory signal' 'overlap'
'class labels' 'unseen instances' 'various applications' 'output'
'output pairs']
>>> ['various applications' 'list' 'task' 'supervisory signal'
'inductive bias' 'supervised learning algorithm' 'supervised learning'
'example input' 'input' 'algorithm' 'set' 'precise summary' 'documents'
'input object' 'interest' 'function' 'class labels' 'machine'
'document content' 'output pairs' 'new examples' 'unseen situations'
'vector' 'output value' 'learning' 'document relevance' 'main topics'
'pair' 'training examples' 'information retrieval environment'
'training data' 'example' 'optimal scenario' 'information retrieval'
'output' 'groups' 'indication' 'unseen instances' 'keywords' 'way'
'phrases' 'overlap' 'users' 'learning algorithm' 'document']
```

<a name="#keyphrase-extraction-with-keybert"/></a>
Expand Down Expand Up @@ -363,20 +361,21 @@ n-gram range. We only have to pass a keyphrase vectorizer as parameter to KeyBER

```python
>>> kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer())
[[('training examples', 0.4668),
('training data', 0.5271),
('learning algorithm', 0.5632),
('supervised learning', 0.6779),
('supervised learning algorithm', 0.6992)],
[('given document', 0.4143),
('information retrieval environment', 0.5166),
('information retrieval', 0.5792),
('keywords', 0.6046),
[[('learning', 0.4813),
('training data', 0.5271),
('learning algorithm', 0.5632),
('supervised learning', 0.6779),
('supervised learning algorithm', 0.6992)],
[('document content', 0.3988),
('information retrieval environment', 0.5166),
('information retrieval', 0.5792),
('keywords', 0.6046),
('document relevance', 0.633)]]

```

This allows us to make sure that we do not cut off important words by defining our n-gram range too short. For
example, we would not have found the keyphrase "supervised learning algorithm" with keyphrase_ngram_range=(1,2).
example, we would not have found the keyphrase "supervised learning algorithm" with `keyphrase_ngram_range=(1,2)`.
Furthermore, we avoid getting keyphrases that are slightly off-key, such as "labeled training", "signal supervised" or
"keywords quickly".

Expand Down
47 changes: 24 additions & 23 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,12 @@ def get_german_test_docs():

def get_english_test_keyphrases():
sorted_english_test_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 'document relevance',
'documents', 'example', 'example input', 'function', 'given document', 'groups',
'indication', 'inductive bias', 'information retrieval',
'information retrieval environment', 'input', 'input object', 'interest',
'keywords',
'learning algorithm', 'list', 'machine', 'main topics', 'new examples',
'optimal scenario', 'output', 'output pairs', 'output value', 'overlap', 'pair',
'phrases', 'precise summary', 'set', 'supervised learning',
'supervised learning algorithm', 'supervisory signal', 'task', 'training data',
'documents', 'example', 'example input', 'function', 'groups', 'indication',
'inductive bias', 'information retrieval', 'information retrieval environment',
'input', 'input object', 'interest', 'keywords', 'learning', 'learning algorithm',
'list', 'machine', 'main topics', 'new examples', 'optimal scenario', 'output',
'output pairs', 'output value', 'overlap', 'pair', 'phrases', 'precise summary',
'set', 'supervised learning', 'supervisory signal', 'task', 'training data',
'training examples', 'unseen instances', 'unseen situations', 'users',
'various applications', 'vector', 'way']
return sorted_english_test_keyphrases
Expand All @@ -68,30 +66,33 @@ def get_german_test_keyphrases():

def get_sorted_english_count_matrix():
sorted_english_count_matrix = [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 3, 3, 3, 3, 3, 3],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
3, 3, 3, 3, 3, 3, 6],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 5, 5]]
1, 1, 1, 2, 2, 5, 5]]

return sorted_english_count_matrix


def get_sorted_english_tfidf_matrix():
sorted_english_tfidf_matrix = [
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111,
0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111,
0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111, 0.1111111111,
0.1111111111, 0.1111111111, 0.2222222222, 0.2222222222, 0.3333333333, 0.3333333333, 0.3333333333, 0.3333333333,
0.3333333333, 0.3333333333],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0928476691,
0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691,
0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691, 0.0928476691,
0.0928476691, 0.1856953382, 0.1856953382, 0.2785430073, 0.2785430073, 0.2785430073, 0.2785430073, 0.2785430073,
0.2785430073, 0.5570860145],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302,
0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302, 0.1178511302,
0.2357022604, 0.2357022604, 0.589255651, 0.589255651]]
0.0, 0.0, 0.0, 0.0, 0.0, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658,
0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.2373563316,
0.2373563316, 0.5933908291, 0.5933908291]]

return sorted_english_tfidf_matrix


def get_english_keybert_keyphrases():
english_keybert_keyphrases = [['training examples', 'training data', 'learning algorithm', 'supervised learning',
'supervised learning algorithm'],
['given document', 'information retrieval environment', 'information retrieval',
'keywords', 'document relevance']]
english_keybert_keyphrases = [
['training examples', 'learning', 'training data', 'learning algorithm', 'supervised learning'],
['document content', 'information retrieval environment', 'information retrieval', 'keywords',
'document relevance']]

return english_keybert_keyphrases

0 comments on commit 2069eb1

Please sign in to comment.