-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Dictionary save_as_text method #56 + fix lint errors #1402
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -120,35 +120,34 @@ def testFilter(self): | |
d.filter_extremes(no_below=2, no_above=1.0, keep_n=4) | ||
expected = {0: 3, 1: 3, 2: 3, 3: 3} | ||
self.assertEqual(d.dfs, expected) | ||
|
||
def testFilterKeepTokens_keepTokens(self): | ||
# provide keep_tokens argument, keep the tokens given | ||
d = Dictionary(self.texts) | ||
d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey']) | ||
expected = set(['graph', 'trees', 'human', 'system', 'user', 'survey']) | ||
self.assertEqual(set(d.token2id.keys()), expected) | ||
|
||
def testFilterKeepTokens_unchangedFunctionality(self): | ||
# do not provide keep_tokens argument, filter_extremes functionality is unchanged | ||
d = Dictionary(self.texts) | ||
d.filter_extremes(no_below=3, no_above=1.0) | ||
expected = set(['graph', 'trees', 'system', 'user']) | ||
self.assertEqual(set(d.token2id.keys()), expected) | ||
|
||
def testFilterKeepTokens_unseenToken(self): | ||
# do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged | ||
d = Dictionary(self.texts) | ||
d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token']) | ||
expected = set(['graph', 'trees', 'system', 'user']) | ||
self.assertEqual(set(d.token2id.keys()), expected) | ||
self.assertEqual(set(d.token2id.keys()), expected) | ||
|
||
def testFilterMostFrequent(self): | ||
d = Dictionary(self.texts) | ||
d.filter_n_most_frequent(4) | ||
expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2} | ||
self.assertEqual(d.dfs, expected) | ||
|
||
|
||
d = Dictionary(self.texts) | ||
d.filter_n_most_frequent(4) | ||
expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2} | ||
self.assertEqual(d.dfs, expected) | ||
|
||
def testFilterTokens(self): | ||
self.maxDiff = 10000 | ||
d = Dictionary(self.texts) | ||
|
@@ -157,16 +156,15 @@ def testFilterTokens(self): | |
d.filter_tokens([0]) | ||
|
||
expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1, | ||
'interface': 2, 'minors': 11, 'response': 3, 'survey': 4, | ||
'system': 5, 'time': 6, 'trees': 9, 'user': 7} | ||
'interface': 2, 'minors': 11, 'response': 3, 'survey': 4, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use hanging indent (no vertical) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. |
||
'system': 5, 'time': 6, 'trees': 9, 'user': 7} | ||
del expected[removed_word] | ||
self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys())) | ||
|
||
expected[removed_word] = len(expected) | ||
d.add_documents([[removed_word]]) | ||
self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys())) | ||
|
||
|
||
def test_doc2bow(self): | ||
d = Dictionary([["žluťoučký"], ["žluťoučký"]]) | ||
|
||
|
@@ -179,6 +177,58 @@ def test_doc2bow(self): | |
# unicode must be converted to utf8 | ||
self.assertEqual(d.doc2bow([u'\u017elu\u0165ou\u010dk\xfd']), [(0, 1)]) | ||
|
||
def test_saveAsText(self): | ||
"""`Dictionary` can be saved as textfile. """ | ||
tmpf = get_tmpfile('save_dict_test.txt') | ||
small_text = [["prvé", "slovo"], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hanging indent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. |
||
["slovo", "druhé"], | ||
["druhé", "slovo"]] | ||
|
||
d = Dictionary(small_text) | ||
|
||
d.save_as_text(tmpf) | ||
with open(tmpf) as file: | ||
serialized_lines = file.readlines() | ||
self.assertEqual(serialized_lines[0], "3\n") | ||
self.assertEqual(len(serialized_lines), 4) | ||
# We do not know, which word will have which index | ||
self.assertEqual(serialized_lines[1][1:], "\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[2][1:], "\tprvé\t1\n") | ||
self.assertEqual(serialized_lines[3][1:], "\tslovo\t3\n") | ||
|
||
d.save_as_text(tmpf, sort_by_word=False) | ||
with open(tmpf) as file: | ||
serialized_lines = file.readlines() | ||
self.assertEqual(serialized_lines[0], "3\n") | ||
self.assertEqual(len(serialized_lines), 4) | ||
self.assertEqual(serialized_lines[1][1:], "\tslovo\t3\n") | ||
self.assertEqual(serialized_lines[2][1:], "\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[3][1:], "\tprvé\t1\n") | ||
|
||
def test_loadFromText(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add case with old file format There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The first part is for format, the second is for new. I create two different functions for better verbosity. |
||
tmpf = get_tmpfile('load_dict_test.txt') | ||
no_num_docs_serialization = "1\tprvé\t1\n2\tslovo\t2\n" | ||
with open(tmpf, "w") as file: | ||
file.write(no_num_docs_serialization) | ||
|
||
d = Dictionary.load_from_text(tmpf) | ||
self.assertEqual(d.token2id[u"prvé"], 1) | ||
self.assertEqual(d.token2id[u"slovo"], 2) | ||
self.assertEqual(d.dfs[1], 1) | ||
self.assertEqual(d.dfs[2], 2) | ||
self.assertEqual(d.num_docs, 0) | ||
|
||
no_num_docs_serialization = "2\n1\tprvé\t1\n2\tslovo\t2\n" | ||
with open(tmpf, "w") as file: | ||
file.write(no_num_docs_serialization) | ||
|
||
d = Dictionary.load_from_text(tmpf) | ||
self.assertEqual(d.token2id[u"prvé"], 1) | ||
self.assertEqual(d.token2id[u"slovo"], 2) | ||
self.assertEqual(d.dfs[1], 1) | ||
self.assertEqual(d.dfs[2], 2) | ||
self.assertEqual(d.num_docs, 2) | ||
|
||
def test_saveAsText_and_loadFromText(self): | ||
"""`Dictionary` can be saved as textfile and loaded again from textfile. """ | ||
tmpf = get_tmpfile('dict_test.txt') | ||
|
@@ -195,23 +245,23 @@ def test_from_corpus(self): | |
"""build `Dictionary` from an existing corpus""" | ||
|
||
documents = ["Human machine interface for lab abc computer applications", | ||
"A survey of user opinion of computer system response time", | ||
"The EPS user interface management system", | ||
"System and human system engineering testing of EPS", | ||
"Relation of user perceived response time to error measurement", | ||
"The generation of random binary unordered trees", | ||
"The intersection graph of paths in trees", | ||
"Graph minors IV Widths of trees and well quasi ordering", | ||
"Graph minors A survey"] | ||
"A survey of user opinion of computer system response time", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hanging indent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. |
||
"The EPS user interface management system", | ||
"System and human system engineering testing of EPS", | ||
"Relation of user perceived response time to error measurement", | ||
"The generation of random binary unordered trees", | ||
"The intersection graph of paths in trees", | ||
"Graph minors IV Widths of trees and well quasi ordering", | ||
"Graph minors A survey"] | ||
stoplist = set('for a of the and to in'.split()) | ||
texts = [[word for word in document.lower().split() if word not in stoplist] | ||
for document in documents] | ||
for document in documents] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hanging indent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. |
||
|
||
# remove words that appear only once | ||
all_tokens = sum(texts, []) | ||
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) | ||
texts = [[word for word in text if word not in tokens_once] | ||
for text in texts] | ||
for text in texts] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hanging indent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. It actually fits on one line. |
||
|
||
dictionary = Dictionary(texts) | ||
corpus = [dictionary.doc2bow(text) for text in texts] | ||
|
@@ -260,7 +310,7 @@ def test_dict_interface(self): | |
self.assertTrue(isinstance(d.keys(), list)) | ||
self.assertTrue(isinstance(d.values(), list)) | ||
|
||
#endclass TestDictionary | ||
# endclass TestDictionary | ||
|
||
|
||
if __name__ == '__main__': | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really a warning?
There's no ambiguity (3 columns vs 1 column), so I think INFO (or even DEBUG) should be sufficient.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The warning is not about number ambiguity, but about the fact that you are using old serialization format.
I put there a warning, because it may cause you errors somewhere down the pipeline (as mentioned by the original issue #56). If it does not make sense, I can change it, but I think warning should be the best.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps, but the current message doesn't help the users.
What "text"? Why should it contain "num_docs"? What should the user do about it?
This should be a backward compatible change, so "errors down the pipeline" are not acceptable. CC @menshikh-iv
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something like: "This dictionary was serialized using old format. Please set the dictionary.num_docs manually or some dictionary methods may not work as intended."