Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lazy formatting in evaluate_word_pairs #1084

Merged
merged 1 commit into from
Jan 10, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,19 +481,20 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c

@staticmethod
def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
logger.info('Pearson correlation coefficient against %s: %.4f' % (pairs, pearson[0]))
logger.info('Spearman rank-order correlation coefficient against %s: %.4f' % (pairs, spearman[0]))
logger.info('Pairs with unknown words ratio: %.1f%%' % oov)
logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
logger.info('Pairs with unknown words ratio: %.1f%%', oov)

def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
dummy4unknown=False):
"""
Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.

The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
between the similarities from the dataset and the similarities produced by the model itself. .
between the similarities from the dataset and the similarities produced by the model itself.
The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
Expand Down Expand Up @@ -532,7 +533,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
except:
logger.info('skipping invalid line #%d in %s' % (line_no, pairs))
logger.info('skipping invalid line #%d in %s', line_no, pairs)
continue
if a not in ok_vocab or b not in ok_vocab:
oov += 1
Expand All @@ -541,7 +542,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
similarity_gold.append(sim)
continue
else:
logger.debug('skipping line #%d with OOV words: %s' % (line_no, line.strip()))
logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
continue
similarity_gold.append(sim) # Similarity from the dataset
similarity_model.append(self.similarity(a, b)) # Similarity from the model
Expand All @@ -550,10 +551,14 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
pearson = stats.pearsonr(similarity_gold, similarity_model)
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

logger.debug('Pearson correlation coefficient against %s: %f with p-value %f'
% (pairs, pearson[0], pearson[1]))
logger.debug('Spearman rank-order correlation coefficient against %s: %f with p-value %f'
% (pairs, spearman[0], spearman[1]))
logger.debug(
'Pearson correlation coefficient against %s: %f with p-value %f',
pairs, pearson[0], pearson[1]
)
logger.debug(
'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
pairs, spearman[0], spearman[1]
)
logger.debug('Pairs with unknown words: %d' % oov)
self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
return pearson, spearman, oov_ratio
Expand Down