-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix formula in gensim.summarization.bm25
. Fix #1828
#1833
Changes from 15 commits
f9803cb
80dc13d
848e5ee
38e2a0f
dc702de
ce290dc
d147d0c
77409dd
f1f65e2
de436bc
17ccbcc
6ec708c
575a24b
ef8e918
a2a4c61
fbf4fad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
"""This module contains function of computing rank scores for documents in | ||
corpus and helper class `BM25` used in calculations. Original alhorithm | ||
corpus and helper class `BM25` used in calculations. Original algorithm | ||
described in [1]_, also you may check Wikipedia page [2]_. | ||
|
||
|
||
|
@@ -61,7 +61,8 @@ class BM25(object): | |
Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values. | ||
idf : dict | ||
Dictionary with inverse document frequencies for whole `corpus`. Words used as keys and frequencies as values. | ||
|
||
doc_len : list of int | ||
List of document lengths. | ||
""" | ||
|
||
def __init__(self, corpus): | ||
|
@@ -78,12 +79,14 @@ def __init__(self, corpus): | |
self.f = [] | ||
self.df = {} | ||
self.idf = {} | ||
self.doc_len = [] | ||
self.initialize() | ||
|
||
def initialize(self): | ||
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies.""" | ||
for document in self.corpus: | ||
frequencies = {} | ||
self.doc_len.append(len(document)) | ||
for word in document: | ||
if word not in frequencies: | ||
frequencies[word] = 0 | ||
|
@@ -122,7 +125,7 @@ def get_score(self, document, index, average_idf): | |
continue | ||
idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf | ||
score += (idf * self.f[index][word] * (PARAM_K1 + 1) | ||
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * len(document) / self.avgdl))) | ||
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl))) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add several simple tests for BM25 |
||
return score | ||
|
||
def get_scores(self, document, average_idf): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Automated tests for checking the BM25 ranking function (the gensim.summarization.bm25 module). | ||
""" | ||
|
||
import logging | ||
import unittest | ||
|
||
from gensim.summarization.bm25 import get_bm25_weights | ||
from gensim.test.utils import common_texts | ||
|
||
|
||
class TestBM25(unittest.TestCase): | ||
def test_max_match_with_itself(self): | ||
""" Document should show maximum matching with itself """ | ||
weights = get_bm25_weights(common_texts) | ||
for index, doc_weights in enumerate(weights): | ||
expected = max(doc_weights) | ||
predicted = doc_weights[index] | ||
self.assertEqual(expected, predicted) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be |
||
|
||
def test_nonnegative_weights(self): | ||
""" All the weights for a particular document should be non-negative """ | ||
weights = get_bm25_weights(common_texts) | ||
for doc_weights in weights: | ||
for weight in doc_weights: | ||
self.assertTrue(weight >= 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
def test_same_match_with_same_document(self): | ||
""" A document should always get the same weight when matched with a particular document """ | ||
corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] | ||
weights = get_bm25_weights(corpus) | ||
self.assertEqual(weights[0][1], weights[0][2]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be |
||
|
||
def test_disjoint_docs_if_weight_zero(self): | ||
""" Two disjoint documents should have zero matching""" | ||
corpus = [['cat', 'dog', 'lion'], ['goat', 'fish', 'tiger']] | ||
weights = get_bm25_weights(corpus) | ||
self.assertTrue(weights[0][1] == 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be |
||
self.assertTrue(weights[1][0] == 0) | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(level=logging.DEBUG) | ||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove `Keras-2.1.2-py2.7.egg` from the PR (commenting here because I can't add a comment to a binary file).