From 848e5ee5c94b49b987e04eb59610adeab67d50a5 Mon Sep 17 00:00:00 2001
From: samyak jain
Date: Mon, 8 Jan 2018 19:00:11 +0530
Subject: [PATCH] Fixes #1828

---
Review note: the two added lines that use the cached document lengths
referenced a bare `doc_length`, which is not defined in `initialize` or
`get_score` (NameError at runtime). They now use `self.doc_length`, the
list created in `__init__`.

 gensim/summarization/bm25.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index 39caba1f80..ce673b970a 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -78,12 +78,14 @@ def __init__(self, corpus):
         self.f = []
         self.df = {}
         self.idf = {}
+        self.doc_length = []
         self.initialize()
 
     def initialize(self):
         """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
         for document in self.corpus:
             frequencies = {}
+            self.doc_length.append(len(document))
             for word in document:
                 if word not in frequencies:
                     frequencies[word] = 0
@@ -121,9 +123,8 @@ def get_score(self, document, index, average_idf):
             if word not in self.f[index]:
                 continue
             idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
-            doc_length = len(self.corpus[index])
             score += (idf * self.f[index][word] * (PARAM_K1 + 1)
-                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * doc_length / self.avgdl)))
+                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_length[index] / self.avgdl)))
         return score
 
     def get_scores(self, document, average_idf):