Add diversity calculation.

- Added a function to calculate the diversity of words in titles based on the Corrected Type-Token Ratio (CTTR) formula. - Updated the name creation function to use a shared pattern for word matching. - Modified the bundle creation method to include a diversity check before grouping items.
Tribler · Apr 4, 2024 · a7edd17 · a7edd17
1 parent 6166c21
commit a7edd17
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 6 deletions.
diff --git a/src/tribler/core/components/knowledge/rules/content_bundling.py b/src/tribler/core/components/knowledge/rules/content_bundling.py
@@ -1,12 +1,18 @@
 import logging
+import math
 import re
 from collections import defaultdict
+from itertools import chain
 from typing import Dict, Iterable, List
 
 logger = logging.getLogger('ContentBundling')
 
 
-def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3) -> str:
+def _words_pattern(min_word_length: int = 3) -> str:
+    return r'[^\W\d_]{' + str(min_word_length) + ',}'
+
+
+def _create_name(content_list: List[Dict], number: str, min_word_length: int = 4) -> str:
     """
     Create a name for a group of content items based on the most common word in the title.
     If several most frequently occurring words are found, preference is given to the longest word.
@@ -23,7 +29,7 @@ def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3
     words = defaultdict(int)
     for item in content_list:
         if name := item.get('name'):
-            pattern = r'[^\W\d_]{' + str(min_word_length) + ',}'
+            pattern = _words_pattern(min_word_length)
             title_words = {w.lower() for w in re.findall(pattern, name) if w}
             for word in title_words:
                 words[word] += 1
@@ -36,6 +42,30 @@ def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3
     return name[0].capitalize() + name[1:]
 
 
+def calculate_diversity(content_list: Iterable[Dict], min_word_length: int = 4) -> float:
+    """
+    Calculate the diversity of words in the titles of the content list.
+    The diversity calculation based on Corrected Type-Token Ratio (CTTR) formula.
+
+    Args:
+        content_list: list of content items. Each item should have a 'name' key with a title.
+        min_word_length: minimum word length to be considered as a word in the title.
+
+    Returns:
+        float: diversity of words in the titles
+    """
+    pattern = _words_pattern(min_word_length)
+    titles = (title.get('name') for title in content_list)
+    words_in_titles = (re.findall(pattern, title) for title in titles)
+    words = [w.lower() for w in chain.from_iterable(words_in_titles) if w]
+    total_words = len(words)
+    if total_words == 0:
+        return 0
+    unique_words = set(words)
+
+    return len(unique_words) / math.sqrt(2 * total_words)
+
+
 def group_content_by_number(content_list: Iterable[Dict], min_group_size=2) -> Dict[str, List[Dict]]:
     """
     Group content by the first number in the title. Returned groups keep the order in which it was found in the input.

diff --git a/src/tribler/core/components/knowledge/rules/tests/test_content_bundling.py b/src/tribler/core/components/knowledge/rules/tests/test_content_bundling.py
@@ -1,6 +1,7 @@
 import pytest
 
-from tribler.core.components.knowledge.rules.content_bundling import _create_name, group_content_by_number
+from tribler.core.components.knowledge.rules.content_bundling import _create_name, calculate_diversity, \
+    group_content_by_number
 
 
 def test_group_content_by_number_empty_list():
@@ -65,3 +66,36 @@ def test_create_name_non_latin():
     ]
 
     assert _create_name(content_list, '11') == 'Один 11'
+
+
+DIVERSITY_BY_WORD_LENGTH = [
+    # (min_word_length, expected diversity) for the title 'word wor wo w'; compared with abs=0.1
+    (3, 1),
+    (2, 1.2),
+    (1, 1.4),
+    (0, 1.4),
+]
+
+
+@pytest.mark.parametrize('min_word_length, diversity', DIVERSITY_BY_WORD_LENGTH)
+def test_calculate_diversity_min_word_len(min_word_length, diversity):
+    # Test that calculate_diversity calculates diversity based on the minimum word length.
+    content_list = [{'name': 'word wor wo w'}]
+    assert calculate_diversity(content_list, min_word_length) == pytest.approx(diversity, abs=0.1)
+
+
+DIVERSITY_EXAMPLES = [
+    # (text, expected diversity) with min_word_length=3; compared with abs=0.01
+    ('The', 0.7),
+    ('The quick', 1),
+    ('The quick brown', 1.22),
+    ('The quick brown the', 1.06),
+    ('The quick brown the quick', 0.94),
+]
+
+
+@pytest.mark.parametrize('text, diversity', DIVERSITY_EXAMPLES)
+def test_calculate_diversity(text, diversity):
+    # Test that calculate_diversity calculates diversity based on the text.
+    content_list = [{'name': text}]
+    assert calculate_diversity(content_list, min_word_length=3) == pytest.approx(diversity, abs=0.01)
diff --git a/src/tribler/gui/widgets/search_results_model.py b/src/tribler/gui/widgets/search_results_model.py
@@ -1,6 +1,6 @@
 from typing import Dict, List
 
-from tribler.core.components.knowledge.rules.content_bundling import group_content_by_number
+from tribler.core.components.knowledge.rules.content_bundling import calculate_diversity, group_content_by_number
 from tribler.core.components.metadata_store.db.serialization import SNIPPET
 from tribler.gui.widgets.tablecontentmodel import ChannelContentModel, get_item_uid
 
@@ -61,8 +61,7 @@ def show_remote_results(self):
         if remote_items:
             self.add_items(remote_items, remote=True)
 
-    @staticmethod
-    def create_bundles(content_list: List[Dict], filter_zero_seeders=True, min_bundle_size=2) -> List[Dict]:
+    def create_bundles(self, content_list: List[Dict], filter_zero_seeders=True, min_bundle_size=3) -> List[Dict]:
         """
         Create bundles from the content list. Each bundle contains at least min_bundle_size items.
         Args:
@@ -73,6 +72,11 @@ def create_bundles(content_list: List[Dict], filter_zero_seeders=True, min_bundl
         Returns:
             list: list of content items and bundles
         """
+        diversity = calculate_diversity(content_list)
+        self._logger.info(f'Diversity: {diversity}')
+        if diversity > 6:  # 6 is a threshold found empirically
+            self._logger.info('Diversity is higher than 6. Bundling is disabled.')
+            return content_list
         groups = group_content_by_number(content_list, min_bundle_size)
 
         result = []