Skip to content

Commit

Permalink
Add diversity calculation.
Browse files Browse the repository at this point in the history
- Added a function to calculate the diversity of words in titles, based on the Corrected Type-Token Ratio (CTTR) formula.
- Updated name creation function to use a shared pattern for word matching.
- Modified bundle creation method to include diversity check before grouping items.
  • Loading branch information
drew2a committed Apr 4, 2024
1 parent 6166c21 commit a7edd17
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 6 deletions.
34 changes: 32 additions & 2 deletions src/tribler/core/components/knowledge/rules/content_bundling.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import logging
import math
import re
from collections import defaultdict
from itertools import chain
from typing import Dict, Iterable, List

logger = logging.getLogger('ContentBundling')


def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3) -> str:
def _words_pattern(min_word_length: int = 3) -> str:
return r'[^\W\d_]{' + str(min_word_length) + ',}'


def _create_name(content_list: List[Dict], number: str, min_word_length: int = 4) -> str:
"""
Create a name for a group of content items based on the most common word in the title.
If several most frequently occurring words are found, preference is given to the longest word.
Expand All @@ -23,7 +29,7 @@ def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3
words = defaultdict(int)
for item in content_list:
if name := item.get('name'):
pattern = r'[^\W\d_]{' + str(min_word_length) + ',}'
pattern = _words_pattern(min_word_length)
title_words = {w.lower() for w in re.findall(pattern, name) if w}
for word in title_words:
words[word] += 1
Expand All @@ -36,6 +42,30 @@ def _create_name(content_list: List[Dict], number: str, min_word_length: int = 3
return name[0].capitalize() + name[1:]


def calculate_diversity(content_list: Iterable[Dict], min_word_length: int = 4) -> float:
    """
    Calculate the diversity of words in the titles of the content list.

    The calculation is based on the Corrected Type-Token Ratio (CTTR) formula:
    ``unique_words / sqrt(2 * total_words)``. Words are compared case-insensitively.

    Args:
        content_list: content items. Each item is expected to have a 'name' key with a title;
            items whose 'name' is missing or empty are ignored.
        min_word_length: minimum word length to be considered as a word in the title.

    Returns:
        float: diversity of words in the titles, or 0.0 if no words were found.
    """
    pattern = _words_pattern(min_word_length)
    # Skip items without a title (same guard as in `_create_name`): passing None to
    # `re.findall` would raise a TypeError.
    titles = (name for item in content_list if (name := item.get('name')))
    words_in_titles = (re.findall(pattern, title) for title in titles)
    # `if w` drops the empty matches produced when min_word_length is 0.
    words = [w.lower() for w in chain.from_iterable(words_in_titles) if w]
    total_words = len(words)
    if total_words == 0:
        return 0.0

    unique_words = set(words)

    return len(unique_words) / math.sqrt(2 * total_words)


def group_content_by_number(content_list: Iterable[Dict], min_group_size=2) -> Dict[str, List[Dict]]:
"""
Group content by the first number in the title. Returned groups keep the order in which it was found in the input.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from tribler.core.components.knowledge.rules.content_bundling import _create_name, group_content_by_number
from tribler.core.components.knowledge.rules.content_bundling import _create_name, calculate_diversity, \
group_content_by_number


def test_group_content_by_number_empty_list():
Expand Down Expand Up @@ -65,3 +66,36 @@ def test_create_name_non_latin():
]

assert _create_name(content_list, '11') == 'Один 11'


DIVERSITY_BY_WORD_LENGTH = [
    # (min_word_length, diversity)
    (3, 1),
    (2, 1.2),
    (1, 1.4),
    (0, 1.4),
]


@pytest.mark.parametrize('min_word_length, diversity', DIVERSITY_BY_WORD_LENGTH)
def test_calculate_diversity_min_word_len(min_word_length, diversity):
    # The title holds words of length 4, 3, 2 and 1, so lowering the minimum word length
    # lets more of them count towards the diversity.
    title = 'word wor wo w'
    expected = pytest.approx(diversity, abs=0.1)

    actual = calculate_diversity([{'name': title}], min_word_length)

    assert actual == expected


DIVERSITY_EXAMPLES = [
    # (text, diversity)
    ('The', 0.7),
    ('The quick', 1),
    ('The quick brown', 1.22),
    ('The quick brown the', 1.06),
    ('The quick brown the quick', 0.94),
]


@pytest.mark.parametrize('text, diversity', DIVERSITY_EXAMPLES)
def test_calculate_diversity(text, diversity):
    # Repeated words ('the', 'quick') add to the total word count but not to the number
    # of unique words, so the expected diversity drops for the last two examples.
    expected = pytest.approx(diversity, abs=0.01)

    actual = calculate_diversity([{'name': text}], min_word_length=3)

    assert actual == expected
10 changes: 7 additions & 3 deletions src/tribler/gui/widgets/search_results_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, List

from tribler.core.components.knowledge.rules.content_bundling import group_content_by_number
from tribler.core.components.knowledge.rules.content_bundling import calculate_diversity, group_content_by_number
from tribler.core.components.metadata_store.db.serialization import SNIPPET
from tribler.gui.widgets.tablecontentmodel import ChannelContentModel, get_item_uid

Expand Down Expand Up @@ -61,8 +61,7 @@ def show_remote_results(self):
if remote_items:
self.add_items(remote_items, remote=True)

@staticmethod
def create_bundles(content_list: List[Dict], filter_zero_seeders=True, min_bundle_size=2) -> List[Dict]:
def create_bundles(self, content_list: List[Dict], filter_zero_seeders=True, min_bundle_size=3) -> List[Dict]:
"""
Create bundles from the content list. Each bundle contains at least min_bundle_size items.
Args:
Expand All @@ -73,6 +72,11 @@ def create_bundles(content_list: List[Dict], filter_zero_seeders=True, min_bundl
Returns:
list: list of content items and bundles
"""
diversity = calculate_diversity(content_list)
self._logger.info(f'Diversity: {diversity}')
if diversity > 6: # 6 is a threshold found empirically
self._logger.info('Diversity is higher than 6. Bundling is disabled.')
return content_list
groups = group_content_by_number(content_list, min_bundle_size)

result = []
Expand Down

0 comments on commit a7edd17

Please sign in to comment.