Skip to content

Commit

Permalink
Merge pull request #9258 from jonasraoni/feature/stable-3_4_0/8710-address-recommend-by-similarity-performance
Browse files Browse the repository at this point in the history

Feature/stable 3 4 0/8710 address recommend by similarity performance
  • Loading branch information
jonasraoni authored Aug 25, 2023
2 parents 96791cd + f477ec8 commit 18ac7b0
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 101 deletions.
44 changes: 20 additions & 24 deletions classes/search/SubmissionSearchIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ abstract class SubmissionSearchIndex
/**
* Split a string into a clean array of keywords
*
* @param string $text
* @param string|array $text
* @param bool $allowWildcards
*
* @return array of keywords
* @return string[] of keywords
*/
public function filterKeywords($text, $allowWildcards = false)
public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
{
$minLength = Config::getVar('search', 'min_word_length');
$stopwords = $this->_loadStopwords();
$stopwords = static::loadStopwords();

// Join multiple lines into a single string
if (is_array($text)) {
Expand All @@ -56,11 +56,11 @@ public function filterKeywords($text, $allowWildcards = false)

// FIXME Do not perform further filtering for some fields, e.g., author names?
// Remove stopwords
$keywords = [];
foreach ($words as $k) {
if (!isset($stopwords[$k]) && PKPString::strlen($k) >= $minLength && !is_numeric($k)) {
$keywords[] = PKPString::substr($k, 0, self::SEARCH_KEYWORD_MAX_LENGTH);
foreach ($words as $word) {
// Ignores: stop words, short words (when $allowShortWords is false) and words composed solely of numbers (when $allowNumericWords is false)
if (empty($stopwords[$word]) && ($allowShortWords || PKPString::strlen($word) >= $minLength) && ($allowNumericWords || !is_numeric($word))) {
$keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
}
}
return $keywords;
Expand All @@ -70,26 +70,22 @@ public function filterKeywords($text, $allowWildcards = false)
* Return list of stopwords.
* FIXME: Should this be locale-specific?
*
* @return array with stopwords as keys
* @return array<string,int> Stop words (in lower case) as keys and 1 as value
*/
protected function _loadStopwords()
protected static function loadStopwords()
{
static $searchStopwords;

if (!isset($searchStopwords)) {
// Load stopwords only once per request
$searchStopwords = array_count_values(
array_filter(
array_map('trim', file(dirname(__FILE__, 5) . '/' . self::SEARCH_STOPWORDS_FILE)),
function ($a) {
return !empty($a) && $a[0] != '#';
}
)
);
$searchStopwords[''] = 1;
}

return $searchStopwords;
return $searchStopwords ??= array_fill_keys(
collect(file(base_path(static::SEARCH_STOPWORDS_FILE)))
->map(fn (string $word) => trim($word))
// Ignore comments/line-breaks
->filter(fn (string $word) => !empty($word) && $word[0] !== '#')
// Include a map for empty words
->push('')
->toArray(),
1
);
}

/**
Expand Down
Loading

0 comments on commit 18ac7b0

Please sign in to comment.