Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/stable 3 4 0/8710 address recommend by similarity performance #9258

44 changes: 20 additions & 24 deletions classes/search/SubmissionSearchIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ abstract class SubmissionSearchIndex
/**
* Split a string into a clean array of keywords
*
* @param string $text
* @param string|array $text
* @param bool $allowWildcards
*
* @return array of keywords
* @return string[] of keywords
*/
public function filterKeywords($text, $allowWildcards = false)
public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
{
$minLength = Config::getVar('search', 'min_word_length');
$stopwords = $this->_loadStopwords();
$stopwords = static::loadStopwords();

// Join multiple lines into a single string
if (is_array($text)) {
Expand All @@ -56,11 +56,11 @@ public function filterKeywords($text, $allowWildcards = false)

// FIXME Do not perform further filtering for some fields, e.g., author names?

// Remove stopwords
$keywords = [];
foreach ($words as $k) {
if (!isset($stopwords[$k]) && PKPString::strlen($k) >= $minLength && !is_numeric($k)) {
$keywords[] = PKPString::substr($k, 0, self::SEARCH_KEYWORD_MAX_LENGTH);
foreach ($words as $word) {
// Skip stop words, words shorter than $minLength (unless $allowShortWords is true) and numeric strings as defined by is_numeric() (unless $allowNumericWords is true)
if (empty($stopwords[$word]) && ($allowShortWords || PKPString::strlen($word) >= $minLength) && ($allowNumericWords || !is_numeric($word))) {
$keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
}
}
return $keywords;
Expand All @@ -70,26 +70,22 @@ public function filterKeywords($text, $allowWildcards = false)
* Return list of stopwords.
* FIXME: Should this be locale-specific?
*
* @return array with stopwords as keys
* @return array<string,int> Stop words (in lower case) as keys and 1 as value
*/
protected function _loadStopwords()
protected static function loadStopwords()
{
static $searchStopwords;

if (!isset($searchStopwords)) {
// Load stopwords only once per request
$searchStopwords = array_count_values(
array_filter(
array_map('trim', file(dirname(__FILE__, 5) . '/' . self::SEARCH_STOPWORDS_FILE)),
function ($a) {
return !empty($a) && $a[0] != '#';
}
)
);
$searchStopwords[''] = 1;
}

return $searchStopwords;
return $searchStopwords ??= array_fill_keys(
collect(file(base_path(static::SEARCH_STOPWORDS_FILE)))
->map(fn (string $word) => trim($word))
// Ignore comments/line-breaks
->filter(fn (string $word) => !empty($word) && $word[0] !== '#')
// Also register the empty string as a stop word, so empty tokens are dropped by filterKeywords()
->push('')
->toArray(),
1
);
}

/**
Expand Down
Loading