Skip to content

Commit

Permalink
[Spellcheck] Experimental settings for better sku/reference token ext…
Browse files Browse the repository at this point in the history
…raction
  • Loading branch information
rbayet committed Jul 4, 2023
1 parent 1a18705 commit f38e32c
Show file tree
Hide file tree
Showing 16 changed files with 293 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ public function getType();
*/
public function isSearchable();

/**
* Is the field searchable and contains reference (sku) data.
*
* @return bool
*/
public function isSearchableReference();

/**
* Is the field filterable in navigation.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ interface MappingInterface
const DEFAULT_SEARCH_FIELD = 'search';
const DEFAULT_SPELLING_FIELD = 'spelling';
const DEFAULT_AUTOCOMPLETE_FIELD = 'autocomplete';
const DEFAULT_REFERENCE_FIELD = 'reference';

/**
* List of the properties of the mapping.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,26 @@ public function getSpanMatchBoost();
* @return false|int
*/
public function getSpanSize();

/**
* Check if the reference collector field should be used instead of the simple 'sku' field
* when building the exact match filter query.
*
* @return bool True to use the reference collector field, false to keep using the 'sku' field.
*/
public function isUsingReferenceInExactMatchFilter();

/**
* Check if all tokens of the term vectors response should be used.
*
* @return bool True when every token of the term vectors response should be taken into account.
*/
public function isUsingAllTokens();

/**
* Check if the term vectors request should also include the reference analyzer collector field.
*
* @return bool True when the reference analyzer collector field should be added to the request.
*/
public function isUsingReferenceAnalyzer();
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,18 @@ public function getQueryText();
* @return float
*/
public function getCutoffFrequency();

/**
* Is the spellcheck request using all tokens returned by the term vectors.
*
* @return boolean True when all tokens of the term vectors response must be used.
*/
public function isUsingAllTokens();

/**
* Should the spellcheck request target the 'reference' collector field.
*
* @return boolean True when the 'reference' collector field must be part of the request.
*/
public function isUsingReference();
}
6 changes: 6 additions & 0 deletions src/module-elasticsuite-core/Index/Mapping.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ class Mapping implements MappingInterface
FieldInterface::ANALYZER_WHITESPACE,
FieldInterface::ANALYZER_SHINGLE,
],
self::DEFAULT_REFERENCE_FIELD => [
FieldInterface::ANALYZER_REFERENCE,
FieldInterface::ANALYZER_WHITESPACE,
FieldInterface::ANALYZER_SHINGLE,
],
];

/**
Expand All @@ -72,6 +77,7 @@ class Mapping implements MappingInterface
private $copyFieldMap = [
'isSearchable' => self::DEFAULT_SEARCH_FIELD,
'isUsedInSpellcheck' => self::DEFAULT_SPELLING_FIELD,
'isSearchableReference' => self::DEFAULT_REFERENCE_FIELD,
];

/**
Expand Down
8 changes: 8 additions & 0 deletions src/module-elasticsuite-core/Index/Mapping/Field.php
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ public function isSearchable(): bool
return (bool) $this->config['is_searchable'];
}

/**
 * Tell whether the field is searchable and uses the reference analyzer
 * as its default search analyzer.
 *
 * @return bool
 */
public function isSearchableReference(): bool
{
    // A field that is not searchable can never be a searchable reference.
    if (!$this->isSearchable()) {
        return false;
    }

    $defaultSearchAnalyzer = $this->config['default_search_analyzer'];

    return FieldInterface::ANALYZER_REFERENCE === $defaultSearchAnalyzer;
}

/**
* {@inheritdoc}
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ private function loadSpellingType(RequestInterface $request)
try {
$cutoffFrequencyLimit = $this->getCutoffrequencyLimit($request);
$termVectors = $this->getTermVectors($request);
$queryTermStats = $this->parseTermVectors($termVectors, $cutoffFrequencyLimit);
$queryTermStats = $this->parseTermVectors($termVectors, $cutoffFrequencyLimit, $request->isUsingAllTokens());

if ($queryTermStats['total'] == $queryTermStats['stop']) {
$spellingType = self::SPELLING_TYPE_PURE_STOPWORDS;
Expand Down Expand Up @@ -164,6 +164,11 @@ private function getTermVectors(RequestInterface $request)
],
];

if ($request->isUsingReference()) {
$doc['fields'][] = MappingInterface::DEFAULT_REFERENCE_FIELD . "." . FieldInterface::ANALYZER_REFERENCE;
$doc['doc'][MappingInterface::DEFAULT_REFERENCE_FIELD] = $request->getQueryText();
}

$docs = [];

// Compute the mtermvector query on all shards to ensure exhaustive results.
Expand All @@ -186,15 +191,18 @@ private function getTermVectors(RequestInterface $request)
* - missing : number of terms of the query not found into the index
* - standard : number of terms of the query found using the standard analyzer.
*
* @param array $termVectors The term vector query response.
* @param int $cutoffFrequencyLimit Cutoff freq (max absolute number of docs to consider term as a stopword).
* @SuppressWarnings(PHPMD.BooleanArgumentFlag)
*
* @param array $termVectors The term vector query response.
* @param int $cutoffFrequencyLimit Cutoff freq (max absolute number of docs to consider term as a stopword).
* @param boolean $useAllTokens Whether to use all tokens or not
*
* @return array
*/
private function parseTermVectors($termVectors, $cutoffFrequencyLimit)
private function parseTermVectors($termVectors, $cutoffFrequencyLimit, $useAllTokens = false)
{
$queryTermStats = ['stop' => 0, 'exact' => 0, 'standard' => 0, 'missing' => 0];
$statByPosition = $this->extractTermStatsByPosition($termVectors);
$statByPosition = $this->extractTermStatsByPosition($termVectors, $useAllTokens);

foreach ($statByPosition as $positionStat) {
$type = 'missing';
Expand All @@ -204,6 +212,8 @@ private function parseTermVectors($termVectors, $cutoffFrequencyLimit)
$type = 'stop';
} elseif (in_array(FieldInterface::ANALYZER_WHITESPACE, $positionStat['analyzers'])) {
$type = 'exact';
} elseif (in_array(FieldInterface::ANALYZER_REFERENCE, $positionStat['analyzers'])) {
$type = 'exact';
}
}
$queryTermStats[$type]++;
Expand All @@ -216,18 +226,20 @@ private function parseTermVectors($termVectors, $cutoffFrequencyLimit)

/**
* Extract term stats by position from a term vectors query response.
* Wil return an array of doc_freq, analayzers and term by position.
* Will return an array of doc_freq, analyzers and term by position.
*
* @SuppressWarnings(PHPMD.CyclomaticComplexity)
* @SuppressWarnings(PHPMD.BooleanArgumentFlag)
*
* @param array $termVectors The term vector query response.
* @param array $termVectors The term vector query response.
* @param boolean $useAllTokens Whether to use all tokens returned in the term vector response.
*
* @return array
*/
private function extractTermStatsByPosition($termVectors)
private function extractTermStatsByPosition($termVectors, $useAllTokens = false)
{
$statByPosition = [];
$analyzers = [FieldInterface::ANALYZER_STANDARD, FieldInterface::ANALYZER_WHITESPACE];
$analyzers = [FieldInterface::ANALYZER_STANDARD, FieldInterface::ANALYZER_WHITESPACE, FieldInterface::ANALYZER_REFERENCE];

if (is_array($termVectors) && isset($termVectors['docs'])) {
foreach ($termVectors['docs'] as $termVector) {
Expand All @@ -237,6 +249,9 @@ private function extractTermStatsByPosition($termVectors)
foreach ($fieldData['terms'] as $term => $termStats) {
foreach ($termStats['tokens'] as $token) {
$positionKey = $token['position'];
if ($useAllTokens) {
$positionKey = "{$token['position']}_{$token['start_offset']}_{$token['end_offset']}";
}

if (!isset($termStats['doc_freq'])) {
$termStats['doc_freq'] = 0;
Expand Down Expand Up @@ -266,7 +281,7 @@ private function extractTermStatsByPosition($termVectors)
}

/**
* Extract analayser from a mapping property name.
* Extract analyser from a mapping property name.
*
* @param string $propertyName Property name (eg. : search.whitespace)
*
Expand Down
2 changes: 2 additions & 0 deletions src/module-elasticsuite-core/Search/Request/Builder.php
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ private function getSpellingType(ContainerConfigurationInterface $containerConfi
'index' => $containerConfig->getIndexName(),
'queryText' => $queryText,
'cutoffFrequency' => $containerConfig->getRelevanceConfig()->getCutOffFrequency(),
'isUsingAllTokens' => $containerConfig->getRelevanceConfig()->isUsingAllTokens(),
'isUsingReference' => $containerConfig->getRelevanceConfig()->isUsingReferenceAnalyzer(),
];

$spellcheckRequest = $this->spellcheckRequestFactory->create($spellcheckRequestParams);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,42 @@ class RelevanceConfig implements RelevanceConfigurationInterface
*/
private $spanSize;

/**
* Whether the exact match filter query uses the reference collector field instead of the 'sku' field.
*
* @var boolean
*/
private $useReferenceInExactMatchFilter;

/**
* Whether all tokens of the term vectors response are taken into account.
*
* @var boolean
*/
private $useAllTokens;

/**
* Whether the term vectors request includes the collector field associated with the reference analyzer.
*
* @var boolean
*/
private $useReferenceAnalyzer;

/**
* RelevanceConfiguration constructor.
*
* @SuppressWarnings(PHPMD.BooleanArgumentFlag)
* @SuppressWarnings(PHPMD.ExcessiveParameterList)
*
* @param string $minimumShouldMatch Minimum should match clause of the text query.
* @param float $tieBreaker Tie breaker for multimatch queries.
* @param int|null $phraseMatchBoost The Phrase match boost value, or null if not
* enabled
* @param float $cutOffFrequency The cutoff Frequency value
* @param FuzzinessConfigurationInterface|null $fuzziness The fuzziness Configuration, or null
* @param boolean $enablePhoneticSearch The phonetic Configuration, or null
* @param int|null $spanMatchBoost The Span match boost value, or null if not
* enabled
* @param int|null $spanSize The number of terms to match in span queries
* @param string $minimumShouldMatch Minimum should match clause of the text query.
* @param float $tieBreaker Tie breaker for multimatch queries.
* @param int|null $phraseMatchBoost The Phrase match boost value, or null if not
* enabled
* @param float $cutOffFrequency The cutoff Frequency value
* @param FuzzinessConfigurationInterface|null $fuzziness The fuzziness Configuration, or null
* @param boolean $enablePhoneticSearch Whether phonetic search is enabled
* @param int|null $spanMatchBoost The Span match boost value, or null if not
* enabled
* @param int|null $spanSize The number of terms to match in span queries
* @param boolean $useReferenceInExactMatchFilter Whether to use the reference collector field instead of
* the 'sku' field in the exact match filter query
* @param boolean $useAllTokens Whether to take into account all term vectors tokens
* @param boolean $useReferenceAnalyzer Whether to include the collector field associated
* with the reference analyzer in term vectors request
*/
public function __construct(
$minimumShouldMatch,
Expand All @@ -89,7 +110,10 @@ public function __construct(
FuzzinessConfigurationInterface $fuzziness = null,
$enablePhoneticSearch = false,
$spanMatchBoost = null,
$spanSize = null
$spanSize = null,
$useReferenceInExactMatchFilter = false,
$useAllTokens = false,
$useReferenceAnalyzer = false
) {
$this->minimumShouldMatch = $minimumShouldMatch;
$this->tieBreaker = $tieBreaker;
Expand All @@ -99,6 +123,9 @@ public function __construct(
$this->enablePhoneticSearch = $enablePhoneticSearch;
$this->spanMatchBoost = $spanMatchBoost;
$this->spanSize = $spanSize;
$this->useReferenceInExactMatchFilter = $useReferenceInExactMatchFilter;
$this->useAllTokens = $useAllTokens;
$this->useReferenceAnalyzer = $useReferenceAnalyzer;
}

/**
Expand Down Expand Up @@ -178,4 +205,28 @@ public function getSpanSize()
{
return (int) $this->spanSize;
}

/**
 * Tell whether the exact match filter query should rely on the reference
 * collector field rather than on the 'sku' field.
 *
 * @return bool
 */
public function isUsingReferenceInExactMatchFilter()
{
    $flag = $this->useReferenceInExactMatchFilter;

    return boolval($flag);
}

/**
 * Tell whether all tokens of the term vectors response should be used.
 *
 * @return bool
 */
public function isUsingAllTokens()
{
    $flag = $this->useAllTokens;

    return boolval($flag);
}

/**
 * Tell whether the term vectors request should also include the collector
 * field associated with the reference analyzer.
 *
 * @return bool
 */
public function isUsingReferenceAnalyzer()
{
    $flag = $this->useReferenceAnalyzer;

    return boolval($flag);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ class Factory
*/
const SPAN_MATCH_CONFIG_XML_PREFIX = 'span_match_configuration';

/**
* XML node for using reference in the exact match filter query
*/
const EXACT_MATCH_USE_REFERENCE_IN_FILTER_XML_PATH = 'exact_match_configuration/use_reference_in_filter';

/**
* XML node for tokens usage in term vectors configuration.
*/
const TERM_VECTORS_TOKENS_CONFIG_XML_PATH = 'spellchecking/term_vectors/use_all_tokens';

/**
* XML node for reference analyzer usage in term vectors configuration.
*/
const TERM_VECTORS_USE_REFERENCE_CONFIG_XML_PATH = 'spellchecking/term_vectors/use_reference_analyzer';

/**
* @var RelevanceConfigurationInterface[]
*/
Expand Down Expand Up @@ -141,6 +156,9 @@ protected function loadConfiguration($scopeCode)
'enablePhoneticSearch' => $this->isPhoneticSearchEnabled($scopeCode),
'spanMatchBoost' => $this->getSpanMatchBoostConfiguration($scopeCode),
'spanSize' => $this->getSpanSize($scopeCode),
'useReferenceInExactMatchFilter' => $this->isUsingReferenceInExactMatchFilter($scopeCode),
'useAllTokens' => $this->isUsingAllTokensConfiguration($scopeCode),
'useReferenceAnalyzer' => $this->isUsingReferenceAnalyzerConfiguration($scopeCode),
];

return $configurationParams;
Expand Down Expand Up @@ -328,4 +346,42 @@ private function getSpanSize($scopeCode)

return $size;
}

/**
 * Read the configuration flag telling whether the exact match filter query
 * should use the reference collector field.
 *
 * NOTE(review): this lookup prefixes its path with BASE_RELEVANCE_CONFIG_XML_PREFIX
 * while the term vectors lookups in this class pass their path unprefixed;
 * confirm against getConfigValue() that both forms resolve as intended.
 *
 * @param string $scopeCode The scope code
 *
 * @return bool
 */
private function isUsingReferenceInExactMatchFilter($scopeCode)
{
    $configPath = sprintf(
        '%s/%s',
        self::BASE_RELEVANCE_CONFIG_XML_PREFIX,
        self::EXACT_MATCH_USE_REFERENCE_IN_FILTER_XML_PATH
    );
    $rawValue = $this->getConfigValue($configPath, $scopeCode);

    return (bool) $rawValue;
}

/**
 * Read the configuration flag telling whether all tokens of the term vectors
 * response should be used.
 *
 * @param string $scopeCode The scope code
 *
 * @return bool
 */
private function isUsingAllTokensConfiguration($scopeCode)
{
    $rawValue = $this->getConfigValue(self::TERM_VECTORS_TOKENS_CONFIG_XML_PATH, $scopeCode);

    return (bool) $rawValue;
}

/**
 * Read the configuration flag telling whether the reference analyzer should be
 * used for term vectors.
 *
 * @param string $scopeCode The scope code
 *
 * @return bool
 */
private function isUsingReferenceAnalyzerConfiguration($scopeCode)
{
    $rawValue = $this->getConfigValue(self::TERM_VECTORS_USE_REFERENCE_CONFIG_XML_PATH, $scopeCode);

    return (bool) $rawValue;
}
}
Loading

0 comments on commit f38e32c

Please sign in to comment.