Skip to content

Commit

Permalink
[BUGFIX] Fix autosuggest with non-ascii terms
Browse files Browse the repository at this point in the history
The introduced ASCII folding filters or language-dependent normalization
filters lead to issues with the auto suggest function due to the
differing stemming behaviour.

To fix this issue, the original token is preserved if possible. This
allows, for example, suggestions for search terms with and without
accents. As this extension might lead to unwanted duplicates, a new field
textSpellExact is introduced, which considers non-ASCII characters as given.

Resolves: TYPO3-Solr#3096
  • Loading branch information
dkd-friedrich committed Dec 8, 2021
1 parent 126584e commit 829fe67
Show file tree
Hide file tree
Showing 43 changed files with 891 additions and 24 deletions.
2 changes: 2 additions & 0 deletions Documentation/Appendix/DynamicFieldTypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ Extension Type Multivalue Comment
\*_textExactM textExact Yes
\*_textSpellS textSpell No
\*_textSpellM textSpell Yes
\*_textSpellExactS textSpellExact No
\*_textSpellExactM textSpellExact Yes
\*_phoneticS Phonetic No
\*_phoneticM Phonetic Yes
\*_point point No
Expand Down
3 changes: 3 additions & 0 deletions Documentation/Configuration/Reference/TxSolrSuggest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ suggestField

Sets the Solr index field used to get suggestions from. A general advice is to use a field without stemming on it. For practical reasons this is currently the spell checker field.

Note: With EXT:solr 11.1.0, ASCII folding and language-dependent normalization filters were introduced, but due to the special behaviour of the auto suggestions, non-ASCII terms were not treated correctly. Since 11.1.3 the untouched tokens are kept as well. As this might lead to duplicate
suggestions, a new field for exact suggestions was introduced: if you want to avoid duplicates and use stricter suggestions, just configure `spellExact` as the suggest field.

forceHttps
----------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,4 +202,30 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding is applied.
     Arabic normalization and stemming are still applied on both the index and the query side. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, normalize and stem -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>

        <filter class="solr.ArabicNormalizationFilterFactory"/>
        <filter class="solr.ArabicStemFilterFactory"/>
        <!-- duplicate removal runs last, as in the query analyzer, so it sees the final (stemmed) tokens -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>

        <filter class="solr.ArabicNormalizationFilterFactory"/>
        <filter class="solr.ArabicStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -169,4 +169,23 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,26 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>
</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,27 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,29 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given.
     Bulgarian stemming is still applied on both the index and the query side. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, stem, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.BulgarianStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.BulgarianStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,17 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: ICU tokenization only, no folding or lowercasing filters. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, expand managed synonyms, then flatten the synonym graph for indexing -->
    <!-- NOTE(review): synonyms are expanded at index time here, so they may show up as spell suggestions; confirm this is intended -->
    <analyzer type="index">
        <tokenizer class="solr.ICUTokenizerFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.FlattenGraphFilterFactory"/>
    </analyzer>
    <!-- query side: tokenize and expand managed synonyms -->
    <analyzer type="query">
        <tokenizer class="solr.ICUTokenizerFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,27 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>

</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,21 @@
<filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}" />
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, apply CJK width filter, build CJK bigrams (no unigrams), lowercase -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.CJKWidthFilterFactory"/>
        <filter class="solr.CJKBigramFilterFactory"
                han="true"
                hiragana="true"
                katakana="true"
                hangul="true"
                outputUnigrams="false"/>
        <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, followed by managed synonyms -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.CJKWidthFilterFactory"/>
        <filter class="solr.CJKBigramFilterFactory"
                han="true"
                hiragana="true"
                katakana="true"
                hangul="true"
                outputUnigrams="false"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
    </analyzer>
</fieldType>
</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,26 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given.
     Czech stemming is still applied on both the index and the query side. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, stem, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.CzechStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.CzechStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>
</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,24 @@
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>
</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@
<filter class="solr.LowerCaseFilterFactory"/>

<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
Expand All @@ -186,9 +186,29 @@

<filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}" />
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>
</schema>
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,30 @@

<!-- no synonyms here because we do not want to add them as spell suggestion -->
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory" />

<filter class="solr.LowerCaseFilterFactory"/>

<filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}" />
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

<!-- Setup simple analysis for more exact spell checking, considers non-ascii characters as they are -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>

<filter class="solr.LowerCaseFilterFactory"/>

<!-- no synonyms here because we do not want to add them as spell suggestion -->
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
Expand All @@ -187,7 +210,6 @@

<filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}" />
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@
<filter class="solr.LowerCaseFilterFactory"/>

<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
Expand All @@ -186,9 +186,29 @@

<filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}" />
<filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>

<!-- Simple analysis for more exact spell checking: no ASCII folding, so non-ASCII characters are kept as given. -->
<fieldType name="textSpellExact" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
    <!-- index side: tokenize, lowercase, drop managed stop words, remove duplicate tokens -->
    <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
    <!-- query side: same chain, plus managed synonyms applied before stop word removal -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ManagedSynonymGraphFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.ManagedStopFilterFactory" managed="${solr.core.name}"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
</fieldType>
</schema>
Loading

0 comments on commit 829fe67

Please sign in to comment.