-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Add optional ICU support for user search #14464
Changes from 11 commits
2de725d
cd08454
7ea4b05
131b30f
b25b534
e4579d7
4744982
f6d79aa
7806e21
d632735
a6770a3
a6a3573
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Improve user search for international display names. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,6 +97,8 @@ RUN \ | |
zlib1g-dev \ | ||
git \ | ||
curl \ | ||
libicu-dev \ | ||
pkg-config \ | ||
&& rm -rf /var/lib/apt/lists/* | ||
|
||
|
||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Copyright 2022 The Matrix.org Foundation C.I.C. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Stub for PyICU. | ||
|
||
class Locale: | ||
@staticmethod | ||
def getDefault() -> Locale: ... | ||
|
||
class BreakIterator: | ||
@staticmethod | ||
def createWordInstance(locale: Locale) -> BreakIterator: ... | ||
def setText(self, text: str) -> None: ... | ||
def nextBoundary(self) -> int: ... |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,14 @@ | |
cast, | ||
) | ||
|
||
try: | ||
# Figure out if ICU support is available for searching users. | ||
import icu | ||
|
||
USE_ICU = True | ||
except ModuleNotFoundError: | ||
USE_ICU = False | ||
|
||
from typing_extensions import TypedDict | ||
|
||
from synapse.api.errors import StoreError | ||
|
@@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str: | |
""" | ||
|
||
# Pull out the individual words, discarding any non-word characters. | ||
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) | ||
results = _parse_words(search_term) | ||
return " & ".join("(%s* OR %s)" % (result, result) for result in results) | ||
|
||
|
||
|
@@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: | |
We use this so that we can add prefix matching, which isn't something | ||
that is supported by default. | ||
""" | ||
|
||
# Pull out the individual words, discarding any non-word characters. | ||
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) | ||
results = _parse_words(search_term) | ||
|
||
both = " & ".join("(%s:* | %s)" % (result, result) for result in results) | ||
exact = " & ".join("%s" % (result,) for result in results) | ||
prefix = " & ".join("%s:*" % (result,) for result in results) | ||
|
||
return both, exact, prefix | ||
|
||
|
||
def _parse_words(search_term: str) -> List[str]: | ||
"""Split the provided search string into a list of its words. | ||
|
||
If support for ICU (International Components for Unicode) is available, use it. | ||
Otherwise, fall back to using a regex to detect word boundaries. This latter | ||
solution works well enough for most latin-based languages, but doesn't work as well | ||
with other languages. | ||
|
||
Args: | ||
search_term: The search string. | ||
|
||
Returns: | ||
A list of the words in the search string. | ||
""" | ||
if USE_ICU: | ||
return _parse_words_with_icu(search_term) | ||
|
||
return re.findall(r"([\w\-]+)", search_term, re.UNICODE) | ||
|
||
|
||
def _parse_words_with_icu(search_term: str) -> List[str]: | ||
"""Break down the provided search string into its individual words using ICU | ||
(International Components for Unicode). | ||
|
||
Args: | ||
search_term: The search string. | ||
|
||
Returns: | ||
A list of the words in the search string. | ||
""" | ||
results = [] | ||
breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) | ||
breaker.setText(search_term) | ||
i = 0 | ||
while True: | ||
j = breaker.nextBoundary() | ||
if j < 0: | ||
break | ||
|
||
result = search_term[i:j] | ||
|
||
# libicu considers spaces and punctuation between words as words, but we don't | ||
# want to include those in results as they would result in syntax errors in SQL | ||
# queries (e.g. "foo bar" would result in the search query including "foo & & | ||
# bar"). | ||
if len(re.findall(r"([\w\-]+)", result, re.UNICODE)): | ||
results.append(result) | ||
Comment on lines
+971
to
+976
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This'll consider There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nevermind, I realise it's taken from the previous code, so we've probably not regressed anything here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When handling a word with a hyphen, postgres constructs the tsvector as follows: synapse=# SELECT to_tsvector('simple', 'co-operative');
to_tsvector
---------------------------------------
'co':2 'co-operative':1 'operative':3
(1 row) While we search for the following words: >>> _parse_words_with_icu("co-operative")
['co', '-', 'operative'] Which to_tsquery interprets as: synapse=# SELECT to_tsquery('simple', 'co & - & operative');
to_tsquery
--------------------
'co' & 'operative'
(1 row) which should match. I'm not sure what happens on sqlite, but I'm less concerned there. Note that previously, we'd search for: synapse=# SELECT to_tsquery('simple', 'co-operative');
to_tsquery
-------------------------------------
'co-operative' & 'co' & 'operative'
(1 row) |
||
|
||
i = j | ||
|
||
return results |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that this is the latest version of the package, which can be a point of friction for downstream packagers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I might have just used whatever I was using at the time I implemented this change (there doesn't seem to be any recent fix to packaging or typing on that package which would justify otherwise) so it should be fine to relax this constraint if necessary.