Skip to content

Commit

Permalink
search for function* (#3114)
Browse files Browse the repository at this point in the history
* search for function*

Fixes #3113

* feedbacked
  • Loading branch information
peterbe authored Mar 3, 2021
1 parent 3c90b0e commit e0fee91
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
3 changes: 2 additions & 1 deletion deployer/src/deployer/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,15 +267,16 @@ def analyze(
connections.create_connection(hosts=[url])
index = Document._index
analysis = index.analyze(body={"text": text, "analyzer": analyzer})
print(f"For text: {text!r}")
if "tokens" in analysis:
keys = None
for token in analysis["tokens"]:
if keys is None:
keys = token.keys()
longest_key = max(len(x) for x in keys)
print()
for key in keys:
print(f"{key:{longest_key + 1}} {token[key]!r}")
print()
elif not analysis:
print("No tokens found!")
else:
Expand Down
24 changes: 23 additions & 1 deletion deployer/src/deployer/search/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@
replacement="_$1_",
)

# A `pattern_replace` char filter that rewrites a word followed by `*`
# (optionally with a `-` in between) into that same word with the literal
# suffix `star` appended, e.g. `yield*` -> `yieldstar`.
special_charater_name_char_filter = char_filter(
"special_charater_name_char_filter",
type="pattern_replace",
# In Java regex syntax, this matches things like `yield*`, `Function*` and
# `data-*`.
# But note that it will also match things like `x*y` or `*emphasis*`, which
# is a "risk" worth accepting because even `*emphasis*` would get converted
# to `emphasisstar`, which is more accurate than letting the standard tokenizer
# turn it into `emphasis` alone.
pattern="(\\w+)-?\\*",
# `$1` refers to the word captured by the group in `pattern`; the optional
# `-` and the `*` themselves are not part of the replacement.
# Now a search for `yield*` becomes a search for `yieldstar` which won't
# get confused for `yield`.
# We *could* consider changing the replacement to `$1__starcharacter`, which
# means it would tokenize `yield*` into
# `[yieldstarcharacter, yield, starcharacter]`,
# which might capture people who didn't expect to find the page about `yield`
# when they searched for `yield*`.
replacement="$1star",
)

unicorns_char_filter = char_filter(
"unicorns_char_filter",
type="mapping",
Expand Down Expand Up @@ -136,7 +154,11 @@
# Note that we don't use the `html_strip` char_filter.
# With that, we'd lose some of the valuable characters like: `<video>` which
# is an actual title.
char_filter=[unicorns_char_filter, keep_html_char_filter],
char_filter=[
unicorns_char_filter,
special_charater_name_char_filter,
keep_html_char_filter,
],
)


Expand Down

0 comments on commit e0fee91

Please sign in to comment.