Skip to content

Commit

Permalink
feat(textprocessing): unknown languages use eng stopwords
Browse files (browse the repository at this point in the history)
  • Loading branch information
guo-yong-zhi committed Nov 1, 2024
1 parent fb3bdbd commit dce0067
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,12 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};

if stopwords == :auto
language = detect_language(keys(counter), language)
+ lk = language
if !haskey(STOPWORDS, language)
@info "No built-in stopwords for $(language)!"
+ lk = "eng"
end
- stopwords = get(STOPWORDS, language, nothing)
+ stopwords = STOPWORDS[lk]
end
stopwords === nothing && (stopwords = Set{String}())
stopwords isa AbstractSet || (stopwords = Set(stopwords))
Expand Down
1 change: 1 addition & 0 deletions test/test_textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
@test processtext("word cloud is a cloud", language="en", stopwords_extra=["word"])[1] |> only == "cloud"
# settokenizer! ...
WordCloud.settokenizer!("mylang", t->split(t, "a"))
+ WordCloud.setstopwords!("mylang", [])
@test Set(processtext("bananais", language="mylang")[1]) == Set(["b", "n", "is"])
WordCloud.setlemmatizer!("mylang", uppercase)
@test Set(processtext("bananais", language="mylang")[1]) == Set(["B", "N", "IS"])
Expand Down

0 comments on commit dce0067

Please sign in to comment.