Skip to content

Commit

Permalink
Update German and English to the latest Snowball standard
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Dec 9, 2023
1 parent a8a3c20 commit 3652da4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 16 deletions.
28 changes: 22 additions & 6 deletions src/english_stem.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,9 @@ namespace stemming
- ed edly+ ing ingly+
- Delete if the preceding word part contains a vowel, and then
- If the word ends with 'at', 'bl' or 'iz', add 'e' (so luxuriat -> luxuriate), or
- If the word ends with a double remove the last letter (so hopp -> hop), or
- If the word is short, add e (so hop -> hope).
- If the word ends with a double preceded by something other than exactly 'a', 'e' or 'o' then
remove the last letter (so hopp -> hop but add, egg and off are not changed), or
- If the word does not end with a double and is short, add 'e' (so hop -> hope).
<b>Step 1c:</b>
Expand Down Expand Up @@ -808,6 +809,16 @@ namespace stemming
}
if (regress_trim)
{
const bool isExactly3NotAEOStart
{
text.length() == 3 &&
!(stem<string_typeT>::is_either(text[0],
common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) ||
stem<string_typeT>::is_either(text[0],
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ||
stem<string_typeT>::is_either(text[0],
common_lang_constants::LOWER_O, common_lang_constants::UPPER_O))
};
if (stem<string_typeT>::is_suffix(text,
/*at*/common_lang_constants::LOWER_A, common_lang_constants::UPPER_A,
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ||
Expand All @@ -822,7 +833,9 @@ namespace stemming
// need to search for r2 again because the 'e' added here may change that
stem<string_typeT>::find_r2(text, L"aeiouyAEIOUY");
}
else if (stem<string_typeT>::is_suffix(text,
// undouble
else if ((text.length() > 3 || isExactly3NotAEOStart) &&
(stem<string_typeT>::is_suffix(text,
/*bb*/
common_lang_constants::LOWER_B, common_lang_constants::UPPER_B,
common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) ||
Expand Down Expand Up @@ -856,15 +869,18 @@ namespace stemming
stem<string_typeT>::is_suffix(text,
/*tt*/
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T,
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) )
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) )
{
text.erase(text.length()-1);
stem<string_typeT>::update_r_sections(text);
}
else if (is_short_word(text, text.length() ) )
else if ((text.length() < 2 ||
stem<string_typeT>::tolower_western(text[text.length() - 1]) !=
stem<string_typeT>::tolower_western(text[text.length() - 2]) ) &&
is_short_word(text, text.length() ) )
{
text += common_lang_constants::LOWER_E;
// need to search for r2 again because the 'e' added here may change that
// need to search for R2 again because the 'e' added here may change that
stem<string_typeT>::find_r2(text, L"aeiouyAEIOUY");
}
}
Expand Down
33 changes: 23 additions & 10 deletions src/german_stem.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ namespace stemming
<b>Step 1:</b>
Search for the longest among the following suffixes:
- e em en ern er es
- s (preceded by a valid s-ending)
a.) em (not preceded by 'syst')
b.) ern er
c.) en es e
d.) s (preceded by a valid s-ending)
and delete if in R1. (Of course the letter of the valid s-ending is not necessarily in R1).
If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s.
If an ending of group (c) is deleted, and the ending is preceded by 'niss', delete the final s.
(For example, äckern -> äck, ackers -> acker, armes -> arm, bedürfnissen -> bedürfnis).
Expand Down Expand Up @@ -160,7 +162,24 @@ namespace stemming
void step_1(string_typeT& text)
{
bool stepBSucessfull{ false };
if (stem<string_typeT>::delete_if_is_in_r1(text,
// remove 'em', but not when it is preceded by 'syst' (i.e., the word ends with 'system')
if ((is_suffix(text,
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) &&
!(is_suffix(text,
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S,
common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y,
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S,
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T,
common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) &&
stem<string_typeT>::delete_if_is_in_r1(text,
/*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) )
{
return;
}
else if (stem<string_typeT>::delete_if_is_in_r1(text,
/*ern*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_R, common_lang_constants::UPPER_R,
common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) )
Expand All @@ -173,12 +192,6 @@ namespace stemming
{
return;
}
else if (stem<string_typeT>::delete_if_is_in_r1(text,
/*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) )
{
return;
}
else if (stem<string_typeT>::delete_if_is_in_r1(text,
/*es*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) )
Expand Down

0 comments on commit 3652da4

Please sign in to comment.