diff --git a/src/english_stem.h b/src/english_stem.h
index 21096b0..7f1f809 100644
--- a/src/english_stem.h
+++ b/src/english_stem.h
@@ -106,8 +106,9 @@ namespace stemming
- ed edly+ ing ingly+
- Delete if the preceding word part contains a vowel, and then
- If the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
- - If the word ends with a double remove the last letter (so hopp -> hop), or
- - If the word is short, add e (so hop -> hope).
+ - If the word ends with a double preceded by something other than exactly 'a', 'e' or 'o' then
+ remove the last letter (so hopp -> hop but add, egg and off are not changed), or
+ - If the word does not end with a double and is short, add 'e' (so hop -> hope).
Step 1c:
@@ -808,6 +809,16 @@ namespace stemming
}
if (regress_trim)
{
+ const bool isExactly3NotAEOStart
+ {
+ text.length() == 3 &&
+ !(stem::is_either(text[0],
+ common_lang_constants::LOWER_A, common_lang_constants::UPPER_A) ||
+ stem::is_either(text[0],
+ common_lang_constants::LOWER_E, common_lang_constants::UPPER_E) ||
+ stem::is_either(text[0],
+ common_lang_constants::LOWER_O, common_lang_constants::UPPER_O))
+ };
if (stem::is_suffix(text,
/*at*/common_lang_constants::LOWER_A, common_lang_constants::UPPER_A,
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) ||
@@ -822,7 +833,9 @@ namespace stemming
// need to search for r2 again because the 'e' added here may change that
stem::find_r2(text, L"aeiouyAEIOUY");
}
- else if (stem::is_suffix(text,
+ // undouble (hopp -> hop), but leave 3-letter words starting with 'a', 'e' or 'o' alone (add, egg, off)
+ else if ((text.length() > 3 || isExactly3NotAEOStart) &&
+ (stem::is_suffix(text,
/*bb*/
common_lang_constants::LOWER_B, common_lang_constants::UPPER_B,
common_lang_constants::LOWER_B, common_lang_constants::UPPER_B) ||
@@ -856,15 +869,18 @@ namespace stemming
stem::is_suffix(text,
/*tt*/
common_lang_constants::LOWER_T, common_lang_constants::UPPER_T,
- common_lang_constants::LOWER_T, common_lang_constants::UPPER_T) )
+ common_lang_constants::LOWER_T, common_lang_constants::UPPER_T)) )
{
text.erase(text.length()-1);
stem::update_r_sections(text);
}
- else if (is_short_word(text, text.length() ) )
+ else if ((text.length() < 2 ||
+ stem::tolower_western(text[text.length() - 1]) !=
+ stem::tolower_western(text[text.length() - 2]) ) &&
+ is_short_word(text, text.length() ) )
{
text += common_lang_constants::LOWER_E;
- // need to search for r2 again because the 'e' added here may change that
+ // need to search for R2 again because the 'e' added here may change that
stem::find_r2(text, L"aeiouyAEIOUY");
}
}
diff --git a/src/german_stem.h b/src/german_stem.h
index 7fdf9e7..15326c3 100644
--- a/src/german_stem.h
+++ b/src/german_stem.h
@@ -40,10 +40,12 @@ namespace stemming
Step 1:
Search for the longest among the following suffixes:
- - e em en ern er es
- - s (preceded by a valid s-ending)
+ a.) em (not preceded by 'syst')
+ b.) ern er
+ c.) en es e
+ d.) s (preceded by a valid s-ending)
and delete if in R1. (Of course the letter of the valid s-ending is not necessarily in R1).
- If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s.
+ If an ending of group (c) is deleted, and the ending is preceded by 'niss', delete the final s.
(For example, äckern -> äck, ackers -> acker, armes -> arm, bedürfnissen -> bedürfnis).
@@ -160,7 +162,24 @@ namespace stemming
void step_1(string_typeT& text)
{
bool stepBSucessfull{ false };
- if (stem::delete_if_is_in_r1(text,
+ // delete 'em' if it is in R1, unless the word ends in 'system'
+ if ((is_suffix(text,
+ common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
+ common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) &&
+ !(is_suffix(text,
+ common_lang_constants::LOWER_S, common_lang_constants::UPPER_S,
+ common_lang_constants::LOWER_Y, common_lang_constants::UPPER_Y,
+ common_lang_constants::LOWER_S, common_lang_constants::UPPER_S,
+ common_lang_constants::LOWER_T, common_lang_constants::UPPER_T,
+ common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
+ common_lang_constants::LOWER_M, common_lang_constants::UPPER_M)) &&
+ stem::delete_if_is_in_r1(text,
+ /*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
+ common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) )
+ {
+ return;
+ }
+ else if (stem::delete_if_is_in_r1(text,
/*ern*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_R, common_lang_constants::UPPER_R,
common_lang_constants::LOWER_N, common_lang_constants::UPPER_N) )
@@ -173,12 +192,6 @@ namespace stemming
{
return;
}
- else if (stem::delete_if_is_in_r1(text,
- /*em*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
- common_lang_constants::LOWER_M, common_lang_constants::UPPER_M) )
- {
- return;
- }
else if (stem::delete_if_is_in_r1(text,
/*es*/common_lang_constants::LOWER_E, common_lang_constants::UPPER_E,
common_lang_constants::LOWER_S, common_lang_constants::UPPER_S) )