Recalculate position so it is based on the index of unicode codepoints.

`cucumber-ruby` expects position values which are based on the index of the codepoint instead of the index of the code unit. This change modifies the value returned to `cucumber-ruby`. Prior to this change, the RegexSubMatch's position, which was correct in terms of a code unit array, would cause an `index out of string` error and crash cucumber-ruby when pretty-printing the results of a test. This commit also ammends the added tests to demonstrate the corrected behavior.
cucumber · Jul 1, 2019 · 02d74ed · 02d74ed
1 parent 581b3c8
commit 02d74ed
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 18 deletions.
diff --git a/features/specific/wire_encoding.feature b/features/specific/wire_encoding.feature
@@ -24,13 +24,7 @@ Feature: Wire Encoding Feature
       """
     When Cucumber runs the feature
     Then the step output should contain:
-    # EXPECTED
-    #   """
-    #   カラオケ機ASCII
-    #   
-    #   """
-    # ACTUAL
       """
-      \\u00E3\\u0082\\u00AB\\u00E3\\u0083\\u00A9\\u00E3\\u0082\\u00AA\\u00E3\\u0082\\u00B1\\u00E6\\u00A9\\u009FASCII
+      カラオケ機ASCII
 
       """
diff --git a/include/cucumber-cpp/internal/Table.hpp b/include/cucumber-cpp/internal/Table.hpp
@@ -24,6 +24,7 @@ class CUCUMBER_CPP_EXPORT Table {
      * @brief addColumn
      * @param column
      *
+     * @throws std::range_error
      * @throws std::runtime_error
      */
     void addColumn(const std::string column);

diff --git a/src/Regex.cpp b/src/Regex.cpp
@@ -1,6 +1,9 @@
 #include <cucumber-cpp/internal/utils/Regex.hpp>
 #include <boost/make_shared.hpp>
 
+#include <algorithm>
+#include <stdexcept>
+
 namespace cucumber {
 namespace internal {
 
@@ -24,18 +27,32 @@ boost::shared_ptr<RegexMatch> Regex::find(const std::string &expression) const {
     return boost::make_shared<FindRegexMatch>(regexImpl, expression);
 }
 
-FindRegexMatch::FindRegexMatch(const boost::regex &regexImpl, const std::string &expression) {
+namespace {
+bool isUtf8CodeUnitStartOfCodepoint(unsigned int i) {
+    return (i & 0xc0) != 0x80;
+}
+
+std::ptrdiff_t utf8CodepointOffset(const std::string &expression,
+                                   boost::smatch::const_iterator &matchIterator) {
+    if (expression.size() < static_cast<size_t>(matchIterator->first - expression.begin())) {
+        throw std::range_error("first codepoint in match is out of range");
+    }
+    return count_if(expression.begin(), matchIterator->first, &isUtf8CodeUnitStartOfCodepoint);
+}
+} // namespace
+
+FindRegexMatch::FindRegexMatch(const boost::regex& regexImpl, const std::string& expression) {
     boost::smatch matchResults;
     regexMatched = boost::regex_search(
         expression, matchResults, regexImpl, boost::regex_constants::match_extra);
     if (regexMatched) {
-        boost::smatch::const_iterator i = matchResults.begin();
-        if (i != matchResults.end())
+        boost::smatch::const_iterator matchIterator = matchResults.begin();
+        if (matchIterator != matchResults.end())
             // Skip capture group 0 which is the whole match, not a user marked sub-expression
-            ++i;
-        for (; i != matchResults.end(); ++i) {
-            if (i->matched) {
-                RegexSubmatch s = {*i, i->first - expression.begin()};
+            ++matchIterator;
+        for (; matchIterator != matchResults.end(); ++matchIterator) {
+            if (matchIterator->matched) {
+                RegexSubmatch s = {*matchIterator, utf8CodepointOffset(expression, matchIterator)};
                 submatches.push_back(s);
             } else {
                 submatches.push_back(RegexSubmatch());

diff --git a/tests/unit/RegexTest.cpp b/tests/unit/RegexTest.cpp
@@ -77,10 +77,7 @@ TEST(RegexTest, findReportsCodepointPositions) {
     EXPECT_TRUE(match->matches());
     ASSERT_EQ(2, match->getSubmatches().size());
     EXPECT_EQ(5, match->getSubmatches()[0].position);
-    // EXPECTED:
-    // EXPECT_EQ(18, match->getSubmatches()[1].position);
-    // ACTUAL
-    EXPECT_EQ(28, match->getSubmatches()[1].position);
+    EXPECT_EQ(18, match->getSubmatches()[1].position);
 }
 
 TEST(RegexTest, findAllExtractsTheFirstGroupOfEveryToken) {