From 498caf292052cf8c93003315068ed7c1b9dd4a13 Mon Sep 17 00:00:00 2001 From: Tz-Huan Huang Date: Fri, 3 Oct 2014 16:47:16 +0800 Subject: [PATCH 1/5] Import the zh_TW/Text.php --- src/Faker/Provider/zh_TW/Text.php | 174 ++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 src/Faker/Provider/zh_TW/Text.php diff --git a/src/Faker/Provider/zh_TW/Text.php b/src/Faker/Provider/zh_TW/Text.php new file mode 100644 index 0000000000..7026cef039 --- /dev/null +++ b/src/Faker/Provider/zh_TW/Text.php @@ -0,0 +1,174 @@ + 5) { + throw new \InvalidArgumentException('indexSize must be at most 5'); + } + + $words = $this->getConsecutiveWords($indexSize); + $result = array(); + $resultLength = 0; + // take a random starting point + $punct = array('、', '。', '」', '』', '!', '?', 'ー', ',', ':', ';'); + $next = static::randomKey($words); + while ($resultLength < $maxNbChars && isset($words[$next])) { + // fetch a random word to append + $word = static::randomElement($words[$next]); + + // calculate next index + $currentWords = static::split($next); + $currentWords[] = $word; + array_shift($currentWords); + $next = implode('', $currentWords); + + // ensure the first word is not punctuation + if ($resultLength === 0 and in_array($word, $punct)) { + continue; + } + + // append the element + $result[] = $word; + $resultLength += static::strlen($word); + } + + // remove the element that caused the text to overflow + array_pop($result); + + // build result + $result = implode('', $result); + + return $result.static::randomElement(array('。', '!', '?',)); + } + + protected function getConsecutiveWords($indexSize) + { + if (!isset($this->consecutiveWords[$indexSize])) { + $parts = $this->getExplodedText(); + $words = array(); + $index = array(); + for ($i = 0; $i < $indexSize; $i++) { + $index[] = array_shift($parts); + } + + for ($i = 0, $count = count($parts); $i < $count; $i++) { + $stringIndex = implode('', $index); + if (!isset($words[$stringIndex])) { + $words[$stringIndex] = array(); + } + $word = $parts[$i]; + $words[$stringIndex][] = $word; + array_shift($index); + $index[] = $word; + } + // cache look up words for performance + $this->consecutiveWords[$indexSize] = $words; + } + + return $this->consecutiveWords[$indexSize]; + } + + protected function getExplodedText() + { + if ($this->explodedText === null) { + $this->explodedText = static::split(static::$baseText); + } + return $this->explodedText; + } + + public static function split($text) + { + return array_values(array_filter(preg_split('//u', preg_replace('/\s+/', '', $text)))); + } + + public static function strlen($text) + { + if (function_exists('mb_get_info')) { + return mb_strlen($text); + } + return count(static::split($text)); + } +} From 81662c79094016662ba1371c056180504a3d9bf9 Mon Sep 17 00:00:00 2001 From: Tz-Huan Huang Date: Tue, 23 Sep 2014 18:24:56 +0800 Subject: [PATCH 2/5] Generalize the realText algorithm --- src/Faker/Provider/Text.php | 45 ++++++++++++--- src/Faker/Provider/zh_TW/Text.php | 96 ++++--------------------------- 2 files changed, 48 insertions(+), 93 deletions(-) diff --git a/src/Faker/Provider/Text.php b/src/Faker/Provider/Text.php index 64d09a402d..c9c8ac3f6c 100644 --- a/src/Faker/Provider/Text.php +++ b/src/Faker/Provider/Text.php @@ -5,6 +5,8 @@ abstract class Text extends \Faker\Provider\Base { protected static $baseText = ''; + protected static $separator = ' '; + protected static $separatorLen = 1; protected $explodedText = null; protected $consecutiveWords = array(); @@ -37,6 +39,7 @@ public function realText($maxNbChars = 200, $indexSize = 2) throw new \InvalidArgumentException('indexSize must be at most 5'); } + $words = $this->getConsecutiveWords($indexSize); $result = array(); $resultLength = 0; @@ -47,28 +50,28 @@ public function realText($maxNbChars = 200, $indexSize = 2) $word = static::randomElement($words[$next]); // calculate next index - $currentWords = explode(' ', $next); + $currentWords = static::explode($next); $currentWords[] = $word; array_shift($currentWords); - $next = implode(' ', $currentWords); + $next = static::implode($currentWords); // ensure text starts with an uppercase letter - if ($resultLength == 0 && !preg_match('/^\p{Lu}/u', $word)) { + if ($resultLength == 0 && !static::validStart($word)) { continue; } // append the element $result[] = $word; - $resultLength += strlen($word) + 1; + $resultLength += static::strlen($word) + static::$separatorLen; } // remove the element that caused the text to overflow array_pop($result); // build result - $result = implode(' ', $result); + $result = static::implode($result); - return $result.'.'; + return static::appendEnd($result); } protected function getConsecutiveWords($indexSize) @@ -82,7 +85,7 @@ protected function getConsecutiveWords($indexSize) } for ($i = 0, $count = count($parts); $i < $count; $i++) { - $stringIndex = implode(' ', $index); + $stringIndex = static::implode($index); if (!isset($words[$stringIndex])) { $words[$stringIndex] = array(); } @@ -101,9 +104,35 @@ protected function getConsecutiveWords($indexSize) protected function getExplodedText() { if ($this->explodedText === null) { - $this->explodedText = explode(' ', preg_replace('/\s+/u', ' ', static::$baseText)); + $this->explodedText = static::explode(preg_replace('/\s+/u', ' ', static::$baseText)); } return $this->explodedText; } + + protected static function explode($text) + { + return explode(static::$separator, $text); + } + + protected static function implode($words) + { + return implode(static::$separator, $words); + } + + protected static function strlen($text) + { + return strlen($text); + } + + protected static function validStart($word) + { + return preg_match('/^\p{Lu}/u', $word); + } + + protected static function appendEnd($text) + { + return $text.'.'; + } + } diff --git a/src/Faker/Provider/zh_TW/Text.php b/src/Faker/Provider/zh_TW/Text.php index 7026cef039..9599dac9cd 100644 --- a/src/Faker/Provider/zh_TW/Text.php +++ b/src/Faker/Provider/zh_TW/Text.php @@ -4,8 +4,9 @@ class Text extends \Faker\Provider\Text { - protected $explodedText = null; - protected $consecutiveWords = array(); + protected static $separator = ''; + protected static $separatorLen = 0; + protected static $punct = array('、', '。', '」', '』', '!', '?', 'ー', ',', ':', ';'); /** * Title: 三國演義 Romance of the Three Kingdoms @@ -77,98 +78,23 @@ class Text extends \Faker\Provider\Text 三人飛馬引軍而出。張角正殺敗董卓,乘勢趕來,忽遇三人衝殺,角軍大亂,敗走五十餘里。三人救了董卓回寨。卓問三人現居何職。玄德曰:「白身。」卓甚輕之,不為禮。玄德出,張飛大怒曰:「我等親赴血戰,救了這廝,他卻如此無禮;若不殺之,難消我氣!」便要提刀入帳來殺董卓。正是:人情勢利古猶今,誰識英雄是白身?安得快人如翼德,盡誅世上負心人!畢竟董卓性命如何,且看下文分解。 EOT; - public function realText($maxNbChars = 200, $indexSize = 2) + protected static function explode($text) { - if ($maxNbChars < 10) { - throw new \InvalidArgumentException('maxNbChars must be at least 10'); - } - if ($indexSize < 1) { - throw new \InvalidArgumentException('indexSize must be at least 1'); - } - if ($indexSize > 5) { - throw new \InvalidArgumentException('indexSize must be at most 5'); - } - - $words = $this->getConsecutiveWords($indexSize); - $result = array(); - $resultLength = 0; - // take a random starting point - $punct = array('、', '。', '」', '』', '!', '?', 'ー', ',', ':', ';'); - $next = static::randomKey($words); - while ($resultLength < $maxNbChars && isset($words[$next])) { - // fetch a random word to append - $word = static::randomElement($words[$next]); - - // calculate next index - $currentWords = static::split($next); - $currentWords[] = $word; - array_shift($currentWords); - $next = implode('', $currentWords); - - // ensure the first word is not punctuation - if ($resultLength === 0 and in_array($word, $punct)) { - continue; - } - - // append the element - $result[] = $word; - $resultLength += static::strlen($word); - } - - // remove the element that caused the text to overflow - array_pop($result); - - // build result - $result = implode('', $result); - - return $result.static::randomElement(array('。', '!', '?',)); - } - - protected function getConsecutiveWords($indexSize) - { - if (!isset($this->consecutiveWords[$indexSize])) { - $parts = $this->getExplodedText(); - $words = array(); - $index = array(); - for ($i = 0; $i < $indexSize; $i++) { - $index[] = array_shift($parts); - } - - for ($i = 0, $count = count($parts); $i < $count; $i++) { - $stringIndex = implode('', $index); - if (!isset($words[$stringIndex])) { - $words[$stringIndex] = array(); - } - $word = $parts[$i]; - $words[$stringIndex][] = $word; - array_shift($index); - $index[] = $word; - } - // cache look up words for performance - $this->consecutiveWords[$indexSize] = $words; - } - - return $this->consecutiveWords[$indexSize]; + return array_values(array_filter(preg_split('//u', preg_replace('/\s+/', '', $text)))); } - protected function getExplodedText() + protected static function strlen($text) { - if ($this->explodedText === null) { - $this->explodedText = static::split(static::$baseText); - } - return $this->explodedText; + return function_exists('mb_get_info') ? mb_strlen($text) : count(static::split($text)); } - public static function split($text) + protected static function validStart($word) { - return array_values(array_filter(preg_split('//u', preg_replace('/\s+/', '', $text)))); + return !in_array($word, static::$punct); } - public static function strlen($text) + protected static function appendEnd($text) { - if (function_exists('mb_get_info')) { - return mb_strlen($text); - } - return count(static::split($text)); + return $text.static::randomElement(array('。', '!', '?',)); } } From db8bba3432059a69540691aac1608c501b1fd27e Mon Sep 17 00:00:00 2001 From: Tz-Huan Huang Date: Tue, 7 Oct 2014 15:33:32 +0800 Subject: [PATCH 3/5] Prefer mb_strlen and refine the zh_TW's realText --- src/Faker/Provider/Text.php | 2 +- src/Faker/Provider/zh_TW/Text.php | 34 ++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/Faker/Provider/Text.php b/src/Faker/Provider/Text.php index c9c8ac3f6c..91443a888a 100644 --- a/src/Faker/Provider/Text.php +++ b/src/Faker/Provider/Text.php @@ -122,7 +122,7 @@ protected static function implode($words) protected static function strlen($text) { - return strlen($text); + return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : strlen($text); } protected static function validStart($word) diff --git a/src/Faker/Provider/zh_TW/Text.php b/src/Faker/Provider/zh_TW/Text.php index 9599dac9cd..e68b90c8a3 100644 --- a/src/Faker/Provider/zh_TW/Text.php +++ b/src/Faker/Provider/zh_TW/Text.php @@ -6,7 +6,13 @@ class Text extends \Faker\Provider\Text { protected static $separator = ''; protected static $separatorLen = 0; - protected static $punct = array('、', '。', '」', '』', '!', '?', 'ー', ',', ':', ';'); + + /** + * All punctuation in $baseText: 、 。 「 」 『 』 ! ? ー , : ; + */ + protected static $notEndPunct = array('、', '「', '『', 'ー', ',', ':', ';'); + protected static $endPunct = array('。', '」', '』', '!', '?'); + protected static $notBeginPunct = array('、', '。', '」', '』', '!', '?', 'ー', ',', ':', ';'); /** * Title: 三國演義 Romance of the Three Kingdoms @@ -80,21 +86,39 @@ class Text extends \Faker\Provider\Text protected static function explode($text) { - return array_values(array_filter(preg_split('//u', preg_replace('/\s+/', '', $text)))); + $chars = array(); + foreach (preg_split('//u', preg_replace('/\s+/', '', $text)) as $char) { + if ($char !== '') { + $chars[] = $char; + } + } + return $chars; } protected static function strlen($text) { - return function_exists('mb_get_info') ? mb_strlen($text) : count(static::split($text)); + return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : count(static::split($text)); } protected static function validStart($word) { - return !in_array($word, static::$punct); + return !in_array($word, static::$notBeginPunct); } protected static function appendEnd($text) { - return $text.static::randomElement(array('。', '!', '?',)); + // extract the last char of $text + if (function_exists('mb_substr')) { + $last = mb_substr($text, mb_strlen($text)-1, 'UTF-8'); + } else { + $chars = static::split($text); + $last = end($chars); + } + // if the last char is a not-valid-end punctuation, remove it + if (in_array($last, static::$notEndPunct)) { + $text = preg_replace('/.$/u', '', $text); + } + // if the last char is not a valid punctuation, append a default one. + return in_array($last, static::$endPunct) ? $text : $text.'。'; } } From 8b1dc8790fe01e7fccca2b720649206694cfa72d Mon Sep 17 00:00:00 2001 From: Tz-Huan Huang Date: Tue, 7 Oct 2014 15:40:20 +0800 Subject: [PATCH 4/5] Fix travis error --- src/Faker/Provider/Text.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Faker/Provider/Text.php b/src/Faker/Provider/Text.php index 91443a888a..675ad85169 100644 --- a/src/Faker/Provider/Text.php +++ b/src/Faker/Provider/Text.php @@ -134,5 +134,4 @@ protected static function appendEnd($text) { return $text.'.'; } - } From 2a2197ceeb476b14936a0f11c925b0b3400cda56 Mon Sep 17 00:00:00 2001 From: Tz-Huan Huang Date: Wed, 29 Oct 2014 15:24:58 +0800 Subject: [PATCH 5/5] Fix the function name --- src/Faker/Provider/zh_TW/Text.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Faker/Provider/zh_TW/Text.php b/src/Faker/Provider/zh_TW/Text.php index e68b90c8a3..b35903f62b 100644 --- a/src/Faker/Provider/zh_TW/Text.php +++ b/src/Faker/Provider/zh_TW/Text.php @@ -97,7 +97,7 @@ protected static function explode($text) protected static function strlen($text) { - return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : count(static::split($text)); + return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : count(static::explode($text)); } protected static function validStart($word)