diff --git a/docs/source/_data/sidebar.yml b/docs/source/_data/sidebar.yml index bf6affcb97..c5fefca4d0 100644 --- a/docs/source/_data/sidebar.yml +++ b/docs/source/_data/sidebar.yml @@ -26,6 +26,7 @@ tutorials: contribution_guidelines: contribution-guidelines.html filters: + array_to_sentence_string: array_to_sentence_string.html overview: overview.html abs: abs.html append: append.html @@ -63,6 +64,7 @@ filters: modulo: modulo.html newline_to_br: newline_to_br.html normalize_whitespace: normalize_whitespace.html + number_of_words: number_of_words.html plus: plus.html pop: pop.html push: push.html diff --git a/docs/source/filters/array_to_sentence_string.md b/docs/source/filters/array_to_sentence_string.md new file mode 100644 index 0000000000..32f25f7ad9 --- /dev/null +++ b/docs/source/filters/array_to_sentence_string.md @@ -0,0 +1,27 @@ +--- +title: array_to_sentence_string +--- + +{% since %}v10.13.0{% endsince %} + +Convert an array into a sentence. Useful for listing tags. Optional argument for connector. + +Input +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string }} +``` + +Output +```text +foo, bar, and baz +``` + +Input +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }} +``` + +Output +```text +foo, bar, or baz +``` diff --git a/docs/source/filters/number_of_words.md b/docs/source/filters/number_of_words.md new file mode 100644 index 0000000000..01c7d93ee8 --- /dev/null +++ b/docs/source/filters/number_of_words.md @@ -0,0 +1,49 @@ +--- +title: number_of_words +--- + +{% since %}v10.13.0{% endsince %} + +Count the number of words in some text. This filter takes an optional argument to control the handling of Chinese-Japanese-Korean (CJK) characters in the input string: +- Passing 'cjk' as the argument will count every CJK character detected as one word irrespective of being separated by whitespace. 
+- Passing 'auto' (auto-detect) works similarly to 'cjk', but is more performant when the filter is applied to a variable string that may or may not contain CJK characters. + +Input +```liquid +{{ "Hello world!" | number_of_words }} +``` + +Output +```text +2 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words }} +``` + +Output +```text +1 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words: "cjk" }} +``` + +Output +```text +6 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words: "auto" }} +``` + +Output +```text +6 +``` diff --git a/docs/source/filters/overview.md b/docs/source/filters/overview.md index c531cd281c..447715e8cb 100644 --- a/docs/source/filters/overview.md +++ b/docs/source/filters/overview.md @@ -10,7 +10,7 @@ There's 40+ filters supported by LiquidJS. These filters can be categorized into Categories | Filters --- | --- Math | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most -String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last,remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace +String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape Array | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift Date | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string diff --git a/docs/source/zh-cn/filters/array_to_sentence_string.md b/docs/source/zh-cn/filters/array_to_sentence_string.md new file mode 100644 index 
0000000000..3e1d8b5378 --- /dev/null +++ b/docs/source/zh-cn/filters/array_to_sentence_string.md @@ -0,0 +1,27 @@ +--- +title: array_to_sentence_string +--- + +{% since %}v10.13.0{% endsince %} + +把数组转化为句子,用于做标签列表。有一个可选的连接词参数。 + +输入 +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string }} +``` + +输出 +```text +foo, bar, and baz +``` + +输入 +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }} +``` + +输出 +```text +foo, bar, or baz +``` diff --git a/docs/source/zh-cn/filters/number_of_words.md b/docs/source/zh-cn/filters/number_of_words.md new file mode 100644 index 0000000000..b8f218578a --- /dev/null +++ b/docs/source/zh-cn/filters/number_of_words.md @@ -0,0 +1,49 @@ +--- +title: number_of_words +--- + +{% since %}v10.13.0{% endsince %} + +计算文本中的单词数。此过滤器接受一个可选参数,用于控制如何处理输入字符串里的中日韩(CJK)字符: +- 传入 'cjk' 时,每个检测到的 CJK 字符都会被计为一个单词,无论其是否由空格分隔。 +- 传入 'auto'(自动检测)时,行为与 'cjk' 类似,但当输入字符串可能包含也可能不包含 CJK 字符时,性能更好。 + +输入 +```liquid +{{ "Hello world!" | number_of_words }} +``` + +输出 +```text +2 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words }} +``` + +输出 +```text +1 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words: "cjk" }} +``` + +输出 +```text +6 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words: "auto" }} +``` + +输出 +```text +6 +``` diff --git a/docs/source/zh-cn/filters/overview.md b/docs/source/zh-cn/filters/overview.md index 58b08b5323..97d6b21342 100644 --- a/docs/source/zh-cn/filters/overview.md +++ b/docs/source/zh-cn/filters/overview.md @@ -10,7 +10,7 @@ LiquidJS 共支持 40+ 个过滤器,可以分为如下几类: 类别 | 过滤器 --- | --- 数学 | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most -字符串 | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace +字符串 | append, prepend, capitalize, upcase, downcase, 
strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape 数组 | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift 日期 | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string diff --git a/src/filters/string.ts b/src/filters/string.ts index 1bdc315d81..de1b2868da 100644 --- a/src/filters/string.ts +++ b/src/filters/string.ts @@ -3,6 +3,18 @@ * * * prefer stringify() to String() since `undefined`, `null` should eval '' */ + +// Han (Chinese) characters: \u4E00-\u9FFF +// Additional Han characters: \uF900-\uFAFF (CJK Compatibility Ideographs) +// Additional Han characters: \u3400-\u4DBF (CJK Unified Ideographs Extension A) +// Katakana (Japanese): \u30A0-\u30FF +// Hiragana (Japanese): \u3040-\u309F +// Hangul (Korean): \uAC00-\uD7AF +const rCJKWord = /[\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/gu; + +// A run of consecutive characters that are neither CJK nor whitespace (each run counts as one word) +const rNonCJKWord = /[^\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF\s]+/gu; + import { assert, escapeRegExp, stringify } from '../util' export function append (v: string, arg: string) { @@ -32,16 +44,16 @@ export function upcase (str: string) { } export function remove (v: string, arg: string) { - return stringify(v).split(String(arg)).join('') + return stringify(v).split(stringify(arg)).join('') } export function remove_first (v: string, l: string) { - return stringify(v).replace(String(l), '') + return stringify(v).replace(stringify(l), '') } export function remove_last (v: string, l: string) { const str = stringify(v) - const 
pattern = String(l) + const pattern = stringify(l) const index = str.lastIndexOf(pattern) if (index === -1) return str return str.substring(0, index) + str.substring(index + pattern.length) @@ -56,7 +68,7 @@ export function rstrip (str: string, chars?: string) { } export function split (v: string, arg: string) { - const arr = stringify(v).split(String(arg)) + const arr = stringify(v).split(stringify(arg)) // align to ruby split, which is the behavior of shopify/liquid // see: https://ruby-doc.org/core-2.4.0/String.html#method-i-split while (arr.length && arr[arr.length - 1] === '') arr.pop() @@ -83,19 +95,19 @@ export function capitalize (str: string) { } export function replace (v: string, pattern: string, replacement: string) { - return stringify(v).split(String(pattern)).join(replacement) + return stringify(v).split(stringify(pattern)).join(replacement) } export function replace_first (v: string, arg1: string, arg2: string) { - return stringify(v).replace(String(arg1), arg2) + return stringify(v).replace(stringify(arg1), arg2) } export function replace_last (v: string, arg1: string, arg2: string) { const str = stringify(v) - const pattern = String(arg1) + const pattern = stringify(arg1) const index = str.lastIndexOf(pattern) if (index === -1) return str - const replacement = String(arg2) + const replacement = stringify(arg2) return str.substring(0, index) + replacement + str.substring(index + pattern.length) } @@ -117,3 +129,35 @@ export function normalize_whitespace (v: string) { v = stringify(v) return v.replace(/\s+/g, ' ') } + +// Count the words in `input`. By default words are whitespace-separated runs. +// mode 'cjk': every CJK character is one word, plus every run of other non-whitespace characters. +// mode 'auto': same as 'cjk' when `input` contains CJK characters, otherwise the default count. +export function number_of_words (input: string, mode?: 'cjk' | 'auto') { + input = stringify(input).trim() + if (!input) return 0 + if (mode === 'cjk' || mode === 'auto') { + // Match once instead of calling test() first: test() on a shared /g regex moves its lastIndex, and the input would be scanned twice + const cjkChars = input.match(rCJKWord) + if (cjkChars || mode === 'cjk') { + return (cjkChars || []).length + (input.match(rNonCJKWord) || []).length + } + } + // No CJK handling requested, or 'auto' found no CJK characters: count whitespace-separated words + return input.split(/\s+/).length +}
+ +// Join the items of `array` into a sentence such as "a, b, and c"; `connector` replaces the default "and". +// Every branch returns a string: a single item goes through stringify(), like Jekyll's to_s. +export function array_to_sentence_string (array: unknown[], connector = 'and') { + switch (array.length) { + case 0: + return '' + case 1: + return stringify(array[0]) + case 2: + return `${array[0]} ${connector} ${array[1]}` + default: + return `${array.slice(0, -1).join(', ')}, ${connector} ${array[array.length - 1]}` + } +} diff --git a/test/integration/filters/string.spec.ts b/test/integration/filters/string.spec.ts index 3cb494da5a..90ba6084b1 100644 --- a/test/integration/filters/string.spec.ts +++ b/test/integration/filters/string.spec.ts @@ -238,4 +238,101 @@ describe('filters/string', function () { expect(liquid.parseAndRenderSync('{{ "a \n b c" | normalize_whitespace }}')).toEqual('a b c') }) }) + describe('number_of_words', () => { + it('should count words of Latin sentence', async () => { + const html = await liquid.parseAndRender('{{ "I\'m not hungry" | number_of_words: "auto"}}') + expect(html).toEqual('3') + }); + + it('should count words of mixed sentence', async () => { + const html = await liquid.parseAndRender('{{ "Hello world!" 
| number_of_words }}'); + expect(html).toEqual('2'); + }); + + it('should count words of CJK sentence', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words }}'); + expect(html).toEqual('1'); + }); + + it('should count words of CJK sentence with mode "cjk"', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "cjk" }}'); + expect(html).toEqual('6'); + }); + + it('should count words of CJK sentence with mode "auto"', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "auto" }}'); + expect(html).toEqual('6'); + }); + it('should handle empty input', async () => { + const html = await liquid.parseAndRender('{{ "" | number_of_words }}'); + expect(html).toEqual('0'); + }); + + it('should handle input with only whitespace', async () => { + const html = await liquid.parseAndRender('{{ " " | number_of_words }}'); + expect(html).toEqual('0'); + }); + + it('should count words with punctuation marks', async () => { + const html = await liquid.parseAndRender('{{ "Hello! This is a test." | number_of_words }}'); + expect(html).toEqual('5'); + }); + + it('should count words with special characters', async () => { + const html = await liquid.parseAndRender('{{ "This is a test with special characters: !@#$%^&*()-_+=`~[]{};:\'\\"\\|<,>.?/" | number_of_words }}'); + expect(html).toEqual('8'); + }); + + it('should count words with multiple spaces between words', async () => { + const html = await liquid.parseAndRender('{{ " Hello world! 
" | number_of_words }}'); + expect(html).toEqual('2'); + }); + + it('should count words with mixed CJK characters', async () => { + const html = await liquid.parseAndRender('{{ "你好こんにちは안녕하세요" | number_of_words: "cjk" }}'); + expect(html).toEqual('12'); + }); + }); + describe('array_to_sentence_string', () => { + it('should handle an empty array', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [] }) + expect(html).toEqual('') + }) + + it('should handle an array with one element', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ["apple"] }) + expect(html).toEqual('apple') + }) + + it('should handle an array with two elements', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ["apple", "banana"] }) + expect(html).toEqual('apple and banana') + }) + + it('should handle an array with more than two elements', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ["apple", "banana", "orange"] }) + expect(html).toEqual('apple, banana, and orange') + }) + + it('should handle an array with custom connector', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string: "or" }}', { arr: ["apple", "banana", "orange"] }) + expect(html).toEqual('apple, banana, or orange') + }) + + it('should handle an array of numbers', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [1, 2, 3] }) + expect(html).toEqual('1, 2, and 3') + }) + + it('should handle an array of mixed types', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ["apple", 2, "orange"] }) + expect(html).toEqual('apple, 2, and orange') + }) + + it('should handle an array produced by the split filter', async () => { + const html = await liquid.parseAndRender('{{ "foo,bar,baz" | split: "," 
| array_to_sentence_string }}') + expect(html).toEqual('foo, bar, and baz') + }) + + }) })