Skip to content

Commit

Permalink
🐛 适配大写 %
Browse files Browse the repository at this point in the history
  • Loading branch information
zhzLuke96 committed Jun 3, 2024
1 parent a2c18b1 commit 5de4bf7
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 45 deletions.
22 changes: 12 additions & 10 deletions modules/utils/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,15 @@ def replace(match):
return result


# print(
# text_normalize(
# "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本."
# )
# )
# print(
# text_normalize(
# " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149"
# )
# )
if __name__ == "__main__":
    # Manual smoke test: print the normalized form of a few sample sentences
    # (mixed Chinese/English text, control tokens in brackets, a percentage).
    demo_sentences = (
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
    )
    for sentence in demo_sentences:
        print(text_normalize(sentence))
76 changes: 41 additions & 35 deletions modules/utils/zh_normalization/num.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,21 @@
from collections import OrderedDict
from typing import List

# Maps each Arabic digit character ("0".."9") to its Chinese numeral.
DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

# Maps a power of ten to the Chinese unit character used for it,
# listed from the smallest unit to the largest.
UNITS = OrderedDict(
    [
        (1, "十"),
        (2, "百"),
        (3, "千"),
        (4, "万"),
        (8, "亿"),
    ]
)

COM_QUANTIFIERS = "(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"

# 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")


def replace_frac(match) -> str:
Expand All @@ -52,7 +54,7 @@ def replace_frac(match) -> str:


# Percentage expressions such as "62%" or "-3.5%".
# Groups: 1 = optional minus sign, 2 = the number, 3 = its fractional part,
# 4 = the percent sign.
# Both the ASCII "%" and the full-width percent sign (U+FF05) are accepted.
# The full-width form is written as an escape so that Unicode normalization
# of this source file cannot silently collapse it into a second ASCII "%".
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)(%|\uFF05)")


def replace_percentage(match) -> str:
Expand All @@ -72,7 +74,7 @@ def replace_percentage(match) -> str:

# 整数表达式
# 带负号的整数 -10
RE_INTEGER = re.compile(r'(-)' r'(\d+)')
RE_INTEGER = re.compile(r"(-)" r"(\d+)")


def replace_negative_num(match) -> str:
Expand All @@ -92,7 +94,7 @@ def replace_negative_num(match) -> str:

# 编号-无符号整形
# 00078
RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")


def replace_default_num(match):
Expand All @@ -108,10 +110,10 @@ def replace_default_num(match):

# 数字表达式
# 纯小数
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# 正整数 + 量词
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")


def replace_positive_quantifier(match) -> str:
Expand Down Expand Up @@ -155,7 +157,8 @@ def replace_number(match) -> str:
# match.group(1) and match.group(8) are copy from RE_NUMBER

RE_RANGE = re.compile(
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))"
)


def replace_range(match) -> str:
Expand All @@ -172,54 +175,57 @@ def replace_range(match) -> str:
return result


def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Split a digit string into the list of Chinese numeral/unit symbols.

    Args:
        value_string: A string of decimal digits; leading zeros are allowed.
        use_zero: When the string reduces to a single non-zero digit that was
            preceded by zeros, emit a leading "零" before it. Recursive calls
            always use the default.

    Returns:
        The symbols in reading order, or an empty list when the string
        contains nothing but zeros (or is empty).
    """
    significant = value_string.lstrip("0")

    if not significant:
        return []

    if len(significant) == 1:
        symbols = [DIGITS[significant]]
        had_leading_zeros = len(significant) < len(value_string)
        if use_zero and had_leading_zeros:
            symbols.insert(0, DIGITS["0"])
        return symbols

    # Largest unit whose power is strictly smaller than the number of
    # significant digits; the string is split around that unit.
    split_power = next(
        power for power in reversed(UNITS) if power < len(significant)
    )
    # Slicing the unstripped string keeps interior zeros for the recursion.
    head = value_string[:-split_power]
    tail = value_string[-split_power:]
    return [*_get_value(head), UNITS[split_power], *_get_value(tail)]


def verbalize_cardinal(value_string: str) -> str:
    """Read a non-negative integer digit string as a Chinese cardinal number.

    Args:
        value_string: A string of decimal digits.

    Returns:
        An empty string for empty input, "零" when the input consists only of
        zeros, otherwise the joined symbols produced by ``_get_value``.
    """
    if not value_string:
        return ""

    # "000" and "0" are both read as a single zero.
    digits = value_string.lstrip("0")
    if not digits:
        return DIGITS["0"]

    symbols = _get_value(digits)
    # A reading that starts with "一十" is abbreviated to start with "十".
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)


def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one digit at a time.

    Args:
        value_string: A string whose characters are all keys of ``DIGITS``.
        alt_one: When true, every "一" in the result is replaced by "幺".

    Returns:
        The concatenated per-digit reading.
    """
    spoken = "".join(DIGITS[char] for char in value_string)
    return spoken.replace("一", "幺") if alt_one else spoken


def num2str(value_string: str) -> str:
integer_decimal = value_string.split('.')
integer_decimal = value_string.split(".")
if len(integer_decimal) == 1:
integer = integer_decimal[0]
decimal = ''
decimal = ""
elif len(integer_decimal) == 2:
integer, decimal = integer_decimal
else:
Expand All @@ -229,10 +235,10 @@ def num2str(value_string: str) -> str:

result = verbalize_cardinal(integer)

decimal = decimal.rstrip('0')
decimal = decimal.rstrip("0")
if decimal:
# '.22' is verbalized as '零点二二'
# '3.20' is verbalized as '三点二
result = result if result else "零"
result += '点' + verbalize_digit(decimal)
return result
result += "点" + verbalize_digit(decimal)
return result

0 comments on commit 5de4bf7

Please sign in to comment.