From 5d03b45af9b112adf6292e9328fa543ac3665845 Mon Sep 17 00:00:00 2001 From: Veselin Stoyanov Date: Wed, 9 Aug 2017 08:05:28 -0700 Subject: [PATCH] Setup Bulgarian language and Numeral Dimension Summary: - Setup Bulgarian (BG) language - Added Numeral Dimension Closes https://github.com/facebookincubator/duckling/pull/78 Reviewed By: niteria Differential Revision: D5575513 Pulled By: patapizza fbshipit-source-id: e566155 --- Duckling/Dimensions.hs | 2 + Duckling/Dimensions/BG.hs | 18 ++ Duckling/Lang.hs | 1 + Duckling/Numeral/BG/Corpus.hs | 127 ++++++++++++++ Duckling/Numeral/BG/Rules.hs | 273 +++++++++++++++++++++++++++++ Duckling/Ranking/Classifiers.hs | 2 + Duckling/Ranking/Classifiers/BG.hs | 22 +++ Duckling/Rules.hs | 2 + Duckling/Rules/BG.hs | 34 ++++ duckling.cabal | 6 + exe/Duckling/Ranking/Generate.hs | 1 + tests/Duckling/Numeral/BG/Tests.hs | 23 +++ tests/Duckling/Numeral/Tests.hs | 2 + 13 files changed, 513 insertions(+) create mode 100644 Duckling/Dimensions/BG.hs create mode 100644 Duckling/Numeral/BG/Corpus.hs create mode 100644 Duckling/Numeral/BG/Rules.hs create mode 100644 Duckling/Ranking/Classifiers/BG.hs create mode 100644 Duckling/Rules/BG.hs create mode 100644 tests/Duckling/Numeral/BG/Tests.hs diff --git a/Duckling/Dimensions.hs b/Duckling/Dimensions.hs index ef04a49be..289510817 100644 --- a/Duckling/Dimensions.hs +++ b/Duckling/Dimensions.hs @@ -21,6 +21,7 @@ import qualified Data.HashSet as HashSet import Duckling.Dimensions.Types import qualified Duckling.Dimensions.Common as CommonDimensions import qualified Duckling.Dimensions.AR as ARDimensions +import qualified Duckling.Dimensions.BG as BGDimensions import qualified Duckling.Dimensions.CS as CSDimensions import qualified Duckling.Dimensions.DA as DADimensions import qualified Duckling.Dimensions.DE as DEDimensions @@ -78,6 +79,7 @@ dependents (This Volume) = HashSet.singleton (This Numeral) langDimensions :: Lang -> [Some Dimension] langDimensions AR = ARDimensions.allDimensions 
+langDimensions BG = BGDimensions.allDimensions
 langDimensions CS = CSDimensions.allDimensions
 langDimensions DA = DADimensions.allDimensions
 langDimensions DE = DEDimensions.allDimensions
diff --git a/Duckling/Dimensions/BG.hs b/Duckling/Dimensions/BG.hs
new file mode 100644
index 000000000..755888e87
--- /dev/null
+++ b/Duckling/Dimensions/BG.hs
@@ -0,0 +1,17 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+
+module Duckling.Dimensions.BG
+  ( allDimensions
+  ) where
+
+import Duckling.Dimensions.Types
+
+-- | Dimensions available for Bulgarian (BG); only 'Numeral' so far.
+allDimensions :: [Some Dimension]
+allDimensions = [This Numeral]
diff --git a/Duckling/Lang.hs b/Duckling/Lang.hs
index f4db4d864..a7e08c9c4 100644
--- a/Duckling/Lang.hs
+++ b/Duckling/Lang.hs
@@ -23,6 +23,7 @@ import qualified TextShow as TS
 
 data Lang
   = AR
+  | BG
   | CS
   | DA
   | DE
diff --git a/Duckling/Numeral/BG/Corpus.hs b/Duckling/Numeral/BG/Corpus.hs
new file mode 100644
index 000000000..6cb299cb6
--- /dev/null
+++ b/Duckling/Numeral/BG/Corpus.hs
@@ -0,0 +1,68 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Duckling.Numeral.BG.Corpus
+  ( corpus ) where
+
+import Data.String
+import Prelude
+
+import Duckling.Lang
+import Duckling.Numeral.Types
+import Duckling.Resolve
+import Duckling.Testing.Types
+
+-- | Bulgarian numeral corpus: 'allExamples' paired with the shared
+-- 'testContext' whose language is set to 'BG'.
+corpus :: Corpus
+corpus = (testContext {lang = BG}, allExamples)
+
+-- | Input phrases grouped by the numeral value each of them must resolve to.
+allExamples :: [Example]
+allExamples = mconcat
+  [ examples (NumeralValue 0)
+      ["0", "нула"]
+  , examples (NumeralValue 1)
+      ["1", "един", "една", "едно"]
+  , examples (NumeralValue 2)
+      ["2", "02", "две", "два"]
+  , examples (NumeralValue 3)
+      ["3", "03", "три"]
+  , examples (NumeralValue 4)
+      ["4", "04", "четири"]
+  , examples (NumeralValue 5)
+      ["5", "05", "пет"]
+  , examples (NumeralValue 14)
+      ["14", "четиринадесет", "четиринайсет"]
+  , examples (NumeralValue 15)
+      ["15", "петнадесет", "петнайсет"]
+  , examples (NumeralValue 16)
+      ["16", "шестнадесет", "шестнайсет"]
+  , examples (NumeralValue 17)
+      ["17", "седемнадесет", "седемнайсет"]
+  , examples (NumeralValue 18)
+      ["18", "осемнадесет", "осемнайсет"]
+  , examples (NumeralValue 33)
+      ["33", "0033", "тридесет и три"]
+  , examples (NumeralValue 525)
+      ["525", "петстотин двадесет и пет"]
+  , examples (NumeralValue 1.1)
+      ["1.1", "1.10", "01.10", "1 цяло и 1", "едно цяло и едно"]
+  , examples (NumeralValue 0.77)
+      ["0.77", ".77"]
+  , examples (NumeralValue 100000)
+      ["100000", "100к", "100К"]
+  , examples (NumeralValue 3000000)
+      ["3М", "3000К", "3000000", "3,000,000"]
+  , examples (NumeralValue 1200000)
+      ["1200000", "1.2М", "1200К", ".0012Г"]
+  , examples (NumeralValue (-1200000))
+      ["-1200000", "минус 1200000", "-1.2М", "-1200К", "-.0012Г"]
+  ]
diff --git a/Duckling/Numeral/BG/Rules.hs b/Duckling/Numeral/BG/Rules.hs
new file mode 100644
index 000000000..5ac30b5dc
--- /dev/null
+++ 
b/Duckling/Numeral/BG/Rules.hs
@@ -0,0 +1,305 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NoRebindableSyntax #-}
+
+module Duckling.Numeral.BG.Rules
+  ( rules ) where
+
+import Data.HashMap.Strict (HashMap)
+import Data.Maybe
+import Data.String
+import Data.Text (Text)
+import Prelude
+import qualified Data.HashMap.Strict as HashMap
+import qualified Data.Text as Text
+
+import Duckling.Dimensions.Types
+import Duckling.Numeral.Helpers
+import Duckling.Numeral.Types (NumeralData (..))
+import Duckling.Regex.Types
+import Duckling.Types
+import qualified Duckling.Numeral.Types as TNumeral
+
+-- | Digit-only integers of up to 18 digits, e.g. "33" or "0033".
+ruleIntegers :: Rule
+ruleIntegers = Rule
+  { name = "integer (numeric)"
+  , pattern =
+    [ regex "(\\d{1,18})"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) -> do
+        v <- parseInt match
+        integer $ toInteger v
+      _ -> Nothing
+  }
+
+-- | Spelled-out Bulgarian numerals from 0 to 19, keyed by lower-case word.
+-- The gender variants (един/една/едно, два/две) and both the @-надесет@ and
+-- @-найсет@ spellings of 11..19 are included.
+zeroNineteenMap :: HashMap Text Integer
+zeroNineteenMap = HashMap.fromList
+  [ ( "нула", 0 )
+  , ( "един", 1 )
+  , ( "една", 1 )
+  , ( "едно", 1 )
+  , ( "два", 2 )
+  , ( "две", 2 )
+  , ( "три", 3 )
+  , ( "четири", 4 )
+  , ( "пет", 5 )
+  , ( "шест", 6 )
+  , ( "седем", 7 )
+  , ( "осем", 8 )
+  , ( "девет", 9 )
+  , ( "десет", 10 )
+  , ( "единадесет", 11 )
+  , ( "единайсет", 11 )
+  , ( "дванадесет", 12 )
+  , ( "дванайсет", 12 )
+  , ( "тринадесет", 13 )
+  , ( "тринайсет", 13 )
+  , ( "четиринадесет", 14 )
+  , ( "четиринайсет", 14 )
+  , ( "петнадесет", 15 )
+  , ( "петнайсет", 15 )
+  , ( "шестнадесет", 16 )
+  , ( "шестнайсет", 16 )
+  , ( "седемнадесет", 17 )
+  , ( "седемнайсет", 17 )
+  , ( "осемнадесет", 18 )
+  , ( "осемнайсет", 18 )
+  , ( "деветнадесет", 19 )
+  , ( "деветнайсет", 19 )
+  ]
+
+-- | Spelled-out numbers from 0 to 19, resolved through 'zeroNineteenMap'.
+ruleToNineteen :: Rule
+ruleToNineteen = Rule
+  { name = "number (0..19)"
+  , pattern =
+    -- The teens are listed before the units, so "единадесет" is tried
+    -- before its prefix "един".
+    [ regex "(нула|едина(де|й)сет|двана(де|й)сет|трина(де|й)сет|четирина(де|й)сет|петна(де|й)сет|шестна(де|й)сет|седемна(де|й)сет|осемна(де|й)сет|деветна(де|й)сет|един|една|едно|два|две|три|четири|пет|шест|седем|осем|девет|десет)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) ->
+        HashMap.lookup (Text.toLower match) zeroNineteenMap >>= integer
+      _ -> Nothing
+  }
+
+-- | Round tens from 20 to 90, written as a unit word followed by "десет".
+ruleTens :: Rule
+ruleTens = Rule
+  { name = "integer (20..90)"
+  , pattern =
+    [ regex "((два|три|четири|пет|шест|седем|осем|девет)десет)"
+    ]
+  , prod = \tokens -> case tokens of
+      -- Group 2 is the unit word; the result is its value times ten.
+      (Token RegexMatch (GroupMatch (_:match:_)):_) -> do
+        x <- HashMap.lookup (Text.toLower match) zeroNineteenMap
+        integer $ x * 10
+      _ -> Nothing
+  }
+
+-- | Thousand, million and billion with their endings
+-- (хиляда/хиляди, милион/милиона/милиони, милиард/милиарда/милиарди).
+rulePowersOfTen :: Rule
+rulePowersOfTen = Rule
+  { name = "powers of tens"
+  , pattern =
+    [ regex "(хиляд(а|и)|милион(а|и)?|милиард(а|и)?)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) ->
+        powerOfTen $ Text.toLower match
+      _ -> Nothing
+  }
+  where
+    -- The first group is the whole word including its ending, so dispatch
+    -- on the stem. Testing for equality with the stem would reject every
+    -- inflected form, and "хиляда"/"хиляди" always carry an ending.
+    powerOfTen word
+      | "хиляд" `Text.isPrefixOf` word = power 1e3 3
+      | "милион" `Text.isPrefixOf` word = power 1e6 6
+      | "милиард" `Text.isPrefixOf` word = power 1e9 9
+      | otherwise = Nothing
+    power value grain = double value >>= withGrain grain >>= withMultipliable
+
+-- | Round tens joined to a following number by "и", e.g. "тридесет и три".
+ruleCompositeTens :: Rule
+ruleCompositeTens = Rule
+  { name = "integer 21..99"
+  , pattern =
+    [ oneOf [20, 30..90]
+    , regex "и"
+    , numberBetween 1 10
+    ]
+  , prod = \tokens -> case tokens of
+      (Token Numeral (NumeralData {TNumeral.value = tens}):
+       _:
+       Token Numeral (NumeralData {TNumeral.value = units}):
+       _) -> double $ tens + units
+      _ -> Nothing
+  }
+
+-- | Round hundreds from 100 to 900: "сто", "двеста", "триста", and a unit
+-- word followed by "стотин" for 400..900.
+ruleHundreds :: Rule
+ruleHundreds = Rule
+  { name = "integer (100..900)"
+  , pattern =
+    [ regex "(сто|двеста|триста|(четири|пет|шест|седем|осем|девет)стотин)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) -> case Text.toLower match of
+        "сто" -> integer 100
+        "двеста" -> integer 200
+        "триста" -> integer 300
+        "четиристотин" -> integer 400
+        "петстотин" -> integer 500
+        "шестстотин" -> integer 600
+        "седемстотин" -> integer 700
+        "осемстотин" -> integer 800
+        "деветстотин" -> integer 900
+        _ -> Nothing
+      _ -> Nothing
+  }
+
+-- | Round hundreds followed directly by another number, e.g.
+-- "петстотин двадесет и пет" = 525.
+ruleCompositeHundreds :: Rule
+ruleCompositeHundreds = Rule
+  { name = "integer 101..999"
+  , pattern =
+    -- Starts at 100 so that "сто ..." composes too, as the rule name
+    -- (101..999) promises.
+    [ oneOf [100, 200..900]
+    , numberBetween 1 100
+    ]
+  , prod = \tokens -> case tokens of
+      (Token Numeral (NumeralData {TNumeral.value = hundreds}):
+       Token Numeral (NumeralData {TNumeral.value = tens}):
+       _) -> double $ hundreds + tens
+      _ -> Nothing
+  }
+
+-- | Spelled-out decimal separator "цяло и" between two numbers, e.g.
+-- "едно цяло и едно" = 1.1. The number after it must carry no grain.
+ruleDotSpelledOut :: Rule
+ruleDotSpelledOut = Rule
+  { name = "one point 2"
+  , pattern =
+    [ dimension Numeral
+    , regex "цяло и"
+    , numberWith TNumeral.grain isNothing
+    ]
+  , prod = \tokens -> case tokens of
+      (Token Numeral nd1:_:Token Numeral nd2:_) ->
+        double $ TNumeral.value nd1 + decimalsToDouble (TNumeral.value nd2)
+      _ -> Nothing
+  }
+
+-- | Decimal numbers written with a dot; the digits before the dot are
+-- optional, e.g. "0.77" or ".77".
+ruleDecimals :: Rule
+ruleDecimals = Rule
+  { name = "decimal number"
+  , pattern =
+    [ regex "(\\d*\\.\\d+)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) -> parseDecimal True match
+      _ -> Nothing
+  }
+
+-- | Fractions written with digits as @numerator/denominator@.
+ruleFractions :: Rule
+ruleFractions = Rule
+  { name = "fractional number"
+  , pattern =
+    [ regex "(\\d+)/(\\d+)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (numerator:denominator:_)):_) -> do
+        n <- parseDecimal False numerator
+        d <- parseDecimal False denominator
+        divide n d
+      _ -> Nothing
+  }
+
+-- | Numbers with commas between groups of three digits, e.g. "3,000,000".
+-- The commas are removed before the text is parsed.
+ruleCommas :: Rule
+ruleCommas = Rule
+  { name = "comma-separated numbers"
+  , pattern =
+    [ regex "(\\d+(,\\d\\d\\d)+(\\.\\d+)?)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token RegexMatch (GroupMatch (match:_)):_) ->
+        parseDouble (Text.replace (Text.singleton ',') Text.empty match) >>= double
+      _ -> Nothing
+  }
+
+-- | Suffixes к, м, г (either case) multiplying the preceding number by
+-- 1e3, 1e6 and 1e9 respectively, e.g. "1.2М" = 1200000.
+ruleSuffixes :: Rule
+ruleSuffixes = Rule
+  { name = "suffixes (K,M,G))"
+  , pattern =
+    [ dimension Numeral
+    , regex "((к|м|г)|(К|М|Г))(?=[\\W$\x20ac\x00a2\x00a3]|$)"
+    ]
+  , prod = \tokens -> case tokens of
+      (Token Numeral nd : Token RegexMatch (GroupMatch (match : _)):_) -> do
+        -- The match is lower-cased first, so only the lower-case letters
+        -- need a branch.
+        x <- case Text.toLower match of
+          "к" -> Just 1e3
+          "м" -> Just 1e6
+          "г" -> Just 1e9
+          _ -> Nothing
+        double $ TNumeral.value nd * x
+      _ -> Nothing
+  }
+
+-- | Negation of a number by a leading "-" or the word "минус".
+ruleNegative :: Rule
+ruleNegative = Rule
+  { name = "negative numbers"
+  , pattern =
+    [ regex "-|минус\\s?"
+    , dimension Numeral
+    ]
+  , prod = \tokens -> case tokens of
+      (_:Token Numeral nd:_) -> double (TNumeral.value nd * (-1))
+      _ -> Nothing
+  }
+
+-- | Every Bulgarian numeral rule defined in this module.
+rules :: [Rule]
+rules =
+  [ ruleIntegers
+  , ruleToNineteen
+  , ruleTens
+  , rulePowersOfTen
+  , ruleCompositeTens
+  , ruleHundreds
+  , ruleCompositeHundreds
+  , ruleDotSpelledOut
+  , ruleDecimals
+  , ruleFractions
+  , ruleCommas
+  , ruleSuffixes
+  , ruleNegative
+  ]
diff --git a/Duckling/Ranking/Classifiers.hs b/Duckling/Ranking/Classifiers.hs
index 86ef4d2d2..213aa04bd 100644
--- a/Duckling/Ranking/Classifiers.hs
+++ b/Duckling/Ranking/Classifiers.hs
@@ -12,6 +12,7 @@ module Duckling.Ranking.Classifiers
 
 import Duckling.Lang
 import qualified Duckling.Ranking.Classifiers.AR as ARClassifiers
+import qualified Duckling.Ranking.Classifiers.BG as BGClassifiers
 import qualified Duckling.Ranking.Classifiers.CS as CSClassifiers
 import qualified Duckling.Ranking.Classifiers.DA as DAClassifiers
 import qualified Duckling.Ranking.Classifiers.DE as DEClassifiers
@@ -42,6 +43,7 @@ import Duckling.Ranking.Types
 
 classifiers :: Lang -> Classifiers
 classifiers AR = ARClassifiers.classifiers
+classifiers BG = BGClassifiers.classifiers
 classifiers CS = CSClassifiers.classifiers
 classifiers DA = DAClassifiers.classifiers
 classifiers DE = DEClassifiers.classifiers
diff --git a/Duckling/Ranking/Classifiers/BG.hs b/Duckling/Ranking/Classifiers/BG.hs
new file mode 100644
index 000000000..a92f2618d
--- /dev/null
+++ b/Duckling/Ranking/Classifiers/BG.hs
@@ -0,0 
+1,22 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+-----------------------------------------------------------------
+-- Auto-generated by regenClassifiers
+--
+-- DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+-- @generated
+-----------------------------------------------------------------
+{-# LANGUAGE OverloadedStrings #-}
+module Duckling.Ranking.Classifiers.BG (classifiers) where
+import Prelude
+import Duckling.Ranking.Types
+import qualified Data.HashMap.Strict as HashMap
+import Data.String
+
+classifiers :: Classifiers
+classifiers = HashMap.fromList []
\ No newline at end of file
diff --git a/Duckling/Rules.hs b/Duckling/Rules.hs
index d6bc3ec7d..3c6a02968 100644
--- a/Duckling/Rules.hs
+++ b/Duckling/Rules.hs
@@ -23,6 +23,7 @@ import Duckling.Dimensions.Types
 import Duckling.Lang
 import qualified Duckling.Rules.AR as ARRules
 import qualified Duckling.Rules.Common as CommonRules
+import qualified Duckling.Rules.BG as BGRules
 import qualified Duckling.Rules.CS as CSRules
 import qualified Duckling.Rules.DA as DARules
 import qualified Duckling.Rules.DE as DERules
@@ -69,6 +70,7 @@ rulesFor' lang dim = CommonRules.rules dim ++ langRules lang dim
 
 langRules :: Lang -> Some Dimension -> [Rule]
 langRules AR = ARRules.rules
+langRules BG = BGRules.rules
 langRules CS = CSRules.rules
 langRules DA = DARules.rules
 langRules DE = DERules.rules
diff --git a/Duckling/Rules/BG.hs b/Duckling/Rules/BG.hs
new file mode 100644
index 000000000..d61002828
--- /dev/null
+++ b/Duckling/Rules/BG.hs
@@ -0,0 +1,36 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Duckling.Rules.BG
+  ( rules
+  ) where
+
+import Duckling.Dimensions.Types
+import Duckling.Types
+import qualified Duckling.Numeral.BG.Rules as Numeral
+
+-- | Bulgarian rules per dimension: 'Numeral' has rules, all others are empty.
+rules :: Some Dimension -> [Rule]
+rules dim = case dim of
+  This AmountOfMoney -> []
+  This Distance -> []
+  This Duration -> []
+  This Email -> []
+  This Numeral -> Numeral.rules
+  This Ordinal -> []
+  This PhoneNumber -> []
+  This Quantity -> []
+  This RegexMatch -> []
+  This Temperature -> []
+  This Time -> []
+  This TimeGrain -> []
+  This Url -> []
+  This Volume -> []
diff --git a/duckling.cabal b/duckling.cabal
index 6fea68c0f..d614716a9 100644
--- a/duckling.cabal
+++ b/duckling.cabal
@@ -41,6 +41,7 @@ library
                      , Duckling.Rules
                      , Duckling.Rules.Common
                      , Duckling.Rules.AR
+                     , Duckling.Rules.BG
                      , Duckling.Rules.CS
                      , Duckling.Rules.DA
                      , Duckling.Rules.DE
@@ -75,6 +76,7 @@ library
                      , Duckling.Ranking.Rank
                      , Duckling.Ranking.Classifiers
                      , Duckling.Ranking.Classifiers.AR
+                     , Duckling.Ranking.Classifiers.BG
                      , Duckling.Ranking.Classifiers.CS
                      , Duckling.Ranking.Classifiers.DA
                      , Duckling.Ranking.Classifiers.DE
@@ -108,6 +110,7 @@ library
                      , Duckling.Dimensions.Common
                      , Duckling.Dimensions.Types
                      , Duckling.Dimensions.AR
+                     , Duckling.Dimensions.BG
                      , Duckling.Dimensions.CS
                      , Duckling.Dimensions.DA
                      , Duckling.Dimensions.DE
@@ -236,6 +239,8 @@ library
                      -- Numeral
                      , Duckling.Numeral.AR.Corpus
                      , Duckling.Numeral.AR.Rules
+                     , Duckling.Numeral.BG.Corpus
+                     , Duckling.Numeral.BG.Rules
                      , Duckling.Numeral.CS.Corpus
                      , Duckling.Numeral.CS.Rules
                      , Duckling.Numeral.DA.Corpus
@@ -588,6 +593,7 @@ test-suite duckling-test
 
                      -- Numeral
                      , Duckling.Numeral.AR.Tests
+                     , 
Duckling.Numeral.BG.Tests
                      , Duckling.Numeral.CS.Tests
                      , Duckling.Numeral.DA.Tests
                      , Duckling.Numeral.DE.Tests
diff --git a/exe/Duckling/Ranking/Generate.hs b/exe/Duckling/Ranking/Generate.hs
index 0ecfb6cd5..02d7cc6e7 100644
--- a/exe/Duckling/Ranking/Generate.hs
+++ b/exe/Duckling/Ranking/Generate.hs
@@ -69,6 +69,7 @@ regenClassifiers lang = do
     -- | The training set (corpus)
     trainSet = case lang of
       AR -> (testContext, [])
+      BG -> (testContext, [])
       CS -> (testContext, [])
       DA -> DATime.corpus
       DE -> DETime.corpus
diff --git a/tests/Duckling/Numeral/BG/Tests.hs b/tests/Duckling/Numeral/BG/Tests.hs
new file mode 100644
index 000000000..0613fc3a7
--- /dev/null
+++ b/tests/Duckling/Numeral/BG/Tests.hs
@@ -0,0 +1,25 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+
+module Duckling.Numeral.BG.Tests
+  ( tests ) where
+
+import Prelude
+import Data.String
+import Test.Tasty
+
+import Duckling.Dimensions.Types
+import Duckling.Numeral.BG.Corpus
+import Duckling.Testing.Asserts
+
+-- | Test group for Bulgarian numerals, holding one corpus test built from
+-- the BG numeral 'corpus' for the 'Numeral' dimension.
+tests :: TestTree
+tests = testGroup "BG Tests" [numeralCorpusTest]
+  where
+    numeralCorpusTest = makeCorpusTest [This Numeral] corpus
diff --git a/tests/Duckling/Numeral/Tests.hs b/tests/Duckling/Numeral/Tests.hs
index e8e0a315c..f5db59cd8 100644
--- a/tests/Duckling/Numeral/Tests.hs
+++ b/tests/Duckling/Numeral/Tests.hs
@@ -13,6 +13,7 @@ import Prelude
 import Test.Tasty
 
 import qualified Duckling.Numeral.AR.Tests as AR
+import qualified Duckling.Numeral.BG.Tests as BG
 import qualified Duckling.Numeral.CS.Tests as CS
 import qualified Duckling.Numeral.DA.Tests as DA
 import qualified Duckling.Numeral.DE.Tests as DE
@@ -43,6 +44,7 @@ import qualified Duckling.Numeral.ZH.Tests as ZH
 tests :: TestTree
 tests = testGroup "Numeral Tests"
   [ AR.tests
+  , BG.tests
   , CS.tests
   , DA.tests
   , DE.tests