Internationalized domain names support #33

ilyaglow · 2020-03-30T16:46:11Z

Hi,

At the moment the library can't handle internationalized domain names (IDNs); `defang` returns such input unchanged:

>>> import ioc_fanger
>>> ioc_fanger.defang("вирус.рф")
'вирус.рф'

The test case:

diff --git a/tests/test_ioc_fanger.py b/tests/test_ioc_fanger.py
index edc95ce..df4aa8b 100644
--- a/tests/test_ioc_fanger.py
+++ b/tests/test_ioc_fanger.py
@@ -378,3 +378,16 @@ def test_debug():
     # make sure using debug still works properly
     s = '192[.]168[.]4[.]2'
     assert ioc_fanger.fang(s, debug=True) == '192.168.4.2'
+
+def test_idns():
+    s = 'вирус.рф'
+    assert ioc_fanger.defang(s) == 'вирус[.]рф'
+
+    s = 'вирус[.]рф'
+    assert ioc_fanger.fang(s) == 'вирус.рф'
+
+    s = '名がドメイン.中国'
+    assert ioc_fanger.defang(s) == '名がドメイン[.]中国'
+
+    s = '名がドメイン[.]中国'
+    assert ioc_fanger.fang(s) == '名がドメイン.中国'

And here's my naive approach to fix it:

diff --git a/ioc_fanger/defang.json b/ioc_fanger/defang.json
index d860481..a1cf0a0 100644
--- a/ioc_fanger/defang.json
+++ b/ioc_fanger/defang.json
@@ -1,6 +1,6 @@
 [
     {
-        "find": "(\\.)[a-zA-Z0-9-]",
+        "find": "(\\.).+",
         "replace": "[.]"
     },
     {
diff --git a/ioc_fanger/grammars.py b/ioc_fanger/grammars.py
index 6870e49..4e3379a 100644
--- a/ioc_fanger/grammars.py
+++ b/ioc_fanger/grammars.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 from pyparsing import alphas, alphanums
+from pyparsing import pyparsing_unicode
 from pyparsing import (
     CaselessLiteral,
     Combine,
@@ -19,8 +20,8 @@ from pyparsing import (
     White,
 )

-alphanum_word_start = WordStart(wordChars=alphanums)
-alphanum_word_end = WordEnd(wordChars=alphanums)
+alphanum_word_start = WordStart(wordChars=pyparsing_unicode.alphanums+alphanums)
+alphanum_word_end = WordEnd(wordChars=pyparsing_unicode.alphanums+alphanums)

 uppercase_word = Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
 not_uppercase_word_regex = Regex("[^A-Z]")
@@ -428,12 +429,12 @@ odd_url_scheme_form = alphanum_word_start + Or(
     [
         Combine(
             Word("Hh")
-            + Word(alphas, exact=2).addParseAction(replaceWith("tt"))
+            + Word(pyparsing_unicode.alphas+alphas, exact=2).addParseAction(replaceWith("tt"))
             + Word("Pp")
             + Optional(Word("Ss"))
             + Word(":")
         ),
-        Combine(Word(alphas, exact=5) + "://").addParseAction(replaceWith("https://")),
-        Combine(Word(alphas, exact=4) + "://").addParseAction(replaceWith("http://")),
+        Combine(Word(pyparsing_unicode.alphas+alphas, exact=5) + "://").addParseAction(replaceWith("https://")),
+        Combine(Word(pyparsing_unicode.alphas+alphas, exact=4) + "://").addParseAction(replaceWith("http://")),
     ]
 )

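I'm mostly worried about the defang.json regexp change and the false-positive cases it may bring, since `.+` accepts any character after the dot rather than only the characters allowed in a domain label.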

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Internationalized domain names support #33

Internationalized domain names support #33

ilyaglow commented Mar 30, 2020

Internationalized domain names support #33

Internationalized domain names support #33

Comments

ilyaglow commented Mar 30, 2020