From bbb49a4d92a10158d186c80629747a2f2ebe05b9 Mon Sep 17 00:00:00 2001
From: David Harsha
Date: Wed, 14 Jun 2023 11:32:52 -0700
Subject: [PATCH] Better email validation

The existing regex doesn't support spaces in quotes and it seems better
to match the rfc as much as possible.

I built the regex using `Mailbox` in [rfc5321][0] and then added
non-ascii support as described in [rfc6531][1] for `idn-email`. Regular
`email` uses the same check with an `ascii_only?` guard to filter to
ascii.

For `Domain` and `address-literal`, I'm using the existing checks
because writing those as regular expressions seems hard and I believe
the behavior matches the specification.

I pulled the test cases from json-schema-test-suite for draft 2020-12,
which is how I caught the problem.

[0]: https://datatracker.ietf.org/doc/html/rfc5321#section-4.1.2
[1]: https://datatracker.ietf.org/doc/html/rfc6531#section-3.3
---
 lib/json_schemer.rb              |  1 +
 lib/json_schemer/format.rb       |  9 +----
 lib/json_schemer/format/email.rb | 60 ++++++++++++++++++++++++++++++++
 test/format_test.rb              | 25 ++++++++++++++
 4 files changed, 87 insertions(+), 8 deletions(-)
 create mode 100644 lib/json_schemer/format/email.rb

diff --git a/lib/json_schemer.rb b/lib/json_schemer.rb
index 1d11c94c..6f81e274 100644
--- a/lib/json_schemer.rb
+++ b/lib/json_schemer.rb
@@ -16,6 +16,7 @@
 require 'json_schemer/version'
 require 'json_schemer/format/hostname'
 require 'json_schemer/format/uri_template'
+require 'json_schemer/format/email'
 require 'json_schemer/format'
 require 'json_schemer/errors'
 require 'json_schemer/cached_resolver'
diff --git a/lib/json_schemer/format.rb b/lib/json_schemer/format.rb
index e087ea34..1e638465 100644
--- a/lib/json_schemer/format.rb
+++ b/lib/json_schemer/format.rb
@@ -1,11 +1,10 @@
 # frozen_string_literal: true
 module JSONSchemer
   module Format
+    include Email
     include Hostname
     include URITemplate
 
-    # this is no good
-    EMAIL_REGEX = /\A[^@\s]+@([\p{L}\d-]+\.)+[\p{L}\d\-]{2,}\z/i.freeze
     JSON_POINTER_REGEX_STRING = '(\/([^~\/]|~[01])*)*'
     JSON_POINTER_REGEX = /\A#{JSON_POINTER_REGEX_STRING}\z/.freeze
     RELATIVE_JSON_POINTER_REGEX = /\A(0|[1-9]\d*)(#|#{JSON_POINTER_REGEX_STRING})?\z/.freeze
@@ -72,12 +71,6 @@ def valid_date_time?(data)
       false
     end
 
-    def valid_email?(data)
-      return false unless EMAIL_REGEX.match?(data)
-      local, _domain = data.partition('@')
-      !local.start_with?('.') && !local.end_with?('.') && !local.include?('..')
-    end
-
     def valid_ip?(data, family)
       IPAddr.new(data, family)
       IP_REGEX.match?(data)
diff --git a/lib/json_schemer/format/email.rb b/lib/json_schemer/format/email.rb
new file mode 100644
index 00000000..253762d4
--- /dev/null
+++ b/lib/json_schemer/format/email.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module JSONSchemer
+  module Format
+    module Email
+      # https://datatracker.ietf.org/doc/html/rfc6531#section-3.3
+      # I think this is the same as "UTF8-non-ascii"? (https://datatracker.ietf.org/doc/html/rfc6532#section-3.1)
+      UTF8_NON_ASCII = '[^[:ascii:]]'
+      # https://datatracker.ietf.org/doc/html/rfc5321#section-4.1.2
+      A_TEXT = "([\\w!#$%&'*+\\-/=?\\^`{|}~]|#{UTF8_NON_ASCII})"              # atext                = ALPHA / DIGIT / ; Printable US-ASCII
+                                                                              #                        "!" / "#" /     ; characters not including
+                                                                              #                        "$" / "%" /     ; specials. Used for atoms.
+                                                                              #                        "&" / "'" /
+                                                                              #                        "*" / "+" /
+                                                                              #                        "-" / "/" /
+                                                                              #                        "=" / "?" /
+                                                                              #                        "^" / "_" /
+                                                                              #                        "`" / "{" /
+                                                                              #                        "|" / "}" /
+                                                                              #                        "~"
+      Q_TEXT_SMTP = "([\\x20-\\x21\\x23-\\x5B\\x5D-\\x7E]|#{UTF8_NON_ASCII})" # qtextSMTP            = %d32-33 / %d35-91 / %d93-126
+                                                                              #                      ; i.e., within a quoted string, any
+                                                                              #                      ; ASCII graphic or space is permitted
+                                                                              #                      ; without blackslash-quoting except
+                                                                              #                      ; double-quote and the backslash itself.
+      QUOTED_PAIR_SMTP = '\x5C[\x20-\x7E]'                                    # quoted-pairSMTP      = %d92 %d32-126
+                                                                              #                      ; i.e., backslash followed by any ASCII
+                                                                              #                      ; graphic (including itself) or SPace
+      Q_CONTENT_SMTP = "#{Q_TEXT_SMTP}|#{QUOTED_PAIR_SMTP}"                   # QcontentSMTP         = qtextSMTP / quoted-pairSMTP
+      QUOTED_STRING = "\"(#{Q_CONTENT_SMTP})*\""                              # Quoted-string        = DQUOTE *QcontentSMTP DQUOTE
+      ATOM = "#{A_TEXT}+"                                                     # Atom                 = 1*atext
+      DOT_STRING = "#{ATOM}(\\.#{ATOM})*"                                     # Dot-string           = Atom *("."  Atom)
+      LOCAL_PART = "#{DOT_STRING}|#{QUOTED_STRING}"                           # Local-part           = Dot-string / Quoted-string
+                                                                              #                      ; MAY be case-sensitive
+                                                                              # IPv4-address-literal = Snum 3("."  Snum)
+      # using `valid_ip?` to check ip addresses because it's complicated.     # IPv6-address-literal = "IPv6:" IPv6-addr
+      ADDRESS_LITERAL = '\[(IPv6:(?<ipv6>[\h:]+)|(?<ipv4>[\d.]+))\]'          # address-literal      = "[" ( IPv4-address-literal /
+                                                                              #                        IPv6-address-literal /
+                                                                              #                        General-address-literal ) "]"
+                                                                              #                      ; See Section 4.1.3
+      # using `valid_hostname?` to check domain because it's complicated
+      MAILBOX = "(#{LOCAL_PART})@(#{ADDRESS_LITERAL}|(?<domain>.+))"          # Mailbox              = Local-part "@" ( Domain / address-literal )
+      EMAIL_REGEX = /\A#{MAILBOX}\z/
+
+      # Returns false unless `data` matches EMAIL_REGEX (the Mailbox grammar above).
+      # The regex only captures the part after "@"; a bracketed `ipv4`/`ipv6`
+      # literal is then checked with `valid_ip?`, and any other `domain`
+      # capture is checked with `valid_hostname?`.
+      def valid_email?(data)
+        return false unless match = EMAIL_REGEX.match(data)
+        if ipv4 = match.named_captures.fetch('ipv4')
+          valid_ip?(ipv4, Socket::AF_INET)
+        elsif ipv6 = match.named_captures.fetch('ipv6')
+          valid_ip?(ipv6, Socket::AF_INET6)
+        else
+          valid_hostname?(match.named_captures.fetch('domain'))
+        end
+      end
+    end
+  end
+end
diff --git a/test/format_test.rb b/test/format_test.rb
index 858dc56a..c098d072 100644
--- a/test/format_test.rb
+++ b/test/format_test.rb
@@ -73,4 +73,29 @@ def test_it_allows_callable_custom_format
     assert(schema.valid?('valid'))
     refute(schema.valid?('invalid'))
   end
+
+  def test_email_format
+    schema = JSONSchemer.schema({ 'format' => 'email' })
+
+    {
+      "joe.bloggs@example.com" => true,
+      "2962" => false,
+      "te~st@example.com" => true,
+      "~test@example.com" => true,
+      "test~@example.com" => true,
+      "\"joe bloggs\"@example.com" => true,
+      "\"joe..bloggs\"@example.com" => true,
+      "\"joe@bloggs\"@example.com" => true,
+      "joe.bloggs@[127.0.0.1]" => true,
+      "joe.bloggs@[IPv6:::1]" => true,
+      ".test@example.com" => false,
+      "test.@example.com" => false,
+      "te.s.t@example.com" => true,
+      "te..st@example.com" => false,
+      "joe.bloggs@invalid=domain.com" => false,
+      "joe.bloggs@[127.0.0.300]" => false
+    }.each do |email, valid|
+      assert_equal(valid, schema.valid?(email))
+    end
+  end
 end