diff --git a/gulpfile.js b/gulpfile.js index b0f3d656..2ef331da 100644 --- a/gulpfile.js +++ b/gulpfile.js @@ -293,7 +293,7 @@ function buildSrcMinifyUmdTask() { async function buildSrcCheckMinifiedSizeTask() { const stats = await fs.stat( './dist/Autolinker.min.js' ); const sizeInKb = stats.size / 1000; - const maxExpectedSizeInKb = 44; + const maxExpectedSizeInKb = 46; if( sizeInKb > maxExpectedSizeInKb ) { throw new Error( ` diff --git a/src/htmlParser/parse-html.ts b/src/htmlParser/parse-html.ts index 410ef9f3..fa2e9b56 100644 --- a/src/htmlParser/parse-html.ts +++ b/src/htmlParser/parse-html.ts @@ -1,4 +1,6 @@ import { State } from './state'; +import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib'; +import { throwUnhandledCaseError } from '../utils'; // For debugging: search for other "For debugging" lines // import CliTable from 'cli-table'; @@ -61,12 +63,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm onComment: ( offset: number ) => void; onDoctype: ( offset: number ) => void; } ) { - const letterRe = /[A-Za-z]/, - digitRe = /[0-9]/, - whitespaceRe = /\s/, - quoteRe = /['"]/, - controlCharsRe = /[\x00-\x1F\x7F]/, // control chars (0-31), and the backspace char (127) - noCurrentTag = new CurrentTag(); + const noCurrentTag = new CurrentTag(); let charIdx = 0, len = html.length, @@ -112,7 +109,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm case State.Doctype: stateDoctype( char ); break; default: - throwUnhandledStateError( state ); + throwUnhandledCaseError( state ); } // For debugging: search for other "For debugging" lines @@ -131,14 +128,6 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm //console.log( '\n' + table.toString() ); - /** - * Function that should never be called but is used to check that every - * enum value is handled using TypeScript's 'never' type. 
- */ - function throwUnhandledStateError( state: never ) { - throw new Error( 'Unhandled State' ) - } - // Called when non-tags are being read (i.e. the text around HTML †ags) // https://www.w3.org/TR/html51/syntax.html#data-state diff --git a/src/matcher/email-matcher.ts b/src/matcher/email-matcher.ts index 55ddae57..2899bb24 100644 --- a/src/matcher/email-matcher.ts +++ b/src/matcher/email-matcher.ts @@ -1,8 +1,11 @@ import { Matcher } from "./matcher"; -import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib"; -import { tldRegex } from "./tld-regex"; +import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib"; import { EmailMatch } from "../match/email-match"; import { Match } from "../match/match"; +import { throwUnhandledCaseError } from '../utils'; + +// For debugging: search for other "For debugging" lines +// import CliTable from 'cli-table'; /** * @class Autolinker.matcher.Email @@ -15,49 +18,245 @@ import { Match } from "../match/match"; export class EmailMatcher extends Matcher { /** - * The regular expression to match email addresses. Example match: - * - * person@place.com - * - * @protected - * @property {RegExp} matcherRegex + * Valid characters that can be used in the "local" part of an email address, + * i.e. 
the "name" part of "name@site.com" */ - protected matcherRegex = (function() { - var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~', - restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]', - validCharacters = alphaNumericAndMarksCharsStr + specialCharacters, - validRestrictedCharacters = validCharacters + restrictedSpecialCharacters, - emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@'); - - return new RegExp( [ - emailRegex.source, - getDomainNameStr( 1 ), - '\\.', tldRegex.source // '.com', '.net', etc - ].join( "" ), 'gi' ); - } )(); + protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` ); /** * @inheritdoc */ parseMatches( text: string ) { - let matcherRegex = this.matcherRegex, - tagBuilder = this.tagBuilder, - matches: Match[] = [], - match: RegExpExecArray | null; + const tagBuilder = this.tagBuilder, + localPartCharRegex = this.localPartCharRegex, + matches: Match[] = [], + len = text.length, + noCurrentEmailAddress = new CurrentEmailAddress(); + + let charIdx = 0, + state = State.NonEmailAddress as State, + currentEmailAddress = noCurrentEmailAddress; + + // For debugging: search for other "For debugging" lines + // const table = new CliTable( { + // head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ] + // } ); - while( ( match = matcherRegex.exec( text ) ) !== null ) { - let matchedText = match[ 0 ]; + while( charIdx < len ) { + const char = text.charAt( charIdx ); - matches.push( new EmailMatch( { - tagBuilder : tagBuilder, - matchedText : matchedText, - offset : match.index, - email : matchedText - } ) ); + // For debugging: search for other "For debugging" lines + // table.push( + // [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ] + // ); + + switch( state ) { + case State.NonEmailAddress: stateNonEmailAddress( char ); 
break; + case State.LocalPart: stateLocalPart( char ); break; + case State.LocalPartDot: stateLocalPartDot( char ); break; + case State.AtSign: stateAtSign( char ); break; + case State.DomainChar: stateDomainChar( char ); break; + case State.DomainHyphen: stateDomainHyphen( char ); break; + case State.DomainDot: stateDomainDot( char ); break; + + default: + throwUnhandledCaseError( state ); + } + + // For debugging: search for other "For debugging" lines + // table.push( + // [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ] + // ); + + charIdx++; } + // Capture any valid match at the end of the string + captureMatchIfValidAndReset(); + + // For debugging: search for other "For debugging" lines + //console.log( '\n' + table.toString() ); + return matches; + + + // Handles the state when we're not in an email address + function stateNonEmailAddress( char: string ) { + if( localPartCharRegex.test( char ) ) { + beginEmailAddress(); + + } else { + // not an email address character, continue + } + } + + + // Handles the state when we're currently in the "local part" of an + // email address (as opposed to the "domain part") + function stateLocalPart( char: string ) { + if( char === '.' ) { + state = State.LocalPartDot; + + } else if( char === '@' ) { + state = State.AtSign; + + } else if( localPartCharRegex.test( char ) ) { + // stay in the "local part" of the email address + + } else { + // not an email address character, return to "NonEmailAddress" state + resetToNonEmailAddressState(); + } + } + + + // Handles the state where we've read a '.' character in the "local part" + function stateLocalPartDot( char: string ) { + if( char === '.' ) { + // We read a second '.' 
in a row, not a valid email address + // local part + resetToNonEmailAddressState(); + + } else if( char === '@' ) { + // We read the '@' character immediately after a dot ('.'), not + // an email address + resetToNonEmailAddressState(); + + } else if( localPartCharRegex.test( char ) ) { + state = State.LocalPart; + + } else { + // Anything else, not an email address + resetToNonEmailAddressState(); + } + } + + + function stateAtSign( char: string ) { + if( domainNameCharRegex.test( char ) ) { + state = State.DomainChar; + + } else { + // Anything else, not an email address + resetToNonEmailAddressState(); + } + } + + function stateDomainChar( char: string ) { + if( char === '.' ) { + state = State.DomainDot; + + } else if( char === '-' ) { + state = State.DomainHyphen; + + } else if( domainNameCharRegex.test( char ) ) { + // Stay in the DomainChar state + + } else { + // Anything else, we potentially matched if the criteria have + // been met + captureMatchIfValidAndReset(); + } + } + + function stateDomainHyphen( char: string ) { + if( char === '-' || char === '.' ) { + // Not valid to have two hyphens ("--") or hyphen+dot ("-.") + captureMatchIfValidAndReset(); + + } else if( domainNameCharRegex.test( char ) ) { + state = State.DomainChar; + + } else { + // Anything else + captureMatchIfValidAndReset(); + } + } + + function stateDomainDot( char: string ) { + if( char === '.' || char === '-' ) { + // not valid to have two dots ("..") or dot+hyphen (".-") + captureMatchIfValidAndReset(); + + } else if( domainNameCharRegex.test( char ) ) { + state = State.DomainChar; + + // After having read a '.' 
and then a valid domain character, + // we now know that the domain part of the email is valid, and + // we have found at least a partial EmailMatch (however, the + // email address may have additional characters from this point) + currentEmailAddress = new CurrentEmailAddress( { + ...currentEmailAddress, + hasDomainDot: true + } ); + + } else { + // Anything else + captureMatchIfValidAndReset(); + } + } + + + function beginEmailAddress() { + state = State.LocalPart; + currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } ); + } + + function resetToNonEmailAddressState() { + state = State.NonEmailAddress; + currentEmailAddress = noCurrentEmailAddress + } + + + /* + * Captures the current email address as an EmailMatch if it's valid, + * and resets the state to read another email address. + */ + function captureMatchIfValidAndReset() { + if( currentEmailAddress.hasDomainDot ) { // we need at least one dot in the domain to be considered a valid email address + let emailAddress = text.slice( currentEmailAddress.idx, charIdx ); + + // If we read a '.' or '-' char that ended the email address + // (valid domain name characters, but only valid email address + // characters if they are followed by something else), strip + // it off now + if( /[-.]$/.test( emailAddress ) ){ + emailAddress = emailAddress.slice( 0, -1 ); + } + + matches.push( new EmailMatch( { + tagBuilder : tagBuilder, + matchedText : emailAddress, + offset : currentEmailAddress.idx, + email : emailAddress + } ) ); + } + + resetToNonEmailAddressState(); + } } } + + +const enum State { + NonEmailAddress = 0, + LocalPart, + LocalPartDot, + AtSign, + DomainChar, + DomainHyphen, + DomainDot +} + + +class CurrentEmailAddress { + readonly idx: number; // the index of the first character in the email address + readonly hasDomainDot: boolean; + + constructor( cfg: Partial = {} ) { + this.idx = cfg.idx !== undefined ? 
cfg.idx : -1; + this.hasDomainDot = !!cfg.hasDomainDot; + } +} \ No newline at end of file diff --git a/src/regex-lib.ts b/src/regex-lib.ts index 2e13e8a8..8a677744 100644 --- a/src/regex-lib.ts +++ b/src/regex-lib.ts @@ -6,6 +6,32 @@ * regular expressions that are shared between source files. */ +/** + * Regular expression to match upper and lowercase ASCII letters + */ +export const letterRe = /[A-Za-z]/; + +/** + * Regular expression to match ASCII digits + */ +export const digitRe = /[0-9]/; + +/** + * Regular expression to match whitespace + */ +export const whitespaceRe = /\s/; + +/** + * Regular expression to match quote characters + */ +export const quoteRe = /['"]/; + +/** + * Regular expression to match the range of ASCII control characters (0-31), and + * the delete (DEL) char (127) + */ +export const controlCharsRe = /[\x00-\x1F\x7F]/; + /** * The string form of a regular expression that would match all of the * alphabetic ("letter") chars in the unicode character set when placed in a @@ -142,3 +168,10 @@ export const getDomainNameStr = ( group: number ) => { * Ex: 'google', 'yahoo', 'some-other-company', etc. */ export const domainNameRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + '.\\-]*[' + alphaNumericAndMarksCharsStr + '\\-]' ); + + +/** + * A regular expression that is simply the character class of the characters + * that may be used in a domain name, minus the '-' or '.' + */ +export const domainNameCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}]` ); \ No newline at end of file diff --git a/src/utils.ts b/src/utils.ts index f930b869..8a8ec355 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -127,3 +127,12 @@ export function splitAndCapture( str: string, splitRegex: RegExp ) { return result; } + + +/** + * Function that should never be called but is used to check that every + * enum value is handled using TypeScript's 'never' type. 
+ */ +export function throwUnhandledCaseError( theValue: never ) { + throw new Error( `Unhandled case for value: '${theValue}'` ); +} \ No newline at end of file diff --git a/tests/autolinker-email.spec.ts b/tests/autolinker-email.spec.ts index 0eeeb460..8d223f6b 100644 --- a/tests/autolinker-email.spec.ts +++ b/tests/autolinker-email.spec.ts @@ -33,6 +33,13 @@ describe( "Autolinker Email Matching -", () => { } ); + it( "should automatically link email addresses with a period at the end of a sentence (but not include the period)", function() { + let result = autolinker.link( "Joe's email is joe@joe.com. Try emailing him" ); + + expect( result ).toBe( 'Joe\'s email is joe@joe.com. Try emailing him' ); + } ); + + it( "should automatically link email addresses with a period in the 'local part'", function() { let result = autolinker.link( "Joe's email is joe.smith@joe.com" ); @@ -88,6 +95,7 @@ describe( "Autolinker Email Matching -", () => { expect( result ).toBe( 'Hi there@stuff' ); } ); + it( "should automatically link an email address with tld matched localpart", function () { let result = autolinker.link( "My email is busueng.kim@aaa.com" ); diff --git a/tests/matcher/email-matcher.spec.ts b/tests/matcher/email-matcher.spec.ts index 7bd0a10f..282b13f2 100644 --- a/tests/matcher/email-matcher.spec.ts +++ b/tests/matcher/email-matcher.spec.ts @@ -2,19 +2,19 @@ import { EmailMatcher } from "../../src/matcher/email-matcher"; import { AnchorTagBuilder } from "../../src/anchor-tag-builder"; import { MatchChecker } from "../match/match-checker"; -describe( "Autolinker.matcher.Email", function() { +describe( "Autolinker.matcher.Email", () => { let matcher: EmailMatcher; - beforeEach( function() { + beforeEach( () => { matcher = new EmailMatcher( { tagBuilder : new AnchorTagBuilder() } ); } ); - describe( 'parseMatches()', function() { + describe( 'parseMatches()', () => { - it( 'should return an empty array if there are no matches for email addresses', function() { + it( 
'should return an empty array if there are no matches for email addresses', () => { expect( matcher.parseMatches( '' ) ).toEqual( [] ); expect( matcher.parseMatches( 'asdf' ) ).toEqual( [] ); expect( matcher.parseMatches( '@asdf' ) ).toEqual( [] ); @@ -22,7 +22,7 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'should return an array of a single email address match when the string is the email address itself', function() { + it( 'should return an array of a single email address match when the string is the email address itself', () => { let matches = matcher.parseMatches( 'asdf@asdf.com' ); expect( matches.length ).toBe( 1 ); @@ -30,7 +30,15 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'should return an array of a single email address match when the email address is in the middle of the string', function() { + it( 'should return an array of a single email address match when the email address is at the start of the string', () => { + let matches = matcher.parseMatches( 'asdf@asdf.com is my good friend' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 0 ); + } ); + + + it( 'should return an array of a single email address match when the email address is in the middle of the string', () => { let matches = matcher.parseMatches( 'Hello asdf@asdf.com my good friend' ); expect( matches.length ).toBe( 1 ); @@ -38,7 +46,7 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'should return an array of a single email address match when the email address is at the end of the string', function() { + it( 'should return an array of a single email address match when the email address is at the end of the string', () => { let matches = matcher.parseMatches( 'Hello asdf@asdf.com' ); expect( matches.length ).toBe( 1 ); @@ -46,7 +54,23 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'should return an array of multiple email addresses when there are more than one 
within the string', function() { + it( 'should return a single email address match when an email address has two dot characters following it', () => { + let matches = matcher.parseMatches( 'asdf@asdf.com..' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 0 ); + } ); + + + it( 'should return a single email address match when an email address has three dot characters following it', () => { + let matches = matcher.parseMatches( 'asdf@asdf.com...' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 0 ); + } ); + + + it( 'should return an array of multiple email addresses when there are more than one within the string', () => { let matches = matcher.parseMatches( 'Talk to asdf@asdf.com or fdsa@fdsa.com' ); expect( matches.length ).toBe( 2 ); @@ -55,7 +79,7 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'a match within parenthesis should be parsed correctly', function() { + it( 'a match within parenthesis should be parsed correctly', () => { let matches = matcher.parseMatches( 'Hello (asdf@asdf.com)' ); expect( matches.length ).toBe( 1 ); @@ -63,7 +87,15 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'a match with underscores should be parsed correctly', function() { + it( 'should match correctly when the email address is uppercase', () => { + let matches = matcher.parseMatches( 'Hello ASDF@ASDF.COM' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'ASDF@ASDF.COM', 6 ); + } ); + + + it( 'a match with underscores should be parsed correctly', () => { let matches = matcher.parseMatches( 'Hello asdf_fdsa_asdf@asdf.com' ); expect( matches.length ).toBe( 1 ); @@ -71,45 +103,139 @@ describe( "Autolinker.matcher.Email", function() { } ); - it( 'a match with an \' should be parsed correctly', function() { + it( 'a match with an \' should be parsed correctly', () => { let matches = 
matcher.parseMatches( 'o\'donnel@asdf.com' ); expect( matches.length ).toBe( 1 ); MatchChecker.expectEmailMatch( matches[ 0 ], 'o\'donnel@asdf.com', 0 ); } ); - it( 'should *not* match email with incorrect domain beginning with "-"', function() { + + it( `when a dot exists in front of the email address, the email address + should be parsed without the dot`, + () => { + let matches = matcher.parseMatches( 'Hello .asdf@asdf.com' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 7 ); + } ); + + + it( `when a dot exists at the end of the email address, the dot should + not be included`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@asdf.com.' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 6 ); + } ); + + + it( `when a dot exists at the end of the sentence ended by an email + address, the dot should not be included`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@asdf.com. How are you?' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 6 ); + } ); + + + it( `when a hypen exists at the end of the email address, the hypen + should not be included`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@asdf.com- how are you?' 
); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 6 ); + } ); + + + it( `when two hypens exist in the domain portion of the email address, + but not next to each other, it should be considered a match`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@as-df-gh.com' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@as-df-gh.com', 6 ); + } ); + + + it( `when a domain has 3 (or more) parts, should be considered a match`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@asdf.fdsa.com' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.fdsa.com', 6 ); + } ); + + + it( `when two consecutive dots exist in the domain portion of + the email address, it should not be considered a match`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@as..df.com' ); + + expect( matches.length ).toBe( 0 ); + } ); + + + it( `when two consecutive hypens exist in the domain portion of + the email address, it should not be considered a match`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@as--df.com' ); + + expect( matches.length ).toBe( 0 ); + } ); + + + it( `when two hypens exist in the domain portion of the email address + which already has a valid domain name, the part before the two + hypens should should not be considered a match`, + () => { + let matches = matcher.parseMatches( 'Hello asdf@asdf.com--somethingelse.com' ); + + expect( matches.length ).toBe( 1 ); + MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 6 ); + } ); + + + it( 'should *not* match email with incorrect domain beginning with "-"', () => { var matches = matcher.parseMatches( 'asdf@-asdf.com' ); expect( matches.length ).toBe( 0 ); } ); - it( 'should *not* match email with incorrect domain ending with "-"', function() { + + it( 'should *not* match email with incorrect domain ending with "-"', () => { var matches = 
matcher.parseMatches( 'asdf@asdf-.com' ); expect( matches.length ).toBe( 0 ); } ); - it( 'should *not* match email with incorrect domain beginning with "."', function() { + + it( 'should *not* match email with incorrect domain beginning with "."', () => { var matches = matcher.parseMatches( 'asdf@.asdf.com' ); expect( matches.length ).toBe( 0 ); } ); - it( 'should *not* match email with incorrect local part beginning with "."', function() { + + it( 'should *not* match email with incorrect local part beginning with "."', () => { var matches = matcher.parseMatches( '.asdf@asdf.com' ); expect( matches.length ).toBe( 1 ); MatchChecker.expectEmailMatch( matches[ 0 ], 'asdf@asdf.com', 1 ); } ); - it( 'should *not* match email with incorrect local part ending with "."', function() { + + it( 'should *not* match email with incorrect local part ending with "."', () => { var matches = matcher.parseMatches( 'asdf.@asdf.com' ); expect( matches.length ).toBe( 0 ); } ); + - it( 'should match email skipping incorrect local part tailing with ".."', function() { + it( 'should match email skipping incorrect local part tailing with ".."', () => { var matches = matcher.parseMatches( 'asdf..asdf@asdf.com' ); expect( matches.length ).toBe( 1 );