Skip to content

Commit

Permalink
Merge pull request #260 from gregjacobs/linear-time-email-matcher
Browse files Browse the repository at this point in the history
Linear time email matcher
  • Loading branch information
gregjacobs authored Jan 23, 2019
2 parents fe79604 + 9942278 commit ac52836
Show file tree
Hide file tree
Showing 7 changed files with 430 additions and 66 deletions.
2 changes: 1 addition & 1 deletion gulpfile.js
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ function buildSrcMinifyUmdTask() {
async function buildSrcCheckMinifiedSizeTask() {
const stats = await fs.stat( './dist/Autolinker.min.js' );
const sizeInKb = stats.size / 1000;
const maxExpectedSizeInKb = 44;
const maxExpectedSizeInKb = 46;

if( sizeInKb > maxExpectedSizeInKb ) {
throw new Error( `
Expand Down
19 changes: 4 additions & 15 deletions src/htmlParser/parse-html.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { State } from './state';
import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
import { throwUnhandledCaseError } from '../utils';

// For debugging: search for other "For debugging" lines
// import CliTable from 'cli-table';
Expand Down Expand Up @@ -61,12 +63,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
onComment: ( offset: number ) => void;
onDoctype: ( offset: number ) => void;
} ) {
const letterRe = /[A-Za-z]/,
digitRe = /[0-9]/,
whitespaceRe = /\s/,
quoteRe = /['"]/,
controlCharsRe = /[\x00-\x1F\x7F]/, // control chars (0-31), and the backspace char (127)
noCurrentTag = new CurrentTag();
const noCurrentTag = new CurrentTag();

let charIdx = 0,
len = html.length,
Expand Down Expand Up @@ -112,7 +109,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
case State.Doctype: stateDoctype( char ); break;

default:
throwUnhandledStateError( state );
throwUnhandledCaseError( state );
}

// For debugging: search for other "For debugging" lines
Expand All @@ -131,14 +128,6 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
//console.log( '\n' + table.toString() );


/**
 * Exhaustiveness guard: a function that should never actually be called.
 *
 * Because the parameter is typed `never`, the compiler rejects any call site
 * where the argument could still hold an enum value, i.e. where some enum
 * member has not been handled before reaching this call.
 *
 * The return type is spelled out as `never` because a function declaration
 * without an annotation is inferred to return `void`, which would hide from
 * callers the fact that this call never completes normally.
 *
 * @param state The value that was not handled; only used for type checking.
 * @throws {Error} Always throws an 'Unhandled State' error.
 */
function throwUnhandledStateError( state: never ): never {
    throw new Error( 'Unhandled State' );
}


// Called when non-tags are being read (i.e. the text around HTML tags)
// https://www.w3.org/TR/html51/syntax.html#data-state
Expand Down
265 changes: 232 additions & 33 deletions src/matcher/email-matcher.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import { Matcher } from "./matcher";
import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib";
import { tldRegex } from "./tld-regex";
import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib";
import { EmailMatch } from "../match/email-match";
import { Match } from "../match/match";
import { throwUnhandledCaseError } from '../utils';

// For debugging: search for other "For debugging" lines
// import CliTable from 'cli-table';

/**
* @class Autolinker.matcher.Email
Expand All @@ -15,49 +18,245 @@ import { Match } from "../match/match";
export class EmailMatcher extends Matcher {

/**
* The regular expression to match email addresses. Example match:
*
* person@place.com
*
* @protected
* @property {RegExp} matcherRegex
* Valid characters that can be used in the "local" part of an email address,
* i.e. the "name" part of "name@site.com"
*/
protected matcherRegex = (function() {
var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~',
restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]',
validCharacters = alphaNumericAndMarksCharsStr + specialCharacters,
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@');

return new RegExp( [
emailRegex.source,
getDomainNameStr( 1 ),
'\\.', tldRegex.source // '.com', '.net', etc
].join( "" ), 'gi' );
} )();
protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` );


/**
* @inheritdoc
*/
parseMatches( text: string ) {
let matcherRegex = this.matcherRegex,
tagBuilder = this.tagBuilder,
matches: Match[] = [],
match: RegExpExecArray | null;
const tagBuilder = this.tagBuilder,
localPartCharRegex = this.localPartCharRegex,
matches: Match[] = [],
len = text.length,
noCurrentEmailAddress = new CurrentEmailAddress();

let charIdx = 0,
state = State.NonEmailAddress as State,
currentEmailAddress = noCurrentEmailAddress;

// For debugging: search for other "For debugging" lines
// const table = new CliTable( {
// head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ]
// } );

while( ( match = matcherRegex.exec( text ) ) !== null ) {
let matchedText = match[ 0 ];
while( charIdx < len ) {
const char = text.charAt( charIdx );

matches.push( new EmailMatch( {
tagBuilder : tagBuilder,
matchedText : matchedText,
offset : match.index,
email : matchedText
} ) );
// For debugging: search for other "For debugging" lines
// table.push(
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
// );

switch( state ) {
case State.NonEmailAddress: stateNonEmailAddress( char ); break;
case State.LocalPart: stateLocalPart( char ); break;
case State.LocalPartDot: stateLocalPartDot( char ); break;
case State.AtSign: stateAtSign( char ); break;
case State.DomainChar: stateDomainChar( char ); break;
case State.DomainHyphen: stateDomainHyphen( char ); break;
case State.DomainDot: stateDomainDot( char ); break;

default:
throwUnhandledCaseError( state );
}

// For debugging: search for other "For debugging" lines
// table.push(
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
// );

charIdx++;
}

// Capture any valid match at the end of the string
captureMatchIfValidAndReset();

// For debugging: search for other "For debugging" lines
//console.log( '\n' + table.toString() );

return matches;


// State handler used while the scanner is outside of any email address.
// A character that is valid in the "local part" starts a new candidate
// address; every other character is simply skipped.
function stateNonEmailAddress( char: string ) {
    const startsLocalPart = localPartCharRegex.test( char );

    if( !startsLocalPart ) {
        return;  // not an email address character, keep scanning
    }

    beginEmailAddress();
}


// State handler used while reading the "local part" of an email address
// (the part before the '@', as opposed to the "domain part").
function stateLocalPart( char: string ) {
    switch( char ) {
        case '.':
            state = State.LocalPartDot;
            return;

        case '@':
            state = State.AtSign;
            return;
    }

    // Any other valid local part character keeps us in this state; anything
    // else means this was not an email address after all.
    if( !localPartCharRegex.test( char ) ) {
        resetToNonEmailAddressState();
    }
}


// State handler used right after a '.' has been read inside the local part
// of an email address. Only a regular local part character may follow:
//   - a second '.' in a row is not a valid local part
//   - an '@' immediately after the '.' is not an email address
//   - anything else is not an email address either
function stateLocalPartDot( char: string ) {
    const isDot = char === '.';
    const isAtSign = char === '@';

    if( !isDot && !isAtSign && localPartCharRegex.test( char ) ) {
        state = State.LocalPart;
    } else {
        resetToNonEmailAddressState();
    }
}


// State handler used right after the '@' character has been read. The next
// character must be a domain name character, otherwise the candidate is
// discarded.
function stateAtSign( char: string ) {
    if( !domainNameCharRegex.test( char ) ) {
        // Anything else, not an email address
        resetToNonEmailAddressState();
        return;
    }

    state = State.DomainChar;
}

// State handler used while reading regular characters of the domain part.
function stateDomainChar( char: string ) {
    switch( char ) {
        case '.':
            state = State.DomainDot;
            return;

        case '-':
            state = State.DomainHyphen;
            return;
    }

    // Another domain name character keeps us in the DomainChar state. Any
    // other character ends the candidate, which is captured as a match if
    // the criteria have been met.
    if( !domainNameCharRegex.test( char ) ) {
        captureMatchIfValidAndReset();
    }
}

// State handler used right after a '-' has been read in the domain part.
// Two hyphens ("--") or hyphen+dot ("-.") are not valid, so only a regular
// domain name character continues the address; everything else ends it.
function stateDomainHyphen( char: string ) {
    const isSeparator = char === '-' || char === '.';

    if( !isSeparator && domainNameCharRegex.test( char ) ) {
        state = State.DomainChar;
    } else {
        captureMatchIfValidAndReset();
    }
}

// State handler used right after a '.' has been read in the domain part.
function stateDomainDot( char: string ) {
    // Two dots ("..") or dot+hyphen (".-") are not valid, and neither is any
    // character that cannot appear in a domain name: the candidate ends here.
    if( char === '.' || char === '-' || !domainNameCharRegex.test( char ) ) {
        captureMatchIfValidAndReset();
        return;
    }

    state = State.DomainChar;

    // A '.' followed by a valid domain character has now been read, so the
    // domain part is known to be valid and at least a partial EmailMatch has
    // been found (more characters of the address may still follow).
    currentEmailAddress = new CurrentEmailAddress( {
        ...currentEmailAddress,
        hasDomainDot: true
    } );
}


// Starts tracking a new candidate email address whose first character is at
// the current `charIdx`, and switches to the LocalPart state.
function beginEmailAddress() {
    currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } );
    state = State.LocalPart;
}

// Discards the candidate email address (if any) and returns to the
// NonEmailAddress state.
function resetToNonEmailAddressState() {
    currentEmailAddress = noCurrentEmailAddress;
    state = State.NonEmailAddress;
}


/*
 * Records the candidate email address as an EmailMatch when it qualifies,
 * then resets the state so that another email address can be read.
 *
 * A candidate qualifies only when its domain contains at least one dot.
 */
function captureMatchIfValidAndReset() {
    const { idx: startIdx, hasDomainDot } = currentEmailAddress;

    if( hasDomainDot ) {
        const rawText = text.slice( startIdx, charIdx );
        const lastChar = rawText.charAt( rawText.length - 1 );

        // A trailing '.' or '-' is a valid domain name character, but is
        // only part of the email address when followed by something else,
        // so it is stripped off here.
        const emailAddress = ( lastChar === '-' || lastChar === '.' )
            ? rawText.slice( 0, -1 )
            : rawText;

        matches.push( new EmailMatch( {
            tagBuilder  : tagBuilder,
            matchedText : emailAddress,
            offset      : startIdx,
            email       : emailAddress
        } ) );
    }

    resetToNonEmailAddressState();
}
}

}


/**
 * The states of the character-by-character email address scanner.
 */
const enum State {
    /** Not currently inside an email address. */
    NonEmailAddress = 0,
    /** Reading the "local part" (the part before the '@'). */
    LocalPart = 1,
    /** A '.' was just read inside the local part. */
    LocalPartDot = 2,
    /** The '@' character was just read. */
    AtSign = 3,
    /** Reading regular characters of the domain part. */
    DomainChar = 4,
    /** A '-' was just read inside the domain part. */
    DomainHyphen = 5,
    /** A '.' was just read inside the domain part. */
    DomainDot = 6
}


/**
 * Immutable record describing the email address currently being read.
 */
class CurrentEmailAddress {
    /** Index of the first character of the email address; -1 when not given. */
    readonly idx: number;

    /** Whether the `hasDomainDot` flag was supplied as truthy; `false` otherwise. */
    readonly hasDomainDot: boolean;

    /**
     * @param cfg Optional values for the fields; omitted fields fall back to
     *   `idx = -1` and `hasDomainDot = false`.
     */
    constructor( cfg: Partial<CurrentEmailAddress> = {} ) {
        const { idx = -1, hasDomainDot = false } = cfg;

        this.idx = idx;
        this.hasDomainDot = Boolean( hasDomainDot );
    }
}
33 changes: 33 additions & 0 deletions src/regex-lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@
* regular expressions that are shared between source files.
*/

/**
 * Regular expression that matches a single upper- or lowercase ASCII letter
 * (A-Z, a-z).
 */
export const letterRe = /[A-Za-z]/;

/**
 * Regular expression that matches a single ASCII digit (0-9).
 */
export const digitRe = /[0-9]/;

/**
 * Regular expression that matches a single whitespace character (`\s`).
 */
export const whitespaceRe = /\s/;

/**
 * Regular expression that matches a single quote character: either a single
 * quote (') or a double quote (").
 */
export const quoteRe = /['"]/;

/**
 * Regular expression that matches a single ASCII control character: the range
 * 0-31 (0x00-0x1F), plus the delete (DEL) char, 127 (0x7F).
 */
export const controlCharsRe = /[\x00-\x1F\x7F]/;

/**
* The string form of a regular expression that would match all of the
* alphabetic ("letter") chars in the unicode character set when placed in a
Expand Down Expand Up @@ -142,3 +168,10 @@ export const getDomainNameStr = ( group: number ) => {
* Ex: 'google', 'yahoo', 'some-other-company', etc.
*/
export const domainNameRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + '.\\-]*[' + alphaNumericAndMarksCharsStr + '\\-]' );


/**
 * A regular expression that is simply the character class of the characters
 * that may be used in a domain name, excluding the '-' and '.' characters.
 *
 * It has no quantifier and no flags, so it tests a single character at a time.
 */
export const domainNameCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}]` );
Loading

0 comments on commit ac52836

Please sign in to comment.