Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear time email matcher #260

Merged
merged 3 commits into from
Jan 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gulpfile.js
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ function buildSrcMinifyUmdTask() {
async function buildSrcCheckMinifiedSizeTask() {
const stats = await fs.stat( './dist/Autolinker.min.js' );
const sizeInKb = stats.size / 1000;
const maxExpectedSizeInKb = 44;
const maxExpectedSizeInKb = 46;

if( sizeInKb > maxExpectedSizeInKb ) {
throw new Error( `
Expand Down
19 changes: 4 additions & 15 deletions src/htmlParser/parse-html.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { State } from './state';
import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
import { throwUnhandledCaseError } from '../utils';

// For debugging: search for other "For debugging" lines
// import CliTable from 'cli-table';
Expand Down Expand Up @@ -61,12 +63,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
onComment: ( offset: number ) => void;
onDoctype: ( offset: number ) => void;
} ) {
const letterRe = /[A-Za-z]/,
digitRe = /[0-9]/,
whitespaceRe = /\s/,
quoteRe = /['"]/,
controlCharsRe = /[\x00-\x1F\x7F]/, // control chars (0-31), and the backspace char (127)
noCurrentTag = new CurrentTag();
const noCurrentTag = new CurrentTag();

let charIdx = 0,
len = html.length,
Expand Down Expand Up @@ -112,7 +109,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
case State.Doctype: stateDoctype( char ); break;

default:
throwUnhandledStateError( state );
throwUnhandledCaseError( state );
}

// For debugging: search for other "For debugging" lines
Expand All @@ -131,14 +128,6 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
//console.log( '\n' + table.toString() );


/**
 * Exhaustiveness guard for the state `switch`. The parameter is typed as
 * `never`, so the compiler rejects the call if any enum value is left
 * unhandled; at runtime, reaching this function always throws.
 */
function throwUnhandledStateError( state: never ) {
    const message = 'Unhandled State';

    throw new Error( message );
}


// Called when non-tags are being read (i.e. the text around HTML tags)
// https://www.w3.org/TR/html51/syntax.html#data-state
Expand Down
265 changes: 232 additions & 33 deletions src/matcher/email-matcher.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import { Matcher } from "./matcher";
import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib";
import { tldRegex } from "./tld-regex";
import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib";
import { EmailMatch } from "../match/email-match";
import { Match } from "../match/match";
import { throwUnhandledCaseError } from '../utils';

// For debugging: search for other "For debugging" lines
// import CliTable from 'cli-table';

/**
* @class Autolinker.matcher.Email
Expand All @@ -15,49 +18,245 @@ import { Match } from "../match/match";
export class EmailMatcher extends Matcher {

/**
* The regular expression to match email addresses. Example match:
*
* person@place.com
*
* @protected
* @property {RegExp} matcherRegex
* Valid characters that can be used in the "local" part of an email address,
* i.e. the "name" part of "name@site.com"
*/
protected matcherRegex = (function() {
var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~',
restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]',
validCharacters = alphaNumericAndMarksCharsStr + specialCharacters,
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@');

return new RegExp( [
emailRegex.source,
getDomainNameStr( 1 ),
'\\.', tldRegex.source // '.com', '.net', etc
].join( "" ), 'gi' );
} )();
protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` );


/**
* @inheritdoc
*/
parseMatches( text: string ) {
let matcherRegex = this.matcherRegex,
tagBuilder = this.tagBuilder,
matches: Match[] = [],
match: RegExpExecArray | null;
const tagBuilder = this.tagBuilder,
localPartCharRegex = this.localPartCharRegex,
matches: Match[] = [],
len = text.length,
noCurrentEmailAddress = new CurrentEmailAddress();

let charIdx = 0,
state = State.NonEmailAddress as State,
currentEmailAddress = noCurrentEmailAddress;

// For debugging: search for other "For debugging" lines
// const table = new CliTable( {
// head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ]
// } );

while( ( match = matcherRegex.exec( text ) ) !== null ) {
let matchedText = match[ 0 ];
while( charIdx < len ) {
const char = text.charAt( charIdx );

matches.push( new EmailMatch( {
tagBuilder : tagBuilder,
matchedText : matchedText,
offset : match.index,
email : matchedText
} ) );
// For debugging: search for other "For debugging" lines
// table.push(
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
// );

switch( state ) {
case State.NonEmailAddress: stateNonEmailAddress( char ); break;
case State.LocalPart: stateLocalPart( char ); break;
case State.LocalPartDot: stateLocalPartDot( char ); break;
case State.AtSign: stateAtSign( char ); break;
case State.DomainChar: stateDomainChar( char ); break;
case State.DomainHyphen: stateDomainHyphen( char ); break;
case State.DomainDot: stateDomainDot( char ); break;

default:
throwUnhandledCaseError( state );
}

// For debugging: search for other "For debugging" lines
// table.push(
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
// );

charIdx++;
}

// Capture any valid match at the end of the string
captureMatchIfValidAndReset();

// For debugging: search for other "For debugging" lines
//console.log( '\n' + table.toString() );

return matches;


// State handler: the scanner is currently outside of any email address.
// A character that is valid in the "local part" starts a new candidate
// address; any other character is simply skipped.
function stateNonEmailAddress( char: string ) {
    const startsLocalPart = localPartCharRegex.test( char );

    if( startsLocalPart ) {
        beginEmailAddress();
    }
}


// State handler: we are inside the "local part" of a candidate email
// address (the "name" in "name@site.com").
//   '.'              -> LocalPartDot
//   '@'              -> AtSign
//   local part char  -> stay in LocalPart
//   anything else    -> abandon the candidate
function stateLocalPart( char: string ) {
    switch( char ) {
        case '.':
            state = State.LocalPartDot;
            return;

        case '@':
            state = State.AtSign;
            return;
    }

    if( !localPartCharRegex.test( char ) ) {
        // not an email address character, return to "NonEmailAddress" state
        resetToNonEmailAddressState();
    }
}


// State handler: a '.' was just read inside the local part. The dot must
// be followed by another local part character to remain a candidate. A
// second '.' in a row, an '@' directly after the dot, or any other
// character means this is not an email address.
function stateLocalPartDot( char: string ) {
    const isDotOrAtSign = ( char === '.' || char === '@' );

    if( !isDotOrAtSign && localPartCharRegex.test( char ) ) {
        state = State.LocalPart;
    } else {
        resetToNonEmailAddressState();
    }
}


// State handler: the '@' was just read. The next character must be a
// domain name character, otherwise the candidate is abandoned.
function stateAtSign( char: string ) {
    if( !domainNameCharRegex.test( char ) ) {
        // Anything else, not an email address
        resetToNonEmailAddressState();
        return;
    }

    state = State.DomainChar;
}

// State handler: the previous character was a regular domain name
// character.
//   '.'          -> DomainDot
//   '-'          -> DomainHyphen
//   domain char  -> stay in DomainChar
//   other        -> the address has ended; capture it if it is valid
function stateDomainChar( char: string ) {
    switch( char ) {
        case '.':
            state = State.DomainDot;
            return;

        case '-':
            state = State.DomainHyphen;
            return;
    }

    if( !domainNameCharRegex.test( char ) ) {
        captureMatchIfValidAndReset();
    }
}

// State handler: a '-' was just read in the domain part. Only a regular
// domain name character may follow it. Two hyphens ("--"), hyphen+dot
// ("-."), or any other character ends the address, which is captured if
// it is valid.
function stateDomainHyphen( char: string ) {
    const isHyphenOrDot = ( char === '-' || char === '.' );

    if( !isHyphenOrDot && domainNameCharRegex.test( char ) ) {
        state = State.DomainChar;
    } else {
        captureMatchIfValidAndReset();
    }
}

// State handler: a '.' was just read in the domain part. Two dots ("..")
// or dot+hyphen (".-") are not valid, and neither is any non-domain
// character; in those cases the address has ended and is captured if it
// is valid.
function stateDomainDot( char: string ) {
    const isDotOrHyphen = ( char === '.' || char === '-' );

    if( isDotOrHyphen || !domainNameCharRegex.test( char ) ) {
        captureMatchIfValidAndReset();
        return;
    }

    // A '.' followed by a valid domain character means the domain part is
    // now valid, so at least a partial EmailMatch has been found (more
    // characters of the address may still follow). Record that on a fresh
    // CurrentEmailAddress, since its fields are readonly.
    const withDomainDot = { ...currentEmailAddress, hasDomainDot: true };

    state = State.DomainChar;
    currentEmailAddress = new CurrentEmailAddress( withDomainDot );
}


// Starts tracking a new candidate email address whose first character is
// at the current `charIdx`, and moves the scanner into the LocalPart state.
function beginEmailAddress() {
    currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } );
    state = State.LocalPart;
}

// Drops the current candidate (restoring the shared "no current email
// address" placeholder) and returns the scanner to the NonEmailAddress state.
function resetToNonEmailAddressState() {
    currentEmailAddress = noCurrentEmailAddress;
    state = State.NonEmailAddress;
}


/*
 * Pushes the current candidate onto `matches` as an EmailMatch when it is
 * valid, then resets the scanner so another email address can be read.
 *
 * A candidate is valid only once its domain part contains a '.' followed
 * by a domain character (tracked by `hasDomainDot`).
 */
function captureMatchIfValidAndReset() {
    const { idx: startIdx, hasDomainDot } = currentEmailAddress;

    if( hasDomainDot ) {
        const scannedText = text.slice( startIdx, charIdx );

        // A trailing '.' or '-' is a valid domain name character, but it is
        // only part of the email address when something follows it. If the
        // address ended on one, strip it off.
        const emailAddress = /[-.]$/.test( scannedText )
            ? scannedText.slice( 0, -1 )
            : scannedText;

        const emailMatch = new EmailMatch( {
            tagBuilder  : tagBuilder,
            matchedText : emailAddress,
            offset      : startIdx,
            email       : emailAddress
        } );
        matches.push( emailMatch );
    }

    resetToNonEmailAddressState();
}
}

}


/**
 * States of the email address scanner. Values are consecutive integers
 * starting at 0.
 */
const enum State {
    NonEmailAddress = 0,  // not currently inside an email address
    LocalPart       = 1,  // inside the part before the '@'
    LocalPartDot    = 2,  // just read a '.' inside the local part
    AtSign          = 3,  // just read the '@'
    DomainChar      = 4,  // just read a regular domain name character
    DomainHyphen    = 5,  // just read a '-' inside the domain part
    DomainDot       = 6   // just read a '.' inside the domain part
}


/**
 * Immutable record describing the email address currently being scanned.
 * Omitted config fields fall back to `idx = -1` and `hasDomainDot = false`.
 */
class CurrentEmailAddress {
    /** Index of the first character of the email address. */
    readonly idx: number;

    /** Whether the `hasDomainDot` flag was supplied as truthy in the config. */
    readonly hasDomainDot: boolean;

    constructor( cfg: Partial<CurrentEmailAddress> = {} ) {
        const { idx = -1, hasDomainDot = false } = cfg;

        this.idx = idx;
        this.hasDomainDot = Boolean( hasDomainDot );
    }
}
33 changes: 33 additions & 0 deletions src/regex-lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@
* regular expressions that are shared between source files.
*/

/**
 * Regular expression to match a single upper or lowercase ASCII letter
 * (A-Z, a-z). It has no flags, so repeated `.test()` calls carry no
 * `lastIndex` state between them.
 */
export const letterRe = /[A-Za-z]/;

/**
 * Regular expression to match a single ASCII digit (0-9)
 */
export const digitRe = /[0-9]/;

/**
 * Regular expression to match a single whitespace character (the `\s`
 * character class)
 */
export const whitespaceRe = /\s/;

/**
 * Regular expression to match a single quote character: either a single
 * quote (') or a double quote (")
 */
export const quoteRe = /['"]/;

/**
 * Regular expression to match the range of ASCII control characters (0-31),
 * and the delete (DEL) char (127). Note: the backspace char is code 8, which
 * is already covered by the 0-31 range.
 */
export const controlCharsRe = /[\x00-\x1F\x7F]/;

/**
* The string form of a regular expression that would match all of the
* alphabetic ("letter") chars in the unicode character set when placed in a
Expand Down Expand Up @@ -142,3 +168,10 @@ export const getDomainNameStr = ( group: number ) => {
* Ex: 'google', 'yahoo', 'some-other-company', etc.
*/
export const domainNameRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + '.\\-]*[' + alphaNumericAndMarksCharsStr + '\\-]' );


/**
 * A regular expression consisting of a single character class: the
 * characters that may be used in a domain name, excluding the '-' and '.'
 * separator characters. It is built without flags, so it can be reused
 * with `.test()` on one character at a time.
 */
export const domainNameCharRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + ']' );
Loading