Merge pull request #232 from caseywebdev/url-perf

gregjacobs · web-flow · commit fb31497534a5 · 2018-02-08T23:00:03.000-05:00
Improve regex performance for URL parsing
diff --git a/src/RegexLib.js b/src/RegexLib.js
@@ -49,13 +49,19 @@ Autolinker.RegexLib = (function() {
 	var alphaNumericCharsStr = alphaCharsStr + decimalNumbersStr;
 
 	// Simplified IP regular expression
-	var ipRegex = new RegExp( '(?:[' + decimalNumbersStr + ']{1,3}\\.){3}[' + decimalNumbersStr + ']{1,3}' );
+	var ipStr = '(?:[' + decimalNumbersStr + ']{1,3}\\.){3}[' + decimalNumbersStr + ']{1,3}';
 
 	// Protected domain label which do not allow "-" character on the beginning and the end of a single label
-	var domainLabelStr = '[' + alphaNumericCharsStr + '](?:[' + alphaNumericCharsStr + '\\-]*[' + alphaNumericCharsStr + '])?';
+	var domainLabelStr = '[' + alphaNumericCharsStr + '](?:[' + alphaNumericCharsStr + '\\-]{0,61}[' + alphaNumericCharsStr + '])?';
+
+	var getDomainLabelStr = function(group) {  // returns a regex source fragment matching one domain label; `group` is written after a backslash below, so it must be the index that this fragment's capturing group receives in the final assembled regex
+		return '(?=(' + domainLabelStr + '))\\' + group;  // captures the label inside a lookahead, then consumes it through the backreference `\<group>` (the lookahead + backreference idiom for emulating an atomic group)
+	};
 
 	// See documentation below
-	var domainNameRegex = new RegExp( '(?:(?:(?:' + domainLabelStr + '\\.)*(?:' + domainLabelStr + '))|(?:' + ipRegex.source + '))' );
+	var getDomainNameStr = function(group) {  // returns the regex *source string* (not a RegExp) for a domain name or a simplified IP address; `group` is the index of the first capturing group this fragment contributes
+		return '(?:' + getDomainLabelStr(group) + '(?:\\.' + getDomainLabelStr(group + 1) + '){0,126}|' + ipStr + ')';  // one label, then up to 126 more dot-prefixed labels, or `ipStr`; contributes two capturing groups (`group` and `group + 1`), and `group` must be a number because a string would be concatenated by `group + 1`
+	};
 
 	return {
 
@@ -89,7 +95,7 @@ Autolinker.RegexLib = (function() {
 		 *
 		 * @property {RegExp} domainNameRegex
 		 */
-		domainNameRegex : domainNameRegex,
+		getDomainNameStr : getDomainNameStr,
 
 	};
 
diff --git a/src/matcher/Email.js b/src/matcher/Email.js
@@ -24,12 +24,12 @@ Autolinker.matcher.Email = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
 			validCharacters = alphaNumericChars + specialCharacters,
 			validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
 		    emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@'),
-			domainNameRegex = Autolinker.RegexLib.domainNameRegex,
+			getDomainNameStr = Autolinker.RegexLib.getDomainNameStr,
 			tldRegex = Autolinker.tldRegex;  // match our known top level domains (TLDs)
 
 		return new RegExp( [
 			emailRegex.source,
-			domainNameRegex.source,
+			getDomainNameStr(1),
 			'\\.', tldRegex.source   // '.com', '.net', etc
 		].join( "" ), 'gi' );
 	} )(),
diff --git a/src/matcher/Url.js b/src/matcher/Url.js
@@ -61,9 +61,9 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
 	 *     See #3 for more info.
 	 */
 	matcherRegex : (function() {
-		var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]*:(?![A-Za-z][-.+A-Za-z0-9]*:\/\/)(?!\d+\/?)(?:\/\/)?)/,  // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
+		var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/,  // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
 		    wwwRegex = /(?:www\.)/,                  // starting with 'www.'
-		    domainNameRegex = Autolinker.RegexLib.domainNameRegex,
+		    getDomainNameStr = Autolinker.RegexLib.getDomainNameStr,
 		    tldRegex = Autolinker.tldRegex,  // match our known top level domains (TLDs)
 		    alphaNumericCharsStr = Autolinker.RegexLib.alphaNumericCharsStr,
 
@@ -75,22 +75,22 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
 			'(?:', // parens to cover match for scheme (optional), and domain
 				'(',  // *** Capturing group $1, for a scheme-prefixed url (ex: http://google.com)
 					schemeRegex.source,
-					domainNameRegex.source,
+					getDomainNameStr(2),
 				')',
 
 				'|',
 
-				'(',  // *** Capturing group $2, for a 'www.' prefixed url (ex: www.google.com)
-					'(//)?',  // *** Capturing group $3 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
+				'(',  // *** Capturing group $4 for a 'www.' prefixed url (ex: www.google.com)
+					'(//)?',  // *** Capturing group $5 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
 					wwwRegex.source,
-					domainNameRegex.source,
+					getDomainNameStr(6),
 				')',
 
 				'|',
 
-				'(',  // *** Capturing group $4, for known a TLD url (ex: google.com)
-					'(//)?',  // *** Capturing group $5 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
-					domainNameRegex.source + '\\.',
+				'(',  // *** Capturing group $8, for a known TLD url (ex: google.com)
+					'(//)?',  // *** Capturing group $9 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
+					getDomainNameStr(10) + '\\.',
 					tldRegex.source,
 					'(?![-' + alphaNumericCharsStr + '])', // TLD not followed by a letter, behaves like unicode-aware \b
 				')',
@@ -179,10 +179,10 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
 		while( ( match = matcherRegex.exec( text ) ) !== null ) {
 			var matchStr = match[ 0 ],
 			    schemeUrlMatch = match[ 1 ],
-			    wwwUrlMatch = match[ 2 ],
-			    wwwProtocolRelativeMatch = match[ 3 ],
-			    //tldUrlMatch = match[ 4 ],  -- not needed at the moment
-			    tldProtocolRelativeMatch = match[ 5 ],
+			    wwwUrlMatch = match[ 4 ],
+			    wwwProtocolRelativeMatch = match[ 5 ],
+			    //tldUrlMatch = match[ 8 ],  -- not needed at the moment
+			    tldProtocolRelativeMatch = match[ 9 ],
 			    offset = match.index,
 			    protocolRelativeMatch = wwwProtocolRelativeMatch || tldProtocolRelativeMatch,
 				prevChar = text.charAt( offset - 1 );
diff --git a/tests/index.html b/tests/index.html
@@ -30,6 +30,7 @@
 	<script src="../src/match/Mention.js"></script>
 	<script src="../src/match/Url.js"></script>
 	<script src="../src/matcher/Matcher.js"></script>
+	<script src="../src/matcher/TldRegex.js"></script>
 	<script src="../src/matcher/Email.js"></script>
 	<script src="../src/matcher/Hashtag.js"></script>
 	<script src="../src/matcher/Phone.js"></script>
diff --git a/tests/matcher/UrlSpec.js b/tests/matcher/UrlSpec.js
@@ -7,7 +7,8 @@ describe( "Autolinker.matcher.Url", function() {
 		matcher = new Autolinker.matcher.Url( {
 			tagBuilder  : new Autolinker.AnchorTagBuilder(),
 			stripPrefix : false,
-			stripTrailingSlash : false
+			stripTrailingSlash : false,
+			decodePercentEncoding: false
 		} );
 	} );
 
@@ -161,6 +162,12 @@ describe( "Autolinker.matcher.Url", function() {
 				expect( matches.length ).toBe( 0 );
 			} );
 
+			it( 'should parse long contiguous characters with no spaces in a timely manner', function() {  // performance guard: parsing one long run of characters must finish within the time bound asserted below
+				const start = Date.now();  // NOTE(review): `const` is ES2015 while the sources in this change use `var` — confirm the test runner supports it
+				matcher.parseMatches( new Array(10000).join('a') );  // joining 10000 empty slots yields a string of 9,999 'a' characters with no whitespace
+				expect( Date.now() - start ).toBeLessThan( 100 );  // elapsed wall-clock milliseconds; NOTE(review): a fixed 100 ms bound may be sensitive to slow or loaded machines
+			} );
+
 		} );
 
 	} );