Skip to content

Commit fb31497

Browse files
authored
Merge pull request #232 from caseywebdev/url-perf
Improve regex performance for URL parsing
2 parents c49a03b + 55bf8e3 commit fb31497

File tree

5 files changed

+34
-20
lines changed

5 files changed

+34
-20
lines changed

src/RegexLib.js

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,19 @@ Autolinker.RegexLib = (function() {
4949
var alphaNumericCharsStr = alphaCharsStr + decimalNumbersStr;
5050

5151
// Simplified IP regular expression
52-
var ipRegex = new RegExp( '(?:[' + decimalNumbersStr + ']{1,3}\\.){3}[' + decimalNumbersStr + ']{1,3}' );
52+
var ipStr = '(?:[' + decimalNumbersStr + ']{1,3}\\.){3}[' + decimalNumbersStr + ']{1,3}';
5353

5454
// Protected domain label, which does not allow the "-" character at the beginning or the end of a single label
55-
var domainLabelStr = '[' + alphaNumericCharsStr + '](?:[' + alphaNumericCharsStr + '\\-]*[' + alphaNumericCharsStr + '])?';
55+
var domainLabelStr = '[' + alphaNumericCharsStr + '](?:[' + alphaNumericCharsStr + '\\-]{0,61}[' + alphaNumericCharsStr + '])?';
56+
57+
var getDomainLabelStr = function(group) {
58+
return '(?=(' + domainLabelStr + '))\\' + group;
59+
};
5660

5761
// See documentation below
58-
var domainNameRegex = new RegExp( '(?:(?:(?:' + domainLabelStr + '\\.)*(?:' + domainLabelStr + '))|(?:' + ipRegex.source + '))' );
62+
var getDomainNameStr = function(group) {
63+
return '(?:' + getDomainLabelStr(group) + '(?:\\.' + getDomainLabelStr(group + 1) + '){0,126}|' + ipStr + ')';
64+
};
5965

6066
return {
6167

@@ -89,7 +95,7 @@ Autolinker.RegexLib = (function() {
8995
*
9096
* @property {RegExp} domainNameRegex
9197
*/
92-
domainNameRegex : domainNameRegex,
98+
getDomainNameStr : getDomainNameStr,
9399

94100
};
95101

src/matcher/Email.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@ Autolinker.matcher.Email = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
2424
validCharacters = alphaNumericChars + specialCharacters,
2525
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
2626
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@'),
27-
domainNameRegex = Autolinker.RegexLib.domainNameRegex,
27+
getDomainNameStr = Autolinker.RegexLib.getDomainNameStr,
2828
tldRegex = Autolinker.tldRegex; // match our known top level domains (TLDs)
2929

3030
return new RegExp( [
3131
emailRegex.source,
32-
domainNameRegex.source,
32+
getDomainNameStr(1),
3333
'\\.', tldRegex.source // '.com', '.net', etc
3434
].join( "" ), 'gi' );
3535
} )(),

src/matcher/Url.js

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
6161
* See #3 for more info.
6262
*/
6363
matcherRegex : (function() {
64-
var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]*:(?![A-Za-z][-.+A-Za-z0-9]*:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
64+
var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
6565
wwwRegex = /(?:www\.)/, // starting with 'www.'
66-
domainNameRegex = Autolinker.RegexLib.domainNameRegex,
66+
getDomainNameStr = Autolinker.RegexLib.getDomainNameStr,
6767
tldRegex = Autolinker.tldRegex, // match our known top level domains (TLDs)
6868
alphaNumericCharsStr = Autolinker.RegexLib.alphaNumericCharsStr,
6969

@@ -75,22 +75,22 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
7575
'(?:', // parens to cover match for scheme (optional), and domain
7676
'(', // *** Capturing group $1, for a scheme-prefixed url (ex: http://google.com)
7777
schemeRegex.source,
78-
domainNameRegex.source,
78+
getDomainNameStr(2),
7979
')',
8080

8181
'|',
8282

83-
'(', // *** Capturing group $2, for a 'www.' prefixed url (ex: www.google.com)
84-
'(//)?', // *** Capturing group $3 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
83+
'(', // *** Capturing group $4 for a 'www.' prefixed url (ex: www.google.com)
84+
'(//)?', // *** Capturing group $5 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
8585
wwwRegex.source,
86-
domainNameRegex.source,
86+
getDomainNameStr(6),
8787
')',
8888

8989
'|',
9090

91-
'(', // *** Capturing group $4, for known a TLD url (ex: google.com)
92-
'(//)?', // *** Capturing group $5 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
93-
domainNameRegex.source + '\\.',
91+
'(', // *** Capturing group $8, for a known TLD url (ex: google.com)
92+
'(//)?', // *** Capturing group $9 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character (handled later)
93+
getDomainNameStr(10) + '\\.',
9494
tldRegex.source,
9595
'(?![-' + alphaNumericCharsStr + '])', // TLD not followed by a letter, behaves like unicode-aware \b
9696
')',
@@ -179,10 +179,10 @@ Autolinker.matcher.Url = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
179179
while( ( match = matcherRegex.exec( text ) ) !== null ) {
180180
var matchStr = match[ 0 ],
181181
schemeUrlMatch = match[ 1 ],
182-
wwwUrlMatch = match[ 2 ],
183-
wwwProtocolRelativeMatch = match[ 3 ],
184-
//tldUrlMatch = match[ 4 ], -- not needed at the moment
185-
tldProtocolRelativeMatch = match[ 5 ],
182+
wwwUrlMatch = match[ 4 ],
183+
wwwProtocolRelativeMatch = match[ 5 ],
184+
//tldUrlMatch = match[ 8 ], -- not needed at the moment
185+
tldProtocolRelativeMatch = match[ 9 ],
186186
offset = match.index,
187187
protocolRelativeMatch = wwwProtocolRelativeMatch || tldProtocolRelativeMatch,
188188
prevChar = text.charAt( offset - 1 );

tests/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
<script src="../src/match/Mention.js"></script>
3131
<script src="../src/match/Url.js"></script>
3232
<script src="../src/matcher/Matcher.js"></script>
33+
<script src="../src/matcher/TldRegex.js"></script>
3334
<script src="../src/matcher/Email.js"></script>
3435
<script src="../src/matcher/Hashtag.js"></script>
3536
<script src="../src/matcher/Phone.js"></script>

tests/matcher/UrlSpec.js

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ describe( "Autolinker.matcher.Url", function() {
77
matcher = new Autolinker.matcher.Url( {
88
tagBuilder : new Autolinker.AnchorTagBuilder(),
99
stripPrefix : false,
10-
stripTrailingSlash : false
10+
stripTrailingSlash : false,
11+
decodePercentEncoding: false
1112
} );
1213
} );
1314

@@ -161,6 +162,12 @@ describe( "Autolinker.matcher.Url", function() {
161162
expect( matches.length ).toBe( 0 );
162163
} );
163164

165+
it( 'should parse long contiguous characters with no spaces in a timely manner', function() {
166+
const start = Date.now();
167+
matcher.parseMatches( new Array(10000).join('a') );
168+
expect( Date.now() - start ).toBeLessThan( 100 );
169+
} );
170+
164171
} );
165172

166173
} );

0 commit comments

Comments
 (0)