Skip to content

Commit 6335974

Browse files
committed
Fix for <!DOCTYPE> tag html parsing, which could cause the regex engine to freeze with 100% cpu for certain inputs.
Add fix for proper handling of <A> tags (with capitalized tag name) as well.
1 parent 3bedddc commit 6335974

File tree

6 files changed

+162
-52
lines changed

6 files changed

+162
-52
lines changed

dist/Autolinker.js

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/*!
1818
* Autolinker.js
19-
* 0.14.1
19+
* 0.15.0
2020
*
2121
* Copyright(c) 2014 Gregory Jacobs <[email protected]>
2222
* MIT Licensed. http://www.opensource.org/licenses/mit-license.php
@@ -850,34 +850,53 @@
850850
*
851851
* Capturing groups:
852852
*
853-
* 1. If it is an end tag, this group will have the '/'.
854-
* 2. The tag name.
853+
* 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
854+
* 2. If it is an end tag, this group will have the '/'.
855+
* 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
855856
*/
856857
htmlRegex : (function() {
857-
var tagNameRegex = /[0-9a-zA-Z:]+/,
858+
var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
858859
attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/, // the unicode range accounts for excluding control chars, and the delete char
859860
attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
860861
nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?'; // optional '=[value]'
861862

862863
return new RegExp( [
863-
'<(?:!|(/))?', // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
864+
// For the <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
865+
'(?:',
866+
'<(!DOCTYPE)', // *** Capturing Group 1 - If it's a doctype tag
867+
868+
// Zero or more attributes following the tag name
869+
'(?:',
870+
'\\s+', // one or more whitespace chars before an attribute
871+
872+
// Either:
873+
// A. attr="value", or
874+
// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
875+
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
876+
')*',
877+
'>',
878+
')',
879+
880+
'|',
864881

865-
// The tag name (Capturing Group 2)
866-
'(' + tagNameRegex.source + ')',
882+
// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
883+
'(?:',
884+
'<(/)?', // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag.
885+
// *** Capturing Group 2: The slash ('/') for an end tag. For a start or self-closing tag the group does not participate in the match (undefined per the ECMAScript spec), so it should only be tested for truthiness.
867886

868-
// Zero or more attributes following the tag name
869-
'(?:',
870-
'\\s+', // one or more whitespace chars before an attribute
887+
// *** Capturing Group 3 - The tag name
888+
'(' + tagNameRegex.source + ')',
871889

872-
// Either:
873-
// A. tag="value", or
874-
// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
875-
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
876-
')*',
890+
// Zero or more attributes following the tag name
891+
'(?:',
892+
'\\s+', // one or more whitespace chars before an attribute
893+
nameEqualsValueRegex, // attr="value" (with optional ="value" part)
894+
')*',
877895

878-
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
879-
'>'
880-
].join( "" ), 'g' );
896+
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
897+
'>',
898+
')'
899+
].join( "" ), 'gi' );
881900
} )(),
882901

883902

@@ -911,15 +930,15 @@
911930
// wrapping the URLs in anchor tags
912931
while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
913932
var tagText = currentResult[ 0 ],
914-
tagName = currentResult[ 2 ],
915-
isClosingTag = !!currentResult[ 1 ],
933+
tagName = currentResult[ 1 ] || currentResult[ 3 ], // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a")
934+
isClosingTag = !!currentResult[ 2 ],
916935
inBetweenTagsText = html.substring( lastIndex, currentResult.index );
917936

918937
if( inBetweenTagsText ) {
919938
processTextNodeVisitor( inBetweenTagsText );
920939
}
921940

922-
processHtmlNodeVisitor( tagText, tagName, isClosingTag );
941+
processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );
923942

924943
lastIndex = currentResult.index + tagText.length;
925944
}

dist/Autolinker.min.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "autolinker",
3-
"version": "0.14.1",
3+
"version": "0.15.0",
44
"description": "Utility to automatically link the URLs, email addresses, and Twitter handles in a given block of text/HTML",
55
"main": "dist/Autolinker.js",
66
"directories": {

src/HtmlParser.js

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,34 +20,53 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
2020
*
2121
* Capturing groups:
2222
*
23-
* 1. If it is an end tag, this group will have the '/'.
24-
* 2. The tag name.
23+
* 1. The "!DOCTYPE" tag name, if a tag is a &lt;!DOCTYPE&gt; tag.
24+
* 2. If it is an end tag, this group will have the '/'.
25+
* 3. The tag name for all tags (other than the &lt;!DOCTYPE&gt; tag)
2526
*/
2627
htmlRegex : (function() {
27-
var tagNameRegex = /[0-9a-zA-Z:]+/,
28+
var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
2829
attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/, // the unicode range accounts for excluding control chars, and the delete char
2930
attrValueRegex = /(?:".*?"|'.*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
3031
nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?'; // optional '=[value]'
3132

3233
return new RegExp( [
33-
'<(?:!|(/))?', // Beginning of a tag. Either '<' for a start tag, '</' for an end tag, or <! for the <!DOCTYPE ...> tag. The slash or an empty string is Capturing Group 1.
34+
// For the <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
35+
'(?:',
36+
'<(!DOCTYPE)', // *** Capturing Group 1 - If it's a doctype tag
37+
38+
// Zero or more attributes following the tag name
39+
'(?:',
40+
'\\s+', // one or more whitespace chars before an attribute
41+
42+
// Either:
43+
// A. attr="value", or
44+
// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
45+
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
46+
')*',
47+
'>',
48+
')',
49+
50+
'|',
3451

35-
// The tag name (Capturing Group 2)
36-
'(' + tagNameRegex.source + ')',
37-
38-
// Zero or more attributes following the tag name
39-
'(?:',
40-
'\\s+', // one or more whitespace chars before an attribute
52+
// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
53+
'(?:',
54+
'<(/)?', // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag.
55+
// *** Capturing Group 2: The slash ('/') for an end tag. For a start or self-closing tag the group does not participate in the match (undefined per the ECMAScript spec), so it should only be tested for truthiness.
56+
57+
// *** Capturing Group 3 - The tag name
58+
'(' + tagNameRegex.source + ')',
59+
60+
// Zero or more attributes following the tag name
61+
'(?:',
62+
'\\s+', // one or more whitespace chars before an attribute
63+
nameEqualsValueRegex, // attr="value" (with optional ="value" part)
64+
')*',
4165

42-
// Either:
43-
// A. tag="value", or
44-
// B. "value" alone (for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
45-
'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
46-
')*',
47-
48-
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
49-
'>'
50-
].join( "" ), 'g' );
66+
'\\s*/?', // any trailing spaces and optional '/' before the closing '>'
67+
'>',
68+
')'
69+
].join( "" ), 'gi' );
5170
} )(),
5271

5372

@@ -81,15 +100,15 @@ Autolinker.HtmlParser = Autolinker.Util.extend( Object, {
81100
// wrapping the URLs in anchor tags
82101
while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
83102
var tagText = currentResult[ 0 ],
84-
tagName = currentResult[ 2 ],
85-
isClosingTag = !!currentResult[ 1 ],
103+
tagName = currentResult[ 1 ] || currentResult[ 3 ], // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a")
104+
isClosingTag = !!currentResult[ 2 ],
86105
inBetweenTagsText = html.substring( lastIndex, currentResult.index );
87106

88107
if( inBetweenTagsText ) {
89108
processTextNodeVisitor( inBetweenTagsText );
90109
}
91110

92-
processHtmlNodeVisitor( tagText, tagName, isClosingTag );
111+
processHtmlNodeVisitor( tagText, tagName.toLowerCase(), isClosingTag );
93112

94113
lastIndex = currentResult.index + tagText.length;
95114
}

tests/AutolinkerSpec.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,14 @@ describe( "Autolinker", function() {
850850
} );
851851

852852

853+
it( "should autolink the link, and not fail with 100% cpu in the Regex engine when presented with the input in issue #54", function() {
854+
var inputStr = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)",
855+
result = autolinker.link( inputStr );
856+
857+
expect( result ).toBe( 'Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: <a href="http://dorcoshai.de/pb1205ro">dorcoshai.de/pb1205ro</a>, und dann machst Du am Gewinnspiel mit! Gefallt mir klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)' );
858+
} );
859+
860+
853861
it( "should NOT modify the email address with other tags when inside another anchor", function() {
854862
var input = [
855863
'<div>First name: Subin</div>',

tests/HtmlParserSpec.js

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
/*global Autolinker, _, describe, beforeEach, afterEach, it, expect */
22
describe( "Autolinker.HtmlParser", function() {
3-
var HtmlParser = Autolinker.HtmlParser;
3+
var HtmlParser = Autolinker.HtmlParser,
4+
htmlParser;
5+
6+
7+
beforeEach( function() {
8+
htmlParser = new HtmlParser();
9+
} );
410

511

612
it( "should be able to reproduce the input string based on storing the results of the visitor function calls", function() {
7-
var htmlParser = new HtmlParser(),
8-
inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
13+
var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
914
result = [];
1015

11-
1216
htmlParser.parse( inputStr, {
1317
processHtmlNode : function( tagText, tagName, isClosingTag ) {
1418
result.push( tagText );
@@ -23,8 +27,7 @@ describe( "Autolinker.HtmlParser", function() {
2327

2428

2529
it( "should properly call the visitor functions for each text / html node encountered, with the proper arguments", function() {
26-
var htmlParser = new HtmlParser(),
27-
inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
30+
var inputStr = 'Joe went to <a href="google.com">ebay.com</a> today, and bought <b>big</b> items',
2831
htmlNodeArgs = [],
2932
textNodeArgs = [];
3033

@@ -51,4 +54,65 @@ describe( "Autolinker.HtmlParser", function() {
5154
expect( textNodeArgs[ 4 ] ).toEqual( [ ' items' ] );
5255
} );
5356

57+
58+
it( 'should match tags of both upper and lower case', function() {
59+
var inputStr = 'Joe <!DOCTYPE html> went <!doctype "blah" "blah blah"> to <a href="google.com">ebay.com</a> today, and <A href="purchase.com">purchased</A> <b>big</b> <B>items</B>',
60+
htmlNodeArgs = [],
61+
textNodeArgs = [];
62+
63+
htmlParser.parse( inputStr, {
64+
processHtmlNode : function( tagText, tagName, isClosingTag ) {
65+
htmlNodeArgs.push( Array.prototype.slice.call( arguments ) );
66+
},
67+
processTextNode : function( text ) {
68+
textNodeArgs.push( Array.prototype.slice.call( arguments ) );
69+
}
70+
} );
71+
72+
expect( htmlNodeArgs.length ).toBe( 10 );
73+
expect( htmlNodeArgs[ 0 ] ).toEqual( [ '<!DOCTYPE html>', '!doctype', false ] );
74+
expect( htmlNodeArgs[ 1 ] ).toEqual( [ '<!doctype "blah" "blah blah">', '!doctype', false ] );
75+
expect( htmlNodeArgs[ 2 ] ).toEqual( [ '<a href="google.com">', 'a', false ] );
76+
expect( htmlNodeArgs[ 3 ] ).toEqual( [ '</a>', 'a', true ] );
77+
expect( htmlNodeArgs[ 4 ] ).toEqual( [ '<A href="purchase.com">', 'a', false ] );
78+
expect( htmlNodeArgs[ 5 ] ).toEqual( [ '</A>', 'a', true ] );
79+
expect( htmlNodeArgs[ 6 ] ).toEqual( [ '<b>', 'b', false ] );
80+
expect( htmlNodeArgs[ 7 ] ).toEqual( [ '</b>', 'b', true ] );
81+
expect( htmlNodeArgs[ 8 ] ).toEqual( [ '<B>', 'b', false ] );
82+
expect( htmlNodeArgs[ 9 ] ).toEqual( [ '</B>', 'b', true ] );
83+
84+
expect( textNodeArgs.length ).toBe( 10 );
85+
expect( textNodeArgs[ 0 ] ).toEqual( [ 'Joe ' ] );
86+
expect( textNodeArgs[ 1 ] ).toEqual( [ ' went ' ] );
87+
expect( textNodeArgs[ 2 ] ).toEqual( [ ' to ' ] );
88+
expect( textNodeArgs[ 3 ] ).toEqual( [ 'ebay.com' ] );
89+
expect( textNodeArgs[ 4 ] ).toEqual( [ ' today, and ' ] );
90+
expect( textNodeArgs[ 5 ] ).toEqual( [ 'purchased' ] );
91+
expect( textNodeArgs[ 6 ] ).toEqual( [ ' ' ] );
92+
expect( textNodeArgs[ 7 ] ).toEqual( [ 'big' ] );
93+
expect( textNodeArgs[ 8 ] ).toEqual( [ ' ' ] );
94+
expect( textNodeArgs[ 9 ] ).toEqual( [ 'items' ] );
95+
} );
96+
97+
98+
it( "should not freeze up the regular expression engine when presented with the input string in issue #54", function() {
99+
var inputStr = "Shai ist endlich in Deutschland! Und wir haben gute Nachrichten! <3 Alle, die den Shai-Rasierer kostenlos probieren, machen am Gewinnspiel eines Jahresvorrates Klingen mit. Den Rasierer bekommst Du kostenlos durch diesen Link: http://dorcoshai.de/pb1205ro, und dann machst Du am Gewinnspiel mit! 'Gefallt mir' klicken, wenn Du gern einen Jahresvorrat Shai haben mochtest. (Y)",
100+
htmlNodeArgs = [],
101+
textNodeArgs = [];
102+
103+
htmlParser.parse( inputStr, {
104+
processHtmlNode : function( tagText, tagName, isClosingTag ) {
105+
htmlNodeArgs.push( Array.prototype.slice.call( arguments ) );
106+
},
107+
processTextNode : function( text ) {
108+
textNodeArgs.push( Array.prototype.slice.call( arguments ) );
109+
}
110+
} );
111+
112+
expect( htmlNodeArgs.length ).toBe( 0 );
113+
114+
expect( textNodeArgs.length ).toBe( 1 );
115+
expect( textNodeArgs[ 0 ] ).toEqual( [ inputStr ] );
116+
} );
117+
54118
} );

0 commit comments

Comments
 (0)