Skip to content

Commit ee33204

Browse files
committed
Add support for protocol-relative URLs (i.e. URLs that start with '//')
1 parent 94adf90 commit ee33204

File tree

4 files changed

+203
-77
lines changed

4 files changed

+203
-77
lines changed

dist/Autolinker.js

Lines changed: 75 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* https://github.com/gregjacobs/Autolinker.js
99
*/
1010
/*global define, module */
11-
/*jshint smarttabs:true */
11+
/*jshint undef:true, smarttabs:true */
1212
// Set up Autolinker appropriately for the environment.
1313
( function( root, factory ) {
1414
if( typeof define === 'function' && define.amd ) {
@@ -132,20 +132,25 @@
132132
*
133133
* The regular expression that matches URLs, email addresses, and Twitter handles.
134134
*
135-
* Capturing groups:
135+
* This regular expression has the following capturing groups:
136136
*
137-
* 1. Group that is used to determine if there is a Twitter handle match (i.e. @someTwitterUser). Simply check for its existence
138-
* to determine if there is a Twitter handle match. The next couple of capturing groups give information about the Twitter
139-
* handle match.
140-
* 2. The whitespace character before the @sign in a Twitter handle. This is needed because there are no lookbehinds in JS regular
141-
* expressions, and can be used to reconstruct the original string in a replace().
137+
* 1. Group that is used to determine if there is a Twitter handle match (i.e. @someTwitterUser). Simply check for its
138+
* existence to determine if there is a Twitter handle match. The next couple of capturing groups give information
139+
* about the Twitter handle match.
140+
* 2. The whitespace character before the @sign in a Twitter handle. This is needed because there are no lookbehinds in
141+
* JS regular expressions, and can be used to reconstruct the original string in a replace().
142142
* 3. The Twitter handle itself in a Twitter match. If the match is '@someTwitterUser', the handle is 'someTwitterUser'.
143-
* 4. Group that matches an email address. Used to determine if the match is an email address, as well as holding the full address.
144-
143+
* 4. Group that matches an email address. Used to determine if the match is an email address, as well as holding the full
144+
* address. Ex: 'me@my.com'
145145
* 5. Group that matches a URL in the input text. Ex: 'http://google.com', 'www.google.com', or just 'google.com'.
146146
* This also includes a path, url parameters, or hash anchors. Ex: google.com/path/to/file?q1=1&q2=2#myAnchor
147+
* 6. A protocol-relative ('//') match for the case of a 'www.' prefixed URL. Will be an empty string if it is not a
148+
* protocol-relative match. We need to know the character before the '//' in order to determine if it is a valid match
149+
* or the // was in a string we don't want to auto-link.
150+
* 7. A protocol-relative ('//') match for the case of a known TLD prefixed URL. Will be an empty string if it is not a
151+
* protocol-relative match. See #6 for more info.
147152
*/
148-
matcherRegex: (function() {
153+
matcherRegex : (function() {
149154
var twitterRegex = /(^|[^\w])@(\w{1,15})/, // For matching a twitter handle. Ex: @gregory_jacobs
150155

151156
emailRegex = /(?:[\-;:&=\+\$,\w\.]+@)/, // something@ for email addresses (a.k.a. local-part)
@@ -159,7 +164,6 @@
159164
// http://blog.codinghorror.com/the-problem-with-urls/
160165
urlSuffixRegex = /(?:[\-A-Za-z0-9+&@#\/%?=~_()|!:,.;]*[\-A-Za-z0-9+&@#\/%=~_()|])?/; // note: optional part of the full regex
161166

162-
163167
return new RegExp( [
164168
'(', // *** Capturing group $1, which can be used to check for a twitter handle match. Use group $3 for the actual twitter handle though. $2 may be used to reconstruct the original string in a replace()
165169
// *** Capturing group $2, which matches the whitespace character before the '@' sign (needed because of no lookbehinds), and
@@ -179,21 +183,23 @@
179183

180184
'(', // *** Capturing group $5, which is used to match a URL
181185
'(?:', // parens to cover match for protocol (optional), and domain
182-
'(?:', // non-capturing paren for a protocol-prefixed url (ex: http://google.com)
186+
'(?:', // non-capturing paren for a protocol-prefixed url (ex: http://google.com)
183187
protocolRegex.source,
184188
domainNameRegex.source,
185189
')',
186190

187191
'|',
188192

189193
'(?:', // non-capturing paren for a 'www.' prefixed url (ex: www.google.com)
194+
'(.?//)?', // *** Capturing group $6 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
190195
wwwRegex.source,
191196
domainNameRegex.source,
192197
')',
193198

194199
'|',
195200

196201
'(?:', // non-capturing paren for a known TLD url (ex: google.com)
202+
'(.?//)?', // *** Capturing group $7 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
197203
domainNameRegex.source,
198204
tldRegex.source,
199205
')',
@@ -204,12 +210,24 @@
204210
].join( "" ), 'g' );
205211
} )(),
206212

213+
/**
214+
* @private
215+
* @property {RegExp} protocolRelativeRegex
216+
*
217+
* The regular expression used to find protocol-relative URLs. A protocol-relative URL is, for example, "//yahoo.com"
218+
*
219+
* This regular expression needs to match the character before the '//', in order to determine if we should actually
220+
* autolink a protocol-relative URL. For instance, we want to autolink something like "//google.com", but we
221+
* don't want to autolink something like "abc//google.com"
222+
*/
223+
protocolRelativeRegex : /(.)?\/\//,
224+
207225
/**
208226
* @private
209227
* @property {RegExp} htmlRegex
210228
*
211-
* A regular expression used to pull out HTML tags from a string. Handles namespaced HTML tags and
212-
* attribute names, as specified by http://www.w3.org/TR/html-markup/syntax.html
229+
* The regular expression used to pull out HTML tags from a string. Handles namespaced HTML tags and
230+
* attribute names, as specified by http://www.w3.org/TR/html-markup/syntax.html.
213231
*
214232
* Capturing groups:
215233
*
@@ -343,15 +361,31 @@
343361
enableEmailAddresses = this.email,
344362
enableUrls = this.urls;
345363

346-
return text.replace( matcherRegex, function( matchStr, $1, $2, $3, $4, $5 ) {
364+
return text.replace( matcherRegex, function( matchStr, $1, $2, $3, $4, $5, $6, $7 ) {
347365
var twitterMatch = $1,
348366
twitterHandlePrefixWhitespaceChar = $2, // The whitespace char before the @ sign in a Twitter handle match. This is needed because of no lookbehinds in JS regexes
349-
twitterHandle = $3, // The actual twitterUser (i.e the word after the @ sign in a Twitter handle match)
350-
emailAddress = $4, // For both determining if it is an email address, and stores the actual email address
351-
urlMatch = $5, // The matched URL string
367+
twitterHandle = $3, // The actual twitterUser (i.e the word after the @ sign in a Twitter handle match)
368+
emailAddress = $4, // For both determining if it is an email address, and stores the actual email address
369+
urlMatch = $5, // The matched URL string
370+
protocolRelativeMatch = $6 || $7, // The '//' for a protocol-relative match, with the character that comes before the '//'
352371

353-
prefixStr = "", // A string to use to prefix the anchor tag that is created. This is needed for the Twitter handle match
354-
suffixStr = ""; // A string to suffix the anchor tag that is created. This is used if there is a trailing parenthesis that should not be auto-linked.
372+
prefixStr = "", // A string to use to prefix the anchor tag that is created. This is needed for the Twitter handle match
373+
suffixStr = ""; // A string to suffix the anchor tag that is created. This is used if there is a trailing parenthesis that should not be auto-linked.
374+
375+
// Early exits with no replacements for:
376+
// 1) Disabled link types
377+
// 2) URL matches which do not have at least one period ('.') in the domain name (effectively skipping over
378+
// matches like "abc:def")
379+
// 3) A protocol-relative url match (a URL beginning with '//') whose previous character is a word character
380+
// (effectively skipping over strings like "abc//google.com")
381+
if(
382+
( twitterMatch && !enableTwitter ) || ( emailAddress && !enableEmailAddresses ) || ( urlMatch && !enableUrls ) ||
383+
( urlMatch && urlMatch.indexOf( '.' ) === -1 ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL
384+
( protocolRelativeMatch && /^[\w]\/\//.test( protocolRelativeMatch ) ) // a protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
385+
) {
386+
return matchStr;
387+
}
388+
355389

356390
// Handle a closing parenthesis at the end of the match, and exclude it if there is not a matching open parenthesis
357391
// in the match. This handles cases like the string "wikipedia.com/something_(disambiguation)" (which should be auto-
@@ -372,33 +406,39 @@
372406

373407

374408
var anchorHref = matchStr, // initialize both of these
375-
anchorText = matchStr; // values as the full match
409+
anchorText = matchStr, // values as the full match
410+
linkType;
376411

377-
// Simply return out for disabled link types, or possibly URL matches which do not have at least have
378-
// one period ('.') in the domain name (effectively skipping over strings like "abc:def")
379-
if(
380-
( twitterMatch && !enableTwitter ) ||
381-
( emailAddress && !enableEmailAddresses ) ||
382-
( urlMatch && !enableUrls ) ||
383-
( urlMatch && urlMatch.indexOf( '.' ) === -1 ) // At least one period ('.') must exist in the URL match for us to consider it an actual URL
384-
) {
385-
return prefixStr + anchorText + suffixStr;
386-
}
387-
388412
// Process the urls that are found. We need to change URLs like "www.yahoo.com" to "http://www.yahoo.com" (or the browser
389413
// will try to direct the user to "http://current-domain.com/www.yahoo.com"), and we need to prefix 'mailto:' to email addresses.
390-
var linkType = 'url';
391414
if( twitterMatch ) {
392415
linkType = 'twitter';
393416
prefixStr = twitterHandlePrefixWhitespaceChar;
394417
anchorHref = 'https://twitter.com/' + twitterHandle;
395418
anchorText = '@' + twitterHandle;
419+
396420
} else if( emailAddress ) {
397421
linkType = 'email';
398422
anchorHref = 'mailto:' + emailAddress;
399423
anchorText = emailAddress;
400-
} else if( !/^[A-Za-z]{3,9}:/i.test( anchorHref ) ) { // url string doesn't begin with a protocol, assume http://
401-
anchorHref = 'http://' + anchorHref;
424+
425+
} else { // url match
426+
linkType = 'url';
427+
428+
if( protocolRelativeMatch ) {
429+
// Strip off any protocol-relative '//' from the anchor text (leaving the previous non-word character
430+
// intact, if there is one)
431+
var protocolRelRegex = new RegExp( "^" + me.protocolRelativeRegex.source ), // for this one, we want to only match at the beginning of the string
432+
charBeforeMatch = protocolRelativeMatch.match( protocolRelRegex )[ 1 ] || "";
433+
434+
prefixStr = charBeforeMatch + prefixStr; // re-add the character before the '//' to what will be placed before the <a> tag
435+
anchorHref = anchorHref.replace( protocolRelRegex, "//" ); // remove the char before the match for the href
436+
anchorText = anchorText.replace( protocolRelRegex, "" ); // remove both the char before the match and the '//' for the anchor text
437+
438+
} else if( !/^[A-Za-z]{3,9}:/i.test( anchorHref ) ) {
439+
// url string doesn't begin with a protocol, assume http://
440+
anchorHref = 'http://' + anchorHref;
441+
}
402442
}
403443

404444
// wrap the match in an anchor tag

0 commit comments

Comments
 (0)