From 25ec9f929301a33eca0b22826bb070eb77d32e3a Mon Sep 17 00:00:00 2001 From: catcompiles Date: Tue, 27 Dec 2022 03:27:26 -0800 Subject: [PATCH 1/2] fix(toUTF8): replace deprecated unescape with decodedURIComponent --- spark-md5.js | 6 +++--- test/specs.js | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spark-md5.js b/spark-md5.js index d2bdbbb..09a7037 100644 --- a/spark-md5.js +++ b/spark-md5.js @@ -379,10 +379,10 @@ */ function toUtf8(str) { - if (/[\u0080-\uFFFF]/.test(str)) { - str = unescape(encodeURIComponent(str)); + const containsUnicodeChars = /[\u0080-\uFFFF]/.test(str) + if (containsUnicodeChars) { + str = decodeURIComponent(encodeURIComponent(str)); } - return str; } diff --git a/test/specs.js b/test/specs.js index 87cbcb0..5e1c246 100644 --- a/test/specs.js +++ b/test/specs.js @@ -324,13 +324,13 @@ test('Incremental usage (resume with JSON.stringify)', function () { test('UTF-8', function () { var str = 'räksmörgås'; - equal(SparkMD5.hash(str), 'e462805dcf84413d5eddca45a4b88a5e', 'SparkMD5.hash() of "' + str + '"'); + equal(SparkMD5.hash(str), '09d9d71ec8a8e3bc74e51ebd587154f3', 'SparkMD5.hash() of "' + str + '"'); equal(SparkMD5.hashBinary(str), '09d9d71ec8a8e3bc74e51ebd587154f3', 'SparkMD5.hashBinary() of "' + str + '"'); equal(SparkMD5.ArrayBuffer.hash(unicodeStringToArrayBuffer(str)), 'e462805dcf84413d5eddca45a4b88a5e', 'SparkMD5.ArrayBuffer.hash() of "' + str + '"'); hasher.reset(); hasher.append(str); - equal(hasher.end(), 'e462805dcf84413d5eddca45a4b88a5e', 'Incremental (normal) of "' + str + '"'); + equal(hasher.end(), '09d9d71ec8a8e3bc74e51ebd587154f3', 'Incremental (normal) of "' + str + '"'); hasher.reset(); hasher.appendBinary(str); @@ -342,13 +342,13 @@ test('UTF-8', function () { str = '\u30b9\u3092\u98df'; - equal(SparkMD5.hash(str), '453931ab48a4a5af69f3da3c21064fc9', 'SparkMD5.hash() of "' + str + '"'); - equal(SparkMD5.hashBinary(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hashBinary() of "' + str + '"'); + equal(SparkMD5.hash(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hash() of "' + str + '"'); + equal(SparkMD5.hashBinary(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hashBinary() of '' + str + '"'); equal(SparkMD5.ArrayBuffer.hash(unicodeStringToArrayBuffer(str)), '453931ab48a4a5af69f3da3c21064fc9', 'SparkMD5.ArrayBuffer.hash() of "' + str + '"'); hasher.reset(); hasher.append(str); - equal(hasher.end(), '453931ab48a4a5af69f3da3c21064fc9', 'Incremental (normal) of "' + str + '"'); + equal(hasher.end(), '24e3399be06b7cf59dbd848e18d9246c', 'Incremental (normal) of "' + str + '"'); hasher.reset(); hasher.appendBinary(str); From 41562805a1d2ca508cf411b5318e4755f1873091 Mon Sep 17 00:00:00 2001 From: catcompiles Date: Mon, 2 Jan 2023 21:33:56 -0800 Subject: [PATCH 2/2] Reimplement native Javascript unescape to ensure backward compatibility --- spark-md5.js | 138 ++++++++++++++++++++++++++++++++++++++++- test/specs.js | 166 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 293 insertions(+), 11 deletions(-) diff --git a/spark-md5.js b/spark-md5.js index 09a7037..23edcf3 100644 --- a/spark-md5.js +++ b/spark-md5.js @@ -377,12 +377,146 @@ /** * Helpers. */ + var letterFCharCode = "f".charCodeAt(0); + var zeroCharCode = "0".charCodeAt(0); + + function clampHexValue(c) { + var charCodeNum = c.charCodeAt(0); + charCodeNum -= zeroCharCode; + if (charCodeNum <= 9) { + return charCodeNum; + } + var aCharCode = "a".charCodeAt(0); + charCodeNum = (charCodeNum | 0x20) - (aCharCode - zeroCharCode); + if (charCodeNum <= 5) { + return charCodeNum + 10; + } + return -1; + }; + + function twoDigitHex(char1, char2) { + if (!char1) { + return -1; + } + if (!char2) { + return -1; + } + + var char1CharCode = char1.charCodeAt(0); + var char2CharCode = char2.charCodeAt(0); + if (char1CharCode > letterFCharCode) { + return -1; + } + var high = clampHexValue(char1); + if (high == -1) { + return -1; + } + if (char2CharCode > letterFCharCode) { + return -1; + } + var low = clampHexValue(char2); + if (low == -1) { + return -1; + } + return (high << 4) + low; + }; + //JS implementation of unescaping a character + //from v8 https://github.com/v8/v8/blob/main/src/strings/uri.cc#L329 + function unescapeChar(arr, i, charLength) { + var hi = 0; + var lo = 0; + var character = arr[i]; + + var encodingPresent = character == "%"; + var longHexChars = i <= charLength - 6; + var unicodeChar = arr[i + 1] == "u"; + var firstTwoHexChars = twoDigitHex(arr[i + 2], arr[i + 3]); + var lastTwoHexChars = twoDigitHex(arr[i + 4], arr[i + 5]); + + var validFourHex = firstTwoHexChars > -1 && lastTwoHexChars > -1; + var startOfEscapeSeq = encodingPresent && unicodeChar; + + var escapeFourDigitHex = startOfEscapeSeq && longHexChars && validFourHex; + + if (escapeFourDigitHex) { + hi = firstTwoHexChars; + lo = lastTwoHexChars; + return { + value: (hi << 8) + lo, + step: 6, + }; + } + + var shortHexChars = i <= charLength - 3; + var shortTwoHexChars = twoDigitHex(arr[i + 1], arr[i + 2]); + var validShortHex = shortTwoHexChars > -1; + + var escapeTwoDigitHex = encodingPresent && shortHexChars && validShortHex; + + if (escapeTwoDigitHex) { + lo = shortTwoHexChars; + return { + value: lo, + step: 3, + }; + } else { + return { + value: character, + step: 1, + }; + } + }; + + function findEncodingIndex(str) { + var index = -1; + + for (var i = 0; i < str.length; i++) { + var containsPercent = str[i] === "%"; + var percentNotFound = index === -1; + + if (containsPercent && percentNotFound) { + index = i; + } + } + return index; + }; + + function legacyUnescape(str) { + if (typeof str != "string") { + return ""; + } + + var encodingIndex = findEncodingIndex(str); + + var noEscapeChars = encodingIndex < 0; + if (noEscapeChars) { + return str; + } + + var len = str.length; + var arr = str.split(""); + var destPosition = 0; + var unescapedStr = ""; + unescapedStr += str.substring(encodingIndex, 0); + + for (var i = encodingIndex; i < len; destPosition += 1) { + var { value, step } = unescapeChar(arr, i, len); + if (/[0-9]/.test(value)) { + unescapedStr += String.fromCharCode.call(this, value); + } else { + unescapedStr += value; + } + i += step; + } + return unescapedStr; + }; function toUtf8(str) { - const containsUnicodeChars = /[\u0080-\uFFFF]/.test(str) + var containsUnicodeChars = /[\u0080-\uFFFF]/.test(str) if (containsUnicodeChars) { - str = decodeURIComponent(encodeURIComponent(str)); + str = legacyUnescape(encodeURIComponent(str)); } + return str; } diff --git a/test/specs.js b/test/specs.js index 5e1c246..e28098d 100644 --- a/test/specs.js +++ b/test/specs.js @@ -3,6 +3,142 @@ var hasher = new SparkMD5(), buffHasher = new SparkMD5.ArrayBuffer(); +const letterFCharCode = "f".charCodeAt(0); +const zeroCharCode = "0".charCodeAt(0); + +const hexValue = (c) => { + let charCodeNum = c.charCodeAt(0); + charCodeNum -= zeroCharCode; + if (charCodeNum <= 9) { + return charCodeNum; + } + let aCharCode = "a".charCodeAt(0); + charCodeNum = (charCodeNum | 0x20) - (aCharCode - zeroCharCode); + if (charCodeNum <= 5) { + return charCodeNum + 10; + } + return -1; +}; + +const twoDigitHex = (char1, char2) => { + if (!char1) { + return -1; + } + if (!char2) { + return -1; + } + + let char1CharCode = char1.charCodeAt(0); + let char2CharCode = char2.charCodeAt(0); + if (char1CharCode > letterFCharCode) { + return -1; + } + let high = hexValue(char1); + if (high == -1) { + return -1; + } + if (char2CharCode > letterFCharCode) { + return -1; + } + let low = hexValue(char2); + if (low == -1) { + return -1; + } + return (high << 4) + low; +}; + +//JS implementation of unescaping a character +//from v8 https://github.com/v8/v8/blob/main/src/strings/uri.cc#L329 + +const unescapeChar = (arr, i, charLength) => { + let hi = 0; + let lo = 0; + let character = arr[i]; + + const encodingPresent = character == "%"; + const longHexChars = i <= charLength - 6; + const unicodeChar = arr[i + 1] == "u"; + const firstTwoHexChars = twoDigitHex(arr[i + 2], arr[i + 3]); + const lastTwoHexChars = twoDigitHex(arr[i + 4], arr[i + 5]); + + const validFourHex = firstTwoHexChars > -1 && lastTwoHexChars > -1; + const startOfEscapeSeq = encodingPresent && unicodeChar; + + const escapeFourDigitHex = startOfEscapeSeq && longHexChars && validFourHex; + + if (escapeFourDigitHex) { + hi = firstTwoHexChars; + lo = lastTwoHexChars; + return { + value: (hi << 8) + lo, + step: 6, + }; + } + + const shortHexChars = i <= charLength - 3; + const shortTwoHexChars = twoDigitHex(arr[i + 1], arr[i + 2]); + const validShortHex = shortTwoHexChars > -1; + + const escapeTwoDigitHex = encodingPresent && shortHexChars && validShortHex; + + if (escapeTwoDigitHex) { + lo = shortTwoHexChars; + return { + value: lo, + step: 3, + }; + } else { + return { + value: character, + step: 1, + }; + } +}; + +const findEncodingIndex = (str) => { + let index = -1; + + for (let i = 0; i < str.length; i++) { + let containsPercent = str[i] === "%"; + let percentNotFound = index === -1; + + if (containsPercent && percentNotFound) { + index = i; + } + } + return index; +}; + +const legacyUnescape = (str) => { + if (typeof str != "string") { + return ""; + } + + const encodingIndex = findEncodingIndex(str); + + const noEscapeChars = encodingIndex < 0; + if (noEscapeChars) { + return str; + } + + let len = str.length; + let arr = str.split(""); + let destPosition = 0; + let unescapedStr = ""; + unescapedStr += str.substring(encodingIndex, 0); + + for (let i = encodingIndex; i < len; destPosition += 1) { + let { value, step } = unescapeChar(arr, i, len); + if (/[0-9]/.test(value)) { + unescapedStr += String.fromCharCode.call(this, value); + } else { + unescapedStr += value; + } + i += step; + } + return unescapedStr; +}; + function unicodeStringToArrayBuffer(str) { if (/[\u0080-\uFFFF]/.test(str)) { str = unescape(encodeURIComponent(str)); @@ -13,9 +149,9 @@ function unicodeStringToArrayBuffer(str) { function stringToArrayBuffer(str) { var length = str.length, - buff = new ArrayBuffer(length), - arr = new Uint8Array(buff), - i; + buff = new ArrayBuffer(length), + arr = new Uint8Array(buff), + i; for (i = 0; i < length; i += 1) { arr[i] = str.charCodeAt(i); @@ -324,13 +460,13 @@ test('Incremental usage (resume with JSON.stringify)', function () { test('UTF-8', function () { var str = 'räksmörgås'; - equal(SparkMD5.hash(str), '09d9d71ec8a8e3bc74e51ebd587154f3', 'SparkMD5.hash() of "' + str + '"'); + equal(SparkMD5.hash(str), 'e462805dcf84413d5eddca45a4b88a5e', 'SparkMD5.hash() of "' + str + '"'); equal(SparkMD5.hashBinary(str), '09d9d71ec8a8e3bc74e51ebd587154f3', 'SparkMD5.hashBinary() of "' + str + '"'); equal(SparkMD5.ArrayBuffer.hash(unicodeStringToArrayBuffer(str)), 'e462805dcf84413d5eddca45a4b88a5e', 'SparkMD5.ArrayBuffer.hash() of "' + str + '"'); hasher.reset(); hasher.append(str); - equal(hasher.end(), '09d9d71ec8a8e3bc74e51ebd587154f3', 'Incremental (normal) of "' + str + '"'); + equal(hasher.end(), 'e462805dcf84413d5eddca45a4b88a5e', 'Incremental (normal) of "' + str + '"'); hasher.reset(); hasher.appendBinary(str); @@ -342,13 +478,13 @@ test('UTF-8', function () { str = '\u30b9\u3092\u98df'; - equal(SparkMD5.hash(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hash() of "' + str + '"'); - equal(SparkMD5.hashBinary(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hashBinary() of '' + str + '"'); + equal(SparkMD5.hash(str), '453931ab48a4a5af69f3da3c21064fc9', 'SparkMD5.hash() of "' + str + '"'); + equal(SparkMD5.hashBinary(str), '24e3399be06b7cf59dbd848e18d9246c', 'SparkMD5.hashBinary() of "' + str + '"'); equal(SparkMD5.ArrayBuffer.hash(unicodeStringToArrayBuffer(str)), '453931ab48a4a5af69f3da3c21064fc9', 'SparkMD5.ArrayBuffer.hash() of "' + str + '"'); hasher.reset(); hasher.append(str); - equal(hasher.end(), '24e3399be06b7cf59dbd848e18d9246c', 'Incremental (normal) of "' + str + '"'); + equal(hasher.end(), '453931ab48a4a5af69f3da3c21064fc9', 'Incremental (normal) of "' + str + '"'); hasher.reset(); hasher.appendBinary(str); @@ -359,6 +495,18 @@ test('UTF-8', function () { equal(buffHasher.end(), '453931ab48a4a5af69f3da3c21064fc9', 'Incremental (array buffer) of "' + str + '"'); }); + +test('legacyUnescape', function () { + // encoded value of 'räksmörgås'; + equal(legacyUnescape('r%C3%A4ksm%C3%B6rg%C3%A5s'), 'räksmörgÃ¥s', 'legacyUnescape() of "r%C3%A4ksm%C3%B6rg%C3%A5s"'); + equal(legacyUnescape('%E4%F6%FC'), 'äöü', 'legacyUnescape() of "r%C3%A4ksm%C3%B6rg%C3%A5s"'); + equal(legacyUnescape('%u0107'), 'ć', 'legacyUnescape() of "%u0107"'); + equal(legacyUnescape('%E4'), 'ä', 'legacyUnescape() of "%E4"'); + equal(legacyUnescape('abc123'), 'abc123', 'legacyUnescape() found no escape sequences for "abc123"'); + equal(legacyUnescape(''), '', 'legacyUnescape() should find no escape sequences and should return empty "" '); + equal(legacyUnescape('スを食'), 'スを食', 'legacyUnescape() should find no escape sequences and should return "スを食" '); +}); + test('Hashing a PNG - ArrayBuffer vs binary string', function () { var binString, buffer; @@ -374,4 +522,4 @@ test('Hashing a PNG - ArrayBuffer vs binary string', function () { hasher.appendBinary(binString); equal(buffHasher.end(), hasher.end(), 'md5 sum should be the same for both binary strings and ArrayBuffers'); -}); +}); \ No newline at end of file