satazor · cachelina · Dec 27, 2022 · Jan 3, 2023
diff --git a/spark-md5.js b/spark-md5.js
@@ -377,10 +377,144 @@
     /**
      * Helpers.
      */
+    var letterFCharCode = "f".charCodeAt(0);
+    var zeroCharCode = "0".charCodeAt(0);
+
+    /**
+     * Maps one hex digit ("0"-"9", "a"-"f", "A"-"F") to its value 0-15.
+     * Returns -1 for anything else, including ":" and "%" (below/between ranges).
+     */
+    function clampHexValue(c) {
+        var digit = c.charCodeAt(0) - zeroCharCode;
+        if (digit >= 0 && digit <= 9) {
+            return digit;
+        }
+        // Fold upper case onto lower case, then rebase so that "a" maps to 0.
+        var letter = (c.charCodeAt(0) | 0x20) - "a".charCodeAt(0);
+        return letter >= 0 && letter <= 5 ? letter + 10 : -1;
+    };
+
+    /**
+     * Combines two hexadecimal digit characters into one byte value.
+     *
+     * @param {string} char1 High-order hex digit.
+     * @param {string} char2 Low-order hex digit.
+     * @returns {number} 0-255, or -1 when either argument is missing/empty
+     *     or is rejected by clampHexValue.
+     */
+    function twoDigitHex(char1, char2) {
+        if (!char1 || !char2) {
+            return -1;
+        }
+        // No hex digit sorts after "f", so reject such characters up front.
+        if (char1.charCodeAt(0) > letterFCharCode || char2.charCodeAt(0) > letterFCharCode) {
+            return -1;
+        }
+        var high = clampHexValue(char1);
+        var low = clampHexValue(char2);
+        // Reject every negative nibble, not only -1: a negative low nibble
+        // added to a valid high nibble could otherwise produce a value >= 0
+        // that callers would accept as a decoded byte.
+        if (high < 0 || low < 0) {
+            return -1;
+        }
+        return (high << 4) + low;
+    };
+    /**
+     * Decodes the (possible) escape sequence that starts at arr[i].
+     * JS port of the per-character unescape step in V8:
+     * https://github.com/v8/v8/blob/main/src/strings/uri.cc#L329
+     *
+     * @param {string[]} arr Characters of the input (legacyUnescape passes str.split("")).
+     * @param {number} i Index in arr of the character to decode.
+     * @param {number} charLength Total number of characters in arr.
+     * @returns {{value: (number|string), step: number}} For "%uXXXX" / "%XX",
+     *     value is the decoded char code (number) and step is 6 / 3;
+     *     otherwise value is arr[i] unchanged (string) and step is 1.
+     */
+    function unescapeChar(arr, i, charLength) {
+        var character = arr[i];
+
+        // Only "%" can start an escape; skip hex parsing for everything else.
+        if (character === "%") {
+            // "%uXXXX": six characters, four valid hex digits.
+            if (i <= charLength - 6 && arr[i + 1] === "u") {
+                var hi = twoDigitHex(arr[i + 2], arr[i + 3]);
+                var lo = twoDigitHex(arr[i + 4], arr[i + 5]);
+                if (hi > -1 && lo > -1) {
+                    return {
+                        value: (hi << 8) + lo,
+                        step: 6,
+                    };
+                }
+            }
+            // "%XX": three characters, two valid hex digits.
+            if (i <= charLength - 3) {
+                var shortValue = twoDigitHex(arr[i + 1], arr[i + 2]);
+                if (shortValue > -1) {
+                    return {
+                        value: shortValue,
+                        step: 3,
+                    };
+                }
+            }
+        }
+
+        // Ordinary character, or a "%" that does not begin a valid escape.
+        return {
+            value: character,
+            step: 1,
+        };
+    };
+
+    /**
+     * Locates the first "%", the character every escape sequence starts with.
+     *
+     * Delegates to String.prototype.indexOf, which stops at the first hit
+     * rather than walking the rest of the string after a match is known.
+     *
+     * @param {string} str String to search.
+     * @returns {number} Zero-based index of the first "%", or -1 if absent.
+     */
+    function findEncodingIndex(str) {
+        // The -1 from indexOf doubles as the "no escape present" signal for callers.
+        return str.indexOf("%");
+    };
+
+    /**
+     * Stand-in for the deprecated global unescape(): decodes "%XX" and
+     * "%uXXXX" escapes; returns "" when str is not a string.
+     */
+    function legacyUnescape(str) {
+        if (typeof str !== "string") {
+            return "";
+        }
+
+        var encodingIndex = findEncodingIndex(str);
+        if (encodingIndex < 0) {
+            return str;
+        }
+        var len = str.length;
+        var arr = str.split("");
+        var decoded;
+        // Text before the first "%" is copied through untouched.
+        var unescapedStr = str.slice(0, encodingIndex);
+
+        for (var i = encodingIndex; i < len; i += decoded.step) {
+            decoded = unescapeChar(arr, i, len);
+            // Decoded escapes are numbers, literal characters are strings. Test the
+            // type, not the text: a literal "7" must stay "7", not become char code 7.
+            unescapedStr += typeof decoded.value === "number"
+                ? String.fromCharCode(decoded.value)
+                : decoded.value;
+        }
+        return unescapedStr;
+    };
 
     function toUtf8(str) {
-        if (/[\u0080-\uFFFF]/.test(str)) {
-            str = unescape(encodeURIComponent(str));
+        var containsUnicodeChars = /[\u0080-\uFFFF]/.test(str)
+        if (containsUnicodeChars) {
+            str = legacyUnescape(encodeURIComponent(str));
         }
 
         return str;

diff --git a/test/specs.js b/test/specs.js
@@ -3,6 +3,142 @@
 var hasher = new SparkMD5(),
     buffHasher = new SparkMD5.ArrayBuffer();
 
+const letterFCharCode = "f".charCodeAt(0);
+const zeroCharCode = "0".charCodeAt(0);
+
+/**
+ * Maps one hex digit ("0"-"9", "a"-"f", "A"-"F") to its value 0-15.
+ * Returns -1 for anything else, including ":" and "%" (below/between ranges).
+ */
+const hexValue = (c) => {
+    const digit = c.charCodeAt(0) - zeroCharCode;
+    if (digit >= 0 && digit <= 9) {
+        return digit;
+    }
+    // Fold upper case onto lower case, then rebase so that "a" maps to 0.
+    const letter = (c.charCodeAt(0) | 0x20) - "a".charCodeAt(0);
+    return letter >= 0 && letter <= 5 ? letter + 10 : -1;
+};
+
+/**
+ * Combines two hexadecimal digit characters into one byte value.
+ *
+ * @param {string} char1 High-order hex digit.
+ * @param {string} char2 Low-order hex digit.
+ * @returns {number} 0-255, or -1 when either argument is missing/empty
+ *     or is rejected by hexValue.
+ */
+const twoDigitHex = (char1, char2) => {
+    if (!char1 || !char2) {
+        return -1;
+    }
+    // No hex digit sorts after "f", so reject such characters up front.
+    if (char1.charCodeAt(0) > letterFCharCode || char2.charCodeAt(0) > letterFCharCode) {
+        return -1;
+    }
+    const high = hexValue(char1);
+    const low = hexValue(char2);
+    // Reject every negative nibble, not only -1: a negative low nibble
+    // added to a valid high nibble could otherwise produce a value >= 0
+    // that callers would accept as a decoded byte.
+    if (high < 0 || low < 0) {
+        return -1;
+    }
+    return (high << 4) + low;
+};
+
+/**
+ * Decodes the (possible) escape sequence that starts at arr[i].
+ * JS port of the per-character unescape step in V8:
+ * https://github.com/v8/v8/blob/main/src/strings/uri.cc#L329
+ *
+ * @param {string[]} arr Characters of the input (legacyUnescape passes str.split("")).
+ * @param {number} i Index in arr of the character to decode.
+ * @param {number} charLength Total number of characters in arr.
+ * @returns {{value: (number|string), step: number}} For "%uXXXX" / "%XX",
+ *     value is the decoded char code (number) and step is 6 / 3;
+ *     otherwise value is arr[i] unchanged (string) and step is 1.
+ *     step is the number of characters of arr the result consumed.
+ */
+const unescapeChar = (arr, i, charLength) => {
+    const character = arr[i];
+
+    // Only "%" can start an escape; skip hex parsing for everything else.
+    if (character === "%") {
+        // "%uXXXX": six characters, four valid hex digits.
+        if (i <= charLength - 6 && arr[i + 1] === "u") {
+            const hi = twoDigitHex(arr[i + 2], arr[i + 3]);
+            const lo = twoDigitHex(arr[i + 4], arr[i + 5]);
+            if (hi > -1 && lo > -1) {
+                return {
+                    value: (hi << 8) + lo,
+                    step: 6,
+                };
+            }
+        }
+        // "%XX": three characters, two valid hex digits.
+        if (i <= charLength - 3) {
+            const shortValue = twoDigitHex(arr[i + 1], arr[i + 2]);
+            if (shortValue > -1) {
+                return {
+                    value: shortValue,
+                    step: 3,
+                };
+            }
+        }
+    }
+
+    // Ordinary character, or a "%" that does not begin a valid escape.
+    return {
+        value: character,
+        step: 1,
+    };
+};
+
+/**
+ * Locates the first "%", the character every escape sequence starts with.
+ *
+ * Delegates to String.prototype.indexOf, which stops at the first hit
+ * rather than walking the rest of the string after a match is known.
+ *
+ * @param {string} str String to search.
+ * @returns {number} Zero-based index of the first "%", or -1 if absent.
+ */
+const findEncodingIndex = (str) => {
+    // The -1 from indexOf doubles as the "no escape present" signal for callers.
+    return str.indexOf("%");
+};
+
+/**
+ * Stand-in for the deprecated global unescape(): decodes "%XX" and
+ * "%uXXXX" escapes; returns "" when str is not a string.
+ */
+const legacyUnescape = (str) => {
+    if (typeof str !== "string") {
+        return "";
+    }
+
+    const encodingIndex = findEncodingIndex(str);
+    if (encodingIndex < 0) {
+        return str;
+    }
+
+    const len = str.length;
+    const arr = str.split("");
+    // Text before the first "%" is copied through untouched.
+    let unescapedStr = str.slice(0, encodingIndex);
+
+    for (let i = encodingIndex; i < len; ) {
+        const { value, step } = unescapeChar(arr, i, len);
+        // Decoded escapes arrive as numbers, literal characters as strings.
+        // Test the type, not the text: a literal "7" must stay "7" instead
+        // of being turned into the control character with char code 7.
+        unescapedStr += typeof value === "number" ? String.fromCharCode(value) : value;
+        i += step;
+    }
+    return unescapedStr;
+};
+
 function unicodeStringToArrayBuffer(str) {
     if (/[\u0080-\uFFFF]/.test(str)) {
         str = unescape(encodeURIComponent(str));
@@ -13,9 +149,9 @@ function unicodeStringToArrayBuffer(str) {
 
 function stringToArrayBuffer(str) {
     var length = str.length,
-       buff = new ArrayBuffer(length),
-       arr = new Uint8Array(buff),
-       i;
+        buff = new ArrayBuffer(length),
+        arr = new Uint8Array(buff),
+        i;
 
     for (i = 0; i < length; i += 1) {
         arr[i] = str.charCodeAt(i);
@@ -359,6 +495,18 @@ test('UTF-8', function () {
     equal(buffHasher.end(), '453931ab48a4a5af69f3da3c21064fc9', 'Incremental (array buffer) of "' + str + '"');
 });
 
+
+test('legacyUnescape', function () {
+    // 'r%C3%A4ksm%C3%B6rg%C3%A5s' is the URI-encoded form of 'räksmörgås'; every %XX decodes to one char
+    equal(legacyUnescape('r%C3%A4ksm%C3%B6rg%C3%A5s'), 'rÃ¤ksmÃ¶rgÃ¥s', 'legacyUnescape() of "r%C3%A4ksm%C3%B6rg%C3%A5s"');
+    equal(legacyUnescape('%E4%F6%FC'), 'äöü', 'legacyUnescape() of "%E4%F6%FC"');
+    equal(legacyUnescape('%u0107'), 'ć', 'legacyUnescape() of "%u0107"');
+    equal(legacyUnescape('%E4'), 'ä', 'legacyUnescape() of "%E4"');
+    equal(legacyUnescape('abc123'), 'abc123', 'legacyUnescape() found no escape sequences for "abc123"');
+    equal(legacyUnescape(''), '', 'legacyUnescape() should find no escape sequences and should return empty "" ');
+    equal(legacyUnescape('スを食'), 'スを食', 'legacyUnescape() should find no escape sequences and should return "スを食" ');
+});
+
 test('Hashing a PNG - ArrayBuffer vs binary string', function () {
     var binString,
         buffer;
@@ -374,4 +522,4 @@ test('Hashing a PNG - ArrayBuffer vs binary string', function () {
     hasher.appendBinary(binString);
 
     equal(buffHasher.end(), hasher.end(), 'md5 sum should be the same for both binary strings and ArrayBuffers');
-});
+});