Skip to content

Commit 4639486

Browse files
committed
added support for decoding Java's weird modified UTF-8 encoding
1 parent 859f4a6 commit 4639486

File tree

5 files changed

+184
-1
lines changed

5 files changed

+184
-1
lines changed

README.md

+12
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,18 @@ $myInt->setValue(42);
4949
echo $myInt->getValue(); // 42
5050
```
5151

52+
On String tags, `getValue()` and `setValue()` will use the raw string data, which uses Java's modified
53+
UTF-8 encoding. To use different encodings,
54+
use `getDecodedValue($encoding = "UTF-8")` and `setDecodedValue($value, $encoding = "UTF-8")` instead.
55+
A list of supported encodings is returned by the `mb_list_encodings()` function.
56+
57+
```php
58+
$myString = new \Aternos\Nbt\Tag\StringTag();
59+
60+
$myString->setDecodedValue("Hello world!");
61+
echo $myString->getDecodedValue(); // Hello world!
62+
```
63+
5264
Compound tags, list tags, and array tags implement the `ArrayAccess`, `Countable`,
5365
and `Iterator` interfaces and can therefore be accessed as arrays.
5466
```php

composer.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"php": ">=8.0",
2020
"php-64bit": "*",
2121
"ext-zlib": "*",
22-
"ext-json": "*"
22+
"ext-json": "*",
23+
"ext-mbstring": "*"
2324
}
2425
}

src/String/JavaEncoding.php

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
<?php
2+
3+
namespace Aternos\Nbt\String;
4+
5+
/**
 * Converts strings between regular encodings and Java's "modified UTF-8".
 *
 * Modified UTF-8 differs from standard UTF-8 in the two ways handled here:
 * U+0000 is written as the two bytes C0 80, and code points above U+FFFF are
 * written as a UTF-16 surrogate pair with each surrogate encoded in three bytes
 * (six bytes in total).
 *
 * https://py2jdbc.readthedocs.io/en/latest/mutf8.html
 * https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8
 * https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#readUTF%28%29
 * Good luck
 */
class JavaEncoding
{
    protected static ?JavaEncoding $instance = null;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return static
     */
    public static function getInstance(): static
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Encodes a string into Java's modified UTF-8.
     *
     * @param string $string text to encode
     * @param string $sourceEncoding encoding of $string (any name accepted by mbstring)
     * @return string modified UTF-8 bytes
     */
    public function encode(string $string, string $sourceEncoding = "UTF-8"): string
    {
        $result = "";

        $chars = mb_str_split($string, 1, $sourceEncoding);
        foreach ($chars as $char) {
            $c = mb_ord($char, $sourceEncoding);

            // U+0000 is never written as a raw NUL byte, it uses the two byte form
            if ($c === 0) {
                $result .= "\xC0\x80";
                continue;
            }

            // Single byte character
            if ($c <= 0x7F) {
                $result .= chr($c);
                continue;
            }

            // Two byte character
            if ($c <= 0x7FF) {
                $result .= chr(0xC0 | (0x1F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Three byte character
            if ($c <= 0xFFFF) {
                $result .= chr(0xE0 | (0x0F & ($c >> 0x0C)));
                $result .= chr(0x80 | (0x3F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Six byte character (surrogate pair). The second byte stores
            // (bits 20-16) - 1, because the surrogate pair represents
            // $c - 0x10000 rather than $c itself.
            $result .= chr(0xED);
            $result .= chr(0xA0 | ((($c >> 0x10) - 1) & 0x0F));
            $result .= chr(0x80 | (($c >> 0x0A) & 0x3F));
            $result .= chr(0xED);
            $result .= chr(0xB0 | (($c >> 0x06) & 0x0F));
            $result .= chr(0x80 | ($c & 0x3F));
        }

        return $result;
    }

    /**
     * Decodes Java's modified UTF-8 into the requested output encoding.
     *
     * @param string $string modified UTF-8 bytes
     * @param string $outputEncoding encoding of the returned string (any name accepted by mbstring)
     * @return string
     * @throws StringDataFormatException if the input contains a raw NUL byte or a malformed sequence
     */
    public function decode(string $string, string $outputEncoding = "UTF-8"): string
    {
        $result = "";
        $length = strlen($string);
        for ($i = 0; $i < $length; $i++) {
            $a = ord($string[$i]);

            if ($a === 0) {
                throw new StringDataFormatException("Invalid NULL byte in string");
            }

            // Single byte character
            if (($a & 0b10000000) === 0b0) {
                $result .= mb_chr($a, $outputEncoding);
                continue;
            }

            // A missing byte reads as 0, which fails the continuation checks below
            $b = ord($string[++$i] ?? "\0");

            // Two byte character
            if (($a & 0b11100000) === 0b11000000) {
                if (($b & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                $result .= mb_chr((($a & 0x1F) << 6) | ($b & 0x3F), $outputEncoding);
                continue;
            }

            $c = ord($string[++$i] ?? "\0");

            // Maybe six byte character
            if ($a === 0b11101101 && ($b & 0b11110000) === 0b10100000 && ($c & 0b11000000) === 0b10000000) {
                $d = ord($string[$i + 1] ?? "\0");
                $e = ord($string[$i + 2] ?? "\0");
                $f = ord($string[$i + 3] ?? "\0");

                // Six byte character
                if ($d === 0b11101101 && ($e & 0b11110000) === 0b10110000 && ($f & 0b11000000) === 0b10000000) {
                    // The 0x10000 offset has to be added, not OR-ed: the low
                    // nibble of $b also occupies bit 16.
                    $result .= mb_chr(
                        0x10000
                        + (($b & 0x0F) << 0x10)
                        + (($c & 0x3F) << 0x0A)
                        + (($e & 0x0F) << 0x06)
                        + ($f & 0x3F),
                        $outputEncoding
                    );

                    $i += 3;
                    continue;
                }
            }

            // Three byte character
            if (($a & 0b11110000) === 0b11100000) {
                if (($b & 0b11000000) !== 0b10000000 || ($c & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                $result .= mb_chr((($a & 0x0F) << 12) | (($b & 0x3F) << 6) | ($c & 0x3F), $outputEncoding);
                continue;
            }

            throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
        }
        return $result;
    }
}
+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?php
2+
3+
namespace Aternos\Nbt\String;
4+
5+
/**
 * Exception type thrown by JavaEncoding::decode() when string data is malformed.
 */
class StringDataFormatException extends \Exception
{
}

src/Tag/StringTag.php

+23
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
use Aternos\Nbt\IO\Reader\Reader;
66
use Aternos\Nbt\IO\Writer\Writer;
7+
use Aternos\Nbt\String\JavaEncoding;
8+
use Aternos\Nbt\String\StringDataFormatException;
79
use Exception;
810

911
class StringTag extends Tag
@@ -20,6 +22,16 @@ public function getValue(): string
2022
return $this->value;
2123
}
2224

25+
/**
 * Returns the stored raw value decoded from Java's modified UTF-8.
 *
 * @param string $encoding encoding of the returned string, passed through to JavaEncoding::decode()
 * @return string
 * @throws StringDataFormatException
 */
public function getDecodedValue(string $encoding = "UTF-8"): string
{
    $decoder = JavaEncoding::getInstance();

    return $decoder->decode($this->value, $encoding);
}
34+
2335
/**
2436
* @param string $value
2537
* @return StringTag
@@ -30,6 +42,17 @@ public function setValue(string $value): StringTag
3042
return $this;
3143
}
3244

45+
/**
 * Encodes the given text with JavaEncoding::encode() and stores the result as the raw value.
 *
 * @param string $value text to store
 * @param string $encoding encoding of $value, passed through to JavaEncoding::encode()
 * @return StringTag this tag
 */
public function setDecodedValue(string $value, string $encoding = "UTF-8"): StringTag
{
    $encoder = JavaEncoding::getInstance();
    $this->value = $encoder->encode($value, $encoding);

    return $this;
}
55+
3356
/**
3457
* @return int
3558
*/

0 commit comments

Comments
 (0)