1+ <?php
2+
3+ namespace Aternos \Nbt \String ;
4+
/**
 * Converts between PHP strings and Java's "modified UTF-8" (MUTF-8), the format
 * read and written by java.io.DataInput / java.io.DataOutput.
 *
 * Differences from standard UTF-8:
 * - U+0000 is written as the two bytes C0 80, so encoded data never contains a raw NUL byte.
 * - Code points above U+FFFF are written as a UTF-16 surrogate pair, with each surrogate
 *   encoded as its own three byte sequence (six bytes in total). Four byte UTF-8
 *   sequences are not used.
 *
 * References:
 * https://py2jdbc.readthedocs.io/en/latest/mutf8.html
 * https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8
 * https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#readUTF%28%29
 */
class JavaEncoding
{
    /**
     * Lazily created shared instance returned by getInstance().
     */
    protected static ?JavaEncoding $instance = null;

    /**
     * Get the shared encoder instance, creating it on first use.
     *
     * @return static
     */
    public static function getInstance(): static
    {
        // The property is shared with subclasses unless they redeclare it, so a plain
        // null check could hand back a parent instance and violate the "static" return type.
        if (!(static::$instance instanceof static)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Encode a string as Java modified UTF-8.
     *
     * @param string $string text to encode
     * @param string $sourceEncoding mbstring encoding name of $string
     * @return string the modified UTF-8 bytes
     */
    public function encode(string $string, string $sourceEncoding = "UTF-8"): string
    {
        $result = "";

        foreach (mb_str_split($string, 1, $sourceEncoding) as $char) {
            // NOTE(review): mb_ord() is documented to return false on failure;
            // that case is not handled specially here - confirm whether callers need it.
            $c = mb_ord($char, $sourceEncoding);

            // NUL is deliberately written in its overlong two byte form
            if ($c === 0) {
                $result .= "\xC0\x80";
                continue;
            }

            // One byte: 0xxxxxxx
            if ($c <= 0x7F) {
                $result .= chr($c);
                continue;
            }

            // Two bytes: 110xxxxx 10xxxxxx
            if ($c <= 0x7FF) {
                $result .= chr(0xC0 | (0x1F & ($c >> 6)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            if ($c <= 0xFFFF) {
                $result .= chr(0xE0 | (0x0F & ($c >> 12)));
                $result .= chr(0x80 | (0x3F & ($c >> 6)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Supplementary code point: encode its UTF-16 surrogate pair, three bytes
            // per surrogate. The surrogates carry the 20 bit offset from U+10000, so
            // that offset has to be removed before the bits are distributed.
            $offset = $c - 0x10000;
            $result .= chr(0xED);
            $result .= chr(0xA0 | (($offset >> 16) & 0x0F));
            $result .= chr(0x80 | (($offset >> 10) & 0x3F));
            $result .= chr(0xED);
            $result .= chr(0xB0 | (($offset >> 6) & 0x0F));
            $result .= chr(0x80 | ($offset & 0x3F));
        }

        return $result;
    }

    /**
     * Decode Java modified UTF-8 bytes into a string.
     *
     * @param string $string modified UTF-8 bytes
     * @param string $outputEncoding mbstring encoding name of the returned string
     * @return string the decoded text
     * @throws StringDataFormatException if the input contains a raw NUL byte, a truncated
     *                                   sequence, or an invalid lead/continuation byte
     */
    public function decode(string $string, string $outputEncoding = "UTF-8"): string
    {
        $result = "";
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $a = ord($string[$i]);

            // NUL must always be encoded as C0 80, a raw zero byte is never valid
            if ($a === 0) {
                throw new StringDataFormatException("Invalid NULL byte in string ");
            }

            // Single byte character
            if (($a & 0b10000000) === 0) {
                $result .= mb_chr($a, $outputEncoding);
                continue;
            }

            // Bytes past the end of the input read as 0x00, which can never pass
            // the continuation byte checks below, so truncated input is rejected.
            $b = ord($string[++$i] ?? "\0");

            // Two byte character
            if (($a & 0b11100000) === 0b11000000) {
                if (($b & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8 \" sequence ");
                }

                $result .= mb_chr((($a & 0x1F) << 6) | ($b & 0x3F), $outputEncoding);
                continue;
            }

            $c = ord($string[++$i] ?? "\0");

            // Maybe six byte character: the first three bytes form a high surrogate
            if ($a === 0b11101101 && ($b & 0b11110000) === 0b10100000 && ($c & 0b11000000) === 0b10000000) {
                // Peek without advancing, the next three bytes are only consumed
                // if they really are the matching low surrogate.
                $d = ord($string[$i + 1] ?? "\0");
                $e = ord($string[$i + 2] ?? "\0");
                $f = ord($string[$i + 3] ?? "\0");

                // Six byte character
                if ($d === 0b11101101 && ($e & 0b11110000) === 0b10110000 && ($f & 0b11000000) === 0b10000000) {
                    // The pair stores the offset from U+10000. It must be added, not
                    // OR-ed: the offset's own bit 16 overlaps with 0x10000.
                    $result .= mb_chr(
                        0x10000
                        + (($b & 0x0F) << 16)
                        + (($c & 0x3F) << 10)
                        + (($e & 0x0F) << 6)
                        + ($f & 0x3F),
                        $outputEncoding
                    );

                    $i += 3;
                    continue;
                }
            }

            // Three byte character
            if (($a & 0b11110000) === 0b11100000) {
                if (($b & 0b11000000) !== 0b10000000 || ($c & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8 \" sequence ");
                }

                // NOTE(review): an unpaired surrogate ends up here; mb_chr() presumably
                // returns false for it, which appends nothing - confirm this is intended.
                $result .= mb_chr((($a & 0x0F) << 12) | (($b & 0x3F) << 6) | ($c & 0x3F), $outputEncoding);
                continue;
            }

            // Stray continuation byte or a lead byte that modified UTF-8 never uses
            throw new StringDataFormatException("Invalid \"UTF-8 \" sequence ");
        }

        return $result;
    }
}
0 commit comments