1
+ <?php
2
+
3
+ namespace Aternos \Nbt \String ;
4
+
5
/**
 * Converts between regular PHP strings and Java's "modified UTF-8" (MUTF-8),
 * the string format written by java.io.DataOutput#writeUTF.
 *
 * Modified UTF-8 differs from standard UTF-8 in two ways that this class handles:
 *  - U+0000 is written as the two bytes C0 80, never as a raw NUL byte.
 *  - Code points above U+FFFF are written as a UTF-16 surrogate pair, with each
 *    surrogate encoded as its own three byte sequence (six bytes in total).
 *
 * https://py2jdbc.readthedocs.io/en/latest/mutf8.html
 * https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8
 * https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#readUTF%28%29
 */
class JavaEncoding
{
    /**
     * Lazily created shared instance returned by getInstance().
     */
    protected static ?JavaEncoding $instance = null;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return static
     */
    public static function getInstance(): static
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Encodes a string as Java modified UTF-8.
     *
     * @param string $string text to encode
     * @param string $sourceEncoding mbstring encoding name of $string
     * @return string the modified UTF-8 byte sequence
     */
    public function encode(string $string, string $sourceEncoding = "UTF-8"): string
    {
        $result = "";

        foreach (mb_str_split($string, 1, $sourceEncoding) as $char) {
            $c = mb_ord($char, $sourceEncoding);

            // NUL is never written as a raw zero byte; it uses the two byte form.
            if ($c === 0) {
                $result .= "\xC0\x80";
                continue;
            }

            // Single byte character
            if ($c <= 0x7F) {
                $result .= chr($c);
                continue;
            }

            // Two byte character
            if ($c <= 0x7FF) {
                $result .= chr(0xC0 | (0x1F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Three byte character
            if ($c <= 0xFFFF) {
                $result .= chr(0xE0 | (0x0F & ($c >> 0x0C)));
                $result .= chr(0x80 | (0x3F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Six byte character: a UTF-16 surrogate pair, each half written as
            // a three byte sequence. Surrogates carry the code point minus
            // 0x10000, so the offset has to be removed before splitting the bits.
            $c -= 0x10000;
            $result .= "\xED";
            $result .= chr(0xA0 | (($c >> 0x10) & 0x0F));
            $result .= chr(0x80 | (($c >> 0x0A) & 0x3F));
            $result .= "\xED";
            $result .= chr(0xB0 | (($c >> 0x06) & 0x0F));
            $result .= chr(0x80 | ($c & 0x3F));
        }

        return $result;
    }

    /**
     * Decodes a Java modified UTF-8 byte sequence.
     *
     * @param string $string modified UTF-8 bytes
     * @param string $outputEncoding mbstring encoding name of the returned string
     * @return string the decoded text in $outputEncoding
     * @throws StringDataFormatException if the input contains a raw NUL byte, a
     *         truncated or malformed multi byte sequence, or a lead byte that
     *         modified UTF-8 does not use
     */
    public function decode(string $string, string $outputEncoding = "UTF-8"): string
    {
        $result = "";
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $a = ord($string[$i]);

            if ($a === 0) {
                throw new StringDataFormatException("Invalid NULL byte in string");
            }

            // Single byte character
            if (($a & 0b10000000) === 0b0) {
                $result .= mb_chr($a, $outputEncoding);
                continue;
            }

            // A missing byte is read as 0, which fails the continuation checks below.
            $b = ord($string[++$i] ?? "\0");

            // Two byte character
            if (($a & 0b11100000) === 0b11000000) {
                if (($b & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                $result .= mb_chr((($a & 0x1F) << 6) | ($b & 0x3F), $outputEncoding);
                continue;
            }

            $c = ord($string[++$i] ?? "\0");

            // Maybe six byte character: the first three bytes encode a high surrogate.
            if ($a === 0b11101101 && ($b & 0b11110000) === 0b10100000 && ($c & 0b11000000) === 0b10000000) {
                // Peek ahead without moving $i, in case no low surrogate follows.
                $d = ord($string[$i + 1] ?? "\0");
                $e = ord($string[$i + 2] ?? "\0");
                $f = ord($string[$i + 3] ?? "\0");

                // Six byte character
                if ($d === 0b11101101 && ($e & 0b11110000) === 0b10110000 && ($f & 0b11000000) === 0b10000000) {
                    // The pair stores the code point minus 0x10000, so the offset
                    // must be added back; OR-ing it in would lose the carry
                    // whenever bit 16 of the stored value is already set.
                    $result .= mb_chr(
                        0x10000
                        + (($b & 0x0F) << 0x10)
                        + (($c & 0x3F) << 0x0A)
                        + (($e & 0x0F) << 0x06)
                        + ($f & 0x3F),
                        $outputEncoding
                    );

                    $i += 3;
                    continue;
                }
            }

            // Three byte character
            if (($a & 0b11110000) === 0b11100000) {
                if (($b & 0b11000000) !== 0b10000000 || ($c & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                // NOTE(review): an unpaired surrogate also ends up here. mb_chr() is
                // expected to return false for it with UTF-8 output, which appends
                // nothing - confirm whether that should raise an exception instead.
                $result .= mb_chr((($a & 0x0F) << 12) | (($b & 0x3F) << 6) | ($c & 0x3F), $outputEncoding);
                continue;
            }

            throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
        }

        return $result;
    }
}
0 commit comments