-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.ts
192 lines (172 loc) · 5.91 KB
/
utf8.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import { mergeTypedArrays } from "./encode.ts";
import { unicodeToLower } from "./unicode.ts";
import { UNREACHABLE } from "./utilities.ts";
/**
 * Tells whether a UTF-8 code unit starts a character.
 *
 * A code unit is treated as a continuation unit only when its two most
 * significant bits (within the low byte) are `10`; every other value counts
 * as the first unit of a character.
 *
 * @param c The code unit to classify.
 * @returns `false` for continuation units (`10xxxxxx`), `true` otherwise.
 */
export function isUtf8CharacterFirstCodeUnit(c: number): boolean {
  const markerBits = c & 0b11000000;
  const isContinuation = markerBits === 0b10000000;
  return !isContinuation;
}
/**
 * Counts the characters in a UTF-8 byte sequence.
 *
 * Every code unit that is not a continuation unit contributes one to the
 * total; the input is not validated.
 *
 * @param str UTF-8 encoded bytes.
 * @returns The number of code units that start a character.
 */
export function utf8Length(str: Uint8Array): number {
  let characters = 0;
  for (let index = 0; index < str.length; index++) {
    if (isUtf8CharacterFirstCodeUnit(str[index])) {
      characters += 1;
    }
  }
  return characters;
}
/**
 * Computes how many UTF-16 code units a UTF-8 byte sequence occupies.
 *
 * Each character-starting code unit counts once, and a four-byte lead
 * (`11110xxx`) counts one extra time because such characters need a
 * surrogate pair in UTF-16. The input is not validated.
 *
 * @param str UTF-8 encoded bytes.
 * @returns The length of the text measured in UTF-16 code units.
 */
export function utf8utf16Length(str: Uint8Array): number {
  let units = 0;
  for (let index = 0; index < str.length; index++) {
    const byte = str[index];
    if (isUtf8CharacterFirstCodeUnit(byte)) {
      units += 1;
    }
    // Four-byte lead: the character needs a second UTF-16 unit.
    if ((byte & 0xf8) === 0xf0) {
      units += 1;
    }
  }
  return units;
}
/**
 * Steps backwards from `pos` to the start of the preceding character.
 *
 * "Unsafe": no validation is performed, so the caller is expected to
 * guarantee that a character-starting code unit exists before `pos`.
 *
 * @param data UTF-8 encoded bytes.
 * @param pos  Index just past the character to step over.
 * @returns The index of the nearest character-starting code unit before `pos`.
 */
export function prevUtf8Unsafe(data: Uint8Array, pos: number): number {
  let index = pos - 1;
  // Skip continuation units until a lead unit is found.
  while (!isUtf8CharacterFirstCodeUnit(data[index])) {
    index -= 1;
  }
  return index;
}
/**
 * Decodes the character that starts at `pos`.
 *
 * "Unsafe": the sequence length is taken from the lead byte alone and the
 * following bytes are masked to their low six bits without being checked,
 * so the input must already be well-formed UTF-8. A lead byte with its five
 * top bits set matches none of the encodings and reaches `UNREACHABLE()`.
 *
 * @param data UTF-8 encoded bytes.
 * @param pos  Index of the first code unit of the character.
 * @returns The decoded code point (`code`) and the index just past the
 *          character (`pos`).
 */
export function nextUtf8Unsafe(data: Uint8Array, pos: number): { code: number; pos: number } {
  const lead = data[pos];

  // 0xxxxxxx: one byte.
  if ((lead & 0x80) === 0) {
    return { pos: pos + 1, code: lead };
  }

  // 110xxxxx: two bytes.
  if ((lead & 0x20) === 0) {
    const high = (lead & 0x1f) << 6;
    const low = data[pos + 1] & 0x3f;
    return { pos: pos + 2, code: high | low };
  }

  // 1110xxxx: three bytes.
  if ((lead & 0x10) === 0) {
    const high = (lead & 0x0f) << 12;
    const middle = (data[pos + 1] & 0x3f) << 6;
    const low = data[pos + 2] & 0x3f;
    return { pos: pos + 3, code: high | middle | low };
  }

  // 11110xxx: four bytes.
  if ((lead & 0x08) === 0) {
    const high = (lead & 0x07) << 18;
    const upperMiddle = (data[pos + 1] & 0x3f) << 12;
    const lowerMiddle = (data[pos + 2] & 0x3f) << 6;
    const low = data[pos + 3] & 0x3f;
    return { pos: pos + 4, code: high | upperMiddle | lowerMiddle | low };
  }

  UNREACHABLE();
}
/**
 * Writes the UTF-8 encoding of `code` into `text` starting at index `pos`.
 *
 * "Unsafe": the destination is written by index without any capacity or
 * range check, and `code` is not validated.
 *
 * @param text Destination buffer (plain array or `Uint8Array`).
 * @param pos  Index at which the first byte is written.
 * @param code Code point to encode.
 * @returns The index just past the last byte written.
 */
export function appendUtf8CharacterUnsafe(text: number[] | Uint8Array, pos: number, code: number): number {
  // Single byte: stored as-is.
  if (code <= 0x7f) {
    text[pos] = code;
    return pos + 1;
  }

  // Number of continuation bytes and the marker bits of the lead byte.
  let trailing: number;
  let leadMarker: number;
  if (code <= 0x7ff) {
    trailing = 1;
    leadMarker = 0xc0;
  } else if (code <= 0xffff) {
    trailing = 2;
    leadMarker = 0xe0;
  } else {
    trailing = 3;
    leadMarker = 0xf0;
  }

  let cursor = pos;
  text[cursor++] = leadMarker | (code >> (6 * trailing));
  // Emit the remaining payload six bits at a time, most significant first.
  for (let shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
    text[cursor++] = 0x80 | ((code >> shift) & 0x3f);
  }
  return cursor;
}
/**
 * Encodes `code` as UTF-8 and combines the bytes with `str`.
 *
 * The encoded bytes are handed, together with `str`, to `mergeTypedArrays`,
 * whose result is returned. `code` is not validated.
 *
 * @param str  Existing UTF-8 bytes.
 * @param code Code point to encode.
 * @returns The result of `mergeTypedArrays(str, <encoded bytes>)`.
 */
export function appendUtf8Character(str: Uint8Array, code: number) {
  let encoded: number[];
  if (code <= 0x7f) {
    encoded = [code];
  } else if (code <= 0x7ff) {
    encoded = [
      0xc0 | (code >> 6),
      0x80 | (code & 0x3f),
    ];
  } else if (code <= 0xffff) {
    encoded = [
      0xe0 | (code >> 12),
      0x80 | ((code >> 6) & 0x3f),
      0x80 | (code & 0x3f),
    ];
  } else {
    encoded = [
      0xf0 | (code >> 18),
      0x80 | ((code >> 12) & 0x3f),
      0x80 | ((code >> 6) & 0x3f),
      0x80 | (code & 0x3f),
    ];
  }
  return mergeTypedArrays(str, Uint8Array.from(encoded));
}
/**
 * Lowercases UTF-8 text one code point at a time.
 *
 * Each character is decoded with `nextUtf8Unsafe`, mapped through
 * `unicodeToLower`, and re-encoded. The input is not validated.
 *
 * @param str UTF-8 encoded bytes.
 * @returns A new array holding the re-encoded, mapped code points.
 */
export function utf8ToLower(str: Uint8Array): Uint8Array {
  // Collect output in a growable array and convert once at the end, instead
  // of re-copying the whole result for every appended character.
  const bytes: number[] = [];
  const end = str.length;
  let position = 0;
  // `<` rather than `!==`: a truncated trailing sequence makes the decoder
  // step past `end`, and an inequality test would then never terminate.
  while (position < end) {
    const { pos, code } = nextUtf8Unsafe(str, position);
    position = pos;
    appendUtf8CharacterUnsafe(bytes, bytes.length, unicodeToLower(code));
  }
  return Uint8Array.from(bytes);
}
/**
 * Limits UTF-8 text to its first `length` characters.
 *
 * @param str    UTF-8 encoded bytes.
 * @param length Number of characters to keep.
 * @returns `str` itself when nothing has to be cut, otherwise a copy of the
 *          bytes that precede character number `length` (zero-based).
 */
export function utf8Truncate(str: Uint8Array, length: number): Uint8Array {
  // With no more bytes than requested characters nothing can be cut.
  if (!(str.length > length)) {
    return str;
  }

  let remaining = length;
  for (const [index, byte] of str.entries()) {
    if (!isUtf8CharacterFirstCodeUnit(byte)) {
      continue;
    }
    if (remaining === 0) {
      return str.slice(0, index);
    }
    remaining--;
  }
  return str;
}
/**
 * Limits UTF-8 text using a budget expressed in UTF-16 code units.
 *
 * Characters are kept while the budget is positive. A character whose lead
 * byte is `0xf0` or above costs two units, all others cost one; a character
 * is never split, so a two-unit character is still kept when only one unit
 * of budget remains.
 *
 * @param str    UTF-8 encoded bytes.
 * @param length Budget in UTF-16 code units.
 * @returns `str` itself when the budget is never exhausted, otherwise a copy
 *          of the bytes before the first character that no longer fits.
 */
export function utf8utf16Truncate(str: Uint8Array, length: number): Uint8Array {
  let budget = length;
  let index = 0;
  for (const byte of str) {
    if (isUtf8CharacterFirstCodeUnit(byte)) {
      if (budget <= 0) {
        return str.slice(0, index);
      }
      // Four-byte characters occupy a surrogate pair in UTF-16.
      budget -= byte >= 0xf0 ? 2 : 1;
    }
    index++;
  }
  return str;
}
/**
 * Drops the first `offset` characters of UTF-8 text.
 *
 * The byte position of the cut is the length of `utf8Truncate(str, offset)`.
 *
 * @param str    UTF-8 encoded bytes.
 * @param offset Number of leading characters to skip.
 * @returns `str` itself when `offset` is `0`, otherwise a copy of the bytes
 *          from the cut position onwards.
 */
export function utf8Substr(str: Uint8Array, offset: number): Uint8Array {
  return offset === 0
    ? str
    : str.slice(utf8Truncate(str, offset).length);
}
/**
 * Extracts part of UTF-8 text using UTF-16 code unit offsets.
 *
 * The leading part is skipped by measuring `utf8utf16Truncate(str, offset)`;
 * when `length` is given (not `null`/`undefined`) the remainder is then
 * limited with `utf8utf16Truncate` as well.
 *
 * @param str    UTF-8 encoded bytes.
 * @param offset Number of leading UTF-16 code units to skip.
 * @param length Optional budget, in UTF-16 code units, for the result.
 * @returns The selected bytes; `str` itself may be returned when nothing is
 *          skipped or cut.
 */
export function utf8utf16Substr(str: Uint8Array, offset: number, length?: number): Uint8Array {
  const tail = offset === 0
    ? str
    : str.slice(utf8utf16Truncate(str, offset).length);
  return length != null ? utf8utf16Truncate(tail, length) : tail;
}
/**
 * Checks whether a byte sequence is well-formed UTF-8.
 *
 * Rejected inputs are: stray continuation bytes, truncated sequences,
 * overlong two-, three- and four-byte encodings, encoded surrogates
 * (U+D800..U+DFFF), code points above U+10FFFF, and lead bytes `0xf8` and
 * higher. An empty input is accepted.
 *
 * @param str Bytes to validate.
 * @returns `true` when every sequence in `str` is well-formed, else `false`.
 */
export function checkUtf8(str: Uint8Array): boolean {
  const size = str.length;
  // Bounds-checked test for a `10xxxxxx` byte, so truncated input is
  // rejected explicitly rather than through an out-of-range read.
  const isContinuationAt = (index: number): boolean =>
    index < size && (str[index] & 0xc0) === 0x80;

  let pos = 0;
  while (pos < size) {
    const lead = str[pos];

    // 0xxxxxxx: ASCII.
    if ((lead & 0x80) === 0) {
      pos += 1;
      continue;
    }

    // 10xxxxxx cannot start a character.
    if ((lead & 0x40) === 0) return false;
    if (!isContinuationAt(pos + 1)) return false;
    const second = str[pos + 1];

    // 110xxxxx: two bytes; payload bits 0x1e all zero means overlong.
    if ((lead & 0x20) === 0) {
      if ((lead & 0x1e) === 0) return false;
      pos += 2;
      continue;
    }

    if (!isContinuationAt(pos + 2)) return false;

    // 1110xxxx: three bytes.
    if ((lead & 0x10) === 0) {
      const topBits = ((lead & 0x0f) << 6) | (second & 0x20);
      // 0 => overlong (below U+0800); 0x360 => surrogate range.
      if (topBits === 0 || topBits === 0x360) return false;
      pos += 3;
      continue;
    }

    if (!isContinuationAt(pos + 3)) return false;

    // Anything other than 11110xxx is not a valid lead byte.
    if ((lead & 0x08) !== 0) return false;

    // 11110xxx: four bytes.
    const planeBits = ((lead & 0x07) << 6) | (second & 0x30);
    // 0 => overlong (below U+10000); >= 0x110 => above U+10FFFF.
    if (planeBits === 0 || planeBits >= 0x110) return false;
    pos += 4;
  }
  return true;
}