Skip to content

Commit d5fbd3d

Browse files
committed
net/url: speed up escape and unescape
This change adds a generated 8-bit bitmask for use in functions shouldEscape and ishex. Function shouldEscape is now inlineable. Function escape is now much faster; function unescape is a bit faster.

Here are some benchmark results (no change to allocations):

goos: darwin
goarch: amd64
pkg: net/url
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
                      │     old      │                 new                  │
                      │    sec/op    │    sec/op     vs base                │
QueryEscape/#00-8       58.38n ± 1%    35.98n ± 1%   -38.38% (p=0.000 n=20)
QueryEscape/#1-8        303.50n ± 0%   94.77n ± 0%   -68.77% (p=0.000 n=20)
QueryEscape/#2-8        202.90n ± 0%   78.66n ± 1%   -61.23% (p=0.000 n=20)
QueryEscape/#3-8        444.5n ± 0%    145.9n ± 0%   -67.17% (p=0.000 n=20)
QueryEscape/#4-8        2678.0n ± 0%   913.7n ± 0%   -65.88% (p=0.000 n=20)
PathEscape/#00-8        81.34n ± 0%    44.64n ± 1%   -45.12% (p=0.000 n=20)
PathEscape/#1-8         307.65n ± 0%   96.71n ± 1%   -68.56% (p=0.000 n=20)
PathEscape/#2-8         200.80n ± 1%   78.25n ± 0%   -61.03% (p=0.000 n=20)
PathEscape/#3-8         450.1n ± 1%    145.5n ± 0%   -67.67% (p=0.000 n=20)
PathEscape/#4-8         2663.5n ± 0%   876.5n ± 0%   -67.09% (p=0.000 n=20)
QueryUnescape/#00-8     53.32n ± 1%    51.67n ± 1%   -3.09% (p=0.000 n=20)
QueryUnescape/#1-8      161.0n ± 1%    136.2n ± 1%   -15.40% (p=0.000 n=20)
QueryUnescape/#2-8      126.1n ± 1%    118.3n ± 1%   -6.23% (p=0.000 n=20)
QueryUnescape/#3-8      294.6n ± 0%    273.1n ± 0%   -7.30% (p=0.000 n=20)
QueryUnescape/#4-8      1.511µ ± 0%    1.411µ ± 0%   -6.62% (p=0.000 n=20)
PathUnescape/#00-8      63.84n ± 1%    53.59n ± 1%   -16.05% (p=0.000 n=20)
PathUnescape/#1-8       163.6n ± 3%    137.9n ± 1%   -15.71% (p=0.000 n=20)
PathUnescape/#2-8       126.4n ± 1%    119.1n ± 1%   -5.78% (p=0.000 n=20)
PathUnescape/#3-8       294.2n ± 0%    273.3n ± 0%   -7.12% (p=0.000 n=20)
PathUnescape/#4-8       1.554µ ± 0%    1.417µ ± 0%   -8.78% (p=0.000 n=20)
geomean                 277.8n         162.7n        -41.44%

This change draws heavy inspiration from CL 174998, which showed promise but stalled years ago.

Updates #17860
1 parent bb5eb51 commit d5fbd3d

File tree

3 files changed

+355
-100
lines changed

3 files changed

+355
-100
lines changed

src/net/url/encoding_table.go

Lines changed: 114 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/net/url/gen_encoding_table.go

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build ignore
6+
7+
package main
8+
9+
import (
10+
"bytes"
11+
_ "embed"
12+
"fmt"
13+
"go/format"
14+
"io"
15+
"log"
16+
"maps"
17+
"os"
18+
"slices"
19+
"strconv"
20+
"strings"
21+
)
22+
23+
// We embed this source file in the resulting code-generation program in order
// to extract the definitions of the encoding type and constants from it and
// include them in the generated file.
//
//go:embed gen_encoding_table.go
var genSource string

// filename is the name of the generated file that main writes.
const filename = "encoding_table.go"
31+
32+
func main() {
33+
fmt.Println(genSource)
34+
var out bytes.Buffer
35+
fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
36+
fmt.Fprintln(&out)
37+
fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
38+
fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
39+
fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
40+
fmt.Fprintln(&out)
41+
fmt.Fprintln(&out, "package url")
42+
fmt.Fprintln(&out)
43+
generateEnc(&out, genSource)
44+
generateTable(&out)
45+
46+
formatted, err := format.Source(out.Bytes())
47+
if err != nil {
48+
log.Fatal("format:", err)
49+
}
50+
51+
err = os.WriteFile(filename, formatted, 0644)
52+
if err != nil {
53+
log.Fatal("WriteFile:", err)
54+
}
55+
}
56+
57+
func generateEnc(w io.Writer, src string) {
58+
var writeLine bool
59+
for line := range strings.Lines(src) {
60+
if strings.HasPrefix(line, "// START encoding") {
61+
writeLine = true
62+
continue
63+
}
64+
if strings.HasPrefix(line, "// END encoding") {
65+
return
66+
}
67+
if writeLine {
68+
fmt.Fprint(w, line)
69+
}
70+
}
71+
}
72+
73+
func generateTable(w io.Writer) {
74+
fmt.Fprintln(w, "var table = [256]encoding{")
75+
76+
// Sort the encodings (in decreasing order) to guarantee a stable output.
77+
sortedEncs := slices.Sorted(maps.Keys(encNames))
78+
slices.Reverse(sortedEncs)
79+
80+
for i := range 256 {
81+
c := byte(i)
82+
var lineBuf bytes.Buffer
83+
84+
// Write key to line buffer.
85+
lineBuf.WriteString(strconv.QuoteRune(rune(c)))
86+
87+
lineBuf.WriteByte(':')
88+
89+
// Write value to line buffer.
90+
blankVal := true
91+
if ishex(c) {
92+
// Set the hexChar bit if this char is hexadecimal.
93+
lineBuf.WriteString("hexChar")
94+
blankVal = false
95+
}
96+
for _, enc := range sortedEncs {
97+
if !shouldEscape(c, enc) {
98+
if !blankVal {
99+
lineBuf.WriteByte('|')
100+
}
101+
// Set this encoding mode's bit if this char should NOT be
102+
// escaped.
103+
name := encNames[enc]
104+
lineBuf.WriteString(name)
105+
blankVal = false
106+
}
107+
}
108+
109+
if !blankVal {
110+
lineBuf.WriteString(",\n")
111+
w.Write(lineBuf.Bytes())
112+
}
113+
}
114+
fmt.Fprintln(w, "}")
115+
}
116+
117+
// START encoding (keep this marker comment in sync with generateEnc)
118+
// encoding is a set of flag bits. Each constant below occupies its own bit,
// so several of them can be combined in a single value. In the generated
// table, a mode's bit is set for every byte that does NOT need escaping in
// that mode, and hexChar is set for hexadecimal digits.
type encoding uint8

const (
	encodePath encoding = 1 << iota
	encodePathSegment
	encodeHost
	encodeZone
	encodeUserPassword
	encodeQueryComponent
	encodeFragment

	// hexChar is actually NOT an encoding mode, but there are only seven
	// encoding modes. We might as well abuse the otherwise unused most
	// significant bit in uint8 to indicate whether a character is
	// hexadecimal.
	hexChar
)
135+
136+
// END encoding (keep this marker comment in sync with generateEnc)
137+
138+
// encNames maps each encoding mode constant to the identifier that
// generateTable writes into the generated table for that mode. Its keys
// also determine which modes generateTable evaluates. hexChar has no entry
// here: generateTable writes it separately, based on ishex.
//
// Keep this in sync with the definitions of encoding mode constants.
var encNames = map[encoding]string{
	encodePath:           "encodePath",
	encodePathSegment:    "encodePathSegment",
	encodeHost:           "encodeHost",
	encodeZone:           "encodeZone",
	encodeUserPassword:   "encodeUserPassword",
	encodeQueryComponent: "encodeQueryComponent",
	encodeFragment:       "encodeFragment",
}
148+
149+
// Return true if the specified character should be escaped when
150+
// appearing in a URL string, according to RFC 3986.
151+
//
152+
// Please be informed that for now shouldEscape does not check all
153+
// reserved characters correctly. See golang.org/issue/5684.
154+
func shouldEscape(c byte, mode encoding) bool {
155+
// §2.3 Unreserved characters (alphanum)
156+
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
157+
return false
158+
}
159+
160+
if mode == encodeHost || mode == encodeZone {
161+
// §3.2.2 Host allows
162+
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
163+
// as part of reg-name.
164+
// We add : because we include :port as part of host.
165+
// We add [ ] because we include [ipv6]:port as part of host.
166+
// We add < > because they're the only characters left that
167+
// we could possibly allow, and Parse will reject them if we
168+
// escape them (because hosts can't use %-encoding for
169+
// ASCII bytes).
170+
switch c {
171+
case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
172+
return false
173+
}
174+
}
175+
176+
switch c {
177+
case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
178+
return false
179+
180+
case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
181+
// Different sections of the URL allow a few of
182+
// the reserved characters to appear unescaped.
183+
switch mode {
184+
case encodePath: // §3.3
185+
// The RFC allows : @ & = + $ but saves / ; , for assigning
186+
// meaning to individual path segments. This package
187+
// only manipulates the path as a whole, so we allow those
188+
// last three as well. That leaves only ? to escape.
189+
return c == '?'
190+
191+
case encodePathSegment: // §3.3
192+
// The RFC allows : @ & = + $ but saves / ; , for assigning
193+
// meaning to individual path segments.
194+
return c == '/' || c == ';' || c == ',' || c == '?'
195+
196+
case encodeUserPassword: // §3.2.1
197+
// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
198+
// userinfo, so we must escape only '@', '/', and '?'.
199+
// The parsing of userinfo treats ':' as special so we must escape
200+
// that too.
201+
return c == '@' || c == '/' || c == '?' || c == ':'
202+
203+
case encodeQueryComponent: // §3.4
204+
// The RFC reserves (so we must escape) everything.
205+
return true
206+
207+
case encodeFragment: // §4.1
208+
// The RFC text is silent but the grammar allows
209+
// everything, so escape nothing.
210+
return false
211+
}
212+
}
213+
214+
if mode == encodeFragment {
215+
// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
216+
// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
217+
// need to be escaped. To minimize potential breakage, we apply two restrictions:
218+
// (1) we always escape sub-delims outside of the fragment, and (2) we always
219+
// escape single quote to avoid breaking callers that had previously assumed that
220+
// single quotes would be escaped. See issue #19917.
221+
switch c {
222+
case '!', '(', ')', '*':
223+
return false
224+
}
225+
}
226+
227+
// Everything else must be escaped.
228+
return true
229+
}
230+
231+
// ishex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f',
// or 'A'-'F'.
func ishex(c byte) bool {
	switch {
	case '0' <= c && c <= '9',
		'a' <= c && c <= 'f',
		'A' <= c && c <= 'F':
		return true
	}
	return false
}

0 commit comments

Comments
 (0)