|  | 
|  | 1 | +// Copyright 2025 The Go Authors. All rights reserved. | 
|  | 2 | +// Use of this source code is governed by a BSD-style | 
|  | 3 | +// license that can be found in the LICENSE file. | 
|  | 4 | + | 
|  | 5 | +//go:build ignore | 
|  | 6 | + | 
|  | 7 | +package main | 
|  | 8 | + | 
|  | 9 | +import ( | 
|  | 10 | +	"bytes" | 
|  | 11 | +	_ "embed" | 
|  | 12 | +	"fmt" | 
|  | 13 | +	"go/format" | 
|  | 14 | +	"io" | 
|  | 15 | +	"log" | 
|  | 16 | +	"maps" | 
|  | 17 | +	"os" | 
|  | 18 | +	"slices" | 
|  | 19 | +	"strconv" | 
|  | 20 | +	"strings" | 
|  | 21 | +) | 
|  | 22 | + | 
// genSource holds the text of this very source file.
//
// We embed this source file in the resulting code-generation program in order
// to extract the definitions of the encoding type and constants from it and
// include them in the generated file.
//
//go:embed gen_encoding_table.go
var genSource string

// filename is the name of the file that main writes the generated code to.
const filename = "encoding_table.go"
|  | 31 | + | 
|  | 32 | +func main() { | 
|  | 33 | +	fmt.Println(genSource) | 
|  | 34 | +	var out bytes.Buffer | 
|  | 35 | +	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.") | 
|  | 36 | +	fmt.Fprintln(&out) | 
|  | 37 | +	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.") | 
|  | 38 | +	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style") | 
|  | 39 | +	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.") | 
|  | 40 | +	fmt.Fprintln(&out) | 
|  | 41 | +	fmt.Fprintln(&out, "package url") | 
|  | 42 | +	fmt.Fprintln(&out) | 
|  | 43 | +	generateEnc(&out, genSource) | 
|  | 44 | +	generateTable(&out) | 
|  | 45 | + | 
|  | 46 | +	formatted, err := format.Source(out.Bytes()) | 
|  | 47 | +	if err != nil { | 
|  | 48 | +		log.Fatal("format:", err) | 
|  | 49 | +	} | 
|  | 50 | + | 
|  | 51 | +	err = os.WriteFile(filename, formatted, 0644) | 
|  | 52 | +	if err != nil { | 
|  | 53 | +		log.Fatal("WriteFile:", err) | 
|  | 54 | +	} | 
|  | 55 | +} | 
|  | 56 | + | 
|  | 57 | +func generateEnc(w io.Writer, src string) { | 
|  | 58 | +	var writeLine bool | 
|  | 59 | +	for line := range strings.Lines(src) { | 
|  | 60 | +		if strings.HasPrefix(line, "// START encoding") { | 
|  | 61 | +			writeLine = true | 
|  | 62 | +			continue | 
|  | 63 | +		} | 
|  | 64 | +		if strings.HasPrefix(line, "// END encoding") { | 
|  | 65 | +			return | 
|  | 66 | +		} | 
|  | 67 | +		if writeLine { | 
|  | 68 | +			fmt.Fprint(w, line) | 
|  | 69 | +		} | 
|  | 70 | +	} | 
|  | 71 | +} | 
|  | 72 | + | 
|  | 73 | +func generateTable(w io.Writer) { | 
|  | 74 | +	fmt.Fprintln(w, "var table = [256]encoding{") | 
|  | 75 | + | 
|  | 76 | +	// Sort the encodings (in decreasing order) to guarantee a stable output. | 
|  | 77 | +	sortedEncs := slices.Sorted(maps.Keys(encNames)) | 
|  | 78 | +	slices.Reverse(sortedEncs) | 
|  | 79 | + | 
|  | 80 | +	for i := range 256 { | 
|  | 81 | +		c := byte(i) | 
|  | 82 | +		var lineBuf bytes.Buffer | 
|  | 83 | + | 
|  | 84 | +		// Write key to line buffer. | 
|  | 85 | +		lineBuf.WriteString(strconv.QuoteRune(rune(c))) | 
|  | 86 | + | 
|  | 87 | +		lineBuf.WriteByte(':') | 
|  | 88 | + | 
|  | 89 | +		// Write value to line buffer. | 
|  | 90 | +		blankVal := true | 
|  | 91 | +		if ishex(c) { | 
|  | 92 | +			// Set the hexChar bit if this char is hexadecimal. | 
|  | 93 | +			lineBuf.WriteString("hexChar") | 
|  | 94 | +			blankVal = false | 
|  | 95 | +		} | 
|  | 96 | +		for _, enc := range sortedEncs { | 
|  | 97 | +			if !shouldEscape(c, enc) { | 
|  | 98 | +				if !blankVal { | 
|  | 99 | +					lineBuf.WriteByte('|') | 
|  | 100 | +				} | 
|  | 101 | +				// Set this encoding mode's bit if this char should NOT be | 
|  | 102 | +				// escaped. | 
|  | 103 | +				name := encNames[enc] | 
|  | 104 | +				lineBuf.WriteString(name) | 
|  | 105 | +				blankVal = false | 
|  | 106 | +			} | 
|  | 107 | +		} | 
|  | 108 | + | 
|  | 109 | +		if !blankVal { | 
|  | 110 | +			lineBuf.WriteString(",\n") | 
|  | 111 | +			w.Write(lineBuf.Bytes()) | 
|  | 112 | +		} | 
|  | 113 | +	} | 
|  | 114 | +	fmt.Fprintln(w, "}") | 
|  | 115 | +} | 
|  | 116 | + | 
// START encoding (keep this marker comment in sync with generateEnc)
// encoding is a set of flags stored in a uint8. Each constant below occupies
// its own bit, so several of them can be combined in a single value.
type encoding uint8

const (
	encodePath encoding = 1 << iota
	encodePathSegment
	encodeHost
	encodeZone
	encodeUserPassword
	encodeQueryComponent
	encodeFragment

	// hexChar is actually NOT an encoding mode, but there are only seven
	// encoding modes. We might as well abuse the otherwise unused most
	// significant bit in uint8 to indicate whether a character is
	// hexadecimal.
	hexChar
)
|  | 135 | + | 
// END encoding (keep this marker comment in sync with generateEnc)
|  | 137 | + | 
// encNames maps each encoding mode to the identifier that generateTable
// writes into the generated table for that mode. hexChar has no entry here:
// generateTable emits it separately, based on ishex.
//
// Keep this in sync with the definitions of encoding mode constants.
var encNames = map[encoding]string{
	encodePath:           "encodePath",
	encodePathSegment:    "encodePathSegment",
	encodeHost:           "encodeHost",
	encodeZone:           "encodeZone",
	encodeUserPassword:   "encodeUserPassword",
	encodeQueryComponent: "encodeQueryComponent",
	encodeFragment:       "encodeFragment",
}
|  | 148 | + | 
|  | 149 | +// Return true if the specified character should be escaped when | 
|  | 150 | +// appearing in a URL string, according to RFC 3986. | 
|  | 151 | +// | 
|  | 152 | +// Please be informed that for now shouldEscape does not check all | 
|  | 153 | +// reserved characters correctly. See golang.org/issue/5684. | 
|  | 154 | +func shouldEscape(c byte, mode encoding) bool { | 
|  | 155 | +	// §2.3 Unreserved characters (alphanum) | 
|  | 156 | +	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { | 
|  | 157 | +		return false | 
|  | 158 | +	} | 
|  | 159 | + | 
|  | 160 | +	if mode == encodeHost || mode == encodeZone { | 
|  | 161 | +		// §3.2.2 Host allows | 
|  | 162 | +		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | 
|  | 163 | +		// as part of reg-name. | 
|  | 164 | +		// We add : because we include :port as part of host. | 
|  | 165 | +		// We add [ ] because we include [ipv6]:port as part of host. | 
|  | 166 | +		// We add < > because they're the only characters left that | 
|  | 167 | +		// we could possibly allow, and Parse will reject them if we | 
|  | 168 | +		// escape them (because hosts can't use %-encoding for | 
|  | 169 | +		// ASCII bytes). | 
|  | 170 | +		switch c { | 
|  | 171 | +		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"': | 
|  | 172 | +			return false | 
|  | 173 | +		} | 
|  | 174 | +	} | 
|  | 175 | + | 
|  | 176 | +	switch c { | 
|  | 177 | +	case '-', '_', '.', '~': // §2.3 Unreserved characters (mark) | 
|  | 178 | +		return false | 
|  | 179 | + | 
|  | 180 | +	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved) | 
|  | 181 | +		// Different sections of the URL allow a few of | 
|  | 182 | +		// the reserved characters to appear unescaped. | 
|  | 183 | +		switch mode { | 
|  | 184 | +		case encodePath: // §3.3 | 
|  | 185 | +			// The RFC allows : @ & = + $ but saves / ; , for assigning | 
|  | 186 | +			// meaning to individual path segments. This package | 
|  | 187 | +			// only manipulates the path as a whole, so we allow those | 
|  | 188 | +			// last three as well. That leaves only ? to escape. | 
|  | 189 | +			return c == '?' | 
|  | 190 | + | 
|  | 191 | +		case encodePathSegment: // §3.3 | 
|  | 192 | +			// The RFC allows : @ & = + $ but saves / ; , for assigning | 
|  | 193 | +			// meaning to individual path segments. | 
|  | 194 | +			return c == '/' || c == ';' || c == ',' || c == '?' | 
|  | 195 | + | 
|  | 196 | +		case encodeUserPassword: // §3.2.1 | 
|  | 197 | +			// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in | 
|  | 198 | +			// userinfo, so we must escape only '@', '/', and '?'. | 
|  | 199 | +			// The parsing of userinfo treats ':' as special so we must escape | 
|  | 200 | +			// that too. | 
|  | 201 | +			return c == '@' || c == '/' || c == '?' || c == ':' | 
|  | 202 | + | 
|  | 203 | +		case encodeQueryComponent: // §3.4 | 
|  | 204 | +			// The RFC reserves (so we must escape) everything. | 
|  | 205 | +			return true | 
|  | 206 | + | 
|  | 207 | +		case encodeFragment: // §4.1 | 
|  | 208 | +			// The RFC text is silent but the grammar allows | 
|  | 209 | +			// everything, so escape nothing. | 
|  | 210 | +			return false | 
|  | 211 | +		} | 
|  | 212 | +	} | 
|  | 213 | + | 
|  | 214 | +	if mode == encodeFragment { | 
|  | 215 | +		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are | 
|  | 216 | +		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not | 
|  | 217 | +		// need to be escaped. To minimize potential breakage, we apply two restrictions: | 
|  | 218 | +		// (1) we always escape sub-delims outside of the fragment, and (2) we always | 
|  | 219 | +		// escape single quote to avoid breaking callers that had previously assumed that | 
|  | 220 | +		// single quotes would be escaped. See issue #19917. | 
|  | 221 | +		switch c { | 
|  | 222 | +		case '!', '(', ')', '*': | 
|  | 223 | +			return false | 
|  | 224 | +		} | 
|  | 225 | +	} | 
|  | 226 | + | 
|  | 227 | +	// Everything else must be escaped. | 
|  | 228 | +	return true | 
|  | 229 | +} | 
|  | 230 | + | 
// ishex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f'
// or 'A'-'F'.
func ishex(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}
0 commit comments