Skip to content

Commit 7a7ad1e

Browse files
authored
Fix case-insensitive URL scheme matching, strip trailing punctuation from URLs, and update closableLogger comment
1 parent fb6e0f4 commit 7a7ad1e

3 files changed

Lines changed: 20 additions & 3 deletions

File tree

internal/logger/global_helpers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ func withMutexLock(mu *sync.Mutex, fn func() error) error {
5252
}
5353

5454
// closableLogger is a constraint for types that have a Close method.
55-
// This is satisfied by *FileLogger, *JSONLLogger, *MarkdownLogger, *ServerFileLogger, and *ToolsLogger.
55+
// This is satisfied by *FileLogger, *JSONLLogger, *MarkdownLogger, *ObservedURLDomainsLogger, *ServerFileLogger, and *ToolsLogger.
5656
type closableLogger interface {
5757
*FileLogger | *JSONLLogger | *MarkdownLogger | *ServerFileLogger | *ToolsLogger | *ObservedURLDomainsLogger
5858
Close() error

internal/middleware/jqschema_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1886,6 +1886,16 @@ func TestExtractURLDomains(t *testing.T) {
18861886
[]string{"docs.example.com", "example.com"},
18871887
urlutil.ExtractURLDomains(`See https://Example.com/a and http://docs.example.com:8080/x plus https://example.com/b`),
18881888
)
1889+
// Case-insensitive scheme matching.
1890+
assert.Equal(t,
1891+
[]string{"example.com"},
1892+
urlutil.ExtractURLDomains(`HTTPS://example.com/path`),
1893+
)
1894+
// Trailing punctuation from prose must be stripped.
1895+
assert.Equal(t,
1896+
[]string{"example.com"},
1897+
urlutil.ExtractURLDomains(`See https://example.com, and (https://example.com) for details.`),
1898+
)
18891899
}
18901900

18911901
func TestExtractURLDomainsFromValue(t *testing.T) {

internal/urlutil/domains.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ import (
88
)
99

1010
// urlPattern requires a non-empty hostname candidate and then captures the rest
11-
// of the URL until common delimiter characters. Matches are still validated with
11+
// of the URL until common delimiter characters. The (?i) flag makes the scheme
12+
// match case-insensitive (e.g. "HTTPS://"). Matches are still validated with
1213
// url.Parse before hostname extraction.
13-
var urlPattern = regexp.MustCompile(`https?://[^\s/"'<>]+[^\s"'<>]*`)
14+
var urlPattern = regexp.MustCompile(`(?i)https?://[^\s/"'<>]+[^\s"'<>]*`)
1415

1516
// ExtractURLDomainsFromValue recursively extracts unique URL hostnames from string leaves.
1617
func ExtractURLDomainsFromValue(value any) []string {
@@ -58,6 +59,12 @@ func ExtractURLDomains(text string) []string {
5859

5960
domainSet := make(map[string]struct{})
6061
for _, match := range matches {
62+
// Strip trailing punctuation that may appear when a URL is embedded in
63+
// prose (e.g. "https://example.com," or "https://example.com)"). These
64+
// characters are valid inside a URL, so the regex cannot exclude them
65+
// outright; trimming them from the tail of each candidate is a heuristic.
66+
// Caveat: "]" is trimmed too, so a bare "http://[::1]" fails url.Parse.
67+
match = strings.TrimRight(match, ".,;:!?)]}\"'")
6168
parsed, err := url.Parse(match)
6269
if err != nil {
6370
continue

0 commit comments

Comments
 (0)