diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..e4cd446 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/k3a/html2text + +go 1.16 + +require github.com/smartystreets/goconvey v1.6.4 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..a46a039 --- /dev/null +++ b/go.sum @@ -0,0 +1,13 @@ +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= diff --git a/html2text.go b/html2text.go index 08ce57f..11ccaad 100644 --- a/html2text.go +++ b/html2text.go @@ -17,7 +17,7 @@ var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`) var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`) var 
badLinkHrefRE = regexp.MustCompile(`javascript:`) var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`) -var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`) +var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`) func parseHTMLEntity(entName string) (string, bool) { if r, ok := entity[entName]; ok { @@ -25,8 +25,18 @@ func parseHTMLEntity(entName string) (string, bool) { } if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 { - digits := match[1] - n, err := strconv.Atoi(digits) + var ( + err error + n int64 + digits = match[1] + ) + + if digits != "" && (digits[0] == 'x' || digits[0] == 'X') { + n, err = strconv.ParseInt(digits[1:], 16, 64) + } else { + n, err = strconv.ParseInt(digits, 10, 64) + } + if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) { return string(rune(n)), true } diff --git a/html2text_test.go b/html2text_test.go index 7161d77..39cfa13 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -61,6 +61,7 @@ func TestHTML2Text(t *testing.T) { So(HTML2Text(`fish & chips`), ShouldEqual, "fish & chips") So(HTML2Text(`"I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey`), ShouldEqual, "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey") So(HTML2Text(`Google ®`), ShouldEqual, "Google ®") + So(HTML2Text(`&#8268; decimal and hex entities supported &#x204D;`), ShouldEqual, "⁌ decimal and hex entities supported ⁍") }) Convey("Large Entity", func() {