Skip to content

Commit 7a0a448

Browse files
Fix escaped Unicode hex literals parsing (#49)
* Update package set * Fix parsing of escaped Unicode hex literals. The Char type represents a single UTF-16 code unit, while a String can contain any Unicode code point. The same logic for parsing Unicode hexadecimal escapes was used by both the String and Char literal parsers, so every escape was limited to what fits in a Char. This caused a bug that prevented PureScript source code from having string literals containing hexadecimal escapes for code points that do not fit in a single UTF-16 code unit (for example "\x10ffff"). String escapes are now parsed as code points, while Char escapes are still restricted to a single code unit. * Add tests for Unicode hex literals * Update bench and parse-package-set dependencies * Update CONTRIBUTORS.md * Update parse-package-set's test package set. --------- Co-authored-by: Nathan Faubion <[email protected]>
1 parent 13a2437 commit 7a0a448

File tree

9 files changed

+59
-12
lines changed

9 files changed

+59
-12
lines changed

.github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414

1515
- uses: purescript-contrib/setup-purescript@main
1616
with:
17-
purescript: "0.15.0"
17+
purescript: "0.15.8"
1818
spago: "0.20.9"
1919
psa: "0.8.2"
2020
purs-tidy: "latest"

CONTRIBUTORS.md

+1
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ By adding your name to the list below, you agree to license your contributions u
1919
| [@rintcius](https://github.com/rintcius) | Rintcius Blok
2020
| [@i-am-the-slime](https://github.com/i-am-the-slime) | Mark Eibes
2121
| [@monoidmusician](https://github.com/MonoidMusician) | Verity Scheel
22+
| [@turlando](https://github.com/turlando) | Tancredi Orlando

bench/bench.dhall

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ in conf // {
1010
, "control"
1111
, "effect"
1212
, "either"
13+
, "enums"
1314
, "foldable-traversable"
1415
, "free"
1516
, "functions"

packages.dhall

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
let upstream =
2-
https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall
3-
sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64
2+
https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall
3+
sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060
44

55
in upstream

parse-package-set/Main.purs

+1-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ defaultSpagoDhall :: String
158158
defaultSpagoDhall = Array.intercalate "\n"
159159
[ "{ name = \"test-parser\""
160160
, ", dependencies = [] : List Text"
161-
, ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64"
161+
, ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060"
162162
, ", sources = [] : List Text"
163163
, "}"
164164
]

parse-package-set/parse-package-set.dhall

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ in conf // {
1212
, "datetime"
1313
, "effect"
1414
, "either"
15+
, "enums"
1516
, "exceptions"
1617
, "filterable"
1718
, "foldable-traversable"

spago.dhall

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
, "control"
99
, "effect"
1010
, "either"
11+
, "enums"
1112
, "foldable-traversable"
1213
, "free"
1314
, "functions"

src/PureScript/CST/Lexer.purs

+31-8
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import Data.Array.NonEmpty as NonEmptyArray
1313
import Data.Array.ST as STArray
1414
import Data.Char as Char
1515
import Data.Either (Either(..))
16+
import Data.Enum (toEnum)
1617
import Data.Foldable (fold, foldl, foldMap)
1718
import Data.Int (hexadecimal)
1819
import Data.Int as Int
@@ -23,6 +24,8 @@ import Data.Newtype (unwrap)
2324
import Data.Number as Number
2425
import Data.String (Pattern(..), Replacement(..))
2526
import Data.String as String
27+
import Data.String.CodePoints (CodePoint)
28+
import Data.String.CodePoints as SCP
2629
import Data.String.CodeUnits as SCU
2730
import Data.String.Regex as Regex
2831
import Data.String.Regex.Flags (unicode)
@@ -36,6 +39,18 @@ import PureScript.CST.Types (Comment(..), IntValue(..), LineFeed(..), ModuleName
3639

3740
infixr 3 alt as <|>
3841

42+
class IsChar a where
43+
fromChar :: Char -> a
44+
fromCharCode :: Int -> Maybe a
45+
46+
instance IsChar Char where
47+
fromChar = identity
48+
fromCharCode = Char.fromCharCode
49+
50+
instance IsChar CodePoint where
51+
fromChar = SCP.codePointFromChar
52+
fromCharCode = toEnum
53+
3954
data LexResult e a
4055
= LexFail e String
4156
| LexSucc a String
@@ -502,29 +517,37 @@ token =
502517
_ ->
503518
pure { raw: SCU.singleton ch, char: ch }
504519

520+
parseEscape
521+
:: forall a
522+
. IsChar a
523+
=> Lex (Unit -> ParseError) { raw :: String, char :: a }
505524
parseEscape = do
506525
ch <- charAny
507526
case ch of
508527
't' ->
509-
pure { raw: "\\t", char: '\t' }
528+
pure { raw: "\\t", char: fromChar '\t' }
510529
'r' ->
511-
pure { raw: "\\r", char: '\r' }
530+
pure { raw: "\\r", char: fromChar '\r' }
512531
'n' ->
513-
pure { raw: "\\n", char: '\n' }
532+
pure { raw: "\\n", char: fromChar '\n' }
514533
'"' ->
515-
pure { raw: "\\\"", char: '"' }
534+
pure { raw: "\\\"", char: fromChar '"' }
516535
'\'' ->
517-
pure { raw: "\\'", char: '\'' }
536+
pure { raw: "\\'", char: fromChar '\'' }
518537
'\\' ->
519-
pure { raw: "\\\\", char: '\\' }
538+
pure { raw: "\\\\", char: fromChar '\\' }
520539
'x' ->
521540
parseHexEscape
522541
_ ->
523542
fail $ LexInvalidCharEscape $ SCU.singleton ch
524543

544+
parseHexEscape
545+
:: forall a
546+
. IsChar a
547+
=> Lex (Unit -> ParseError) { raw :: String, char :: a }
525548
parseHexEscape = do
526549
esc <- hexEscapeRegex
527-
case Char.fromCharCode =<< Int.fromStringAs hexadecimal esc of
550+
case fromCharCode =<< Int.fromStringAs hexadecimal esc of
528551
Just ch ->
529552
pure { raw: "\\x" <> esc, char: ch }
530553
Nothing ->
@@ -552,7 +575,7 @@ token =
552575

553576
parseStringEscape = ado
554577
res <- charBackslash *> parseEscape
555-
in { raw: res.raw, string: SCU.singleton res.char }
578+
in { raw: res.raw, string: SCP.singleton res.char }
556579

557580
parseStringChars = ado
558581
raw <- stringCharsRegex

test/Main.purs

+20
Original file line numberDiff line numberDiff line change
@@ -232,3 +232,23 @@ main = do
232232
true
233233
_ ->
234234
false
235+
236+
assertParse "String with Unicode astral code point hex literal"
237+
"""
238+
"\x10ffff"
239+
"""
240+
case _ of
241+
ParseSucceeded (ExprString _ _) ->
242+
true
243+
_ ->
244+
false
245+
246+
assertParse "Unicode astral code point Char hex literal"
247+
"""
248+
'\x10ffff'
249+
"""
250+
case _ of
251+
(ParseFailed _ :: RecoveredParserResult Expr) ->
252+
true
253+
_ ->
254+
false

0 commit comments

Comments
 (0)