Skip to content

Commit

Permalink
Change parsing of a BOM to make it standard-compliant (#1152)
Browse files Browse the repository at this point in the history
  • Loading branch information
H5117 committed Nov 17, 2022
1 parent 1b50109 commit ebdad2d
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 179 deletions.
245 changes: 66 additions & 179 deletions src/stream.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <cstdint>
#include <iostream>

#include "stream.h"
Expand All @@ -6,148 +7,9 @@
#define YAML_PREFETCH_SIZE 2048
#endif

// Number of elements in the array A (total byte size divided by the size of
// one element). Only meaningful for a true array, not for a pointer.
#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
// Pointer one past the last element of the array A.
#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))

// Code point U+FFFD, named here as the replacement character.
#define CP_REPLACEMENT_CHARACTER (0xFFFD)

namespace YAML {
// States of the state machine that inspects the first bytes of the input to
// pick a character encoding. The enumerator order is significant: each value
// is used as a row index into s_introFinalState, s_introTransitions and
// s_introUngetCount. The "after ..." notes below describe the byte sequence
// that leads to each state according to s_introTransitions.
enum UtfIntroState {
uis_start,  // initial state, nothing examined yet
uis_utfbe_b1,  // after a leading 0x00
uis_utf32be_b2,  // after 0x00 0x00
uis_utf32be_bom3,  // after 0x00 0x00 0xFE
uis_utf32be,  // final: UTF-32 big-endian
uis_utf16be,  // final: UTF-16 big-endian
uis_utf16be_bom1,  // after a leading 0xFE
uis_utfle_bom1,  // after a leading 0xFF
uis_utf16le_bom2,  // after 0xFF 0xFE
uis_utf32le_bom3,  // after 0xFF 0xFE 0x00
uis_utf16le,  // final: UTF-16 little-endian
uis_utf32le,  // final: UTF-32 little-endian
uis_utf8_imp,  // after a leading byte classified as uictAscii
uis_utf16le_imp,  // after a uictAscii byte followed by 0x00
uis_utf32le_imp3,  // after a uictAscii byte followed by 0x00 0x00
uis_utf8_bom1,  // after a leading 0xEF
uis_utf8_bom2,  // after 0xEF 0xBB
uis_utf8,  // final: UTF-8
uis_error  // final; has no row in s_introTransitions / s_introUngetCount
           // and is not the target of any transition listed there
};

// Classification of a single input byte for the encoding-detection state
// machine (see IntroCharTypeOf). The enumerator order is significant: each
// value is used as a column index into s_introTransitions and
// s_introUngetCount.
enum UtfIntroCharType {
uict00,  // byte value 0x00
uictBB,  // byte value 0xBB
uictBF,  // byte value 0xBF
uictEF,  // byte value 0xEF
uictFE,  // byte value 0xFE
uictFF,  // byte value 0xFF
uictAscii,  // any other value strictly between 0 and 0xFF; despite the name
            // this is not restricted to 7-bit ASCII
uictOther,  // end-of-file, or any value not covered above
uictMax  // number of char types; used as the column count of the tables
};

// Indexed by UtfIntroState: true for the states in which the detection loop
// stops reading bytes, false for states that still need another byte.
// The entries must stay in the same order as the UtfIntroState enumerators.
static bool s_introFinalState[] = {
false, // uis_start
false, // uis_utfbe_b1
false, // uis_utf32be_b2
false, // uis_utf32be_bom3
true, // uis_utf32be
true, // uis_utf16be
false, // uis_utf16be_bom1
false, // uis_utfle_bom1
false, // uis_utf16le_bom2
false, // uis_utf32le_bom3
true, // uis_utf16le
true, // uis_utf32le
false, // uis_utf8_imp
false, // uis_utf16le_imp
false, // uis_utf32le_imp3
false, // uis_utf8_bom1
false, // uis_utf8_bom2
true, // uis_utf8
true, // uis_error
};

// Transition table of the encoding-detection state machine:
// s_introTransitions[state][charType] is the next state after reading a byte
// of the given UtfIntroCharType while in the given UtfIntroState.
// Rows follow the UtfIntroState enumerator order (uis_error has no row);
// columns follow the UtfIntroCharType order shown below.
static UtfIntroState s_introTransitions[][uictMax] = {
// uict00, uictBB, uictBF, uictEF,
// uictFE, uictFF, uictAscii, uictOther
// uis_start
{uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
uis_utfle_bom1, uis_utf8_imp, uis_utf8},
// uis_utfbe_b1
{uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf16be, uis_utf8},
// uis_utf32be_b2
{uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
uis_utf8, uis_utf8},
// uis_utf32be_bom3
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
uis_utf8},
// uis_utf32be
{uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
uis_utf32be, uis_utf32be, uis_utf32be},
// uis_utf16be
{uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
uis_utf16be, uis_utf16be, uis_utf16be},
// uis_utf16be_bom1
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
uis_utf8},
// uis_utfle_bom1
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
uis_utf8, uis_utf8},
// uis_utf16le_bom2
{uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
// uis_utf32le_bom3
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
// uis_utf16le
{uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
// uis_utf32le
{uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
uis_utf32le, uis_utf32le, uis_utf32le},
// uis_utf8_imp
{uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8, uis_utf8},
// uis_utf16le_imp
{uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
// uis_utf32le_imp3
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
// uis_utf8_bom1
{uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
// uis_utf8_bom2
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
// uis_utf8
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
};

// Companion to s_introTransitions: s_introUngetCount[state][charType] is the
// number of most recently read intro bytes that the detection loop puts back
// onto the input stream when it takes that transition (0 means the bytes read
// so far stay consumed). Rows follow the UtfIntroState enumerator order, two
// rows per source line; uis_error has no row.
static char s_introUngetCount[][uictMax] = {
// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
// uis_start, uis_utfbe_b1
{0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
// uis_utf32be_b2, uis_utf32be_bom3
{3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4},
// uis_utf32be, uis_utf16be
{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
// uis_utf16be_bom1, uis_utfle_bom1
{2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2},
// uis_utf16le_bom2, uis_utf32le_bom3
{0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
// uis_utf16le, uis_utf32le
{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
// uis_utf8_imp, uis_utf16le_imp
{0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3},
// uis_utf32le_imp3, uis_utf8_bom1
{4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2},
// uis_utf8_bom2, uis_utf8
{3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1},
};

// Classifies one value returned by std::istream::get() into the column type
// used by the encoding-detection tables.
//
// ch: a stream int_type, i.e. either a byte value or the end-of-file marker.
// Returns the dedicated type for the bytes 0x00, 0xBB, 0xBF, 0xEF, 0xFE and
// 0xFF; uictAscii for every other value strictly between 0 and 0xFF; and
// uictOther for end-of-file or anything else.
inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
  const std::istream::int_type endOfFile = std::istream::traits_type::eof();

  // End-of-file must be ruled out before comparing against byte values.
  if (ch == endOfFile) {
    return uictOther;
  }

  // Bytes that can be part of a byte order mark get their own type.
  if (ch == 0) {
    return uict00;
  }
  if (ch == 0xBB) {
    return uictBB;
  }
  if (ch == 0xBF) {
    return uictBF;
  }
  if (ch == 0xEF) {
    return uictEF;
  }
  if (ch == 0xFE) {
    return uictFE;
  }
  if (ch == 0xFF) {
    return uictFF;
  }

  // Every remaining in-range byte is lumped together as "ascii".
  const bool isPlainByte = (0 < ch) && (ch < 0xFF);
  return isPlainByte ? uictAscii : uictOther;
}

inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
unsigned char rshift) {
Expand Down Expand Up @@ -182,6 +44,58 @@ inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
}
}

// Chooses the stream's character set from its first bytes and stores it in
// m_charSet.
//
// bom:  the first `size` bytes of the input (at most the first four are
//       examined).
// size: number of valid bytes in `bom`; bytes at or beyond this index are
//       never read.
// Returns the length in bytes of an explicit byte order mark found at the
// start of `bom` (4, 2 or 3), or 0 when the encoding was inferred without a
// byte order mark or defaulted to UTF-8.
//
// The checks run in a fixed order: the 4-byte UTF-32 patterns, then the
// 2-byte UTF-16 patterns, then the 3-byte UTF-8 byte order mark. This order
// appears to follow the encoding-deduction table of the YAML specification.
uint8_t Stream::CheckBOM(const unsigned char* bom, uint8_t size) {
  const bool hasFour = size >= 4;
  const bool hasThree = size >= 3;
  const bool hasTwo = size >= 2;

  // UTF-32, big-endian: explicit mark 00 00 FE FF, or three leading zero bytes.
  if (hasFour && bom[0] == 0x00 && bom[1] == 0x00) {
    if (bom[2] == 0xFE && bom[3] == 0xFF) {
      m_charSet = utf32be;
      return 4;
    }
    if (bom[2] == 0x00) {
      m_charSet = utf32be;
      return 0;
    }
  }

  // UTF-32, little-endian: explicit mark FF FE 00 00, or three trailing zero
  // bytes after an arbitrary first byte.
  if (hasFour && bom[2] == 0x00 && bom[3] == 0x00) {
    if (bom[0] == 0xFF && bom[1] == 0xFE) {
      m_charSet = utf32le;
      return 4;
    }
    if (bom[1] == 0x00) {
      m_charSet = utf32le;
      return 0;
    }
  }

  // UTF-16: explicit marks FE FF / FF FE, or a zero byte on either side of
  // the first two-byte unit. The big-endian patterns take precedence.
  if (hasTwo) {
    const unsigned char first = bom[0];
    const unsigned char second = bom[1];

    if (first == 0xFE && second == 0xFF) {
      m_charSet = utf16be;
      return 2;
    } else if (first == 0x00) {
      m_charSet = utf16be;
      return 0;
    } else if (first == 0xFF && second == 0xFE) {
      m_charSet = utf16le;
      return 2;
    } else if (second == 0x00) {
      m_charSet = utf16le;
      return 0;
    }
  }

  // UTF-8 with an explicit EF BB BF mark.
  if (hasThree && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
    m_charSet = utf8;
    return 3;
  }

  // Nothing matched: fall back to UTF-8 without a mark.
  m_charSet = utf8;
  return 0;
}

Stream::Stream(std::istream& input)
: m_input(input),
m_mark{},
Expand All @@ -190,52 +104,28 @@ Stream::Stream(std::istream& input)
m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
m_nPrefetchedAvailable(0),
m_nPrefetchedUsed(0) {
using char_traits = std::istream::traits_type;

if (!input)
return;

// Determine (or guess) the character-set by reading the BOM, if any. See
// the YAML specification for the determination algorithm.
char_traits::int_type intro[4]{};
int nIntroUsed = 0;
UtfIntroState state = uis_start;
for (; !s_introFinalState[state];) {
std::istream::int_type ch = input.get();
intro[nIntroUsed++] = ch;
UtfIntroCharType charType = IntroCharTypeOf(ch);
UtfIntroState newState = s_introTransitions[state][charType];
int nUngets = s_introUngetCount[state][charType];
if (nUngets > 0) {
unsigned char buffer[4];
uint8_t size = 4;
for (uint8_t i = 0; i < 4; i++) {
buffer[i] = input.get();
if (!input.good()) {
input.clear();
for (; nUngets > 0; --nUngets) {
if (char_traits::eof() != intro[--nIntroUsed])
input.putback(char_traits::to_char_type(intro[nIntroUsed]));
}
size = i;
break;
}
state = newState;
}

switch (state) {
case uis_utf8:
m_charSet = utf8;
break;
case uis_utf16le:
m_charSet = utf16le;
break;
case uis_utf16be:
m_charSet = utf16be;
break;
case uis_utf32le:
m_charSet = utf32le;
break;
case uis_utf32be:
m_charSet = utf32be;
break;
default:
m_charSet = utf8;
break;
auto bom_size = CheckBOM(buffer, size);
size -= bom_size;
for (uint8_t i = 0; i < size; i++) {
m_pPrefetched[i] = buffer[bom_size + i];
}
m_nPrefetchedAvailable = size;

ReadAheadTo(0);
}
Expand Down Expand Up @@ -409,11 +299,8 @@ unsigned char Stream::GetNextByte() const {
m_nPrefetchedAvailable = static_cast<std::size_t>(
pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
m_nPrefetchedUsed = 0;
if (!m_nPrefetchedAvailable) {
if (m_nPrefetchedAvailable == 0) {
m_input.setstate(std::ios_base::eofbit);
}

if (0 == m_nPrefetchedAvailable) {
return 0;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class Stream {
void StreamInUtf16() const;
void StreamInUtf32() const;
unsigned char GetNextByte() const;
uint8_t CheckBOM(const unsigned char* bom, uint8_t size);
};

// CharAt
Expand Down

0 comments on commit ebdad2d

Please sign in to comment.