Skip to content

Commit ebdad2d

Browse files
committed
Change parsing of a BOM to make it standard-compliant (#1152)
1 parent 1b50109 commit ebdad2d

File tree

2 files changed

+67
-179
lines changed

2 files changed

+67
-179
lines changed

src/stream.cpp

Lines changed: 66 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <cstdint>
12
#include <iostream>
23

34
#include "stream.h"
@@ -6,148 +7,9 @@
67
#define YAML_PREFETCH_SIZE 2048
78
#endif
89

9-
#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
10-
#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
11-
1210
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
1311

1412
namespace YAML {
15-
enum UtfIntroState {
16-
uis_start,
17-
uis_utfbe_b1,
18-
uis_utf32be_b2,
19-
uis_utf32be_bom3,
20-
uis_utf32be,
21-
uis_utf16be,
22-
uis_utf16be_bom1,
23-
uis_utfle_bom1,
24-
uis_utf16le_bom2,
25-
uis_utf32le_bom3,
26-
uis_utf16le,
27-
uis_utf32le,
28-
uis_utf8_imp,
29-
uis_utf16le_imp,
30-
uis_utf32le_imp3,
31-
uis_utf8_bom1,
32-
uis_utf8_bom2,
33-
uis_utf8,
34-
uis_error
35-
};
36-
37-
enum UtfIntroCharType {
38-
uict00,
39-
uictBB,
40-
uictBF,
41-
uictEF,
42-
uictFE,
43-
uictFF,
44-
uictAscii,
45-
uictOther,
46-
uictMax
47-
};
48-
49-
static bool s_introFinalState[] = {
50-
false, // uis_start
51-
false, // uis_utfbe_b1
52-
false, // uis_utf32be_b2
53-
false, // uis_utf32be_bom3
54-
true, // uis_utf32be
55-
true, // uis_utf16be
56-
false, // uis_utf16be_bom1
57-
false, // uis_utfle_bom1
58-
false, // uis_utf16le_bom2
59-
false, // uis_utf32le_bom3
60-
true, // uis_utf16le
61-
true, // uis_utf32le
62-
false, // uis_utf8_imp
63-
false, // uis_utf16le_imp
64-
false, // uis_utf32le_imp3
65-
false, // uis_utf8_bom1
66-
false, // uis_utf8_bom2
67-
true, // uis_utf8
68-
true, // uis_error
69-
};
70-
71-
static UtfIntroState s_introTransitions[][uictMax] = {
72-
// uict00, uictBB, uictBF, uictEF,
73-
// uictFE, uictFF, uictAscii, uictOther
74-
{uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
75-
uis_utfle_bom1, uis_utf8_imp, uis_utf8},
76-
{uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
77-
uis_utf16be, uis_utf8},
78-
{uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
79-
uis_utf8, uis_utf8},
80-
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
81-
uis_utf8},
82-
{uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
83-
uis_utf32be, uis_utf32be, uis_utf32be},
84-
{uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
85-
uis_utf16be, uis_utf16be, uis_utf16be},
86-
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
87-
uis_utf8},
88-
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
89-
uis_utf8, uis_utf8},
90-
{uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
91-
uis_utf16le, uis_utf16le, uis_utf16le},
92-
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
93-
uis_utf16le, uis_utf16le, uis_utf16le},
94-
{uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
95-
uis_utf16le, uis_utf16le, uis_utf16le},
96-
{uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
97-
uis_utf32le, uis_utf32le, uis_utf32le},
98-
{uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
99-
uis_utf8, uis_utf8},
100-
{uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
101-
uis_utf16le, uis_utf16le, uis_utf16le},
102-
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
103-
uis_utf16le, uis_utf16le, uis_utf16le},
104-
{uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
105-
uis_utf8},
106-
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
107-
uis_utf8},
108-
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
109-
uis_utf8},
110-
};
111-
112-
static char s_introUngetCount[][uictMax] = {
113-
// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
114-
{0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
115-
{3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4},
116-
{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
117-
{2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2},
118-
{0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
119-
{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
120-
{0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3},
121-
{4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2},
122-
{3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1},
123-
};
124-
125-
inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
126-
if (std::istream::traits_type::eof() == ch) {
127-
return uictOther;
128-
}
129-
130-
switch (ch) {
131-
case 0:
132-
return uict00;
133-
case 0xBB:
134-
return uictBB;
135-
case 0xBF:
136-
return uictBF;
137-
case 0xEF:
138-
return uictEF;
139-
case 0xFE:
140-
return uictFE;
141-
case 0xFF:
142-
return uictFF;
143-
}
144-
145-
if ((ch > 0) && (ch < 0xFF)) {
146-
return uictAscii;
147-
}
148-
149-
return uictOther;
150-
}
15113

15214
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
15315
unsigned char rshift) {
@@ -182,6 +44,58 @@ inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
18244
}
18345
}
18446

47+
uint8_t Stream::CheckBOM(const unsigned char* bom, uint8_t size) {
48+
if (size >= 4) {
49+
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFE && bom[3] == 0xFF) {
50+
m_charSet = utf32be;
51+
return 4;
52+
}
53+
if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0x00) {
54+
m_charSet = utf32be;
55+
return 0;
56+
}
57+
58+
if (bom[0] == 0xFF && bom[1] == 0xFE && bom[2] == 0x00 && bom[3] == 0x00) {
59+
m_charSet = utf32le;
60+
return 4;
61+
}
62+
if (bom[1] == 0x00 && bom[2] == 0x00 && bom[3] == 0x00) {
63+
m_charSet = utf32le;
64+
return 0;
65+
}
66+
}
67+
68+
if (size >= 2) {
69+
if (bom[0] == 0xFE && bom[1] == 0xFF) {
70+
m_charSet = utf16be;
71+
return 2;
72+
}
73+
if (bom[0] == 0x00) {
74+
m_charSet = utf16be;
75+
return 0;
76+
}
77+
78+
if (bom[0] == 0xFF && bom[1] == 0xFE) {
79+
m_charSet = utf16le;
80+
return 2;
81+
}
82+
if (bom[1] == 0x00) {
83+
m_charSet = utf16le;
84+
return 0;
85+
}
86+
}
87+
88+
if (size >= 3) {
89+
if (bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
90+
m_charSet = utf8;
91+
return 3;
92+
}
93+
}
94+
95+
m_charSet = utf8;
96+
return 0;
97+
}
98+
18599
Stream::Stream(std::istream& input)
186100
: m_input(input),
187101
m_mark{},
@@ -190,52 +104,28 @@ Stream::Stream(std::istream& input)
190104
m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
191105
m_nPrefetchedAvailable(0),
192106
m_nPrefetchedUsed(0) {
193-
using char_traits = std::istream::traits_type;
194107

195108
if (!input)
196109
return;
197110

198111
// Determine (or guess) the character-set by reading the BOM, if any. See
199112
// the YAML specification for the determination algorithm.
200-
char_traits::int_type intro[4]{};
201-
int nIntroUsed = 0;
202-
UtfIntroState state = uis_start;
203-
for (; !s_introFinalState[state];) {
204-
std::istream::int_type ch = input.get();
205-
intro[nIntroUsed++] = ch;
206-
UtfIntroCharType charType = IntroCharTypeOf(ch);
207-
UtfIntroState newState = s_introTransitions[state][charType];
208-
int nUngets = s_introUngetCount[state][charType];
209-
if (nUngets > 0) {
113+
unsigned char buffer[4];
114+
uint8_t size = 4;
115+
for (uint8_t i = 0; i < 4; i++) {
116+
buffer[i] = input.get();
117+
if (!input.good()) {
210118
input.clear();
211-
for (; nUngets > 0; --nUngets) {
212-
if (char_traits::eof() != intro[--nIntroUsed])
213-
input.putback(char_traits::to_char_type(intro[nIntroUsed]));
214-
}
119+
size = i;
120+
break;
215121
}
216-
state = newState;
217122
}
218-
219-
switch (state) {
220-
case uis_utf8:
221-
m_charSet = utf8;
222-
break;
223-
case uis_utf16le:
224-
m_charSet = utf16le;
225-
break;
226-
case uis_utf16be:
227-
m_charSet = utf16be;
228-
break;
229-
case uis_utf32le:
230-
m_charSet = utf32le;
231-
break;
232-
case uis_utf32be:
233-
m_charSet = utf32be;
234-
break;
235-
default:
236-
m_charSet = utf8;
237-
break;
123+
auto bom_size = CheckBOM(buffer, size);
124+
size -= bom_size;
125+
for (uint8_t i = 0; i < size; i++) {
126+
m_pPrefetched[i] = buffer[bom_size + i];
238127
}
128+
m_nPrefetchedAvailable = size;
239129

240130
ReadAheadTo(0);
241131
}
@@ -409,11 +299,8 @@ unsigned char Stream::GetNextByte() const {
409299
m_nPrefetchedAvailable = static_cast<std::size_t>(
410300
pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
411301
m_nPrefetchedUsed = 0;
412-
if (!m_nPrefetchedAvailable) {
302+
if (m_nPrefetchedAvailable == 0) {
413303
m_input.setstate(std::ios_base::eofbit);
414-
}
415-
416-
if (0 == m_nPrefetchedAvailable) {
417304
return 0;
418305
}
419306
}

src/stream.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class Stream {
6666
void StreamInUtf16() const;
6767
void StreamInUtf32() const;
6868
unsigned char GetNextByte() const;
69+
uint8_t CheckBOM(const unsigned char* bom, uint8_t size);
6970
};
7071

7172
// CharAt

0 commit comments

Comments
 (0)