1+ #include  < cstdint> 
12#include  < iostream> 
23
34#include  " stream.h" 
67#define  YAML_PREFETCH_SIZE  2048 
78#endif 
89
// Number of elements in the built-in array A. A must be a real array, not a
// pointer, otherwise the quotient is meaningless.
#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
// Pointer one past the last element of the built-in array A.
#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
11- 
// Code point U+FFFD (the Unicode replacement character).
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
1311
1412namespace  YAML  {
// States of the automaton that examines the first bytes of a stream to decide
// its encoding. The numeric values double as row indices into the state
// tables, so the order of the enumerators must not change.
enum UtfIntroState {
  uis_start = 0,     // nothing read yet
  uis_utfbe_b1,      // intermediate
  uis_utf32be_b2,    // intermediate
  uis_utf32be_bom3,  // intermediate
  uis_utf32be,       // final
  uis_utf16be,       // final
  uis_utf16be_bom1,  // intermediate
  uis_utfle_bom1,    // intermediate
  uis_utf16le_bom2,  // intermediate
  uis_utf32le_bom3,  // intermediate
  uis_utf16le,       // final
  uis_utf32le,       // final
  uis_utf8_imp,      // intermediate
  uis_utf16le_imp,   // intermediate
  uis_utf32le_imp3,  // intermediate
  uis_utf8_bom1,     // intermediate
  uis_utf8_bom2,     // intermediate
  uis_utf8,          // final
  uis_error          // final
};
36- 
// Classes of byte values the encoding-detection automaton distinguishes. The
// numeric values double as column indices into the state tables; uictMax is
// the number of classes, not a class itself.
enum UtfIntroCharType {
  uict00 = 0,  // byte 0x00
  uictBB,      // byte 0xBB
  uictBF,      // byte 0xBF
  uictEF,      // byte 0xEF
  uictFE,      // byte 0xFE
  uictFF,      // byte 0xFF
  uictAscii,   // any other non-zero byte value below 0xFF
  uictOther,   // everything else, including end of input
  uictMax      // column count
};
48- 
// Indexed by UtfIntroState: true when the automaton stops in that state.
static bool s_introFinalState[] = {
    /*  0 uis_start        */ false,
    /*  1 uis_utfbe_b1     */ false,
    /*  2 uis_utf32be_b2   */ false,
    /*  3 uis_utf32be_bom3 */ false,
    /*  4 uis_utf32be      */ true,
    /*  5 uis_utf16be      */ true,
    /*  6 uis_utf16be_bom1 */ false,
    /*  7 uis_utfle_bom1   */ false,
    /*  8 uis_utf16le_bom2 */ false,
    /*  9 uis_utf32le_bom3 */ false,
    /* 10 uis_utf16le      */ true,
    /* 11 uis_utf32le      */ true,
    /* 12 uis_utf8_imp     */ false,
    /* 13 uis_utf16le_imp  */ false,
    /* 14 uis_utf32le_imp3 */ false,
    /* 15 uis_utf8_bom1    */ false,
    /* 16 uis_utf8_bom2    */ false,
    /* 17 uis_utf8         */ true,
    /* 18 uis_error        */ true,
};
70- 
71- static  UtfIntroState s_introTransitions[][uictMax] = {
72-     //  uict00,           uictBB,           uictBF,           uictEF,
73-     //  uictFE,           uictFF,           uictAscii,        uictOther
74-     {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
75-      uis_utfle_bom1, uis_utf8_imp, uis_utf8},
76-     {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
77-      uis_utf16be, uis_utf8},
78-     {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
79-      uis_utf8, uis_utf8},
80-     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
81-      uis_utf8},
82-     {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
83-      uis_utf32be, uis_utf32be, uis_utf32be},
84-     {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
85-      uis_utf16be, uis_utf16be, uis_utf16be},
86-     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
87-      uis_utf8},
88-     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
89-      uis_utf8, uis_utf8},
90-     {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
91-      uis_utf16le, uis_utf16le, uis_utf16le},
92-     {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
93-      uis_utf16le, uis_utf16le, uis_utf16le},
94-     {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
95-      uis_utf16le, uis_utf16le, uis_utf16le},
96-     {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
97-      uis_utf32le, uis_utf32le, uis_utf32le},
98-     {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
99-      uis_utf8, uis_utf8},
100-     {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
101-      uis_utf16le, uis_utf16le, uis_utf16le},
102-     {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
103-      uis_utf16le, uis_utf16le, uis_utf16le},
104-     {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
105-      uis_utf8},
106-     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
107-      uis_utf8},
108-     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
109-      uis_utf8},
110- };
111- 
112- static  char  s_introUngetCount[][uictMax] = {
113-     //  uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
114-     {0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 }, {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 },
115-     {3 , 3 , 3 , 3 , 0 , 3 , 3 , 3 }, {4 , 4 , 4 , 4 , 4 , 0 , 4 , 4 },
116-     {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
117-     {2 , 2 , 2 , 2 , 2 , 0 , 2 , 2 }, {2 , 2 , 2 , 2 , 0 , 2 , 2 , 2 },
118-     {0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 },
119-     {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
120-     {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 }, {0 , 3 , 3 , 3 , 3 , 3 , 3 , 3 },
121-     {4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 }, {2 , 0 , 2 , 2 , 2 , 2 , 2 , 2 },
122-     {3 , 3 , 0 , 3 , 3 , 3 , 3 , 3 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
123- };
124- 
125- inline  UtfIntroCharType IntroCharTypeOf (std::istream::int_type ch) {
126-   if  (std::istream::traits_type::eof () == ch) {
127-     return  uictOther;
128-   }
129- 
130-   switch  (ch) {
131-     case  0 :
132-       return  uict00;
133-     case  0xBB :
134-       return  uictBB;
135-     case  0xBF :
136-       return  uictBF;
137-     case  0xEF :
138-       return  uictEF;
139-     case  0xFE :
140-       return  uictFE;
141-     case  0xFF :
142-       return  uictFF;
143-   }
144- 
145-   if  ((ch > 0 ) && (ch < 0xFF )) {
146-     return  uictAscii;
147-   }
148- 
149-   return  uictOther;
150- }
15113
15214inline  char  Utf8Adjust (unsigned  long  ch, unsigned  char  lead_bits,
15315                       unsigned  char  rshift) {
@@ -182,6 +44,58 @@ inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
18244  }
18345}
18446
47+ uint8_t  Stream::CheckBOM (const  unsigned  char * bom, uint8_t  size) {
48+   if  (size >= 4 ) {
49+     if  (bom[0 ] == 0x00  && bom[1 ] == 0x00  && bom[2 ] == 0xFE  && bom[3 ] == 0xFF ) {
50+       m_charSet = utf32be;
51+       return  4 ;
52+     }
53+     if  (bom[0 ] == 0x00  && bom[1 ] == 0x00  && bom[2 ] == 0x00 ) {
54+       m_charSet = utf32be;
55+       return  0 ;
56+     }
57+ 
58+     if  (bom[0 ] == 0xFF  && bom[1 ] == 0xFE  && bom[2 ] == 0x00  && bom[3 ] == 0x00 ) {
59+       m_charSet = utf32le;
60+       return  4 ;
61+     }
62+     if  (bom[1 ] == 0x00  && bom[2 ] == 0x00  && bom[3 ] == 0x00 ) {
63+       m_charSet = utf32le;
64+       return  0 ;
65+     }
66+   }
67+ 
68+   if  (size >= 2 ) {
69+     if  (bom[0 ] == 0xFE  && bom[1 ] == 0xFF ) {
70+       m_charSet = utf16be;
71+       return  2 ;
72+     }
73+     if  (bom[0 ] == 0x00 ) {
74+       m_charSet = utf16be;
75+       return  0 ;
76+     }
77+ 
78+     if  (bom[0 ] == 0xFF  && bom[1 ] == 0xFE ) {
79+       m_charSet = utf16le;
80+       return  2 ;
81+     }
82+     if  (bom[1 ] == 0x00 ) {
83+       m_charSet = utf16le;
84+       return  0 ;
85+     }
86+   }
87+ 
88+   if  (size >= 3 ) {
89+     if  (bom[0 ] == 0xEF  && bom[1 ] == 0xBB  && bom[2 ] == 0xBF ) {
90+       m_charSet = utf8;
91+       return  3 ;
92+     }
93+   }
94+ 
95+   m_charSet = utf8;
96+   return  0 ;
97+ }
98+ 
18599Stream::Stream (std::istream& input)
186100    : m_input(input),
187101      m_mark{},
@@ -190,52 +104,28 @@ Stream::Stream(std::istream& input)
190104      m_pPrefetched (new  unsigned  char [YAML_PREFETCH_SIZE]),
191105      m_nPrefetchedAvailable (0 ),
192106      m_nPrefetchedUsed (0 ) {
193-   using  char_traits = std::istream::traits_type;
194107
195108  if  (!input)
196109    return ;
197110
198111  //  Determine (or guess) the character-set by reading the BOM, if any.  See
199112  //  the YAML specification for the determination algorithm.
200-   char_traits::int_type intro[4 ]{};
201-   int  nIntroUsed = 0 ;
202-   UtfIntroState state = uis_start;
203-   for  (; !s_introFinalState[state];) {
204-     std::istream::int_type ch = input.get ();
205-     intro[nIntroUsed++] = ch;
206-     UtfIntroCharType charType = IntroCharTypeOf (ch);
207-     UtfIntroState newState = s_introTransitions[state][charType];
208-     int  nUngets = s_introUngetCount[state][charType];
209-     if  (nUngets > 0 ) {
113+   unsigned  char  buffer[4 ];
114+   uint8_t  size = 4 ;
115+   for  (uint8_t  i = 0 ; i < 4 ; i++) {
116+     buffer[i] = input.get ();
117+     if  (!input.good ()) {
210118      input.clear ();
211-       for  (; nUngets > 0 ; --nUngets) {
212-         if  (char_traits::eof () != intro[--nIntroUsed])
213-           input.putback (char_traits::to_char_type (intro[nIntroUsed]));
214-       }
119+       size = i;
120+       break ;
215121    }
216-     state = newState;
217122  }
218- 
219-   switch  (state) {
220-     case  uis_utf8:
221-       m_charSet = utf8;
222-       break ;
223-     case  uis_utf16le:
224-       m_charSet = utf16le;
225-       break ;
226-     case  uis_utf16be:
227-       m_charSet = utf16be;
228-       break ;
229-     case  uis_utf32le:
230-       m_charSet = utf32le;
231-       break ;
232-     case  uis_utf32be:
233-       m_charSet = utf32be;
234-       break ;
235-     default :
236-       m_charSet = utf8;
237-       break ;
123+   auto  bom_size = CheckBOM (buffer, size);
124+   size -= bom_size;
125+   for  (uint8_t  i = 0 ; i < size; i++) {
126+     m_pPrefetched[i] = buffer[bom_size + i];
238127  }
128+   m_nPrefetchedAvailable = size;
239129
240130  ReadAheadTo (0 );
241131}
@@ -409,11 +299,8 @@ unsigned char Stream::GetNextByte() const {
409299    m_nPrefetchedAvailable = static_cast <std::size_t >(
410300        pBuf->sgetn (ReadBuffer (m_pPrefetched), YAML_PREFETCH_SIZE));
411301    m_nPrefetchedUsed = 0 ;
412-     if  (! m_nPrefetchedAvailable) {
302+     if  (m_nPrefetchedAvailable ==  0 ) {
413303      m_input.setstate (std::ios_base::eofbit);
414-     }
415- 
416-     if  (0  == m_nPrefetchedAvailable) {
417304      return  0 ;
418305    }
419306  }
0 commit comments