Commit 2fc82358 by Niels

clean up

parent b21bf956
...@@ -2456,7 +2456,10 @@ class basic_json ...@@ -2456,7 +2456,10 @@ class basic_json
end_of_input end_of_input
}; };
inline lexer(const char* s) : m_content(s) /// the char type to use in the lexer
using lexer_char_t = typename string_t::value_type;
inline lexer(const typename string_t::value_type* s) : m_content(s)
{ {
m_start = m_cursor = m_content; m_start = m_cursor = m_content;
m_limit = m_content + strlen(m_content); m_limit = m_content + strlen(m_content);
...@@ -2464,46 +2467,39 @@ class basic_json ...@@ -2464,46 +2467,39 @@ class basic_json
inline lexer() = default; inline lexer() = default;
template<typename CharT> inline static string_t to_unicode(const long codepoint)
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{ {
std::string result; string_t result;
if (codepoint <= 0x7f) if (codepoint <= 0x7f)
{ {
// 1-byte (ASCII) characters: 0xxxxxxx // 1-byte characters: 0xxxxxxx (ASCI)
result.append(1, static_cast<char>(codepoint)); result.append(1, static_cast<typename string_t::value_type>(codepoint));
} }
else if (codepoint <= 0x7ff) else if (codepoint <= 0x7ff)
{ {
// 2-byte characters: 110xxxxx 10xxxxxx // 2-byte characters: 110xxxxx 10xxxxxx
// the 0xC0 enables the two most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
// a 2-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0xffff) else if (codepoint <= 0xffff)
{ {
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
// the 0xE0 enables the three most significant bits to make result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
// this a 3-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0x10ffff) else if (codepoint <= 0x10ffff)
{ {
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// the 0xF0 enables the four most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
// a 4-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07))); result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else else
{ {
throw std::out_of_range("code point is invalid"); throw std::out_of_range("code points above 0x10FFFF are invalid");
} }
return result; return result;
...@@ -2553,353 +2549,359 @@ class basic_json ...@@ -2553,353 +2549,359 @@ class basic_json
with goto jumps. with goto jumps.
@return the class of the next token read from the buffer @return the class of the next token read from the buffer
@todo Unicode support needs to be checked.
*/ */
inline token_type scan() inline token_type scan()
{ {
// pointer for backtracking information // pointer for backtracking information
const char* m_marker = nullptr; const typename string_t::value_type* m_marker = nullptr;
while (true) // remember the begin of the token
{ m_start = m_cursor;
// remember the begin of the token
m_start = m_cursor;
{
lexer_char_t yych;
{ unsigned int yyaccept = 0;
char yych; static const unsigned char yybm[] =
unsigned int yyaccept = 0; {
static const unsigned char yybm[] = 0, 64, 64, 64, 64, 64, 64, 64,
{ 64, 96, 96, 64, 64, 96, 64, 64,
0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 96, 96, 64, 64, 96, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 96, 64, 0, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
96, 64, 0, 64, 64, 64, 64, 64, 192, 192, 192, 192, 192, 192, 192, 192,
64, 64, 64, 64, 64, 64, 64, 64, 192, 192, 64, 64, 64, 64, 64, 64,
192, 192, 192, 192, 192, 192, 192, 192, 64, 64, 64, 64, 64, 64, 64, 64,
192, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, };
64, 64, 64, 64, 64, 64, 64, 64,
}; yych = *m_cursor;
if (yych <= '9')
yych = *m_cursor; {
if (yych <= '9') if (yych <= ' ')
{ {
if (yych <= ' ') if (yych <= '\n')
{ {
if (yych <= '\n') if (yych <= 0x00)
{ {
if (yych <= 0x00) goto basic_json_parser_27;
{
goto basic_json_parser_27;
}
if (yych <= 0x08)
{
goto basic_json_parser_29;
}
if (yych >= '\n')
{
goto basic_json_parser_4;
}
} }
else if (yych <= 0x08)
{ {
if (yych == '\r') goto basic_json_parser_29;
{ }
goto basic_json_parser_2; if (yych >= '\n')
} {
if (yych <= 0x1F) goto basic_json_parser_4;
{
goto basic_json_parser_29;
}
} }
} }
else else
{ {
if (yych <= ',') if (yych == '\r')
{ {
if (yych == '"') goto basic_json_parser_2;
{
goto basic_json_parser_26;
}
if (yych <= '+')
{
goto basic_json_parser_29;
}
goto basic_json_parser_14;
} }
else if (yych <= 0x1F)
{ {
if (yych <= '-') goto basic_json_parser_29;
{
goto basic_json_parser_22;
}
if (yych <= '/')
{
goto basic_json_parser_29;
}
if (yych <= '0')
{
goto basic_json_parser_23;
}
goto basic_json_parser_25;
} }
} }
} }
else else
{ {
if (yych <= 'm') if (yych <= ',')
{ {
if (yych <= '\\') if (yych == '"')
{ {
if (yych <= ':') goto basic_json_parser_26;
{
goto basic_json_parser_16;
}
if (yych == '[')
{
goto basic_json_parser_6;
}
goto basic_json_parser_29;
} }
else if (yych <= '+')
{ {
if (yych <= ']')
{
goto basic_json_parser_8;
}
if (yych == 'f')
{
goto basic_json_parser_21;
}
goto basic_json_parser_29; goto basic_json_parser_29;
} }
goto basic_json_parser_14;
} }
else else
{ {
if (yych <= 'z') if (yych <= '-')
{ {
if (yych <= 'n') goto basic_json_parser_22;
{
goto basic_json_parser_18;
}
if (yych == 't')
{
goto basic_json_parser_20;
}
goto basic_json_parser_29;
} }
else if (yych <= '/')
{ {
if (yych <= '{')
{
goto basic_json_parser_10;
}
if (yych == '}')
{
goto basic_json_parser_12;
}
goto basic_json_parser_29; goto basic_json_parser_29;
} }
if (yych <= '0')
{
goto basic_json_parser_23;
}
goto basic_json_parser_25;
} }
} }
basic_json_parser_2: }
++m_cursor; else
yych = *m_cursor; {
goto basic_json_parser_5; if (yych <= 'm')
basic_json_parser_3: {
if (yych <= '\\')
{
if (yych <= ':')
{
goto basic_json_parser_16;
}
if (yych == '[')
{
goto basic_json_parser_6;
}
goto basic_json_parser_29;
}
else
{
if (yych <= ']')
{
goto basic_json_parser_8;
}
if (yych == 'f')
{
goto basic_json_parser_21;
}
goto basic_json_parser_29;
}
}
else
{ {
continue; if (yych <= 'z')
{
if (yych <= 'n')
{
goto basic_json_parser_18;
}
if (yych == 't')
{
goto basic_json_parser_20;
}
goto basic_json_parser_29;
}
else
{
if (yych <= '{')
{
goto basic_json_parser_10;
}
if (yych == '}')
{
goto basic_json_parser_12;
}
goto basic_json_parser_29;
}
} }
}
basic_json_parser_2:
++m_cursor;
yych = *m_cursor;
goto basic_json_parser_5;
basic_json_parser_3:
{
return scan();
}
basic_json_parser_4: basic_json_parser_4:
++m_cursor; ++m_cursor;
yych = *m_cursor; yych = *m_cursor;
basic_json_parser_5: basic_json_parser_5:
if (yybm[0 + yych] & 32) if (yybm[0 + yych] & 32)
{ {
goto basic_json_parser_4; goto basic_json_parser_4;
} }
goto basic_json_parser_3; goto basic_json_parser_3;
basic_json_parser_6: basic_json_parser_6:
++m_cursor; ++m_cursor;
{ {
return token_type::begin_array; return token_type::begin_array;
} }
basic_json_parser_8: basic_json_parser_8:
++m_cursor; ++m_cursor;
{ {
return token_type::end_array; return token_type::end_array;
} }
basic_json_parser_10: basic_json_parser_10:
++m_cursor; ++m_cursor;
{ {
return token_type::begin_object; return token_type::begin_object;
} }
basic_json_parser_12: basic_json_parser_12:
++m_cursor; ++m_cursor;
{ {
return token_type::end_object; return token_type::end_object;
} }
basic_json_parser_14: basic_json_parser_14:
++m_cursor; ++m_cursor;
{ {
return token_type::value_separator; return token_type::value_separator;
} }
basic_json_parser_16: basic_json_parser_16:
++m_cursor; ++m_cursor;
{ {
return token_type::name_separator; return token_type::name_separator;
} }
basic_json_parser_18: basic_json_parser_18:
yyaccept = 0; yyaccept = 0;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
if (yych == 'u') if (yych == 'u')
{ {
goto basic_json_parser_59; goto basic_json_parser_59;
} }
basic_json_parser_19: basic_json_parser_19:
{ {
return token_type::parse_error; return token_type::parse_error;
} }
basic_json_parser_20: basic_json_parser_20:
yyaccept = 0; yyaccept = 0;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
if (yych == 'r') if (yych == 'r')
{ {
goto basic_json_parser_55; goto basic_json_parser_55;
} }
goto basic_json_parser_19; goto basic_json_parser_19;
basic_json_parser_21: basic_json_parser_21:
yyaccept = 0; yyaccept = 0;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
if (yych == 'a') if (yych == 'a')
{ {
goto basic_json_parser_50; goto basic_json_parser_50;
} }
goto basic_json_parser_19; goto basic_json_parser_19;
basic_json_parser_22: basic_json_parser_22:
yych = *++m_cursor; yych = *++m_cursor;
if (yych <= '/') if (yych <= '/')
{ {
goto basic_json_parser_19;
}
if (yych <= '0')
{
goto basic_json_parser_49;
}
if (yych <= '9')
{
goto basic_json_parser_40;
}
goto basic_json_parser_19; goto basic_json_parser_19;
}
if (yych <= '0')
{
goto basic_json_parser_49;
}
if (yych <= '9')
{
goto basic_json_parser_40;
}
goto basic_json_parser_19;
basic_json_parser_23: basic_json_parser_23:
yyaccept = 1; yyaccept = 1;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
if (yych <= 'D') if (yych <= 'D')
{
if (yych == '.')
{ {
if (yych == '.') goto basic_json_parser_42;
{
goto basic_json_parser_42;
}
} }
else }
else
{
if (yych <= 'E')
{ {
if (yych <= 'E') goto basic_json_parser_43;
{
goto basic_json_parser_43;
}
if (yych == 'e')
{
goto basic_json_parser_43;
}
} }
basic_json_parser_24: if (yych == 'e')
{ {
return token_type::value_number; goto basic_json_parser_43;
} }
}
basic_json_parser_24:
{
return token_type::value_number;
}
basic_json_parser_25: basic_json_parser_25:
yyaccept = 1; yyaccept = 1;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
goto basic_json_parser_41; goto basic_json_parser_41;
basic_json_parser_26: basic_json_parser_26:
yyaccept = 0; yyaccept = 0;
yych = *(m_marker = ++m_cursor); yych = *(m_marker = ++m_cursor);
if (yych <= 0x00) if (yych <= 0x00)
{ {
goto basic_json_parser_19; goto basic_json_parser_19;
} }
goto basic_json_parser_31; goto basic_json_parser_31;
basic_json_parser_27: basic_json_parser_27:
++m_cursor; ++m_cursor;
{ {
return token_type::end_of_input; return token_type::end_of_input;
} }
basic_json_parser_29: basic_json_parser_29:
yych = *++m_cursor; yych = *++m_cursor;
goto basic_json_parser_19; goto basic_json_parser_19;
basic_json_parser_30: basic_json_parser_30:
++m_cursor; ++m_cursor;
yych = *m_cursor; yych = *m_cursor;
basic_json_parser_31: basic_json_parser_31:
if (yybm[0 + yych] & 64) if (yybm[0 + yych] & 64)
{ {
goto basic_json_parser_30; goto basic_json_parser_30;
} }
if (yych <= 0x00) if (yych <= 0x00)
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
if (yych <= '"') if (yych <= '"')
{ {
goto basic_json_parser_34; goto basic_json_parser_34;
} }
goto basic_json_parser_33; goto basic_json_parser_33;
basic_json_parser_32: basic_json_parser_32:
m_cursor = m_marker; m_cursor = m_marker;
if (yyaccept == 0) if (yyaccept == 0)
{
goto basic_json_parser_19;
}
else
{
goto basic_json_parser_24;
}
basic_json_parser_33:
++m_cursor;
yych = *m_cursor;
if (yych <= 'e')
{
if (yych <= '/')
{ {
goto basic_json_parser_19; if (yych == '"')
{
goto basic_json_parser_30;
}
if (yych <= '.')
{
goto basic_json_parser_32;
}
goto basic_json_parser_30;
} }
else else
{ {
goto basic_json_parser_24; if (yych <= '\\')
}
basic_json_parser_33:
++m_cursor;
yych = *m_cursor;
if (yych <= 'e')
{
if (yych <= '/')
{ {
if (yych == '"') if (yych <= '[')
{
goto basic_json_parser_30;
}
if (yych <= '.')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
...@@ -2907,33 +2909,33 @@ basic_json_parser_33: ...@@ -2907,33 +2909,33 @@ basic_json_parser_33:
} }
else else
{ {
if (yych <= '\\') if (yych == 'b')
{ {
if (yych <= '[')
{
goto basic_json_parser_32;
}
goto basic_json_parser_30; goto basic_json_parser_30;
} }
else goto basic_json_parser_32;
{ }
if (yych == 'b') }
{ }
goto basic_json_parser_30; else
} {
goto basic_json_parser_32; if (yych <= 'q')
} {
if (yych <= 'f')
{
goto basic_json_parser_30;
}
if (yych == 'n')
{
goto basic_json_parser_30;
} }
goto basic_json_parser_32;
} }
else else
{ {
if (yych <= 'q') if (yych <= 's')
{ {
if (yych <= 'f') if (yych <= 'r')
{
goto basic_json_parser_30;
}
if (yych == 'n')
{ {
goto basic_json_parser_30; goto basic_json_parser_30;
} }
...@@ -2941,346 +2943,335 @@ basic_json_parser_33: ...@@ -2941,346 +2943,335 @@ basic_json_parser_33:
} }
else else
{ {
if (yych <= 's') if (yych <= 't')
{ {
if (yych <= 'r') goto basic_json_parser_30;
{
goto basic_json_parser_30;
}
goto basic_json_parser_32;
} }
else if (yych <= 'u')
{ {
if (yych <= 't') goto basic_json_parser_36;
{
goto basic_json_parser_30;
}
if (yych <= 'u')
{
goto basic_json_parser_36;
}
goto basic_json_parser_32;
} }
goto basic_json_parser_32;
} }
} }
}
basic_json_parser_34: basic_json_parser_34:
++m_cursor; ++m_cursor;
{ {
return token_type::value_string; return token_type::value_string;
} }
basic_json_parser_36: basic_json_parser_36:
++m_cursor; ++m_cursor;
yych = *m_cursor; yych = *m_cursor;
if (yych <= '@') if (yych <= '@')
{ {
if (yych <= '/') if (yych <= '/')
{
goto basic_json_parser_32;
}
if (yych >= ':')
{
goto basic_json_parser_32;
}
}
else
{ {
if (yych <= 'F') goto basic_json_parser_32;
{
goto basic_json_parser_37;
}
if (yych <= '`')
{
goto basic_json_parser_32;
}
if (yych >= 'g')
{
goto basic_json_parser_32;
}
} }
basic_json_parser_37: if (yych >= ':')
++m_cursor;
yych = *m_cursor;
if (yych <= '@')
{ {
if (yych <= '/') goto basic_json_parser_32;
{
goto basic_json_parser_32;
}
if (yych >= ':')
{
goto basic_json_parser_32;
}
} }
else }
else
{
if (yych <= 'F')
{ {
if (yych <= 'F') goto basic_json_parser_37;
{
goto basic_json_parser_38;
}
if (yych <= '`')
{
goto basic_json_parser_32;
}
if (yych >= 'g')
{
goto basic_json_parser_32;
}
} }
basic_json_parser_38: if (yych <= '`')
++m_cursor;
yych = *m_cursor;
if (yych <= '@')
{ {
if (yych <= '/') goto basic_json_parser_32;
{
goto basic_json_parser_32;
}
if (yych >= ':')
{
goto basic_json_parser_32;
}
} }
else if (yych >= 'g')
{ {
if (yych <= 'F') goto basic_json_parser_32;
{
goto basic_json_parser_39;
}
if (yych <= '`')
{
goto basic_json_parser_32;
}
if (yych >= 'g')
{
goto basic_json_parser_32;
}
} }
basic_json_parser_39: }
++m_cursor; basic_json_parser_37:
yych = *m_cursor; ++m_cursor;
if (yych <= '@') yych = *m_cursor;
if (yych <= '@')
{
if (yych <= '/')
{ {
if (yych <= '/')
{
goto basic_json_parser_32;
}
if (yych <= '9')
{
goto basic_json_parser_30;
}
goto basic_json_parser_32; goto basic_json_parser_32;
} }
else if (yych >= ':')
{ {
if (yych <= 'F')
{
goto basic_json_parser_30;
}
if (yych <= '`')
{
goto basic_json_parser_32;
}
if (yych <= 'f')
{
goto basic_json_parser_30;
}
goto basic_json_parser_32; goto basic_json_parser_32;
} }
basic_json_parser_40: }
yyaccept = 1; else
m_marker = ++m_cursor; {
yych = *m_cursor; if (yych <= 'F')
basic_json_parser_41:
if (yybm[0 + yych] & 128)
{ {
goto basic_json_parser_40; goto basic_json_parser_38;
} }
if (yych <= 'D') if (yych <= '`')
{ {
if (yych != '.') goto basic_json_parser_32;
{
goto basic_json_parser_24;
}
} }
else if (yych >= 'g')
{ {
if (yych <= 'E') goto basic_json_parser_32;
{
goto basic_json_parser_43;
}
if (yych == 'e')
{
goto basic_json_parser_43;
}
goto basic_json_parser_24;
} }
basic_json_parser_42: }
yych = *++m_cursor; basic_json_parser_38:
++m_cursor;
yych = *m_cursor;
if (yych <= '@')
{
if (yych <= '/') if (yych <= '/')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
if (yych <= '9') if (yych >= ':')
{ {
goto basic_json_parser_47; goto basic_json_parser_32;
} }
goto basic_json_parser_32; }
basic_json_parser_43: else
yych = *++m_cursor; {
if (yych <= ',') if (yych <= 'F')
{ {
if (yych != '+') goto basic_json_parser_39;
{
goto basic_json_parser_32;
}
} }
else if (yych <= '`')
{ {
if (yych <= '-')
{
goto basic_json_parser_44;
}
if (yych <= '/')
{
goto basic_json_parser_32;
}
if (yych <= '9')
{
goto basic_json_parser_45;
}
goto basic_json_parser_32; goto basic_json_parser_32;
} }
basic_json_parser_44: if (yych >= 'g')
yych = *++m_cursor;
if (yych <= '/')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
if (yych >= ':') }
basic_json_parser_39:
++m_cursor;
yych = *m_cursor;
if (yych <= '@')
{
if (yych <= '/')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
basic_json_parser_45: if (yych <= '9')
++m_cursor;
yych = *m_cursor;
if (yych <= '/')
{ {
goto basic_json_parser_24; goto basic_json_parser_30;
} }
if (yych <= '9') goto basic_json_parser_32;
}
else
{
if (yych <= 'F')
{ {
goto basic_json_parser_45; goto basic_json_parser_30;
} }
goto basic_json_parser_24; if (yych <= '`')
basic_json_parser_47:
yyaccept = 1;
m_marker = ++m_cursor;
yych = *m_cursor;
if (yych <= 'D')
{ {
if (yych <= '/') goto basic_json_parser_32;
{
goto basic_json_parser_24;
}
if (yych <= '9')
{
goto basic_json_parser_47;
}
goto basic_json_parser_24;
} }
else if (yych <= 'f')
{ {
if (yych <= 'E') goto basic_json_parser_30;
{
goto basic_json_parser_43;
}
if (yych == 'e')
{
goto basic_json_parser_43;
}
goto basic_json_parser_24;
} }
basic_json_parser_49: goto basic_json_parser_32;
yyaccept = 1; }
yych = *(m_marker = ++m_cursor); basic_json_parser_40:
if (yych <= 'D') yyaccept = 1;
m_marker = ++m_cursor;
yych = *m_cursor;
basic_json_parser_41:
if (yybm[0 + yych] & 128)
{
goto basic_json_parser_40;
}
if (yych <= 'D')
{
if (yych != '.')
{ {
if (yych == '.')
{
goto basic_json_parser_42;
}
goto basic_json_parser_24; goto basic_json_parser_24;
} }
else }
else
{
if (yych <= 'E')
{ {
if (yych <= 'E') goto basic_json_parser_43;
{
goto basic_json_parser_43;
}
if (yych == 'e')
{
goto basic_json_parser_43;
}
goto basic_json_parser_24;
} }
basic_json_parser_50: if (yych == 'e')
yych = *++m_cursor;
if (yych != 'l')
{ {
goto basic_json_parser_32; goto basic_json_parser_43;
} }
yych = *++m_cursor; goto basic_json_parser_24;
if (yych != 's') }
basic_json_parser_42:
yych = *++m_cursor;
if (yych <= '/')
{
goto basic_json_parser_32;
}
if (yych <= '9')
{
goto basic_json_parser_47;
}
goto basic_json_parser_32;
basic_json_parser_43:
yych = *++m_cursor;
if (yych <= ',')
{
if (yych != '+')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
yych = *++m_cursor; }
if (yych != 'e') else
{
if (yych <= '-')
{
goto basic_json_parser_44;
}
if (yych <= '/')
{ {
goto basic_json_parser_32; goto basic_json_parser_32;
} }
++m_cursor; if (yych <= '9')
{ {
return token_type::literal_false; goto basic_json_parser_45;
} }
basic_json_parser_55: goto basic_json_parser_32;
yych = *++m_cursor; }
if (yych != 'u') basic_json_parser_44:
yych = *++m_cursor;
if (yych <= '/')
{
goto basic_json_parser_32;
}
if (yych >= ':')
{
goto basic_json_parser_32;
}
basic_json_parser_45:
++m_cursor;
yych = *m_cursor;
if (yych <= '/')
{
goto basic_json_parser_24;
}
if (yych <= '9')
{
goto basic_json_parser_45;
}
goto basic_json_parser_24;
basic_json_parser_47:
yyaccept = 1;
m_marker = ++m_cursor;
yych = *m_cursor;
if (yych <= 'D')
{
if (yych <= '/')
{ {
goto basic_json_parser_32; goto basic_json_parser_24;
} }
yych = *++m_cursor; if (yych <= '9')
if (yych != 'e')
{ {
goto basic_json_parser_32; goto basic_json_parser_47;
} }
++m_cursor; goto basic_json_parser_24;
}
else
{
if (yych <= 'E')
{ {
return token_type::literal_true; goto basic_json_parser_43;
} }
basic_json_parser_59: if (yych == 'e')
yych = *++m_cursor;
if (yych != 'l')
{ {
goto basic_json_parser_32; goto basic_json_parser_43;
}
goto basic_json_parser_24;
}
basic_json_parser_49:
yyaccept = 1;
yych = *(m_marker = ++m_cursor);
if (yych <= 'D')
{
if (yych == '.')
{
goto basic_json_parser_42;
} }
yych = *++m_cursor; goto basic_json_parser_24;
if (yych != 'l') }
else
{
if (yych <= 'E')
{ {
goto basic_json_parser_32; goto basic_json_parser_43;
} }
++m_cursor; if (yych == 'e')
{ {
return token_type::literal_null; goto basic_json_parser_43;
} }
goto basic_json_parser_24;
}
basic_json_parser_50:
yych = *++m_cursor;
if (yych != 'l')
{
goto basic_json_parser_32;
}
yych = *++m_cursor;
if (yych != 's')
{
goto basic_json_parser_32;
}
yych = *++m_cursor;
if (yych != 'e')
{
goto basic_json_parser_32;
}
++m_cursor;
{
return token_type::literal_false;
}
basic_json_parser_55:
yych = *++m_cursor;
if (yych != 'u')
{
goto basic_json_parser_32;
}
yych = *++m_cursor;
if (yych != 'e')
{
goto basic_json_parser_32;
}
++m_cursor;
{
return token_type::literal_true;
}
basic_json_parser_59:
yych = *++m_cursor;
if (yych != 'l')
{
goto basic_json_parser_32;
}
yych = *++m_cursor;
if (yych != 'l')
{
goto basic_json_parser_32;
}
++m_cursor;
{
return token_type::literal_null;
} }
} }
} }
inline std::string get_token() const inline string_t get_token() const
{ {
return std::string(m_start, static_cast<size_t>(m_cursor - m_start)); return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
} }
/*! /*!
...@@ -3291,16 +3282,14 @@ basic_json_parser_59: ...@@ -3291,16 +3282,14 @@ basic_json_parser_59:
from the pointer difference of the two pointers). from the pointer difference of the two pointers).
@return string value of current token without opening and closing quotes @return string value of current token without opening and closing quotes
@todo Take care of Unicode.
*/ */
inline std::string get_string() const inline string_t get_string() const
{ {
std::string result; string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2)); result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
// iterate the result between the quotes // iterate the result between the quotes
for (const char* i = m_start + 1; i < m_cursor - 1; ++i) for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
{ {
// process escaped characters // process escaped characters
if (*i == '\\') if (*i == '\\')
...@@ -3360,7 +3349,7 @@ basic_json_parser_59: ...@@ -3360,7 +3349,7 @@ basic_json_parser_59:
// get code xxxx from \uxxxx // get code xxxx from \uxxxx
auto codepoint = strtol(i + 1, nullptr, 16); auto codepoint = strtol(i + 1, nullptr, 16);
// add unicode character(s) // add unicode character(s)
result += to_unicode<char>(codepoint); result += to_unicode(codepoint);
// skip the next four characters (\uxxxx) // skip the next four characters (\uxxxx)
i += 4; i += 4;
break; break;
...@@ -3399,20 +3388,20 @@ basic_json_parser_59: ...@@ -3399,20 +3388,20 @@ basic_json_parser_59:
private: private:
/// the buffer /// the buffer
const char* m_content = nullptr; const typename string_t::value_type* m_content = nullptr;
/// pointer to he beginning of the current symbol /// pointer to he beginning of the current symbol
const char* m_start = nullptr; const typename string_t::value_type* m_start = nullptr;
/// pointer to the current symbol /// pointer to the current symbol
const char* m_cursor = nullptr; const typename string_t::value_type* m_cursor = nullptr;
/// pointer to the end of the buffer /// pointer to the end of the buffer
const char* m_limit = nullptr; const typename string_t::value_type* m_limit = nullptr;
}; };
class parser class parser
{ {
public: public:
/// constructor for strings /// constructor for strings
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
{ {
// read first token // read first token
get_token(); get_token();
...@@ -3423,7 +3412,7 @@ basic_json_parser_59: ...@@ -3423,7 +3412,7 @@ basic_json_parser_59:
{ {
while (_is) while (_is)
{ {
std::string input_line; string_t input_line;
std::getline(_is, input_line); std::getline(_is, input_line);
m_buffer += input_line; m_buffer += input_line;
} }
...@@ -3617,7 +3606,7 @@ basic_json_parser_59: ...@@ -3617,7 +3606,7 @@ basic_json_parser_59:
private: private:
/// the buffer /// the buffer
std::string m_buffer; string_t m_buffer;
/// the type of the last read token /// the type of the last read token
typename lexer::token_type last_token = lexer::token_type::uninitialized; typename lexer::token_type last_token = lexer::token_type::uninitialized;
/// the lexer /// the lexer
......
...@@ -2456,7 +2456,10 @@ class basic_json ...@@ -2456,7 +2456,10 @@ class basic_json
end_of_input end_of_input
}; };
inline lexer(const char* s) : m_content(s) /// the char type to use in the lexer
using lexer_char_t = typename string_t::value_type;
inline lexer(const typename string_t::value_type* s) : m_content(s)
{ {
m_start = m_cursor = m_content; m_start = m_cursor = m_content;
m_limit = m_content + strlen(m_content); m_limit = m_content + strlen(m_content);
...@@ -2464,46 +2467,39 @@ class basic_json ...@@ -2464,46 +2467,39 @@ class basic_json
inline lexer() = default; inline lexer() = default;
template<typename CharT> inline static string_t to_unicode(const long codepoint)
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{ {
std::string result; string_t result;
if (codepoint <= 0x7f) if (codepoint <= 0x7f)
{ {
// 1-byte (ASCII) characters: 0xxxxxxx // 1-byte characters: 0xxxxxxx (ASCI)
result.append(1, static_cast<char>(codepoint)); result.append(1, static_cast<typename string_t::value_type>(codepoint));
} }
else if (codepoint <= 0x7ff) else if (codepoint <= 0x7ff)
{ {
// 2-byte characters: 110xxxxx 10xxxxxx // 2-byte characters: 110xxxxx 10xxxxxx
// the 0xC0 enables the two most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
// a 2-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0xffff) else if (codepoint <= 0xffff)
{ {
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
// the 0xE0 enables the three most significant bits to make result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
// this a 3-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0x10ffff) else if (codepoint <= 0x10ffff)
{ {
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// the 0xF0 enables the four most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
// a 4-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07))); result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else else
{ {
throw std::out_of_range("code point is invalid"); throw std::out_of_range("code points above 0x10FFFF are invalid");
} }
return result; return result;
...@@ -2557,77 +2553,74 @@ class basic_json ...@@ -2557,77 +2553,74 @@ class basic_json
inline token_type scan() inline token_type scan()
{ {
// pointer for backtracking information // pointer for backtracking information
const char* m_marker = nullptr; const typename string_t::value_type* m_marker = nullptr;
while (true) // remember the begin of the token
{ m_start = m_cursor;
// remember the begin of the token
m_start = m_cursor; /*!re2c
re2c:define:YYCTYPE = lexer_char_t;
/*!re2c re2c:define:YYCURSOR = m_cursor;
re2c:define:YYCTYPE = char; re2c:define:YYLIMIT = m_limit;
re2c:define:YYCURSOR = m_cursor; re2c:define:YYMARKER = m_marker;
re2c:define:YYLIMIT = m_limit; re2c:indent:string = " ";
re2c:define:YYMARKER = m_marker; re2c:indent:top = 1;
re2c:indent:string = " "; re2c:labelprefix = "basic_json_parser_";
re2c:indent:top = 1; re2c:yyfill:enable = 0;
re2c:labelprefix = "basic_json_parser_";
re2c:yyfill:enable = 0; // whitespace
ws = [ \t\n\r]+;
// whitespace ws { return scan(); }
ws = [ \t\n\r]+;
ws { continue; } // structural characters
"[" { return token_type::begin_array; }
// structural characters "]" { return token_type::end_array; }
"[" { return token_type::begin_array; } "{" { return token_type::begin_object; }
"]" { return token_type::end_array; } "}" { return token_type::end_object; }
"{" { return token_type::begin_object; } "," { return token_type::value_separator; }
"}" { return token_type::end_object; } ":" { return token_type::name_separator; }
"," { return token_type::value_separator; }
":" { return token_type::name_separator; } // literal names
"null" { return token_type::literal_null; }
// literal names "true" { return token_type::literal_true; }
"null" { return token_type::literal_null; } "false" { return token_type::literal_false; }
"true" { return token_type::literal_true; }
"false" { return token_type::literal_false; } // number
decimal_point = [.];
// number digit = [0-9];
decimal_point = [.]; digit_1_9 = [1-9];
digit = [0-9]; e = [eE];
digit_1_9 = [1-9]; minus = [-];
e = [eE]; plus = [+];
minus = [-]; zero = [0];
plus = [+]; exp = e (minus|plus)? digit+;
zero = [0]; frac = decimal_point digit+;
exp = e (minus|plus)? digit+; int = (zero|digit_1_9 digit*);
frac = decimal_point digit+; number = minus? int frac? exp?;
int = (zero|digit_1_9 digit*); number { return token_type::value_number; }
number = minus? int frac? exp?;
number { return token_type::value_number; } // string
quotation_mark = [\"];
// string escape = [\\];
quotation_mark = [\"]; unescaped = [^\"\\\000];
escape = [\\]; single_escaped = [\"\\/bfnrt];
unescaped = [^\"\\\000]; unicode_escaped = [u][0-9a-fA-F]{4};
single_escaped = [\"\\/bfnrt]; escaped = escape (single_escaped | unicode_escaped);
unicode_escaped = [u][0-9a-fA-F]{4}; char = unescaped | escaped;
escaped = escape (single_escaped | unicode_escaped); string = quotation_mark char* quotation_mark;
char = unescaped | escaped; string { return token_type::value_string; }
string = quotation_mark char* quotation_mark;
string { return token_type::value_string; } // end of file
'\000' { return token_type::end_of_input; }
// end of file
'\000' { return token_type::end_of_input; } // anything else is an error
. { return token_type::parse_error; }
// anything else is an error */
. { return token_type::parse_error; } }
*/
} inline string_t get_token() const
} {
return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
inline std::string get_token() const
{
return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
} }
/*! /*!
...@@ -2638,16 +2631,14 @@ class basic_json ...@@ -2638,16 +2631,14 @@ class basic_json
from the pointer difference of the two pointers). from the pointer difference of the two pointers).
@return string value of current token without opening and closing quotes @return string value of current token without opening and closing quotes
@todo Take care of Unicode.
*/ */
inline std::string get_string() const inline string_t get_string() const
{ {
std::string result; string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2)); result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
// iterate the result between the quotes // iterate the result between the quotes
for (const char* i = m_start + 1; i < m_cursor - 1; ++i) for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
{ {
// process escaped characters // process escaped characters
if (*i == '\\') if (*i == '\\')
...@@ -2707,7 +2698,7 @@ class basic_json ...@@ -2707,7 +2698,7 @@ class basic_json
// get code xxxx from \uxxxx // get code xxxx from \uxxxx
auto codepoint = strtol(i + 1, nullptr, 16); auto codepoint = strtol(i + 1, nullptr, 16);
// add unicode character(s) // add unicode character(s)
result += to_unicode<char>(codepoint); result += to_unicode(codepoint);
// skip the next four characters (\uxxxx) // skip the next four characters (\uxxxx)
i += 4; i += 4;
break; break;
...@@ -2746,20 +2737,20 @@ class basic_json ...@@ -2746,20 +2737,20 @@ class basic_json
private: private:
/// the buffer /// the buffer
const char* m_content = nullptr; const typename string_t::value_type* m_content = nullptr;
/// pointer to the beginning of the current symbol /// pointer to the beginning of the current symbol
const char* m_start = nullptr; const typename string_t::value_type* m_start = nullptr;
/// pointer to the current symbol /// pointer to the current symbol
const char* m_cursor = nullptr; const typename string_t::value_type* m_cursor = nullptr;
/// pointer to the end of the buffer /// pointer to the end of the buffer
const char* m_limit = nullptr; const typename string_t::value_type* m_limit = nullptr;
}; };
class parser class parser
{ {
public: public:
/// constructor for strings /// constructor for strings
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
{ {
// read first token // read first token
get_token(); get_token();
...@@ -2770,7 +2761,7 @@ class basic_json ...@@ -2770,7 +2761,7 @@ class basic_json
{ {
while (_is) while (_is)
{ {
std::string input_line; string_t input_line;
std::getline(_is, input_line); std::getline(_is, input_line);
m_buffer += input_line; m_buffer += input_line;
} }
...@@ -2964,7 +2955,7 @@ class basic_json ...@@ -2964,7 +2955,7 @@ class basic_json
private: private:
/// the buffer /// the buffer
std::string m_buffer; string_t m_buffer;
/// the type of the last read token /// the type of the last read token
typename lexer::token_type last_token = lexer::token_type::uninitialized; typename lexer::token_type last_token = lexer::token_type::uninitialized;
/// the lexer /// the lexer
......
...@@ -5517,8 +5517,8 @@ TEST_CASE("lexer class") ...@@ -5517,8 +5517,8 @@ TEST_CASE("lexer class")
SECTION("to_unicode") SECTION("to_unicode")
{ {
CHECK(json::lexer::to_unicode<char>(0x1F4A9) == "💩"); CHECK(json::lexer::to_unicode(0x1F4A9) == "💩");
CHECK_THROWS_AS(json::lexer::to_unicode<char>(0x110000), std::out_of_range); CHECK_THROWS_AS(json::lexer::to_unicode(0x200000), std::out_of_range);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment