🚧 manual lexer

This commit removed the re2c lexer and replaced it with a hand-written version. Its integration is not yet complete: number parsing does not yet respect locales or detect overflows. Furthermore, the parser does not yet require the input to end with EOF. As a result, many test cases fail. The idea is to push this branch forward so we can conduct performance comparisons. So far, a nice side effect is better diagnostic messages in case of parse errors.
parent 54db53c2
.PHONY: pretty clean ChangeLog.md .PHONY: pretty clean ChangeLog.md
# used programs
RE2C := $(shell command -v re2c 2> /dev/null)
SED = sed
# main target # main target
all: all:
$(MAKE) -C test $(MAKE) -C test
...@@ -183,13 +179,6 @@ clang_sanitize: clean ...@@ -183,13 +179,6 @@ clang_sanitize: clean
# maintainer targets # maintainer targets
########################################################################## ##########################################################################
# create scanner with re2c
re2c: src/json.hpp.re2c
ifndef RE2C
$(error "re2c is not available, please install re2c")
endif
$(RE2C) -W --utf-8 --encoding-policy fail --bit-vectors --nested-ifs --no-debug-info $< | $(SED) '1d' > src/json.hpp
# pretty printer # pretty printer
pretty: pretty:
astyle --style=allman --indent=spaces=4 --indent-modifiers \ astyle --style=allman --indent=spaces=4 --indent-modifiers \
...@@ -197,7 +186,7 @@ pretty: ...@@ -197,7 +186,7 @@ pretty:
--indent-col1-comments --pad-oper --pad-header --align-pointer=type \ --indent-col1-comments --pad-oper --pad-header --align-pointer=type \
--align-reference=type --add-brackets --convert-tabs --close-templates \ --align-reference=type --add-brackets --convert-tabs --close-templates \
--lineend=linux --preserve-date --suffix=none --formatted \ --lineend=linux --preserve-date --suffix=none --formatted \
src/json.hpp src/json.hpp.re2c test/src/*.cpp \ src/json.hpp test/src/*.cpp \
benchmarks/benchmarks.cpp doc/examples/*.cpp benchmarks/benchmarks.cpp doc/examples/*.cpp
......
- test/test-class_parser
- 617 failed
- test/test-regression
- 11 failed
- test/test-testsuites
- 43 failed
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -38,79 +38,50 @@ TEST_CASE("lexer class") ...@@ -38,79 +38,50 @@ TEST_CASE("lexer class")
{ {
SECTION("structural characters") SECTION("structural characters")
{ {
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("["), CHECK((json::lexer("[", 1).scan() == json::lexer::token_type::begin_array));
1).scan() == json::lexer::token_type::begin_array)); CHECK((json::lexer("]", 1).scan() == json::lexer::token_type::end_array));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("]"), CHECK((json::lexer("{", 1).scan() == json::lexer::token_type::begin_object));
1).scan() == json::lexer::token_type::end_array)); CHECK((json::lexer("}", 1).scan() == json::lexer::token_type::end_object));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("{"), CHECK((json::lexer(",", 1).scan() == json::lexer::token_type::value_separator));
1).scan() == json::lexer::token_type::begin_object)); CHECK((json::lexer(":", 1).scan() == json::lexer::token_type::name_separator));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("}"),
1).scan() == json::lexer::token_type::end_object));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(","),
1).scan() == json::lexer::token_type::value_separator));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(":"),
1).scan() == json::lexer::token_type::name_separator));
} }
SECTION("literal names") SECTION("literal names")
{ {
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("null"), CHECK((json::lexer("null", 4).scan() == json::lexer::token_type::literal_null));
4).scan() == json::lexer::token_type::literal_null)); CHECK((json::lexer("true", 4).scan() == json::lexer::token_type::literal_true));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("true"), CHECK((json::lexer("false", 5).scan() == json::lexer::token_type::literal_false));
4).scan() == json::lexer::token_type::literal_true));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("false"),
5).scan() == json::lexer::token_type::literal_false));
} }
SECTION("numbers") SECTION("numbers")
{ {
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("0"), CHECK((json::lexer("0", 1).scan() == json::lexer::token_type::value_unsigned));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("1", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1"), CHECK((json::lexer("2", 1).scan() == json::lexer::token_type::value_unsigned));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("3", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("2"), CHECK((json::lexer("4", 1).scan() == json::lexer::token_type::value_unsigned));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("5", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("3"), CHECK((json::lexer("6", 1).scan() == json::lexer::token_type::value_unsigned));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("7", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("4"), CHECK((json::lexer("8", 1).scan() == json::lexer::token_type::value_unsigned));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("9", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("5"),
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("-0", 2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("6"), CHECK((json::lexer("-1", 2).scan() == json::lexer::token_type::value_integer));
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("7"), CHECK((json::lexer("1.1", 3).scan() == json::lexer::token_type::value_float));
1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer("-1.1", 4).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("8"), CHECK((json::lexer("1E10", 4).scan() == json::lexer::token_type::value_float));
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("9"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-0"),
2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1"),
2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1.1"),
3).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1.1"),
4).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1E10"),
4).scan() == json::lexer::token_type::value_float));
} }
SECTION("whitespace") SECTION("whitespace")
{ {
// result is end_of_input, because no token follows // result is end_of_input, because no token follows
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" "), CHECK((json::lexer(" ", 1).scan() == json::lexer::token_type::end_of_input));
1).scan() == json::lexer::token_type::end_of_input)); CHECK((json::lexer("\t", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\t"), CHECK((json::lexer("\n", 1).scan() == json::lexer::token_type::end_of_input));
1).scan() == json::lexer::token_type::end_of_input)); CHECK((json::lexer("\r", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\n"), CHECK((json::lexer(" \t\n\r\n\t ", 7).scan() == json::lexer::token_type::end_of_input));
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\r"),
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" \t\n\r\n\t "),
7).scan() == json::lexer::token_type::end_of_input));
} }
} }
...@@ -141,8 +112,7 @@ TEST_CASE("lexer class") ...@@ -141,8 +112,7 @@ TEST_CASE("lexer class")
// create string from the ASCII code // create string from the ASCII code
const auto s = std::string(1, static_cast<char>(c)); const auto s = std::string(1, static_cast<char>(c));
// store scan() result // store scan() result
const auto res = json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(s.c_str()), const auto res = json::lexer(s.c_str(), 1).scan();
1).scan();
switch (c) switch (c)
{ {
...@@ -188,12 +158,14 @@ TEST_CASE("lexer class") ...@@ -188,12 +158,14 @@ TEST_CASE("lexer class")
} }
} }
/* NOTE: to_unicode function has been removed
SECTION("to_unicode") SECTION("to_unicode")
{ {
// lexer to call to_unicode on // lexer to call to_unicode on
json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0); json::lexer dummy_lexer("", 0);
CHECK(dummy_lexer.to_unicode(0x1F4A9) == "💩"); CHECK(dummy_lexer.to_unicode(0x1F4A9) == "💩");
CHECK_THROWS_AS(dummy_lexer.to_unicode(0x200000), json::parse_error); CHECK_THROWS_AS(dummy_lexer.to_unicode(0x200000), json::parse_error);
CHECK_THROWS_WITH(dummy_lexer.to_unicode(0x200000), "[json.exception.parse_error.103] parse error: code points above 0x10FFFF are invalid"); CHECK_THROWS_WITH(dummy_lexer.to_unicode(0x200000), "[json.exception.parse_error.103] parse error: code points above 0x10FFFF are invalid");
} }
*/
} }
...@@ -92,7 +92,7 @@ TEST_CASE("deserialization") ...@@ -92,7 +92,7 @@ TEST_CASE("deserialization")
ss2 << "[\"foo\",1,2,3,false,{\"one\":1}"; ss2 << "[\"foo\",1,2,3,false,{\"one\":1}";
CHECK_THROWS_AS(json::parse(ss1), json::parse_error); CHECK_THROWS_AS(json::parse(ss1), json::parse_error);
CHECK_THROWS_WITH(json::parse(ss2), CHECK_THROWS_WITH(json::parse(ss2),
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'"); "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
} }
SECTION("string") SECTION("string")
...@@ -100,7 +100,7 @@ TEST_CASE("deserialization") ...@@ -100,7 +100,7 @@ TEST_CASE("deserialization")
json::string_t s = "[\"foo\",1,2,3,false,{\"one\":1}"; json::string_t s = "[\"foo\",1,2,3,false,{\"one\":1}";
CHECK_THROWS_AS(json::parse(s), json::parse_error); CHECK_THROWS_AS(json::parse(s), json::parse_error);
CHECK_THROWS_WITH(json::parse(s), CHECK_THROWS_WITH(json::parse(s),
"[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'"); "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
} }
SECTION("operator<<") SECTION("operator<<")
...@@ -111,7 +111,7 @@ TEST_CASE("deserialization") ...@@ -111,7 +111,7 @@ TEST_CASE("deserialization")
json j; json j;
CHECK_THROWS_AS(j << ss1, json::parse_error); CHECK_THROWS_AS(j << ss1, json::parse_error);
CHECK_THROWS_WITH(j << ss2, CHECK_THROWS_WITH(j << ss2,
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'"); "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
} }
SECTION("operator>>") SECTION("operator>>")
...@@ -122,14 +122,14 @@ TEST_CASE("deserialization") ...@@ -122,14 +122,14 @@ TEST_CASE("deserialization")
json j; json j;
CHECK_THROWS_AS(ss1 >> j, json::parse_error); CHECK_THROWS_AS(ss1 >> j, json::parse_error);
CHECK_THROWS_WITH(ss2 >> j, CHECK_THROWS_WITH(ss2 >> j,
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'"); "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
} }
SECTION("user-defined string literal") SECTION("user-defined string literal")
{ {
CHECK_THROWS_AS("[\"foo\",1,2,3,false,{\"one\":1}"_json, json::parse_error); CHECK_THROWS_AS("[\"foo\",1,2,3,false,{\"one\":1}"_json, json::parse_error);
CHECK_THROWS_WITH("[\"foo\",1,2,3,false,{\"one\":1}"_json, CHECK_THROWS_WITH("[\"foo\",1,2,3,false,{\"one\":1}"_json,
"[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'"); "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
} }
} }
......
...@@ -594,7 +594,7 @@ TEST_CASE("regression tests") ...@@ -594,7 +594,7 @@ TEST_CASE("regression tests")
// a parse error because of the EOF. // a parse error because of the EOF.
CHECK_THROWS_AS(j << ss, json::parse_error); CHECK_THROWS_AS(j << ss, json::parse_error);
CHECK_THROWS_WITH(j << ss, CHECK_THROWS_WITH(j << ss,
"[json.exception.parse_error.101] parse error at 1: parse error - unexpected end of input"); "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input");
} }
SECTION("issue #389 - Integer-overflow (OSS-Fuzz issue 267)") SECTION("issue #389 - Integer-overflow (OSS-Fuzz issue 267)")
...@@ -911,6 +911,7 @@ TEST_CASE("regression tests") ...@@ -911,6 +911,7 @@ TEST_CASE("regression tests")
CHECK(j["bool_vector"].dump() == "[false,true,false,false]"); CHECK(j["bool_vector"].dump() == "[false,true,false,false]");
} }
/* NOTE: m_line_buffer is not used any more
SECTION("issue #495 - fill_line_buffer incorrectly tests m_stream for eof but not fail or bad bits") SECTION("issue #495 - fill_line_buffer incorrectly tests m_stream for eof but not fail or bad bits")
{ {
SECTION("setting failbit") SECTION("setting failbit")
...@@ -943,6 +944,7 @@ TEST_CASE("regression tests") ...@@ -943,6 +944,7 @@ TEST_CASE("regression tests")
CHECK_THROWS_WITH(l.fill_line_buffer(), "[json.exception.parse_error.111] parse error: bad input stream"); CHECK_THROWS_WITH(l.fill_line_buffer(), "[json.exception.parse_error.111] parse error: bad input stream");
} }
} }
*/
SECTION("issue #504 - assertion error (OSS-Fuzz 856)") SECTION("issue #504 - assertion error (OSS-Fuzz 856)")
{ {
......
...@@ -36,10 +36,11 @@ using nlohmann::json; ...@@ -36,10 +36,11 @@ using nlohmann::json;
TEST_CASE("Unicode", "[hide]") TEST_CASE("Unicode", "[hide]")
{ {
/* NOTE: to_unicode is not used any more
SECTION("full enumeration of Unicode code points") SECTION("full enumeration of Unicode code points")
{ {
// lexer to call to_unicode on // lexer to call to_unicode on
json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0); json::lexer dummy_lexer("", 0);
// create an escaped string from a code point // create an escaped string from a code point
const auto codepoint_to_unicode = [](std::size_t cp) const auto codepoint_to_unicode = [](std::size_t cp)
...@@ -118,6 +119,7 @@ TEST_CASE("Unicode", "[hide]") ...@@ -118,6 +119,7 @@ TEST_CASE("Unicode", "[hide]")
CHECK(j3 == j4); CHECK(j3 == j4);
} }
} }
*/
SECTION("read all unicode characters") SECTION("read all unicode characters")
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment