🚧 manual lexer

This commit removed the re2c lexer and replaced it with a manual (hand-written) version. Its integration is not yet complete: number parsing does not respect locales or detect overflows. Furthermore, the parser does not yet require the input to end with EOF. Therefore, a lot of test cases fail. The idea is to push this branch forward so we can conduct performance comparisons. So far, a nice side effect is better diagnostic messages in case of parse errors.
parent 54db53c2
.PHONY: pretty clean ChangeLog.md
# used programs
RE2C := $(shell command -v re2c 2> /dev/null)
SED = sed
# main target
all:
$(MAKE) -C test
......@@ -183,13 +179,6 @@ clang_sanitize: clean
# maintainer targets
##########################################################################
# create scanner with re2c
re2c: src/json.hpp.re2c
ifndef RE2C
$(error "re2c is not available, please install re2c")
endif
$(RE2C) -W --utf-8 --encoding-policy fail --bit-vectors --nested-ifs --no-debug-info $< | $(SED) '1d' > src/json.hpp
# pretty printer
pretty:
astyle --style=allman --indent=spaces=4 --indent-modifiers \
......@@ -197,7 +186,7 @@ pretty:
--indent-col1-comments --pad-oper --pad-header --align-pointer=type \
--align-reference=type --add-brackets --convert-tabs --close-templates \
--lineend=linux --preserve-date --suffix=none --formatted \
src/json.hpp src/json.hpp.re2c test/src/*.cpp \
src/json.hpp test/src/*.cpp \
benchmarks/benchmarks.cpp doc/examples/*.cpp
......
- test/test-class_parser
- 617 failed
- test/test-regression
- 11 failed
- test/test-testsuites
- 43 failed
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -38,79 +38,50 @@ TEST_CASE("lexer class")
{
SECTION("structural characters")
{
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("["),
1).scan() == json::lexer::token_type::begin_array));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("]"),
1).scan() == json::lexer::token_type::end_array));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("{"),
1).scan() == json::lexer::token_type::begin_object));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("}"),
1).scan() == json::lexer::token_type::end_object));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(","),
1).scan() == json::lexer::token_type::value_separator));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(":"),
1).scan() == json::lexer::token_type::name_separator));
CHECK((json::lexer("[", 1).scan() == json::lexer::token_type::begin_array));
CHECK((json::lexer("]", 1).scan() == json::lexer::token_type::end_array));
CHECK((json::lexer("{", 1).scan() == json::lexer::token_type::begin_object));
CHECK((json::lexer("}", 1).scan() == json::lexer::token_type::end_object));
CHECK((json::lexer(",", 1).scan() == json::lexer::token_type::value_separator));
CHECK((json::lexer(":", 1).scan() == json::lexer::token_type::name_separator));
}
SECTION("literal names")
{
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("null"),
4).scan() == json::lexer::token_type::literal_null));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("true"),
4).scan() == json::lexer::token_type::literal_true));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("false"),
5).scan() == json::lexer::token_type::literal_false));
CHECK((json::lexer("null", 4).scan() == json::lexer::token_type::literal_null));
CHECK((json::lexer("true", 4).scan() == json::lexer::token_type::literal_true));
CHECK((json::lexer("false", 5).scan() == json::lexer::token_type::literal_false));
}
SECTION("numbers")
{
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("0"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("2"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("3"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("4"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("5"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("6"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("7"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("8"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("9"),
1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-0"),
2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1"),
2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1.1"),
3).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1.1"),
4).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1E10"),
4).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer("0", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("1", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("2", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("3", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("4", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("5", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("6", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("7", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("8", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("9", 1).scan() == json::lexer::token_type::value_unsigned));
CHECK((json::lexer("-0", 2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer("-1", 2).scan() == json::lexer::token_type::value_integer));
CHECK((json::lexer("1.1", 3).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer("-1.1", 4).scan() == json::lexer::token_type::value_float));
CHECK((json::lexer("1E10", 4).scan() == json::lexer::token_type::value_float));
}
SECTION("whitespace")
{
// result is end_of_input, because no token follows the whitespace
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" "),
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\t"),
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\n"),
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\r"),
1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" \t\n\r\n\t "),
7).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(" ", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer("\t", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer("\n", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer("\r", 1).scan() == json::lexer::token_type::end_of_input));
CHECK((json::lexer(" \t\n\r\n\t ", 7).scan() == json::lexer::token_type::end_of_input));
}
}
......@@ -141,8 +112,7 @@ TEST_CASE("lexer class")
// create string from the ASCII code
const auto s = std::string(1, static_cast<char>(c));
// store scan() result
const auto res = json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(s.c_str()),
1).scan();
const auto res = json::lexer(s.c_str(), 1).scan();
switch (c)
{
......@@ -188,12 +158,14 @@ TEST_CASE("lexer class")
}
}
/* NOTE: to_unicode function has been removed
SECTION("to_unicode")
{
// lexer to call to_unicode on
json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
json::lexer dummy_lexer("", 0);
CHECK(dummy_lexer.to_unicode(0x1F4A9) == "💩");
CHECK_THROWS_AS(dummy_lexer.to_unicode(0x200000), json::parse_error);
CHECK_THROWS_WITH(dummy_lexer.to_unicode(0x200000), "[json.exception.parse_error.103] parse error: code points above 0x10FFFF are invalid");
}
*/
}
......@@ -92,7 +92,7 @@ TEST_CASE("deserialization")
ss2 << "[\"foo\",1,2,3,false,{\"one\":1}";
CHECK_THROWS_AS(json::parse(ss1), json::parse_error);
CHECK_THROWS_WITH(json::parse(ss2),
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
"[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
}
SECTION("string")
......@@ -100,7 +100,7 @@ TEST_CASE("deserialization")
json::string_t s = "[\"foo\",1,2,3,false,{\"one\":1}";
CHECK_THROWS_AS(json::parse(s), json::parse_error);
CHECK_THROWS_WITH(json::parse(s),
"[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
"[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
}
SECTION("operator<<")
......@@ -111,7 +111,7 @@ TEST_CASE("deserialization")
json j;
CHECK_THROWS_AS(j << ss1, json::parse_error);
CHECK_THROWS_WITH(j << ss2,
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
"[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
}
SECTION("operator>>")
......@@ -122,14 +122,14 @@ TEST_CASE("deserialization")
json j;
CHECK_THROWS_AS(ss1 >> j, json::parse_error);
CHECK_THROWS_WITH(ss2 >> j,
"[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
"[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
}
SECTION("user-defined string literal")
{
CHECK_THROWS_AS("[\"foo\",1,2,3,false,{\"one\":1}"_json, json::parse_error);
CHECK_THROWS_WITH("[\"foo\",1,2,3,false,{\"one\":1}"_json,
"[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
"[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
}
}
......
......@@ -594,7 +594,7 @@ TEST_CASE("regression tests")
// a parse error because of the EOF.
CHECK_THROWS_AS(j << ss, json::parse_error);
CHECK_THROWS_WITH(j << ss,
"[json.exception.parse_error.101] parse error at 1: parse error - unexpected end of input");
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input");
}
SECTION("issue #389 - Integer-overflow (OSS-Fuzz issue 267)")
......@@ -911,6 +911,7 @@ TEST_CASE("regression tests")
CHECK(j["bool_vector"].dump() == "[false,true,false,false]");
}
/* NOTE: m_line_buffer is not used any more
SECTION("issue #495 - fill_line_buffer incorrectly tests m_stream for eof but not fail or bad bits")
{
SECTION("setting failbit")
......@@ -943,6 +944,7 @@ TEST_CASE("regression tests")
CHECK_THROWS_WITH(l.fill_line_buffer(), "[json.exception.parse_error.111] parse error: bad input stream");
}
}
*/
SECTION("issue #504 - assertion error (OSS-Fuzz 856)")
{
......
......@@ -36,10 +36,11 @@ using nlohmann::json;
TEST_CASE("Unicode", "[hide]")
{
/* NOTE: to_unicode is not used any more
SECTION("full enumeration of Unicode code points")
{
// lexer to call to_unicode on
json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
json::lexer dummy_lexer("", 0);
// create an escaped string from a code point
const auto codepoint_to_unicode = [](std::size_t cp)
......@@ -118,6 +119,7 @@ TEST_CASE("Unicode", "[hide]")
CHECK(j3 == j4);
}
}
*/
SECTION("read all unicode characters")
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment