🚧 manual lexer

This commit removed the re2c lexer and replaced it with a manual version. Its integration is not yet complete: number parsing does not yet respect locales or detect overflows. Furthermore, parsing is not yet required to end with EOF. As a result, a lot of test cases fail. The idea is to push this branch forward so we can conduct performance comparisons. So far, a nice side effect is better diagnostic messages in case of parse errors.

🚧 manual lexer
40160f48 · Niels Lohmann · 54db53c2 · 40160f48 · 40160f48 · 40160f48
Unverified Commit 40160f48 authored Mar 24, 2017 by Niels Lohmann
9 changed files
--- a/Makefile
+++ b/Makefile
 .PHONY: pretty clean ChangeLog.md

-# used programs
-RE2C := $(shell command -v re2c 2> /dev/null)
-SED = sed
-
 # main target
 all:
 	$(MAKE) -C test
@@ -183,13 +179,6 @@ clang_sanitize: clean
 # maintainer targets
 ##########################################################################

-# create scanner with re2c
-re2c: src/json.hpp.re2c
-ifndef RE2C
-	$(error "re2c is not available, please install re2c")
-endif
-	$(RE2C) -W --utf-8 --encoding-policy fail --bit-vectors --nested-ifs --no-debug-info $< | $(SED) '1d' > src/json.hpp
-
 # pretty printer
 pretty:
 	astyle --style=allman --indent=spaces=4 --indent-modifiers \
@@ -197,7 +186,7 @@ pretty:
 	   --indent-col1-comments --pad-oper --pad-header --align-pointer=type \
 	   --align-reference=type --add-brackets --convert-tabs --close-templates \
 	   --lineend=linux --preserve-date --suffix=none --formatted \
-	   src/json.hpp src/json.hpp.re2c test/src/*.cpp \
+	   src/json.hpp test/src/*.cpp \
 	   benchmarks/benchmarks.cpp doc/examples/*.cpp



--- a/errors.txt
+++ b/errors.txt
+- test/test-class_parser
+    - 617 failed
+- test/test-regression
+    - 11 failed
+- test/test-testsuites
+    - 43 failed
+
--- a/src/json.hpp
+++ b/src/json.hpp
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
--- a/test/src/unit-class_lexer.cpp
+++ b/test/src/unit-class_lexer.cpp
@@ -38,79 +38,50 @@ TEST_CASE("lexer class")
    {
        SECTION("structural characters")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("["),
-                               1).scan() == json::lexer::token_type::begin_array));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("]"),
-                               1).scan() == json::lexer::token_type::end_array));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("{"),
-                               1).scan() == json::lexer::token_type::begin_object));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("}"),
-                               1).scan() == json::lexer::token_type::end_object));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(","),
-                               1).scan() == json::lexer::token_type::value_separator));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(":"),
-                               1).scan() == json::lexer::token_type::name_separator));
+            CHECK((json::lexer("[", 1).scan() == json::lexer::token_type::begin_array));
+            CHECK((json::lexer("]", 1).scan() == json::lexer::token_type::end_array));
+            CHECK((json::lexer("{", 1).scan() == json::lexer::token_type::begin_object));
+            CHECK((json::lexer("}", 1).scan() == json::lexer::token_type::end_object));
+            CHECK((json::lexer(",", 1).scan() == json::lexer::token_type::value_separator));
+            CHECK((json::lexer(":", 1).scan() == json::lexer::token_type::name_separator));
        }

        SECTION("literal names")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("null"),
-                               4).scan() == json::lexer::token_type::literal_null));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("true"),
-                               4).scan() == json::lexer::token_type::literal_true));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("false"),
-                               5).scan() == json::lexer::token_type::literal_false));
+            CHECK((json::lexer("null", 4).scan() == json::lexer::token_type::literal_null));
+            CHECK((json::lexer("true", 4).scan() == json::lexer::token_type::literal_true));
+            CHECK((json::lexer("false", 5).scan() == json::lexer::token_type::literal_false));
        }

        SECTION("numbers")
        {
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("0"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("2"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("3"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("4"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("5"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("6"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("7"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("8"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("9"),
-                               1).scan() == json::lexer::token_type::value_unsigned));
-
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-0"),
-                               2).scan() == json::lexer::token_type::value_integer));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1"),
-                               2).scan() == json::lexer::token_type::value_integer));
-
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1.1"),
-                               3).scan() == json::lexer::token_type::value_float));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("-1.1"),
-                               4).scan() == json::lexer::token_type::value_float));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("1E10"),
-                               4).scan() == json::lexer::token_type::value_float));
+            CHECK((json::lexer("0", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("1", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("2", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("3", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("4", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("5", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("6", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("7", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("8", 1).scan() == json::lexer::token_type::value_unsigned));
+            CHECK((json::lexer("9", 1).scan() == json::lexer::token_type::value_unsigned));
+
+            CHECK((json::lexer("-0", 2).scan() == json::lexer::token_type::value_integer));
+            CHECK((json::lexer("-1", 2).scan() == json::lexer::token_type::value_integer));
+
+            CHECK((json::lexer("1.1", 3).scan() == json::lexer::token_type::value_float));
+            CHECK((json::lexer("-1.1", 4).scan() == json::lexer::token_type::value_float));
+            CHECK((json::lexer("1E10", 4).scan() == json::lexer::token_type::value_float));
        }

        SECTION("whitespace")
        {
            // result is end_of_input, because not token is following
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" "),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\t"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\n"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>("\r"),
-                               1).scan() == json::lexer::token_type::end_of_input));
-            CHECK((json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(" \t\n\r\n\t "),
-                               7).scan() == json::lexer::token_type::end_of_input));
+            CHECK((json::lexer(" ", 1).scan() == json::lexer::token_type::end_of_input));
+            CHECK((json::lexer("\t", 1).scan() == json::lexer::token_type::end_of_input));
+            CHECK((json::lexer("\n", 1).scan() == json::lexer::token_type::end_of_input));
+            CHECK((json::lexer("\r", 1).scan() == json::lexer::token_type::end_of_input));
+            CHECK((json::lexer(" \t\n\r\n\t ", 7).scan() == json::lexer::token_type::end_of_input));
        }
    }

@@ -141,8 +112,7 @@ TEST_CASE("lexer class")
            // create string from the ASCII code
            const auto s = std::string(1, static_cast<char>(c));
            // store scan() result
-            const auto res = json::lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(s.c_str()),
-                                         1).scan();
+            const auto res = json::lexer(s.c_str(), 1).scan();

            switch (c)
            {
@@ -188,12 +158,14 @@ TEST_CASE("lexer class")
        }
    }

+    /* NOTE: to_unicode function has been removed
    SECTION("to_unicode")
    {
        // lexer to call to_unicode on
-        json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
+        json::lexer dummy_lexer("", 0);
        CHECK(dummy_lexer.to_unicode(0x1F4A9) == "💩");
        CHECK_THROWS_AS(dummy_lexer.to_unicode(0x200000), json::parse_error);
        CHECK_THROWS_WITH(dummy_lexer.to_unicode(0x200000), "[json.exception.parse_error.103] parse error: code points above 0x10FFFF are invalid");
    }
+    */
 }
--- a/test/src/unit-class_parser.cpp
+++ b/test/src/unit-class_parser.cpp
--- a/test/src/unit-deserialization.cpp
+++ b/test/src/unit-deserialization.cpp
@@ -92,7 +92,7 @@ TEST_CASE("deserialization")
            ss2 << "[\"foo\",1,2,3,false,{\"one\":1}";
            CHECK_THROWS_AS(json::parse(ss1), json::parse_error);
            CHECK_THROWS_WITH(json::parse(ss2),
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("string")
@@ -100,7 +100,7 @@ TEST_CASE("deserialization")
            json::string_t s = "[\"foo\",1,2,3,false,{\"one\":1}";
            CHECK_THROWS_AS(json::parse(s), json::parse_error);
            CHECK_THROWS_WITH(json::parse(s),
-                              "[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("operator<<")
@@ -111,7 +111,7 @@ TEST_CASE("deserialization")
            json j;
            CHECK_THROWS_AS(j << ss1, json::parse_error);
            CHECK_THROWS_WITH(j << ss2,
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("operator>>")
@@ -122,14 +122,14 @@ TEST_CASE("deserialization")
            json j;
            CHECK_THROWS_AS(ss1 >> j, json::parse_error);
            CHECK_THROWS_WITH(ss2 >> j,
-                              "[json.exception.parse_error.101] parse error at 30: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }

        SECTION("user-defined string literal")
        {
            CHECK_THROWS_AS("[\"foo\",1,2,3,false,{\"one\":1}"_json, json::parse_error);
            CHECK_THROWS_WITH("[\"foo\",1,2,3,false,{\"one\":1}"_json,
-                              "[json.exception.parse_error.101] parse error at 29: parse error - unexpected end of input; expected ']'");
+                              "[json.exception.parse_error.101] parse error at 29: syntax error - unexpected end of input; expected ']'");
        }
    }


--- a/test/src/unit-regression.cpp
+++ b/test/src/unit-regression.cpp
@@ -594,7 +594,7 @@ TEST_CASE("regression tests")
        // a parse error because of the EOF.
        CHECK_THROWS_AS(j << ss, json::parse_error);
        CHECK_THROWS_WITH(j << ss,
-                          "[json.exception.parse_error.101] parse error at 1: parse error - unexpected end of input");
+                          "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input");
    }

    SECTION("issue #389 - Integer-overflow (OSS-Fuzz issue 267)")
@@ -911,6 +911,7 @@ TEST_CASE("regression tests")
        CHECK(j["bool_vector"].dump() == "[false,true,false,false]");
    }

+    /* NOTE: m_line_buffer is not used any more
    SECTION("issue #495 - fill_line_buffer incorrectly tests m_stream for eof but not fail or bad bits")
    {
        SECTION("setting failbit")
@@ -943,6 +944,7 @@ TEST_CASE("regression tests")
            CHECK_THROWS_WITH(l.fill_line_buffer(), "[json.exception.parse_error.111] parse error: bad input stream");
        }
    }
+     */

    SECTION("issue #504 - assertion error (OSS-Fuzz 856)")
    {

--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@@ -36,10 +36,11 @@ using nlohmann::json;

 TEST_CASE("Unicode", "[hide]")
 {
+    /* NOTE: to_unicode is not used any more
    SECTION("full enumeration of Unicode code points")
    {
        // lexer to call to_unicode on
-        json::lexer dummy_lexer(reinterpret_cast<const json::lexer::lexer_char_t*>(""), 0);
+        json::lexer dummy_lexer("", 0);

        // create an escaped string from a code point
        const auto codepoint_to_unicode = [](std::size_t cp)
@@ -118,6 +119,7 @@ TEST_CASE("Unicode", "[hide]")
            CHECK(j3 == j4);
        }
    }
+     */

    SECTION("read all unicode characters")
    {