Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
J
json
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chen Yisong
json
Commits
6d2c0a79
Unverified
Commit
6d2c0a79
authored
Apr 23, 2017
by
Niels Lohmann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
✅
added more Unicode test cases
parent
734297ff
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
69 additions
and
44 deletions
+69
-44
unit-unicode.cpp
test/src/unit-unicode.cpp
+69
-44
No files found.
test/src/unit-unicode.cpp
View file @
6d2c0a79
...
@@ -74,8 +74,10 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte
...
@@ -74,8 +74,10 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte
}
}
}
}
TEST_CASE
(
"
RFC 3629
"
,
"[hide]"
)
TEST_CASE
(
"
Unicode
"
,
"[hide]"
)
{
{
SECTION
(
"RFC 3629"
)
{
/*
/*
RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
follows:
follows:
...
@@ -850,20 +852,14 @@ TEST_CASE("RFC 3629", "[hide]")
...
@@ -850,20 +852,14 @@ TEST_CASE("RFC 3629", "[hide]")
}
}
}
}
}
}
}
}
TEST_CASE
(
"Unicode"
,
"[hide]"
)
SECTION
(
"
\\
uxxxx sequences"
)
{
/* NOTE: to_unicode is not used any more
SECTION("full enumeration of Unicode code points")
{
{
// lexer to call to_unicode on
json::lexer dummy_lexer("", 0);
// create an escaped string from a code point
// create an escaped string from a code point
const
auto
codepoint_to_unicode
=
[](
std
::
size_t
cp
)
const
auto
codepoint_to_unicode
=
[](
std
::
size_t
cp
)
{
{
// co
pd
points are represented as a six-character sequence: a
// co
de
points are represented as a six-character sequence: a
// reverse solidus, followed by the lowercase letter u, followed
// reverse solidus, followed by the lowercase letter u, followed
// by four hexadecimal digits that encode the character's code
// by four hexadecimal digits that encode the character's code
// point
// point
...
@@ -872,11 +868,19 @@ TEST_CASE("Unicode", "[hide]")
...
@@ -872,11 +868,19 @@ TEST_CASE("Unicode", "[hide]")
return
ss
.
str
();
return
ss
.
str
();
};
};
SECTION
(
"correct sequences"
)
{
// generate all UTF-8 code points; in total, 1112064 code points are
// generate all UTF-8 code points; in total, 1112064 code points are
// generated: 0x1FFFFF code points - 2048 invalid values between
// generated: 0x1FFFFF code points - 2048 invalid values between
// 0xD800 and 0xDFFF.
// 0xD800 and 0xDFFF.
for
(
std
::
size_t
cp
=
0
;
cp
<=
0x10FFFFu
;
++
cp
)
for
(
std
::
size_t
cp
=
0
;
cp
<=
0x10FFFFu
;
++
cp
)
{
{
// string to store the code point as in \uxxxx format
std
::
string
json_text
=
"
\"
"
;
// decide whether to use one or two \uxxxx sequences
if
(
cp
<
0x10000u
)
{
// The Unicode standard permanently reserves these code point
// The Unicode standard permanently reserves these code point
// values for UTF-16 encoding of the high and low surrogates, and
// values for UTF-16 encoding of the high and low surrogates, and
// they will never be assigned a character, so there should be no
// they will never be assigned a character, so there should be no
...
@@ -889,26 +893,9 @@ TEST_CASE("Unicode", "[hide]")
...
@@ -889,26 +893,9 @@ TEST_CASE("Unicode", "[hide]")
continue
;
continue
;
}
}
// string to store the code point as in \uxxxx format
std::string escaped_string;
// string to store the code point as unescaped character sequence
std::string unescaped_string;
if (cp < 0x10000u)
{
// code points in the Basic Multilingual Plane can be
// code points in the Basic Multilingual Plane can be
// represented with one \\uxxxx sequence
// represented with one \uxxxx sequence
escaped_string = codepoint_to_unicode(cp);
json_text
+=
codepoint_to_unicode
(
cp
);
// All Unicode characters may be placed within the quotation
// marks, except for the characters that must be escaped:
// quotation mark, reverse solidus, and the control characters
// (U+0000 through U+001F); we ignore these code points as
// they are checked with codepoint_to_unicode.
if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
{
unescaped_string = dummy_lexer.to_unicode(cp);
}
}
}
else
else
{
{
...
@@ -917,27 +904,65 @@ TEST_CASE("Unicode", "[hide]")
...
@@ -917,27 +904,65 @@ TEST_CASE("Unicode", "[hide]")
// 12-character sequence, encoding the UTF-16 surrogate pair
// 12-character sequence, encoding the UTF-16 surrogate pair
const
auto
codepoint1
=
0xd800u
+
(((
cp
-
0x10000u
)
>>
10
)
&
0x3ffu
);
const
auto
codepoint1
=
0xd800u
+
(((
cp
-
0x10000u
)
>>
10
)
&
0x3ffu
);
const
auto
codepoint2
=
0xdc00u
+
((
cp
-
0x10000u
)
&
0x3ffu
);
const
auto
codepoint2
=
0xdc00u
+
((
cp
-
0x10000u
)
&
0x3ffu
);
escaped_string = codepoint_to_unicode(codepoint1);
json_text
+=
codepoint_to_unicode
(
codepoint1
)
+
codepoint_to_unicode
(
codepoint2
);
escaped_string += codepoint_to_unicode(codepoint2);
unescaped_string += dummy_lexer.to_unicode(codepoint1, codepoint2);
}
}
// all other code points are valid and must not yield parse errors
json_text
+=
"
\"
"
;
CAPTURE(cp);
CAPTURE
(
json_text
);
CAPTURE(escaped_string);
CHECK_NOTHROW
(
json
::
parse
(
json_text
));
CAPTURE(unescaped_string);
}
}
json j1, j2, j3, j4;
SECTION
(
"incorrect sequences"
)
CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
{
CHECK_NOTHROW(j2 = json::parse(j1.dump()));
SECTION
(
"high surrogate without low surrogate"
)
CHECK(j1 == j2);
{
// D800..DBFF are high surrogates and must be followed by low
// surrogates DC00..DFFF; here, nothing follows
for
(
std
::
size_t
cp
=
0xD800u
;
cp
<=
0xDBFFu
;
++
cp
)
{
std
::
string
json_text
=
"
\"
"
+
codepoint_to_unicode
(
cp
)
+
"
\"
"
;
CAPTURE
(
json_text
);
CHECK_THROWS_AS
(
json
::
parse
(
json_text
),
json
::
parse_error
);
}
}
#if 0
SECTION("high surrogate with wrong low surrogate")
{
// D800..DBFF are high surrogates and must be followed by low
// surrogates DC00..DFFF; here a different sequence follows
for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
{
for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
{
if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
{
continue;
}
std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
CAPTURE(json_text);
CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
}
}
}
#endif
SECTION
(
"low surrogate without high surrogate"
)
{
// low surrogates DC00..DFFF must follow high surrogates; here,
// they occur alone
for
(
std
::
size_t
cp
=
0xDC00u
;
cp
<=
0xDFFFu
;
++
cp
)
{
std
::
string
json_text
=
"
\"
"
+
codepoint_to_unicode
(
cp
)
+
"
\"
"
;
CAPTURE
(
json_text
);
CHECK_THROWS_AS
(
json
::
parse
(
json_text
),
json
::
parse_error
);
}
}
CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
CHECK_NOTHROW(j4 = json::parse(j3.dump()));
CHECK(j3 == j4);
}
}
}
}
*/
SECTION
(
"read all unicode characters"
)
SECTION
(
"read all unicode characters"
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment