Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
J
json
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chen Yisong
json
Commits
f1f72403
Commit
f1f72403
authored
Feb 15, 2015
by
Niels
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
some unicode magic
parent
3e885c83
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
96 additions
and
22 deletions
+96
-22
json.hpp
src/json.hpp
+43
-8
json.hpp.re2c
src/json.hpp.re2c
+43
-8
unit.cpp
test/unit.cpp
+10
-6
No files found.
src/json.hpp
View file @
f1f72403
...
@@ -2497,14 +2497,37 @@ class basic_json
...
@@ -2497,14 +2497,37 @@ class basic_json
@param codepoint the code point (must be in [0x0, 0x10ffff])
@param codepoint the code point (must be in [0x0, 0x10ffff])
@return string representation of the code point
@return string representation of the code point
@exception std::out_of_range if code point is >0x10ffff
@exception std::out_of_range if code point is >0x10ffff
@exception std::invalid_argument if the low surrogate is invalid
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
*/
*/
inline
static
string_t
to_unicode
(
const
size_t
codepoint
)
inline
static
string_t
to_unicode
(
const
size_t
codepoint
1
,
size_t
codepoint2
=
0
)
{
{
string_t
result
;
string_t
result
;
// calculate the codepoint from the given code points
size_t
codepoint
=
codepoint1
;
if
(
codepoint1
>=
0xD800
and
codepoint1
<=
0xDBFF
)
{
if
(
codepoint2
>=
0xDC00
and
codepoint2
<=
0xDFFF
)
{
codepoint
=
// high surrogate, shifted left by 10: its low 10 bits become bits 10-19 of the result
(
codepoint1
<<
10
)
// low surrogate: its low 10 bits become bits 0-9 of the result
+
codepoint2
// there is still the 0xD800, 0xDC00 and 0x10000 noise
// in the result so we have to subtract:
// (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
-
0x35FDC00
;
}
else
{
throw
std
::
invalid_argument
(
"missing or wrong low surrogate"
);
}
}
if
(
codepoint
<=
0x7f
)
if
(
codepoint
<=
0x7f
)
{
{
// 1-byte characters: 0xxxxxxx (ASCII)
// 1-byte characters: 0xxxxxxx (ASCII)
...
@@ -3394,12 +3417,24 @@ basic_json_parser_59:
...
@@ -3394,12 +3417,24 @@ basic_json_parser_59:
// unicode
// unicode
case
'u'
:
case
'u'
:
{
{
// get code xxxx from \uxxxx
// get code xxxx from uxxxx
auto
codepoint
=
std
::
strtoul
(
i
+
1
,
nullptr
,
16
);
auto
codepoint
=
std
::
strtoul
(
std
::
string
(
i
+
1
,
4
).
c_str
(),
nullptr
,
16
);
// add unicode character(s)
result
+=
to_unicode
(
codepoint
);
if
(
codepoint
>=
0xD800
and
codepoint
<=
0xDBFF
)
// skip the next four characters (\uxxxx)
{
i
+=
4
;
// get code yyyy from uxxxx\uyyyy
auto
codepoint2
=
std
::
strtoul
(
std
::
string
(
i
+
7
,
4
).
c_str
(),
nullptr
,
16
);
result
+=
to_unicode
(
codepoint
,
codepoint2
);
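// skip the rest of the surrogate pair (xxxx\uyyyy is 10 characters) -- NOTE(review): the step of 11 below looks off by one; confirm against the enclosing loop's own increment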
i
+=
11
;
}
else
{
// add unicode character(s)
result
+=
to_unicode
(
codepoint
);
// skip the next four characters (xxxx)
i
+=
4
;
}
break
;
break
;
}
}
}
}
...
...
src/json.hpp.re2c
View file @
f1f72403
...
@@ -2497,14 +2497,37 @@ class basic_json
...
@@ -2497,14 +2497,37 @@ class basic_json
@param codepoint the code point (must be in [0x0, 0x10ffff])
@param codepoint the code point (must be in [0x0, 0x10ffff])
@return string representation of the code point
@return string representation of the code point
@exception std::out_of_range if code point is >0x10ffff
@exception std::out_of_range if code point is >0x10ffff
@exception std::invalid_argument if the low surrogate is invalid
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
@see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
*/
*/
inline static string_t to_unicode(const size_t codepoint)
inline static string_t to_unicode(const size_t codepoint
1, size_t codepoint2 = 0
)
{
{
string_t result;
string_t result;
// calculate the codepoint from the given code points
size_t codepoint = codepoint1;
if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
{
if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
{
codepoint =
// high surrogate, shifted left by 10: its low 10 bits become bits 10-19 of the result
(codepoint1 << 10)
// low surrogate: its low 10 bits become bits 0-9 of the result
+ codepoint2
// there is still the 0xD800, 0xDC00 and 0x10000 noise
// in the result so we have to subtract:
// (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
- 0x35FDC00;
}
else
{
throw std::invalid_argument("missing or wrong low surrogate");
}
}
if (codepoint <= 0x7f)
if (codepoint <= 0x7f)
{
{
// 1-byte characters: 0xxxxxxx (ASCII)
// 1-byte characters: 0xxxxxxx (ASCII)
...
@@ -2743,12 +2766,24 @@ class basic_json
...
@@ -2743,12 +2766,24 @@ class basic_json
// unicode
// unicode
case 'u':
case 'u':
{
{
// get code xxxx from \uxxxx
// get code xxxx from uxxxx
auto codepoint = std::strtoul(i + 1, nullptr, 16);
auto codepoint = std::strtoul(std::string(i + 1, 4).c_str(), nullptr, 16);
// add unicode character(s)
result += to_unicode(codepoint);
if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
// skip the next four characters (\uxxxx)
{
i += 4;
// get code yyyy from uxxxx\uyyyy
auto codepoint2 = std::strtoul(std::string(i + 7, 4).c_str(), nullptr, 16);
result += to_unicode(codepoint, codepoint2);
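// skip the rest of the surrogate pair (xxxx\uyyyy is 10 characters) -- NOTE(review): the step of 11 below looks off by one; confirm against the enclosing loop's own increment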
i += 11;
}
else
{
// add unicode character(s)
result += to_unicode(codepoint);
// skip the next four characters (xxxx)
i += 4;
}
break;
break;
}
}
}
}
...
...
test/unit.cpp
View file @
f1f72403
...
@@ -5645,6 +5645,9 @@ TEST_CASE("parser class")
...
@@ -5645,6 +5645,9 @@ TEST_CASE("parser class")
CHECK
(
json
::
parser
(
"
\"\\
u2000
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
" "
);
CHECK
(
json
::
parser
(
"
\"\\
u2000
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
" "
);
CHECK
(
json
::
parser
(
"
\"\\
uFFFF
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
""
);
CHECK
(
json
::
parser
(
"
\"\\
uFFFF
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
""
);
CHECK
(
json
::
parser
(
"
\"\\
u20AC
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
"€"
);
CHECK
(
json
::
parser
(
"
\"\\
u20AC
\"
"
).
parse
().
get
<
json
::
string_t
>
()
==
"€"
);
CHECK
(
json
::
parse
(
"
\"\\
ud80c
\\
udc60
\"
"
).
get
<
json
::
string_t
>
()
==
u8"\U00013060"
);
CHECK
(
json
::
parse
(
"
\"\\
ud83c
\\
udf1e
\"
"
).
get
<
json
::
string_t
>
()
==
"🌞"
);
}
}
}
}
...
@@ -5893,10 +5896,12 @@ TEST_CASE("parser class")
...
@@ -5893,10 +5896,12 @@ TEST_CASE("parser class")
}
}
}
}
}
}
// missing part of a surrogate pair
CHECK_THROWS_AS
(
json
::
parse
(
"
\"\\
uD80C
\"
"
),
std
::
invalid_argument
);
// invalid surrogate pair
CHECK_THROWS_AS
(
json
::
parse
(
"
\"\\
uD80C
\\
uD80C
\"
"
),
std
::
invalid_argument
);
CHECK_THROWS_AS
(
json
::
parse
(
"
\"\\
uD80C
\\
u0000
\"
"
),
std
::
invalid_argument
);
CHECK_THROWS_AS
(
json
::
parse
(
"
\"\\
uD80C
\\
uFFFF
\"
"
),
std
::
invalid_argument
);
}
}
}
}
// Anonymous test case: parses a JSON string made of two consecutive \uXXXX
// escapes (0049 and 004e) and checks that the decoded value equals "IN".
TEST_CASE()
{
    const auto decoded = json::parser("\"\\u0049\\u004e\"").parse().get<json::string_t>();
    CHECK(decoded == "IN");
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment