Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
J
json
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chen Yisong
json
Commits
3a0743db
Unverified
Commit
3a0743db
authored
Aug 14, 2017
by
Théo DELRIEU
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add detail/parsing/lexer.hpp
parent
7ab3e8d7
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1279 additions
and
1256 deletions
+1279
-1256
Makefile
Makefile
+2
-1
lexer.hpp
src/detail/parsing/lexer.hpp
+1273
-0
json.hpp
src/json.hpp
+4
-1255
No files found.
Makefile
View file @
3a0743db
...
@@ -10,7 +10,8 @@ SRCS = ${SRCDIR}/json.hpp \
...
@@ -10,7 +10,8 @@ SRCS = ${SRCDIR}/json.hpp \
${
SRCDIR
}
/detail/value_t.hpp
\
${
SRCDIR
}
/detail/value_t.hpp
\
${
SRCDIR
}
/detail/conversions/from_json.hpp
\
${
SRCDIR
}
/detail/conversions/from_json.hpp
\
${
SRCDIR
}
/detail/conversions/to_json.hpp
\
${
SRCDIR
}
/detail/conversions/to_json.hpp
\
${
SRCDIR
}
/detail/parsing/input_adapters.hpp
${
SRCDIR
}
/detail/parsing/input_adapters.hpp
\
${
SRCDIR
}
/detail/parsing/lexer.hpp
...
...
src/detail/parsing/lexer.hpp
0 → 100644
View file @
3a0743db
#ifndef NLOHMANN_JSON_DETAIL_PARSING_LEXER_HPP
#define NLOHMANN_JSON_DETAIL_PARSING_LEXER_HPP
#include <clocale>
#include <cstddef>
#include <vector>
#include "detail/macro_scope.hpp"
#include "detail/parsing/input_adapters.hpp"
namespace
nlohmann
{
namespace
detail
{
///////////
// lexer //
///////////
/*!
@brief lexical analysis
This class organizes the lexical analysis during JSON deserialization.
*/
template
<
typename
BasicJsonType
>
class
lexer
{
using
number_integer_t
=
typename
BasicJsonType
::
number_integer_t
;
using
number_unsigned_t
=
typename
BasicJsonType
::
number_unsigned_t
;
using
number_float_t
=
typename
BasicJsonType
::
number_float_t
;
public
:
/// token types for the parser
enum
class
token_type
{
uninitialized
,
///< indicating the scanner is uninitialized
literal_true
,
///< the `true` literal
literal_false
,
///< the `false` literal
literal_null
,
///< the `null` literal
value_string
,
///< a string -- use get_string() for actual value
value_unsigned
,
///< an unsigned integer -- use get_number_unsigned() for actual value
value_integer
,
///< a signed integer -- use get_number_integer() for actual value
value_float
,
///< an floating point number -- use get_number_float() for actual value
begin_array
,
///< the character for array begin `[`
begin_object
,
///< the character for object begin `{`
end_array
,
///< the character for array end `]`
end_object
,
///< the character for object end `}`
name_separator
,
///< the name separator `:`
value_separator
,
///< the value separator `,`
parse_error
,
///< indicating a parse error
end_of_input
,
///< indicating the end of the input buffer
literal_or_value
///< a literal or the begin of a value (only for diagnostics)
};
/// return name of values of type token_type (only used for errors)
static
const
char
*
token_type_name
(
const
token_type
t
)
noexcept
{
switch
(
t
)
{
case
token_type
:
:
uninitialized
:
return
"<uninitialized>"
;
case
token_type
:
:
literal_true
:
return
"true literal"
;
case
token_type
:
:
literal_false
:
return
"false literal"
;
case
token_type
:
:
literal_null
:
return
"null literal"
;
case
token_type
:
:
value_string
:
return
"string literal"
;
case
lexer
:
:
token_type
::
value_unsigned
:
case
lexer
:
:
token_type
::
value_integer
:
case
lexer
:
:
token_type
::
value_float
:
return
"number literal"
;
case
token_type
:
:
begin_array
:
return
"'['"
;
case
token_type
:
:
begin_object
:
return
"'{'"
;
case
token_type
:
:
end_array
:
return
"']'"
;
case
token_type
:
:
end_object
:
return
"'}'"
;
case
token_type
:
:
name_separator
:
return
"':'"
;
case
token_type
:
:
value_separator
:
return
"','"
;
case
token_type
:
:
parse_error
:
return
"<parse error>"
;
case
token_type
:
:
end_of_input
:
return
"end of input"
;
case
token_type
:
:
literal_or_value
:
return
"'[', '{', or a literal"
;
default:
// catch non-enum values
return
"unknown token"
;
// LCOV_EXCL_LINE
}
}
explicit
lexer
(
detail
::
input_adapter_t
adapter
)
:
ia
(
std
::
move
(
adapter
)),
decimal_point_char
(
get_decimal_point
())
{}
// delete because of pointer members
lexer
(
const
lexer
&
)
=
delete
;
lexer
&
operator
=
(
lexer
&
)
=
delete
;
private
:
/////////////////////
// locales
/////////////////////
/// return the locale-dependent decimal point
static
char
get_decimal_point
()
noexcept
{
const
auto
loc
=
localeconv
();
assert
(
loc
!=
nullptr
);
return
(
loc
->
decimal_point
==
nullptr
)
?
'.'
:
*
(
loc
->
decimal_point
);
}
/////////////////////
// scan functions
/////////////////////
/*!
@brief get codepoint from 4 hex characters following `\u`
For input "\u c1 c2 c3 c4" the codepoint is:
(c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
= (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
between the ASCII value of the character and the desired integer value.
@return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
non-hex character)
*/
int
get_codepoint
()
{
// this function only makes sense after reading `\u`
assert
(
current
==
'u'
);
int
codepoint
=
0
;
const
auto
factors
=
{
12
,
8
,
4
,
0
};
for
(
const
auto
factor
:
factors
)
{
get
();
if
(
current
>=
'0'
and
current
<=
'9'
)
{
codepoint
+=
((
current
-
0x30
)
<<
factor
);
}
else
if
(
current
>=
'A'
and
current
<=
'F'
)
{
codepoint
+=
((
current
-
0x37
)
<<
factor
);
}
else
if
(
current
>=
'a'
and
current
<=
'f'
)
{
codepoint
+=
((
current
-
0x57
)
<<
factor
);
}
else
{
return
-
1
;
}
}
assert
(
0x0000
<=
codepoint
and
codepoint
<=
0xFFFF
);
return
codepoint
;
}
/*!
@brief check if the next byte(s) are inside a given range
Adds the current byte and, for each passed range, reads a new byte and
checks if it is inside the range. If a violation was detected, set up an
error message and return false. Otherwise, return true.
@param[in] ranges list of integers; interpreted as list of pairs of
inclusive lower and upper bound, respectively
@pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
1, 2, or 3 pairs. This precondition is enforced by an assertion.
@return true if and only if no range violation was detected
*/
bool
next_byte_in_range
(
std
::
initializer_list
<
int
>
ranges
)
{
assert
(
ranges
.
size
()
==
2
or
ranges
.
size
()
==
4
or
ranges
.
size
()
==
6
);
add
(
current
);
for
(
auto
range
=
ranges
.
begin
();
range
!=
ranges
.
end
();
++
range
)
{
get
();
if
(
JSON_LIKELY
(
*
range
<=
current
and
current
<=
*
(
++
range
)))
{
add
(
current
);
}
else
{
error_message
=
"invalid string: ill-formed UTF-8 byte"
;
return
false
;
}
}
return
true
;
}
/*!
@brief scan a string literal
This function scans a string according to Sect. 7 of RFC 7159. While
scanning, bytes are escaped and copied into buffer yytext. Then the function
returns successfully, yytext is *not* null-terminated (as it may contain \0
bytes), and yytext.size() is the number of bytes in the string.
@return token_type::value_string if string could be successfully scanned,
token_type::parse_error otherwise
@note In case of errors, variable error_message contains a textual
description.
*/
token_type
scan_string
()
{
// reset yytext (ignore opening quote)
reset
();
// we entered the function by reading an open quote
assert
(
current
==
'\"'
);
while
(
true
)
{
// get next character
switch
(
get
())
{
// end of file while parsing string
case
std
:
:
char_traits
<
char
>::
eof
()
:
{
error_message
=
"invalid string: missing closing quote"
;
return
token_type
::
parse_error
;
}
// closing quote
case
'\"'
:
{
return
token_type
::
value_string
;
}
// escapes
case
'\\'
:
{
switch
(
get
())
{
// quotation mark
case
'\"'
:
add
(
'\"'
);
break
;
// reverse solidus
case
'\\'
:
add
(
'\\'
);
break
;
// solidus
case
'/'
:
add
(
'/'
);
break
;
// backspace
case
'b'
:
add
(
'\b'
);
break
;
// form feed
case
'f'
:
add
(
'\f'
);
break
;
// line feed
case
'n'
:
add
(
'\n'
);
break
;
// carriage return
case
'r'
:
add
(
'\r'
);
break
;
// tab
case
't'
:
add
(
'\t'
);
break
;
// unicode escapes
case
'u'
:
{
const
int
codepoint1
=
get_codepoint
();
int
codepoint
=
codepoint1
;
// start with codepoint1
if
(
JSON_UNLIKELY
(
codepoint1
==
-
1
))
{
error_message
=
"invalid string: '
\\
u' must be followed by 4 hex digits"
;
return
token_type
::
parse_error
;
}
// check if code point is a high surrogate
if
(
0xD800
<=
codepoint1
and
codepoint1
<=
0xDBFF
)
{
// expect next \uxxxx entry
if
(
JSON_LIKELY
(
get
()
==
'\\'
and
get
()
==
'u'
))
{
const
int
codepoint2
=
get_codepoint
();
if
(
JSON_UNLIKELY
(
codepoint2
==
-
1
))
{
error_message
=
"invalid string: '
\\
u' must be followed by 4 hex digits"
;
return
token_type
::
parse_error
;
}
// check if codepoint2 is a low surrogate
if
(
JSON_LIKELY
(
0xDC00
<=
codepoint2
and
codepoint2
<=
0xDFFF
))
{
// overwrite codepoint
codepoint
=
// high surrogate occupies the most significant 22 bits
(
codepoint1
<<
10
)
// low surrogate occupies the least significant 15 bits
+
codepoint2
// there is still the 0xD800, 0xDC00 and 0x10000 noise
// in the result so we have to subtract with:
// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
-
0x35FDC00
;
}
else
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"
;
return
token_type
::
parse_error
;
}
}
else
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"
;
return
token_type
::
parse_error
;
}
}
else
{
if
(
JSON_UNLIKELY
(
0xDC00
<=
codepoint1
and
codepoint1
<=
0xDFFF
))
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"
;
return
token_type
::
parse_error
;
}
}
// result of the above calculation yields a proper codepoint
assert
(
0x00
<=
codepoint
and
codepoint
<=
0x10FFFF
);
// translate codepoint into bytes
if
(
codepoint
<
0x80
)
{
// 1-byte characters: 0xxxxxxx (ASCII)
add
(
codepoint
);
}
else
if
(
codepoint
<=
0x7FF
)
{
// 2-byte characters: 110xxxxx 10xxxxxx
add
(
0xC0
|
(
codepoint
>>
6
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
else
if
(
codepoint
<=
0xFFFF
)
{
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
add
(
0xE0
|
(
codepoint
>>
12
));
add
(
0x80
|
((
codepoint
>>
6
)
&
0x3F
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
else
{
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
add
(
0xF0
|
(
codepoint
>>
18
));
add
(
0x80
|
((
codepoint
>>
12
)
&
0x3F
));
add
(
0x80
|
((
codepoint
>>
6
)
&
0x3F
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
break
;
}
// other characters after escape
default
:
error_message
=
"invalid string: forbidden character after backslash"
;
return
token_type
::
parse_error
;
}
break
;
}
// invalid control characters
case
0x00
:
case
0x01
:
case
0x02
:
case
0x03
:
case
0x04
:
case
0x05
:
case
0x06
:
case
0x07
:
case
0x08
:
case
0x09
:
case
0x0A
:
case
0x0B
:
case
0x0C
:
case
0x0D
:
case
0x0E
:
case
0x0F
:
case
0x10
:
case
0x11
:
case
0x12
:
case
0x13
:
case
0x14
:
case
0x15
:
case
0x16
:
case
0x17
:
case
0x18
:
case
0x19
:
case
0x1A
:
case
0x1B
:
case
0x1C
:
case
0x1D
:
case
0x1E
:
case
0x1F
:
{
error_message
=
"invalid string: control character must be escaped"
;
return
token_type
::
parse_error
;
}
// U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
case
0x20
:
case
0x21
:
case
0x23
:
case
0x24
:
case
0x25
:
case
0x26
:
case
0x27
:
case
0x28
:
case
0x29
:
case
0x2A
:
case
0x2B
:
case
0x2C
:
case
0x2D
:
case
0x2E
:
case
0x2F
:
case
0x30
:
case
0x31
:
case
0x32
:
case
0x33
:
case
0x34
:
case
0x35
:
case
0x36
:
case
0x37
:
case
0x38
:
case
0x39
:
case
0x3A
:
case
0x3B
:
case
0x3C
:
case
0x3D
:
case
0x3E
:
case
0x3F
:
case
0x40
:
case
0x41
:
case
0x42
:
case
0x43
:
case
0x44
:
case
0x45
:
case
0x46
:
case
0x47
:
case
0x48
:
case
0x49
:
case
0x4A
:
case
0x4B
:
case
0x4C
:
case
0x4D
:
case
0x4E
:
case
0x4F
:
case
0x50
:
case
0x51
:
case
0x52
:
case
0x53
:
case
0x54
:
case
0x55
:
case
0x56
:
case
0x57
:
case
0x58
:
case
0x59
:
case
0x5A
:
case
0x5B
:
case
0x5D
:
case
0x5E
:
case
0x5F
:
case
0x60
:
case
0x61
:
case
0x62
:
case
0x63
:
case
0x64
:
case
0x65
:
case
0x66
:
case
0x67
:
case
0x68
:
case
0x69
:
case
0x6A
:
case
0x6B
:
case
0x6C
:
case
0x6D
:
case
0x6E
:
case
0x6F
:
case
0x70
:
case
0x71
:
case
0x72
:
case
0x73
:
case
0x74
:
case
0x75
:
case
0x76
:
case
0x77
:
case
0x78
:
case
0x79
:
case
0x7A
:
case
0x7B
:
case
0x7C
:
case
0x7D
:
case
0x7E
:
case
0x7F
:
{
add
(
current
);
break
;
}
// U+0080..U+07FF: bytes C2..DF 80..BF
case
0xC2
:
case
0xC3
:
case
0xC4
:
case
0xC5
:
case
0xC6
:
case
0xC7
:
case
0xC8
:
case
0xC9
:
case
0xCA
:
case
0xCB
:
case
0xCC
:
case
0xCD
:
case
0xCE
:
case
0xCF
:
case
0xD0
:
case
0xD1
:
case
0xD2
:
case
0xD3
:
case
0xD4
:
case
0xD5
:
case
0xD6
:
case
0xD7
:
case
0xD8
:
case
0xD9
:
case
0xDA
:
case
0xDB
:
case
0xDC
:
case
0xDD
:
case
0xDE
:
case
0xDF
:
{
if
(
JSON_UNLIKELY
(
not
next_byte_in_range
({
0x80
,
0xBF
})))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+0800..U+0FFF: bytes E0 A0..BF 80..BF
case
0xE0
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0xA0
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
case
0xE1
:
case
0xE2
:
case
0xE3
:
case
0xE4
:
case
0xE5
:
case
0xE6
:
case
0xE7
:
case
0xE8
:
case
0xE9
:
case
0xEA
:
case
0xEB
:
case
0xEC
:
case
0xEE
:
case
0xEF
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+D000..U+D7FF: bytes ED 80..9F 80..BF
case
0xED
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0x9F
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
case
0xF0
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x90
,
0xBF
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
case
0xF1
:
case
0xF2
:
case
0xF3
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0xBF
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
case
0xF4
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0x8F
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// remaining bytes (80..C1 and F5..FF) are ill-formed
default
:
{
error_message
=
"invalid string: ill-formed UTF-8 byte"
;
return
token_type
::
parse_error
;
}
}
}
}
static
void
strtof
(
float
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtof
(
str
,
endptr
);
}
static
void
strtof
(
double
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtod
(
str
,
endptr
);
}
static
void
strtof
(
long
double
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtold
(
str
,
endptr
);
}
/*!
@brief scan a number literal
This function scans a string according to Sect. 6 of RFC 7159.
The function is realized with a deterministic finite state machine derived
from the grammar described in RFC 7159. Starting in state "init", the
input is read and used to determined the next state. Only state "done"
accepts the number. State "error" is a trap state to model errors. In the
table below, "anything" means any character but the ones listed before.
state | 0 | 1-9 | e E | + | - | . | anything
---------|----------|----------|----------|---------|---------|----------|-----------
init | zero | any1 | [error] | [error] | minus | [error] | [error]
minus | zero | any1 | [error] | [error] | [error] | [error] | [error]
zero | done | done | exponent | done | done | decimal1 | done
any1 | any1 | any1 | exponent | done | done | decimal1 | done
decimal1 | decimal2 | [error] | [error] | [error] | [error] | [error] | [error]
decimal2 | decimal2 | decimal2 | exponent | done | done | done | done
exponent | any2 | any2 | [error] | sign | sign | [error] | [error]
sign | any2 | any2 | [error] | [error] | [error] | [error] | [error]
any2 | any2 | any2 | done | done | done | done | done
The state machine is realized with one label per state (prefixed with
"scan_number_") and `goto` statements between them. The state machine
contains cycles, but any cycle can be left when EOF is read. Therefore,
the function is guaranteed to terminate.
During scanning, the read bytes are stored in yytext. This string is
then converted to a signed integer, an unsigned integer, or a
floating-point number.
@return token_type::value_unsigned, token_type::value_integer, or
token_type::value_float if number could be successfully scanned,
token_type::parse_error otherwise
@note The scanner is independent of the current locale. Internally, the
locale's decimal point is used instead of `.` to work with the
locale-dependent converters.
*/
token_type
scan_number
()
{
// reset yytext to store the number's bytes
reset
();
// the type of the parsed number; initially set to unsigned; will be
// changed if minus sign, decimal point or exponent is read
token_type
number_type
=
token_type
::
value_unsigned
;
// state (init): we just found out we need to scan a number
switch
(
current
)
{
case
'-'
:
{
add
(
current
);
goto
scan_number_minus
;
}
case
'0'
:
{
add
(
current
);
goto
scan_number_zero
;
}
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
default
:
{
// all other characters are rejected outside scan_number()
assert
(
false
);
// LCOV_EXCL_LINE
}
}
scan_number_minus
:
// state: we just parsed a leading minus sign
number_type
=
token_type
::
value_integer
;
switch
(
get
())
{
case
'0'
:
{
add
(
current
);
goto
scan_number_zero
;
}
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
default
:
{
error_message
=
"invalid number; expected digit after '-'"
;
return
token_type
::
parse_error
;
}
}
scan_number_zero
:
// state: we just parse a zero (maybe with a leading minus sign)
switch
(
get
())
{
case
'.'
:
{
add
(
decimal_point_char
);
goto
scan_number_decimal1
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_any1
:
// state: we just parsed a number 0-9 (maybe with a leading minus sign)
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
case
'.'
:
{
add
(
decimal_point_char
);
goto
scan_number_decimal1
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_decimal1
:
// state: we just parsed a decimal point
number_type
=
token_type
::
value_float
;
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_decimal2
;
}
default
:
{
error_message
=
"invalid number; expected digit after '.'"
;
return
token_type
::
parse_error
;
}
}
scan_number_decimal2
:
// we just parsed at least one number after a decimal point
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_decimal2
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_exponent
:
// we just parsed an exponent
number_type
=
token_type
::
value_float
;
switch
(
get
())
{
case
'+'
:
case
'-'
:
{
add
(
current
);
goto
scan_number_sign
;
}
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
{
error_message
=
"invalid number; expected '+', '-', or digit after exponent"
;
return
token_type
::
parse_error
;
}
}
scan_number_sign
:
// we just parsed an exponent sign
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
{
error_message
=
"invalid number; expected digit after exponent sign"
;
return
token_type
::
parse_error
;
}
}
scan_number_any2
:
// we just parsed a number after the exponent or exponent sign
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
goto
scan_number_done
;
}
scan_number_done
:
// unget the character after the number (we only read it to know that
// we are done scanning a number)
unget
();
char
*
endptr
=
nullptr
;
errno
=
0
;
// try to parse integers first and fall back to floats
if
(
number_type
==
token_type
::
value_unsigned
)
{
const
auto
x
=
std
::
strtoull
(
yytext
.
data
(),
&
endptr
,
10
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
if
(
errno
==
0
)
{
value_unsigned
=
static_cast
<
number_unsigned_t
>
(
x
);
if
(
value_unsigned
==
x
)
{
return
token_type
::
value_unsigned
;
}
}
}
else
if
(
number_type
==
token_type
::
value_integer
)
{
const
auto
x
=
std
::
strtoll
(
yytext
.
data
(),
&
endptr
,
10
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
if
(
errno
==
0
)
{
value_integer
=
static_cast
<
number_integer_t
>
(
x
);
if
(
value_integer
==
x
)
{
return
token_type
::
value_integer
;
}
}
}
// this code is reached if we parse a floating-point number or if an
// integer conversion above failed
strtof
(
value_float
,
yytext
.
data
(),
&
endptr
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
return
token_type
::
value_float
;
}
/*!
@param[in] literal_text the literal text to expect
@param[in] length the length of the passed literal text
@param[in] return_type the token type to return on success
*/
token_type
scan_literal
(
const
char
*
literal_text
,
const
std
::
size_t
length
,
token_type
return_type
)
{
assert
(
current
==
literal_text
[
0
]);
for
(
std
::
size_t
i
=
1
;
i
<
length
;
++
i
)
{
if
(
JSON_UNLIKELY
(
get
()
!=
literal_text
[
i
]))
{
error_message
=
"invalid literal"
;
return
token_type
::
parse_error
;
}
}
return
return_type
;
}
/////////////////////
// input management
/////////////////////
/// reset yytext; current character is beginning of token
void
reset
()
noexcept
{
yytext
.
clear
();
token_string
.
clear
();
token_string
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
current
));
}
/*
@brief get next character from the input
This function provides the interface to the used input adapter. It does
not throw in case the input reached EOF, but returns a
`std::char_traits<char>::eof()` in that case. Stores the scanned characters
for use in error messages.
@return character read from the input
*/
std
::
char_traits
<
char
>::
int_type
get
()
{
++
chars_read
;
current
=
ia
->
get_character
();
if
(
JSON_LIKELY
(
current
!=
std
::
char_traits
<
char
>::
eof
()))
{
token_string
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
current
));
}
return
current
;
}
/// unget current character (return it again on next get)
void
unget
()
{
--
chars_read
;
if
(
JSON_LIKELY
(
current
!=
std
::
char_traits
<
char
>::
eof
()))
{
ia
->
unget_character
();
assert
(
token_string
.
size
()
!=
0
);
token_string
.
pop_back
();
}
}
/// add a character to yytext
void
add
(
int
c
)
{
yytext
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
c
));
}
public
:
/////////////////////
// value getters
/////////////////////
/// return integer value
constexpr
number_integer_t
get_number_integer
()
const
noexcept
{
return
value_integer
;
}
/// return unsigned integer value
constexpr
number_unsigned_t
get_number_unsigned
()
const
noexcept
{
return
value_unsigned
;
}
/// return floating-point value
constexpr
number_float_t
get_number_float
()
const
noexcept
{
return
value_float
;
}
/// return current string value (implicitly resets the token; useful only once)
std
::
string
move_string
()
{
return
std
::
move
(
yytext
);
}
/////////////////////
// diagnostics
/////////////////////
/// return position of last read token
constexpr
std
::
size_t
get_position
()
const
noexcept
{
return
chars_read
;
}
/// return the last read token (for errors only). Will never contain EOF
/// (an arbitrary value that is not a valid char value, often -1), because
/// 255 may legitimately occur. May contain NUL, which should be escaped.
std
::
string
get_token_string
()
const
{
// escape control characters
std
::
string
result
;
for
(
const
auto
c
:
token_string
)
{
if
(
'\x00'
<=
c
and
c
<=
'\x1F'
)
{
// escape control characters
std
::
stringstream
ss
;
ss
<<
"<U+"
<<
std
::
setw
(
4
)
<<
std
::
uppercase
<<
std
::
setfill
(
'0'
)
<<
std
::
hex
<<
static_cast
<
int
>
(
c
)
<<
">"
;
result
+=
ss
.
str
();
}
else
{
// add character as is
result
.
push_back
(
c
);
}
}
return
result
;
}
/// return syntax error message
constexpr
const
char
*
get_error_message
()
const
noexcept
{
return
error_message
;
}
/////////////////////
// actual scanner
/////////////////////
token_type
scan
()
{
// read next character and ignore whitespace
do
{
get
();
}
while
(
current
==
' '
or
current
==
'\t'
or
current
==
'\n'
or
current
==
'\r'
);
switch
(
current
)
{
// structural characters
case
'['
:
return
token_type
::
begin_array
;
case
']'
:
return
token_type
::
end_array
;
case
'{'
:
return
token_type
::
begin_object
;
case
'}'
:
return
token_type
::
end_object
;
case
':'
:
return
token_type
::
name_separator
;
case
','
:
return
token_type
::
value_separator
;
// literals
case
't'
:
return
scan_literal
(
"true"
,
4
,
token_type
::
literal_true
);
case
'f'
:
return
scan_literal
(
"false"
,
5
,
token_type
::
literal_false
);
case
'n'
:
return
scan_literal
(
"null"
,
4
,
token_type
::
literal_null
);
// string
case
'\"'
:
return
scan_string
();
// number
case
'-'
:
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
return
scan_number
();
// end of input (the null byte is needed when parsing from
// string literals)
case
'\0'
:
case
std
:
:
char_traits
<
char
>::
eof
()
:
return
token_type
::
end_of_input
;
// error
default
:
error_message
=
"invalid literal"
;
return
token_type
::
parse_error
;
}
}
private
:
/// input adapter
detail
::
input_adapter_t
ia
=
nullptr
;
/// the current character
std
::
char_traits
<
char
>::
int_type
current
=
std
::
char_traits
<
char
>::
eof
();
/// the number of characters read
std
::
size_t
chars_read
=
0
;
/// raw input token string (for error messages)
std
::
vector
<
char
>
token_string
{};
/// buffer for variable-length tokens (numbers, strings)
std
::
string
yytext
{};
/// a description of occurred lexer errors
const
char
*
error_message
=
""
;
// number values
number_integer_t
value_integer
=
0
;
number_unsigned_t
value_unsigned
=
0
;
number_float_t
value_float
=
0
;
/// the decimal point
const
char
decimal_point_char
=
'.'
;
};
}
}
#endif
src/json.hpp
View file @
3a0743db
...
@@ -58,6 +58,7 @@ SOFTWARE.
...
@@ -58,6 +58,7 @@ SOFTWARE.
#include "detail/conversions/from_json.hpp"
#include "detail/conversions/from_json.hpp"
#include "detail/conversions/to_json.hpp"
#include "detail/conversions/to_json.hpp"
#include "detail/parsing/input_adapters.hpp"
#include "detail/parsing/input_adapters.hpp"
#include "detail/parsing/lexer.hpp"
/*!
/*!
@brief namespace for Niels Lohmann
@brief namespace for Niels Lohmann
...
@@ -68,1261 +69,9 @@ namespace nlohmann
...
@@ -68,1261 +69,9 @@ namespace nlohmann
{
{
namespace
detail
namespace
detail
{
{
//////////////////////
////////////
// lexer and parser //
// parser //
//////////////////////
////////////
/*!
@brief lexical analysis
This class organizes the lexical analysis during JSON deserialization.
*/
template
<
typename
BasicJsonType
>
class
lexer
{
using
number_integer_t
=
typename
BasicJsonType
::
number_integer_t
;
using
number_unsigned_t
=
typename
BasicJsonType
::
number_unsigned_t
;
using
number_float_t
=
typename
BasicJsonType
::
number_float_t
;
public
:
/// token types for the parser
enum
class
token_type
{
uninitialized
,
///< indicating the scanner is uninitialized
literal_true
,
///< the `true` literal
literal_false
,
///< the `false` literal
literal_null
,
///< the `null` literal
value_string
,
///< a string -- use get_string() for actual value
value_unsigned
,
///< an unsigned integer -- use get_number_unsigned() for actual value
value_integer
,
///< a signed integer -- use get_number_integer() for actual value
value_float
,
///< an floating point number -- use get_number_float() for actual value
begin_array
,
///< the character for array begin `[`
begin_object
,
///< the character for object begin `{`
end_array
,
///< the character for array end `]`
end_object
,
///< the character for object end `}`
name_separator
,
///< the name separator `:`
value_separator
,
///< the value separator `,`
parse_error
,
///< indicating a parse error
end_of_input
,
///< indicating the end of the input buffer
literal_or_value
///< a literal or the begin of a value (only for diagnostics)
};
/// return name of values of type token_type (only used for errors)
static
const
char
*
token_type_name
(
const
token_type
t
)
noexcept
{
switch
(
t
)
{
case
token_type
:
:
uninitialized
:
return
"<uninitialized>"
;
case
token_type
:
:
literal_true
:
return
"true literal"
;
case
token_type
:
:
literal_false
:
return
"false literal"
;
case
token_type
:
:
literal_null
:
return
"null literal"
;
case
token_type
:
:
value_string
:
return
"string literal"
;
case
lexer
:
:
token_type
::
value_unsigned
:
case
lexer
:
:
token_type
::
value_integer
:
case
lexer
:
:
token_type
::
value_float
:
return
"number literal"
;
case
token_type
:
:
begin_array
:
return
"'['"
;
case
token_type
:
:
begin_object
:
return
"'{'"
;
case
token_type
:
:
end_array
:
return
"']'"
;
case
token_type
:
:
end_object
:
return
"'}'"
;
case
token_type
:
:
name_separator
:
return
"':'"
;
case
token_type
:
:
value_separator
:
return
"','"
;
case
token_type
:
:
parse_error
:
return
"<parse error>"
;
case
token_type
:
:
end_of_input
:
return
"end of input"
;
case
token_type
:
:
literal_or_value
:
return
"'[', '{', or a literal"
;
default:
// catch non-enum values
return
"unknown token"
;
// LCOV_EXCL_LINE
}
}
explicit
lexer
(
detail
::
input_adapter_t
adapter
)
:
ia
(
std
::
move
(
adapter
)),
decimal_point_char
(
get_decimal_point
())
{}
// delete because of pointer members
lexer
(
const
lexer
&
)
=
delete
;
lexer
&
operator
=
(
lexer
&
)
=
delete
;
private
:
/////////////////////
// locales
/////////////////////
/// return the locale-dependent decimal point
static
char
get_decimal_point
()
noexcept
{
const
auto
loc
=
localeconv
();
assert
(
loc
!=
nullptr
);
return
(
loc
->
decimal_point
==
nullptr
)
?
'.'
:
*
(
loc
->
decimal_point
);
}
/////////////////////
// scan functions
/////////////////////
/*!
@brief get codepoint from 4 hex characters following `\u`
For input "\u c1 c2 c3 c4" the codepoint is:
(c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
= (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
between the ASCII value of the character and the desired integer value.
@return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
non-hex character)
*/
int
get_codepoint
()
{
// this function only makes sense after reading `\u`
assert
(
current
==
'u'
);
int
codepoint
=
0
;
const
auto
factors
=
{
12
,
8
,
4
,
0
};
for
(
const
auto
factor
:
factors
)
{
get
();
if
(
current
>=
'0'
and
current
<=
'9'
)
{
codepoint
+=
((
current
-
0x30
)
<<
factor
);
}
else
if
(
current
>=
'A'
and
current
<=
'F'
)
{
codepoint
+=
((
current
-
0x37
)
<<
factor
);
}
else
if
(
current
>=
'a'
and
current
<=
'f'
)
{
codepoint
+=
((
current
-
0x57
)
<<
factor
);
}
else
{
return
-
1
;
}
}
assert
(
0x0000
<=
codepoint
and
codepoint
<=
0xFFFF
);
return
codepoint
;
}
/*!
@brief check if the next byte(s) are inside a given range
Adds the current byte and, for each passed range, reads a new byte and
checks if it is inside the range. If a violation was detected, set up an
error message and return false. Otherwise, return true.
@param[in] ranges list of integers; interpreted as list of pairs of
inclusive lower and upper bound, respectively
@pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
1, 2, or 3 pairs. This precondition is enforced by an assertion.
@return true if and only if no range violation was detected
*/
bool
next_byte_in_range
(
std
::
initializer_list
<
int
>
ranges
)
{
assert
(
ranges
.
size
()
==
2
or
ranges
.
size
()
==
4
or
ranges
.
size
()
==
6
);
add
(
current
);
for
(
auto
range
=
ranges
.
begin
();
range
!=
ranges
.
end
();
++
range
)
{
get
();
if
(
JSON_LIKELY
(
*
range
<=
current
and
current
<=
*
(
++
range
)))
{
add
(
current
);
}
else
{
error_message
=
"invalid string: ill-formed UTF-8 byte"
;
return
false
;
}
}
return
true
;
}
/*!
@brief scan a string literal
This function scans a string according to Sect. 7 of RFC 7159. While
scanning, bytes are escaped and copied into buffer yytext. Then the function
returns successfully, yytext is *not* null-terminated (as it may contain \0
bytes), and yytext.size() is the number of bytes in the string.
@return token_type::value_string if string could be successfully scanned,
token_type::parse_error otherwise
@note In case of errors, variable error_message contains a textual
description.
*/
token_type
scan_string
()
{
// reset yytext (ignore opening quote)
reset
();
// we entered the function by reading an open quote
assert
(
current
==
'\"'
);
while
(
true
)
{
// get next character
switch
(
get
())
{
// end of file while parsing string
case
std
:
:
char_traits
<
char
>::
eof
()
:
{
error_message
=
"invalid string: missing closing quote"
;
return
token_type
::
parse_error
;
}
// closing quote
case
'\"'
:
{
return
token_type
::
value_string
;
}
// escapes
case
'\\'
:
{
switch
(
get
())
{
// quotation mark
case
'\"'
:
add
(
'\"'
);
break
;
// reverse solidus
case
'\\'
:
add
(
'\\'
);
break
;
// solidus
case
'/'
:
add
(
'/'
);
break
;
// backspace
case
'b'
:
add
(
'\b'
);
break
;
// form feed
case
'f'
:
add
(
'\f'
);
break
;
// line feed
case
'n'
:
add
(
'\n'
);
break
;
// carriage return
case
'r'
:
add
(
'\r'
);
break
;
// tab
case
't'
:
add
(
'\t'
);
break
;
// unicode escapes
case
'u'
:
{
const
int
codepoint1
=
get_codepoint
();
int
codepoint
=
codepoint1
;
// start with codepoint1
if
(
JSON_UNLIKELY
(
codepoint1
==
-
1
))
{
error_message
=
"invalid string: '
\\
u' must be followed by 4 hex digits"
;
return
token_type
::
parse_error
;
}
// check if code point is a high surrogate
if
(
0xD800
<=
codepoint1
and
codepoint1
<=
0xDBFF
)
{
// expect next \uxxxx entry
if
(
JSON_LIKELY
(
get
()
==
'\\'
and
get
()
==
'u'
))
{
const
int
codepoint2
=
get_codepoint
();
if
(
JSON_UNLIKELY
(
codepoint2
==
-
1
))
{
error_message
=
"invalid string: '
\\
u' must be followed by 4 hex digits"
;
return
token_type
::
parse_error
;
}
// check if codepoint2 is a low surrogate
if
(
JSON_LIKELY
(
0xDC00
<=
codepoint2
and
codepoint2
<=
0xDFFF
))
{
// overwrite codepoint
codepoint
=
// high surrogate occupies the most significant 22 bits
(
codepoint1
<<
10
)
// low surrogate occupies the least significant 15 bits
+
codepoint2
// there is still the 0xD800, 0xDC00 and 0x10000 noise
// in the result so we have to subtract with:
// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
-
0x35FDC00
;
}
else
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"
;
return
token_type
::
parse_error
;
}
}
else
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"
;
return
token_type
::
parse_error
;
}
}
else
{
if
(
JSON_UNLIKELY
(
0xDC00
<=
codepoint1
and
codepoint1
<=
0xDFFF
))
{
error_message
=
"invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"
;
return
token_type
::
parse_error
;
}
}
// result of the above calculation yields a proper codepoint
assert
(
0x00
<=
codepoint
and
codepoint
<=
0x10FFFF
);
// translate codepoint into bytes
if
(
codepoint
<
0x80
)
{
// 1-byte characters: 0xxxxxxx (ASCII)
add
(
codepoint
);
}
else
if
(
codepoint
<=
0x7FF
)
{
// 2-byte characters: 110xxxxx 10xxxxxx
add
(
0xC0
|
(
codepoint
>>
6
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
else
if
(
codepoint
<=
0xFFFF
)
{
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
add
(
0xE0
|
(
codepoint
>>
12
));
add
(
0x80
|
((
codepoint
>>
6
)
&
0x3F
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
else
{
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
add
(
0xF0
|
(
codepoint
>>
18
));
add
(
0x80
|
((
codepoint
>>
12
)
&
0x3F
));
add
(
0x80
|
((
codepoint
>>
6
)
&
0x3F
));
add
(
0x80
|
(
codepoint
&
0x3F
));
}
break
;
}
// other characters after escape
default
:
error_message
=
"invalid string: forbidden character after backslash"
;
return
token_type
::
parse_error
;
}
break
;
}
// invalid control characters
case
0x00
:
case
0x01
:
case
0x02
:
case
0x03
:
case
0x04
:
case
0x05
:
case
0x06
:
case
0x07
:
case
0x08
:
case
0x09
:
case
0x0A
:
case
0x0B
:
case
0x0C
:
case
0x0D
:
case
0x0E
:
case
0x0F
:
case
0x10
:
case
0x11
:
case
0x12
:
case
0x13
:
case
0x14
:
case
0x15
:
case
0x16
:
case
0x17
:
case
0x18
:
case
0x19
:
case
0x1A
:
case
0x1B
:
case
0x1C
:
case
0x1D
:
case
0x1E
:
case
0x1F
:
{
error_message
=
"invalid string: control character must be escaped"
;
return
token_type
::
parse_error
;
}
// U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
case
0x20
:
case
0x21
:
case
0x23
:
case
0x24
:
case
0x25
:
case
0x26
:
case
0x27
:
case
0x28
:
case
0x29
:
case
0x2A
:
case
0x2B
:
case
0x2C
:
case
0x2D
:
case
0x2E
:
case
0x2F
:
case
0x30
:
case
0x31
:
case
0x32
:
case
0x33
:
case
0x34
:
case
0x35
:
case
0x36
:
case
0x37
:
case
0x38
:
case
0x39
:
case
0x3A
:
case
0x3B
:
case
0x3C
:
case
0x3D
:
case
0x3E
:
case
0x3F
:
case
0x40
:
case
0x41
:
case
0x42
:
case
0x43
:
case
0x44
:
case
0x45
:
case
0x46
:
case
0x47
:
case
0x48
:
case
0x49
:
case
0x4A
:
case
0x4B
:
case
0x4C
:
case
0x4D
:
case
0x4E
:
case
0x4F
:
case
0x50
:
case
0x51
:
case
0x52
:
case
0x53
:
case
0x54
:
case
0x55
:
case
0x56
:
case
0x57
:
case
0x58
:
case
0x59
:
case
0x5A
:
case
0x5B
:
case
0x5D
:
case
0x5E
:
case
0x5F
:
case
0x60
:
case
0x61
:
case
0x62
:
case
0x63
:
case
0x64
:
case
0x65
:
case
0x66
:
case
0x67
:
case
0x68
:
case
0x69
:
case
0x6A
:
case
0x6B
:
case
0x6C
:
case
0x6D
:
case
0x6E
:
case
0x6F
:
case
0x70
:
case
0x71
:
case
0x72
:
case
0x73
:
case
0x74
:
case
0x75
:
case
0x76
:
case
0x77
:
case
0x78
:
case
0x79
:
case
0x7A
:
case
0x7B
:
case
0x7C
:
case
0x7D
:
case
0x7E
:
case
0x7F
:
{
add
(
current
);
break
;
}
// U+0080..U+07FF: bytes C2..DF 80..BF
case
0xC2
:
case
0xC3
:
case
0xC4
:
case
0xC5
:
case
0xC6
:
case
0xC7
:
case
0xC8
:
case
0xC9
:
case
0xCA
:
case
0xCB
:
case
0xCC
:
case
0xCD
:
case
0xCE
:
case
0xCF
:
case
0xD0
:
case
0xD1
:
case
0xD2
:
case
0xD3
:
case
0xD4
:
case
0xD5
:
case
0xD6
:
case
0xD7
:
case
0xD8
:
case
0xD9
:
case
0xDA
:
case
0xDB
:
case
0xDC
:
case
0xDD
:
case
0xDE
:
case
0xDF
:
{
if
(
JSON_UNLIKELY
(
not
next_byte_in_range
({
0x80
,
0xBF
})))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+0800..U+0FFF: bytes E0 A0..BF 80..BF
case
0xE0
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0xA0
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
case
0xE1
:
case
0xE2
:
case
0xE3
:
case
0xE4
:
case
0xE5
:
case
0xE6
:
case
0xE7
:
case
0xE8
:
case
0xE9
:
case
0xEA
:
case
0xEB
:
case
0xEC
:
case
0xEE
:
case
0xEF
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+D000..U+D7FF: bytes ED 80..9F 80..BF
case
0xED
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0x9F
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
case
0xF0
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x90
,
0xBF
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
case
0xF1
:
case
0xF2
:
case
0xF3
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0xBF
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
case
0xF4
:
{
if
(
JSON_UNLIKELY
(
not
(
next_byte_in_range
({
0x80
,
0x8F
,
0x80
,
0xBF
,
0x80
,
0xBF
}))))
{
return
token_type
::
parse_error
;
}
break
;
}
// remaining bytes (80..C1 and F5..FF) are ill-formed
default
:
{
error_message
=
"invalid string: ill-formed UTF-8 byte"
;
return
token_type
::
parse_error
;
}
}
}
}
static
void
strtof
(
float
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtof
(
str
,
endptr
);
}
static
void
strtof
(
double
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtod
(
str
,
endptr
);
}
static
void
strtof
(
long
double
&
f
,
const
char
*
str
,
char
**
endptr
)
noexcept
{
f
=
std
::
strtold
(
str
,
endptr
);
}
/*!
@brief scan a number literal
This function scans a string according to Sect. 6 of RFC 7159.
The function is realized with a deterministic finite state machine derived
from the grammar described in RFC 7159. Starting in state "init", the
input is read and used to determined the next state. Only state "done"
accepts the number. State "error" is a trap state to model errors. In the
table below, "anything" means any character but the ones listed before.
state | 0 | 1-9 | e E | + | - | . | anything
---------|----------|----------|----------|---------|---------|----------|-----------
init | zero | any1 | [error] | [error] | minus | [error] | [error]
minus | zero | any1 | [error] | [error] | [error] | [error] | [error]
zero | done | done | exponent | done | done | decimal1 | done
any1 | any1 | any1 | exponent | done | done | decimal1 | done
decimal1 | decimal2 | [error] | [error] | [error] | [error] | [error] | [error]
decimal2 | decimal2 | decimal2 | exponent | done | done | done | done
exponent | any2 | any2 | [error] | sign | sign | [error] | [error]
sign | any2 | any2 | [error] | [error] | [error] | [error] | [error]
any2 | any2 | any2 | done | done | done | done | done
The state machine is realized with one label per state (prefixed with
"scan_number_") and `goto` statements between them. The state machine
contains cycles, but any cycle can be left when EOF is read. Therefore,
the function is guaranteed to terminate.
During scanning, the read bytes are stored in yytext. This string is
then converted to a signed integer, an unsigned integer, or a
floating-point number.
@return token_type::value_unsigned, token_type::value_integer, or
token_type::value_float if number could be successfully scanned,
token_type::parse_error otherwise
@note The scanner is independent of the current locale. Internally, the
locale's decimal point is used instead of `.` to work with the
locale-dependent converters.
*/
token_type
scan_number
()
{
// reset yytext to store the number's bytes
reset
();
// the type of the parsed number; initially set to unsigned; will be
// changed if minus sign, decimal point or exponent is read
token_type
number_type
=
token_type
::
value_unsigned
;
// state (init): we just found out we need to scan a number
switch
(
current
)
{
case
'-'
:
{
add
(
current
);
goto
scan_number_minus
;
}
case
'0'
:
{
add
(
current
);
goto
scan_number_zero
;
}
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
default
:
{
// all other characters are rejected outside scan_number()
assert
(
false
);
// LCOV_EXCL_LINE
}
}
scan_number_minus
:
// state: we just parsed a leading minus sign
number_type
=
token_type
::
value_integer
;
switch
(
get
())
{
case
'0'
:
{
add
(
current
);
goto
scan_number_zero
;
}
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
default
:
{
error_message
=
"invalid number; expected digit after '-'"
;
return
token_type
::
parse_error
;
}
}
scan_number_zero
:
// state: we just parse a zero (maybe with a leading minus sign)
switch
(
get
())
{
case
'.'
:
{
add
(
decimal_point_char
);
goto
scan_number_decimal1
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_any1
:
// state: we just parsed a number 0-9 (maybe with a leading minus sign)
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any1
;
}
case
'.'
:
{
add
(
decimal_point_char
);
goto
scan_number_decimal1
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_decimal1
:
// state: we just parsed a decimal point
number_type
=
token_type
::
value_float
;
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_decimal2
;
}
default
:
{
error_message
=
"invalid number; expected digit after '.'"
;
return
token_type
::
parse_error
;
}
}
scan_number_decimal2
:
// we just parsed at least one number after a decimal point
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_decimal2
;
}
case
'e'
:
case
'E'
:
{
add
(
current
);
goto
scan_number_exponent
;
}
default
:
goto
scan_number_done
;
}
scan_number_exponent
:
// we just parsed an exponent
number_type
=
token_type
::
value_float
;
switch
(
get
())
{
case
'+'
:
case
'-'
:
{
add
(
current
);
goto
scan_number_sign
;
}
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
{
error_message
=
"invalid number; expected '+', '-', or digit after exponent"
;
return
token_type
::
parse_error
;
}
}
scan_number_sign
:
// we just parsed an exponent sign
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
{
error_message
=
"invalid number; expected digit after exponent sign"
;
return
token_type
::
parse_error
;
}
}
scan_number_any2
:
// we just parsed a number after the exponent or exponent sign
switch
(
get
())
{
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
{
add
(
current
);
goto
scan_number_any2
;
}
default
:
goto
scan_number_done
;
}
scan_number_done
:
// unget the character after the number (we only read it to know that
// we are done scanning a number)
unget
();
char
*
endptr
=
nullptr
;
errno
=
0
;
// try to parse integers first and fall back to floats
if
(
number_type
==
token_type
::
value_unsigned
)
{
const
auto
x
=
std
::
strtoull
(
yytext
.
data
(),
&
endptr
,
10
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
if
(
errno
==
0
)
{
value_unsigned
=
static_cast
<
number_unsigned_t
>
(
x
);
if
(
value_unsigned
==
x
)
{
return
token_type
::
value_unsigned
;
}
}
}
else
if
(
number_type
==
token_type
::
value_integer
)
{
const
auto
x
=
std
::
strtoll
(
yytext
.
data
(),
&
endptr
,
10
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
if
(
errno
==
0
)
{
value_integer
=
static_cast
<
number_integer_t
>
(
x
);
if
(
value_integer
==
x
)
{
return
token_type
::
value_integer
;
}
}
}
// this code is reached if we parse a floating-point number or if an
// integer conversion above failed
strtof
(
value_float
,
yytext
.
data
(),
&
endptr
);
// we checked the number format before
assert
(
endptr
==
yytext
.
data
()
+
yytext
.
size
());
return
token_type
::
value_float
;
}
/*!
@param[in] literal_text the literal text to expect
@param[in] length the length of the passed literal text
@param[in] return_type the token type to return on success
*/
token_type
scan_literal
(
const
char
*
literal_text
,
const
std
::
size_t
length
,
token_type
return_type
)
{
assert
(
current
==
literal_text
[
0
]);
for
(
std
::
size_t
i
=
1
;
i
<
length
;
++
i
)
{
if
(
JSON_UNLIKELY
(
get
()
!=
literal_text
[
i
]))
{
error_message
=
"invalid literal"
;
return
token_type
::
parse_error
;
}
}
return
return_type
;
}
/////////////////////
// input management
/////////////////////
/// reset yytext; current character is beginning of token
void
reset
()
noexcept
{
yytext
.
clear
();
token_string
.
clear
();
token_string
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
current
));
}
/*
@brief get next character from the input
This function provides the interface to the used input adapter. It does
not throw in case the input reached EOF, but returns a
`std::char_traits<char>::eof()` in that case. Stores the scanned characters
for use in error messages.
@return character read from the input
*/
std
::
char_traits
<
char
>::
int_type
get
()
{
++
chars_read
;
current
=
ia
->
get_character
();
if
(
JSON_LIKELY
(
current
!=
std
::
char_traits
<
char
>::
eof
()))
{
token_string
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
current
));
}
return
current
;
}
/// unget current character (return it again on next get)
void
unget
()
{
--
chars_read
;
if
(
JSON_LIKELY
(
current
!=
std
::
char_traits
<
char
>::
eof
()))
{
ia
->
unget_character
();
assert
(
token_string
.
size
()
!=
0
);
token_string
.
pop_back
();
}
}
/// add a character to yytext
void
add
(
int
c
)
{
yytext
.
push_back
(
std
::
char_traits
<
char
>::
to_char_type
(
c
));
}
public
:
/////////////////////
// value getters
/////////////////////
/// return integer value
constexpr
number_integer_t
get_number_integer
()
const
noexcept
{
return
value_integer
;
}
/// return unsigned integer value
constexpr
number_unsigned_t
get_number_unsigned
()
const
noexcept
{
return
value_unsigned
;
}
/// return floating-point value
constexpr
number_float_t
get_number_float
()
const
noexcept
{
return
value_float
;
}
/// return current string value (implicitly resets the token; useful only once)
std
::
string
move_string
()
{
return
std
::
move
(
yytext
);
}
/////////////////////
// diagnostics
/////////////////////
/// return position of last read token
constexpr
std
::
size_t
get_position
()
const
noexcept
{
return
chars_read
;
}
/// return the last read token (for errors only). Will never contain EOF
/// (an arbitrary value that is not a valid char value, often -1), because
/// 255 may legitimately occur. May contain NUL, which should be escaped.
std
::
string
get_token_string
()
const
{
// escape control characters
std
::
string
result
;
for
(
const
auto
c
:
token_string
)
{
if
(
'\x00'
<=
c
and
c
<=
'\x1F'
)
{
// escape control characters
std
::
stringstream
ss
;
ss
<<
"<U+"
<<
std
::
setw
(
4
)
<<
std
::
uppercase
<<
std
::
setfill
(
'0'
)
<<
std
::
hex
<<
static_cast
<
int
>
(
c
)
<<
">"
;
result
+=
ss
.
str
();
}
else
{
// add character as is
result
.
push_back
(
c
);
}
}
return
result
;
}
/// return syntax error message
constexpr
const
char
*
get_error_message
()
const
noexcept
{
return
error_message
;
}
/////////////////////
// actual scanner
/////////////////////
token_type
scan
()
{
// read next character and ignore whitespace
do
{
get
();
}
while
(
current
==
' '
or
current
==
'\t'
or
current
==
'\n'
or
current
==
'\r'
);
switch
(
current
)
{
// structural characters
case
'['
:
return
token_type
::
begin_array
;
case
']'
:
return
token_type
::
end_array
;
case
'{'
:
return
token_type
::
begin_object
;
case
'}'
:
return
token_type
::
end_object
;
case
':'
:
return
token_type
::
name_separator
;
case
','
:
return
token_type
::
value_separator
;
// literals
case
't'
:
return
scan_literal
(
"true"
,
4
,
token_type
::
literal_true
);
case
'f'
:
return
scan_literal
(
"false"
,
5
,
token_type
::
literal_false
);
case
'n'
:
return
scan_literal
(
"null"
,
4
,
token_type
::
literal_null
);
// string
case
'\"'
:
return
scan_string
();
// number
case
'-'
:
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'4'
:
case
'5'
:
case
'6'
:
case
'7'
:
case
'8'
:
case
'9'
:
return
scan_number
();
// end of input (the null byte is needed when parsing from
// string literals)
case
'\0'
:
case
std
:
:
char_traits
<
char
>::
eof
()
:
return
token_type
::
end_of_input
;
// error
default
:
error_message
=
"invalid literal"
;
return
token_type
::
parse_error
;
}
}
private
:
/// input adapter
detail
::
input_adapter_t
ia
=
nullptr
;
/// the current character
std
::
char_traits
<
char
>::
int_type
current
=
std
::
char_traits
<
char
>::
eof
();
/// the number of characters read
std
::
size_t
chars_read
=
0
;
/// raw input token string (for error messages)
std
::
vector
<
char
>
token_string
{};
/// buffer for variable-length tokens (numbers, strings)
std
::
string
yytext
{};
/// a description of occurred lexer errors
const
char
*
error_message
=
""
;
// number values
number_integer_t
value_integer
=
0
;
number_unsigned_t
value_unsigned
=
0
;
number_float_t
value_float
=
0
;
/// the decimal point
const
char
decimal_point_char
=
'.'
;
};
/*!
/*!
@brief syntax analysis
@brief syntax analysis
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment