8 #include <initializer_list>
13 #include <nlohmann/detail/input/input_adapters.hpp>
14 #include <nlohmann/detail/input/position_t.hpp>
15 #include <nlohmann/detail/macro_scope.hpp>
30 template<
typename BasicJsonType>
33 using number_integer_t =
typename BasicJsonType::number_integer_t;
34 using number_unsigned_t =
typename BasicJsonType::number_unsigned_t;
35 using number_float_t =
typename BasicJsonType::number_float_t;
36 using string_t =
typename BasicJsonType::string_t;
62 JSON_HEDLEY_RETURNS_NON_NULL
69 return "<uninitialized>";
71 return "true literal";
73 return "false literal";
75 return "null literal";
77 return "string literal";
81 return "number literal";
95 return "<parse error>";
97 return "end of input";
99 return "'[', '{', or a literal";
102 return "unknown token";
108 : ia(std::move(adapter)), decimal_point_char(get_decimal_point()) {}
111 lexer(
const lexer&) =
delete;
112 lexer(lexer&&) =
delete;
113 lexer& operator=(lexer&) =
delete;
114 lexer& operator=(lexer&&) =
delete;
124 static char get_decimal_point() noexcept
126 const auto loc = localeconv();
127 assert(loc !=
nullptr);
128 return (loc->decimal_point ==
nullptr) ?
'.' : *(loc->decimal_point);
153 assert(current ==
'u');
156 const auto factors = { 12u, 8u, 4u, 0u };
157 for (
const auto factor : factors)
161 if (current >=
'0' and current <=
'9')
163 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x30u) << factor);
165 else if (current >=
'A' and current <=
'F')
167 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x37u) << factor);
169 else if (current >=
'a' and current <=
'f')
171 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x57u) << factor);
179 assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
198 bool next_byte_in_range(std::initializer_list<int> ranges)
200 assert(ranges.size() == 2 or ranges.size() == 4 or ranges.size() == 6);
203 for (
auto range = ranges.begin(); range != ranges.end(); ++range)
206 if (JSON_HEDLEY_LIKELY(*range <= current and current <= *(++range)))
212 error_message =
"invalid string: ill-formed UTF-8 byte";
241 assert(current ==
'\"');
249 case std::char_traits<char>::eof():
251 error_message =
"invalid string: missing closing quote";
258 return token_type::value_string;
302 const int codepoint1 = get_codepoint();
303 int codepoint = codepoint1;
305 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
307 error_message =
"invalid string: '\\u' must be followed by 4 hex digits";
308 return token_type::parse_error;
312 if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF)
315 if (JSON_HEDLEY_LIKELY(get() ==
'\\' and get() ==
'u'))
317 const int codepoint2 = get_codepoint();
319 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
321 error_message =
"invalid string: '\\u' must be followed by 4 hex digits";
322 return token_type::parse_error;
326 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF))
329 codepoint =
static_cast<int>(
331 (
static_cast<unsigned int>(codepoint1) << 10u)
333 +
static_cast<unsigned int>(codepoint2)
341 error_message =
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
342 return token_type::parse_error;
347 error_message =
"invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
348 return token_type::parse_error;
353 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF))
355 error_message =
"invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
356 return token_type::parse_error;
361 assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
364 if (codepoint < 0x80)
369 else if (codepoint <= 0x7FF)
372 add(
static_cast<int>(0xC0u | (
static_cast<unsigned int>(codepoint) >> 6u)));
373 add(
static_cast<int>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
375 else if (codepoint <= 0xFFFF)
378 add(
static_cast<int>(0xE0u | (
static_cast<unsigned int>(codepoint) >> 12u)));
379 add(
static_cast<int>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
380 add(
static_cast<int>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
385 add(
static_cast<int>(0xF0u | (
static_cast<unsigned int>(codepoint) >> 18u)));
386 add(
static_cast<int>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
387 add(
static_cast<int>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
388 add(
static_cast<int>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
396 error_message =
"invalid string: forbidden character after backslash";
397 return token_type::parse_error;
406 error_message =
"invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
407 return token_type::parse_error;
412 error_message =
"invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
413 return token_type::parse_error;
418 error_message =
"invalid string: control character U+0002 (STX) must be escaped to \\u0002";
419 return token_type::parse_error;
424 error_message =
"invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
425 return token_type::parse_error;
430 error_message =
"invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
431 return token_type::parse_error;
436 error_message =
"invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
437 return token_type::parse_error;
442 error_message =
"invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
443 return token_type::parse_error;
448 error_message =
"invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
449 return token_type::parse_error;
454 error_message =
"invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
455 return token_type::parse_error;
460 error_message =
"invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
461 return token_type::parse_error;
466 error_message =
"invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
467 return token_type::parse_error;
472 error_message =
"invalid string: control character U+000B (VT) must be escaped to \\u000B";
473 return token_type::parse_error;
478 error_message =
"invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
479 return token_type::parse_error;
484 error_message =
"invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
485 return token_type::parse_error;
490 error_message =
"invalid string: control character U+000E (SO) must be escaped to \\u000E";
491 return token_type::parse_error;
496 error_message =
"invalid string: control character U+000F (SI) must be escaped to \\u000F";
497 return token_type::parse_error;
502 error_message =
"invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
503 return token_type::parse_error;
508 error_message =
"invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
509 return token_type::parse_error;
514 error_message =
"invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
515 return token_type::parse_error;
520 error_message =
"invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
521 return token_type::parse_error;
526 error_message =
"invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
527 return token_type::parse_error;
532 error_message =
"invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
533 return token_type::parse_error;
538 error_message =
"invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
539 return token_type::parse_error;
544 error_message =
"invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
545 return token_type::parse_error;
550 error_message =
"invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
551 return token_type::parse_error;
556 error_message =
"invalid string: control character U+0019 (EM) must be escaped to \\u0019";
557 return token_type::parse_error;
562 error_message =
"invalid string: control character U+001A (SUB) must be escaped to \\u001A";
563 return token_type::parse_error;
568 error_message =
"invalid string: control character U+001B (ESC) must be escaped to \\u001B";
569 return token_type::parse_error;
574 error_message =
"invalid string: control character U+001C (FS) must be escaped to \\u001C";
575 return token_type::parse_error;
580 error_message =
"invalid string: control character U+001D (GS) must be escaped to \\u001D";
581 return token_type::parse_error;
586 error_message =
"invalid string: control character U+001E (RS) must be escaped to \\u001E";
587 return token_type::parse_error;
592 error_message =
"invalid string: control character U+001F (US) must be escaped to \\u001F";
593 return token_type::parse_error;
728 if (JSON_HEDLEY_UNLIKELY(not next_byte_in_range({0x80, 0xBF})))
730 return token_type::parse_error;
738 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
740 return token_type::parse_error;
762 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
764 return token_type::parse_error;
772 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
774 return token_type::parse_error;
782 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
784 return token_type::parse_error;
794 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
796 return token_type::parse_error;
804 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
806 return token_type::parse_error;
814 error_message =
"invalid string: ill-formed UTF-8 byte";
815 return token_type::parse_error;
821 JSON_HEDLEY_NON_NULL(2)
822 static
void strtof(
float& f, const
char* str,
char** endptr) noexcept
824 f = std::strtof(str, endptr);
827 JSON_HEDLEY_NON_NULL(2)
828 static
void strtof(
double& f, const
char* str,
char** endptr) noexcept
830 f = std::strtod(str, endptr);
833 JSON_HEDLEY_NON_NULL(2)
834 static
void strtof(
long double& f, const
char* str,
char** endptr) noexcept
836 f = std::strtold(str, endptr);
879 token_type scan_number()
886 token_type number_type = token_type::value_unsigned;
894 goto scan_number_minus;
900 goto scan_number_zero;
914 goto scan_number_any1;
924 number_type = token_type::value_integer;
930 goto scan_number_zero;
944 goto scan_number_any1;
949 error_message =
"invalid number; expected digit after '-'";
950 return token_type::parse_error;
960 add(decimal_point_char);
961 goto scan_number_decimal1;
968 goto scan_number_exponent;
972 goto scan_number_done;
991 goto scan_number_any1;
996 add(decimal_point_char);
997 goto scan_number_decimal1;
1004 goto scan_number_exponent;
1008 goto scan_number_done;
1011 scan_number_decimal1:
1013 number_type = token_type::value_float;
1028 goto scan_number_decimal2;
1033 error_message =
"invalid number; expected digit after '.'";
1034 return token_type::parse_error;
1038 scan_number_decimal2:
1054 goto scan_number_decimal2;
1061 goto scan_number_exponent;
1065 goto scan_number_done;
1068 scan_number_exponent:
1070 number_type = token_type::value_float;
1077 goto scan_number_sign;
1092 goto scan_number_any2;
1098 "invalid number; expected '+', '-', or digit after exponent";
1099 return token_type::parse_error;
1119 goto scan_number_any2;
1124 error_message =
"invalid number; expected digit after exponent sign";
1125 return token_type::parse_error;
1145 goto scan_number_any2;
1149 goto scan_number_done;
1157 char* endptr =
nullptr;
1161 if (number_type == token_type::value_unsigned)
1163 const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1166 assert(endptr == token_buffer.data() + token_buffer.size());
1170 value_unsigned =
static_cast<number_unsigned_t
>(x);
1171 if (value_unsigned == x)
1173 return token_type::value_unsigned;
1177 else if (number_type == token_type::value_integer)
1179 const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1182 assert(endptr == token_buffer.data() + token_buffer.size());
1186 value_integer =
static_cast<number_integer_t
>(x);
1187 if (value_integer == x)
1189 return token_type::value_integer;
1196 strtof(value_float, token_buffer.data(), &endptr);
1199 assert(endptr == token_buffer.data() + token_buffer.size());
1201 return token_type::value_float;
1209 JSON_HEDLEY_NON_NULL(2)
1210 token_type scan_literal(const
char* literal_text, const std::
size_t length,
1211 token_type return_type)
1213 assert(current == literal_text[0]);
1214 for (std::size_t i = 1; i < length; ++i)
1216 if (JSON_HEDLEY_UNLIKELY(get() != literal_text[i]))
1218 error_message =
"invalid literal";
1219 return token_type::parse_error;
1230 void reset() noexcept
1232 token_buffer.clear();
1233 token_string.clear();
1234 token_string.push_back(std::char_traits<char>::to_char_type(current));
1247 std::char_traits<char>::int_type get()
1249 ++position.chars_read_total;
1250 ++position.chars_read_current_line;
1259 current = ia->get_character();
1262 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1264 token_string.push_back(std::char_traits<char>::to_char_type(current));
1267 if (current ==
'\n')
1269 ++position.lines_read;
1270 position.chars_read_current_line = 0;
1288 --position.chars_read_total;
1291 if (position.chars_read_current_line == 0)
1293 if (position.lines_read > 0)
1295 --position.lines_read;
1300 --position.chars_read_current_line;
1303 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1305 assert(not token_string.empty());
1306 token_string.pop_back();
1313 token_buffer.push_back(std::char_traits<char>::to_char_type(c));
1324 return value_integer;
1330 return value_unsigned;
1342 return token_buffer;
1362 for (
const auto c : token_string)
1364 if (
'\x00' <= c and c <=
'\x1F')
1367 std::array<char, 9> cs{{}};
1368 (std::snprintf)(cs.data(), cs.size(),
"<U+%.4X>",
static_cast<unsigned char>(c));
1369 result += cs.data();
1374 result.push_back(c);
1382 JSON_HEDLEY_RETURNS_NON_NULL
1385 return error_message;
1401 return get() == 0xBB and get() == 0xBF;
1413 if (position.chars_read_total == 0 and not skip_bom())
1415 error_message =
"invalid BOM; must be 0xEF 0xBB 0xBF if given";
1416 return token_type::parse_error;
1424 while (current ==
' ' or current ==
'\t' or current ==
'\n' or current ==
'\r');
1430 return token_type::begin_array;
1432 return token_type::end_array;
1434 return token_type::begin_object;
1436 return token_type::end_object;
1438 return token_type::name_separator;
1440 return token_type::value_separator;
1444 return scan_literal(
"true", 4, token_type::literal_true);
1446 return scan_literal(
"false", 5, token_type::literal_false);
1448 return scan_literal(
"null", 4, token_type::literal_null);
1452 return scan_string();
1466 return scan_number();
1471 case std::char_traits<char>::eof():
1472 return token_type::end_of_input;
1476 error_message =
"invalid literal";
1477 return token_type::parse_error;
1486 std::char_traits<char>::int_type current = std::char_traits<char>::eof();
1489 bool next_unget =
false;
1492 position_t position {};
1495 std::vector<char> token_string {};
1498 string_t token_buffer {};
1501 const char* error_message =
"";
1504 number_integer_t value_integer = 0;
1505 number_unsigned_t value_unsigned = 0;
1506 number_float_t value_float = 0;
1509 const char decimal_point_char =
'.';
lexical analysis
Definition: lexer.hpp:32
token_type
token types for the parser
Definition: lexer.hpp:41
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ literal_true
the true literal
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ literal_null
the null literal
@ end_of_input
indicating the end of the input buffer
@ name_separator
the name separator :
@ literal_or_value
a literal or the beginning of a value (only for diagnostics)
@ literal_false
the false literal
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.hpp:1358
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.hpp:1340
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.hpp:1328
bool skip_bom()
skip the UTF-8 byte order mark
Definition: lexer.hpp:1396
constexpr JSON_HEDLEY_RETURNS_NON_NULL const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.hpp:1383
constexpr position_t get_position() const noexcept
return position of last read token
Definition: lexer.hpp:1350
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.hpp:1334
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return name of values of type token_type (only used for errors)
Definition: lexer.hpp:64
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.hpp:1322
std::shared_ptr< input_adapter_protocol > input_adapter_t
a type to simplify interfaces
Definition: input_adapters.hpp:49
namespace for Niels Lohmann
Definition: adl_serializer.hpp:9
struct to capture the start position of the current token
Definition: position_t.hpp:11